def run_cleanup(self, df: pandas.DataFrame) -> Tuple[bool, pandas.DataFrame]:
    clean_up = True
    # check for interactive mode
    if self.flags.interactive:
        logger.info(
            yellow(
                "PRE-CLEANING PREVIEW: The DataFrame you would push to the database "
                "would look like this:"
            )
        )
        self._show_dry_run_preview(df)
        clean_up = self._collect_and_check_answer()

    if clean_up is True:
        logger.debug("Performing clean ups")
        clean_df = SheetCleaner(
            df, bool(self.config.sheet_config.get("snake_case_camel", False))
        ).cleanup()
        if self.flags.dry_run or self.flags.interactive:
            logger.info(yellow("\nPOST-CLEANING PREVIEW:"))
            self._show_dry_run_preview(clean_df)
            carry_on = self._collect_and_check_answer(post_cleanup=True)
            if not carry_on:
                logger.info(timed_message(red("User Aborted.")))
                sys.exit(1)
        return True, clean_df
    return True, df

def override_object_creation_from_flags(self) -> None:
    if self.flags.create_table:
        logger.debug(yellow("going to create table"))
        self.object_creation_dct.update({"create_table": True})

    if self.flags.create_schema:
        logger.debug(yellow("going to create schema"))
        self.object_creation_dct.update({"create_schema": True})
    logger.debug(yellow(f"Object creation dict after override\n {self.object_creation_dct}"))

    if self.flags.destructive_create_table:
        logger.debug(yellow("going to perform destructive table creation"))
        self.destructive_create_table = True

def check_and_compare_version(external_version: Optional[str] = str()) -> Tuple[bool, str]:
    """Checks the currently installed version of sheetwork and compares it to the one on PyPI.

    This requires an internet connection. When none is available a URLError will most
    likely be thrown; in that case we just return False so as not to degrade the user
    experience with a spurious failure.

    Args:
        external_version (Optional[str], optional): Mainly for testing purposes. Defaults to str().

    Returns:
        Tuple[bool, str]: (needs_update, pypi_version). needs_update is True when sheetwork
            needs an update; pypi_version is an empty string when PyPI could not be reached.
    """
    try:
        pypi_version: str = luddite.get_version_pypi("sheetwork")
        if external_version:
            installed_version = external_version
        else:
            installed_version = __version__

        needs_update = semver_parse(pypi_version) > semver_parse(installed_version)
        if needs_update:
            logger.warning(
                yellow(
                    f"Looks like you're a bit behind. A newer version of Sheetwork "
                    f"v{pypi_version} is available."
                )
            )
        return needs_update, pypi_version
    except URLError:
        return False, str()

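# A minimal sketch of how the check above could be surfaced at startup. The helper
# name and the upgrade hint are assumptions for illustration, not part of the source:
def _warn_if_outdated() -> None:
    needs_update, pypi_version = check_and_compare_version()
    if needs_update:
        # check_and_compare_version already logs a warning; this just adds the remedy.
        logger.info(yellow(f"Upgrade with: pip install --upgrade sheetwork=={pypi_version}"))
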
def run(self):
    self.load_sheet()
    if self.push_anyway:
        self.push_sheet()
        self.check_table()
    else:
        logger.info(yellow("Nothing pushed since you were in --dry_run mode."))

def decide_object_creation(self) -> None:
    self.handle_deprecations()
    create_everything_label = "always_create_objects"
    object_creation_mapping = {
        # ! DEPRECATE "always_create"
        "create_table": ["always_create_table", "always_create"],
        "create_schema": ["always_create_schema"],
    }
    for object_type, rule in object_creation_mapping.items():
        if self.project_dict.get(create_everything_label):
            create = [True]
        else:
            create = [True for x in rule if self.project_dict.get(x) is True]
        self.object_creation_dct.update({object_type: True in create})

    self.destructive_create_table = (
        self.project_dict.get("destructive_create_table", self.destructive_create_table) is True
    )
    logger.debug(yellow(f"Object creation dict:\n {self.object_creation_dct}"))
    logger.debug(yellow(str(self.project_dict)))

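# Illustration of how the rules above resolve (the project_dict contents are made
# up for the example): "always_create_objects" turns everything on, while the
# deprecated "always_create" only maps to table creation.
#
#     {"always_create": True}          -> {"create_table": True, "create_schema": False}
#     {"always_create_objects": True}  -> {"create_table": True, "create_schema": True}
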
def deprecate(message: str, colour: str = "yellow") -> None:
    """Handles deprecation messages using proper DeprecationWarnings.

    It also makes sure deprecation warnings are enabled globally, as certain shells
    might have them turned off by default.

    Args:
        message (str): Deprecation message to print.
        colour (str, optional): Colour name to wrap the deprecation message. For now only
            "yellow", "red" or None are supported. Defaults to "yellow".
    """
    global DEPRECATION_WARNINGS_ENABLED, _WARNINGS_ALREADY_ENABLED
    if colour == "yellow":
        _message = yellow(message)
    elif colour == "red":
        _message = red(message)
    elif colour is None:
        _message = message
    else:
        logger.error(f"{colour} is not supported, painting error message 'yellow'")
        _message = yellow(message)

    if DEPRECATION_WARNINGS_ENABLED and not _WARNINGS_ALREADY_ENABLED:
        _WARNINGS_ALREADY_ENABLED = True
        warnings.filterwarnings(
            "default", ".*", category=DeprecationWarning, module="gspread_pandas"
        )
    if _WARNINGS_ALREADY_ENABLED and not DEPRECATION_WARNINGS_ENABLED:
        warnings.filterwarnings(
            "ignore", ".*", category=DeprecationWarning, module="gspread_pandas"
        )
    warnings.warn(_message, DeprecationWarning, stacklevel=2)

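# A minimal usage sketch for deprecate(), mirroring the "always_create" flag that
# decide_object_creation marks for deprecation (kept as a comment so nothing fires
# at import time):
#
#     deprecate(
#         "'always_create' is deprecated and will be removed in a future release. "
#         "Use 'always_create_table' instead.",
#         colour="yellow",
#     )
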
def _create_schema(self) -> None:
    if self._has_connection is False:
        raise NoAcquiredConnectionError(
            f"No acquired connection for {type(self).__name__}. Make sure you call "
            "`acquire_connection` before."
        )
    try:
        if self.config.project.object_creation_dct["create_schema"]:
            schema_exists = (
                self.config.target_schema in self.con.dialect.get_schema_names(self.con)
            )
            if schema_exists is False:
                logger.debug(
                    yellow(f"Creating schema: {self.config.target_schema} in {self._database}")
                )
                self.con.execute(CreateSchema(self.config.target_schema))
    except Exception as e:
        raise DatabaseError(str(e))

def check_columns_in_df(
    df: pandas.DataFrame,
    columns: Union[List[str], str],
    warn_only: bool = False,
    suppress_warning: bool = False,
) -> Tuple[bool, List[str]]:
    """Checks if a bunch of columns are present in a dataframe.

    Args:
        df (pandas.DataFrame): df to check.
        columns (Union[List[str], str]): column names to check for.
        warn_only (bool, optional): When True will only warn, otherwise raises. Defaults to False.
        suppress_warning (bool, optional): When True the warning isn't shown, only returned.
            Defaults to False.

    Raises:
        ColumnNotFoundInDataFrame: If warn_only is False, this error will be raised when any
            of the columns to check for are not present in the dataframe.

    Returns:
        Tuple[bool, List[str]]: Boolean for whether all columns are present in df, and the
            list of requested columns that were actually found.
    """
    if isinstance(columns, str):
        columns = [columns]
    is_subset = set(columns).issubset(df.columns)
    if is_subset:
        return True, columns

    # otherwise reduce columns, provide the filtered list, set the bool to False, and warn or raise
    cols_not_in_df = [x for x in columns if x not in df.columns.tolist()]
    reduced_cols = [x for x in columns if x in df.columns.tolist()]
    message = f"The following columns were not found in the sheet: {cols_not_in_df} "
    if warn_only and not suppress_warning:
        logger.warning(
            yellow(message + "they were ignored. Consider cleaning your sheets.yml file")
        )
    elif not warn_only and not suppress_warning:
        raise ColumnNotFoundInDataFrame(message + "Google Sheet or sheets.yml needs to be cleaned")
    return False, reduced_cols

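# A minimal, self-contained usage sketch for check_columns_in_df; the helper name,
# DataFrame, and column names below are invented for illustration:
def _example_column_check() -> None:
    df = pandas.DataFrame({"name": ["a"], "age": [1]})
    # "email" is missing, so this warns and returns (False, ["name"]).
    all_present, found_cols = check_columns_in_df(df, ["name", "email"], warn_only=True)
    assert all_present is False and found_cols == ["name"]
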
def make_df_from_worksheet(
    self, worksheet_name: str = str(), grab_header: bool = True
) -> pandas.DataFrame:
    if not self.workbook:
        raise NoWorkbookLoadedError(
            "Workbook object seems empty, cannot turn a None object into a dataframe"
        )
    try:
        if worksheet_name:
            worksheet = self.workbook.worksheet(worksheet_name)
        else:
            worksheet_name = "default sheet"
            worksheet = self.workbook.get_worksheet(0)
        logger.debug(green("Sheet loaded successfully"))
        if grab_header:
            values: List[Any] = worksheet.get_all_values()
            check_dupe_cols(values[0])
            df = pandas.DataFrame(values[1:], columns=values[0])
        else:
            df = pandas.DataFrame(worksheet.get_all_values())
        logger.debug(yellow(f"Raw obtained google sheet: \n {df.head()}"))
        return df
    except Exception as e:
        raise SheetLoadingError(f"Error loading sheet: \n {e}")

def upload(self, df: pandas.DataFrame, override_schema: str = str()) -> None:
    # cast columns
    # !: note integer conversion doesn't actually happen; it is left as a str, see #204, #205
    df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
    dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

    # potentially override target schema from config.
    if override_schema:
        schema = override_schema
    else:
        schema = self.config.target_schema

    # write to csv and try to talk to the db
    temp = tempfile.NamedTemporaryFile()
    df.to_csv(temp.name, index=False, header=False, sep="|")
    self.acquire_connection()

    # set up schema creation
    self._create_schema()

    try:
        # set the table creation behaviour
        _if_exists = "fail"
        if self.config.project.object_creation_dct["create_table"] is True:
            if self.config.project.destructive_create_table:
                _if_exists = "replace"
            # perform the create ops
            try:
                df.head(0).to_sql(
                    name=self.config.target_table,
                    schema=schema,
                    con=self.con,
                    if_exists=_if_exists,
                    index=False,
                    dtype=dtypes_dict,
                )
            # when _if_exists is "fail" pandas throws a ValueError, which we want to escape
            # when destructive_create_table is False (or not provided) and warn instead.
            except ValueError as e:
                if _if_exists == "fail":
                    logger.warning(
                        yellow(
                            f"{self._database}"
                            f".{schema}.{self.config.target_table} already exists and was not\n"
                            "recreated because 'destructive_create_table' is set to False in "
                            "your profile.\n"
                            "APPENDING instead."
                        )
                    )
                else:
                    raise DatabaseError(str(e))

        # Now push the actual data. The pandas create above is only for table creation;
        # the stage-based copy below is faster, since pandas inserts row by row.
        # Use the (possibly overridden) schema so the copy targets the table created above.
        qualified_table = f"{self._database}.{schema}.{self.config.target_table}"
        self.con.execute(
            f"""
            create or replace temporary stage {self.config.target_table}_stg
            file_format = (type = 'CSV' field_delimiter = '|'
            skip_header = 0 field_optionally_enclosed_by = '"')
            """
        )
        self.con.execute(f"put file://{temp.name} @{self.config.target_table}_stg")
        self.con.execute(f"copy into {qualified_table} from @{self.config.target_table}_stg")
        self.con.execute(f"drop stage {self.config.target_table}_stg")
    except Exception as e:
        raise DatabaseError(str(e))
    finally:
        logger.debug("CLOSING CONNECTION & CLEANING TMP FILE")
        temp.close()
        self.close_connection()