Example #1
    def _override_gspread_default_creds(self) -> None:
        """Temporary workaround to allow `gspread.oauth()` to look for credentials in another location.

        For more info: https://github.com/burnash/gspread/issues/826
        This will likely be removed if work on gspread #826 gets carried out.
        """
        logger.debug(
            "Overriding `gspread`'s DEFAULT_AUTHORIZED_USER_FILENAME and related defaults. "
            "This is temporary (hopefully), see `GoogleSpreadsheet._override_gspread_default_creds()` "
            "docstring for more info.")
        logger.debug(
            f"Overriding to: {self._profile.google_credentials_dir}/{self._profile.profile_name}"
        )
        gspread.auth.DEFAULT_CONFIG_DIR = Path(
            self._profile.google_credentials_dir)

        gspread.auth.DEFAULT_CREDENTIALS_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
            self._profile.profile_name).with_suffix(self.CREDS_EXT)

        gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
            f"{self._profile.profile_name}_authorised_user").with_suffix(
                self.CREDS_EXT)

        gspread.auth.DEFAULT_SERVICE_ACCOUNT_FILENAME = gspread.auth.DEFAULT_CONFIG_DIR.joinpath(
            f"{self._profile.profile_name}_service_account").with_suffix(
                self.CREDS_EXT)

        # The default argument values below were bound when `gspread.auth` was
        # imported, so rebinding the module-level constants above is not enough:
        # patch the functions' `__defaults__` directly.
        gspread.auth.load_credentials.__defaults__ = (
            gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME, )

        gspread.auth.store_credentials.__defaults__ = (
            gspread.auth.DEFAULT_AUTHORIZED_USER_FILENAME,
            "token",
        )
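
The `__defaults__` patching at the end is needed because Python binds default argument values once, at function definition time. A minimal, self-contained sketch of that behavior (plain Python, not sheetwork code):

DEFAULT_PATH = "/old/location"


def load(path=DEFAULT_PATH):
    return path


# Rebinding the module-level constant does not affect the already-defined function:
DEFAULT_PATH = "/new/location"
print(load())  # still "/old/location"

# Patching __defaults__ directly, as the snippet above does, is what works:
load.__defaults__ = ("/new/location",)
print(load())  # "/new/location"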
Example #2
    def run_cleanup(self,
                    df: pandas.DataFrame) -> Tuple[bool, pandas.DataFrame]:
        clean_up = True
        # check for interactive mode
        if self.flags.interactive:
            logger.info(
                yellow(
                    "PRE-CLEANING PREVIEW: The DataFrame you would push to the database would look like this:"
                ))
            self._show_dry_run_preview(df)
            clean_up = self._collect_and_check_answer()

        if clean_up is True:
            logger.debug("Performing clean ups")
            clean_df = SheetCleaner(
                df,
                bool(self.config.sheet_config.get("snake_case_camel",
                                                  False))).cleanup()
            if self.flags.dry_run or self.flags.interactive:
                logger.info(yellow("\nPOST-CLEANING PREVIEW:"))
                self._show_dry_run_preview(clean_df)
                carry_on = self._collect_and_check_answer(post_cleanup=True)
                if not carry_on:
                    logger.info(timed_message(red("User Aborted.")))
                    sys.exit(1)
            return True, clean_df
        return True, df
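
`_collect_and_check_answer` is not shown in these examples; a hypothetical minimal sketch of such a yes/no prompt helper (names and wording are assumptions, not the project's implementation):

def _collect_and_check_answer(post_cleanup: bool = False) -> bool:
    # Hypothetical sketch only: ask the user to confirm and normalise the answer.
    question = ("Would you like to push the cleaned sheet to the database? (y/n) "
                if post_cleanup else
                "Would you like to perform clean ups on this sheet? (y/n) ")
    return input(question).strip().lower() in {"y", "yes"}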
Example #3
 def read_profile(self):
     logger.debug(f"Profile Name: {self.profile_name}")
     filename = Path(self.profile_dir, "profiles.yml")
     if filename.exists():
         yaml_dict = open_yaml(filename)
         is_valid_yaml = validate_yaml(yaml_dict, profiles_schema)
         profile = yaml_dict["profiles"].get(self.profile_name)
         if profile:
             # set target name from profile unless one was given at init from flags parse.
             if not self.target_name:
                 self.target_name = profile.get("target")
             if profile.get("outputs"):
                 target_profile = profile["outputs"].get(self.target_name)
             if target_profile and is_valid_yaml:
                 is_valid_profile = self._validate_profile(target_profile)
                 if is_valid_profile:
                     self.profile_dict = target_profile
             else:
                 raise ProfileParserError(
                     f"Error finding and entry for  target: {self.target_name}, "
                     f"under the {self.profile_name} profile.")
         else:
             raise ProfileParserError(
                 f"Could not find an entry for {self.profile_name} in your profile.yml"
             )
     else:
         raise FileNotFoundError(
             f"Could not open or find {filename.resolve()} check that it exists"
         )
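
For reference, the nested structure `read_profile` walks, inferred from the key lookups above (a hedged sketch with placeholder values, not the project's documented schema):

# Shape of the dict returned by open_yaml(), as implied by read_profile().
yaml_dict = {
    "profiles": {
        "my_project": {                      # looked up via self.profile_name
            "target": "dev",                 # default target name
            "outputs": {
                "dev": {                     # target_profile for self.target_name
                    "db_type": "snowflake",  # placeholder values; see cannot_be_none in Example #11
                    "guser": "user@example.com",
                },
            },
        },
    },
}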
Example #4
    def find_nearest_dir_and_file(
        self, yaml_file: str, current: Path = Path.cwd()) -> Tuple[Path, Path]:
        """Looks for the yaml_file you ask for.

        Starting from the current directory and walking up the directory tree
        while the iteration count is still within the max allowed.

        Args:
            yaml_file (str): Name and extension of the file to find.
            current (Path, optional): Path() objects from which to start. Defaults to Path.cwd().

        Raises:
            NearestFileNotFound: When no file that matches the required name can be found.

        Returns:
            Tuple[Path, Path]: The directory up to the file name, and the full path to the filename,
            respectively. Maybe we'll end up deprecating one of these returns down the line but for
            now it's handy.
        """
        filename = Path(current, yaml_file)
        while self.iteration < self.max_iter:
            logger.debug(f"Looking for {filename}")
            if filename.exists():
                project_dir = filename.parent
                logger.debug(f"{filename} exists and was returned")
                return project_dir, filename
            current = current.parent
            filename = Path(current, yaml_file)
            self.iteration += 1
        else:
            raise NearestFileNotFound(
                f"Unable to find {yaml_file} in the nearby directories after {self.max_iter} "
                "iterations upwards.")
Example #5
def make_dir(path: "Path"):
    """Creates a directory.

    Args:
        path (Path): Where you want it to be.
    """
    logger.debug(f"Making folder: {path}")
    path.mkdir()
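
`Path.mkdir()` with no arguments raises `FileExistsError` when the directory exists and `FileNotFoundError` when a parent is missing, which is why callers guard with `.exists()` first (see Example #8). A hedged idempotent variant, if the helper were to absorb that check itself:

from pathlib import Path


def make_dir_idempotent(path: Path) -> None:
    # Variant sketch, not the project's helper: create parents as needed and
    # don't raise if the directory already exists.
    path.mkdir(parents=True, exist_ok=True)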
Example #6
 def authenticate(self) -> None:
     if self.is_service_account:
         logger.debug("Using SERVICE_ACCOUNT auth")
         self.google_client = gspread.service_account(self.creds_path)
     else:
         logger.debug("Using END_USER auth")
         # ! This override should be temporary; ideally we'll have a more long-term solution in:
         # ! https://github.com/burnash/gspread/issues/826
         self._override_gspread_default_creds()
         self.google_client = gspread.oauth()
     self.is_authenticated = True
Example #7
def make_file(path: "Path", contents: str = str()):
    """Creates a text file with potential things in it. WOW!

    Args:
        path (Path): Where you want it to be
        contents (str, optional): What you want to put in that text file. Defaults to str().
    """
    logger.debug(f"Making file: {path}")
    path.touch()
    if contents:
        with path.open("w", encoding="utf-8") as f:
            f.write(contents)
Example #8
    def create_google_dir_and_file(self):
        self.google_path = self.profiles_path / "google"
        google_file = self.google_path / f"{self.project_name}.json"

        if not self.google_path.exists():
            make_dir(self.google_path)
        else:
            logger.debug(f"{self.google_path} already exists.")

        if not google_file.exists():
            make_file(google_file)
        else:
            logger.debug(f"{google_file} already exists.")
Example #9
    def load_sheet(self):
        """Loads a google sheet, and calls clean up steps if applicable.

        The sheet must have been shared with the account/admin email address used for authentication.

        Raises:
            TypeError: When loader does not return results that can be converted into a pandas
            DataFrame a type error will be raised.
        """
        if self.flags.sheet_name:
            logger.info(timed_message(f"Importing: {self.flags.sheet_name}"))
            logger.debug(
                f"Importing data from: {self.config.sheet_config['sheet_key']}"
            )
        else:
            logger.info(
                timed_message(
                    f"Importing data from: {self.config.sheet_config.get('sheet_key')}"
                ))
        df = self._obtain_googlesheet()
        if not isinstance(df, pandas.DataFrame):
            raise TypeError("import_sheet did not return a pandas DataFrame")
        logger.debug(f"Columns imported from sheet: {df.columns.tolist()}")

        # Perform exclusions, renamings and cleanups before releasing the sheet.
        df = self.exclude_columns(df)
        df = self.rename_columns(df)
        self.push_anyway, df = self.run_cleanup(df)
        logger.debug(f"Columns after cleanups and exclusions: {df.columns}")
        logger.debug(f"Loaded SHEET HEAD: {df}")
        self.sheet_df = df
Example #10
 def _create_schema(self) -> None:
     if self._has_connection is False:
         raise NoAcquiredConnectionError(
             f"No acquired connection for {type(self).__name__}. Make sure you call "
             "`acquire_connection` before.")
     try:
         if self.config.project.object_creation_dct["create_schema"]:
             schema_exists = (self.config.target_schema in
                              self.con.dialect.get_schema_names(self.con))
             if schema_exists is False:
                 logger.debug(
                     yellow(
                         f"Creating schema: {self.config.target_schema} in {self._database}"
                     ))
                 self.con.execute(CreateSchema(self.config.target_schema))
     except Exception as e:
         raise DatabaseError(str(e))
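
A minimal sketch of the SQLAlchemy pieces used above (placeholder DSN; SQLAlchemy 1.x-style execution, matching the snippet's use of `self.con`):

from sqlalchemy import create_engine, inspect
from sqlalchemy.schema import CreateSchema

engine = create_engine("postgresql://user:secret@localhost/analytics_db")  # placeholder
with engine.connect() as con:
    # Only create the schema if it isn't already there.
    if "analytics" not in inspect(engine).get_schema_names():
        con.execute(CreateSchema("analytics"))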
Example #11
    def __init__(self, project: Project, target_name: str = str()):
        """Profile constructor. Mainly just needs an initted Project object.

        Args:
            project (Project): initted project object
            target_name (str, optional): Mainly used in unit testing if you want to override the
                project name. Pretty useless in all other practice cases I think.
                Defaults to str().
        """
        self.profile_name = project.project_name
        self.target_name = target_name
        self.profile_dict: Dict[str, str] = dict()
        self.cannot_be_none = {"db_type", "guser"}
        self.profile_dir: Path = project.profile_dir
        self.google_credentials_dir = Path(project.profile_dir,
                                           "google").resolve()
        self.read_profile()
        logger.debug(f"PROFILE_DIR {self.profile_dir}")
        logger.debug(f"PROFILE_NAME: {self.profile_name}")
Example #12
    def override_object_creation_from_flags(self) -> None:
        if self.flags.create_table:
            logger.debug(yellow("going to create table"))
            self.object_creation_dct.update({"create_table": True})

        if self.flags.create_schema:
            logger.debug(yellow("going to create schema"))
            self.object_creation_dct.update({"create_schema": True})
        logger.debug(yellow(f"Object creation dict after override\n {self.object_creation_dct}"))

        if self.flags.destructive_create_table:
            logger.debug(yellow("going to perform destuctive table creation"))
            self.destructive_create_table = True
Example #13
def cast_pandas_dtypes(
    df: pandas.DataFrame, overwrite_dict: dict = dict()) -> pandas.DataFrame:
    """Converts a dataframe's columns along a provided dictionary of {col: dype}.

    Args:
        df (pandas.DataFrame): dataframe to cast.
        overwrite_dict (dict, optional): Dict of shape {column: dtype}. Defaults to dict().

    Raises:
        UnsupportedDataTypeError: When a dtype isn't currently supported (see dtypes_map inside function).
        ColumnNotFoundInDataFrame: When a column that is required for casting isn't found.

    Returns:
        pandas.DataFrame: df with converted dtypes
    """
    overwrite_dict = overwrite_dict.copy()
    dtypes_map = dict(
        varchar="object",
        # intentional, in case of nulls: pandas currently doesn't play well with
        # converting mixed types, see
        # https://github.com/bastienboutonnet/sheetwork/issues/204 for more details
        int="object",
        numeric="float64",
        # ! HOT_FIX: intentionally using pandas' nullable boolean dtype,
        # see https://github.com/bastienboutonnet/sheetwork/issues/288
        boolean="boolean",
        timestamp_ntz="datetime64[ns]",
        date="datetime64[ns]",  # intentional: pandas doesn't really have a date-only dtype.
    )

    # Check for type support
    unsupported_dtypes = set(overwrite_dict.values()).difference(
        dtypes_map.keys())
    if unsupported_dtypes:
        raise UnsupportedDataTypeError(
            f"{unsupported_dtypes} are currently not supported")

    # check overwrite col is in df
    invalid_columns = set(overwrite_dict.keys()).difference(
        set(df.columns.tolist()))
    if invalid_columns:
        raise ColumnNotFoundInDataFrame(
            f"{invalid_columns} not in DataFrame. Check spelling?")

    # recode dict in pandas terms
    for col, data_type in overwrite_dict.items():
        overwrite_dict.update({col: dtypes_map[data_type]})

    # cast
    logger.debug(f"DF BEFORE CASTING: {df.head()}")
    logger.debug(f"DF BEFORE CASTING DTYPES: {df.dtypes}")

    # handle booleans "manually" because .astype(bool) leads to everything being True if not null.
    df = handle_booleans(df, overwrite_dict=overwrite_dict)
    # use pandas' native function for all other data types as they are not problematic and we
    # have already handled booleans specifically.
    df = df.astype(overwrite_dict)
    logger.debug(f"Head of cast dataframe:\n {df.head()}")
    return df
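
A usage sketch (assuming `handle_booleans` is a no-op when no boolean columns are requested):

import pandas

df = pandas.DataFrame({"amount": ["1.5", "2"], "name": ["a", "b"]})
df = cast_pandas_dtypes(df, overwrite_dict={"amount": "numeric", "name": "varchar"})
print(df.dtypes)  # amount: float64, name: object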
Example #14
 def push_sheet(self):
     logger.info(timed_message("Pushing sheet to database..."))
     logger.debug(
         f"Column override dict is a {type(self.config.sheet_columns)}")
     logger.debug(f"Sheet columns: {self.config.sheet_columns}")
     logger.debug(f"Columns in final df: {self.sheet_df.columns.tolist()}")
     self.sql_adapter.upload(self.sheet_df, self.target_schema)
Example #15
 def decide_object_creation(self) -> None:
     self.handle_deprecations()
     create_everything_label = "always_create_objects"
     object_creation_mapping = {
         # ! DEPRECATE "always_create"
         "create_table": ["always_create_table", "always_create"],
         "create_schema": ["always_create_schema"],
     }
     for object_type, rule in object_creation_mapping.items():
         if self.project_dict.get(create_everything_label):
             create = [True]
         else:
             create = [True for x in rule if self.project_dict.get(x) is True]
         self.object_creation_dct.update({object_type: True in create})
     self.destructive_create_table = (
         self.project_dict.get("destructive_create_table", self.destructive_create_table)
         is True
     )
     logger.debug(yellow(f"Object creation dict:\n {self.object_creation_dct}"))
     logger.debug(yellow(str(self.project_dict)))
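
A worked example of the mapping logic above (plain dict logic, runnable on its own):

project_dict = {"always_create_schema": True}  # e.g. only schema creation enabled
object_creation_mapping = {
    "create_table": ["always_create_table", "always_create"],
    "create_schema": ["always_create_schema"],
}
object_creation_dct = {}
for object_type, rule in object_creation_mapping.items():
    create = [True for x in rule if project_dict.get(x) is True]
    object_creation_dct[object_type] = True in create
print(object_creation_dct)  # {'create_table': False, 'create_schema': True}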
Example #16
    def __init__(self, flags: FlagParser) -> None:
        """Constructs project object.

        Args:
            flags (FlagParser): Inited flags object.
        """
        self.project_dict: Dict[str, Union[str, bool]] = dict()
        self.target_schema: str = str()
        self.object_creation_dct: Dict[str, bool] = dict()
        self.destructive_create_table: bool = False
        self.flags = flags

        # directories (first overwritten by flags, then by project). This may not always be
        # workable; we might want to give priority to the CLI eventually, but for now it
        # removes some complication.
        self.project_file_fullpath: Path = Path("dumpy_path")
        self.profile_dir: Path = Path("~/.sheetwork/").expanduser()
        self.sheet_config_dir: Path = Path.cwd()

        # override defaults
        self.override_paths_from_flags()
        self.load_project_from_yaml()
        self.decide_object_creation()
        self.override_object_creation_from_flags()
        logger.debug(f"Project name: {self.project_name}")
Example #17
 def make_df_from_worksheet(self,
                            worksheet_name: str = str(),
                            grab_header: bool = True) -> pandas.DataFrame:
     if not self.workbook:
         raise NoWorkbookLoadedError(
             "Workbook object seems empty, cannot turn a None object into a dataframe"
         )
     try:
         if worksheet_name:
             worksheet = self.workbook.worksheet(worksheet_name)
         else:
             worksheet_name = "default sheet"
             worksheet = self.workbook.get_worksheet(0)
         logger.debug(green("Sheet loaded successfully"))
         if grab_header:
             values: List[Any] = worksheet.get_all_values()
             check_dupe_cols(values[0])
             df = pandas.DataFrame(values[1:], columns=values[0])
         else:
             df = pandas.DataFrame(worksheet.get_all_values())
         logger.debug(yellow(f"Raw obtained google sheet: \n {df.head()}"))
         return df
     except Exception as e:
         raise SheetLoadingError(f"Error loading sheet: \n {e}") from e
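
A hedged end-to-end sketch using gspread's documented API (the sheet key is a placeholder):

import gspread
import pandas

gc = gspread.service_account()  # reads the default service account file
workbook = gc.open_by_key("YOUR_SHEET_KEY")
values = workbook.get_worksheet(0).get_all_values()
df = pandas.DataFrame(values[1:], columns=values[0])  # first row as header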
Example #18
 def create_profiles_dir(self):
     if not self.profiles_path.exists():
         make_dir(self.profiles_path)
     else:
         logger.debug(f"{self.profiles_path} already exists.")
Example #19
 def create_profiles_file(self):
     profile_file = Path(self.profiles_path, "profiles").with_suffix(".yml")
     if not profile_file.exists():
         make_file(profile_file)
     else:
         logger.debug(f"{profile_file} already exists.")
Example #20
    def upload(self, df: pandas.DataFrame,
               override_schema: str = str()) -> None:
        # cast columns
        # !: note: integer conversion doesn't actually happen, values are left as str (see #204, #205)
        df = cast_pandas_dtypes(df, overwrite_dict=self.config.sheet_columns)
        dtypes_dict = self.sqlalchemy_dtypes(self.config.sheet_columns)

        # potentially override target schema from config.
        if override_schema:
            schema = override_schema
        else:
            schema = self.config.target_schema

        # write to csv and try to talk to db
        temp = tempfile.NamedTemporaryFile()
        df.to_csv(temp.name, index=False, header=False, sep="|")

        self.acquire_connection()

        # set up schema creation
        self._create_schema()

        try:
            # set the table creation behaviour
            _if_exists = "fail"
            if self.config.project.object_creation_dct["create_table"] is True:
                if self.config.project.destructive_create_table:
                    _if_exists = "replace"

                # perform the create ops
                try:
                    df.head(0).to_sql(
                        name=self.config.target_table,
                        schema=schema,
                        con=self.con,
                        if_exists=_if_exists,
                        index=False,
                        dtype=dtypes_dict,
                    )

                # if _if_exists is "fail", pandas will throw a ValueError, which we want to
                # swallow when destructive_create_table is set to False (or not provided),
                # logging a warning instead.
                except ValueError as e:
                    if _if_exists == "fail":
                        logger.warning(
                            yellow(
                                f"{self._database}"
                                f".{schema}.{self.config.target_table} already exists and was not\n"
                                "recreated because 'destructive_create_table' is set to False in your profile \n"
                                "APPENDING instead."))
                    else:
                        raise DatabaseError(str(e))

            # Now push the actual data. The pandas `to_sql` above is only used for table
            # creation; the stage/COPY logic below is faster since pandas inserts row by row.
            qualified_table = (
                f"{self._database}.{schema}.{self.config.target_table}"
            )
            self.con.execute(f"""
                create or replace temporary stage {self.config.target_table}_stg
                file_format = (type = 'CSV' field_delimiter = '|'
                skip_header = 0 field_optionally_enclosed_by = '"')
                """)
            self.con.execute(
                f"put file://{temp.name} @{self.config.target_table}_stg")
            self.con.execute(
                f"copy into {qualified_table} from @{self.config.target_table}_stg"
            )
            self.con.execute(f"drop stage {self.config.target_table}_stg")
        except Exception as e:
            raise DatabaseError(str(e))
        finally:
            logger.debug("CLOSING CONNECTION & CLEANING TMP FILE")
            temp.close()
            self.close_connection()
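
One portability note (general Python behavior, not specific to this project): on Windows, a `NamedTemporaryFile` cannot be re-opened by name while it is still open, so `df.to_csv(temp.name, ...)` would fail there. A more portable sketch uses delete=False with an explicit unlink:

import os
import tempfile

import pandas

df = pandas.DataFrame({"a": [1, 2]})
temp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
temp.close()  # release the handle so the file can be re-opened by name
try:
    df.to_csv(temp.name, index=False, header=False, sep="|")
    # ... PUT / COPY INTO would happen here ...
finally:
    os.unlink(temp.name)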