Example #1
import logging
import os
from datetime import datetime, timedelta
from fnmatch import fnmatch

import pandas as pd

# MSSQL, FTP, Browser, and the data_reports mapping are assumed to come from
# this project's own modules; their imports are omitted here.


class Connector:
    """ETL connector class"""
    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(
                f"Clever_{table_name} is up to date. No records inserted.")
            return
        else:
            file_names = self._generate_file_names(start_date, yesterday,
                                                   report_name)
            df = self._read_and_concat_files(file_names)
            self.sql.insert_into(f"Clever_{table_name}",
                                 df,
                                 if_exists="append")
            logging.info(
                f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # one file name per day, through yesterday inclusive
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def sync_student_google_accounts(self):
        """Get student emails from Google Accounts Manager app."""
        browser = Browser(self.data_dir)
        browser.export_student_google_accounts()
        # Transform and load csv data into database table
        df = self._get_data_from_csv_by_name("Student_export")
        df.rename(columns={"ID": "SIS_ID"}, inplace=True)
        self.sql.insert_into("Clever_StudentGoogleAccounts",
                             df,
                             if_exists="replace")
        logging.info(
            f"Inserted {len(df)} new records into Clever_StudentGoogleAccounts."
        )

    def _get_data_from_csv_by_name(self, string_to_match):
        """Find the downloaded csv by name and load it into a DataFrame."""
        for filename in os.listdir(self.data_dir):
            if fnmatch(filename, f"*{string_to_match}*"):
                file_path = f"{self.data_dir}/{filename}"
                break
        else:
            # Without this guard, file_path would be undefined below when no file matches.
            raise FileNotFoundError(
                f"No file matching '*{string_to_match}*' in '{self.data_dir}'.")
        df = pd.read_csv(file_path)
        logging.info(f"Loaded {len(df)} records from downloaded file.")
        return df
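
A minimal usage sketch for the class above (an assumed entry point, not part of the original): it relies on the project-specific MSSQL, FTP, and Browser helpers and on the data_reports mapping of warehouse table names to FTP directory names.

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    connector = Connector()
    connector.sync_all_ftp_data()             # refresh the datestamped FTP reports
    connector.sync_student_google_accounts()  # refresh student Google account emails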
Example #2
import logging
from datetime import datetime, timedelta

import pandas as pd

# MSSQL, FTP, and the data_reports mapping are assumed to come from this
# project's own modules; their imports are omitted here.


class Connector:
    """ETL connector class"""
    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        if report_name == "idm-reports":
            # This folder contains the student emails file, which has no datestamp in its file name.
            self._process_files_without_datestamp(table_name, report_name)
        else:
            self._process_files_with_datestamp(table_name, report_name)

    def _process_files_without_datestamp(self, table_name, report_name):
        # The student emails file has no datestamp in its file name,
        # so this table is truncated and replaced rather than appended to.
        df = self._read_file(f"{self.data_dir}/google-student-emails.csv")
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="replace")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _process_files_with_datestamp(self, table_name, report_name):
        # Generate names for files with datestamps in the file name and process those files
        # These tables should be appended to, not truncated.
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(
                f"Clever_{table_name} is up to date. No records inserted.")
            return
        else:
            file_names = self._generate_file_names(start_date, yesterday,
                                                   report_name)
            df = self._read_and_concat_files(file_names)
            self.sql.insert_into(f"Clever_{table_name}",
                                 df,
                                 if_exists="append")
            logging.info(
                f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # one file name per day, through yesterday inclusive
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def _read_file(self, file_name):
        df = pd.read_csv(file_name)
        logging.info(f"Read {len(df)} records from '{file_name}'.")
        return df
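
In Example #2, _load_new_records_into_table only routes to one of two helpers, so the branching can be checked without a database or FTP connection. A pytest-style sketch of that check, assuming the test sits next to the class (otherwise import Connector from its module); the "Attendance"/"daily-attendance" names are made up for illustration, while "idm-reports" and "StudentGoogleAccounts" come from the code above.

from unittest.mock import MagicMock

def test_load_new_records_routing():
    conn = Connector.__new__(Connector)  # skip __init__ so no MSSQL/FTP setup is needed
    conn._process_files_without_datestamp = MagicMock()
    conn._process_files_with_datestamp = MagicMock()

    # The student-emails folder takes the truncate-and-replace path.
    conn._load_new_records_into_table("StudentGoogleAccounts", "idm-reports")
    conn._process_files_without_datestamp.assert_called_once_with(
        "StudentGoogleAccounts", "idm-reports")

    # Any other report takes the datestamped, append-only path.
    conn._load_new_records_into_table("Attendance", "daily-attendance")
    conn._process_files_with_datestamp.assert_called_once_with(
        "Attendance", "daily-attendance")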