import pandas as pd
from sqlsorcery import MSSQL


def test_this_works():
    config = {
        "server": "db",
        "db": "master",
        "user": "******",
        "pwd": "Just4TestingThis",
        "schema": "dbo",
    }
    sql = MSSQL(**config)
    data = [{"id": 1, "value": "test1"}, {"id": 2, "value": "test2"}]
    df = pd.DataFrame(data)
    sql.insert_into("test_table", df)
    results = pd.read_sql_table("test_table", con=sql.engine, schema=sql.schema)
    assert len(results) == 2
import pandas as pd
from sqlalchemy.exc import ProgrammingError
from sqlsorcery import MSSQL


def migrate_mssql():
    try:
        sql = MSSQL()
        # Tables
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationData_changehistory.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationData_raw_backup.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationData_raw.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationDataIndex_raw_backup.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationDataIndex_raw.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ApplicationStatuses.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_FactDailyStatus.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_lk_Enrollment.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_ProgressMonitoring.sql")
        sql.exec_cmd_from_file("sql/mssql/tables/schoolmint_SchoolCodes.sql")
        # Load lookup tables
        enrollments = pd.read_csv("sql/data/lk_enrollment.csv")
        sql.insert_into("schoolmint_lk_Enrollment", enrollments)
        application_statuses = pd.read_csv("sql/data/application_statuses.csv")
        sql.insert_into("schoolmint_ApplicationStatuses", application_statuses)
        # Views
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_AppStatusList.sql")
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_FactDailyStatus_InterimTargets.sql")
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_FactDailyStatus.sql")
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_ProgressMonitoring.sql")
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_FactProgressMonitoring.sql")
        sql.exec_cmd_from_file("sql/mssql/views/vw_schoolmint_Index_Demographics.sql")
        # Stored Procedures
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Create_ChangeTracking_Entries.sql")
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Create_FactDailyStatus.sql")
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Index_PostProcess.sql")
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Index_PrepareTables.sql")
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Raw_PostProcess.sql")
        sql.exec_cmd_from_file("sql/mssql/sprocs/sproc_schoolmint_Raw_PrepareTables.sql")
    except ProgrammingError as e:
        if "Cannot open database" in str(e):
            print("ERROR: First create your database and schema manually")
        else:
            raise  # don't silently swallow unrelated errors
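# A minimal entrypoint sketch (an assumption, not part of the original
# migration module): MSSQL() with no arguments reads its connection settings
# from environment variables, so once those are exported the migration can be
# run directly as a script.
if __name__ == "__main__":
    migrate_mssql()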
import urllib.request
import zipfile

import pandas as pd
from sqlsorcery import MSSQL

# Download 2019 SBAC Research Files
url = "http://caaspp-elpac.cde.ca.gov/caaspp/researchfiles/sb_ca2019_1_csv_v2.zip"
urllib.request.urlretrieve(url, "sb_ca2019_1_csv_v2.zip")

# Extract flat files from zip folder
with zipfile.ZipFile("sb_ca2019_1_csv_v2.zip", "r") as z:
    z.extractall()

# Read datasets into dataframes
entities = pd.read_csv("sb_ca2019entities_csv.txt", encoding="ISO-8859-1")
scores = pd.read_csv("sb_ca2019_1_csv_v2.txt", encoding="ISO-8859-1")

# Establish database connection
sql = MSSQL()

# Insert SBAC files into database tables
sql.insert_into("sbac_entities_2019", entities, chunksize=1000)
sql.insert_into("sbac_scores_2019", scores, chunksize=1000)

# Query district scores for Oakland and San Francisco
df = sql.query_from_file("query.sql")
print(df)
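# The contents of query.sql aren't shown above. As a hedged sketch, an inline
# equivalent via sql.query() might join the two tables loaded above on their
# district identifiers. The table names come from the inserts; the column and
# district names here are hypothetical and would need to match the actual
# SBAC research file layout.
df = sql.query(
    """
    SELECT s.*
    FROM sbac_scores_2019 s
    JOIN sbac_entities_2019 e ON e.[District Code] = s.[District Code]
    WHERE e.[District Name] IN ('Oakland Unified', 'San Francisco Unified')
    """
)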
import logging
import os
from datetime import datetime, timedelta
from fnmatch import fnmatch

import pandas as pd
from sqlsorcery import MSSQL

# FTP, Browser, and the data_reports mapping are defined elsewhere in the project.


class Connector:
    """ETL connector class"""

    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(f"Clever_{table_name} is up to date. No records inserted.")
            return
        else:
            file_names = self._generate_file_names(start_date, yesterday, report_name)
            df = self._read_and_concat_files(file_names)
            self.sql.insert_into(f"Clever_{table_name}", df, if_exists="append")
            logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # loop day by day up through yesterday
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def sync_student_google_accounts(self):
        """Get student emails from Google Accounts Manager app."""
        browser = Browser(self.data_dir)
        browser.export_student_google_accounts()
        # Transform and load csv data into database table
        df = self._get_data_from_csv_by_name("Student_export")
        df.rename(columns={"ID": "SIS_ID"}, inplace=True)
        self.sql.insert_into("Clever_StudentGoogleAccounts", df, if_exists="replace")
        logging.info(f"Inserted {len(df)} new records into Clever_StudentGoogleAccounts.")

    def _get_data_from_csv_by_name(self, string_to_match):
        """Get the downloaded csv BY NAME and store it in a dataframe."""
        file_path = None
        for filename in os.listdir(self.data_dir):
            if fnmatch(filename, f"*{string_to_match}*"):
                file_path = f"{self.data_dir}/{filename}"
                break
        if file_path is None:
            # Guard against an unbound file_path when no download matched.
            raise FileNotFoundError(f"No downloaded file matches '{string_to_match}'.")
        df = pd.read_csv(file_path)
        logging.info(f"Loaded {len(df)} records from downloaded file.")
        return df
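# A hedged usage sketch (not part of the original module): running both sync
# jobs from a script entrypoint. This assumes MSSQL() and FTP() pick up their
# credentials from the environment, as in the other examples here.
if __name__ == "__main__":
    connector = Connector()
    connector.sync_all_ftp_data()
    connector.sync_student_google_accounts()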
import logging
import os

import numpy as np
import pandas as pd
from sqlsorcery import MSSQL

# FTP is a project helper that wraps the FTP connection.


class Connector:
    """
    Data connector for Extracting data, Transforming into dataframes,
    and Loading into a database.
    """

    def __init__(self):
        self.sql = MSSQL()
        self.ftp = FTP()
        self.localdir = "files"
        self.remotedir = "seis"
        self.table_prefix = "SEIS"
        self.schools = self.ftp.get_directory_names(self.remotedir)

    def remove_local_files(self):
        """Remove any leftover files from the local project directory."""
        filelist = [f for f in os.listdir(self.localdir)]
        for filename in filelist:
            if "gitkeep" not in filename:
                os.remove(os.path.join(self.localdir, filename))

    def get_files_from_ftp(self):
        """Loop through all sub-folders and download files from FTP."""
        self.remove_local_files()
        self.ftp.download_all(self.remotedir, self.localdir)
        self.filenames = [f for f in os.listdir(self.localdir) if f.endswith(".csv")]
        logging.info(f"{len(self.filenames)} files downloaded.")

    def read_files_into_df(self, file_name):
        """
        Given the file name (eg. Student or Service), read the files
        and concat into one DataFrame.

        Params:
            file_name (str): name of the file that you are trying to combine.

        Return:
            DataFrame: combined data from all files with the same name
            (ie. same type of data)
        """
        dfs = []
        for school in self.schools:
            path = os.path.join(self.localdir, f"{school}_{file_name}.csv")
            df = pd.read_csv(
                path, sep=",", quotechar='"', doublequote=True, dtype=str, header=0
            )
            dfs.append(df)
        merged = pd.concat(dfs)
        merged.replace(np.nan, "", regex=True, inplace=True)
        return merged

    def insert_df_into_db(self, df, table_name):
        """
        Insert DataFrame into database with given table name.

        Params:
            df (DataFrame): data to insert into the database.
            table_name (str): name of the database table that you want to update.

        Return:
            none
        """
        table = f"{self.table_prefix}_{table_name}"
        self.sql.insert_into(table, df, if_exists="replace")
        logging.info(f"Inserted {len(df)} records into {table}.")
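# A hedged usage sketch (an assumption, not shown in the original): pulling
# the SEIS exports and loading the Student file end to end. "Student" is one
# of the file names the docstring above mentions.
if __name__ == "__main__":
    connector = Connector()
    connector.get_files_from_ftp()
    students = connector.read_files_into_df("Student")
    connector.insert_df_into_db(students, "Student")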
import logging

from sqlsorcery import MSSQL


def insert_table(df, tablename):
    """Drop and replace database table with each run"""
    df_len = len(df.index)
    sql = MSSQL()
    sql.insert_into(tablename, df, if_exists="replace")
    logging.info(f"Loaded {df_len} students into {tablename} table")
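# A minimal usage sketch, assuming a students DataFrame built upstream;
# the column names and table name here are hypothetical.
import pandas as pd

students = pd.DataFrame([{"student_id": 1, "grade": 9}, {"student_id": 2, "grade": 10}])
insert_table(students, "enrolled_students")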
import logging
from datetime import datetime, timedelta

import pandas as pd
from sqlsorcery import MSSQL

# FTP and the data_reports mapping are defined elsewhere in the project.


class Connector:
    """ETL connector class"""

    def __init__(self):
        self.data_dir = "data"
        self.sql = MSSQL()
        self.ftp = FTP(self.data_dir)

    def sync_all_ftp_data(self):
        for table_name, directory_name in data_reports.items():
            self.ftp.download_files(directory_name)
            self._load_new_records_into_table(table_name, directory_name)

    def _load_new_records_into_table(self, table_name, report_name):
        """Find and insert new records into the data warehouse."""
        if report_name == "idm-reports":
            # This folder contains the student emails file, which has no datestamp in the file name.
            self._process_files_without_datestamp(table_name, report_name)
        else:
            self._process_files_with_datestamp(table_name, report_name)

    def _process_files_without_datestamp(self, table_name, report_name):
        # The Student Emails file doesn't contain a datestamp in the file name.
        # This table should be truncated and replaced.
        df = self._read_file(f"{self.data_dir}/google-student-emails.csv")
        self.sql.insert_into(f"Clever_{table_name}", df, if_exists="replace")
        logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _process_files_with_datestamp(self, table_name, report_name):
        # Generate names for files with datestamps in the file name and process those files.
        # These tables should be appended to, not truncated.
        start_date = self._get_latest_date(table_name) + timedelta(days=1)
        yesterday = datetime.today() - timedelta(days=1)
        if start_date > yesterday:
            logging.info(f"Clever_{table_name} is up to date. No records inserted.")
            return
        else:
            file_names = self._generate_file_names(start_date, yesterday, report_name)
            df = self._read_and_concat_files(file_names)
            self.sql.insert_into(f"Clever_{table_name}", df, if_exists="append")
            logging.info(f"Inserted {len(df)} records into Clever_{table_name}.")

    def _get_latest_date(self, table_name):
        """Get the latest date record in this table."""
        date = self.sql.query(
            f"SELECT TOP(1) [date] FROM custom.Clever_{table_name} ORDER BY [date] DESC"
        )
        latest_date = date["date"][0]
        return datetime.strptime(latest_date, "%Y-%m-%d")

    def _generate_file_names(self, start_date, yesterday, report_name):
        file_names = []
        while start_date <= yesterday:  # loop day by day up through yesterday
            formatted_date = start_date.strftime("%Y-%m-%d")
            file_names.append(f"{formatted_date}-{report_name}-students.csv")
            start_date += timedelta(days=1)
        return file_names

    def _read_and_concat_files(self, file_names):
        dfs = []
        for file_name in file_names:
            df = pd.read_csv(f"{self.data_dir}/{file_name}")
            logging.info(f"Read {len(df)} records from '{file_name}'.")
            dfs.append(df)
        data = pd.concat(dfs)
        return data

    def _read_file(self, file_name):
        df = pd.read_csv(file_name)
        logging.info(f"Read {len(df)} records from '{file_name}'.")
        return df
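# Both connector classes above iterate over a data_reports mapping that isn't
# shown. As a hedged sketch, it is presumably a dict of warehouse table name
# to FTP directory name; "idm-reports" appears in the code above, while the
# other entry here is hypothetical.
data_reports = {
    "StudentGoogleAccounts": "idm-reports",
    "DailyParticipation": "daily-participation",
}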
import pandas as pd
from sqlsorcery import MSSQL

# file and table are expected to be defined elsewhere in the script;
# a hedged sketch follows below.


def main():
    df = pd.read_csv(file, sep=",")
    sql = MSSQL()
    sql.insert_into(table, df)
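# A hedged invocation sketch; the file path and table name are hypothetical
# stand-ins for whatever the surrounding script actually defines.
file = "data/students.csv"
table = "students"

if __name__ == "__main__":
    main()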