def __init__(self):
    """Load both RIC company fact tables and start with an empty record list.

    ``MDCReport`` receives the rows read from ``MDCReport.BAPQ.FactRICCompany``
    and ``MaRSDataCatalyst`` the rows read from
    ``MaRSDataCatalyst.Reporting.FactRICCompanyData``. Each query result is
    passed through ``common.df_list`` before it is stored.
    """
    # NOTE(review): this query selects DataSource while the one below selects
    # DataSourceID - confirm the difference is intentional.
    mdc_report_query = (
        'SELECT RICCompanyDataID, CompanyID,DataSource,BatchID,DateID,AdvisoryServicesHours,'
        'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, '
        'HighPotential,SocialEnterprise '
        'FROM MDCReport.BAPQ.FactRICCompany'
    )
    catalyst_query = (
        'SELECT RICCompanyDataID, CompanyID,DataSourceID,BatchID,DateID,AdvisoryServicesHours,'
        'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, '
        'HighPotential,SocialEnterprise FROM MaRSDataCatalyst.Reporting.FactRICCompanyData'
    )

    mdc_report_frame = db.pandas_read(mdc_report_query)
    self.MDCReport = common.df_list(mdc_report_frame)

    catalyst_frame = db.pandas_read(catalyst_query)
    self.MaRSDataCatalyst = common.df_list(catalyst_frame)

    self.records = []
def save_organization_detail(self, uuid, json_properties):
    """Insert the detail row for one organization unless it is already stored.

    The existence check runs ``sql_org_detail_exists`` formatted with ``uuid``.
    When it returns no rows, bookkeeping keys are added to ``json_properties``
    (the caller's mapping is modified in place), the values are projected onto
    ``self.org_columns``, rendered into ``sql_org_short_insert`` and executed.
    When a row already exists, only a message is printed.

    Args:
        uuid: Organization UUID; used for the lookup and stored as ``org_uuid``.
        json_properties: Mapping of organization properties. It is mutated.
    """
    # print('{}. UUID: {}'.format(self.i, uuid))
    existing = self.db.pandas_read(self.enum.SQL.sql_org_detail_exists.value.format(uuid))
    if len(existing) != 0:
        # .get() keeps this log line from raising when the payload has no 'name'.
        print('[{}] exists.'.format(json_properties.get('name')))
        return

    json_properties['org_uuid'] = uuid
    # NOTE(review): hard-coded batch id - confirm it is still the intended batch.
    json_properties['batch'] = 3862
    json_properties['company_id'] = None
    json_properties['BasicName'] = None
    json_properties['fetched'] = 0

    df_properties = pd.DataFrame([json_properties], columns=self.org_columns)
    row = CM.df_list(df_properties)[0]

    val = []
    for item in row:
        if isinstance(item, list):
            # Lists are flattened into one string with no separator.
            val.append(''.join(str(x) for x in item))
        elif isinstance(item, str):
            # Line breaks and parentheses are rewritten after sql_compliant().
            val.append(
                self.common.sql_compliant(item)
                .replace('\r', ' ')
                .replace('\n', ' ')
                .replace('(', ' - ')
                .replace(')', ''))
        elif item is None:
            val.append(self.common.sql_compliant(''))
        else:
            val.append(item)

    # The statement template is filled with the repr of a Python tuple.
    sql_insert = self.enum.SQL.sql_org_short_insert.value.format(tuple(val))
    # NOTE: these replacements run over the whole statement text, so a text
    # value that contains 'True', 'False' or a double quote is rewritten too.
    sql_insert = sql_insert.replace('True', '1').replace('False', '0').replace('"', "'")
    self.db.execute(sql_insert)
def move_annual_company_data(self):
    """Link every BAP annual company row to a DimCompany record.

    Rows from ``BAP.AnnualCompanyData`` are matched to ``Reporting.DimCompany``
    on the value returned by ``CM.get_basic_name``. A matched row is updated
    with the first matching ``CompanyID``; an unmatched row gets a new
    DimCompany record (seed + 1) and is then updated with that new id.
    A summary of matched / created counts is printed at the end.
    """
    # Dedicated counters: the original reused ``i`` as the iterrows() loop
    # variable, so the reported "exists" count was overwritten on every row.
    matched_count, created_count = 0, 0

    dfac = db.pandas_read('SELECT ID, BatchID, CompanyID,[Company Name] FROM BAP.AnnualCompanyData')
    dfdc = db.pandas_read('SELECT CompanyID, CompanyName FROM Reporting.DimCompany')

    # Plain lists instead of apply(axis=1): same values per row, and assigning
    # the column also works when a frame has no rows.
    dfac['BasicName'] = [CM.get_basic_name(name) for name in dfac['Company Name']]
    dfdc['BasicName'] = [CM.get_basic_name(name) for name in dfdc['CompanyName']]

    for _, c in dfac.iterrows():
        dfc = dfdc[dfdc['BasicName'] == c.BasicName]
        if len(dfc) > 0:
            matched_count += 1
            update_sql = sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID)
            db.execute(update_sql)
            print(update_sql)
        else:
            created_count += 1
            print(sql.sql_dim_company_insert.value)
            new_com_id = self.batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1
            # strftime always emits six microsecond digits, so trimming three
            # leaves milliseconds; str(datetime) omits the fraction when the
            # microsecond is 0 and the trim would then cut into the seconds.
            timestamp = dt.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            val = {
                'CompanyID': new_com_id,
                'Company Name': c['Company Name'],
                'Description': None,
                'Phone': None,
                'Phone2': None,
                'Fax': None,
                'Email': None,
                'Website': None,
                'CompanyType': None,
                'BatchID': c.BatchID,
                'ModifiedDate': timestamp,
                'CreatedDate': timestamp,
            }
            df = pd.DataFrame([val], columns=val.keys())
            values = CM.df_list(df)
            db.bulk_insert(sql.sql_dim_company_insert.value, values)
            db.execute(sql.sql_annual_comapny_data_update.value.format(new_com_id, c.ID))
            # NOTE(review): dfdc is not refreshed after an insert, so a second
            # annual row with the same basic name creates another DimCompany
            # record - confirm whether that is intended.

    print('{} exists and {} doesn not exist'.format(matched_count, created_count))
def push_entity_to_db(self, json, org_uuid, sql_insert, uuid, i=0, fk_uuid='org_uuid', columns=None):
    """Extract one entity's properties from an API payload and insert them.

    The properties are taken from ``json`` directly when it has a properties
    key, otherwise from its single item (cardinality ``OneToOne``) or from
    item ``i`` (cardinality ``OneToMany``). Missing ``uuid`` and foreign-key
    entries are filled in on the properties mapping itself before insertion.

    Args:
        json: Payload mapping for the entity or relationship.
        org_uuid: Value stored under ``fk_uuid`` when the properties lack it.
        sql_insert: Insert statement handed to ``db.bulk_insert``.
        uuid: Value stored under ``'uuid'`` when the properties lack it.
        i: Index of the item to use for ``OneToMany`` payloads.
        fk_uuid: Name of the foreign-key property.
        columns: Optional column subset (and order) to insert; ``None`` or an
            empty sequence keeps every property.

    Errors are printed rather than raised, so one bad entity does not stop
    the caller.
    """
    try:
        json_properties = None
        if CBDict.properties.value in json:
            json_properties = json[CBDict.properties.value]
        elif json[CBDict.cardinality.value] == 'OneToOne':
            json_properties = json[CBDict.item.value][CBDict.properties.value]
        elif json[CBDict.cardinality.value] == 'OneToMany':
            json_properties = json[CBDict.items.value][i][CBDict.properties.value]

        if json_properties is None:
            # Previously this surfaced only as an AttributeError on None.
            print('No properties found for entity {}; nothing inserted.'.format(uuid))
            return

        json_properties.setdefault('uuid', uuid)
        json_properties.setdefault(fk_uuid, org_uuid)
        # print(list(json_properties.keys()))

        df_properties = pd.DataFrame([json_properties], columns=json_properties.keys())
        if columns is not None and len(columns) > 0:
            df_properties = df_properties[columns]

        val = []
        for item in CM.df_list(df_properties)[0]:
            if isinstance(item, list):
                val.append(' , '.join(str(x) for x in item))
            elif isinstance(item, str):
                val.append(self.common.sql_compliant(item))
            else:
                val.append(item)
        db.bulk_insert(sql_insert, [val])
    except Exception as ex:
        # Deliberate best-effort: report the problem and carry on.
        print(ex)
def save_data_chunk(df, sql_insert, chunk_size=1000, capture_fails=False, fail_path_key=''):
    """Bulk-insert ``df`` in consecutive slices of at most ``chunk_size`` rows.

    Args:
        df: DataFrame whose rows are inserted.
        sql_insert: Insert statement handed to ``DB.bulk_insert``.
        chunk_size: Maximum number of rows per insert call; must be >= 1.
        capture_fails: When true, ``DB.bulk_insert`` is asked for a status
            message and a chunk reported as ``'FAILURE'`` is handled below.
        fail_path_key: Path key passed to ``Common.save_as_excel`` for failed
            chunks; with an empty key the failure is only reported.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (the loop would never
            advance).
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # Use the real row count. The original used len(df) + 1, which sent one
    # extra, empty chunk whenever the length was a multiple of chunk_size
    # (including an empty frame).
    total_size = len(df)
    for start in range(0, total_size, chunk_size):
        stop = min(start + chunk_size, total_size)
        now = int(round(time.time() * 1000))  # millisecond stamp for the fail file name
        print('From {} to {}'.format(start, stop))
        df_insert = df.iloc[start:stop]
        values = Common.df_list(df_insert)
        if capture_fails:
            msg = DB.bulk_insert(sql_insert, values, rtrn_msg=True)
            if msg == 'FAILURE':
                filename = '{}_fail_chunk_{}_to_{}.xlsx'.format(now, start, stop)
                if fail_path_key != '':
                    Common.save_as_excel(dfs=[df_insert], file_name=filename, path_key=fail_path_key)
                    print("\tCHUNK FAILED. SAVED TO {}".format(filename))
                else:
                    # Never let a failed chunk pass silently.
                    print("\tCHUNK FAILED. NOT SAVED (no fail_path_key given)")
        else:
            DB.bulk_insert(sql_insert, values)
        print('-' * 150)
def push_bap_missing_data_to_temp_table():
    """Load the combined BAP missing-data workbook into its temp table.

    Changes the working directory to the FY18 Q3 "Missing data Reports"
    folder under the current user's Box Sync directory, reads the
    ``'BAP Missing data'`` sheet, sets ``CompanyID`` to 0 for every row and
    bulk-inserts the eight selected columns into
    ``BAP.BAP_FY18Q3_Missing_Data``.
    """
    # The folder is now relative to the home directory. The original passed an
    # absolute '/Users/mnadew/...' component to os.path.join, which discards
    # the expanduser() result, so only that one account's path could resolve.
    current_path = os.path.join(
        os.path.expanduser("~"),
        'Box Sync/Workbench/BAP/BAP_FY18/FY18_Q3/for ETL/Missing data Reports')
    # The working-directory change is kept as in the original.
    os.chdir(current_path)

    df = pd.read_excel('00 BAP Missing data Combined.xlsx', 'BAP Missing data')
    df['CompanyID'] = 0
    new_col = ['CompanyID', 'CompanyName', 'BasicName', 'Website', 'AnnualRevenue',
               'NumberOfEmployees', 'FundingToDate', 'DataSource']
    dfs = df[new_col]

    # One placeholder per selected column.
    insert_sql = 'INSERT INTO BAP.BAP_FY18Q3_Missing_Data VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
    values = COM.df_list(dfs)
    db.bulk_insert(insert_sql, values)
def transfer_fact_ric_company_data():
    """Copy the FY Q4 BAP fact data into the FactRICCompany insert target.

    Reads the rows selected by ``sql_bap_fact_ric_data_fyq4``, passes them
    through ``BapQuarterly.get_proper_values``, sets the ``Age`` column to
    ``None`` and bulk-inserts the result with
    ``sql_bap_fact_ric_company_insert``.
    """
    raw_frame = db.pandas_read(sql.sql_bap_fact_ric_data_fyq4.value)
    fact_frame = BapQuarterly.get_proper_values(raw_frame)

    # Disabled steps, kept for reference:
    # BapQuarterly.update_month_year(df_frc)
    # df_frc['IntakeDate'] = pd.to_datetime(df_frc['IntakeDate'])
    fact_frame['Age'] = None
    # df_frc['Date of Incorporation'] = pd.to_datetime(df_frc['Date of Incorporation'])
    # df_ric = df_frc.drop(columns=['ID', 'Incorporate year (YYYY)', 'Incorporation month (MM)'])
    # BapQuarterly.file.save_as_csv(df_frc, '00 FactRICCompany.xlsx', os.getcwd(), 'FactRICCompany')

    rows = COM.df_list(fact_frame)
    db.bulk_insert(sql.sql_bap_fact_ric_company_insert.value, rows)
def update(self, table_name, source_id_col, company_id_col):
    """Write matched company ids from ``self.source_table`` back to a table.

    For every row of ``self.source_table`` whose first element equals the
    string form of a ``source_id_col`` value read from ``table_name``, one
    UPDATE sets ``company_id_col`` to the row's second element for that
    source id. Only the first matching table row (in read order) is used.

    Args:
        table_name: Table to read from and update.
        source_id_col: Column holding the source identifier.
        company_id_col: Column that receives the company id.
    """
    etl = common.df_list(
        db.pandas_read('SELECT ' + source_id_col + ',' + company_id_col + ' FROM ' + table_name))

    # Index the table rows once by the string form of their source id.
    # setdefault keeps the first occurrence, which is the row a front-to-back
    # scan with ``break`` would have stopped at.
    first_by_key = {}
    for row in etl:
        first_by_key.setdefault(str(row[0]), row[0])

    for source_row in self.source_table:
        key = source_row[0]
        if key not in first_by_key:
            continue
        # NOTE(review): identifiers and values are spliced into the statement
        # unquoted and unescaped - confirm the ids are numeric and the
        # table/column names are trusted.
        db.execute('UPDATE ' + table_name
                   + ' SET ' + company_id_col + ' = ' + str(source_row[1])
                   + ' WHERE ' + source_id_col + ' = ' + str(first_by_key[key]))
def nomatch_create_new(self):
    """Insert unmatched new ventures (negative IDs) into the Venture table.

    Rows of ``ProcessedVenture`` with an ID below zero that does not appear in
    ``EntityMap`` are bulk-inserted into ``Venture``. When a source table is
    configured, its ``ID`` column is then updated from ``Venture`` by joining
    on ``Name``.
    """
    candidates_frame = db.pandas_read(
        "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID NOT IN "
        "(SELECT ID FROM MDC_DEV.dbo.EntityMap) AND a.ID < 0 ")
    new_ventures = common.df_list(candidates_frame)

    insert_stmt = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    db.bulk_insert(insert_stmt, new_ventures)

    # Nothing to synchronise when no source table was given.
    if self.source_table is None:
        return

    # Bring the source table's IDs in line with the Venture table.
    sync_stmt = ('UPDATE ' + self.source_table
                 + ' SET ID = b.ID FROM ' + self.source_table
                 + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name')
    db.execute(sync_stmt)
def insert_dim_company_source(self, new_company):
    """Insert one ``Reporting.DimCompanySource`` row for ``new_company``.

    The new source id is the current table seed plus one and is stored on
    ``self.dim_company_source_id``; the row is linked to
    ``self.dim_company_id``.

    Args:
        new_company: Mapping that provides ``'Name'``, ``'DataSource'`` and
            ``'BatchID'``.

    Errors are printed rather than raised.
    """
    try:
        # strftime always emits six microsecond digits, so trimming three
        # leaves milliseconds. str(datetime) omits the fraction when the
        # microsecond is 0, and trimming that would cut into the seconds.
        date_time = dt.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        self.dim_company_source_id = self.get_table_seed('Reporting.DimCompanySource', 'SourceCompanyID') + 1
        # NOTE(review): the letter prefixes look like they were meant to fix
        # the column order; two keys share the 'e' prefix - confirm the order
        # below matches the INSERT statement.
        dc = {
            'aSourceID': self.dim_company_source_id,
            'bCompanyID': self.dim_company_id,
            'cName': new_company['Name'],
            'dSCC': None,
            'eDataSource': new_company['DataSource'],
            'eBatchID': new_company['BatchID'],
            'fCT': None,
            'gModified': date_time,
            'hCreated': date_time,
        }
        df = pd.DataFrame.from_dict([dc], orient='columns')
        values = CM.df_list(df)
        db.bulk_insert(sql.sql_dim_company_source_insert.value, values)
    except Exception as ex:
        # Deliberate best-effort: report the problem and carry on.
        print(ex)
def insert_dim_company(self, new_company):
    """Insert one ``Reporting.DimCompany`` row for ``new_company``.

    The new company id is the current table seed plus one and is stored on
    ``self.dim_company_id``.

    Args:
        new_company: Mapping that provides ``'Name'``, ``'Website'`` and
            ``'BatchID'``.

    Errors are printed rather than raised.
    """
    try:
        self.dim_company_id = self.get_table_seed('Reporting.DimCompany', 'CompanyID') + 1
        # strftime always emits six microsecond digits, so trimming three
        # leaves milliseconds. str(datetime) omits the fraction when the
        # microsecond is 0, and trimming that would cut into the seconds.
        date_time = dt.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        # Key order is the column order handed to the insert statement.
        dc = {
            'aCompanyID': self.dim_company_id,
            'bName': new_company['Name'],
            'cDescription': None,
            'dPhone': None,
            'ePhone2': None,
            'fFax': None,
            'gEmail': None,
            'hWebsite': new_company['Website'],
            'iCompanyType': None,
            'jBatchID': new_company['BatchID'],
            'kModifiedDate': date_time,
            'lCreatedDate': date_time,
        }
        df = pd.DataFrame.from_dict([dc], orient='columns')
        values = CM.df_list(df)
        db.bulk_insert(sql.sql_dim_company_insert.value, values)
    except Exception as es:
        # Deliberate best-effort: report the problem and carry on.
        print(es)
def fp_create_new(self):
    """Create Venture rows for new ventures flagged as matching false positives.

    Rows of ``ProcessedVenture`` with a negative ID that appears in
    ``MatchingFalsePositives`` are bulk-inserted into ``Venture``. The
    ``ID`` and ``FalseID`` columns of ``MatchingFalsePositives`` are then
    updated from ``Venture`` by joining on ``Name`` and ``FalseName``. When a
    source table is configured, its ``ID`` column is updated the same way.
    """
    new_ventures = common.df_list(
        db.pandas_read(
            "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID IN "
            "(SELECT ID FROM MDC_DEV.dbo.MatchingFalsePositives) AND a.ID < 0"
        ))
    sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    db.bulk_insert(sql, new_ventures)

    # Update MFP with the new ventures' new IDs.
    db.execute(
        "UPDATE MDC_DEV.dbo.MatchingFalsePositives SET ID = a.ID "
        "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.Name = a.Name"
    )
    db.execute(
        "UPDATE MDC_DEV.dbo.MatchingFalsePositives SET FalseID = a.ID "
        "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.FalseName = a.Name"
    )

    # Update the source table with the new IDs.
    if self.source_table is not None:
        # The leading space before AS is required: without it the keyword is
        # fused onto the table name and the statement names the wrong table.
        sql = ('UPDATE ' + self.source_table
               + ' SET ID = b.ID FROM ' + self.source_table
               + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name')
        db.execute(sql)
def insert_new_venture(self):
    """Bulk-insert the rows returned by the new-company query as ventures.

    Reads with ``sql_bap_new_company`` and inserts with ``sql_venture_insert``.
    """
    new_company_frame = db.pandas_read(sql.sql_bap_new_company.value)
    venture_rows = CM.df_list(new_company_frame)
    db.bulk_insert(sql.sql_venture_insert.value, venture_rows)
def __init__(self):
    """Load ``SourceID``, ``ID`` and ``Name`` of every SourceTable row.

    The query result is passed through ``common.df_list`` and kept on
    ``self.source_table``.
    """
    source_query = 'SELECT SourceID, ID, Name FROM MDC_DEV.dbo.SourceTable'
    source_frame = db.pandas_read(source_query)
    self.source_table = common.df_list(source_frame)
def bulk_insert_annual_data(dataframe):
    """Bulk-insert ``dataframe`` using the RIC venture annual insert statement.

    The frame is passed through ``COM.df_list`` before it is handed to
    ``db.bulk_insert`` with ``sql_bap_ric_venture_annual_insert``.
    """
    annual_rows = COM.df_list(dataframe)
    db.bulk_insert(sql.sql_bap_ric_venture_annual_insert.value, annual_rows)
def bulk_insert_quarterly_data(dataframe):
    """Bulk-insert ``dataframe`` using the RIC venture quarterly insert statement.

    The frame is passed through ``COM.df_list`` before it is handed to
    ``db.bulk_insert`` with ``sql_bap_ric_venture_quarterly_insert``.
    """
    quarterly_rows = COM.df_list(dataframe)
    db.bulk_insert(sql.sql_bap_ric_venture_quarterly_insert.value, quarterly_rows)
def transfer_csv_program_youth(dataframe):
    """Bulk-insert ``dataframe`` using the RIC program-youth insert statement.

    The frame is passed through ``COM.df_list`` before it is handed to
    ``db.bulk_insert`` with ``sql_bap_ric_program_youth_insert``.
    """
    program_youth_rows = COM.df_list(dataframe)
    db.bulk_insert(sql.sql_bap_ric_program_youth_insert.value, program_youth_rows)
def bap_insert(df):
    """Bulk-insert ``df`` using the postal-code insert statement.

    The frame is passed through ``COM.df_list`` before it is handed to
    ``db.bulk_insert`` with ``sql_postal_code_insert``.
    """
    postal_code_rows = COM.df_list(df)
    db.bulk_insert(sql.sql_postal_code_insert.value, postal_code_rows)
# # EXACT MATCHING db.execute("DELETE FROM MDC_DEV.dbo.ProcessedVenture") db.execute("INSERT INTO MDC_DEV.dbo.ProcessedVenture SELECT * FROM MDC_DEV.dbo.Venture") # # Insert ACTia target list xlsx into ProcessedVenture to match with database # df = common.xl_to_dfs('/Users/ssimmons/Documents/',input) # df = df['ACTia_targetlist_2018'] # vals_to_insert = common.df_list(df) db.execute("DELETE FROM MDC_DEV.dbo.SourceTable") # sql = 'INSERT INTO MDC_DEV.dbo.SourceTable (ID,Name,Email,Phone) VALUES (?,?,?,?)' ## Edit based on dataset # db.bulk_insert(sql, vals_to_insert) db.execute('INSERT INTO MDC_DEV.dbo.SourceTable (SourceID, Name, BasicName, Website, Address, BatchID) ' 'SELECT ID,CompanyName, BasicName, Website, City, BatchID FROM MDCRaw.BAP.QuarterlyCompanyData') source = common.df_list(db.pandas_read("SELECT SourceID FROM MDC_DEV.dbo.SourceTable")) vals = [] k = -1 for i,v in enumerate(source): vals.append([k,v[0]]) k -= 1 sql = 'UPDATE MDC_DEV.dbo.SourceTable SET ID = ? WHERE SourceID = ?' db.bulk_insert(sql, vals) print('Starting exact matching') e1 = exact() stime = time.time() e1.match()