def block(self, selection):
    """
    Write the blocking map for all records returned by `selection`,
    using the blocker learned in training (`self.deduper`).
    :param selection: sql statement selecting all relevant columns to use in deduplication
    :return: None
    """
    # If dedupe learned an Index Predicate, we have to take a pass
    # through the data and create indices.
    for field in self.deduper.blocker.index_fields:
        df = db.pandas_read("SELECT DISTINCT {field} FROM MDC_DEV.dbo.ProcessedVenture "
                            "WHERE {field} IS NOT NULL".format(field=field))
        dataset = [tuple(x) for x in df.values]
        field_data = set(row[0] for row in dataset)
        self.deduper.blocker.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(BlockKey, ID)` tuples.
    db.execute("DELETE FROM MDC_DEV.dbo.BlockingMap")
    df = db.pandas_read(selection).set_index('ID').to_dict('index')
    b_data = self.deduper.blocker(df.items())  # blocker expects (ID, record) pairs, not a bare dict
    sql = 'INSERT INTO MDC_DEV.dbo.BlockingMap VALUES (?,?)'
    print('Populating BlockingMap... ')

    # Chunk the blocked data into groups of 30,000 rows to be inserted into the BlockingMap.
    size = 30000
    main_list = list(b_data)
    b_data = None
    chunks = [main_list[x:x + size] for x in range(0, len(main_list), size)]
    main_list = None
    for chunk in chunks:
        db.bulk_insert(sql, chunk)
    self.deduper.blocker.resetIndices()
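# A minimal sketch (hypothetical values) of what `self.deduper.blocker(...)`
# yields above: one (BlockKey, ID) tuple per predicate hit. Rows that share a
# BlockKey become candidate pairs downstream, which is why the tuples are bulk
# inserted into MDC_DEV.dbo.BlockingMap in 30,000-row chunks:
#
#     [('commonFourGram:vent', 1042),
#      ('commonFourGram:vent', 2918),             # same key -> candidate pair
#      ('wholeFieldPredicate:416-555-0199', 1042)]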
def move_annual_company_data(self):
    matched, unmatched = 0, 0
    dfac = db.pandas_read('SELECT ID, BatchID, CompanyID, [Company Name] FROM BAP.AnnualCompanyData')
    dfdc = db.pandas_read('SELECT CompanyID, CompanyName FROM Reporting.DimCompany')
    dfac['BasicName'] = dfac.apply(lambda dfs: CM.get_basic_name(dfs['Company Name']), axis=1)
    dfdc['BasicName'] = dfdc.apply(lambda dfs: CM.get_basic_name(dfs.CompanyName), axis=1)
    for _, c in dfac.iterrows():
        dfc = dfdc[dfdc['BasicName'] == c.BasicName]
        val = dict()
        if len(dfc) > 0:
            matched += 1
            db.execute(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
            print(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
        else:
            unmatched += 1
            print(sql.sql_dim_company_insert.value)
            new_com_id = self.batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1
            val['CompanyID'] = new_com_id
            val['Company Name'] = c['Company Name']
            val['Description'] = None
            val['Phone'] = None
            val['Phone2'] = None
            val['Fax'] = None
            val['Email'] = None
            val['Website'] = None
            val['CompanyType'] = None
            val['BatchID'] = c.BatchID
            val['ModifiedDate'] = str(dt.datetime.utcnow())[:-3]
            val['CreatedDate'] = str(dt.datetime.utcnow())[:-3]
            df = pd.DataFrame([val], columns=val.keys())
            values = CM.df_list(df)
            db.bulk_insert(sql.sql_dim_company_insert.value, values)
            db.execute(sql.sql_annual_comapny_data_update.value.format(new_com_id, c.ID))
    print('{} exist and {} do not exist'.format(matched, unmatched))
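# The matching above hinges on CM.get_basic_name, which normalizes a company
# name into a comparable join key. A sketch of the assumed behaviour
# (illustrative only; the real implementation lives in Common):
#
#     CM.get_basic_name('MaRS Discovery District Inc.')  # -> something like 'mars discovery district'
#     CM.get_basic_name('MARS DISCOVERY DISTRICT')       # -> same key
#
# i.e. case, punctuation, and legal suffixes are assumed to be stripped so the
# two sources can be joined on BasicName.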
def update_cb_basic_company(self):
    df = db.pandas_read(sql.sql_cb_basic_company.value)
    for _, r in df.iterrows():
        basic_name = CM.get_basic_name(r['name'])
        sql_update = sql.sql_cb_basic_company_update.value.format(basic_name, CM.sql_compliant(r['org_uuid']))
        print(sql_update)
        db.execute(sql_update)
def update_tdw_basic_company(self):
    df = db.pandas_read(sql.sql_tdw_basic_company.value)
    for _, r in df.iterrows():
        basic_name = CM.get_basic_name(r.legal_name)
        sql_update = sql.sql_tdw_basic_company_update.value.format(basic_name, CM.sql_compliant(r.legal_name))
        print(sql_update)
        db.execute(sql_update)
def remove_false_positives():
    """Consult MatchingFalsePositives and remove known false positive clusters from EntityMap"""
    entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID") \
        .set_index('ID').to_dict('index')
    # Collect [ID, other_ID] pairs of records that share a CanonID.
    clusters = []
    for index1, val1 in entity_map.items():
        for index2, val2 in entity_map.items():
            if val1['CanonID'] == val2['CanonID'] and index1 != index2:
                clusters.append([index1, index2])
                break
    fal_pos = db.pandas_read("SELECT * FROM MDC_DEV.dbo.MatchingFalsePositives").to_dict('index')
    remove = []
    for cluster in clusters:
        for i, v in fal_pos.items():
            if cluster[0] == v['ID'] and cluster[1] == v['FalseID']:
                remove.append([cluster[0]])
                remove.append([cluster[1]])
                break
    # bulk_insert is used here to executemany the parameterized DELETE.
    sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)'
    db.bulk_insert(sql, remove)
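# A self-contained sketch of the pairing step above (hypothetical data): every
# record is paired with the first *other* record sharing its CanonID, and the
# pairs are then screened against the false-positive table.
def _pairs_by_canon_id(entity_map):
    pairs = []
    for id1, rec1 in entity_map.items():
        for id2, rec2 in entity_map.items():
            if rec1['CanonID'] == rec2['CanonID'] and id1 != id2:
                pairs.append([id1, id2])
                break
    return pairs

# _pairs_by_canon_id({1: {'CanonID': 7}, 2: {'CanonID': 7}, 3: {'CanonID': 9}})
# -> [[1, 2], [2, 1]]   (the singleton cluster 9 produces no pair)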
def write_results(clustered_dupes_list):
    """
    Load finalized clusters into the EntityMap table
    :param clustered_dupes_list: list of tuples returned from clustering
    :return: number of duplicate sets
    """
    # We now have a sequence of tuples of company IDs that dedupe believes
    # all refer to the same entity. We write this out onto an entity map table.
    db.execute("DELETE FROM MDC_DEV.dbo.EntityMap")
    print('Populating EntityMap table.')
    sql = 'INSERT INTO MDC_DEV.dbo.EntityMap (ID, CanonID, ClusterScore) VALUES (?,?,?)'
    values = []
    cluster_id = 1
    for cluster, scores in clustered_dupes_list:
        for id, score in zip(cluster, scores):
            values.append([int(id), int(cluster_id), float(score)])
        cluster_id += 1
    db.bulk_insert(sql, values)
    db.execute("UPDATE a SET Name = p.Name, BatchID = p.BatchID, "
               "Description = p.Description, Website = p.Website, Email = p.Email, "
               "Phone = p.Phone, Address = p.Address "
               "FROM MDC_DEV.dbo.EntityMap AS a INNER JOIN MDC_DEV.dbo.ProcessedVenture AS p "
               "ON a.ID = p.ID")
    return len(clustered_dupes_list)
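# Shape sketch for `clustered_dupes_list` (hypothetical IDs and scores): each
# element pairs a tuple of record IDs with dedupe's cluster-membership scores,
# which write_results flattens into (ID, CanonID, ClusterScore) rows.
_example_clusters = [
    ((1042, 2918), (0.96, 0.94)),           # -> (1042, 1, 0.96), (2918, 1, 0.94)
    ((77, 310, 5120), (0.88, 0.83, 0.80)),  # -> CanonID 2
]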
def delete_old_ans():
    # Store the old answers (fetched by answer ID) in an Excel file.
    old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'old_ans')
    old_ans_df = DB.pandas_read(old_ans_sql)
    DBInteractions.store_df(old_ans_df, '_OLD_PIPE_ANS')
    # Run the SQL that deletes the old answers.
    del_old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'del_old_ans')
    DB.execute(del_old_ans_sql)
def update(self, table_name, source_id_col, company_id_col):
    etl = common.df_list(
        db.pandas_read('SELECT ' + source_id_col + ',' + company_id_col + ' FROM ' + table_name))
    for val1 in self.source_table:
        for val2 in etl:
            if val1[0] == str(val2[0]):
                db.execute('UPDATE ' + table_name + ' SET ' + company_id_col + ' = ' + str(val1[1]) +
                           ' WHERE ' + source_id_col + ' = ' + str(val2[0]))
                break
def nomatch_create_new(self):
    """Add non-duplicate ventures that are new companies (negative ID)
    as new ventures to the Venture table"""
    new_ventures = common.df_list(
        db.pandas_read(
            "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID NOT IN "
            "(SELECT ID FROM MDC_DEV.dbo.EntityMap) AND a.ID < 0 "))
    sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    db.bulk_insert(sql, new_ventures)

    # Update ID in the given source table to match the Venture table.
    if self.source_table is not None:
        sql = ('UPDATE a SET ID = b.ID FROM ' + self.source_table +
               ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name')
        db.execute(sql)
def save_orgs_relationship(self, api_url):
    url = api_url + self.api_org_token
    print(url)
    orgs = CM.get_api_data(url)
    if orgs.ok:
        self.org_uuid = orgs.json()[CBDict.data.value][CBDict.uuid.value]
        df = self.db.pandas_read(self.enum.SQL.sql_org_detail_exists.value.format(self.org_uuid))
        if len(df) == 0:  # alternative check: self.db.entity_exists('MDCRaw.CRUNCHBASE.Organization', 'org_uuid', self.org_uuid)
            self.save_organization_detail(self.org_uuid, orgs.json()[CBDict.data.value][CBDict.properties.value])
            rs_json = orgs.json()[CBDict.data.value][CBDict.relationships.value]
            self.save_funding_rounds(rs_json['funding_rounds'], self.org_uuid)
            self.save_relational_entity(rs_json[self.enum.CBDict.headquarters.value], self.org_uuid,
                                        self.enum.SQL.sql_offices_exists.value,
                                        self.enum.SQL.sql_offices_insert.value, self.office_col)
            self.save_relational_entity(rs_json['categories'], self.org_uuid,
                                        self.enum.SQL.sql_org_category_exists.value,
                                        self.enum.SQL.sql_org_category_insert.value, self.category_col)
            # Save all the related entities (currently disabled):
            # self.save_teams(rs_json['featured_team'], self.org_uuid, TeamStatus.Featured.value)
            # self.save_teams(rs_json['current_team'], self.org_uuid, TeamStatus.Current.value)
            # self.save_teams(rs_json['past_team'], self.org_uuid, TeamStatus.Past.value)
            # self.save_teams(rs_json['board_members_and_advisors'], self.org_uuid, TeamStatus.Board.value)
            # self.save_investments_invested_in(rs_json['investments'])
            # self.save_relational_entity(rs_json['sub_organizations'], self.org_uuid, self.sql_sub_organization_insert)
            # if rs_json[self.enum.CBDict.offices.value][self.enum.CBDict.items] is not None:
            #     self.save_relational_entity(rs_json[self.enum.CBDict.offices.value], self.org_uuid,
            #                                 self.enum.SQL.sql_offices_exists, self.sql_offices_insert, self.office_col)
            # self.save_relational_entity(rs_json['founders'], self.org_uuid, self.sql_founders_insert)
            # self.save_relational_entity(rs_json['acquisitions'], self.org_uuid, self.sql_acquisition_insert)
            # self.save_relational_entity(rs_json['acquired_by'], self.org_uuid, self.sql_acquired_insert)
            # self.save_relational_entity(rs_json['ipo'], self.org_uuid, self.sql_ipo_insert)
            # self.save_relational_entity(rs_json['funds'], self.org_uuid, self.sql_funds_insert)
            # self.save_relational_entity(rs_json['websites'], self.org_uuid, self.sql_websites_insert)
            # self.save_relational_entity(rs_json['images'], self.org_uuid, self.sql_image_insert)
            # self.save_relational_entity(rs_json['news'], self.org_uuid, self.sql_news_insert)
            db.execute(self.orgs_summary_update.format(self.org_uuid))
        else:
            print('organization already exists.')
def update_raw_company(self):
    dfnew, dfdc, _ = self.get_company()
    raw_company = self.generate_basic_name(dfnew)
    dim_company = self.generate_basic_name(dfdc) if len(dfdc) > 0 else None
    if dim_company is None:
        return  # nothing to match against
    for index, com in raw_company.iterrows():
        try:
            company_name = com['BasicName']
            print(company_name)
            if len(dim_company[dim_company.BasicName == company_name].CompanyID.values) > 0:
                companyid = dim_company[dim_company.BasicName == company_name].CompanyID.values[0]
                if companyid > 0:
                    sql_update = sql.sql_update.value.format('BAP.QuarterlyCompanyData', 'CompanyID',
                                                             companyid, 'ID', com.ID)
                    print(sql_update)
                    db.execute(sql_update)
                else:
                    print('{} >> {}'.format(com.ID, com.CompanyName))
        except Exception as ex:
            print('UPDATE ISSUE: {}'.format(ex))
def preprocess():
    # CLEANING
    # Website cleaning: placeholder values such as 'n/a', 'NA', etc.
    # need to be changed to NULL.
    db.execute(
        "UPDATE MDC_DEV.dbo.ProcessedVenture SET Website = NULL WHERE Website IN ("
        "'na', 'n/a', '-', ' -', '--', '0', 'http://', 'https', 'none', 'tbd', "
        "'no website', 'no data', 'not set up yet', 'Coming soon', "
        "'http://N/A', 'http://NA', 'http://N/A - Development stage.', "
        "'http://NOT ON WEB', 'http://none', 'http://coming soon', 'http://not.yet', "
        "'http://no website', 'http://not yet', 'http://under construction', "
        "'http://www.nowebsite.co', 'http://www.nowebsite.com', 'http://nowebsite.com', "
        "'http://Nowebsiteyet', 'http://www.nwebsite.com', "
        "'http://www.facebook.com', 'http://www.yahoo.com', "
        "'http://www.google.com', 'http://www.google.ca', 'youtube.com')")

    # Phone cleaning: anything shorter than 10 digits is not a valid number.
    db.execute(
        "UPDATE MDC_DEV.dbo.ProcessedVenture SET Phone = NULL WHERE LEN(Phone) < 10")

    # Name cleaning: drop test records and known non-venture entries.
    db.execute(
        "DELETE FROM MDC_DEV.dbo.ProcessedVenture WHERE Name LIKE '%communitech%' OR Name LIKE '%company:%' "
        "OR Name LIKE '%nwoic%' OR Name LIKE '%riccentre%' OR Name LIKE '%sparkcentre%' OR Name LIKE '%venture%' "
        "OR Name LIKE '%Wetch_%' OR Name LIKE '%testBAP%' OR Name LIKE '%SSMIC%' OR Name LIKE '%Techalliance%' "
        "OR Name LIKE '%RICC_%' OR Name LIKE '%_anon_%' OR Name LIKE '%InnovationFactory_%' "
        "OR Name LIKE '%InvestOttawa_%' OR Name LIKE '%Québec inc%' OR Name LIKE '%Haltech_%' "
        "OR Name LIKE '%InnovNiag_survey%' OR Name LIKE '%NOIC%'")
def remove_discovered_matches():
    entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID") \
        .set_index('ID').to_dict('index')
    # Collect [ID, other_ID] pairs of records that share a CanonID.
    clusters = []
    for index1, val1 in entity_map.items():
        for index2, val2 in entity_map.items():
            if val1['CanonID'] == val2['CanonID'] and index1 != index2:
                clusters.append([index1, index2])
                break
    d_matches = db.pandas_read("SELECT * FROM MDC_DEV.dbo.DuplicateVenture").to_dict('index')
    remove = []
    for cluster in clusters:
        for i, v in d_matches.items():
            if (cluster[0] == v['CompanyID'] and cluster[1] == v['DuplicateCompanyID']) or \
                    (cluster[1] == v['CompanyID'] and cluster[0] == v['DuplicateCompanyID']):
                remove.append([cluster[0]])
                remove.append([cluster[1]])
                break
    sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)'
    db.bulk_insert(sql, remove)
def del_survey_components(self, survey_id):
    del_sql = CM.get_config("config.ini", "sql_queries", "del_all_for_survey")
    del_sql = del_sql.replace("WHAT_SURVEY", str(survey_id))
    DB.execute(del_sql)
    print("\nDeletion attempt was made. Survey components check:")
    comps_dict = {
        "questions": "select_questions",
        "options": "select_options",
        "answers": "select_answers",
        "responses": "select_responses",
        "emails": "select_emails",
        "campaigns": "select_campaigns"
    }
    for component, sql in comps_dict.items():
        sql = CM.get_config("config.ini", "sql_queries", sql).replace("WHAT_SURVEY", str(survey_id))
        df = DB.pandas_read(sql)
        print("\nCount of {}: {}".format(component, len(df)))
def create_new(self, new_venture, second_venture):
    """
    :param new_venture: main record to be loaded into the Venture table
    :param second_venture: matching record that needs to be updated with the same ID as new_venture
    """
    db.execute(
        'INSERT INTO MDC_DEV.dbo.Venture (AlternateName, BasicName, DateFounded, DateOfIncorporation, VentureType, '
        'Description, Website, Email, Phone, Fax, Address, VentureStatus, ModifiedDate, CreateDate) '
        'SELECT AlternateName, BasicName, DateFounded, DateOfIncorporation, VentureType, Description, Website, '
        'Email, Phone, Fax, Address, VentureStatus, ModifiedDate, CreateDate FROM ' + self.source_table +
        ' AS a WHERE a.ID = ' + str(new_venture['ID']))

    # Update ID in the given source table to match the Venture table for both records.
    if self.source_table is not None:
        sql = ('UPDATE a SET ID = b.ID FROM ' + self.source_table +
               ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name')
        db.execute(sql)
        sql = ('UPDATE a SET ID = b.ID FROM ' + self.source_table +
               ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON '
               "a.Name = '{Name1}' AND b.Name = '{Name2}'").format(
                   Name1=str(second_venture['Name']), Name2=str(new_venture['Name']))
        db.execute(sql)
def fp_create_new(self):
    new_ventures = common.df_list(
        db.pandas_read(
            "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID IN "
            "(SELECT ID FROM MDC_DEV.dbo.MatchingFalsePositives) AND a.ID < 0"))
    sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
    db.bulk_insert(sql, new_ventures)

    # Update MatchingFalsePositives with the new ventures' new IDs.
    db.execute(
        "UPDATE m SET ID = a.ID "
        "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.Name = a.Name")
    db.execute(
        "UPDATE m SET FalseID = a.ID "
        "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.FalseName = a.Name")

    # Update the source table with the new IDs.
    if self.source_table is not None:
        sql = ('UPDATE a SET ID = b.ID FROM ' + self.source_table +
               ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name')
        db.execute(sql)
def set_to_none():
    # Set all blank cells to NULL, one column at a time.
    columns = ['Name', 'ID', 'AlternateName', 'BasicName', 'BatchID', 'DateFounded',
               'DateOfIncorporation', 'VentureType', 'Description', 'Website', 'Email',
               'Phone', 'Fax', 'VentureStatus', 'ModifiedDate', 'CreateDate']
    for col in columns:
        db.execute(
            "UPDATE MDC_DEV.dbo.ProcessedVenture SET {col} = NULL WHERE {col} = ''".format(col=col))
import logging
import optparse
import os
import time

import dedupe

from Shared.db import DB
from Shared.common import Common as common
# FileService is a project helper; its import path is not shown in this module.


class Duplicate:
    file = FileService('')

    def __init__(self):
        self.db = DB()
        self.training_file = 'company_traning.json'
        self.setting_file = 'company_settings'
        self.result_set = None
        self.start_time = time.time()

    # CLUSTERING
    def candidates_gen(self, result_set):
        block_id = None
        records = []
        i = 0
        for row in result_set:
            if row['block_id'] != block_id:
                if records:
                    yield records
                block_id = row['block_id']
                records = []
                i += 1
                if i % 10000 == 0:
                    print(i, "blocks")
                    print(time.time() - self.start_time, "seconds")
            smaller_ids = row['smaller_ids']
            if smaller_ids:
                smaller_ids = set(smaller_ids.split(','))
            else:
                smaller_ids = set()
            records.append((row['company_id'], row, smaller_ids))
        if records:
            yield records

    def company_deduplicate(self):
        optp = optparse.OptionParser()
        optp.add_option('-v', '--verbose', dest='verbose', action='count',
                        help='Increase verbosity (specify multiple times for more)')
        (opts, args) = optp.parse_args()
        log_level = logging.WARNING
        if opts.verbose:
            if opts.verbose == 1:
                log_level = logging.INFO
            elif opts.verbose >= 2:
                log_level = logging.DEBUG
        logging.getLogger().setLevel(log_level)

        sql_data = 'SELECT CompanyID, CompanyName, Website, Email, Phone FROM Reporting.DimCompany'
        sql = 'SELECT C.CompanyID, C.CompanyName, C.Website, L.Address1, Y.[Name] AS [CityName], L.PostalCode ' \
              'FROM Reporting.DimCompany C LEFT JOIN [Reporting].[DimCompanyLocation] L ON L.CompanyID = C.CompanyID ' \
              'LEFT JOIN [Reporting].[DimCity] Y ON Y.CityID = L.CityID WHERE C.CompanyName IS NOT NULL'

        if os.path.exists(self.setting_file):
            print('reading from {}'.format(self.setting_file))
            with open(self.setting_file, 'rb') as sf:
                deduper = dedupe.StaticDedupe(sf, num_cores=4)
        else:
            fields = [{'field': 'CompanyID', 'variable name': 'CompanyID', 'type': 'String'},
                      {'field': 'CompanyName', 'variable name': 'CompanyName', 'type': 'Exists'},
                      {'field': 'Website', 'variable name': 'Website', 'type': 'String', 'has missing': True},
                      {'field': 'Email', 'variable name': 'Email', 'type': 'String', 'has missing': True},
                      {'field': 'Phone', 'variable name': 'Phone', 'type': 'String', 'has missing': True},
                      {'type': 'Interaction', 'interaction variables': ['CompanyName', 'Website']},
                      {'type': 'Interaction', 'interaction variables': ['CompanyID', 'Email']}]
            deduper = dedupe.Dedupe(fields, num_cores=4)

            data = self.db.pandas_read(sql_data)
            temp_data = data.to_dict('index')  # dedupe expects {record_id: row_dict}
            deduper.sample(temp_data, sample_size=10000)
            del temp_data

            if os.path.exists(self.training_file):
                print('reading labeled examples from {}'.format(self.training_file))
                with open(self.training_file) as tf:
                    deduper.readTraining(tf)

            print('start active labeling...')
            # ACTIVE LEARNING
            dedupe.convenience.consoleLabel(deduper)
            with open(self.training_file, 'w') as tf:
                deduper.writeTraining(tf)
            deduper.train(recall=0.90)
            with open(self.setting_file, 'wb') as sf:
                deduper.writeSettings(sf)
            deduper.cleanupTraining()

        # BLOCKING
        print('blocking...')
        print('creating blocking_map database')
        self.db.execute('DROP TABLE IF EXISTS blocking_map')
        self.db.execute('CREATE TABLE blocking_map '
                        '(block_key VARCHAR(200), company_id INTEGER) '
                        'CHARACTER SET utf8 COLLATE utf8_unicode_ci')

        print('creating inverted index')
        for field in deduper.blocker.index_fields:
            df = self.db.pandas_read(
                'SELECT DISTINCT {0} FROM Staging.DimCompany WHERE {0} IS NOT NULL'.format(field))
            field_data = set(df[field])
            deduper.blocker.index(field_data, field)

        print('writing blocking map')
        d_company = self.db.pandas_read(sql)
        full_data = ((row['CompanyID'], row) for _, row in d_company.iterrows())
        b_data = deduper.blocker(full_data)
        val = list(b_data)
        self.db.bulk_insert('INSERT INTO blocking_map VALUES (?,?)', val)
        deduper.blocker.resetIndices()

        # PREPARE BLOCKING TABLE
        print('preparing blocking table... this takes a while')
        logging.info('indexing block_key')
        self.db.execute(
            'ALTER TABLE blocking_map ADD UNIQUE INDEX(block_key, company_id)')
        self.db.execute('DROP TABLE IF EXISTS plural_key')
        self.db.execute('DROP TABLE IF EXISTS plural_block')
        self.db.execute('DROP TABLE IF EXISTS covered_blocks')
        self.db.execute('DROP TABLE IF EXISTS smaller_coverage')

        logging.info('calculating plural_key')
        self.db.execute(
            'CREATE TABLE plural_key (block_key VARCHAR(200), block_id INTEGER UNSIGNED AUTO_INCREMENT, '
            'PRIMARY KEY (block_id)) (SELECT MIN(block_key) '
            'FROM ('
            'SELECT block_key, GROUP_CONCAT(company_id ORDER BY company_id) AS block '
            'FROM blocking_map GROUP BY block_key HAVING COUNT(*) > 1'
            ') AS blocks '
            'GROUP BY block)')

        logging.info('creating block_key index')
        self.db.execute(
            'CREATE UNIQUE INDEX block_key_idx ON plural_key (block_key)')

        logging.info("calculating plural_block")
        self.db.execute(
            'CREATE TABLE plural_block ('
            'SELECT block_id, company_id FROM blocking_map INNER JOIN plural_key USING (block_key))')

        logging.info("adding company_id index and sorting index")
        self.db.execute(
            'ALTER TABLE plural_block ADD INDEX (company_id), ADD UNIQUE INDEX (block_id, company_id)')
        self.db.execute('SET group_concat_max_len = 2048')

        logging.info("creating covered_blocks")
        self.db.execute(
            'CREATE TABLE covered_blocks ('
            'SELECT company_id, GROUP_CONCAT(block_id ORDER BY block_id) AS sorted_ids '
            'FROM plural_block GROUP BY company_id)')
        self.db.execute(
            "CREATE UNIQUE INDEX company_idx ON covered_blocks (company_id)")

        logging.info("creating smaller_coverage")
        self.db.execute(
            'CREATE TABLE smaller_coverage ('
            'SELECT company_id, block_id, TRIM(\',\' '
            'FROM SUBSTRING_INDEX(sorted_ids, block_id, 1)) AS smaller_ids '
            'FROM plural_block INNER JOIN covered_blocks USING (company_id))')

        # TODO: correct this SQL statement to select the company attributes
        # (the column list below is left over from the dedupe donors example).
        c_data = self.db.pandas_read(
            'SELECT company_id, city, name, zip, state, address, occupation, '
            'employer, person, block_id, smaller_ids '
            'FROM smaller_coverage INNER JOIN processed_donors '
            'USING (company_id) ORDER BY (block_id)').to_dict('records')

        print('clustering...')
        clustered_dupes = deduper.matchBlocks(self.candidates_gen(c_data), threshold=0.5)
        self.db.execute('DROP TABLE IF EXISTS entity_map')

        # WRITING OUT RESULTS
        print('creating entity_map database')
        self.db.execute(
            'CREATE TABLE entity_map ('
            'company_id INTEGER, canon_id INTEGER, cluster_score FLOAT, PRIMARY KEY(company_id))')
        for cluster, scores in clustered_dupes:
            cluster_id = cluster[0]
            for company_id, score in zip(cluster, scores):
                self.db.execute('INSERT INTO entity_map VALUES (%s, %s, %s)',
                                (company_id, cluster_id, score))
        self.db.execute("CREATE INDEX head_index ON entity_map (canon_id)")
        print('# of duplicate sets: {}'.format(len(clustered_dupes)))
        self.file.df_to_excel(clustered_dupes, 'Clustered Companies')

    def get_ventures(self):
        sql_venture = ('SELECT CompanyID, CompanyName FROM Reporting.DimCompany '
                       'WHERE BasicName IS NULL AND CompanyName IS NOT NULL')
        # AND BatchID NOT IN (3496, 3497, 3498, 3499)
        data = self.db.pandas_read(sql_venture)
        sql_update = "UPDATE Reporting.DimCompany SET BasicName = '{}' WHERE CompanyID = {}"
        for index, row in data.iterrows():
            basic_name = common.get_basic_name(row[1])
            # print(sql_update.format(basic_name, row[0]))
            self.db.execute(sql_update.format(basic_name, row[0]))
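# Row shape that `candidates_gen` expects (hypothetical values): dict-like rows
# ordered by block_id, where smaller_ids is a comma-separated string of blocks
# that already cover this record, so each candidate pair is compared only once:
#
#     {'company_id': 1042, 'block_id': 7, 'smaller_ids': '3,5', ...}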
def update_dim_company_source(self, company_id, company_name):
    sql_update = self.sql_dim_company_source_update.format(company_id, CM.sql_friendly(company_name))
    print(sql_update)
    db.execute(sql_update)
def load_resps_ans_contacts__lists(self, survey_id, api_token):
    # Get API responses.
    print("\nGetting API responses (respondents)")
    api_ans, api_resps = self.get_ans(survey_id, api_token)
    print("\nGetting contact lists on account")
    contact_lists = self.get_contact_lists(survey_id, api_token)
    contact_lists = contact_lists.apply(pd.to_numeric, errors='ignore')
    print("\nGetting contact lists from DB")
    lists_in_db_sql = CM.get_config("config.ini", "sql_queries", "contact_lists")
    lists_in_db = DB.pandas_read(lists_in_db_sql)

    print("\nChecking for diffs b/t API contact lists and DB contact lists")
    lists_not_in_db = pd.merge(contact_lists, lists_in_db, how='outer', indicator=True, on="id")
    lists_not_in_db2 = lists_not_in_db[lists_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    if len(lists_not_in_db2) > 0:
        print("\nOne or more new contact lists detected on acct. Loading into DB now")
        insert_lists_sql = "insert_contactlists"
        lists_not_in_db2 = lists_not_in_db2.drop_duplicates()
        self.df_to_db(lists_not_in_db2, insert_lists_sql, remove_single_quotes=False)

    print("\nGathering all contacts from all lists on acct into single dataframe")
    all_contacts = []
    for list_id in contact_lists["id"]:
        contact_list = self.get_contacts(api_token, list_id)
        all_contacts.append(contact_list)

    # Gather all contacts from the current survey's campaigns.
    all_campaigns = self.get_campaigns(api_token, survey_id, 0, 0)
    for campaign_id in all_campaigns['id']:
        campaign_contacts = self.get_contacts(api_token, list_id=0, survey_id=survey_id,
                                              campaign_id=campaign_id)
        if type(campaign_contacts) == int:
            continue
        all_contacts.append(campaign_contacts)
    all_contacts = pd.concat(all_contacts)
    all_contacts = all_contacts.apply(pd.to_numeric, errors='ignore')
    all_contacts['email_address'] = all_contacts['email_address'].str.lower()

    print("\nGathering all contacts from DB")
    all_contacts_sql = CM.get_config("config.ini", "sql_queries", "all_contacts")
    all_db_contacts = DB.pandas_read(all_contacts_sql)
    all_db_contacts = all_db_contacts.apply(pd.to_numeric, errors='ignore')
    all_db_contacts['email_address'] = all_db_contacts['email_address'].str.lower()

    contact_merge = pd.merge(
        all_contacts[["id", "mdc_contact_id", "contact_list_id", "email_address", "firstname", "lastname"]],
        all_db_contacts, how='left', on='email_address', indicator=True)
    new_contacts = contact_merge[["id_x", "email_address", "firstname_x", "lastname_x"]][
        contact_merge['_merge'] == 'left_only']
    new_contacts.columns = ["id", "email_address", "firstname", "lastname"]
    if len(new_contacts) > 0:
        print("Writing new contacts to DB.")
        insert_cs_sql = "insert_contacts"
        new_contacts = new_contacts.drop_duplicates()
        self.df_to_db(new_contacts, insert_cs_sql, clean_numeric_cols=True)
    else:
        print("\nNo new contacts to write to DB.")

    updated_db_contacts = DB.pandas_read(all_contacts_sql)
    updated_db_contacts = updated_db_contacts.apply(pd.to_numeric, errors='ignore')
    updated_db_contacts['email_address'] = updated_db_contacts['email_address'].str.lower()
    updated_contact_merge = pd.merge(
        all_contacts[["id", "mdc_contact_id", "contact_list_id", "email_address", "firstname", "lastname"]],
        updated_db_contacts, how='left', on='email_address', indicator=True)
    api_contacts_lists_df = updated_contact_merge[["id_x", "id_y", "contact_list_id"]]
    api_contacts_lists_df = api_contacts_lists_df.apply(pd.to_numeric, errors='ignore')
    api_contacts_lists_df.columns = ["sg_cid", "mdc_contact_id", "contact_list_id"]

    print("\nGetting Contacts__Lists table from DB.")
    db_cl_sql = CM.get_config("config.ini", "sql_queries", "all_contacts__lists")
    db_contacts_lists_df = DB.pandas_read(db_cl_sql)
    db_contacts_lists_df = db_contacts_lists_df.apply(pd.to_numeric, errors='ignore')
    cl_merge = pd.merge(api_contacts_lists_df, db_contacts_lists_df, how='left', indicator=True,
                        on=["sg_cid", "mdc_contact_id", "contact_list_id"])
    new_cl = cl_merge[["sg_cid", "mdc_contact_id", "contact_list_id"]][cl_merge["_merge"] == 'left_only']
    new_cl = new_cl.apply(pd.to_numeric, errors='ignore')

    # Get DB responses for this survey; API answers are matched on response_id.
    print("\nGetting all responses for this survey from DB.")
    r_sql = CM.get_config("config.ini", "sql_queries", "all_resps_for_survey")
    r_sql = r_sql.replace("WHAT_SURVEY_ID", str(survey_id))
    db_resps = DB.pandas_read(r_sql)
    db_resps["date_submitted"] = db_resps["date_submitted"].astype(str)

    print("\nDetecting responses that have changed (looking for discrepancy "
          "between DB date_submitted and API date_submitted)")
    changed_resps = pd.merge(db_resps[["id", "date_submitted"]], api_resps[["id", "date_submitted"]],
                             how='outer', indicator=True, on=["id", "date_submitted"])
    changed_resps = changed_resps[["id"]][changed_resps["_merge"] == 'right_only']
    changed_resps = changed_resps["id"].tolist()
    print("{} responses changed".format(len(changed_resps)))

    print("\nDetecting responses in API that are not in DB at all.")
    resps_not_in_db = pd.merge(api_resps, db_resps[["id"]], how='outer', indicator=True, on="id")
    resps_not_in_db2 = resps_not_in_db[resps_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1)
    inserted_resps = []

    # SECOND INSERT OF contacts__lists
    new_cl = pd.merge(new_cl, db_contacts_lists_df, how='left', indicator=True, on=["sg_cid"])
    new_cl = new_cl[new_cl["_merge"] == 'left_only']
    new_cl = new_cl[["sg_cid", "mdc_contact_id_x", "contact_list_id_x"]]
    new_cl.columns = ["sg_cid", "mdc_contact_id", "contact_list_id"]
    if len(new_cl) > 0:
        print("Writing new entries to Contacts__Lists")
        insert_cl_sql = "insert_contacts_lists"
        new_cl = new_cl.drop_duplicates()
        self.df_to_db(new_cl, insert_cl_sql, clean_numeric_cols=True)
    else:
        print("\nNo new Contacts__Lists entries to write to DB.")

    # Update Survey_Responses where date_submitted has changed for an existing response.
    if len(changed_resps) > 0:
        print("\nUpdating DB respondent entries that have changed (have diff date_submitted)")
        resp_headers, resp_qmarks, resp_vals = self.get_sql_params(api_resps)
        resp_header_str = self.get_header_str(resp_headers)
        update_r_sql = CM.get_config("config.ini", "sql_queries", "update_rs")
        for id in changed_resps:
            j = changed_resps.index(id)
            where_sql = "WHERE id = " + str(id)
            set_strs = ""
            for i in range(2, len(resp_headers)):
                header = resp_headers[i]
                val = resp_vals[j][i]
                set_strs += "[" + header + "] = '" + str(val) + "', "
            final_update_sql = update_r_sql + set_strs[:-2] + " " + where_sql
            DB.execute(final_update_sql)

    # Insert responses that aren't in the DB at all.
    if len(resps_not_in_db2) > 0:
        print("\nInserting new responses that aren't in DB at all")
        insert_resp_sql = "insert_rs"
        resps_not_in_db2 = resps_not_in_db2.drop_duplicates()
        self.df_to_db(resps_not_in_db2, insert_resp_sql, remove_single_quotes=False)
        for id in resps_not_in_db2["id"]:
            inserted_resps.append(id)

    # Write only the answers whose response_id was written to the DB above:
    # delete where id is in changed_resps, then insert.
    if len(changed_resps) > 0:
        print("\nDeleting answers of respondents who updated their response.")
        changed_ans_df = api_ans[api_ans["survey_response_id"].isin(changed_resps)]
        ans_headers, ans_qmarks, ans_vals = self.get_sql_params(changed_ans_df)
        del_ans_sql = CM.get_config("config.ini", "sql_queries", "del_ans")
        for id in changed_resps:
            del_ans_sql_for_id = del_ans_sql.replace("WHAT_RESP_ID", str(id))
            DB.execute(del_ans_sql_for_id)
            inserted_resps.append(id)

    # Insert answers for every response inserted or updated this session.
    if len(inserted_resps) > 0:
        print("\nInserting answers into DB (includes updated responses and new responses)")
        ans_insert_df = api_ans[api_ans["survey_response_id"].isin(inserted_resps)]
        inserts_ans_sql = "insert_as"
        ans_insert_df = ans_insert_df.drop_duplicates()
        ans_vals = self.df_to_db(ans_insert_df, inserts_ans_sql,
                                 remove_single_quotes=False, return_vals=True)
    else:
        print("\nNo new answers to insert or update.")
        return

    print("\nChecking that all answers were inserted")
    check_ans_sql = CM.get_config("config.ini", "sql_queries", "check_ans")
    inserted_resp_ids_str = ", ".join(str(id) for id in inserted_resps)
    check_ans_sql = check_ans_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
    ans_inserted_this_session = DB.pandas_read(check_ans_sql)
    if len(ans_inserted_this_session) != len(ans_vals):
        print("\nNot all answers were loaded. Rolling back insert operation "
              "(deleting answers and responses inserted into DB)")
        # Delete answers inserted this session, if any.
        del_ans_sql = CM.get_config("config.ini", "sql_queries", "del_ans_by_respids")
        del_ans_sql = del_ans_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
        DB.execute(del_ans_sql)
        # Delete responses inserted this session, if any.
        del_resps_sql = CM.get_config("config.ini", "sql_queries", "del_resps_by_list")
        del_resps_sql = del_resps_sql.replace("WHAT_RESP_IDS", inserted_resp_ids_str)
        DB.execute(del_resps_sql)
    else:
        print("All answers successfully inserted. This means that all the responses that were "
              "inserted during this session have all their respective answers in the DB now.")
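# The diff pattern used throughout this loader, reduced to a minimal sketch
# (hypothetical frames): a merge with indicator=True, keeping the 'left_only'
# (or 'right_only') rows to isolate records present on only one side:
#
#     api = pd.DataFrame({'id': [1, 2, 3]})
#     dbt = pd.DataFrame({'id': [1, 2]})
#     merged = pd.merge(api, dbt, how='outer', indicator=True, on='id')
#     new = merged[merged['_merge'] == 'left_only'].drop('_merge', axis=1)
#     # -> only the row with id == 3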
from VentureMatch.match import Match as match
from Shared.db import DB as db
from VentureMatch.clean import Clean as clean
from VentureMatch.validate import Validate as validate
from Shared.common import Common as common
from VentureMatch.exact_match import Exact as exact
from VentureMatch.update_etl import update_etl
from VentureMatch.clean_website import toURL
from urllib.parse import urlparse
import time

# EXACT MATCHING
db.execute("DELETE FROM MDC_DEV.dbo.ProcessedVenture")
db.execute("INSERT INTO MDC_DEV.dbo.ProcessedVenture SELECT * FROM MDC_DEV.dbo.Venture")

# Insert the ACTia target list xlsx into ProcessedVenture to match it with the database:
# df = common.xl_to_dfs('/Users/ssimmons/Documents/', input)
# df = df['ACTia_targetlist_2018']
# vals_to_insert = common.df_list(df)

db.execute("DELETE FROM MDC_DEV.dbo.SourceTable")
# sql = 'INSERT INTO MDC_DEV.dbo.SourceTable (ID, Name, Email, Phone) VALUES (?,?,?,?)'  # edit based on dataset
# db.bulk_insert(sql, vals_to_insert)
db.execute('INSERT INTO MDC_DEV.dbo.SourceTable (SourceID, Name, BasicName, Website, Address, BatchID) '
           'SELECT ID, CompanyName, BasicName, Website, City, BatchID FROM MDCRaw.BAP.QuarterlyCompanyData')

source = common.df_list(db.pandas_read("SELECT SourceID FROM MDC_DEV.dbo.SourceTable"))
vals = []
def prematch_processing():
    """
    Organize the data by passing it through the processing tables
    :return: None
    """
    # Many BlockKeys will only form blocks that contain a single
    # record. Since there are no comparisons possible within such a
    # singleton block, we can ignore them.
    print("Populating TransitionMap table.")
    db.execute("DELETE FROM MDC_DEV.dbo.TransitionMap")
    db.execute("INSERT INTO MDC_DEV.dbo.TransitionMap (BlockKey, IDs) SELECT BlockKey, "
               "STUFF((SELECT ',' + CONVERT(varchar, a.ID) FROM MDC_DEV.dbo.BlockingMap "
               "WHERE BlockKey = a.BlockKey ORDER BY ID "
               "FOR XML Path (''), TYPE).value('.','VARCHAR(MAX)'),1,1,'') AS IDs "
               "FROM MDC_DEV.dbo.BlockingMap AS a")

    # Move BlockKeys with multiple company IDs from TransitionMap to PluralKey,
    # making sure a BlockKey isn't entered more than once.
    print("Populating PluralKey table.")
    db.execute("DELETE FROM MDC_DEV.dbo.PluralKey")
    db.execute("INSERT INTO MDC_DEV.dbo.PluralKey (BlockKey) SELECT a.BlockKey "
               "FROM MDC_DEV.dbo.TransitionMap AS a WHERE a.IDs LIKE '%,%' AND a.BlockKey "
               "NOT IN (SELECT BlockKey FROM MDC_DEV.dbo.PluralKey)")

    # Keep track of all company IDs associated with a particular BlockKey.
    print("Populating PluralBlock table.")
    db.execute("DELETE FROM MDC_DEV.dbo.PluralBlock")
    db.execute("INSERT INTO MDC_DEV.dbo.PluralBlock SELECT a.BlockKey, a.ID "
               "FROM MDC_DEV.dbo.BlockingMap AS a WHERE a.BlockKey IN "
               "(SELECT b.BlockKey FROM MDC_DEV.dbo.PluralKey AS b)")

    # Keep track of all the BlockKeys associated with a particular company ID.
    print("Populating CoveredBlocks table.")
    db.execute("DELETE FROM MDC_DEV.dbo.CoveredBlocks")
    db.execute("INSERT INTO MDC_DEV.dbo.CoveredBlocks (ID, SortedKeys) SELECT ID, "
               "STUFF((SELECT ',' + BlockKey FROM MDC_DEV.dbo.PluralBlock "
               "WHERE ID = a.ID ORDER BY BlockKey FOR XML Path ('')),1,1,'') "
               "AS SortedKeys FROM MDC_DEV.dbo.PluralBlock AS a GROUP BY ID")

    # In particular, for every block of records, we need to keep track of a
    # venture record's associated BlockKeys that are SMALLER than the current
    # block's key. Because the keys are ordered, we can achieve this with some
    # string hacks.
    print("Populating SmallerCoverage table.")
    db.execute("DELETE FROM MDC_DEV.dbo.SmallerCoverage")
    db.execute("INSERT INTO MDC_DEV.dbo.SmallerCoverage (ID, BlockKey, SmallerKeys) SELECT a.ID, a.BlockKey, "
               "REPLACE(SUBSTRING(b.SortedKeys, 0, CHARINDEX(a.BlockKey, b.SortedKeys)), ',', ' ') "
               "AS SmallerKeys FROM MDC_DEV.dbo.PluralBlock AS a "
               "INNER JOIN MDC_DEV.dbo.CoveredBlocks AS b ON a.ID = b.ID")
    return None
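# Worked example of the "string hack" above (hypothetical keys): if a record's
# SortedKeys is 'k1,k3,k7' and the current BlockKey is 'k3', then
# SUBSTRING(SortedKeys, 0, CHARINDEX('k3', SortedKeys)) yields 'k1,' and the
# REPLACE turns it into 'k1 ' - the block that already covered this ID, so the
# same pair is not compared again inside block k3.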