def createSourceDataSQLQuery(self):
    """
    Create query for pulling source data from db
    """
    logging.info("Obtaining source data...")
    query = \
        """
        SELECT distinct
               t.aw_supplier_name as src_name,
               '' as src_tag,
               '' as src_address_locality,
               '' as src_address_postalcode,
               '' as src_address_countryname,
               '' as src_address_streetaddress,
               t.source as source
        FROM {0} as t
        WHERE TRUE
          AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
          AND t.aw_supplier_name NOTNULL
          AND t.releasedate >= {1}
          AND t.releasedate <= {2}
        ;
        """.format(self.src_data_source,
                   "'" + self.in_args.data_from_date + "'",
                   "'" + self.in_args.data_to_date + "'")
    return query
def upload_file(self, file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client(
        's3',
        aws_access_key_id=self.aws_access_key_id,
        aws_secret_access_key=self.aws_secret_access_key)
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.exception(e)
        return False
    logging.info("Upload to S3 bucket complete!")
    return True
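A minimal usage sketch for upload_file; the class name, bucket, and file path below are hypothetical, and the instance is assumed to carry the AWS credentials the method reads:

uploader = S3Uploader()  # hypothetical class holding aws_access_key_id / aws_secret_access_key
if not uploader.upload_file('Outputs/matches.csv', 'my-matching-bucket'):
    logging.error("S3 upload failed; see the logged exception for details.")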
def dedupeCluster(self):
    if not os.path.exists(self.clustered_fp):
        # Copy training file from first clustering session if recycle mode
        if self.in_args.recycle:
            copyfile(
                self.directories['cluster_training_backup'].format(
                    self.region_dir), self.cluster_training_file)

        logging.info("Starting clustering...")

        sys.argv = [
            'csvdedupe', self.matched_fp,
            '--field_names', ' '.join(self.src_fields),
            '--training_file', self.cluster_training_file,
            '--settings_file', self.learned_settings_file,
            '--output_file', self.clustered_fp,
            str(self.ctrain[0])
        ]

        # Drop the trailing training argument if it is empty
        if self.ctrain[0] == '':
            sys.argv = sys.argv[:-1]

        launch_clustering()

        if not self.in_args.recycle:
            # Copy training file to backup, so it can be found and copied
            # into the recycle-phase clustering
            copyfile(
                self.cluster_training_file,
                self.directories['cluster_training_backup'].format(
                    self.region_dir))
def fetchdata(self, query):
    """ Retrieve data from the db using query """
    conn, _ = self.db_calls.DbCalls.createConnection(self)
    logging.info('Importing data...')
    df = pd.read_sql(query, con=conn)
    conn.close()
    return df
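A hypothetical sketch of how the query builder and fetchdata are likely chained; pipeline stands in for an instance of the class defining both methods:

query = pipeline.createSourceDataSQLQuery()
src_df = pipeline.fetchdata(query)
logging.info(f"Pulled {len(src_df)} source rows.")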
def assign(self, df, assigned_file=None):
    df.sort_values(by=['Cluster_ID'], inplace=True, axis=0, ascending=True)
    df.reset_index(drop=True, inplace=True)
    tqdm.pandas()
    logging.info("Assigning close matches within clusters...")
    df = df.groupby(['Cluster_ID']).progress_apply(
        AssignRegDataToClusters.getMaxId)
    df.to_csv(assigned_file, index=False)
    return df
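For reference, tqdm.pandas() patches a progress_apply method onto pandas objects, which is what lets the per-cluster apply above report progress. A self-contained illustration of the same pattern; the best_row scorer below is a stand-in, not the real getMaxId:

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers .progress_apply on DataFrame and GroupBy objects

df = pd.DataFrame({'Cluster_ID': [1, 1, 2], 'leven_dist_N': [90, 75, 60]})

def best_row(group):
    # Stand-in logic: keep the highest-scoring row in each cluster
    return group.loc[group['leven_dist_N'].idxmax()]

best = df.groupby('Cluster_ID').progress_apply(best_row)
print(best)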
def dedupeSplitMatch(self):
    '''
    Takes all files in Adj_Data/Splits and iteratively runs them through the
    matching process before combining them into one output file
    :return:
    '''
    # Define filepaths and vars to be passed to dedupe
    splits_input_dir = self.directories["splits_inputs_dir"].format(
        self.region_dir, self.proc_type)
    splits_output_dir = self.directories['splits_outputs_dir'].format(
        self.region_dir, self.proc_type)

    files = glob.glob(os.path.join(splits_input_dir, '*'))
    numfiles = len(files)
    fileno = 0

    for src_fp in files:
        fileno += 1
        if not os.path.exists(self.matched_fp):
            logging.info(
                f"Starting matching of split file {fileno} / {numfiles}")

            sys.argv = [
                'csvlink',
                str(src_fp),
                str(self.reg_fp),
                '--field_names_1', ' '.join(self.src_fields),
                '--field_names_2', ' '.join(self.reg_fields),
                '--training_file', self.matches_training_file,
                '--settings_file', self.learned_settings_file,
                '--output_file',
                os.path.join(splits_output_dir, str(fileno) + '.csv'),
                str(self.mtrain[0])
            ]

            # Drop the trailing training argument if it is empty
            if self.mtrain[0] == '':
                sys.argv = sys.argv[:-1]

            launch_matching()

    files = glob.glob(os.path.join(splits_output_dir, '*'))
    frames = []

    # Load in each file and add to frames list
    for file in files:
        df = pd.read_csv(file)
        frames.append(df)

    # Concatenate files in list into one df and drop duplicate rows
    df = pd.concat(frames)
    df = df.drop_duplicates()

    # Save as normal to match outputs folder
    df.to_csv(self.matched_fp, index=False)
def createSourceDataSQLQuery(self):
    """
    Create query for pulling source data from db
    """
    logging.info("Obtaining source data...")
    if self.in_args.prodn:
        query = \
            """
            SELECT distinct
                   t.buyer as src_name,
                   t.json -> 'releases' -> 0 -> 'tag' as src_tag,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'locality' as src_address_locality,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'postalCode' as src_address_postalcode,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'countryName' as src_address_countryname,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'streetAddress' as src_address_streetaddress,
                   t.source as source
            FROM {0} as t
            WHERE TRUE
              AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
              AND t.buyer NOTNULL
              AND t.releasedate >= {1}
              AND t.releasedate <= {2}
            ;
            """.format(self.src_data_source,
                       "'" + self.in_args.data_from_date + "'",
                       "'" + self.in_args.data_to_date + "'")
    else:
        query = \
            """
            SELECT distinct
                   t.buyer as src_name,
                   t.json -> 'releases' -> 0 -> 'tag' as src_tag,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'locality' as src_address_locality,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'postalCode' as src_address_postalcode,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'countryName' as src_address_countryname,
                   t.json -> 'releases' -> 0 -> 'buyer' -> 'address' ->> 'streetAddress' as src_address_streetaddress,
                   t.source as source
            FROM {0} as t
            WHERE TRUE
              AND t.buyer NOTNULL
              AND t.releasedate >= '2009-09-11 00:00:00.000000'
              AND t.releasedate <= '2015-09-11 00:00:00.000000'
              AND t.countryname IN ('United Kingdom','England','Scotland','Northern Ireland','Wales')
            LIMIT 2000
            ;
            """.format(self.src_data_source)
    return query
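Both branches splice the date bounds into the SQL with str.format. Since those values come from command-line arguments, a parameterized form is safer; a sketch assuming psycopg2 underneath pandas, with the column list abbreviated (placeholders can bind values but not identifiers, so the table name still has to be interpolated):

query = """
    SELECT DISTINCT t.buyer AS src_name, t.source AS source
    FROM {0} AS t
    WHERE t.buyer NOTNULL
      AND t.releasedate >= %(from_date)s
      AND t.releasedate <= %(to_date)s
""".format(self.src_data_source)

df = pd.read_sql(query, con=conn, params={
    'from_date': self.in_args.data_from_date,
    'to_date': self.in_args.data_to_date,
})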
def createConnection(self):
    '''
    Creates a connection to the database specified in .env

    :return conn: the database connection object
    :return cur: the cursor (temporary storage for retrieved data)
    '''
    logging.info('Connecting to database...')
    conn = psy.connect(host=self.host_remote,
                       dbname=self.dbname_remote,
                       user=self.user_remote,
                       password=self.password_remote)
    cur = conn.cursor()
    return conn, cur
def dedupeMatch(self):
    """
    Deduping - first the registry and source data are matched using dedupe's
    csvlink, then the matched file is put into clusters

    :return: None
    :output: matched output file
    :output: matched and clustered output file
    """
    src_fp = self.directories['adj_dir'].format(
        self.region_dir) + self.directories['adj_src_data'].format(
            self.in_args.src_adj)

    # Matching:
    if not os.path.exists(self.matched_fp):
        if self.in_args.recycle:
            # Copy manual matching file over to build on for clustering
            copyfile(
                self.directories['manual_matching_train_backup'].format(
                    self.region_dir), self.matches_training_file)

        logging.info("Starting matching...")

        sys.argv = [
            'csvlink',
            str(src_fp),
            str(self.reg_fp),
            '--field_names_1', ' '.join(self.src_fields),
            '--field_names_2', ' '.join(self.reg_fields),
            '--training_file', self.matches_training_file,
            '--settings_file', self.learned_settings_file,
            '--output_file', self.matched_fp,
            str(self.mtrain[0])
        ]

        # Drop the trailing training argument if it is empty
        if self.mtrain[0] == '':
            sys.argv = sys.argv[:-1]

        launch_matching()

        df = pd.read_csv(self.matched_fp, dtype=self.df_dtypes)
        df = df[pd.notnull(df['src_name'])]
        df = df.drop_duplicates()
        df.to_csv(self.matched_fp, index=False)
def transferMatches(self):
    logging.info(
        f"Transferring new matches from {self.upload_table} to {self.transfer_table}.")
    query = \
        """
        INSERT INTO {}
        SELECT DISTINCT
               src_name,
               reg_scheme,
               reg_id,
               reg_name,
               match_source,
               match_date,
               match_by
        FROM {} m
        WHERE NOT EXISTS
            (SELECT src_name, reg_name
             FROM {} t
             WHERE m.src_name = t.org_string)
          AND m.manual_match_n LIKE 'Y'
        ON CONFLICT DO NOTHING
        """.format(self.transfer_table, self.upload_table,
                   self.transfer_table)
    return query
def createRegistryDataSQLQuery(self):
    """ Create query for downloading registry data from db """
    logging.info("Obtaining registry data...")
    query = \
        """
        SELECT
               legalname as reg_name,
               id as reg_id,
               '' as reg_address,
               scheme as reg_scheme,
               'dedupe_script' as match_source
        FROM {}
        """.format(self.reg_data_source)
    return query
def addDataToTable(self):
    '''
    Adds the confirmed_matches data to the upload table

    :return: None
    '''
    conn, cur = self.createConnection()
    logging.info(f"Connected to {self.upload_table}")

    files = glob.glob(
        os.path.join(
            self.directories['verified_matches_dir'].format(
                self.region_dir, self.proc_type), '*'))

    for upload_file in files:
        with open(upload_file, 'r', encoding='utf-8-sig') as f:
            # Get headers dynamically. Reading the header row via csv.reader
            # also advances the file pointer past it, so no extra skip is
            # needed (a second next(f) here would drop the first data row).
            reader = csv.reader(f)
            headers = next(reader, None)
            headers = ", ".join(headers)
            self.headers = headers

            # Input the data into the dedupe table.
            # copy_expert allows access to csv methods (i.e. char escaping)
            cur.copy_expert(
                """COPY {}({}) from stdin (format csv)""".format(
                    self.upload_table, headers), f)
            conn.commit()

    # Remove any exact duplicates from db table
    try:
        query = self.removeTableDuplicates()
        cur.execute(query)
        conn.commit()
    except Exception:
        logging.exception("De-duplication of upload table failed.")
        conn.rollback()

    # Also transfer matches to transfer table (orgs_lookup), where they
    # don't already exist
    query = self.transferMatches()
    cur.execute(query)
    conn.commit()
def removeTableDuplicates(self):
    """
    Remove any exact duplicates from db table

    :return: the sql query to be executed
    """
    logging.info("Removing duplicates from table...")
    query = \
        """
        WITH dups AS
            (SELECT DISTINCT ON ({}) * FROM {})
        DELETE FROM {}
        WHERE ({}.id) NOT IN (SELECT id FROM dups);
        """.format(self.headers, self.upload_table, self.upload_table,
                   self.upload_table)
    return query
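For context, DISTINCT ON keeps one arbitrary row per combination of the listed columns, and the DELETE then removes every row whose id is not among those keepers; this assumes the table has an id column not included in the CSV headers. With hypothetical headers "src_name, reg_id" and a hypothetical table "matches", the rendered query would read:

# WITH dups AS
#     (SELECT DISTINCT ON (src_name, reg_id) * FROM matches)
# DELETE FROM matches
# WHERE (matches.id) NOT IN (SELECT id FROM dups);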
def manualMatching(self):
    """
    Provides user-input functionality for manual matching based on the
    extracted records

    :return manual_match_file: extracted file with added column (Y/N/Unsure)
    """
    best_filtered = pd.read_csv(self.best_filtered,
                                index_col=None,
                                dtype=self.df_dtypes)
    best_filtered['Manual_Match_N'] = ''
    best_filtered['Manual_Match_NA'] = ''

    if self.in_args.terminal_matching:
        # Iterate over the file, shuffled with sample, as the best matches
        # would otherwise show first:
        for index, row in best_filtered.sample(frac=1).iterrows():
            logging.info("\nSource name: " + str(row.src_name_adj))
            logging.info("\nRegistry name: " + str(row.reg_name_adj))
            logging.info("\nLevenshtein distance: " + str(row.leven_dist_N))

            match_options = ["y", "n", "u", "f"]
            match = input("\nMatch? Yes, No, Unsure, Finished (Y/N/U/F):")
            while match.lower() not in match_options:
                match = input("\nMatch? Yes, No, Unsure, Finished (Y/N/U/F):")

            if str(match).lower() != "f":
                best_filtered.at[index, 'Manual_Match_N'] = str(match).capitalize()
                continue
            else:
                break

        best_filtered.sort_values(by=['Cluster_ID'],
                                  inplace=True,
                                  axis=0,
                                  ascending=True)

        logging.info("Saving...")
        best_filtered.to_csv(self.unverified_file,
                             index=False,
                             columns=self.dbUpload_cols)
    else:
        best_filtered.to_csv(self.unverified_file,
                             index=False,
                             columns=self.dbUpload_cols)
def clean(self):
    raw_data = self.directories['raw_dir'].format(
        self.region_dir) + self.directories['raw_reg_data'].format(
            self.in_args.reg)
    adj_data = self.directories['adj_dir'].format(
        self.region_dir) + self.directories['adj_reg_data'].format(
            self.in_args.reg_adj)

    if not os.path.exists(adj_data):
        logging.info("Re-organising registry data...")
        df = pd.read_csv(raw_data, dtype=self.df_dtypes, chunksize=500000)
        dffullmerge = pd.DataFrame([])
        for chunk in df:
            # Remove punctuation and double spacing in name
            adj_col = str('reg_name_adj')
            orig_col = str('reg_name')
            chunk = DataProcessing.remvPunct(self, chunk, orig_col, adj_col)

            # Duplicating rows for 'and'/'&' variants is disabled here: the
            # calls below passed df (the TextFileReader) rather than chunk,
            # giving "Uncaught exception: 'TextFileReader' object has no
            # attribute 'copy'"
            # chunk1 = DataProcessing.duplicaterowscontainingand(self, df, adj_col)
            # chunk2 = DataProcessing.duplicaterowscontainingampersand(self, df, adj_col)
            # chunk = pd.concat([chunk1, chunk2]).sort_index().reset_index(drop=True)

            # Replace organisation suffixes with standardised version
            chunk[adj_col].replace(self.org_suffixes.org_suffixes_dict,
                                   regex=True,
                                   inplace=True)

            # Remove punctuation and double spacing in address
            adj_col = str('reg_address_adj')
            orig_col = str('reg_address')
            dfmerge = DataProcessing.remvPunct(self, chunk, orig_col, adj_col)
            # dfmerge = self.remvStreetNumber(dfmerge, adj_col)
            dfmerge = DataProcessing.joinFields(self, dfmerge, 'reg')

            dffullmerge = pd.concat([dffullmerge, dfmerge],
                                    ignore_index=True)
        dffullmerge.drop_duplicates(inplace=True)
        logging.info("...done")

        # Normalise dtypes before saving
        dffullmerge['reg_joinfields'] = dffullmerge['reg_joinfields'].astype(str)
        dffullmerge['match_source'] = dffullmerge['match_source'].astype(str)
        dffullmerge['reg_scheme'] = dffullmerge['reg_scheme'].astype(str)
        dffullmerge['reg_id'] = dffullmerge['reg_id'].astype(str)
        dffullmerge['reg_address_adj'] = dffullmerge['reg_address_adj'].astype(str)

        dffullmerge.to_csv(adj_data, index=False)
        return dffullmerge
    else:
        return pd.read_csv(adj_data, dtype=self.df_dtypes)
def clean(self):
    raw_data = self.directories['raw_dir'].format(
        self.region_dir) + self.directories['raw_src_data'].format(
            self.in_args.src)
    adj_data = self.directories['adj_dir'].format(
        self.region_dir) + self.directories['adj_src_data'].format(
            self.in_args.src_adj)

    if not os.path.exists(adj_data):
        df = pd.read_csv(raw_data,
                         usecols=self.raw_src_data_cols,
                         dtype=self.df_dtypes)
        logging.info("Re-organising source data...")

        # Remove punctuation and double spacing in name
        adj_col = str('src_name_adj')
        orig_col = str('src_name')
        df = df.dropna(subset=[orig_col])

        # Decode html entities, i.e. "&amp;" into "&"
        df[adj_col] = df[orig_col].apply(html.unescape)

        # Duplicate rows containing 'and', then convert to '&'
        df1 = DataProcessing.duplicaterowscontainingand(self, df, adj_col)

        # Duplicate rows containing '&' and convert to 'and'
        df2 = DataProcessing.duplicaterowscontainingampersand(self, df, adj_col)

        # Concatenate the two together
        df = pd.concat([df1, df2]).sort_index().reset_index(drop=True)

        df = DataProcessing.remvPunct(self, df, adj_col, adj_col)

        # Replace organisation suffixes with standardised version
        df[adj_col].replace(self.org_suffixes.org_suffixes_dict,
                            regex=True,
                            inplace=True)

        # Remove punctuation and double spacing in address
        adj_col = str('src_address_adj')
        df[adj_col] = df['src_address_streetaddress'] + ', ' + \
            df['src_address_locality'] + ', ' + \
            df['src_address_postalcode'] + ', ' + \
            df['src_address_countryname']
        df = DataProcessing.remvPunct(self, df, adj_col, adj_col)
        df = DataProcessing.joinFields(self, df, 'src')
        df = df.drop([
            'src_address_streetaddress', 'src_address_locality',
            'src_address_postalcode', 'src_address_countryname'
        ], axis=1)
        logging.info("...done")

        # Remove blacklisted entities
        df = self.filter_blacklisted(df)

        df.to_csv(adj_data, index=False)

        # If flag 'split' has been used, split the source file into smaller files
        if self.in_args.split:
            chunksize = self.in_args.splitsize
            numberofchunks = len(df) // chunksize + 1
            for i in range(numberofchunks):
                # The same slice covers the first and every subsequent chunk
                df[chunksize * i:chunksize * (i + 1)].to_csv(
                    os.path.join(os.getcwd(), 'Regions',
                                 str(self.in_args.region), 'Data_Inputs',
                                 'Adj_Data', 'Splits',
                                 str(self.in_args.src)[:-4] + str(i) + '.csv'),
                    index=False)
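A self-contained illustration of the chunking arithmetic above, on toy data; note that len(df) // chunksize + 1 emits one empty trailing chunk whenever the row count divides evenly, which math.ceil avoids:

import math
import pandas as pd

df = pd.DataFrame({'src_name': [f'org{i}' for i in range(10)]})
chunksize = 4
numberofchunks = math.ceil(len(df) / chunksize)  # 3 chunks: 4 + 4 + 2 rows

for i in range(numberofchunks):
    chunk = df[chunksize * i:chunksize * (i + 1)]
    print(i, len(chunk))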