def handle(self, *args, **options):
    self.init_options(options)
    self.log('Started at: {}'.format(datetime.now()))
    if self.options.get("verbose") > 0:
        with open(self.options["model_file"], 'rb') as sf:
            describe_dedupe(sys.stdout, dedupe.StaticDedupe(sf))
    with open(self.options["points_file"], 'w', encoding="utf8") as outf:
        for test_pool in options["test_pool"]:
            self.process_test(test_pool, outf)
def deduper_setup(self, settings_file, training_file, field_list, selection, sample):
    """
    Trains (if training and settings files do not exist), otherwise sets up the deduper object
    :param settings_file: settings file name
    :param training_file: training file name
    :param field_list: list of lists (field (string), comparator (string), missing? (bool))
    :param selection: SQL statement selecting all relevant columns to use in deduplication
    :param sample: sample size of data to be used for training
    :return: deduper object
    """
    if os.path.exists(settings_file):
        print('Reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:
        # Define the fields dedupe will pay attention to
        fields = []
        for field in field_list:
            fields.append({'field': field[0], 'type': field[1], 'has missing': field[2]})

        # Create a new deduper object and pass our data model to it.
        self.deduper = dedupe.Dedupe(fields, num_cores=4)

        data = db.pandas_read(selection).to_dict('index')
        print('Collecting sample data for active learning... this may take a while.')
        self.deduper.sample(data, sample)

        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                self.deduper.readTraining(tf)

        print('Starting active labeling...')
        dedupe.convenience.consoleLabel(self.deduper)

        # When finished, save our labeled training pairs to disk
        with open(training_file, 'w') as tf:
            self.deduper.writeTraining(tf)

        # `recall` is the proportion of true dupe pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        self.deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            self.deduper.writeSettings(sf)

        self.deduper.cleanupTraining()
def print_roc_points_quick_and_dirty(self, test_pool_file_name, output_points_file):
    clustered_dupes = []
    with open(self.options["model_file"], 'rb') as sf:
        dedupe_model = dedupe.StaticDedupe(sf)
        clustered_dupes = dedupe_model.match(self.dedupe_objects, 0)

    for t in ROC_POINTS:
        threshold = t / 100.0
        pairs = get_pairs_from_clusters(clustered_dupes, threshold=threshold)
        (metrics, test_results) = calcMetrics(pairs, self.test_data)
        self.print_test_output(test_results)
        metrics['Threshold'] = threshold
        metrics['TestName'] = os.path.basename(test_pool_file_name)
        output_points_file.write(json.dumps(metrics) + "\n")
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    fields = df.columns
    df, data_d = data_prep(df)

    if os.path.exists(settings_file):
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = select_fields(fields)
        deduper = dedupe.Dedupe(fields)

        sample_num = math.floor(len(data_d) * sample_size)
        logger.info("Extracting data sample of %s records", sample_num)
        deduper.sample(data_d, sample_num)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as f:
                deduper.readTraining(f)

        logger.info("Starting active labeling")
        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=recall_weight)
    clustered_df = data_cluster(deduper, data_d, threshold)
    results = df.join(clustered_df, how="left")
    results.drop(["dictionary"], axis=1, inplace=True)

    return results
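# A minimal usage sketch for deduplicate() above, assuming a pandas DataFrame as input.
# The CSV paths are illustrative assumptions, not taken from the original source;
# data_prep()/data_cluster()/select_fields() are helpers assumed to exist alongside it.
import pandas as pd

if __name__ == "__main__":
    df = pd.read_csv("contacts.csv")  # hypothetical input file
    results = deduplicate(df, recall_weight=1, sample_size=0.3)
    results.to_csv("contacts_deduped.csv", index=False)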
def deduper_eval(dataset_type: str, dataset):
    # Create deduper model
    with open('trained_{}_settings.json'.format(dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=8)

    # Prepare the data
    if dataset_type in ['x2', 'x3']:
        cols = [
            'instance_id', 'brand', 'model_name',  # 'model_number',
            'cpu_brand', 'cpu_model', 'cpu_type', 'ram_capacity',
            'hdd_capacity', 'ssd_capacity', 'title', 'screen_size', 'model'
        ]
    else:
        cols = [
            'name', 'name_2', 'model', 'line', 'brand', 'size', 'product_type'
        ]
    to_dedupe = dataset[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict, partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Save the result
    res = []
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                res.append((el[0][i], el[0][j]))

    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df
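# deduper_eval() above (and eval_lightgbm_dedupe() further down) reads a module-level
# `partition_threshold` mapping that is not shown in these snippets. A hedged sketch of
# what such a mapping might look like; the values are placeholders, not from the source.
partition_threshold = {'x2': 0.5, 'x3': 0.5, 'x4': 0.5}  # assumed per-dataset clustering thresholds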
def match(self):
    ''' generate the clusters '''
    print('Start Match Parent Method')
    print(datetime.now())
    with self.settings_file.get_file_bytes() as s:
        deduper = dedupe.StaticDedupe(s)
    src_data = self.data_source.get_data_dict()

    print('Start Threshold')
    print(datetime.now())
    threshold = deduper.threshold(src_data, recall_weight=self.recall_weight)
    print('threshold equals ' + str(threshold))

    print('Start Dedupe Match')
    print(datetime.now())
    clustered_dupes = deduper.match(src_data, threshold)

    print('Start Writing Clusters')
    print(datetime.now())
    self.write_clusters(clustered_dupes, src_data)
def is_duplicate(self, flaw):
    """
    Checks whether the flaw passed as a parameter is a duplicate or not.
    Uses training information already available in the training data folder.

    NOTE: should be called from the RVD repository directory.

    :param flaw: Flaw
    :return bool
    """
    data_d = self.read_data(None, invalid=False)  # data dict
    # pprint.pprint(data_d)

    # Append the flaw to the data dictionary with the ID 0
    data_d[0] = flaw.document_duplicates()
    # pprint.pprint(data_d)

    if os.path.exists(self.settings_file):
        print("reading from", self.settings_file)
        with open(self.settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        red("Error: settings file does not exist, stopping")
        sys.exit(1)

    cyan("Finding the threshold for data...")
    threshold = deduper.threshold(data_d, recall_weight=1)

    cyan("Clustering...")
    clustered_dupes = deduper.match(data_d, threshold)
    # pprint.pprint(clustered_dupes)  # debug purposes

    # If ID 0 (corresponds with the flaw passed as arg) is in a cluster, it is a duplicate
    for dupe_set in clustered_dupes:
        ids, values = dupe_set
        for id in ids:
            # print(id)
            if int(id) == 0:
                return True
    return False
def blockDedupe(session_id, table_name=None, entity_table_name=None, canonical=False):
    if not table_name:
        table_name = 'processed_{0}'.format(session_id)
    if not entity_table_name:
        entity_table_name = 'entity_{0}'.format(session_id)
    dd_session = worker_session.query(DedupeSession)\
        .get(session_id)
    deduper = dedupe.StaticDedupe(StringIO(dd_session.settings_file))
    engine = worker_session.bind
    metadata = MetaData()
    proc_table = Table(table_name, metadata,
                       autoload=True, autoload_with=engine)
    entity_table = Table(entity_table_name, metadata,
                         autoload=True, autoload_with=engine)
    for field in deduper.blocker.tfidf_fields:
        with engine.begin() as conn:
            fd = conn.execute('select record_id, {0} from "{1}"'.format(field, table_name))
            deduper.blocker.tfIdfBlock(fd, field)
    """
    SELECT p.* <-- need the fields that we trained on at least
        FROM processed as p
        LEFT OUTER JOIN entity_map as e
            ON p.record_id = e.record_id
        WHERE e.target_record_id IS NULL
    """
    proc_records = worker_session.query(proc_table)\
        .outerjoin(entity_table, proc_table.c.record_id == entity_table.c.record_id)\
        .filter(entity_table.c.target_record_id == None)
    fields = proc_table.columns.keys()
    full_data = ((getattr(row, 'record_id'), dict(zip(fields, row)))
                 for row in proc_records.yield_per(50000))
    return deduper.blocker(full_data)
def main(self):
    data_d = {}
    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if field['type'] != 'Interaction':
            if not field['field'] in data_d[0]:
                raise self.parser.error("Could not find field '" +
                                        field['field'] + "' in input")

    logging.info('using fields: %s' % [field['field']
                                       for field in self.field_definition])

    # If --skip_training has been selected, and we have a settings cache still
    # persisting from the last run, use it in this next run.
    # __Note:__ if you want to add more training data, don't use skip training
    if self.skip_training and os.path.exists(self.settings_file):
        # Load our deduper from the last training session cache.
        logging.info('reading from previous training cache %s' % self.settings_file)
        with open(self.settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        unique_d, parents = exact_matches(data_d, fields)
    else:
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(self.field_definition)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        unique_d, parents = exact_matches(data_d, fields)

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        deduper.sample(unique_d, self.sample_size)

        # Perform standard training procedures
        self.dedupe_training(deduper)

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall.
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(unique_d, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(unique_d, threshold)

    expanded_clustered_dupes = []
    for cluster, scores in clustered_dupes:
        new_cluster = list(cluster)
        new_scores = list(scores)
        for row_id, score in zip(cluster, scores):
            children = parents.get(row_id, [])
            new_cluster.extend(children)
            new_scores.extend([score] * len(children))
        expanded_clustered_dupes.append((new_cluster, new_scores))

    clustered_dupes = expanded_clustered_dupes

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeResults
    # write out our results
    if self.destructive:
        write_function = csvhelpers.writeUniqueResults

    if self.output_file:
        with open(self.output_file, 'w', encoding='utf-8') as output_file:
            write_function(clustered_dupes, self.input, output_file)
    else:
        if sys.version < '3':
            out = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
            write_function(clustered_dupes, self.input, out)
        else:
            write_function(clustered_dupes, self.input, sys.stdout)
def train(clients):
    if os.path.exists(settings_file):
        with open(settings_file) as sf:
            print 'reading from', settings_file
            return dedupe.StaticDedupe(sf)
    else:
        fields = [
            {'field': 'city', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'country', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'zip', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'state', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'address', 'type': 'String', 'Has Missing': True},
            {'field': 'description', 'type': 'Text', 'Has Missing': True,
             'corpus': []  # map(lambda x: x['description'], clients.values())
             },
            {'field': 'specific_issues', 'type': 'Text', 'Has Missing': True,
             'corpus': []  # map(lambda x: x['specific_issues'], clients.values())
             },
            {'field': 'exact_name', 'type': 'String'},
            {'field': 'alis', 'type': 'Set',
             'corpus': []  # map(lambda x: x['alis'], clients.values())
             },
            {'field': 'houseID', 'type': 'Exact'},
            {'field': 'senate', 'type': 'Exact'},
        ]
        # for definition in fields[:]:
        #     if definition.field != 'houseID':
        #         fields[k + "-houseID"] = {'type': 'Interaction',
        #                                   'Interaction Fields': ["houseID", k]}

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a random sample of records.
        deduper.sample(clients, 150000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            with open(training_file) as tf:
                print 'reading labeled examples from ', training_file
                deduper.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, uncovered_dupes=2)

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'w') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()
        return deduper
def run_dedupe(settings_file, training_file, type):
    start_time = time.time()
    print("dedupe file ", dedupe.__file__)
    print("dedupe path ", dedupe.__path__)
    print("dedupe name ", dedupe.__name__)
    print("dedupe spec ", dedupe.__spec__)
    print("dedupe doc ", dedupe.__doc__)
    print("dedupe loader ", dedupe.__loader__)

    # Set the database connection from environment variable using
    # [dj_database_url](https://github.com/kennethreitz/dj-database-url)
    # For example:
    #   export DATABASE_URL=postgres://user:password@host/mydatabase
    db_conf = dj_database_url.config()

    if not db_conf:
        raise Exception(
            'set DATABASE_URL environment variable with your connection, e.g. '
            'export DATABASE_URL=postgres://user:password@host/mydatabase')

    read_con = psycopg2.connect(database=db_conf['NAME'],
                                user=db_conf['USER'],
                                password=db_conf['PASSWORD'],
                                host=db_conf['HOST'],
                                port=db_conf['PORT'],
                                cursor_factory=psycopg2.extras.RealDictCursor)

    write_con = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'])

    # We'll be using variations on this following select statement to pull
    # in campaign donor info.
    if type == 'IND':
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                       "from processed_donors where person = 1 and name not like '%unitem%' "
    else:
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                       "from processed_donors where person != 1 and name not like '%unitem%' "

    # ## Training
    if os.path.exists(settings_file):
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=1)
    else:
        # Define the fields dedupe will pay attention to
        #
        # The street, city, and zip fields are often missing, so we'll
        # tell dedupe that, and we'll learn a model that takes that into
        # account
        fields = [
            {'field': 'name', 'type': 'String'},
            {'field': 'street', 'type': 'String', 'has missing': True},
            {'field': 'city', 'type': 'String', 'has missing': True},
            {'field': 'state', 'type': 'ShortString', 'has missing': True},
            {'field': 'zip', 'type': 'ShortString', 'has missing': True},
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=1)

        # Named cursor runs server side with psycopg2
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            temp_d = {i: row for i, row in enumerate(cur)}

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(temp_d, tf)
        else:
            deduper.prepare_training(temp_d)

        del temp_d

        # ## Active learning
        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.console_label(deduper)

        # When finished, save our labeled, training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Notice our argument here
        #
        # `recall` is the proportion of true dupe pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

        # We can now remove some of the memory hogging objects we used
        # for training
        deduper.cleanup_training()

    print(f'deduper predicates: {deduper.predicates}')

    # ## Blocking
    print('blocking...')

    # To run blocking on such a large set of data, we create a separate table
    # that contains blocking keys and record ids
    print('creating blocking_map database')
    blocking_table = 'blocking_map_' + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + blocking_table)
            cur.execute("CREATE TABLE " + blocking_table + " "
                        "(block_key text, donor_id INTEGER)")

    # If dedupe learned an Index Predicate, we have to take a pass
    # through the data and create indices.
    print('creating inverted index')
    for field in deduper.fingerprinter.index_fields:
        with read_con.cursor('field_values') as cur:
            cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
            field_data = (row[field] for row in cur)
            deduper.fingerprinter.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(block_key, donor_id)` tuples.
    print('writing blocking map')
    with read_con.cursor('donor_select') as read_cur:
        read_cur.execute(DONOR_SELECT)

        full_data = ((row['donor_id'], row) for row in read_cur)
        b_data = deduper.fingerprinter(full_data)

        with write_con:
            with write_con.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + blocking_table + ' FROM STDIN WITH CSV',
                                      Readable(b_data),
                                      size=100000)

    # free up memory by removing indices
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("CREATE UNIQUE INDEX ON " + blocking_table + " "
                        "(block_key text_pattern_ops, donor_id)")

    # ## Clustering
    entity_map_table = "entity_map_" + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + entity_map_table)
            print('creating entity_map database')
            cur.execute("CREATE TABLE " + entity_map_table + " "
                        "(donor_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(donor_id))")

    read_con1 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)

    with read_con1.cursor('pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute(
            "SELECT a.donor_id, "
            "row_to_json((SELECT d FROM (SELECT a.city, "
            "a.name, "
            "a.zip, "
            "a.state, "
            "a.street) d)), "
            "b.donor_id, "
            "row_to_json((SELECT d FROM (SELECT b.city, "
            "b.name, "
            "b.zip, "
            "b.state, "
            "b.street) d)) "
            "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west "
            "FROM " + blocking_table + " AS l "
            "INNER JOIN " + blocking_table + " AS r "
            "USING (block_key) "
            "WHERE l.donor_id < r.donor_id) ids "
            "INNER JOIN processed_donors a ON ids.east=a.donor_id "
            "INNER JOIN processed_donors b ON ids.west=b.donor_id")

        print('clustering...')
        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
                                          threshold=0.5)

        # ## Writing out results

        # We now have a sequence of tuples of donor ids that dedupe believes
        # all refer to the same entity. We write this out onto an entity map
        # table
        print('writing results')
        write_con1 = psycopg2.connect(database=db_conf['NAME'],
                                      user=db_conf['USER'],
                                      password=db_conf['PASSWORD'],
                                      host=db_conf['HOST'],
                                      port=db_conf['PORT'])
        with write_con1:
            with write_con1.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + entity_map_table + ' FROM STDIN WITH CSV',
                                      Readable(cluster_ids(clustered_dupes)),
                                      size=100000)

    with write_con1:
        with write_con1.cursor() as cur:
            cur.execute("CREATE INDEX head_index_" + type + " ON " +
                        entity_map_table + " (canon_id)")

    # Print out the number of duplicates found

    # ## Payoff

    # With all this done, we can now begin to ask interesting questions
    # of the data
    #
    # For example, let's see who the top 10 donors are.
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')  # for pretty printing numbers

    read_con2 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)

    # save entity map
    entity_map_filename = entity_map_table + "_" + settings_file.split('/')[-1] + \
        '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv'
    donors_filename = 'processed_donors_' + settings_file.split('/')[-1] + \
        '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv'
    with read_con2.cursor() as cur:
        with open(entity_map_filename, 'w') as file_out:
            cur.copy_expert('COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER', file_out)
        with open(donors_filename, 'w') as file_out:
            cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER', file_out)

    if type == 'IND':
        top_donor_where = 'donors.corp is null'
    else:
        top_donor_where = 'donors.corp is not null'

    # Create a temporary table so each group and unmatched record has
    # a unique id
    with read_con2.cursor() as cur:
        cur.execute(
            "CREATE TEMPORARY TABLE e_map "
            "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
            "FROM " + entity_map_table + " "
            "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "donation_totals.totals AS totals "
            "FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors INNER JOIN "
            "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (canon_id) "
            " ORDER BY totals "
            " DESC) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10")

        print("Top Donors (deduped)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        # Compare this to what we would have gotten if we hadn't done any
        # deduplication
        cur.execute(
            "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "WHERE " + top_donor_where + " "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC) AS t where totals IS NOT NULL "
            "LIMIT 10")

        print("Top Donors (raw)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        cur.execute(
            " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors "
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC LIMIT 10")

        # ## Print stats that are saved to match_runs table
        # Biggest cluster first
        print("Biggest Clusters")
        biggest_name = ''
        for row in cur:
            print('%(cluster_size)20s: %(name)s' % row)
            if biggest_name == '':
                biggest_name = row['name'] + ':' + str(row['cluster_id'])

        # Then total of donors
        if type == 'IND':
            processed_donors_where = '= 1'
        else:
            processed_donors_where = '!= 1'
        cur.execute(
            " SELECT count(*) AS num_donors FROM processed_donors WHERE person " +
            processed_donors_where)
        for row in cur:
            number_of_donors = row['num_donors']

        # Then max, average and number of clusters
        cur.execute(
            " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters "
            " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors"
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC) AS stats")

        # Print the stats
        print("stats...")
        print(f"total number of donors: {number_of_donors}")
        for row in cur:
            print('max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s' % row)
            biggest_size = row['biggest']
            average_size = row['average']
            number_of_clusters = row['number_of_clusters']

    # write to the match_run table
    runtime = time.time() - start_time
    donor_cluster_ratio = number_of_donors / number_of_clusters
    with write_con1.cursor() as cur:
        cur.execute(
            """
            INSERT INTO match_runs
            (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster,
             total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file)
            VALUES (NOW(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """,
            (' '.join(str(pred) for pred in deduper.predicates),
             number_of_clusters, average_size, biggest_size, biggest_name,
             number_of_donors, type, runtime, donor_cluster_ratio, settings_file))
    write_con1.commit()

    read_con.close()
    write_con.close()
    read_con1.close()
    write_con1.close()
    read_con2.close()

    print('ran in', runtime, 'seconds')
def train(con, config):
    if config['seed'] is not None:
        if os.environ.get('PYTHONHASHSEED', 'random') == 'random':
            logging.warn(
                """dedupe is only deterministic with hash randomization disabled.
                Set the PYTHONHASHSEED environment variable to a constant.""")
        random.seed(config['seed'])
        numpy.random.seed(config['seed'])

    if config['use_saved_model']:
        print('reading from ', config['settings_file'])
        with open(config['settings_file'], 'rb') as sf:
            return dedupe.StaticDedupe(sf, num_cores=config['num_cores'])

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(config['all_fields'], num_cores=config['num_cores'])

    cur = con.cursor(as_dict=True)
    cur.execute("""SELECT {all_columns} FROM {schema}.entries_unique
                   ORDER BY _unique_id""".format(**config))
    temp_d = dict((i, unicode_to_str(row)) for i, row in enumerate(cur))

    deduper.sample(temp_d, 75000)
    del temp_d

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    #
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(config['training_file']):
        print('reading labeled examples from ', config['training_file'])
        with open(config['training_file']) as tf:
            deduper.readTraining(tf)

    if config['prompt_for_labels']:
        # ## Active learning
        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)
        # When finished, save our labeled, training pairs to disk
        with open(config['training_file'], 'w') as tf:
            deduper.writeTraining(tf)

    # `recall` is the proportion of true dupe pairs that the learned
    # rules must cover. You may want to reduce this if you are making
    # too many blocks and too many comparisons.
    deduper.train(recall=config['recall'])

    with open(config['settings_file'], 'wb') as sf:
        deduper.writeSettings(sf)

    # We can now remove some of the memory hogging objects we used
    # for training
    deduper.cleanupTraining()
    return deduper
def find_duplicates(self, train, push, label):
    """
    Find duplicates and print them via stdout
    """
    # data_d = self.read_data()
    data_d = self.read_data(label, invalid=False)
    # pprint.pprint(data_d)

    if train:
        deduper = self.train(data_d)
    else:
        if os.path.exists(self.settings_file):
            print("reading from", self.settings_file)
            with open(self.settings_file, "rb") as f:
                deduper = dedupe.StaticDedupe(f)
        else:
            red("Error: settings file does not exist, stopping")
            sys.exit(1)

    cyan("Finding the threshold for data...")
    threshold = deduper.threshold(data_d, recall_weight=1)

    cyan("Clustering...")
    clustered_dupes = deduper.match(data_d, threshold)
    cyan("Number of duplicate sets: " + str(len(clustered_dupes)))

    for aset in clustered_dupes:
        yellow("Found a duplicated pair...")
        ids, values = aset
        primary_issue = None  # reflects the primary ticket in a set of duplicates;
                              # all the duplicates should point to this one
        for id in ids:
            # print(id)
            issue = self.repo.get_issue(int(id))

            # if duplicate in issue, do nothing
            labels = [l.name for l in issue.labels]
            if "duplicate" in labels:
                gray(str(id) + " already duplicate, skipping it")
                continue

            # print(issue)
            if primary_issue is None:
                primary_issue = issue
            else:
                # Indicate that this issue is duplicated
                yellow("Marking " + str(id) + " as duplicate, referencing to: " + str(primary_issue))
                if push:
                    duplicate_text = (
                        "- <ins>DUPLICATE</ins>: Tagging this ticket as duplicate. Referencing to #"
                        + str(primary_issue.number)
                        + "\n"
                    )
                    issue.create_comment(duplicate_text)
                    # labeling
                    issue.add_to_labels("duplicate")
                    issue.add_to_labels("triage")
def company_deduplicate(self):
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)')
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    sql_data = 'SELECT CompanyID, CompanyName,Website,Email,Phone FROM Reporting.DimCompany'
    sql = 'SELECT C.CompanyID, C.CompanyName, C.Website,L.Address1,Y.[Name] AS [CityName],L.PostalCode ' \
          'FROM Reporting.DimCompany C LEFT JOIN [Reporting].[DimCompanyLocation] L ON L.CompanyID = C.CompanyID ' \
          'LEFT JOIN [Reporting].[DimCity] Y ON Y.CityID = L.CityID WHERE C.CompanyName IS NOT NULL'

    if os.path.exists(self.setting_file):
        print('reading from {}'.format(self.setting_file))
        with open(self.setting_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:
        fields = [
            {'field': 'CompanyID', 'variable name': 'CompanyID', 'type': 'String'},
            {'field': 'CompanyName', 'variable name': 'CompanyName', 'type': 'Exists'},
            {'field': 'Website', 'variable name': 'Website', 'type': 'String', 'has missing': True},
            {'field': 'Email', 'variable name': 'Email', 'type': 'String', 'has missing': True},
            {'field': 'Phone', 'variable name': 'Phone', 'type': 'String', 'has missing': True},
            {'type': 'Interaction', 'interaction variables': ['CompanyName', 'Website']},
            {'type': 'Interaction', 'interaction variables': ['CompanyID', 'Email']}
        ]

        deduper = dedupe.Dedupe(fields, num_cores=4)

        data = self.db.pandas_read(sql_data)
        temp_data = dict((index, row) for index, row in enumerate(data))
        deduper.sample(temp_data, sample_size=10000)
        del temp_data

        if os.path.exists(self.training_file):
            print('reading labeled examples from {}'.format(self.training_file))
            with open(self.training_file) as tf:
                deduper.readTraining(tf)

        print('start active labeling...')

        # ACTIVE LEARNING
        dedupe.convenience.consoleLabel(deduper)

        with open(self.training_file, 'w') as tf:
            deduper.writeTraining(tf)

        deduper.train(recall=0.90)

        with open(self.setting_file, 'wb') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()

    # BLOCKING
    print('blocking...')
    print('creating blocking_map database')
    self.db.execute('DROP TABLE IF EXISTS blocking_map')
    self.db.execute('CREATE TABLE blocking_map '
                    '(block_key VARCHAR(200), company_id INTEGER) '
                    'CHARACTER SET utf8 COLLATE utf8_unicode_ci')

    print('creating inverted index')
    for field in deduper.blocker.index_fields:
        df = self.db.pandas_read(
            'SELECT DISTINCT {} FROM Staging.DimCompany WHERE {} IS NOT NULL'.format(field, field))
        field_data = (row[0] for row in df)
        deduper.blocker.index(field_data, field)

    print('writing blocking map')
    d_company = self.db.pandas_read(sql)
    full_data = ((row['CompanyID'], row) for row in d_company)
    b_data = deduper.blocker(full_data)
    val = list(b_data)
    self.db.bulk_insert('INSERT INTO blocking_map VALUES (?,?)', val)

    deduper.blocker.resetIndices()

    # PREPARE BLOCKING TABLE
    print('prepare blocking table. ...this takes a while')
    logging.info('indexing block_key')
    self.db.execute('ALTER TABLE blocking_map ADD UNIQUE INDEX(block_key, company_id)')
    self.db.execute('DROP TABLE IF EXISTS plural_key')
    self.db.execute('DROP TABLE IF EXISTS plural_block')
    self.db.execute('DROP TABLE IF EXISTS covered_blocks')
    self.db.execute('DROP TABLE IF EXISTS smaller_coverage')

    logging.info('calculating plural_key')
    self.db.execute(
        'CREATE TABLE plural_key (block_key VARCHAR(200), block_id INTEGER UNSIGNED AUTO_INCREMENT, '
        'PRIMARY KEY (block_id)) (SELECT MIN(block_key) '
        'FROM ('
        'SELECT block_key,GROUP_CONCAT(donor_id ORDER BY donor_id) AS block '
        'FROM blocking_map GROUP BY block_key HAVING COUNT(*) > 1'
        ') AS blocks '
        'GROUP BY block)')

    logging.info('creating block_key index')
    self.db.execute('CREATE UNIQUE INDEX block_key_idx ON plural_key (block_key)')

    logging.info("calculating plural_block")
    self.db.execute(
        'CREATE TABLE plural_block ('
        'SELECT block_id, donor_id FROM blocking_map INNER JOIN plural_key USING (block_key))')

    logging.info("adding donor_id index and sorting index")
    self.db.execute(
        'ALTER TABLE plural_block ADD INDEX (donor_id), ADD UNIQUE INDEX (block_id, donor_id)')

    self.db.execute('SET group_concat_max_len = 2048')

    logging.info("creating covered_blocks")
    self.db.execute(
        'CREATE TABLE covered_blocks ('
        'SELECT donor_id, GROUP_CONCAT(block_id ORDER BY block_id) AS sorted_ids '
        'FROM plural_block GROUP BY donor_id)')

    self.db.execute("CREATE UNIQUE INDEX donor_idx ON covered_blocks (donor_id)")

    logging.info("creating smaller_coverage")
    self.db.execute(
        'CREATE TABLE smaller_coverage ('
        'SELECT donor_id, block_id, TRIM(\',\' '
        'FROM SUBSTRING_INDEX(sorted_ids, block_id, 1)) AS smaller_ids '
        'FROM plural_block INNER JOIN covered_blocks USING (donor_id))')

    # CORRECT THIS SQL STATEMENT TO CORRECT THE ATTRIBUTES
    c_data = self.db.execute(
        'SELECT company_id, city, name, zip, state, address, occupation, '
        'employer, person, block_id, smaller_ids '
        'FROM smaller_coverage INNER JOIN processed_donors '
        'USING (company_id) ORDER BY (block_id)')

    print('clustering...')
    clustered_dupes = deduper.matchBlocks(self.candidates_gen(c_data), threshold=0.5)

    self.db.execute('DROP TABLE IF EXISTS entity_map')

    # WRITING OUT RESULTS
    print('creating entity_map database')
    self.db.execute(
        'CREATE TABLE entity_map ('
        'donor_id INTEGER, canon_id INTEGER, cluster_score FLOAT, PRIMARY KEY(donor_id))')

    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]
        for donor_id, score in zip(cluster, scores):
            self.db.execute('INSERT INTO entity_map VALUES (%s, %s, %s)',
                            (donor_id, cluster_id, score))

    self.db.execute("CREATE INDEX head_index ON entity_map (canon_id)")

    print('# of duplicate sets: {}'.format(len(clustered_dupes)))
    self.file.df_to_excel(clustered_dupes, 'Clustered Companies')
def deduping_function(fields, training_file, settings_file, input_file, output_file, input_dict):
    """
    Function to dedupe rows given:
        fields: list of dictionaries of fields relevant for the training
        training_file: JSON with training set if the function was already run
        settings_file: file with settings if the function was already run
        input_file: CSV with values to dedupe
        output_file: CSV where results are going to be written
        input_dict: dictionary with keys as id and values as dictionary of features for each row
    Result is to write the output_file
    """
    if os.path.exists(settings_file):
        print('reading from {}'.format(settings_file))
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = dedupe.Dedupe(fields)
        deduper.sample(input_dict)

        if os.path.exists(training_file):
            print('reading labeled examples from {}'.format(training_file))
            with open(training_file) as tf:
                deduper.readTraining(tf)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        print('writing training file')
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        print('writing settings file')
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    print('blocking...')
    threshold = deduper.threshold(input_dict, recall_weight=2)

    print('clustering...')
    clustered_dupes = deduper.match(input_dict, threshold)
    print('# duplicate sets {}'.format(len(clustered_dupes)))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [input_dict[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w') as f_output:
        writer = csv.writer(f_output)
        with open(input_file) as f_input:
            reader = csv.reader(f_input)

            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)
            writer.writerow(heading_row)

            for row in reader:
                row_id = int(row[0])
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id]["canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key].encode('utf8'))
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)

    print('deduping successfully finished')
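# A hedged sketch of how deduping_function() might be called. The field definitions,
# file names, and the way input_dict is built from the CSV (id in the first column)
# are assumptions for illustration only.
import csv

fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]

with open('rows.csv', encoding='utf8') as f:  # hypothetical input CSV
    input_dict = {int(row['id']): dict(row) for row in csv.DictReader(f)}

deduping_function(fields, 'rows_training.json', 'rows_learned_settings',
                  'rows.csv', 'rows_deduped.csv', input_dict)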
    {
        'field': 'enddate',
        'type': 'DateTime',
        'dayfirst': True
    }
]

start_time = time.time()

debugger_setup()

# if settings file exists
if os.path.exists(tender_settings_file):
    # load from settings file
    print('reading from', tender_settings_file)
    with open(tender_settings_file) as sf:
        deduper = dedupe.StaticDedupe(sf)

# construct results table
create_table(results_table, results_schema, output_fields)

# start the cluster_id at 0
current_index = 0

# loop over different dates
for date_range in select_date_ranges:
    # construct the query for getting data to be deduped
    select_query = construct_query(input_fields, data_source, date_range, select_country)

    # get data from the database using the query
    selected_data = fetch_data(select_query)
start_time = time.time()

with open(opts.CONFIG_PATH) as f:
    config = json.load(f)

con = psy.connect(cursor_factory=psycopg2.extras.RealDictCursor, **config)
c = con.cursor()

FIELD_NAMES = "entry_id, hash_ssn, hash_fname, hash_lname, dob, race, sex, block2010id"
INDIVIDUAL_SELECT = "SELECT %s FROM dedupe.entries" % FIELD_NAMES

# ## Training

if False:  # os.path.exists(settings_file):
    print('reading from ', settings_file)
    with open(settings_file, 'rb') as sf:
        deduper = dedupe.StaticDedupe(sf, num_cores=4)
else:
    # Define the fields dedupe will pay attention to
    #
    # The address, city, and zip fields are often missing, so we'll
    # tell dedupe that, and we'll learn a model that takes that into
    # account
    fields = [
        {'field': 'hash_ssn',
         'variable name': 'hash_ssn',
         'type': 'Exact',
         'has missing': True},
        {
    cur.execute(DONOR_SELECT)
    for row in cur:
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    def random_pair_generator():
        for k1, k2 in random_pairs:
            yield (temp_d[k1], temp_d[k2])

    return tuple(pair for pair in random_pair_generator())

# ## Training

if os.path.exists(settings_file):
    print 'reading from ', settings_file
    deduper = dedupe.StaticDedupe(settings_file, num_processes=2)
else:
    # Select a large sample of duplicate pairs. As the dataset grows,
    # duplicate pairs become relatively more rare so we have to take a
    # fairly large sample compared to `csv_example.py`
    print 'selecting random sample from donors table...'
    data_sample = getSample(c, 750000, 'donor_id', 'donors')

    # Define the fields dedupe will pay attention to
    #
    # The address, city, and zip fields are often missing, so we'll
    # tell dedupe that, and we'll learn a model that takes that into
    # account
    fields = {
        'name': {
def deDuplication():
    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):
        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()
        if not column:
            column = None
        return column

    # Read in the data from the CSV file:
    def readData(filename):
        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)
        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {'field': 'DisplayName_Processed', 'type': 'String'},
            {'field': 'Email_Processed', 'type': 'String'},
            {'field': 'Info', 'type': 'String'},
        ]
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w', encoding="utf-8") as f_output, \
            open(input_file, encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

    return clustered_dupes
def _train(settings_file, training_file, data, field_properties, sample_size, update_model):
    """Internal method that trains the deduper model from scratch or updates
        an existing dedupe model.

        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training from.
        data : dict
            The dictionary form of the dataframe that dedupe requires.
        field_properties : dict
            The mapping of fields to their respective data types. Please see the
            dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, it allows the user to update the existing model by uploading
            a training file.

        Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, field_properties)

    if update_model == False:
        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f, num_cores=None)

        # Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Dedupe(fields, num_cores=None)

            # Launch active learning
            deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Dedupe(fields, num_cores=None)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data, training_file=f)

        # Launch active learning
        deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)

    return deduper
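# Hedged example of how _train() might be driven by a caller. The records,
# field_properties value, and file names are invented for illustration; helpers such as
# select_fields() and _active_learning() are assumed to be defined alongside _train().
data = {
    0: {'name': 'acme corp', 'city': 'boston'},
    1: {'name': 'acme corporation', 'city': 'boston'},
}
field_properties = ['name', 'city']

deduper = _train('example_learned_settings', 'example_training.json',
                 data, field_properties, sample_size=0.3, update_model=False)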
def start_dedupe(df):
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = [
            {'field': 'MESSZEIT', 'type': 'Exact'},
            {'field': 'KENNUNG', 'type': 'Exact'},
            {'field': 'GEOGR_BREITE', 'type': 'Custom', 'comparator': sim_num_abs},
            {'field': 'GEOGR_LAENGE', 'type': 'Custom', 'comparator': sim_num_abs},
            {'field': 'HORIZONTALE_SICHT', 'type': 'ShortString', 'has missing': True},
            {'field': 'WETTER', 'type': 'Custom', 'comparator': sim_ww, 'has missing': True},
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(df, 15000, .5)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(df, recall_weight=1)
    print(f'threshold: {threshold}')

    print('clustering...')
    # clustered_dupes = deduper.match(df, .98)
    clustered_dupes = deduper.match(df, threshold)
    print(f'# duplicate sets {len(clustered_dupes)}')

    return clustered_dupes
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d

print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.StaticDedupe(settings_file)
else:
    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'Site name': {'type': 'String'},
        'Address': {'type': 'String'},
        'Zip': {'type': 'Custom',
def dedupe_dataframe(df, field_properties, canonicalize=False, config_name="dedupe_dataframe"):
    # Import Data
    print('running CUSTOM dedupe_dataframe LOCAL')
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    df['dictionary'] = df.apply(lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    for col in df.columns:
        print("df col - - - ", col)
    print(' df[dictionary].. shape', df.shape)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)
        # print("fields", fields)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(data_d, 15000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        print('calling train...')
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)
        print('saved our training to disk.')

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=1)
    print('threshold=', threshold)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d values to strings so that Price & LatLong won't get a
    # traceback during dedupe.canonicalize()
    for i in data_d.values():
        for key in i:
            if i[key] == None:
                pass
            else:
                i[key] = str(i[key])

    # ## Writing Results
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = []
    for i in cluster_membership.items():
        cluster_index.append(i)

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id index'}, inplace=True)

    canonical_list = []

    if canonicalize == True:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(lambda x: x['canonical representation'][i])
    elif canonicalize == False:
        pass
    elif type(canonicalize) == list:
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(lambda x: x['canonical representation'][i])

    dfa.set_index('Id index', inplace=True)
    df = df.join(dfa)

    return df
def train(con, config):
    """Trains or retrieves a previously trained Dedupe object

    If config['use_saved_model'] is set, it will read the saved model
    from config['settings_file'] instead of training.

    If config['seed'] is set, both random.seed and numpy.random.seed
    will be set so the run will be deterministic given configuration.

    If config['prompt_for_labels'] is set, the console will be used to
    ask the user to help label examples

    Args:
        con (psycopg2.connection)
        config (dict) configuration options for a deduping run.
            Expected to have defaults applied

    Returns: (dedupe.Dedupe or dedupe.StaticDedupe)
    """
    if config['seed'] is not None:
        if os.environ.get('PYTHONHASHSEED', 'random') == 'random':
            raise EnvironmentError(
                """dedupe is only deterministic with hash randomization disabled.
                Set the PYTHONHASHSEED environment variable to a constant.""")
        random.seed(config['seed'])
        numpy.random.seed(config['seed'])

    if config['use_saved_model']:
        logging.info('reading saved model from %s', config['settings_file'])
        with open(config['settings_file'], 'rb') as sf:
            return dedupe.StaticDedupe(sf, num_cores=config['num_cores'])

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(config['all_fields'], num_cores=config['num_cores'])

    module_name, class_name = config['classifier'].rsplit(".", 1)
    module = importlib.import_module(module_name)
    cls = getattr(module, class_name)
    deduper.classifier = cls(**config['hyperparameters'])

    # Named cursor runs server side with psycopg2
    cur = con.cursor('individual_select')

    cur.execute("""SELECT {all_columns} FROM {schema}.entries_unique
                   ORDER BY _unique_id""".format(**config))
    temp_d = dict((i, row) for i, row in enumerate(cur))

    num_records = 75000
    logging.info('Creating sample of %s records', num_records)
    deduper.sample(temp_d, num_records)
    del temp_d

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    #
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(config['training_file']):
        logging.info('reading labeled examples from %s', config['training_file'])
        with open(config['training_file']) as tf:
            deduper.readTraining(tf)

    if config['prompt_for_labels']:
        # ## Active learning
        logging.info('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)
        # When finished, save our labeled, training pairs to disk
        with open(config['training_file'], 'w') as tf:
            deduper.writeTraining(tf)

    # `recall` is the proportion of true dupe pairs that the learned
    # rules must cover. You may want to reduce this if you are making
    # too many blocks and too many comparisons.
    deduper.train(recall=config['recall'])

    with open(config['settings_file'], 'wb') as sf:
        deduper.writeSettings(sf)

    return deduper
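# One possible shape for the `config` dict this train() expects, inferred from the keys
# it reads above; every value shown here is a placeholder assumption.
config = {
    'seed': 42,
    'use_saved_model': False,
    'settings_file': 'dedupe_settings',
    'training_file': 'dedupe_training.json',
    'num_cores': 4,
    'all_fields': [{'field': 'hash_ssn', 'type': 'Exact', 'has missing': True}],
    'classifier': 'sklearn.linear_model.LogisticRegression',
    'hyperparameters': {'C': 1.0},
    'all_columns': 'entry_id, hash_ssn, hash_fname, hash_lname, dob',
    'schema': 'dedupe',
    'prompt_for_labels': True,
    'recall': 0.9,
}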
def eval_lightgbm_dedupe(dataset_type: str, dataset_rf, dataset_dedupe):
    # Create deduper model
    with open('../../trained_models/combined_2/trained_{}_settings.json'.format(dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=NUM_CORES)

    cols = ['name', 'brand', 'size', 'product_type']
    to_dedupe = dataset_dedupe[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict, partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Create the record linkage model
    # Indexer
    indexer = rl.Index()
    indexer.add(rl.index.Block('product_type'))
    indexer.add(rl.index.Block('brand'))
    indexer.add(rl.index.Block('size'))
    candidate_links = indexer.index(dataset_rf)

    # Comparing
    compare_cl = rl.Compare(n_jobs=-1)
    compare_cl.exact('brand', 'brand')
    compare_cl.exact('product_type', 'product_type')
    compare_cl.exact('size', 'size')
    compare_cl.string('name', 'name', method='qgram')
    compare_cl.string('name', 'name', method='damerau_levenshtein')
    compare_cl.string('name', 'name', method='levenshtein')
    compare_cl.string('name', 'name', method='jarowinkler')
    compare_cl.string('name', 'name', method='smith_waterman')
    compare_cl.string('name', 'name', method='lcs')
    # compare_cl.string('price', 'price')

    # Features
    features = compare_cl.compute(candidate_links, dataset_rf)

    # Add dedupe features
    features['xy_same_entity'] = pd.Series(np.zeros(len(features)))
    features.xy_same_entity = 0.0

    # Save the result
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                k = (el[0][i], el[0][j])
                r_k = (el[0][j], el[0][i])
                p = el[1][i] * el[1][j]

                if k in features.index:
                    features.loc[k, 'xy_same_entity'] = p
                if r_k in features.index:
                    features.loc[r_k, 'xy_same_entity'] = p

    # Now load the lightgbm
    bst = lgb.Booster(model_file='../../trained_models/combined_2/x4_lgb_classifier.txt')

    # Predict
    confs = bst.predict(features)
    features['label'] = confs

    # Now export the left and right instance ids
    # Save the result
    res = []
    for i in range(len(confs)):
        record = features.iloc[i]
        label = record.label
        if label > 0.5:
            res.append(record.name)

    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df
def main(): reload(sys) sys.setdefaultencoding("utf-8") optp = optparse.OptionParser() optp.add_option('-v', '--verbose', dest='verbose', action='count', help='Increase verbosity (specify multiple times for more)' ) (opts, args) = optp.parse_args() log_level = logging.WARNING if opts.verbose == 1: log_level = logging.INFO elif opts.verbose >= 2: log_level = logging.DEBUG logging.getLogger().setLevel(log_level) settings_file = 'crm_person_deduping_settings' training_file = 'crm_person_deduping_training.json' start_time = time.time() server = "name_of_server" user = "******" password = "******" con = pymssql.connect(server, user, password, "ew_DataMatching", as_dict=True) con.autocommit(True) c = con.cursor() con2 = pymssql.connect(server, user, password, "ew_DataMatching") c2 = con2.cursor() # c2.execute("SET net_write_timeout = 3600") COMPANY_SELECT = "SELECT * from [vr_unmatched_person_remainder]" # ## Training # Define the fields dedupe will pay attention to if os.path.exists(settings_file): print 'reading from ', settings_file with open(settings_file, 'rb') as sf: deduper = dedupe.StaticDedupe(sf, num_cores=4) else: fields = [ {'field' : 'company_id', 'variable name' : 'company', 'type' : 'Exact'}, {'field' : 'fullname', 'variable name' : 'fullname', 'type' : 'Text', 'has missing' : True}, {'field' : 'lname', 'variable name' : 'Surname', 'type' : 'String'}, {'field' : 'initial', 'type' : 'Exact', 'has missing' : True}, {'field' : 'gender', 'type' : 'Exact', 'has missing' : True}, {'type' : 'Interaction', 'interaction variables' : ['fullname', 'company']}, # {'type' : 'Interaction', # 'interaction variables' : ['AbbrFullName', 'Gender']}, {'type' : 'Interaction', 'interaction variables' : ['Surname', 'company']} ] # Create a new deduper object and pass our data model to it. deduper = dedupe.Dedupe(fields, num_cores=4) # We will sample pairs from the entire unmatched-person view for training c.execute(COMPANY_SELECT) temp_d = dict((i, row) for i, row in enumerate(c)) # print temp_d deduper.sample(temp_d, 11000) del temp_d # If we have training data saved from a previous run of dedupe, # look for it and load it in. # # __Note:__ if you want to train from # scratch, delete the training_file if os.path.exists(training_file): print 'reading labeled examples from ', training_file with open(training_file) as tf: deduper.readTraining(tf) # ## Active learning print 'starting active labeling...' # Starts the training loop. Dedupe will find the next pair of records # it is least certain about and ask you to label them as duplicates # or not. # use 'y', 'n' and 'u' keys to flag duplicates # press 'f' when you are finished dedupe.convenience.consoleLabel(deduper) # Notice our two arguments here # # `ppc` limits the Proportion of Pairs Covered that we allow a # predicate to cover. If a predicate puts together a fraction of # possible pairs greater than the ppc, that predicate will be removed # from consideration. As the size of the data increases, the user # will generally want to reduce ppc. # # `uncovered_dupes` is the number of true duplicate pairs in our training # data that we are willing to accept will never be put into any # block. If true duplicates are never in the same block, we will never # compare them, and may never declare them to be duplicates. # # However, requiring that we cover every single true dupe pair may # mean that we have to use blocks that put together many, many # distinct pairs that we'll have to expensively compare as well. 
deduper.train(ppc=0.01, uncovered_dupes=5) # When finished, save our labeled training pairs to disk with open(training_file, 'w') as tf: deduper.writeTraining(tf) with open(settings_file, 'wb') as sf: deduper.writeSettings(sf) # We can now remove some of the memory hogging objects we used # for training deduper.cleanupTraining() ## Blocking print 'blocking...' # To run blocking on such a large set of data, we create a separate table # that contains blocking keys and record ids print 'creating blocking_map database' c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.blocking_map')) " "DROP TABLE ew_DataMatching.dedupe.blocking_map") c.execute("CREATE TABLE ew_DataMatching.dedupe.blocking_map " "(block_key VARCHAR(200), matching_id INT)" ) # If dedupe learned a TF-IDF blocking rule, we have to take a pass # through the data and create TF-IDF canopies. This can take up to an # hour print 'creating inverted index' for field in deduper.blocker.index_fields : c2.execute("SELECT %s FROM [vr_unmatched_person_remainder]" % field) field_data = (row[0] for row in c2) deduper.blocker.index(field_data, field) # Now we are ready to write our blocking map table by creating a # generator that yields unique `(block_key, matching_id)` tuples. print 'writing blocking map' c.execute(COMPANY_SELECT) full_data = ((row['matching_id'], row) for row in c) # for line in full_data: # id_number,data = line # if data['gender'] == '': # print id_number b_data = deduper.blocker(full_data) def dbWriter(sql, rows) : conn = pymssql.connect(server, user, password, "ew_DataMatching") cursor = conn.cursor() cursor.executemany(sql, rows) cursor.close() conn.commit() conn.close() dbWriter("INSERT INTO ew_DataMatching.dedupe.blocking_map VALUES (%s, %s)", b_data) print 'prepare blocking table. this will probably take a while ...' logging.info("indexing block_key") c.execute("CREATE INDEX blocking_map_key_idx ON dedupe.blocking_map (block_key)") c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[plural_key]')) DROP TABLE ew_DataMatching.dedupe.[plural_key]") c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[plural_block]')) DROP TABLE ew_DataMatching.dedupe.[plural_block]") c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[covered_blocks]')) DROP TABLE ew_DataMatching.dedupe.[covered_blocks]") c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[smaller_coverage]')) DROP TABLE ew_DataMatching.dedupe.[smaller_coverage]") # Many block_keys will only form blocks that contain a single # record. Since there are no comparisons possible within such a # singleton block we can ignore them. 
logging.info("calculating plural_key") c.execute("CREATE TABLE ew_DataMatching.dedupe.plural_key " "(block_key VARCHAR(200), " " block_id INT identity(1,1), " " PRIMARY KEY (block_id)) ") c.execute("INSERT INTO dedupe.plural_key(block_key) SELECT block_key FROM ew_DataMatching.dedupe.blocking_map " "GROUP BY block_key HAVING COUNT(*) > 1") logging.info("creating block_key index") c.execute("CREATE UNIQUE INDEX block_key_idx ON ew_DataMatching.dedupe.plural_key (block_key)") logging.info("calculating plural_block") c.execute("SELECT distinct block_id, matching_id into dedupe.plural_block" " FROM ew_DataMatching.dedupe.blocking_map INNER JOIN ew_DataMatching.dedupe.plural_key " " on blocking_map.block_key = plural_key.block_key order by block_id") logging.info("adding donor_id index and sorting index") c.execute("CREATE INDEX matching_id ON dedupe.plural_block (matching_id)") c.execute("CREATE UNIQUE INDEX idx_matching_id_2 on dedupe.plural_block (block_id, matching_id)") # To use Kolb, et.al's Redundant Free Comparison scheme, we need to # keep track of all the block_ids that are associated with a # particular donor records. Original code had group_concat function from MySQL, below code replaces this for SQL Server logging.info("creating covered_blocks") c.execute("SELECT matching_id," "dbo.trim_character(CAST((" "SELECT cast(block_id as varchar)+',' " "FROM ew_DataMatching.dedupe.plural_block T WHERE plural_block.matching_id = T.matching_id order by block_id FOR XML PATH(''))as varchar(max)),',') AS sorted_ids into dedupe.covered_blocks FROM ew_DataMatching.dedupe.plural_block " "GROUP BY matching_id") c.execute("CREATE UNIQUE INDEX donor_idx ON ew_DataMatching.dedupe.covered_blocks (matching_id)") # In particular, for every block of records, we need to keep # track of a donor records's associated block_ids that are SMALLER than # the current block's id. Code converted for SQL SERVER logging.info("creating smaller_coverage") c.execute("SELECT plural_block.matching_id, block_id, " "case when charindex(cast(block_id as varchar),sorted_ids,1) = 1 then null else substring(sorted_ids,1,charindex(cast(block_id as varchar),sorted_ids,1)-2) end" " AS smaller_ids INTO dedupe.smaller_coverage" " FROM ew_DataMatching.dedupe.plural_block INNER JOIN ew_DataMatching.dedupe.covered_blocks " " on plural_block.matching_id = covered_blocks.matching_id order by matching_id,block_id") con.commit() ## Clustering def candidates_gen(result_set) : lset = set block_id = None records = [] i = 0 for row in result_set : if row['block_id'] != block_id : if records : yield records block_id = row['block_id'] records = [] i += 1 if i % 10000 == 0 : print i, "blocks" print time.time() - start_time, "seconds" smaller_ids = row['smaller_ids'] if smaller_ids : smaller_ids = lset(smaller_ids.split(',')) else : smaller_ids = lset([]) records.append((row['matching_id'], row, smaller_ids)) #print records if records : yield records c.execute("SELECT MasterMatching.matching_id, fullname,gender, " " initial,lname,company_id," " block_id, smaller_ids " "FROM dedupe.smaller_coverage " "INNER JOIN [vr_unmatched_person_remainder] MasterMatching" " on MasterMatching.matching_id = smaller_coverage.matching_id order by block_id,matching_id") # for row in c: # if row["company"] == 'Cheshunt School & Technology College': # print('row = %r' % (row,)) print 'clustering...' 
# 0.5 Default clustered_dupes = deduper.matchBlocks(candidates_gen(c), threshold=0.5) ## Writing out results # We now have a sequence of tuples of record ids that dedupe believes # all refer to the same entity. We write this out onto an entity map # table c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[company_entity_map]')) DROP TABLE ew_DataMatching.dedupe.[company_entity_map]") print 'creating entity_map database' c.execute("CREATE TABLE ew_DataMatching.dedupe.company_entity_map " "(matching_id INTEGER, canon_id INTEGER, score FLOAT, " " PRIMARY KEY(matching_id))") for cluster, scores in clustered_dupes : cluster_id = cluster[0] for matching_id, score in zip(cluster, scores) : #print cluster_id, matching_id c.execute('INSERT INTO ew_DataMatching.dedupe.company_entity_map (matching_id, canon_id, score) VALUES (%s, %s, %s)', (int(matching_id), int(cluster_id), round(score, 2))) con.commit() c.execute("CREATE INDEX head_index ON ew_DataMatching.dedupe.company_entity_map (canon_id)") con.commit() # Print out the number of duplicates found print '# duplicate sets' print len(clustered_dupes) print 'ran in', time.time() - start_time, 'seconds'
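# The covered_blocks query in the script above calls a scalar helper, dbo.trim_character(),
# to strip the trailing comma left behind by the FOR XML PATH('') concatenation. That function
# is not defined anywhere in this listing; the sketch below shows one plausible definition
# (an assumption, not the original) that could be created once with the same pymssql cursor
# before the blocking step runs.
def create_trim_character(cursor):
    """Create the dbo.trim_character scalar function assumed by the covered_blocks query."""
    cursor.execute("""
        CREATE FUNCTION dbo.trim_character(@value VARCHAR(MAX), @char CHAR(1))
        RETURNS VARCHAR(MAX)
        AS
        BEGIN
            -- drop a single trailing delimiter, e.g. '1,2,3,' -> '1,2,3'
            IF RIGHT(@value, 1) = @char
                SET @value = LEFT(@value, LEN(@value) - 1);
            RETURN @value;
        END""")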
def _train(settings_file, training_file, data, field_properties, sample_size): """Internal method that trains the deduper model if a training file does not exist. Parameters ---------- settings_file : str A path to a settings file that will be loaded if it exists. training_file : str A path to a training file that will be loaded to keep training from. data : dict The dictionary form of the dataframe that dedupe requires. field_properties : dict The mapping of fields to their respective data types. Please see the dedupe documentation for further details. sample_size : float, default 0.3 Specify the sample size used for training as a float from 0 to 1. By default it is 30% (0.3) of our data. Returns ------- dedupe.Dedupe A dedupe model instance. """ # If a settings file already exists, we'll just load that and skip training if os.path.exists(settings_file): print('reading from', settings_file) with open(settings_file, 'rb') as f: deduper = dedupe.StaticDedupe(f) else: # ## Training # Define the fields dedupe will pay attention to fields = [] select_fields(fields, field_properties) # Create a new deduper object and pass our data model to it. deduper = dedupe.Dedupe(fields) # To train dedupe, we feed it a sample of records. sample_num = math.floor(len(data) * sample_size) deduper.sample(data, sample_num) # If we have training data saved from a previous run of dedupe, # look for it and load it in. # __Note:__ if you want to train from scratch, delete the training_file if os.path.exists(training_file): print('reading labeled examples from ', training_file) with open(training_file, 'rb') as f: deduper.readTraining(f) print('starting active labeling...') dedupe.consoleLabel(deduper) # Using the examples we just labeled, train the deduper and learn # blocking predicates deduper.train() # When finished, save our training to disk with open(training_file, 'w') as tf: deduper.writeTraining(tf) # Save our weights and predicates to disk. If the settings file # exists, we will skip all the training and learning next time we run # this file. with open(settings_file, 'wb') as sf: deduper.writeSettings(sf) return deduper
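# _train() delegates the field definitions to select_fields(fields, field_properties), which is
# not included in this excerpt. A plausible sketch follows, assuming field_properties maps a
# column name either to a dedupe type string or to a (type, 'has missing') pair, and that the
# passed-in `fields` list is filled in place; treat it as illustrative only.
def select_fields(fields, field_properties):
    for name, props in field_properties.items():
        if isinstance(props, str):
            fields.append({'field': name, 'type': props})
        else:
            # e.g. ('String', 'has missing') marks the column as optionally empty
            fields.append({'field': name, 'type': props[0], 'has missing': True})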
def run_dedupe(settings_file, training_file, type): start_time = time.time() print("dedupe file ", dedupe.__file__) # Set the database connection from environment variable using # [dj_database_url](https://github.com/kennethreitz/dj-database-url) # For example: # export DATABASE_URL=postgres://user:password@host/mydatabase db_conf = dj_database_url.config() if not db_conf: raise Exception( 'set DATABASE_URL environment variable with your connection, e.g. ' 'export DATABASE_URL=postgres://user:password@host/mydatabase') read_con = psycopg2.connect(database=db_conf['NAME'], user=db_conf['USER'], password=db_conf['PASSWORD'], host=db_conf['HOST'], port=db_conf['PORT'], cursor_factory=psycopg2.extras.RealDictCursor) write_con = psycopg2.connect(database=db_conf['NAME'], user=db_conf['USER'], password=db_conf['PASSWORD'], host=db_conf['HOST'], port=db_conf['PORT']) # We'll be using variations on this following select statement to pull # in campaign donor info. if type == 'IND': DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \ "from processed_donors where person = 1 and name not like '%unitem%' " else: DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \ "from processed_donors where person != 1 and name not like '%unitem%' " # Open Settings print('reading from ', settings_file) with open(settings_file, 'rb') as sf: deduper = dedupe.StaticDedupe(sf, num_cores=1) print(f'deduper predicates: {deduper.predicates}') blocking_table = 'blocking_map_' + type # ## Clustering entity_map_table = "entity_map_" + type with write_con: with write_con.cursor() as cur: cur.execute("DROP TABLE IF EXISTS " + entity_map_table) print('creating entity_map database') cur.execute("CREATE TABLE " + entity_map_table + " " "(donor_id INTEGER, canon_id INTEGER, " " cluster_score FLOAT, PRIMARY KEY(donor_id))") read_con1 = psycopg2.connect(database=db_conf['NAME'], user=db_conf['USER'], password=db_conf['PASSWORD'], host=db_conf['HOST'], port=db_conf['PORT'], cursor_factory=psycopg2.extras.RealDictCursor) with read_con1.cursor( 'pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur: read_cur.execute( "SELECT a.donor_id, " "row_to_json((SELECT d FROM (SELECT a.city, " "a.name, " "a.zip, " "a.state, " "a.street) d)), " "b.donor_id, " "row_to_json((SELECT d FROM (SELECT b.city, " "b.name, " "b.zip, " "b.state, " "b.street) d)) " "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west " "FROM " + blocking_table + " AS l " "INNER JOIN " + blocking_table + " AS r " "USING (block_key) " "WHERE l.donor_id < r.donor_id) ids " "INNER JOIN processed_donors a ON ids.east=a.donor_id " "INNER JOIN processed_donors b ON ids.west=b.donor_id") print('clustering...') clustered_dupes = deduper.cluster(deduper.score( record_pairs(read_cur)), threshold=0.5) # ## Writing out results # We now have a sequence of tuples of donor ids that dedupe believes # all refer to the same entity. 
We write this out onto an entity map # table print('writing results') write_con1 = psycopg2.connect(database=db_conf['NAME'], user=db_conf['USER'], password=db_conf['PASSWORD'], host=db_conf['HOST'], port=db_conf['PORT']) with write_con1: with write_con1.cursor() as write_cur: write_cur.copy_expert('COPY ' + entity_map_table + ' FROM STDIN WITH CSV', Readable(cluster_ids(clustered_dupes)), size=100000) with write_con1: with write_con1.cursor() as cur: cur.execute("CREATE INDEX head_index_" + type + " ON " + entity_map_table + " (canon_id)") # Print out the number of duplicates found # ## Payoff # With all this done, we can now begin to ask interesting questions # of the data # # For example, let's see who the top 10 donors are. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') # for pretty printing numbers read_con2 = psycopg2.connect(database=db_conf['NAME'], user=db_conf['USER'], password=db_conf['PASSWORD'], host=db_conf['HOST'], port=db_conf['PORT'], cursor_factory=psycopg2.extras.RealDictCursor) # save entity map entity_map_filename = 'entity_map_' + settings_file.split( '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv' donors_filename = 'processed_donors_' + settings_file.split( '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv' with read_con2.cursor() as cur: with open(entity_map_filename, 'w') as file_out: cur.copy_expert( 'COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER', file_out) with open(donors_filename, 'w') as file_out: cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER', file_out) if type == 'IND': top_donor_where = 'donors.corp is null' else: top_donor_where = 'donors.corp is not null' # Create a temporary table so each group and unmatched record has # a unique id with read_con2.cursor() as cur: cur.execute( "CREATE TEMPORARY TABLE e_map " "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id " "FROM " + entity_map_table + " " "RIGHT JOIN donors USING(donor_id)") cur.execute( "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, " "donation_totals.totals AS totals " "FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors INNER JOIN " "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals " " FROM contributions INNER JOIN e_map " " USING (donor_id) " " GROUP BY (canon_id) " " ORDER BY totals " " DESC) " "AS donation_totals ON donors.donor_id=donation_totals.canon_id " "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10" ) print("Top Donors (deduped)") for row in cur: row['totals'] = locale.currency(row['totals'], grouping=True) print('%(totals)20s: %(name)s' % row) # Compare this to what we would have gotten if we hadn't done any # deduplication cur.execute( "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, " "SUM(CAST(contributions.amount AS FLOAT)) AS totals " "FROM donors INNER JOIN contributions " "USING (donor_id) " "WHERE " + top_donor_where + " " "GROUP BY (donor_id) " "ORDER BY totals DESC) AS t where totals IS NOT NULL " "LIMIT 10") print("Top Donors (raw)") for row in cur: row['totals'] = locale.currency(row['totals'], grouping=True) print('%(totals)20s: %(name)s' % row) cur.execute( " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, " " cluster_size, cluster_id " " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors " " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS 
cluster_id " " FROM " + entity_map_table + " " " GROUP BY canon_id) " " AS cluster_totals " " ON donors.donor_id = cluster_totals.cluster_id " " ORDER BY cluster_size DESC LIMIT 10") # ##Print stats that are saved to match_runs table # Biggest cluster first print("Biggest Clusters") biggest_name = '' for row in cur: print('%(cluster_size)20s: %(name)s' % row) if biggest_name == '': biggest_name = row['name'] + ':' + str(row['cluster_id']) # Then total of donors if type == 'IND': processed_donors_where = '= 1' else: processed_donors_where = '!= 1' cur.execute( " SELECT count(*) AS num_donors FROM processed_donors WHERE person " + processed_donors_where) for row in cur: number_of_donors = row['num_donors'] # Then max, average and number of clusters cur.execute( " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters " " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, " " cluster_size, cluster_id " " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors" " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id " " FROM " + entity_map_table + " " " GROUP BY canon_id) " " AS cluster_totals " " ON donors.donor_id = cluster_totals.cluster_id " " ORDER BY cluster_size DESC) AS stats") # Print the stats print("stats...") print(f"total number of donors: {number_of_donors}") for row in cur: print( 'max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s' % row) biggest_size = row['biggest'] average_size = row['average'] number_of_clusters = row['number_of_clusters'] # write to the match_run table runtime = time.time() - start_time donor_cluster_ratio = number_of_donors / number_of_clusters with write_con1.cursor() as cur: cur.execute( """ INSERT INTO match_runs (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster, total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file) VALUES (NOW(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, (' '.join(str(pred) for pred in deduper.predicates), number_of_clusters, average_size, biggest_size, biggest_name, number_of_donors, type, runtime, donor_cluster_ratio, settings_file)) write_con1.commit() read_con.close() write_con.close() read_con1.close() write_con1.close() read_con2.close() print('ran in', runtime, 'seconds')
row_id = int(row['Id']) data_d[row_id] = dict(clean_row) row_id += 1 return data_d print('importing data ...') data_d = readData(input_file) # ## Training if os.path.exists(settings_file): print('reading from', settings_file) with open(settings_file, 'rb') as f: deduper = dedupe.StaticDedupe(f) else: # Define the fields dedupe will pay attention to fields = [ { 'field': 'name', 'type': 'String', 'has missing': True }, { 'field': 'address', 'type': 'String', 'has missing': True },
def process(): """entry point""" api = sesamclient.Connection(sesamapi_base_url=INSTANCE, jwt_auth_token=JWT, timeout=60 * 10) result = api.session.get( api.sesamapi_base_url + "publishers/{}/entities?deleted=false&history=false".format(SOURCE)) if result.status_code != 200: logging.warning('Sesam API returned non 200 code {}'.format( result.status_code)) iterator = iter([]) else: iterator = iter(result.json()) raw_data = [] for dataset_entity in iterator: raw_data.append(dataset_entity) # this is in memory service, for large datasets we probably may want to have service with DB storage if len(raw_data) > 100_000: logging.warning( 'This service is not suitable for datasets with more than 100 000 elements' ' and may lead to OutOfMemory errors') cleaned_data = read_data(raw_data) logging.info('reading from %s', SETTINGS_FILE) with open(SETTINGS_FILE, 'rb') as f: deduper = dedupe.StaticDedupe(f) threshold = deduper.threshold(cleaned_data, recall_weight=1) logging.info('clustering...') clustered_dupes = deduper.match(cleaned_data, threshold) logging.info('%d duplicate sets found', len(clustered_dupes)) cluster_membership = {} cluster_id = 0 for (cluster_id, cluster) in enumerate(clustered_dupes): id_set, scores = cluster cluster_d = [cleaned_data[c] for c in id_set] canonical_rep = dedupe.canonicalize(cluster_d) for record_id, score in zip(id_set, scores): cluster_membership[record_id] = { "cluster id": cluster_id, "canonical representation": canonical_rep, "confidence": score } result_dataset = [] for row in raw_data: row_id = row['_id'] if row_id in cluster_membership: result_dict = { '_id': row_id, 'cluster_id': cluster_membership[row_id]["cluster id"], 'confidence_score': cluster_membership[row_id]['confidence'] } if ADD_ORIGINALS: result_dict['originals'] = {key: row[key] for key in KEYS} if ADD_CANONICALS: result_dict['canonical_rep'] = cluster_membership[row_id][ "canonical representation"] result_dataset.append(result_dict) if TARGET is not None: target_url = api.sesamapi_base_url + "receivers/{}/entities".format( TARGET) requests.post(target_url, json.dumps(sorted(result_dataset, key=lambda k: k['cluster_id']), cls=NumpyEncoder), headers={ 'Authorization': 'Bearer {}'.format(JWT), 'Content-Type': 'application/json' }) return Response() return Response(json.dumps(sorted(result_dataset, key=lambda k: k['cluster_id']), cls=NumpyEncoder), mimetype='application/json')
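# process() serialises its result with json.dumps(..., cls=NumpyEncoder), but the encoder is not
# part of this excerpt. A common minimal implementation (an assumption, not necessarily the
# original) converts the numpy scalars and arrays that dedupe returns into plain Python types:
import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)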