def test_single_blocking_key(self):
    """BLOCKING: test class arguments."""

    # All of the following cases should return the same index.

    # situation 1
    index_cl1 = recordlinkage.BlockIndex('var_arange')
    pairs1 = index_cl1.index((self.a, self.b))

    # situation 2
    index_cl2 = recordlinkage.BlockIndex(on='var_arange')
    pairs2 = index_cl2.index((self.a, self.b))

    # situation 3
    index_cl3 = recordlinkage.BlockIndex(
        left_on='var_arange', right_on='var_arange')
    pairs3 = index_cl3.index((self.a, self.b))

    # situation 4
    index_cl4 = recordlinkage.BlockIndex(on=['var_arange'])
    pairs4 = index_cl4.index((self.a, self.b))

    # situation 5
    index_cl5 = recordlinkage.BlockIndex(
        left_on=['var_arange'], right_on=['var_arange'])
    pairs5 = index_cl5.index((self.a, self.b))

    # test
    ptm.assert_index_equal(pairs1, pairs2)
    ptm.assert_index_equal(pairs1, pairs3)
    ptm.assert_index_equal(pairs1, pairs4)
    ptm.assert_index_equal(pairs1, pairs5)
def compute_record_linkage(df_full):
    """Given the fully concatenated table of records, calculate which
    pairs are address matches, using fuzzy matching on street name,
    unit number and zip code.
    """
    print("Setting up blocking for pairwise comparisons")
    blocking_indices = [
        recordlinkage.BlockIndex(on="APN (int)"),
        recordlinkage.BlockIndex(
            on=["Address Number (float)", "Zip Code (int)"]),
    ]

    print("Finding blocked pairs")
    pairs = None
    for df_subset in tqdm(np.array_split(df_full, 10)):
        for bi in blocking_indices:
            _new_pairs = bi.index(df_full, df_subset)
            if pairs is not None:
                pairs = pairs.union(_new_pairs)
            else:
                pairs = _new_pairs

    print("Setting up similarity calculations")
    compare_cl = recordlinkage.Compare()
    compare_cl.exact('APN (int)', 'APN (int)', label='APN')
    compare_cl.exact('Zip Code (int)', 'Zip Code (int)', label='Zip')
    compare_cl.exact('Address Number (float)', 'Address Number (float)',
                     label='number')
    # compare_cl.numeric('Address Number (float)', 'Address Number (float)',
    #                    offset=3, scale=2,
    #                    label='number')
    compare_cl.string('Street Name', 'Street Name', method='levenshtein',
                      threshold=0.9, label='street')

    print("Calculating similarities")
    features = compare_cl.compute(pairs, df_full)
    features.to_pickle(FEATURES_OUTPUT_FILE)
    return features
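A hypothetical usage sketch for the function above, not a tested pipeline; the toy rows are illustrative, and FEATURES_OUTPUT_FILE is assumed to be defined at module level as in the original script.

# Hypothetical usage: a tiny concatenated table with the column names
# the function above expects. Real inputs would be much larger.
import pandas as pd

df_full = pd.DataFrame({
    "APN (int)": [101, 101, 205, 205],
    "Address Number (float)": [12.0, 12.0, 99.0, 99.0],
    "Zip Code (int)": [60601, 60601, 60602, 60602],
    "Street Name": ["MAIN ST", "MAIN STREET", "OAK AVE", "OAK AVENUE"],
})

features = compute_record_linkage(df_full)
print(features.head())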
def time_block_index(self):

    # setup class
    c_pairs = rl.BlockIndex('given_name')

    # make pairs
    c_pairs.index(self.A)
def __init__(self, data, new_data, name, n_sample=50000, bin_size=500,
             block=None, method='jarowinkler', threshold=0.93):
    super(Linkage, self).__init__()
    self.data = data
    self.new_data = pd.DataFrame(new_data)
    self.n_sample = n_sample
    self.bin_size = bin_size
    self.name = name
    self.block = block
    self.method = method
    self.threshold = threshold
    self.sample = data.sample(n=self.n_sample, replace=False,
                              random_state=0)

    if self.block is not None:
        self.indexer = recordlinkage.BlockIndex(on=self.block)
    else:
        self.indexer = recordlinkage.FullIndex()

    self.compare_cl = recordlinkage.Compare(n_jobs=4)
    self.compare_cl.string(self.name, self.name, method=self.method,
                           threshold=self.threshold, label=self.name)

    # np.split needs an integer number of sections
    n_bins = round(self.n_sample / self.bin_size)

    if self.new_data.empty:
        self.List = list(itertools.combinations(
            np.split(self.sample, indices_or_sections=n_bins), 2))
    else:
        self.List = list(np.split(self.sample,
                                  indices_or_sections=n_bins))

    self.results_df = pd.DataFrame(
        columns=['pairs', 'company_1', 'company_2'])
    self.results_tmp = None
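A hypothetical instantiation of the class; the file name and column names are illustrative assumptions, and the data is expected to have at least n_sample rows.

# Illustrative usage: dedupe a company register on its name column,
# blocking on zip code. All names here are assumptions.
import pandas as pd

companies = pd.read_csv("companies.csv")  # assumed to have >= 5000 rows

linker = Linkage(data=companies,
                 new_data=[],            # empty: deduplication mode
                 name="company_name",    # column compared with jarowinkler
                 n_sample=5000,
                 bin_size=500,
                 block="zipcode")        # BlockIndex on this column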
def block_and_match(df, block_variable, comparison_dict,
                    match_threshold=MATCH_THRESHOLD,
                    string_method="damerau_levenshtein",
                    string_threshold=STRING_THRESHOLD):
    """Use recordlinkage to block on one variable and compare on others."""
    indexer = rl.BlockIndex(on=block_variable)
    pairs = indexer.index(df)
    compare = rl.Compare()
    for k, v in comparison_dict.items():
        if v == "string":
            compare.string(k, k, method=string_method,
                           threshold=string_threshold, label=k,
                           missing_value=np.nan)
        if v == "date":
            compare.date(k, k, label=k, missing_value=np.nan)
    features = compare.compute(pairs, df)
    features["mean"] = features.mean(axis=1, skipna=True)
    features["match"] = features["mean"] > match_threshold
    return features
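A hedged example call, assuming MATCH_THRESHOLD and STRING_THRESHOLD are defined as module constants; the column names and values are illustrative.

# Hypothetical call: block on zip code, fuzzy-match names, compare
# dates. Toy data for illustration only.
import pandas as pd

people = pd.DataFrame({
    "zip": ["60601", "60601", "60602"],
    "name": ["Jon Smith", "John Smith", "Ann Lee"],
    "dob": pd.to_datetime(["1980-01-01", "1980-01-01", "1990-05-05"]),
})

features = block_and_match(
    people,
    block_variable="zip",
    comparison_dict={"name": "string", "dob": "date"},
    match_threshold=0.8,
)
print(features[features["match"]])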
def blockingMatching():
    # First csv table, with the appropriate encoding
    dfA = pd.read_csv("all_gamestop.csv", encoding='utf-8')
    # Second csv table, with the appropriate encoding
    dfB = pd.read_csv("jnl_half_edit2.csv", encoding='utf-8')

    # Block on the "title" column, because it is the column where the
    # likelihood of matching needs to be determined
    block_class = rl.BlockIndex(on="title")

    # Index the blocks ["title"] from both tables
    candidate_links = block_class.index(dfA, dfB)

    # Use the Compare() class from the recordlinkage library
    # to set up the comparison
    compare = rl.Compare()

    # First comparison: string similarity. Since the string structure
    # may differ, the jarowinkler method is used with a threshold
    # of 0.85
    compare.string("title", "title", method='jarowinkler', threshold=0.85,
                   label="Title")

    # The console value will be the same in both sets, so compare
    # the strings with the exact method
    compare.exact('console', 'console', label="Console")

    # Compute the comparison over both record sets to find the matches,
    # storing the result in a feature vector
    compare_vectors = compare.compute(candidate_links, dfA, dfB)

    # Keep records that likely belong to the same entity
    matches = compare_vectors[compare_vectors.sum(axis=1) > 1]

    # Call the ShowMatching function to assemble the matches
    # into the same set
    match_records = ShowMatching(dfA, dfB, matches)

    # Write the matches to a csv file
    np.savetxt("Matching.csv", match_records, delimiter=",",
               fmt="%s, %s, %s, %s, %s, %s, %s, %s")

    # Get the match indexes from the matches vector
    df2 = pd.DataFrame(matches)

    # Put the indexes in a csv file
    df2.to_csv("index_of_matching.csv")
def test_multiple_blocking_keys(self):
    """BLOCKING: test multiple blocking keys"""

    # All of the following cases should return the same index.

    # situation 1
    index_cl1 = recordlinkage.BlockIndex(['var_arange', 'var_block10'])
    pairs1 = index_cl1.index((self.a, self.b))

    # situation 2
    index_cl2 = recordlinkage.BlockIndex(
        left_on=['var_arange', 'var_block10'],
        right_on=['var_arange', 'var_block10'])
    pairs2 = index_cl2.index((self.a, self.b))

    # test
    ptm.assert_index_equal(pairs1, pairs2)
def test_blocking_algorithm_link(self):
    """BLOCKING: test blocking algorithm for linking"""

    # situation 1: eye index
    index_cl1 = recordlinkage.BlockIndex(on='var_arange')
    pairs1 = index_cl1.index((self.a, self.b))
    self.assertEqual(len(pairs1), len(self.a))
    self.assertTrue(pairs1.is_unique)

    # situation 2: 10 blocks
    index_cl2 = recordlinkage.BlockIndex(on='var_block10')
    pairs2 = index_cl2.index((self.a, self.b))
    self.assertEqual(len(pairs2), len(self.a) * 10)
    self.assertTrue(pairs2.is_unique)

    # situation 3: full index
    index_cl3 = recordlinkage.BlockIndex(on='var_single')
    pairs3 = index_cl3.index((self.a, self.b))
    self.assertEqual(len(pairs3), len(self.a) * len(self.b))
    self.assertTrue(pairs3.is_unique)
def test_blocking_algorithm_dedup(self):
    """BLOCKING: test blocking algorithm for deduplication"""

    len_a = len(self.a)

    # situation 1: eye index
    index_cl1 = recordlinkage.BlockIndex(on='var_arange')
    pairs1 = index_cl1.index(self.a)
    self.assertEqual(len(pairs1), 0)
    self.assertTrue(pairs1.is_unique)

    # situation 2: 10 blocks
    index_cl2 = recordlinkage.BlockIndex(on='var_block10')
    pairs2 = index_cl2.index(self.a)
    self.assertEqual(len(pairs2), (len_a * 10 - len_a) / 2)
    self.assertTrue(pairs2.is_unique)

    # situation 3: full index
    index_cl3 = recordlinkage.BlockIndex(on='var_single')
    pairs3 = index_cl3.index(self.a)
    self.assertEqual(len(pairs3), (len_a * len_a - len_a) / 2)
    self.assertTrue(pairs3.is_unique)
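The expected counts above follow from the handshake formula: a block with k records contributes k(k-1)/2 within-block pairs in deduplication. A quick arithmetic sketch (not part of the test suite) for the 10-block case:

# Pair-count arithmetic used in the test above: n records spread
# evenly over b blocks give b * (n/b) * (n/b - 1) / 2 within-block
# pairs for deduplication.
n, b = 100, 10
k = n // b                        # records per block
pairs_per_block = k * (k - 1) // 2
total = b * pairs_per_block       # 10 * 45 = 450
assert total == (n * 10 - n) / 2  # the expression used in the test
print(total)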
def get_pairs(self):
    # Union the candidate pairs produced by each blocking key.
    combined_pairs = None
    for block in self.block_on:
        indexer = rl.BlockIndex(on=block)
        pairs = indexer.index(self.dfA, self.dfB)
        if combined_pairs is None:
            # first block
            combined_pairs = pairs
        else:
            # more than one block
            combined_pairs = combined_pairs.union(pairs)
    return combined_pairs
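A standalone sketch of the multi-block union idiom used above, with illustrative column names and toy data:

# Toy demonstration: blocking on several keys and taking the union of
# the resulting candidate pairs. Column names are assumptions.
import pandas as pd
import recordlinkage as rl

dfA = pd.DataFrame({"zip": ["1", "2"], "city": ["x", "y"]})
dfB = pd.DataFrame({"zip": ["1", "3"], "city": ["y", "x"]})

combined = None
for key in ["zip", "city"]:
    pairs = rl.BlockIndex(on=key).index(dfA, dfB)
    combined = pairs if combined is None else combined.union(pairs)

print(len(combined))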
def table_length():
    # First csv table, with the appropriate encoding
    dfA = pd.read_csv("all_gamestop.csv", encoding='utf-8')
    # Second csv table, with the appropriate encoding
    dfB = pd.read_csv("jnl_half_edit2.csv", encoding='utf-8')
    # dfC = pd.read_csv("Matching.csv", encoding='utf-8')

    indexer = rl.BlockIndex(on='title')
    pairs = indexer.index(dfA, dfB)

    print("DataFrame1:", len(dfA))
    print("DataFrame2:", len(dfB))
    print("Pairs:", len(pairs))
dfFile2 = tools.load_file_as_df(config.common.filename_2, fieldnames)
perfect_match_index = tools.load_perfect_match_as_index(
    config.common.filename_perfect_match)

# for each config item
for index, config_item in enumerate(config.items):
    # init Random with a fixed seed (for reproducibility)
    tools.init_random_with_seed()

    print("Indexing")
    if config_item.index_type == "sorted_neighbourhood":
        indexer = rl.SortedNeighbourhoodIndex(
            config_item.index_field_name,
            window=config_item.sorted_neighborhood_window)
    elif config_item.index_type == "block":
        indexer = rl.BlockIndex(config_item.index_field_name)
    elif config_item.index_type == "canopy":
        indexer = tools.CanopyClusterIndex(
            config_item.index_field_name,
            threshold_add=config_item.canopy_threshold_add,
            threshold_remove=config_item.canopy_threshold_remove)
    elif config_item.index_type == "full":
        indexer = tools.FullIndex(config_item.index_field_name)
    else:
        raise ValueError(
            "index_type {0} is invalid: must be sorted_neighbourhood, "
            "block, canopy or full".format(config_item.index_type))

    pairs_index = indexer.index(dfFile1, dfFile2)
    print("Comparing {0} Pairs".format(pairs_index.size))
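A sketch of what one config.items entry might provide for the dispatch above; only the attributes the loop reads are shown, and the values are illustrative.

# Illustrative config item for the index-type dispatch above.
from types import SimpleNamespace

config_item = SimpleNamespace(
    index_type="block",            # one of: sorted_neighbourhood,
                                   # block, canopy, full
    index_field_name="surname",    # column used as the blocking key
    sorted_neighborhood_window=5,  # only read for sorted_neighbourhood
    canopy_threshold_add=0.8,      # only read for canopy
    canopy_threshold_remove=0.2,   # only read for canopy
)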
import os
import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

TEST_INDEXATION_OBJECTS = [
    param(recordlinkage.FullIndex()),
    param(recordlinkage.BlockIndex(on='var_arange')),
    param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=False)),
]


class TestData(unittest.TestCase):
    """Unittest object to set up test data."""

    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150

        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
import recordlinkage
# import sys
import csv
import numpy as np
import pandas
from matplotlib import pyplot

# note: 'pd' here is a DataFrame, not the usual pandas alias
pd = pandas.read_csv("A PubmedData.csv")
ct = pandas.read_csv("A ClinicalTrialsData.csv")
# print(pd)
print("\n")

# indexer = recordlinkage.FullIndex()
# pairs = indexer.index(pd, ct)

indexer = recordlinkage.BlockIndex(on='Title')
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Title Through BlockIndex")
print(len(pd), len(ct), len(pairs))
print("\n")

indexer = recordlinkage.BlockIndex(on='Abstract')
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Abstract Through BlockIndex")
print(len(pd), len(ct), len(pairs))
print("\n")

indexer = recordlinkage.SortedNeighbourhoodIndex(on="Title")
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Title Through SortedNeighbourhoodIndex")
print(len(pd), len(ct), len(pairs))
"""Deduplication starting from a comma-separated values (CSV) file.

We start with a CSV file containing our messy data. In this example, it
is listings of patient information, having ln, dob, gn, fn as its
fields, compiled from several different sources. The output will be a
CSV with our clustered results.
"""
import recordlinkage as rl
import pandas as pd
import numpy as np

# The CSV file is read into a dataframe dfA
dfA = pd.read_csv("Deduplication Problem - Sample Dataset.csv",
                  encoding="utf8")

# Making record pairs using an indexing method: blocking.
# Returns record pairs having the same values for gender and date of birth
indexer = rl.BlockIndex(on=['dob', 'gn'])
pairs = indexer.index(dfA)

# The record pairs are compared using various methods for the
# corresponding fields. Attribute comparisons are stored in a
# DataFrame called features thereafter
compare_cl = rl.Compare()
compare_cl.string('fn', 'fn', method='jarowinkler', threshold=0.85,
                  label='fn')
compare_cl.string('ln', 'ln', method='jarowinkler', threshold=0.85,
                  label='ln')
compare_cl.exact('dob', 'dob', label='dob')
compare_cl.exact('gn', 'gn', label='gn')
features = compare_cl.compute(pairs, dfA)

# To decide which records belong to the same person
matches = features[features.sum(axis=1) > 3]

# To produce the CSV file having no duplicate/similar entries
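The snippet stops before the final step; one possible completion (the keep-first drop rule is an assumption, not from the source):

# One possible completion (assumption: keep the first record of each
# matched pair and drop the second). 'matches' carries a two-level
# index of record-pair labels from dfA.
duplicate_ids = matches.index.get_level_values(1).unique()
deduped = dfA.drop(duplicate_ids)
deduped.to_csv("Deduplicated Dataset.csv", index=False)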
@author: VictorO
"""
# importing the required libraries
import pandas as pd
import recordlinkage as rl
import os
import numpy as np

# Setting the working directory
os.chdir('M:\\Projects\\Record Linkage\\Python\\data\\')

# importing the data
dfA = pd.read_csv('for_linkage_data1.csv', sep=',', encoding='utf-8',
                  index_col='dummy_id')

# Indexation step - Block Index.
# Pass a list to block on several variables at once. Note that
# on='surname' and 'gender' would NOT do this: the `and` expression
# simply evaluates to its last operand.
indexer = rl.BlockIndex(on=['surname', 'gender', 'date_of_birth'])
pairs = indexer.index(dfA)

# Printing the number of pairs created by the blocking step
print(len(pairs))

# Doing the comparison
compare_cl = rl.Compare(pairs, dfA)
compare_cl.string('firstname', 'firstname', method='jarowinkler',
                  threshold=0.85, name='firstname')
compare_cl.string('surname', 'surname', method='jarowinkler',
                  threshold=0.85, name='surname')
compare_cl.exact('initial', 'initial', name='initial')
compare_cl.exact('gender', 'gender', name='gender')
compare_cl.exact('dob_day', 'dob_day', name='dob_day')
compare_cl.exact('dob_month', 'dob_month', name='dob_month')
compare_cl.exact('dob_year', 'dob_year', name='dob_year')
compare_cl.exact('facility_code', 'facility_code', name='facility_code')
DF = DF.loc[DF.Province == PR]
N1 = len(DF)
print(N1, 'records to match in', PR)

DF['Number'] = DF['Number'].astype(int).astype(str)

ADD = pd.read_csv(r"Output_OA_STD_Files\\" + PR + '_OA_STD.csv',
                  low_memory=False)
ADD[NUMBER_COL] = ADD[NUMBER_COL].astype(str)
ADD = ADD.reset_index()
DF = DF.reset_index()
DF = DF.fillna('')
DF['Postal_Code'] = DF['Postal_Code'].str.replace(' ', '').str.upper()

indexer = recordlinkage.BlockIndex(left_on=['Number'],
                                   right_on=[NUMBER_COL])
M_idx = indexer.index(DF, ADD)
# print(len(M_idx))

# DF['Street_Add'] = (DF['StdOpAddressStreetName'] + ' '
#                     + DF['StdOpAddressStreetType'] + ' '
#                     + DF['StdOpAddressStreetDir'])
# DF['Street_Add'] = DF['Street_Add'].str.replace('  ', ' ')
# DF['Street_Add'] = DF['Street_Add'].str.replace('  ', ' ')

if PR != 'QC':
    DF = AddressClean_en(DF, 'Street', 'Street')
    DF = Type_Drop_en(DF, 'Street', 'Street_Name')
    ADD = Type_Drop_en(ADD, STREET_COL, 'STREET_NAME')
else:
    DF = AddressClean_fr(DF, 'Street', 'Street')
    DF = Type_Drop_fr(DF, 'Street', 'Street_Name')
    ADD = Type_Drop_fr(ADD, STREET_COL, 'STREET_NAME')
def link_datasets(yelp_results, dj_df, df_type="wages"):
    """
    (Assisted by Record Linkage Toolkit library and documentation)

    This function compares the Yelp query results to database results
    and produces the best matches based on computing the qgram score.
    Depending on the specific database table characteristics, the qgram
    calculation will be between the zip_code, business name, address
    strings, latitude, longitude, or a combination of those
    characteristics.

    Inputs:
        - yelp_results: a pandas dataframe of yelp business results
          based on a user's input
        - dj_df: a pandas dataframe of django results, e.g. labour
          statistics, healthcode violations, Divvy, etc.
        - df_type: a string naming which specific dataframe is being
          passed to be compared to the Yelp results

    Outputs:
        - link: a tuple containing the indices of the Yelp query
          results dataframe and the database dataframe AND the best
          matches' qgram scores
    """
    # set thresholds for comparing strings using the qgram method
    name_thresh = 0.55
    addr_thresh = 0.55
    strong_addr_thresh = 0.90

    # initialize a Record Linkage comparison object
    compare = rl.Compare()

    # Labour & Food data comparisons to Yelp are made on zip, business
    # name, and address
    if df_type == "wages" or df_type == "food":
        indexer = rl.BlockIndex(on='zip_code')  # block on zip code
        compare.numeric('zip_code', 'zip_code', method='linear',
                        scale=30.0, label='zip_score')
        compare.string('name', 'name', method='qgram',
                       threshold=name_thresh, label='name_score')
        compare.string('addr', 'addr', method='qgram',
                       threshold=addr_thresh, label='addr_score')
    # Environmental data comparisons to Yelp are made on address
    elif df_type == "enviro":
        indexer = rl.FullIndex()  # no blocking available
        compare.string('addr', 'addr', method='qgram',
                       threshold=strong_addr_thresh, label='addr_score')
    # all other data comparisons to Yelp
    else:
        indexer = rl.FullIndex()

    pairs = indexer.index(yelp_results, dj_df)

    # In addition to the comparisons above, ALL data sets are also
    # compared to Yelp on latitude and longitude
    compare.geo('latitude', 'longitude', 'latitude', 'longitude',
                method='linear', scale=30.0, label='coord_score')

    # compute record linkage scores
    features = compare.compute(pairs, yelp_results, dj_df)

    # set classification thresholds
    zip_classif_thresh = 1.0
    addr_classif_thresh = 1.0
    coord_classif_thresh = 0.99
    name_classif_thresh = 1.0

    # Classification and final filtering
    if df_type == "wages" or df_type == "food":
        best_matches = features[
            (features['zip_score'] == zip_classif_thresh)
            & (features['name_score'] == name_classif_thresh)
            & (features['addr_score'] == addr_classif_thresh)
            & (features['coord_score'] >= coord_classif_thresh)]
    elif df_type == "enviro":
        best_matches = features[
            (features['addr_score'] == addr_classif_thresh)
            & (features['coord_score'] >= coord_classif_thresh)]
    else:
        best_matches = features[
            features['coord_score'] >= coord_classif_thresh]

    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create a tuple of the indices and the best-matches df
    link = (index_array, best_matches)

    return link
# Script for the Deduplication of Records
import numpy as np
import csv
import recordlinkage as rl
import pandas as pd

# Reading the CSV file into a DataFrame of unicode strings
f1 = pd.read_csv('Deduplication Problem - Sample Dataset.csv')
data1 = pd.DataFrame(data=f1, dtype='U')

# Making record pairs for similar data in various fields
block_class = rl.BlockIndex(on=["ln", "dob", "gn", "fn"])
block_pairs = block_class.index(data1)

# Comparing each record pair with the data
compare_class = rl.Compare()
compare_class.string('ln', 'ln', method="jarowinkler", threshold=0.85)
compare_class.exact('dob', 'dob')
compare_class.exact('gn', 'gn')
compare_class.string('fn', 'fn', method="jarowinkler", threshold=0.85)
compare_result = compare_class.compute(block_pairs, data1)

# Filtering the records further by comparing the columns matched
matches = compare_result[compare_result.sum(axis=1) >= 3]
# start with the two REACH datasets
print(len(df_reach18_ref) * len(df_reach17_ref))  # = 516,060 possible pairs

# df_reach18_ref['province'].value_counts()
# df_reach17_ref['province'].value_counts()
# df_reach18_ref['district'].value_counts()
# df_reach17_ref['district'].value_counts()

# Blocking - only compare pairs of records that are identical on some
# attributes
blocking_vars = ["province", "displacement_year"]
# also available: 'displacement_month', 'arrival_month', 'hoh_sex',
# 'hoh_age', 'hh_size', 'hh_families'
indexer = recordlinkage.BlockIndex(on=blocking_vars)
candidate_links = indexer.index(df_reach18_ref, df_reach17_ref)
print(len(candidate_links))  # = 134,220

# =============================================================================
# Compare records
# =============================================================================

# initialise class
comp = recordlinkage.Compare()

# initialise similarity measurement algorithms
# displacement info
comp.string('district', 'district', method='jarowinkler', label='district')
comp.numeric('displacement_month',
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # note: it is not possible to sort a MultiIndex, so a frame is
        # made out of it.

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_input_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non-unique index
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=False))
    ])
    def test_index_names_dedup(self, index_class):

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))
            self.assertEqual(pairs.names, expected[i])
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=False))
    ])
    def test_index_names_link(self, index_class):

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100,
                                        replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir,
                                   'pickle_compare_obj.pickle')

        # pickle before indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))
import recordlinkage
import pandas as pd

path = "C:/Users/DELL/Desktop/Data-de-duplication/Deduplication Problem - Sample Dataset.csv"
df = pd.read_csv(path)
print(df)

indexer = recordlinkage.FullIndex()
pairs = indexer.index(df)
print(len(df), len(pairs))
# (1000*1000-1000)/2 = 499500
print(pairs)

indexer = recordlinkage.BlockIndex(on=['dob', 'gn'])
pairs = indexer.index(df)
print(len(pairs))

# This cell can take some time to compute.
compare_cl = recordlinkage.Compare()

compare_cl.string('ln', 'ln', label='last_name')
compare_cl.string('fn', 'fn', method='jarowinkler', threshold=0.85,
                  label='first_name')
compare_cl.exact('dob', 'dob', label='date_of_birth')
compare_cl.exact('gn', 'gn', label='gender')
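The excerpt ends before the comparison is run; a minimal sketch of the usual next steps, with the 3-attribute threshold borrowed as an assumption from the similar deduplication snippets above:

# Run the comparisons over the blocked pairs and keep likely duplicates.
# The threshold of 3 matching attributes is an assumption.
features = compare_cl.compute(pairs, df)
matches = features[features.sum(axis=1) >= 3]
print(len(matches))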
# # However, as we have seen in the previous notebook, we might want to consider how strict we want our matching to be. For example, we want to make sure that we catch any typos or common misspellings, but we want to avoid relaxing the matching condition to the point that anything will match anything.
# markdown
# ### Indexing
#
# Indexing allows us to create candidate links, which basically means identifying pairs of data rows which might refer to the same real-world entity. This is also called the comparison space (matrix). There are different ways to index data. The easiest is to create a full index and consider every pair a match. This is also the least efficient method, because we will be comparing every row of one dataset with every row of the other dataset.
#
# If we had 10,000 records in data frame A and 100,000 records in data frame B, we would have 1,000,000,000 candidate links. You can see that comparing over a full index becomes inefficient when working with big data.
# markdown
# We can do better if we actually use our knowledge about the data to eliminate bad links from the start. This can be done through blocking. The `recordlinkage` package gives you multiple options for this. For example, you can block on variables, which means only links exactly equal on the specified values will be kept.
# markdown
# Here we will start by blocking on `city` and `state` to narrow down the number of candidate links.
# markdown
# You can try and see how the number of candidate links changes when blocking on more or fewer variables, as in the comparison sketch after this cell.

indexerBL = rl.BlockIndex(on=['state', 'city'])
candidate_links = indexerBL.index(inventors, pi)

# len(candidate_links)
# candidate_links[:10]

# markdown
# Let's check the first pair of candidate links blocked on city and state: (0, 85)

# inventors.iloc[0]
# pi.iloc[85]

# # Now, in addition to blocking on city and state, we can also try blocking on first name
indexerBL = rl.BlockIndex(on=['name_first', 'city', 'state'])
candidate_links = indexerBL.index(inventors, pi)
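A short sketch contrasting the sizes of the full and blocked comparison spaces for the two frames above; the printed numbers depend on the actual data:

# Illustrative comparison of comparison-space sizes. A full index over
# the two frames yields len(inventors) * len(pi) candidate links, while
# blocking keeps only pairs that agree exactly on state and city.
full_size = len(inventors) * len(pi)
blocked = rl.BlockIndex(on=['state', 'city']).index(inventors, pi)
print("full index: {:,} candidate links".format(full_size))
print("blocked:    {:,} candidate links".format(len(blocked)))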
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing
    if preproc == 0:
        print("No preprocessing")
        field_suffix = ""
        preproc_description = "none"
    elif preproc == 1:
        print("Cleaned fields")
        field_suffix = "_clean"
        preproc_description = "clean"
    elif preproc == 2:
        print("Soundex encoding")
        field_suffix = "_soundex"
        preproc_description = "soundex"
    elif preproc == 3:
        print("Nysiis encoding")
        field_suffix = "_nysiis"
        preproc_description = "nysiis"
    elif preproc == 4:
        print("Metaphone encoding")
        field_suffix = "_metaphone"
        preproc_description = "metaphone"
    elif preproc == 5:
        print("Match-rating encoding")
        field_suffix = "_match_rating"
        preproc_description = "match_rating"
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # comparator
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        comp.add(compare.Exact('title' + field_suffix,
                               'title' + field_suffix))
        comp.add(compare.Exact('authors' + field_suffix,
                               'authors' + field_suffix))
        comp.add(compare.Exact('venue' + field_suffix,
                               'venue' + field_suffix))
    elif comparison_variant == 1:
        comp_description = "levenshtein"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix,
                                method='levenshtein'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix,
                                method='levenshtein'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix,
                                method='levenshtein'))
    elif comparison_variant == 2:
        comp_description = "damerau_levenshtein"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix,
                                method='damerau_levenshtein'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix,
                                method='damerau_levenshtein'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix,
                                method='damerau_levenshtein'))
    elif comparison_variant == 3:
        comp_description = "jaro"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix, method='jaro'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix, method='jaro'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix, method='jaro'))
    elif comparison_variant == 4:
        comp_description = "jarowinkler"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix,
                                method='jarowinkler'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix,
                                method='jarowinkler'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix,
                                method='jarowinkler'))
    elif comparison_variant == 5:
        comp_description = "qgram"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix, method='qgram'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix, method='qgram'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix, method='qgram'))
    elif comparison_variant == 6:
        comp_description = "cosine"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix, method='cosine'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix,
                                method='cosine'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix, method='cosine'))
    elif comparison_variant == 7:
        comp_description = "smith_waterman"
        comp.add(compare.String('title' + field_suffix,
                                'title' + field_suffix,
                                method='smith_waterman'))
        comp.add(compare.String('authors' + field_suffix,
                                'authors' + field_suffix,
                                method='smith_waterman'))
        comp.add(compare.String('venue' + field_suffix,
                                'venue' + field_suffix,
                                method='smith_waterman'))
    else:
        raise ValueError(f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))

    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save compare time for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % time_compare)

    matches = []
    for classifier_description in ['logreg', 'bayes', 'svm', 'kmeans',
                                   'ecm']:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        if classifier_description == 'logreg':
            print("Logistic Regression classifier")
            classifier = recordlinkage.LogisticRegressionClassifier()
            supervised = True
        elif classifier_description == 'bayes':
            print("Naive Bayes classifier")
            classifier = recordlinkage.NaiveBayesClassifier(binarize=0.75)
            supervised = True
        elif classifier_description == 'svm':
            print("Support Vector Machine classifier")
            classifier = recordlinkage.SVMClassifier()
            supervised = True
        elif classifier_description == 'kmeans':
            print("KMeans classifier")
            classifier = recordlinkage.KMeansClassifier()
            supervised = False
        elif classifier_description == 'ecm':
            print("ECM classifier")
            classifier = recordlinkage.ECMClassifier(binarize=0.75)
            supervised = False
        else:
            raise ValueError(
                f"Unknown classifier variant {classifier_description}")

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0

        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
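A hedged invocation sketch; the globals the function reads (dataDBLP_train, dataScholar_train, links_train, debug, and their test counterparts) must already be loaded:

# Illustrative call: block on 'year' (win_len=0), cleaned fields
# (preproc=1), jarowinkler comparison (comparison_variant=4),
# running only the SVM classifier.
matches = run_experiment(win_len=0, preproc=1, comparison_variant=4,
                         run_only='svm')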
@author: Tony
"""
# Refer to http://recordlinkage.readthedocs.io/en/latest/notebooks/data_deduplication.html
import pandas as pd
import recordlinkage

raw_ds = pd.read_csv(
    r'F:\Data_Scientist\hackathon-master\CRS_Test_Data29112017 223508.csv',
    encoding='ISO-8859-1')
raw_ds_sliced = raw_ds[[
    'Account Party ID', 'Account Holder First Name',
    'Account Holder Last Name', 'Account Holder City',
    'Account Holder Date of Birth'
]]
# print(raw_ds.head())

indexer = recordlinkage.BlockIndex(on='Account Party ID')
pairs = indexer.index(raw_ds_sliced)
# print(len(pairs))

compare_cl = recordlinkage.Compare()
compare_cl.exact('Account Party ID', 'Account Party ID',
                 label='Account Party ID')
# compare_cl.string('Account Holder First Name',
#                   'Account Holder First Name', method='jarowinkler',
#                   threshold=0.85, label='Account Holder First Name')
# compare_cl.string('Account Holder Last Name',
#                   'Account Holder Last Name', method='jarowinkler',
#                   threshold=0.85, label='Account Holder Last Name')
compare_cl.exact('Account Holder Date of Birth',
                 'Account Holder Date of Birth',
                 label='Account Holder Date of Birth')
# compare_cl.exact('Account Holder City', 'Account Holder City',
#                  label='Account Holder City')