Example #1
    def test_single_blocking_key(self):
        """BLOCKING: Test class arguments."""

        # all the following cases return the same index.

        # situation 1
        index_cl1 = recordlinkage.BlockIndex('var_arange')
        pairs1 = index_cl1.index((self.a, self.b))

        # situation 2
        index_cl2 = recordlinkage.BlockIndex(on='var_arange')
        pairs2 = index_cl2.index((self.a, self.b))

        # situation 3
        index_cl3 = recordlinkage.BlockIndex(left_on='var_arange',
                                             right_on='var_arange')
        pairs3 = index_cl3.index((self.a, self.b))

        # situation 4
        index_cl4 = recordlinkage.BlockIndex(on=['var_arange'])
        pairs4 = index_cl4.index((self.a, self.b))

        # situation 5
        index_cl5 = recordlinkage.BlockIndex(left_on=['var_arange'],
                                             right_on=['var_arange'])
        pairs5 = index_cl5.index((self.a, self.b))

        # test
        ptm.assert_index_equal(pairs1, pairs2)
        ptm.assert_index_equal(pairs1, pairs3)
        ptm.assert_index_equal(pairs1, pairs4)
        ptm.assert_index_equal(pairs1, pairs5)
Example #2
def compute_record_linkage(df_full):
    """Given the fully-concatenated table of records
    calculate which pairs are address-matches using
    fuzzy matching on street name, unit number and zipcode
    """

    print("Setting up blocking for pairwise comparisons")
    blocking_indices = [
        recordlinkage.BlockIndex(on="APN (int)"),
        recordlinkage.BlockIndex(
            on=["Address Number (float)", "Zip Code (int)"]),
    ]

    print("Finding blocked pairs")
    pairs = None
    for df_subset in tqdm(np.array_split(df_full, 10)):
        for bi in blocking_indices:
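            # link the full table against just this chunk; splitting presumably
            # keeps each blocking/union step manageable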
            _new_pairs = bi.index(df_full, df_subset)

            if pairs is not None:
                pairs = pairs.union(_new_pairs)
            else:
                pairs = _new_pairs

    print("Setting up similarity calculations")
    compare_cl = recordlinkage.Compare()

    compare_cl.exact('APN (int)', 'APN (int)', label='APN')
    compare_cl.exact('Zip Code (int)', 'Zip Code (int)', label='Zip')
    compare_cl.exact('Address Number (float)',
                     'Address Number (float)',
                     label='number')

    #compare_cl.numeric('Address Number (float)', 'Address Number (float)',
    #                   offset=3, scale=2,
    #                   label='number')

    compare_cl.string('Street Name',
                      'Street Name',
                      method='levenshtein',
                      threshold=0.9,
                      label='street')

    print("Calculating similarities")
    features = compare_cl.compute(pairs, df_full)
    features.to_pickle(FEATURES_OUTPUT_FILE)

    return features
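This function is an excerpt from a larger script; a sketch of the module-level setup it assumes (the output path below is a placeholder, not the original value):

import numpy as np
import recordlinkage
from tqdm import tqdm

# placeholder path for the pickled feature matrix
FEATURES_OUTPUT_FILE = "record_linkage_features.pkl"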
Example #3
    def time_block_index(self):

        # setup class
        c_pairs = rl.BlockIndex('given_name')

        # Make pairs
        c_pairs.index(self.A)
Example #4
    def __init__(self, data, new_data, name, n_sample=50000, bin_size=500, block=None, method='jarowinkler', threshold=0.93):
        super(Linkage, self).__init__()
        self.data = data
        self.new_data = pd.DataFrame(new_data)
        self.n_sample = n_sample
        self.bin_size = bin_size
        self.name = name
        self.block = block
        self.method = method
        self.threshold = threshold
        
        self.sample = data.sample(n=self.n_sample, replace=False, random_state=0)

        if self.block is not None:
            self.indexer = recordlinkage.BlockIndex(on=self.block)
        else:
            self.indexer = recordlinkage.FullIndex()
        self.compare_cl = recordlinkage.Compare(n_jobs=4)
        self.compare_cl.string(self.name, self.name, method=self.method, threshold=self.threshold, label=self.name)

        if self.new_data.empty:
            self.List = list(itertools.combinations(np.split(self.sample, indices_or_sections=round(self.n_sample/self.bin_size, 0)), 2))
        else:
            self.List = list(np.split(self.sample, indices_or_sections=round(self.n_sample/self.bin_size, 0)))
        self.results_df = pd.DataFrame(columns=['pairs', 'company_1', 'company_2'])
        self.results_tmp = None
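This constructor is likewise an excerpt; a sketch of the imports it presupposes:

import itertools

import numpy as np
import pandas as pd
import recordlinkage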
Example #5
def block_and_match(df,
                    block_variable,
                    comparison_dict,
                    match_threshold=MATCH_THRESHOLD,
                    string_method="damerau_levenshtein",
                    string_threshold=STRING_THRESHOLD):
    """
    Use recordlinkage to block on one variable and compare on others

    """

    indexer = rl.BlockIndex(on=block_variable)
    pairs = indexer.index(df)
    compare = rl.Compare()
    for k, v in comparison_dict.items():
        if v == "string":
            compare.string(k,
                           k,
                           method=string_method,
                           threshold=string_threshold,
                           label=k,
                           missing_value=np.nan)
        if v == "date":
            compare.date(k, k, label=k, missing_value=np.nan)

    features = compare.compute(pairs, df)
    features["mean"] = features.mean(axis=1, skipna=True)
    features["match"] = features["mean"] > match_threshold

    return features
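A minimal usage sketch for this helper (hypothetical data; the module's own MATCH_THRESHOLD and STRING_THRESHOLD constants are not shown here, so placeholders are defined):

import numpy as np
import pandas as pd
import recordlinkage as rl

MATCH_THRESHOLD = 0.75     # placeholder; the module defines the real value
STRING_THRESHOLD = 0.85    # placeholder

people = pd.DataFrame({
    "zip":  ["60601", "60601", "60602"],
    "name": ["Jane Doe", "Jayne Doe", "John Smith"],
    "dob":  pd.to_datetime(["1980-01-01", "1980-01-01", "1975-05-02"]),
})

# only the two records sharing zip "60601" end up in the comparison space
features = block_and_match(people,
                           block_variable="zip",
                           comparison_dict={"name": "string", "dob": "date"})
print(features[["mean", "match"]])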
Example #6
def blockingMatching():

    # First CSV table with the appropriate encoding
    dfA = pd.read_csv("all_gamestop.csv", encoding='utf-8')

    # Second CSV table with the appropriate encoding
    dfB = pd.read_csv("jnl_half_edit2.csv", encoding='utf-8')

    # Perform blocking on the "title" column
    # because it is the column on which the likelihood
    # of a match needs to be determined
    block_class = rl.BlockIndex(on="title")

    # Use the index() method from the recordlinkage library
    # to build the candidate pairs blocked on ["title"]
    # from both tables
    candidate_links = block_class.index(dfA, dfB)

    # Use the Compare() class
    # from the recordlinkage library
    # to set up the comparison
    compare = rl.Compare()

    # First comparison: compare the titles as strings. Because the string
    # structure may differ, the Jaro-Winkler similarity is used
    # with a threshold of 0.85
    compare.string("title",
                   "title",
                   method='jarowinkler',
                   threshold=0.85,
                   label="Title")

    # The console value will be the same in both
    # sets, so we compare the strings with the exact method
    compare.exact('console', 'console', label="Console")

    # Now that both comparisons are set up, compute them to find
    # the matches using the compute() function from the
    # recordlinkage library, storing the result in a feature vector
    compare_vectors = compare.compute(candidate_links, dfA, dfB)

    # Try to put records that belong to the same entity together
    matches = compare_vectors[compare_vectors.sum(axis=1) > 1]

    # Call the ShowMatching function to assemble
    # the matches in the same set
    match_records = ShowMatching(dfA, dfB, matches)

    # Write the matches to a CSV file
    np.savetxt("Matching.csv",
               match_records,
               delimiter=",",
               fmt="%s, %s, %s, %s, %s, %s, %s, %s")

    # Get the match indexes from the match vector
    df2 = pd.DataFrame(matches)

    # Write the indexes to a CSV file
    df2.to_csv("index_of_matching.csv")
Example #7
    def test_multiple_blocking_keys(self):
        """BLOCKING: test multiple blocking keys"""

        # all the following cases return the same index.

        # situation 1
        index_cl1 = recordlinkage.BlockIndex(['var_arange', 'var_block10'])
        pairs1 = index_cl1.index((self.a, self.b))

        # situation 2
        index_cl2 = recordlinkage.BlockIndex(
            left_on=['var_arange', 'var_block10'],
            right_on=['var_arange', 'var_block10'])
        pairs2 = index_cl2.index((self.a, self.b))

        # test
        ptm.assert_index_equal(pairs1, pairs2)
Example #8
    def test_blocking_algorithm_link(self):
        """BLOCKING: test blocking algorithm for linking"""

        # situation 1: eye index
        index_cl1 = recordlinkage.BlockIndex(on='var_arange')
        pairs1 = index_cl1.index((self.a, self.b))
        self.assertEqual(len(pairs1), len(self.a))
        self.assertTrue(pairs1.is_unique)

        # situation 2: 10 blocks
        index_cl2 = recordlinkage.BlockIndex(on='var_block10')
        pairs2 = index_cl2.index((self.a, self.b))
        self.assertEqual(len(pairs2), len(self.a) * 10)
        self.assertTrue(pairs2.is_unique)

        # situation 3: full index
        index_cl3 = recordlinkage.BlockIndex(on='var_single')
        pairs3 = index_cl3.index((self.a, self.b))
        self.assertEqual(len(pairs3), len(self.a) * len(self.b))
        self.assertTrue(pairs3.is_unique)
Example #9
    def test_blocking_algorithm_dedup(self):
        """BLOCKING: test blocking algorithm for deduplication"""

        len_a = len(self.a)

        # situation 1: eye index
        index_cl1 = recordlinkage.BlockIndex(on='var_arange')
        pairs1 = index_cl1.index(self.a)
        self.assertEqual(len(pairs1), 0)
        self.assertTrue(pairs1.is_unique)

        # situation 2: 10 blocks
        index_cl2 = recordlinkage.BlockIndex(on='var_block10')
        pairs2 = index_cl2.index(self.a)
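        # with 100 records spread evenly over the 10 blocks, this is
        # 10 * C(10, 2) = 450 pairs, i.e. (len_a * 10 - len_a) / 2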
        self.assertEqual(len(pairs2), (len_a * 10 - len_a) / 2)
        self.assertTrue(pairs2.is_unique)

        # situation 3: full index
        index_cl3 = recordlinkage.BlockIndex(on='var_single')
        pairs3 = index_cl3.index(self.a)
        self.assertEqual(len(pairs3), (len_a * len_a - len_a) / 2)
        self.assertTrue(pairs3.is_unique)
Example #10
    def get_pairs(self):
        combined_pairs = None
        for block in self.block_on:
            indexer = rl.BlockIndex(on=block)
            pairs = indexer.index(self.dfA, self.dfB)
            # accumulate the union of pairs across all blocking keys
            if combined_pairs is None:
                combined_pairs = pairs
            else:
                combined_pairs = combined_pairs.union(pairs)
        return combined_pairs
Example #11
def table_length():

    # First CSV table with the appropriate encoding
    dfA = pd.read_csv("all_gamestop.csv", encoding='utf-8')

    # Second CSV table with the appropriate encoding
    dfB = pd.read_csv("jnl_half_edit2.csv", encoding='utf-8')

    #dfC = pd.read_csv("Matching.csv", encoding='utf-8')

    indexer = rl.BlockIndex(on='title')

    pairs = indexer.index(dfA, dfB)

    print("DataFrame1:", len(dfA))
    print("DataFrame2:", len(dfB))
    print("Pairs:", len(pairs))
Example #12
dfFile2 = tools.load_file_as_df(config.common.filename_2, fieldnames)
perfect_match_index = tools.load_perfect_match_as_index(
    config.common.filename_perfect_match)

# for each config item
for index, config_item in enumerate(config.items):
    # init Random with a fixed seed (for reproducibility)
    tools.init_random_with_seed()

    print("Indexing")
    if config_item.index_type == "sorted_neighbourhood":
        indexer = rl.SortedNeighbourhoodIndex(
            config_item.index_field_name,
            window=config_item.sorted_neighborhood_window)
    elif config_item.index_type == "block":
        indexer = rl.BlockIndex(config_item.index_field_name)
    elif config_item.index_type == "canopy":
        indexer = tools.CanopyClusterIndex(
            config_item.index_field_name,
            threshold_add=config_item.canopy_threshold_add,
            threshold_remove=config_item.canopy_threshold_remove)
    elif config_item.index_type == "full":
        indexer = tools.FullIndex(config_item.index_field_name)
    else:
        raise ValueError(
            "index_type {0} is invalid: must be sorted_neighbourhood, block, canopy or full"
            .format(config_item.index_type))

    pairs_index = indexer.index(dfFile1, dfFile2)

    print("Comparing {0} Pairs".format(pairs_index.size))
Example #13
import os
import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

TEST_INDEXATION_OBJECTS = [
    param(recordlinkage.FullIndex()),
    param(recordlinkage.BlockIndex(on='var_arange')),
    param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=False)),
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""
    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150

        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
Example #14
import recordlinkage
#import sys
import csv
import numpy as np
import pandas
from matplotlib import pyplot

pd = pandas.read_csv("A PubmedData.csv")
ct = pandas.read_csv("A ClinicalTrialsData.csv")

#print pd
print("\n")

#indexer = recordlinkage.FullIndex()
#pairs = indexer.index(pd, ct)

indexer = recordlinkage.BlockIndex(on='Title')
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Title Through BlockIndex")
print(len(pd), len(ct), len(pairs))
print("\n")

indexer = recordlinkage.BlockIndex(on='Abstract')
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Abstract Through BlockIndex")
print(len(pd), len(ct), len(pairs))
print("\n")

indexer = recordlinkage.SortedNeighbourhoodIndex(on="Title")
pairs = indexer.index(pd, ct)
print("RecordLinkage Based on Title Through SortedNeighbourhoodIndex")
print(len(pd), len(ct), len(pairs))
Example #15
(CSV) file. We start with a CSV file containing our messy data. In this example,
it is a listing of patient information with ln, dob, gn and fn as its fields,
compiled from several different sources.
The output will be a CSV with our clustered results.
"""

import recordlinkage as rl
import pandas as pd
import numpy as np

# The CSV file is read into a dataframe dfA
dfA = pd.read_csv("Deduplication Problem - Sample Dataset.csv",encoding="utf8")

# Making record pairs using the blocking indexing method
# Returns record pairs having the same values for gender and date of birth
indexer = rl.BlockIndex(on=['dob','gn'])
pairs = indexer.index(dfA)

# The record pairs are compared using various methods for corresponding fields
# Attribute comparisons are stored in a DataFrame-features thereafter
compare_cl = rl.Compare()
compare_cl.string('fn', 'fn', method='jarowinkler', threshold=0.85,label='fn')
compare_cl.string('ln', 'ln', method='jarowinkler', threshold=0.85, label='ln')
compare_cl.exact('dob', 'dob', label='dob')
compare_cl.exact('gn', 'gn', label='gn')
features = compare_cl.compute(pairs, dfA)

# To decide which records belong to the same person
matches = features[features.sum(axis=1) > 3]

# To produce the CSV file having no duplicate/similar entries
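The excerpt stops here; one possible way to finish that last step (a sketch with a placeholder filename, not the original code):

# drop the second record of every matched pair and keep everything else
duplicate_ids = matches.index.get_level_values(1)
dfA_deduped = dfA[~dfA.index.isin(duplicate_ids)]
dfA_deduped.to_csv("deduplicated_dataset.csv", index=False)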
Example #16
@author: VictorO
"""

#importing the required libraries
import pandas as pd
import recordlinkage as rl
import os
import numpy as np

#Setting the working directory
os.chdir('M:\\Projects\\Record Linkage\\Python\\data\\')
#importing the data
dfA = pd.read_csv('for_linkage_data1.csv', sep=',', encoding='utf-8', index_col='dummy_id')

# Indexation step - Block Index
# pass a list of column names to block on several variables at once
indexer = rl.BlockIndex(on=['surname', 'gender', 'date_of_birth'])
pairs = indexer.index(dfA)
# Printing the number of pairs created by the blocking step
print(len(pairs))

# Doing the comparison
compare_cl = rl.Compare(pairs, dfA)
compare_cl.string('firstname', 'firstname', method='jarowinkler', threshold=0.85, name='firstname')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85, name='surname')
compare_cl.exact('initial', 'initial', name='initial')
compare_cl.exact('gender', 'gender', name='gender')
compare_cl.exact('dob_day', 'dob_day', name='dob_day')
compare_cl.exact('dob_month', 'dob_month', name='dob_month')
compare_cl.exact('dob_year', 'dob_year', name='dob_year')
compare_cl.exact('facility_code', 'facility_code', name='facility_code');
Example #17
DF = DF.loc[DF.Province == PR]
N1 = len(DF)
print(N1, 'records to match in ', PR)
DF['Number'] = DF['Number'].astype(int).astype(str)

ADD = pd.read_csv(r"Output_OA_STD_Files\\" + PR + '_OA_STD.csv',
                  low_memory=False)

ADD[NUMBER_COL] = ADD[NUMBER_COL].astype(str)
ADD = ADD.reset_index()

DF = DF.reset_index()
DF = DF.fillna('')
DF['Postal_Code'] = DF['Postal_Code'].str.replace(' ', '').str.upper()

indexer = recordlinkage.BlockIndex(left_on=['Number'], right_on=[NUMBER_COL])
M_idx = indexer.index(DF, ADD)
# print(len(M_idx))

# DF['Street_Add']=DF['StdOpAddressStreetName']+' '+DF['StdOpAddressStreetType']+ ' '+DF['StdOpAddressStreetDir']
# DF['Street_Add']=DF['Street_Add'].str.replace('  ',' ')
# DF['Street_Add']=DF['Street_Add'].str.replace('   ',' ')

if PR != 'QC':
    DF = AddressClean_en(DF, 'Street', 'Street')
    DF = Type_Drop_en(DF, 'Street', 'Street_Name')
    ADD = Type_Drop_en(ADD, STREET_COL, 'STREET_NAME')
else:
    DF = AddressClean_fr(DF, 'Street', 'Street')
    DF = Type_Drop_fr(DF, 'Street', 'Street_Name')
    ADD = Type_Drop_fr(ADD, STREET_COL, 'STREET_NAME')
Example #18
def link_datasets(yelp_results, dj_df, df_type="wages"):
    """
    (Assisted by Record Linkage Toolkit library and documentation)
    This functions compares the Yelp query results to database results and 
    produces the best matches based on computing the qgram score. Depending
    on the specific database table characteristics the qgram calculation
    will be between the zip_code, business name, address strings, latitude, 
    longitude, or a combination of those charateristics. 

    Inputs:
        - yelp_results: a pandas dataframe of yelp business results based 
                        on a user's input
        - dj_df: a pandas dataframe of django results.
                 Ex. labour statistics, healthcode violations, Divvy, etc.
        - df_type: a string of which specific dataframe is being passed to
                   be compared to the Yelp results

    Outputs:
        - link: a tuple containing the indices of Yelp query results dataframe
                and the database dataframe AND the best matches qgram scores
    """
    # set thresholds for comparing strings using qgram method
    name_thresh = 0.55
    addr_thresh = 0.55
    strong_addr_thresh = 0.90

    # initialize a Record Linkage comparison object
    compare = rl.Compare()
    
    # Labour & Food data comparisons to Yelp are made on zip, business name,
    # and address
    if df_type == "wages" or df_type == "food": 
        indexer = rl.BlockIndex(on='zip_code')  # block on zip code
        compare.numeric('zip_code', 'zip_code', method='linear',
                        scale=30.0, label='zip_score')
        compare.string('name', 'name', method='qgram',
                       threshold=name_thresh, label='name_score')
        compare.string('addr', 'addr', method='qgram',
                       threshold=addr_thresh, label='addr_score')
    
    # Environmental data comparisons to Yelp are made on address
    elif df_type == "enviro":
        indexer = rl.FullIndex() # no blocking available
        compare.string('addr', 'addr', method='qgram',
                       threshold=strong_addr_thresh, label='addr_score')
    
    # all other data comparisons to Yelp 
    else:
        indexer = rl.FullIndex()
    pairs = indexer.index(yelp_results, dj_df)

    # In addition to above comparisons, ALL data sets are also compared to
    # Yelp based on latitude and longitude
    compare.geo('latitude', 'longitude', 'latitude', 'longitude',
                method='linear', scale=30.0, label='coord_score')

    # compute record linkage scores
    features = compare.compute(pairs, yelp_results, dj_df)

    # set classification thresholds
    zip_classif_thresh = 1.0
    addr_classif_thresh = 1.0
    coord_classif_thresh = 0.99
    name_classif_thresh = 1.0

    # Classification and final filtering
    if df_type == "wages" or df_type == "food": 
        best_matches = features[(features['zip_score'] == zip_classif_thresh) &
                                (features['name_score'] == name_classif_thresh) &
                                (features['addr_score'] == addr_classif_thresh) &
                                (features['coord_score'] >= coord_classif_thresh)]
    elif df_type == "enviro":
        best_matches = features[(features['addr_score'] == addr_classif_thresh) &
                                (features['coord_score'] >= coord_classif_thresh)]
    else:
        best_matches = features[(features['coord_score'] >= coord_classif_thresh)]
    
    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create tuple of indices and best matches df
    link = (index_array, best_matches)
    
    return link
Example #19
#Script for the Deduplication of Records

import numpy as np
import csv
import recordlinkage as rl
import pandas as pd

#Reading the CSV file
f1 = pd.read_csv(open('Deduplication Problem - Sample Dataset.csv', 'rb'))

data1 = pd.DataFrame(f1)
data1 = pd.DataFrame(data=data1, dtype='U')

#Making Record Pairs For Similar Data in Various Fields
block_class = rl.BlockIndex(on=["ln", "dob", "gn", "fn"])
block_pairs = block_class.index(data1)

#Comparing each record pair with the data.

compare_class = rl.Compare()

compare_class.string('ln', 'ln', method="jarowinkler", threshold=0.85)
compare_class.exact('dob', 'dob')
compare_class.exact('gn', 'gn')
compare_class.string('fn', 'fn', method="jarowinkler", threshold=0.85)

compare_result = compare_class.compute(block_pairs, data1)

#Filtering the Records further by comparing the Columns matched

matches = compare_result[compare_result.sum(axis=1) >= 3]
Example #20
# start with the two reach datasets
print(len(df_reach18_ref) * len(df_reach17_ref))
# =516,060 possible pairs

# df_reach18_ref['province'].value_counts()
# df_reach17_ref['province'].value_counts()

# df_reach18_ref['district'].value_counts()
# df_reach17_ref['district'].value_counts()

#Blocking - only comparing pairs of records that are identical on some attributes
indexer = recordlinkage.Index()
blocking_vars = [
    "province", "displacement_year"
]  #, 'displacement_month','arrival_month','hoh_sex','hoh_age','hh_size','hh_families'
indexer = recordlinkage.BlockIndex(on=blocking_vars)
candidate_links = indexer.index(df_reach18_ref, df_reach17_ref)
print(len(candidate_links))
# = 134,220

# =============================================================================
# Compare records
# =============================================================================

# initialise class
comp = recordlinkage.Compare()

# initialise similarity measurement algorithms
#displacement info
comp.string('district', 'district', method='jarowinkler', label='district')
comp.numeric('displacement_month',
Example #21
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # not possible to sort a MultiIndex, so we made a frame out of it.

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_input_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))
Example #22
import recordlinkage
import pandas as pd

path = "C:/Users/DELL/Desktop/Data-de-duplication/Deduplication Problem - Sample Dataset.csv"
df = pd.read_csv(path)

print(df)

indexer = recordlinkage.FullIndex()
pairs = indexer.index(df)

print(len(df), len(pairs))
print(pairs)

indexer = recordlinkage.BlockIndex(on=['dob', 'gn'])
pairs = indexer.index(df)

print(len(pairs))
# (1000*1000-1000)/2 = 499500

# This cell can take some time to compute.
compare_cl = recordlinkage.Compare()

compare_cl.string('ln', 'ln', label='last_name')
compare_cl.string('fn',
                  'fn',
                  method='jarowinkler',
                  threshold=0.85,
                  label='first_name')
compare_cl.exact('dob', 'dob', label='date_of_birth')
compare_cl.exact('gn', 'gn', label='gender')
Example #23
#
# However, as we have seen in the previous notebook, we might want to consider how strict we want our matching to be. For example, we want to make sure that we catch any typos or common misspellings, but we want to avoid relaxing the matching condition to the point that anything will match anything.
# markdown
# ### Indexing
#
# Indexing allows us to create candidate links, which basically means identifying pairs of data rows which might refer to the same real world entity. This is also called the comparison space (matrix). There are different ways to index data. The easiest is to create a full index and consider every pair a match. This is also the least efficient method, because we will be comparing every row of one dataset with every row of the other dataset.
#
# If we had 10,000 records in data frame A and 100,000 records in data frame B, we would have 1,000,000,000 candidate links. You can see that comparing over a full index is getting inefficient when working with big data.
# markdown
# We can do better if we actually include our knowledge about the data to eliminate bad links from the start. This can be done through blocking. The `recordlinkage` package gives you multiple options for this. For example, you can block by using variables, which means only links that agree exactly on the specified values will be kept.
# markdown
# Here we will start by blocking on `city` and `state`, to narrow down the number of candidate links.
# markdown
# You can try and see how the number of candidate links changes when blocking on more or fewer variables.
#
indexerBL = rl.BlockIndex(on=['state', 'city'])
candidate_links = indexerBL.index(inventors, pi)
#
len(candidate_links)
#
candidate_links[:10]
# markdown
# Let's check the first pair of candidate links blocked on city and state: (0, 85)
#
inventors.iloc[0]
#
pi.iloc[85]
#
# Now, in addition to blocking on city and state, we can also try blocking on first name
indexerBL = rl.BlockIndex(on=['name_first', 'city', 'state'])
candidate_links = indexerBL.index(inventors, pi)
Example #24
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing
    if preproc == 0:
        print("No preprocesing")
        field_suffix = ""
        preproc_description = "none"
    elif preproc == 1:
        print("Cleaned fields")
        field_suffix = "_clean"
        preproc_description = "clean"
    elif preproc == 2:
        print("Soundex encoding")
        field_suffix = "_soundex"
        preproc_description = "soundex"
    elif preproc == 3:
        print("Nysiis encoding")
        field_suffix = "_nysiis"
        preproc_description = "nysiis"
    elif preproc == 4:
        print("Metaphone encoding")
        field_suffix = "_metaphone"
        preproc_description = "metaphone"
    elif preproc == 5:
        print("Match-rating encoding")
        field_suffix = "_match_rating"
        preproc_description = "match_rating"
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # comparator
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        comp.add(compare.Exact('title' + field_suffix, 'title' + field_suffix))
        comp.add(
            compare.Exact('authors' + field_suffix, 'authors' + field_suffix))
        comp.add(compare.Exact('venue' + field_suffix, 'venue' + field_suffix))
    elif comparison_variant == 1:
        comp_description = "levenshtein"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='levenshtein'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='levenshtein'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='levenshtein'))
    elif comparison_variant == 2:
        comp_description = "damerau_levenshtein"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='damerau_levenshtein'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='damerau_levenshtein'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='damerau_levenshtein'))
    elif comparison_variant == 3:
        comp_description = "jaro"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='jaro'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='jaro'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='jaro'))
    elif comparison_variant == 4:
        comp_description = "jarowinkler"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='jarowinkler'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='jarowinkler'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='jarowinkler'))
    elif comparison_variant == 5:
        comp_description = "qgram"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='qgram'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='qgram'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='qgram'))
    elif comparison_variant == 6:
        comp_description = "cosine"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='cosine'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='cosine'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='cosine'))
    elif comparison_variant == 7:
        comp_description = "smith_waterman"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='smith_waterman'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='smith_waterman'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='smith_waterman'))
    else:
        raise ValueError(f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))
    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    matches = []
    for classifier_description in ['logreg', 'bayes', 'svm', 'kmeans', 'ecm']:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        if classifier_description == 'logreg':
            print("Logistic Regression classifier")
            classifier = recordlinkage.LogisticRegressionClassifier()
            supervised = True
        elif classifier_description == 'bayes':
            print("Naive Bayes classifier")
            classifier = recordlinkage.NaiveBayesClassifier(binarize=0.75)
            supervised = True
        elif classifier_description == 'svm':
            print("Support Vector Machine classifier")
            classifier = recordlinkage.SVMClassifier()
            supervised = True
        elif classifier_description == 'kmeans':
            print("KMeans classifier")
            classifier = recordlinkage.KMeansClassifier()
            supervised = False
        elif classifier_description == 'ecm':
            print("ECM classifier")
            classifier = recordlinkage.ECMClassifier(binarize=0.75)
            supervised = False
        else:
            raise ValueError(
                f"Unknown classifier variant {classifier_description}")

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
Example #25
@author: Tony
"""
#Refer http://recordlinkage.readthedocs.io/en/latest/notebooks/data_deduplication.html
import pandas as pd
import recordlinkage

raw_ds = pd.read_csv(
    r'F:\Data_Scientist\hackathon-master\CRS_Test_Data29112017 223508.csv',
    encoding='ISO-8859-1')
raw_ds_sliced = raw_ds[[
    'Account Party ID', 'Account Holder First Name',
    'Account Holder Last Name', 'Account Holder City',
    'Account Holder Date of Birth'
]]
#print(raw_ds.head())
indexer = recordlinkage.BlockIndex(on='Account Party ID')
pairs = indexer.index(raw_ds_sliced)

#print (len(pairs))
compare_cl = recordlinkage.Compare()

compare_cl.exact('Account Party ID',
                 'Account Party ID',
                 label='Account Party ID')
#compare_cl.string('Account Holder First Name', 'Account Holder First Name', method='jarowinkler', threshold=0.85, label='Account Holder First Name')
#compare_cl.string('Account Holder Last Name', 'Account Holder Last Name', method='jarowinkler', threshold=0.85, label='Account Holder Last Name')
compare_cl.exact('Account Holder Date of Birth',
                 'Account Holder Date of Birth',
                 label='Account Holder Date of Birth')
#compare_cl.exact('Account Holder City', 'Account Holder City', label='Account Holder City')
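The excerpt ends mid-setup; a sketch of how a pipeline like this is typically finished (not the original code):

features = compare_cl.compute(pairs, raw_ds_sliced)

# treat pairs that agree on every compared field as likely duplicates
likely_duplicates = features[features.sum(axis=1) == features.shape[1]]
print(len(likely_duplicates))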