def _sortedneighbourhood(df_a, df_b, column, window=3,
                         sorting_key_values=None, block_on=None,
                         block_left_on=None, block_right_on=None):
    """Return candidate record pairs using sorted neighbourhood indexing.

    Every distinct sorting key is given its position (rank) in the ordered
    list of sorting key values. A record of ``df_a`` is paired with a record
    of ``df_b`` when their ranks differ by at most ``(window - 1) // 2`` and,
    if blocking keys are given, when all blocking keys are equal.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The two frames to link. Rows whose ``column`` value is null are
        ignored.
    column : label
        Column holding the sorting key. It must exist in both frames.
    window : int, default 3
        Window length; must be a positive odd integer (numpy integers are
        accepted).
    sorting_key_values : sequence, optional
        Ordered distinct sorting key values (terminology from Data Matching
        [Christen, 2012]). When omitted, the sorted unique keys found in
        both frames are used. Records whose key is not listed are ignored.
    block_on : label or list of labels, optional
        Blocking key(s) present in both frames. When given, it takes
        precedence over ``block_left_on`` / ``block_right_on``.
    block_left_on, block_right_on : label or list of labels, optional
        Blocking key(s) of ``df_a`` and ``df_b`` respectively; they are
        compared position by position and are only used when ``block_on``
        is empty.

    Returns
    -------
    pandas.MultiIndex
        Pairs of (index value in ``df_a``, index value in ``df_b``). The two
        levels carry the index names of the input frames.

    Raises
    ------
    ValueError
        If ``window`` is not a positive odd integer, or if the left and
        right blocking keys differ in number.
    """

    def as_list(keys):
        # None means "no keys"; a single label is wrapped in a list.
        if keys is None:
            return []
        return keys if isinstance(keys, list) else [keys]

    # A negative odd window would silently skip the loop below and a
    # fractional one would be truncated, so reject both up front.
    if (not isinstance(window, (int, numpy.integer))
            or window < 0 or window % 2 == 0):
        raise ValueError(
            'The given window length is not a positive and odd integer.')

    block_on = as_list(block_on)
    block_left_on = as_list(block_left_on)
    block_right_on = as_list(block_right_on)

    # Only override the side-specific keys when a shared key is given;
    # otherwise the caller's block_left_on/block_right_on must be kept.
    if block_on:
        block_left_on = block_right_on = block_on

    if len(block_left_on) != len(block_right_on):
        raise ValueError(
            'block_left_on and block_right_on must have the same '
            'number of keys.')

    keys_left = [column] + block_left_on
    keys_right = [column] + block_right_on

    # Records without a sorting key cannot be positioned in the ordering.
    df_a = df_a[df_a[column].notnull().values]
    df_b = df_b[df_b[column].notnull().values]

    # sorting_key_values is the terminology in Data Matching [Christen, 2012]
    if sorting_key_values is None:
        # numpy.unique already returns its result sorted.
        sorting_key_values = numpy.unique(
            numpy.concatenate([df_a[column].values, df_b[column].values]))

    # Lookup table: sorting key value -> position in the ordering.
    rank = pandas.Series(
        numpy.arange(len(sorting_key_values)), index=sorting_key_values)

    # Private column labels for the record identifiers. Using fixed labels
    # (instead of the index names) keeps the merge working when an index is
    # unnamed or when both indexes share the same name.
    id_a, id_b = '__sn_index_a__', '__sn_index_b__'

    def ranked_side(frame, block_keys, id_label):
        # Blocking keys + rank of the sorting key + record identifier.
        ranks = frame[column].map(rank)
        # Keys missing from sorting_key_values map to NaN. pandas merges
        # NaN keys with each other, which would create spurious pairs, so
        # such records are left out.
        known = ranks.notnull().values
        frame = frame[known]
        side = frame[block_keys].copy()
        side[column] = ranks[known].astype('int64').values
        side[id_label] = frame.index.values
        return side

    left = ranked_side(df_a, block_left_on, id_a)
    right = ranked_side(df_b, block_right_on, id_b)
    right_ranks = right[column].values.copy()

    index_names = [df_a.index.name, df_b.index.name]
    pairs_concat = None

    # Internal window size: number of neighbours on each side.
    half_window = (int(window) - 1) // 2

    for offset in range(-half_window, half_window + 1):
        # Shifting the right-hand ranks turns "ranks differ by offset" into
        # an ordinary equality join.
        right[column] = right_ranks + offset
        pairs = left.merge(
            right, left_on=keys_left, right_on=keys_right, how='inner')
        found = pandas.MultiIndex.from_arrays(
            [pairs[id_a].values, pairs[id_b].values], names=index_names)

        if pairs_concat is None:
            pairs_concat = found
        else:
            pairs_concat = found.append(pairs_concat)

    return pairs_concat
def _sortedneighbourhood( df_a, df_b, column, window=3, sorting_key_values=None, block_on=[], block_left_on=[], block_right_on=[]): # Check if window is an odd number if not isinstance(window, int) or (window < 0) or not bool(window % 2): raise ValueError('The given window length is not a positive and odd integer.') block_on = [block_on] if type(block_on) != list else block_on block_left_on = [block_left_on] if type( block_left_on) != list else block_left_on block_right_on = [block_right_on] if type( block_right_on) != list else block_right_on block_left_on, block_right_on = [ block_on, block_on] if block_on else ([], []) keys_left = [column] + block_left_on keys_right = [column] + block_right_on df_a = df_a[df_a[column].notnull()] # df_a.dropna(inplace=True) df_b = df_b[df_b[column].notnull()] # df_a.dropna(inplace=True) # sorting_key_values is the terminology in Data Matching [Christen, 2012] if sorting_key_values is None: # Combine the results sorting_key_values = numpy.sort(numpy.unique( numpy.concatenate([df_a[column].values, df_b[column].values]) )) sorting_key_factors = numpy.arange(len(sorting_key_values)) data_dict_A = {kl: df_a[kl] for kl in keys_left} data_dict_B = {kl: df_b[kl] for kl in keys_right} sorted_index = pandas.Series(index=sorting_key_values, data=sorting_key_factors) sorted_df_A = pandas.DataFrame( merge_dicts( data_dict_A, {column: df_a[column].map(sorted_index), df_a.index.name: df_a.index.values})) sorted_df_B = pandas.DataFrame( {column: df_b[column].map(sorted_index), df_b.index.name: df_b.index.values}) pairs_concat = None # Internal window size _window = int((window - 1) / 2) for w in range(-_window, _window + 1): df = pandas.DataFrame( merge_dicts( data_dict_B, { column: sorted_df_B[column] + w, df_b.index.name: df_b.index.values } ) ) pairs = sorted_df_A.merge( df, left_on=keys_left, right_on=keys_right, how='inner' ).set_index( [df_a.index.name, df_b.index.name] ) if pairs_concat is None: pairs_concat = pairs.index else: 
pairs_concat = pairs.index.append(pairs_concat) return pairs_concat