def test_get_fuzzy_columns(self):
    '''
    Check the default behaviour of get_fuzzy_columns.

    Matching 'sCol' on the left fixture against 'RsCol' on the right
    fixture is expected to add a 'fuzzy_sCol' column to self.left_df
    in place.
    '''
    # Frame that self.left_df should be equal to after the call.
    expected_columns = {
        'ID': ['123314', '123213', '43543', '35435', '987'],
        'sCol': [
            'kitten',
            'siting',
            'the times of best',
            'the worst times',
            'not in there',
        ],
        'zCol': [
            'oboe',
            'trvmpet',
            'over te rainbow',
            'in Symphony C',
            'not in there',
        ],
        'fuzzy_sCol': [
            'kitten',
            'sitting',
            'the best of times',
            'the worst of times',
            'not in there',
        ],
    }
    expected = pd.DataFrame(expected_columns)

    # The return value is ignored; the assertion inspects the fixture itself.
    fpd.get_fuzzy_columns(
        self.left_df,
        self.right_df,
        left_cols=['sCol'],
        right_cols=['RsCol'],
    )

    frames_match = self.left_df.equals(expected)
    self.assertTrue(frames_match)
def test_null_return(self):
    '''
    Check the null_return='NULL' argument of get_fuzzy_columns.

    The expected frame has 'NULL' as the last 'fuzzy_sCol' value,
    where the default-argument test expects 'not in there'.
    '''
    # Frame that self.left_df should be equal to after the call.
    expected_columns = {
        'ID': ['123314', '123213', '43543', '35435', '987'],
        'sCol': [
            'kitten',
            'siting',
            'the times of best',
            'the worst times',
            'not in there',
        ],
        'zCol': [
            'oboe',
            'trvmpet',
            'over te rainbow',
            'in Symphony C',
            'not in there',
        ],
        'fuzzy_sCol': [
            'kitten',
            'sitting',
            'the best of times',
            'the worst of times',
            'NULL',
        ],
    }
    expected = pd.DataFrame(expected_columns)

    # The return value is ignored; the assertion inspects the fixture itself.
    fpd.get_fuzzy_columns(
        self.left_df,
        self.right_df,
        left_cols=['sCol'],
        right_cols=['RsCol'],
        null_return='NULL',
    )

    frames_match = self.left_df.equals(expected)
    self.assertTrue(frames_match)
def test_no_right_cols(self):
    '''
    Check get_fuzzy_columns when right_cols is not supplied.

    Both fixtures are relabelled to share the same column names, and
    only left_cols is passed. A 'fuzzy_<name>' column is expected for
    each requested left column.
    '''
    # Give both frames identical column labels before matching.
    self.left_df.columns = ['ID', 'col_1', 'col_2']
    self.right_df.columns = ['ID', 'col_1', 'col_2']

    # Frame that self.left_df should be equal to after the call.
    expected_columns = {
        'ID': ['123314', '123213', '43543', '35435', '987'],
        'col_1': [
            'kitten',
            'siting',
            'the times of best',
            'the worst times',
            'not in there',
        ],
        'col_2': [
            'oboe',
            'trvmpet',
            'over te rainbow',
            'in Symphony C',
            'not in there',
        ],
        'fuzzy_col_1': [
            'kitten',
            'sitting',
            'the best of times',
            'the worst of times',
            'not in there',
        ],
        'fuzzy_col_2': [
            'oboe',
            'trumpet',
            'over the rainbow',
            'Symphony in C#',
            'not in there',
        ],
    }
    expected = pd.DataFrame(expected_columns)

    # No right_cols argument here: that omission is what this test exercises.
    fpd.get_fuzzy_columns(
        self.left_df,
        self.right_df,
        left_cols=['col_1', 'col_2'],
    )

    frames_match = self.left_df.equals(expected)
    self.assertTrue(frames_match)
def do_fuzzy_match(left_dataframe,
                   right_dataframe,
                   left_cols,
                   target_cols,
                   max_edit_distance,
                   mapping_dict,
                   right_cols='description'):
    '''
    Map records to target values through a fuzzy-matched proxy column.

    The source (left) column is fuzzy matched against the category
    (right) column. The match yields a proxy value: it stands in for the
    source value but is spelled exactly like the category value, so it
    can be looked up directly in ``mapping_dict``.

    Parameters
    ----------
    left_dataframe : frame holding the source column. It is modified in
        place: ``target_cols`` is assigned on it, and the
        ``fuzzy_<left_cols>`` column is read back from it after matching.
    right_dataframe : frame holding the category column.
    left_cols : a single column name in ``left_dataframe`` (it is wrapped
        in a one-element list before being passed on).
    target_cols : column label that receives the mapped values.
    max_edit_distance : forwarded unchanged to
        ``matching.get_fuzzy_columns``.
    mapping_dict : lookup passed to ``Series.map`` on the proxy column.
    right_cols : a single column name in ``right_dataframe``; defaults to
        ``'description'``.

    Returns
    -------
    The result of dropping the ``fuzzy_<left_cols>`` column from
    ``left_dataframe``.
    '''
    record_count = len(left_dataframe)
    logging.info(f'-- Trying to map {record_count} records with fuzzy match.')
    logging.info(
        f'--- match left on {left_cols} | match right on {right_cols}')

    # The matcher takes lists of column names, so wrap the single names.
    # Its return value is not used; the proxy column is read from
    # left_dataframe below.
    matching.get_fuzzy_columns(
        left_dataframe=left_dataframe,
        right_dataframe=right_dataframe,
        left_cols=[left_cols],
        right_cols=[right_cols],
        max_edit_distance=max_edit_distance,
    )

    # Name of the proxy column that the lookup below reads from.
    proxy_col = f'fuzzy_{left_cols}'

    # Direct lookup of each proxy value in the mapping.
    mapped_values = left_dataframe[proxy_col].map(mapping_dict)
    left_dataframe[target_cols] = mapped_values

    # The proxy column is only an intermediate; leave it out of the result.
    return left_dataframe.drop(columns=[proxy_col])