Пример #1
0
    def test_get_fuzzy_columns(self):
        '''
        Check get_fuzzy_columns with explicit left and right columns.

        Matches 'sCol' of the left fixture against 'RsCol' of the right
        fixture and expects the left fixture to end up with one extra
        column, 'fuzzy_sCol', next to its original columns.
        '''
        ids = ['123314', '123213', '43543', '35435', '987']
        s_values = [
            'kitten', 'siting', 'the times of best', 'the worst times',
            'not in there'
        ]
        z_values = [
            'oboe', 'trvmpet', 'over te rainbow', 'in Symphony C',
            'not in there'
        ]
        fuzzy_s_values = [
            'kitten', 'sitting', 'the best of times', 'the worst of times',
            'not in there'
        ]
        # Key order determines the column order of the expected frame.
        expected = pd.DataFrame({
            'ID': ids,
            'sCol': s_values,
            'zCol': z_values,
            'fuzzy_sCol': fuzzy_s_values,
        })

        # The return value is ignored on purpose: the assertion below
        # inspects self.left_df itself after the call.
        fpd.get_fuzzy_columns(
            self.left_df,
            self.right_df,
            left_cols=['sCol'],
            right_cols=['RsCol'],
        )

        frames_match = self.left_df.equals(expected)
        self.assertTrue(frames_match)
Пример #2
0
 def test_null_return(self):
     '''
     Check the null_return='NULL' argument of get_fuzzy_columns.

     Same inputs as the basic test, but the last row of the expected
     'fuzzy_sCol' column is the literal string 'NULL' instead of the
     original 'not in there' text.
     '''
     ids = ['123314', '123213', '43543', '35435', '987']
     s_values = [
         'kitten', 'siting', 'the times of best', 'the worst times',
         'not in there'
     ]
     z_values = [
         'oboe', 'trvmpet', 'over te rainbow', 'in Symphony C',
         'not in there'
     ]
     fuzzy_s_values = [
         'kitten', 'sitting', 'the best of times', 'the worst of times',
         'NULL'
     ]
     # Key order determines the column order of the expected frame.
     expected = pd.DataFrame({
         'ID': ids,
         'sCol': s_values,
         'zCol': z_values,
         'fuzzy_sCol': fuzzy_s_values,
     })

     # The return value is ignored on purpose: the assertion below
     # inspects self.left_df itself after the call.
     fpd.get_fuzzy_columns(
         self.left_df,
         self.right_df,
         left_cols=['sCol'],
         right_cols=['RsCol'],
         null_return='NULL',
     )

     frames_match = self.left_df.equals(expected)
     self.assertTrue(frames_match)
Пример #3
0
 def test_no_right_cols(self):
     '''
     Check get_fuzzy_columns when right_cols is not passed.

     Both fixtures are renamed to share the column names 'ID', 'col_1'
     and 'col_2'; only left_cols is supplied, and the left fixture is
     expected to gain 'fuzzy_col_1' and 'fuzzy_col_2'.
     '''
     # Give both frames identical column labels before matching.
     for frame in (self.left_df, self.right_df):
         frame.columns = ['ID', 'col_1', 'col_2']

     ids = ['123314', '123213', '43543', '35435', '987']
     col_1_values = [
         'kitten', 'siting', 'the times of best', 'the worst times',
         'not in there'
     ]
     col_2_values = [
         'oboe', 'trvmpet', 'over te rainbow', 'in Symphony C',
         'not in there'
     ]
     fuzzy_col_1_values = [
         'kitten', 'sitting', 'the best of times', 'the worst of times',
         'not in there'
     ]
     fuzzy_col_2_values = [
         'oboe', 'trumpet', 'over the rainbow', 'Symphony in C#',
         'not in there'
     ]
     # Key order determines the column order of the expected frame.
     expected = pd.DataFrame({
         'ID': ids,
         'col_1': col_1_values,
         'col_2': col_2_values,
         'fuzzy_col_1': fuzzy_col_1_values,
         'fuzzy_col_2': fuzzy_col_2_values,
     })

     # The return value is ignored on purpose: the assertion below
     # inspects self.left_df itself after the call.
     fpd.get_fuzzy_columns(
         self.left_df,
         self.right_df,
         left_cols=['col_1', 'col_2'],
     )

     frames_match = self.left_df.equals(expected)
     self.assertTrue(frames_match)
Пример #4
0
def do_fuzzy_match(left_dataframe,
                   right_dataframe,
                   left_cols,
                   target_cols,
                   max_edit_distance,
                   mapping_dict,
                   right_cols='description'):
    """Fuzzy-match a left column to a right column, then map via a dict.

    The fuzzy match yields a proxy value for each source (left) entry that
    is meant to equal a category (right) entry exactly; that proxy is then
    looked up in ``mapping_dict`` as a direct match.

    Args:
        left_dataframe: Source frame. It is written to: ``target_cols`` is
            assigned on this object before the returned frame is created.
        right_dataframe: Category frame handed to the matcher.
        left_cols: A single column label of ``left_dataframe``; it is
            wrapped in a one-element list for the matcher and used to build
            the ``fuzzy_<left_cols>`` column name.
        target_cols: Label under which the mapped values are stored.
        max_edit_distance: Forwarded unchanged to
            ``matching.get_fuzzy_columns``.
        mapping_dict: Mapping applied to the proxy column via ``Series.map``.
        right_cols: A single column label of ``right_dataframe``; wrapped
            in a one-element list for the matcher.

    Returns:
        The result of dropping the ``fuzzy_<left_cols>`` helper column from
        ``left_dataframe``.
    """
    record_count = len(left_dataframe)
    logging.info(f'-- Trying to map {record_count} records with fuzzy match.')
    logging.info(
        f'--- match left on {left_cols} | match right on {right_cols}')

    # The matcher's return value is not used; the proxy column is read back
    # from left_dataframe below, so the matcher presumably adds it in place
    # (confirm against matching.get_fuzzy_columns).
    matching.get_fuzzy_columns(
        left_dataframe=left_dataframe,
        right_dataframe=right_dataframe,
        left_cols=[left_cols],
        right_cols=[right_cols],
        max_edit_distance=max_edit_distance,
    )

    proxy_col = 'fuzzy_{}'.format(left_cols)

    # Translate each proxy value through the dictionary.
    mapped_values = left_dataframe[proxy_col].map(mapping_dict)
    left_dataframe[target_cols] = mapped_values

    # The helper column is only needed for the lookup above.
    return left_dataframe.drop(columns=[proxy_col])