def _find_best_matching_rows(
        strings, right, right_on, na_ratio, two_na_ratio, case_sensitivity,
        score_name, num_threads, similarity_function, weights, num_results, echo
):
    """
    Score every row of ``right`` against ``strings`` and keep the top rows.

    A copy of ``right`` receives a new column named ``score_name`` holding the
    value returned by ``_get_similarity_between_strings_and_row`` for each row.
    The copy is then sorted by that column in descending order and its first
    ``num_results`` rows are returned.

    :param strings: forwarded unchanged to the row-similarity helper
    :param right: frame whose rows are scored; it is copied, not modified
    :param right_on: forwarded unchanged to the row-similarity helper
    :param na_ratio: forwarded unchanged to the row-similarity helper
    :param two_na_ratio: forwarded unchanged to the row-similarity helper
    :param case_sensitivity: forwarded unchanged to the row-similarity helper
    :param score_name: name of the score column added to the copy
    :param num_threads: 1 scores rows through ``ProgressBar.apply``; any other
        value scores them through ``Parallel`` with the threading backend
    :param similarity_function: forwarded unchanged to the row-similarity helper
    :param weights: forwarded unchanged to the row-similarity helper
    :param num_results: number of top-scoring rows to return
    :param echo: forwarded to the progress bar
    :rtype: DataFrame
    """
    scored = right.copy()

    # Everything except ``row`` is identical for every scoring call.
    fixed_arguments = {
        'strings': strings,
        'right_on': right_on,
        'na_ratio': na_ratio,
        'two_na_ratio': two_na_ratio,
        'case_sensitivity': case_sensitivity,
        'similarity_function': similarity_function,
        'weights': weights,
    }

    if num_threads != 1:
        executor = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        bar = ProgressBar(total=len(scored) + 1, echo=echo)
        scored[score_name] = executor(
            delayed(_get_similarity_between_strings_and_row)(row=row, **fixed_arguments)
            for _, row in iterate(scored.iterrows(), progress_bar=bar)
        )
        # Final update with an amount equal to the bar's total.
        bar.show(amount=len(scored) + 1)
    else:
        scored[score_name] = ProgressBar.apply(
            data=scored,
            function=lambda row: _get_similarity_between_strings_and_row(
                row=row, **fixed_arguments
            ),
            echo=echo
        )

    return scored.sort_values(by=score_name, ascending=False).iloc[:num_results]
def fuzzy_left_merge(
        left, right, left_on=None, right_on=None, on=None, suffixes=('_x', '_y'),
        score_name='match_ratio', na_ratio=0.5, two_na_ratio=0.75,
        similarity_function=None, weights=None, case_sensitivity=0.5,
        num_results=1, num_threads=-1, echo=1
):
    """
    Left-merge ``left`` with the rows of ``right`` selected by ``_match_rows``.

    Every row of a copy of ``left`` is tagged with a temporary ``fuzzy_id`` and
    passed to ``_match_rows`` together with a copy of ``right``. The frames
    returned by ``_match_rows`` are concatenated and joined back to the left
    copy on ``fuzzy_id`` with ``how='left'``; ``fuzzy_id`` is dropped from the
    result. An existing ``fuzzy_id`` column in ``left`` is therefore overwritten
    and removed from the output.

    :type left: DataFrame
    :type right: DataFrame
    :param left_on: column(s) of ``left`` to match on; defaults to ``on``
    :type left_on: list[str] or str or NoneType
    :param right_on: column(s) of ``right`` to match on; defaults to ``on``
    :type right_on: list[str] or str or NoneType
    :param on: column(s) used for both sides; defaults to the columns that
        ``left`` and ``right`` have in common
    :type on: list[str] or str or NoneType
    :param suffixes: passed to ``DataFrame.merge`` for overlapping column names
    :type suffixes: tuple[str, str]
    :param score_name: forwarded to ``_match_rows``; must not be an existing
        column name of ``left`` or ``right``
    :type score_name: str
    :param na_ratio: forwarded to ``_match_rows``
    :param two_na_ratio: forwarded to ``_match_rows``
    :param similarity_function: forwarded to ``_match_rows``
    :type similarity_function: callable
    :param weights: forwarded to ``_match_rows``
    :type case_sensitivity: float
    :param num_results: forwarded to ``_match_rows``
    :type num_results: int
    :param num_threads: 1 processes the left rows through ``ProgressBar.apply``;
        any other value processes them through ``Parallel`` with the threading
        backend
    :type num_threads: int
    :param echo: progress reporting level; ``echo - 1`` is passed to each
        ``_match_rows`` call
    :type echo: int or bool or ProgressBar
    :rtype: DataFrame
    :raises ValueError: if ``score_name`` is already a column of either frame
    :raises KeyError: if a requested column is missing from ``left`` or ``right``
    """
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')

    data1 = left.copy()
    data2 = right.copy()

    if on is None:
        # ``Index & Index`` is no longer a set intersection in current pandas;
        # ``Index.intersection`` is the explicit, supported spelling.
        on = list(data1.columns.intersection(data2.columns))

    if left_on is None:
        left_on = on
    if right_on is None:
        right_on = on

    # A single column name is allowed as a bare string. Wrap it so that the
    # checks below iterate over column names rather than over its characters.
    if isinstance(left_on, str):
        left_on = [left_on]
    if isinstance(right_on, str):
        right_on = [right_on]

    missing_left = [col for col in left_on if col not in data1.columns]
    if missing_left:
        raise KeyError(f'missing columns on left: {missing_left}')

    missing_right = [col for col in right_on if col not in data2.columns]
    if missing_right:
        raise KeyError(f'missing columns on right: {missing_right}')

    # Temporary key used to join the matches back to their left row.
    data1['fuzzy_id'] = range(len(data1))

    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1,
            echo=echo,
            function=lambda row: _match_rows(
                row=row, right=data2, left_on=left_on, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity, score_name=score_name,
                num_results=num_results, similarity_function=similarity_function,
                weights=weights, num_threads=1, echo=echo - 1
            )
        )
    else:
        parallel = Parallel(n_jobs=num_threads, backend='threading', require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)
        results = parallel(
            delayed(_match_rows)(
                row=row, right=data2, left_on=left_on, right_on=right_on,
                na_ratio=na_ratio, two_na_ratio=two_na_ratio,
                case_sensitivity=case_sensitivity, score_name=score_name,
                num_results=num_results, similarity_function=similarity_function,
                weights=weights, num_threads=1, echo=echo - 1
            )
            for index, row in iterate(data1.iterrows(), progress_bar=progress_bar)
        )
        # Final update with an amount equal to the bar's total.
        progress_bar.show(amount=len(data1) + 1)

    data2 = concat(results).reset_index(drop=True)
    return data1.merge(
        right=data2, on='fuzzy_id', how='left', suffixes=suffixes
    ).drop(columns='fuzzy_id')