示例#1
0
def get_similarities(string,
                     strings,
                     method='jaro_winkler',
                     similarity_function=None,
                     case_sensitivity=1.0,
                     first_char_weight=0.0,
                     first_word_weight=0.0,
                     echo=0):
    """
    Compare one string against every entry of ``strings``.

    Each comparison is delegated to ``get_similarity`` and the whole batch is
    driven through ``ProgressBar.map``; the mapped results are returned as a
    list.

    :param string: reference value; it is converted with ``str()`` before use
    :type strings: list[str]
    :param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
    :param similarity_function: forwarded unchanged to ``get_similarity``
        (presumably a custom scoring callable -- confirm against that function)
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :param int echo: verbosity handed to ``ProgressBar.map``; negative values
        are clamped to 0
    :rtype: list
    """
    reference = str(string)
    verbosity = max(0, echo)
    # Label shown by the progress bar: the reference string followed by ' ? '.
    label = reference + ' ? '

    def _score(other):
        # Compare the fixed reference string with one element of ``strings``.
        return get_similarity(
            s1=reference,
            s2=other,
            method=method,
            similarity_function=similarity_function,
            first_char_weight=first_char_weight,
            case_sensitivity=case_sensitivity,
            first_word_weight=first_word_weight)

    scores = ProgressBar.map(
        function=_score,
        iterable=strings,
        iterable_text=strings,
        text=label,
        echo=verbosity)
    return list(scores)
示例#2
0
def find_most_similar(strings,
                      candidates,
                      candidate_ids=None,
                      string_ids=None,
                      method='jaro_winkler',
                      similarity_function=None,
                      case_sensitivity=1.0,
                      first_char_weight=0.0,
                      first_word_weight=0.0,
                      num_results=1,
                      echo=0):
    """
    Find the most similar candidates for each of the given strings.

    Every string is handed, together with its id, to
    ``find_most_similar_for_one_string``; the per-string results are
    concatenated into a single frame with a fresh 0..n-1 index.

    :param strings: one string or a list of strings; a single ``str`` is
        treated as a one-element list rather than iterated per character
    :type strings: str or list[str]
    :type candidates: list[str]
    :type candidate_ids: list or NoneType
    :param string_ids: ids paired positionally with ``strings``; defaults to
        ``range(len(strings))``
    :type string_ids: list or NoneType
    :param str method: can be one of 'jaro_winkler', 'levenshtein', 'sentence_jaro_winkler', 'sentence_levenshtein'
    :param similarity_function: forwarded unchanged to
        ``find_most_similar_for_one_string``
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :param int num_results: forwarded to ``find_most_similar_for_one_string``
        (presumably the number of top matches kept per string -- confirm there)
    :param int echo: verbosity for the progress bar; negative values are
        clamped to 0 and ``echo - 1`` is passed to the per-string call
    :return: frame with the columns 'string_id', 'string', 'candidate_id',
        'candidate', 'similarity_rank', 'similarity'; empty (same columns)
        when ``strings`` is empty
    :rtype: pd.DataFrame
    """
    columns = [
        'string_id', 'string', 'candidate_id', 'candidate',
        'similarity_rank', 'similarity'
    ]

    echo = max(0, echo)

    # The documented contract accepts a bare string; without this wrap,
    # zip() below would pair ids with the individual characters.
    if isinstance(strings, str):
        strings = [strings]

    if string_ids is None:
        string_ids = range(len(strings))

    def _match_one(pair):
        # ``pair`` is a (string, string_id) tuple produced by the zip below.
        return find_most_similar_for_one_string(
            string=pair[0],
            string_id=pair[1],
            candidates=candidates,
            candidate_ids=candidate_ids,
            method=method,
            similarity_function=similarity_function,
            case_sensitivity=case_sensitivity,
            first_char_weight=first_char_weight,
            first_word_weight=first_word_weight,
            num_results=num_results,
            echo=echo - 1)

    frames = list(
        ProgressBar.map(
            function=_match_one,
            iterable=list(zip(strings, string_ids)),
            iterable_text=strings,
            echo=echo))

    # pd.concat raises ValueError on an empty list, so return an empty
    # result with the expected columns instead.
    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames).reset_index(drop=True)[columns]