Example #1
0
 def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score,
              threshold=0.5, dampen = True):
     """Set up the soft TF-IDF measure.

     Args:
         corpus_list (list): Corpus of token lists used to compute
             document frequencies (defaults to None, i.e. no corpus).
         sim_func (function): Secondary token-level similarity function
             (defaults to the raw Jaro score).
         threshold (float): Minimum secondary similarity for two tokens
             to be considered a match (defaults to 0.5).
         dampen (bool): Whether to dampen the term frequencies
             (defaults to True).
     """
     self.__corpus_list = corpus_list
     self.__document_frequency = {}
     # populate self.__document_frequency from the corpus
     self.__compute_document_frequency()
     if self.__corpus_list is None:
         self.__corpus_size = 0
     else:
         self.__corpus_size = len(self.__corpus_list)
     self.sim_func = sim_func
     self.threshold = threshold
     self.dampen = dampen
     super(SoftTfIdf, self).__init__()
Example #2
0
    def get_raw_score(self, string1, string2):
        """Computes the raw Jaro-Winkler score between two strings.

        The result is the plain Jaro similarity boosted by a bonus
        proportional to the length of the common prefix (at most 4
        characters) and to ``self.prefix_weight``.

        Args:
            string1,string2 (str): Input strings.

        Returns:
            Jaro-Winkler similarity score (float).

        Raises:
            TypeError : If the inputs are not strings or if one of the inputs is None.

        Examples:
            >>> jw = JaroWinkler()
            >>> jw.get_raw_score('MARTHA', 'MARHTA')
            0.9611111111111111
            >>> jw.get_raw_score('DWAYNE', 'DUANE')
            0.84
            >>> jw.get_raw_score('DIXON', 'DICKSONX')
            0.8133333333333332

        """

        # validate inputs and normalize both arguments to unicode text
        utils.sim_check_for_none(string1, string2)

        string1 = utils.convert_to_unicode(string1)
        string2 = utils.convert_to_unicode(string2)

        utils.tok_check_for_string_input(string1, string2)

        # an empty string matches nothing
        if utils.sim_check_for_empty(string1, string2):
            return 0

        # start from the plain Jaro similarity
        score = Jaro().get_raw_score(string1, string2)

        # length of the common prefix, capped at 4 characters
        prefix_len = 0
        for ch1, ch2 in zip(string1, string2):
            if prefix_len == 4 or ch1 != ch2:
                break
            prefix_len += 1

        # Winkler boost: reward a shared prefix, scaled by prefix_weight
        if prefix_len:
            score += prefix_len * self.prefix_weight * (1 - score)

        return score
Example #3
0
    def __init__(self,
                 corpus_list=None,
                 sim_func=Jaro().get_raw_score,
                 threshold=0.95):
        """Set up the soft TF-IDF measure.

        Args:
            corpus_list (list): Corpus of token lists used to compute
                document frequencies (defaults to None, i.e. no corpus).
            sim_func (function): Secondary token-level similarity function
                (defaults to the raw Jaro score).
            threshold (float): Minimum secondary similarity for two tokens
                to be considered a match (defaults to 0.95).
        """
        self.__corpus_list = corpus_list
        self.__document_frequency = {}
        # populate self.__document_frequency from the corpus
        self.__compute_document_frequency()

        # cache the corpus size; 0 signals "no corpus supplied"
        self.__corpus_size = 0 if self.__corpus_list is None else (len(
            self.__corpus_list))
        self.sim_func = sim_func
        self.threshold = threshold
Example #4
0
def soft_tfidf_norm(bag1, bag2):
    """Return the normalized soft TF-IDF similarity between two token bags.

    Treats the two input bags themselves as a two-document corpus,
    matches tokens across bags with a secondary similarity function
    (raw Jaro, threshold 0.5), and combines the matches with TF-IDF
    weights.  Result is 1.0 for identical bags, 0 when either bag is
    empty, and otherwise a cosine-style normalized score.
    NOTE(review): assumes Python 3 semantics — `/` on L135-ish idf
    computations must be true division; confirm if this ever runs on
    Python 2.
    """

    # if the strings match exactly return 1.0
    if sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # if one of the strings is empty return 0
    if sim_check_for_empty(bag1, bag2):
        return 0

    # secondary similarity function and match threshold are fixed here
    sim_func = Jaro().get_raw_score
    threshold = 0.5

    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

    # find unique elements in the input lists and their document frequency
    # (iterating the Counters visits each distinct token once, so a token
    # present in both bags gets df == 2)
    local_df = {}
    for element in tf_x:
        local_df[element] = local_df.get(element, 0) + 1
    for element in tf_y:
        local_df[element] = local_df.get(element, 0) + 1

    # if corpus is not provided treat input string as corpus
    curr_df, corpus_size = (local_df, 2)

    # calculating the term sim score against the input string 2,
    # construct similarity map: for each token of bag1 keep only its
    # best above-threshold partner in bag2
    similarity_map = {}
    for term_x in tf_x:
        max_score = 0.0
        for term_y in tf_y:
            score = sim_func(term_x, term_y)
            # adding sim only if it is above threshold and
            # highest for this element
            if score > threshold and score > max_score:
                similarity_map[term_x] = (score, term_x, term_y)
                max_score = score

    # position of first string, second string and sim score
    # in the tuple
    first_string_pos = 1
    second_string_pos = 2
    sim_score_pos = 0

    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    # soft-tfidf calculation
    for element in local_df.keys():
        # denominator: accumulate squared TF-IDF norms of both vectors
        idf = corpus_size / curr_df[element]
        v_x = idf * tf_x.get(element, 0)
        v_x_2 += v_x * v_x
        v_y = idf * tf_y.get(element, 0)
        v_y_2 += v_y * v_y

    # greedily consume matches best-score-first; each token on either
    # side may participate in at most one match
    used_x = {}
    used_y = {}
    for sim in sorted(similarity_map.values(), reverse=True):
        if used_x.get(sim[first_string_pos]) is not None or used_y.get(sim[second_string_pos]) is not None:
            continue
        # df default of 1 guards tokens absent from curr_df
        idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1)
        idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1)
        v_x = idf_first * tf_x.get(sim[first_string_pos], 0)
        v_y = idf_second * tf_y.get(sim[second_string_pos], 0)
        result += v_x * v_y * sim[sim_score_pos]
        used_x[sim[first_string_pos]] = True
        used_y[sim[second_string_pos]]= True

    # cosine normalization; the v_x_2 == 0 guard avoids dividing by zero
    return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2))
 def __init__(self, sim_func=Jaro().get_raw_score, threshold=0.5):
     """Set up the generalized Jaccard measure.

     Args:
         sim_func (function): Token-level similarity function
             (defaults to the raw Jaro score).
         threshold (float): Minimum similarity for two tokens to be
             considered a match (defaults to 0.5).
     """
     self.threshold = threshold
     self.sim_func = sim_func
     super(GeneralizedJaccard, self).__init__()
Example #6
0
 def setup(self):
     # Benchmark fixture: build the Jaro matcher once per run so the
     # timed methods measure only get_raw_score.
     self.jaro = Jaro()
Example #7
0
class TimeJaro:
    """Timing benchmarks for the Jaro similarity measure.

    Each ``time_*`` method scores one pair of module-level fixture
    strings; the method-name suffix encodes the operand lengths
    (short/medium/long).  NOTE(review): the naming convention matches
    asv-style benchmarks — confirm against the benchmark runner.
    """

    def setup(self):
        # build the matcher once so only get_raw_score is timed
        self.jaro = Jaro()

    def time_short_short(self):
        self.jaro.get_raw_score(_short_string_1, _short_string_2)

    def time_medium_medium(self):
        self.jaro.get_raw_score(_medium_string_1, _medium_string_2)

    def time_long_long(self):
        self.jaro.get_raw_score(_long_string_1, _long_string_2)

    def time_short_medium(self):
        self.jaro.get_raw_score(_short_string_1, _medium_string_1)

    def time_short_long(self):
        self.jaro.get_raw_score(_short_string_1, _long_string_1)

    def time_medium_long(self):
        self.jaro.get_raw_score(_medium_string_1, _long_string_1)
Example #8
0
 def setup(self):
     # Benchmark fixture: build the Jaro matcher once per run so the
     # timed methods measure only get_raw_score.
     self.jaro = Jaro()
Example #9
0
class TimeJaro:
    """Timing benchmarks for the Jaro similarity measure.

    Each ``time_*`` method scores one pair of module-level fixture
    strings; the method-name suffix encodes the operand lengths
    (short/medium/long).  NOTE(review): the naming convention matches
    asv-style benchmarks — confirm against the benchmark runner.
    """

    def setup(self):
        # build the matcher once so only get_raw_score is timed
        self.jaro = Jaro()

    def time_short_short(self):
        self.jaro.get_raw_score(_short_string_1, _short_string_2)

    def time_medium_medium(self):
        self.jaro.get_raw_score(_medium_string_1, _medium_string_2)

    def time_long_long(self):
        self.jaro.get_raw_score(_long_string_1, _long_string_2)

    def time_short_medium(self):
        self.jaro.get_raw_score(_short_string_1, _medium_string_1)

    def time_short_long(self):
        self.jaro.get_raw_score(_short_string_1, _long_string_1)

    def time_medium_long(self):
        self.jaro.get_raw_score(_medium_string_1, _long_string_1)
# Instantiate one matcher per similarity measure with default settings.
# NOTE(review): presumably these come from py_stringmatching — confirm
# against the file's imports (not visible in this chunk).
af = Affine()
me = MongeElkan()
nw = NeedlemanWunsch()
sw = SmithWaterman()
bd = BagDistance()
cos = Cosine()
pr = PartialRatio()
sf = SoftTfIdf()
edx = Editex()
gj = GeneralizedJaccard()
jw = JaroWinkler()
lev = Levenshtein()
dice = Dice()
jac = Jaccard()
jaro = Jaro()
pts = PartialTokenSort()
rat = Ratio()
sound = Soundex()
tfidf = TfIdf()
ts = TokenSort()
tv_ind = TverskyIndex()
over_coef = OverlapCoefficient()

# Loading the pretrained word2vec binary takes a long time; the model
# file is expected in the current working directory.
print('Loading word2vec model...')
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                          binary=True)
print('Word2vec model are loaded.')