예제 #1
0
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of ``df`` with query-level summary columns added.

    Every added column holds one scalar value, assigned to the whole
    column of the copy; ``df`` itself is not modified.

    Args:
        df: Object providing ``copy()`` and column assignment
            (presumably a pandas DataFrame).
        inc: First text passed to the string metrics.
        exc: Second text passed to the string metrics.
        k1list: Sized collection of sized items (e.g. keyword strings).
        k2list: Sized collection of sized items (e.g. keyword strings).

    Returns:
        The copy of ``df`` with these columns, in this order:
        ``k1_count``, ``k2_count`` (number of items in each list),
        ``k1_max``, ``k2_max`` (length of the longest item, ``0`` for an
        empty list), and ``inc_jaro_exc``, ``inc_lev_exc``,
        ``inc_ji_exc``, ``inc_sd_exc``, ``inc_ro_exc`` (metric values
        computed on ``inc`` versus ``exc``).
    """
    df_new = df.copy()

    # Dict insertion order fixes the order in which columns are appended.
    features = {
        'k1_count': len(k1list),
        'k2_count': len(k2list),
        # default=0 keeps an empty keyword list from raising ValueError.
        'k1_max': max(map(len, k1list), default=0),
        'k2_max': max(map(len, k2list), default=0),
        'inc_jaro_exc': jellyfish.jaro_distance(inc, exc),
        'inc_lev_exc': jellyfish.levenshtein_distance(inc, exc),
        'inc_ji_exc': textdistance.jaccard(inc, exc),
        'inc_sd_exc': textdistance.sorensen(inc, exc),
        'inc_ro_exc': textdistance.ratcliff_obershelp(inc, exc),
    }
    for column, value in features.items():
        df_new[column] = value
    return df_new
예제 #2
0
 def criteria_features(x, col):
     """
     Score the lower-cased text in ``x[col]`` against ``crit``.

     ``crit`` is not a parameter; it is read from the surrounding scope.

     Returns:
         A 5-tuple of metric values in the order: jellyfish
         ``jaro_distance``, jellyfish ``levenshtein_distance``,
         textdistance ``jaccard``, ``sorensen``, ``ratcliff_obershelp``.
     """
     text = x[col].lower()
     metrics = (
         jellyfish.jaro_distance,
         jellyfish.levenshtein_distance,
         textdistance.jaccard,
         textdistance.sorensen,
         textdistance.ratcliff_obershelp,
     )
     return tuple(metric(text, crit) for metric in metrics)
def similarity_score(word_list1, word_list2):
    """
    Combine three textdistance scores for two word lists.

    ``sorensen`` and ``cosine`` are computed on the word lists
    themselves; ``ratcliff_obershelp.normalized_similarity`` is computed
    on the space-joined strings. The sum of the three values is divided
    by the module-level ``STRING_METRICS_COUNT``.
    """
    joined1, joined2 = " ".join(word_list1), " ".join(word_list2)
    scores = (
        textdistance.sorensen(word_list1, word_list2),
        textdistance.cosine(word_list1, word_list2),
        textdistance.ratcliff_obershelp.normalized_similarity(
            joined1, joined2),
    )
    return sum(scores) / STRING_METRICS_COUNT
예제 #4
0
def recherche(dicos, mot):
    """
    Rank dictionary entries by how closely their aliases match ``mot``.

    Args:
        dicos: Iterable of sequences whose element at index 1 is a
            mapping ``entry id -> lists``, where ``lists[0]`` is an
            iterable of alias strings.
        mot: The word to look up.

    Returns:
        A list of at most five ``(entry id, score)`` pairs sorted by
        ascending score, so the best match is last. An entry's score is
        the highest ``textdistance.sorensen(mot, alias)`` over its
        aliases, or ``0`` when it has no alias scoring above zero.
        Returns an empty list when there are no entries.
    """
    score = {}
    for dico in dicos:
        for entry_id, listes in dico[1].items():
            best = 0
            for alias in listes[0]:
                score_a = textdistance.sorensen(mot, alias)
                if score_a > best:
                    best = score_a
            # An id seen again in a later dictionary overwrites the
            # earlier score (same as the previous behaviour).
            score[entry_id] = best

    # Sort once, after all dictionaries are scanned; this also keeps the
    # result defined when ``dicos`` is empty.
    ranked = sorted(score.items(), key=lambda item: item[1])

    # ``[-5:]`` returns everything when fewer than five entries exist;
    # an explicit ``len(ranked) - 5`` start would go negative and drop
    # entries in that case.
    return ranked[-5:]
예제 #5
0
 def sm_features(x, col1, col2):
     """
     Compute four string metrics between ``x[col1]`` and ``x[col2]``.

     Returns:
         A 4-tuple in the order: jellyfish ``jaro_distance``, jellyfish
         ``levenshtein_distance``, textdistance ``jaccard``,
         textdistance ``sorensen``, computed on the lower-cased values.
         If either value is not equal to itself (as NaN is), all four
         entries are ``np.nan``.
     """
     # A value that differs from itself is NaN-like; no text to compare.
     if x[col1] != x[col1] or x[col2] != x[col2]:
         return (np.nan,) * 4

     text_a = x[col1].lower()
     text_b = x[col2].lower()
     return (
         jellyfish.jaro_distance(text_a, text_b),
         jellyfish.levenshtein_distance(text_a, text_b),
         textdistance.jaccard(text_a, text_b),
         textdistance.sorensen(text_a, text_b),
     )
예제 #6
0
def search_in_list(s_list, x, min_score=0.5):
    """
    Find the element of ``s_list`` that best matches ``x``.

    Args:
        s_list: Iterable of candidates. It is iterated at most once, so
            one-shot iterators such as ``map`` objects are supported.
        x: The value to look for.
        min_score: Lowest ``textdistance.sorensen`` score accepted for a
            non-exact match.

    Returns:
        ``(x, 1)`` on an exact match; otherwise ``(candidate, score)``
        for the first candidate with the highest score, provided that
        score is at least ``min_score``; otherwise ``None``.
    """
    # Membership fast path only for hash-based containers. Testing
    # ``x in s_list`` on a one-shot iterator (e.g. ``map``) would consume
    # it and leave nothing for the fuzzy scan below.
    if isinstance(s_list, (set, frozenset)) and x in s_list:
        return x, 1

    # Imported here so the exact-match fast path does not need it.
    import textdistance

    best_match, best_score = None, -1
    for candidate in s_list:
        if candidate == x:
            return x, 1
        score = textdistance.sorensen(candidate, x)
        if score > best_score:
            best_match, best_score = candidate, score

    if best_score >= min_score:
        return best_match, best_score
    return None
예제 #7
0
    def compare(self, str1, str2):
        """
        Compare two strings with ``sorensen`` and fill in ``self.result``.

        Sets on ``self.result``:
            distance: the value returned by ``sorensen(str1, str2)``.
            nos: the length of the longer input.
            threshold: the constant ``90``.
            similarity: ``distance`` multiplied by 100.

        Returns:
            ``self.result``.
        """
        if self.debug:
            self.log("sorensen comparison")

        # Only the metric call sits between the start/end timing hooks.
        self.start_time()
        self.result.distance = sorensen(str1, str2)
        self.end_time()

        outcome = self.result
        outcome.nos = max(len(str1), len(str2))
        outcome.threshold = 90
        outcome.similarity = outcome.distance * 100
        return outcome
예제 #8
0
def compare(s_inp, s_out):
	'''Average four textdistance scores for s_inp versus s_out.

	Calls textdistance.jaccard, sorensen, tversky and overlap on the two
	inputs exactly as given, prints each value, and takes their
	arithmetic mean. A non-zero mean is appended to ``lst``, a name that
	is defined outside this function. The mean is printed as well.

	NOTE: an earlier spaCy-based lemmatization of both inputs
	("en_core_web_sm") was disabled here; the inputs are not preprocessed.
	'''
	

	# Jaccard index
	jacc = textdistance.jaccard(s_inp, s_out)
	print("jaccard: ", jacc)
	
	# Sorensen-Dice coefficient
	soren = textdistance.sorensen(s_inp, s_out)
	print("Sorensen: ", soren)
	
	# Tversky index
	tvr = textdistance.tversky(s_inp, s_out)
	print("Tversky: ", tvr)
	
	# Overlap coefficient
	overlap = textdistance.overlap(s_inp, s_out)
	print("overlap_cofficient: ", overlap)
	
	# Tanimoto distance (disabled)
	#tanimoto_distance = textdistance.tanimoto(str_inp, str_out)
	#print("Tanimoto: ", tanimoto_distance)

	# Unweighted mean of the four scores above.
	res = (jacc+soren+tvr+overlap)/4
	# A mean of exactly zero is not recorded in ``lst``.
	if res == 0:
		pass
	else:
		lst.append(res)
	print("Result: {}".format(res))
	'''if (res >= 0.6):
예제 #9
0
파일: search.py 프로젝트: SakalikPeter/VINF
def similarity(type, a, b):
    """
    Compute a string similarity score with the metric named by ``type``.

    Args:
        type: Metric name. One of 'hamming', 'levenshtein',
            'jaro_winkler', 'jaccard', 'sorensen', 'ratcliff_obershelp'.
        a: First string, e.g. "John".
        b: Second string, e.g. "John Snow".

    Returns:
        The score produced by the matching textdistance metric, or
        ``None`` when ``type`` is not one of the names above.
        'hamming' and 'levenshtein' use ``normalized_similarity``;
        'jaccard' and 'sorensen' compare whitespace-separated tokens;
        the remaining metrics compare the raw strings.
    """
    # Metrics that work directly on the raw strings.
    if type == 'hamming':
        return textdistance.hamming.normalized_similarity(a, b)
    if type == 'levenshtein':
        return textdistance.levenshtein.normalized_similarity(a, b)
    if type == 'jaro_winkler':
        return textdistance.jaro_winkler(a, b)
    if type == 'ratcliff_obershelp':
        return textdistance.ratcliff_obershelp(a, b)

    # Metrics that compare the two strings token by token.
    if type in ('jaccard', 'sorensen'):
        if type == 'jaccard':
            metric = textdistance.jaccard
        else:
            metric = textdistance.sorensen
        return metric(a.split(), b.split())

    return None
예제 #10
0
    n2 = dt.datetime.now()
    ld_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #ld_time.append(end - start)

    #start = timeit.timeit()
    n1 = dt.datetime.now()
    ji = textdistance.jaccard(raw_text1, raw_text2)
    n2 = dt.datetime.now()
    ji_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #ji_time.append(end - start)

    #start = timeit.timeit()
    n1 = dt.datetime.now()
    sd = textdistance.sorensen(raw_text1, raw_text2)
    n2 = dt.datetime.now()
    sd_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #sd_time.append(end - start)

    #start = timeit.timeit()
    n1 = dt.datetime.now()
    ro = textdistance.ratcliff_obershelp(raw_text1, raw_text2)
    n2 = dt.datetime.now()
    ro_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #ro_time.append(end - start)

print("jellyfish.jaro_distance")
print(sum(jd_time) / 50000)
예제 #11
0
 def dice_similarity(self, string_1, string_2):
     """
     Return ``textdistance.sorensen(string_1, string_2)``.

     NOTE(review): in textdistance this call appears to yield the
     Sorensen-Dice similarity coefficient rather than a distance;
     confirm against the library documentation.
     """
     score = textdistance.sorensen(string_1, string_2)
     return score
예제 #12
0
def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    #--------------------
    # Edit-based.
    if True:
        print("textdistance.hamming({}, {}) = {}.".format(
            str1, str2, textdistance.hamming(str1, str2)))
        print("textdistance.hamming.distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.distance(str1, str2)))
        print("textdistance.hamming.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.similarity(str1, str2)))
        print("textdistance.hamming.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.normalized_distance(str1, str2)))
        print(
            "textdistance.hamming.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.hamming.normalized_similarity(str1, str2)))
        print(
            "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Hamming(qval=qval,
                                     test_func=None,
                                     truncate=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.mlipns({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns(str1, str2)))
        print("textdistance.mlipns.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.distance(str1, str2)))
        print("textdistance.mlipns.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.similarity(str1, str2)))
        print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_distance(str1, str2)))
        print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_similarity(str1, str2)))
        print(
            "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MLIPNS(threshold=0.25,
                                    maxmismatches=2,
                                    qval=qval,
                                    external=True).distance(str1, str2)))

        print("textdistance.levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein(str1, str2)))
        print("textdistance.levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.distance(str1, str2)))
        print("textdistance.levenshtein.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.similarity(str1, str2)))
        print("textdistance.levenshtein.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.levenshtein.normalized_distance(str1, str2)))
        print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.levenshtein.normalized_similarity(str1, str2)))
        print(
            "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Levenshtein(qval=qval,
                                         test_func=None,
                                         external=True).distance(str1, str2)))

        print("textdistance.damerau_levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein(str1, str2)))
        print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein.distance(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.damerau_levenshtein.similarity(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.DamerauLevenshtein(qval=qval,
                                                test_func=None,
                                                external=True).distance(
                                                    str1, str2)))

        print("textdistance.jaro({}, {}) = {}.".format(
            str1, str2, textdistance.jaro(str1, str2)))
        print("textdistance.jaro.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.distance(str1, str2)))
        print("textdistance.jaro.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.similarity(str1, str2)))
        print("textdistance.jaro.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_distance(str1, str2)))
        print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaro(long_tolerance=False,
                                  qval=qval,
                                  external=True).distance(str1, str2)))

        print("textdistance.jaro_winkler({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler(str1, str2)))
        print("textdistance.jaro_winkler.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.distance(str1, str2)))
        print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.similarity(str1, str2)))
        print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.jaro_winkler.normalized_distance(str1,
                                                                   str2)))
        print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.jaro_winkler.normalized_similarity(str1, str2)))
        print(
            "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.JaroWinkler(long_tolerance=False,
                                         winklerize=True,
                                         qval=qval,
                                         external=True).distance(str1, str2)))

        print("textdistance.strcmp95({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95(str1, str2)))
        print("textdistance.strcmp95.distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.distance(str1, str2)))
        print("textdistance.strcmp95.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.similarity(str1, str2)))
        print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.normalized_distance(str1, str2)))
        print(
            "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.strcmp95.normalized_similarity(str1, str2)))
        print(
            "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.StrCmp95(long_strings=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.needleman_wunsch({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch(str1, str2)))
        print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.distance(str1, str2)))
        print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.similarity(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_distance(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.NeedlemanWunsch(gap_cost=1.0,
                                             sim_func=None,
                                             qval=qval,
                                             external=True).distance(
                                                 str1, str2)))

        print("textdistance.gotoh({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh(str1, str2)))
        print("textdistance.gotoh.distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.distance(str1, str2)))
        print("textdistance.gotoh.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.similarity(str1, str2)))
        print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_distance(str1, str2)))
        print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_similarity(str1, str2)))
        print(
            "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Gotoh(gap_open=1,
                                   gap_ext=0.4,
                                   sim_func=None,
                                   qval=qval,
                                   external=True).distance(str1, str2)))

        print("textdistance.smith_waterman({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman(str1, str2)))
        print("textdistance.smith_waterman.distance({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.distance(str1, str2)))
        print("textdistance.smith_waterman.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.similarity(str1, str2)))
        print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.smith_waterman.normalized_distance(str1, str2)))
        print(
            "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.smith_waterman.normalized_similarity(str1, str2)))
        print(
            "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.SmithWaterman(gap_cost=1.0,
                                           sim_func=None,
                                           qval=qval,
                                           external=True).distance(str1,
                                                                   str2)))

    #--------------------
    # Token-based.
    if False:
        print("textdistance.jaccard({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard(str1, str2)))
        print("textdistance.jaccard.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.distance(str1, str2)))
        print("textdistance.jaccard.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.similarity(str1, str2)))
        print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.normalized_distance(str1, str2)))
        print(
            "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.jaccard.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaccard(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.sorensen({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen(str1, str2)))
        print("textdistance.sorensen.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.distance(str1, str2)))
        print("textdistance.sorensen.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.similarity(str1, str2)))
        print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.normalized_distance(str1, str2)))
        print(
            "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sorensen.normalized_similarity(str1, str2)))
        print(
            "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Sorensen(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.sorensen_dice({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice(str1, str2)))
        print("textdistance.sorensen_dice.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.distance(str1, str2)))
        print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.similarity(str1, str2)))
        print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_distance(str1, str2)))
        print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_similarity(str1,
                                                                   str2)))
        #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2)))

        print("textdistance.tversky({}, {}) = {}.".format(
            str1, str2, textdistance.tversky(str1, str2)))
        print("textdistance.tversky.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.distance(str1, str2)))
        print("textdistance.tversky.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.similarity(str1, str2)))
        print("textdistance.tversky.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.normalized_distance(str1, str2)))
        print(
            "textdistance.tversky.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tversky.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tversky(qval=qval,
                                     ks=None,
                                     bias=None,
                                     as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.overlap({}, {}) = {}.".format(
            str1, str2, textdistance.overlap(str1, str2)))
        print("textdistance.overlap.distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.distance(str1, str2)))
        print("textdistance.overlap.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.similarity(str1, str2)))
        print("textdistance.overlap.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.normalized_distance(str1, str2)))
        print(
            "textdistance.overlap.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.overlap.normalized_similarity(str1, str2)))
        print(
            "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Overlap(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1.
        print("textdistance.tanimoto({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto(str1, str2)))
        print("textdistance.tanimoto.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.distance(str1, str2)))
        print("textdistance.tanimoto.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.similarity(str1, str2)))
        print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.normalized_distance(str1, str2)))
        print(
            "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tanimoto.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tanimoto(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.cosine({}, {}) = {}.".format(
            str1, str2, textdistance.cosine(str1, str2)))
        print("textdistance.cosine.distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.distance(str1, str2)))
        print("textdistance.cosine.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.similarity(str1, str2)))
        print("textdistance.cosine.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_distance(str1, str2)))
        print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_similarity(str1, str2)))
        print(
            "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Cosine(qval=qval, as_set=False,
                                    external=True).distance(str1, str2)))

        print("textdistance.monge_elkan({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan(str1, str2)))
        print("textdistance.monge_elkan.distance({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.distance(str1, str2)))
        print("textdistance.monge_elkan.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.similarity(str1, str2)))
        print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.monge_elkan.normalized_distance(str1, str2)))
        print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.monge_elkan.normalized_similarity(str1, str2)))
        print(
            "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MongeElkan(
                    algorithm=textdistance.DamerauLevenshtein(),
                    symmetric=False,
                    qval=qval,
                    external=True).distance(str1, str2)))

        print("textdistance.bag({}, {}) = {}.".format(
            str1, str2, textdistance.bag(str1, str2)))
        print("textdistance.bag.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.distance(str1, str2)))
        print("textdistance.bag.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.similarity(str1, str2)))
        print("textdistance.bag.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_distance(str1, str2)))
        print("textdistance.bag.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_similarity(str1, str2)))
        print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.Bag(qval=qval).distance(str1, str2)))

    #--------------------
    # Sequence-based.
    if False:
        print("textdistance.lcsseq({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq(str1, str2)))
        print("textdistance.lcsseq.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.distance(str1, str2)))
        print("textdistance.lcsseq.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.similarity(str1, str2)))
        print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_distance(str1, str2)))
        print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2)))
        #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2)))
        print("textdistance.LCSSeq().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LCSSeq().distance(str1, str2)))

        print("textdistance.lcsstr({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr(str1, str2)))
        print("textdistance.lcsstr.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.distance(str1, str2)))
        print("textdistance.lcsstr.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.similarity(str1, str2)))
        print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_distance(str1, str2)))
        print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2)))
        print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.LCSStr(qval=qval).distance(str1, str2)))

        print("textdistance.ratcliff_obershelp({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp(str1, str2)))
        print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.ratcliff_obershelp.similarity(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_similarity(
                    str1, str2)))
        print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RatcliffObershelp().distance(str1, str2)))

    #--------------------
    # Compression-based.
    if False:
        print("textdistance.arith_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd(str1, str2)))
        print("textdistance.arith_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.distance(str1, str2)))
        print("textdistance.arith_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.similarity(str1, str2)))
        print(
            "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.arith_ncd.normalized_distance(str1, str2)))
        print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.arith_ncd.normalized_similarity(str1, str2)))
        #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2)))
        print("textdistance.ArithNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ArithNCD().distance(str1, str2)))

        print("textdistance.rle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd(str1, str2)))
        print("textdistance.rle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.distance(str1, str2)))
        print("textdistance.rle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.similarity(str1, str2)))
        print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.rle_ncd.normalized_similarity(str1, str2)))
        print("textdistance.RLENCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RLENCD().distance(str1, str2)))

        print("textdistance.bwtrle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd(str1, str2)))
        print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.distance(str1, str2)))
        print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.similarity(str1, str2)))
        print(
            "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.bwtrle_ncd.normalized_distance(str1, str2)))
        print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.bwtrle_ncd.normalized_similarity(str1,
                                                                   str2)))
        print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.BWTRLENCD(terminator='\0').distance(str1,
                                                                   str2)))

        print("textdistance.sqrt_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd(str1, str2)))
        print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.distance(str1, str2)))
        print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.similarity(str1, str2)))
        print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sqrt_ncd.normalized_similarity(str1, str2)))
        print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.SqrtNCD(qval=qval).distance(str1, str2)))

        print("textdistance.entropy_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd(str1, str2)))
        print("textdistance.entropy_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.distance(str1, str2)))
        print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.similarity(str1, str2)))
        print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.entropy_ncd.normalized_distance(str1, str2)))
        print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.entropy_ncd.normalized_similarity(str1, str2)))
        print(
            "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.EntropyNCD(qval=qval, coef=1,
                                        base=2).distance(str1, str2)))

        print("textdistance.bz2_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd(str1, str2)))
        print("textdistance.bz2_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.distance(str1, str2)))
        print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.similarity(str1, str2)))
        print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.bz2_ncd.normalized_similarity(str1, str2)))
        print("textdistance.BZ2NCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.BZ2NCD().distance(str1, str2)))

        print("textdistance.lzma_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd(str1, str2)))
        print("textdistance.lzma_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.distance(str1, str2)))
        print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.similarity(str1, str2)))
        print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.lzma_ncd.normalized_similarity(str1, str2)))
        print("textdistance.LZMANCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LZMANCD().distance(str1, str2)))

        print("textdistance.zlib_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd(str1, str2)))
        print("textdistance.zlib_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.distance(str1, str2)))
        print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.similarity(str1, str2)))
        print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.zlib_ncd.normalized_similarity(str1, str2)))
        print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ZLIBNCD().distance(str1, str2)))

    #--------------------
    # Phonetic.
    if False:
        print("textdistance.mra({}, {}) = {}.".format(
            str1, str2, textdistance.mra(str1, str2)))
        print("textdistance.mra.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.distance(str1, str2)))
        print("textdistance.mra.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.similarity(str1, str2)))
        print("textdistance.mra.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_distance(str1, str2)))
        print("textdistance.mra.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_similarity(str1, str2)))
        print("textdistance.MRA().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.MRA().distance(str1, str2)))

        print("textdistance.editex({}, {}) = {}.".format(
            str1, str2, textdistance.editex(str1, str2)))
        print("textdistance.editex.distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.distance(str1, str2)))
        print("textdistance.editex.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.similarity(str1, str2)))
        print("textdistance.editex.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_distance(str1, str2)))
        print("textdistance.editex.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_similarity(str1, str2)))
        print(
            "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Editex(local=False,
                                    match_cost=0,
                                    group_cost=1,
                                    mismatch_cost=2,
                                    groups=None,
                                    ungrouped=None,
                                    external=True).distance(str1, str2)))

    #--------------------
    # Simple.
    if False:
        print("textdistance.prefix({}, {}) = {}.".format(
            str1, str2, textdistance.prefix(str1, str2)))
        print("textdistance.prefix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.distance(str1, str2)))
        print("textdistance.prefix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.similarity(str1, str2)))
        print("textdistance.prefix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_distance(str1, str2)))
        print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Prefix(qval=qval,
                                    sim_test=None).distance(str1, str2)))

        print("textdistance.postfix({}, {}) = {}.".format(
            str1, str2, textdistance.postfix(str1, str2)))
        print("textdistance.postfix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.distance(str1, str2)))
        print("textdistance.postfix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.similarity(str1, str2)))
        print("textdistance.postfix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.normalized_distance(str1, str2)))
        print(
            "textdistance.postfix.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.postfix.normalized_similarity(str1, str2)))
        #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2)))
        print("textdistance.Postfix().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Postfix().distance(str1, str2)))

        print("textdistance.length({}, {}) = {}.".format(
            str1, str2, textdistance.length(str1, str2)))
        print("textdistance.length.distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.distance(str1, str2)))
        print("textdistance.length.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.similarity(str1, str2)))
        print("textdistance.length.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_distance(str1, str2)))
        print("textdistance.length.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_similarity(str1, str2)))
        print("textdistance.Length().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Length().distance(str1, str2)))

        print("textdistance.identity({}, {}) = {}.".format(
            str1, str2, textdistance.identity(str1, str2)))
        print("textdistance.identity.distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.distance(str1, str2)))
        print("textdistance.identity.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.identity.similarity(str1, str2)))
        print("textdistance.identity.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.normalized_distance(str1, str2)))
        print(
            "textdistance.identity.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.identity.normalized_similarity(str1, str2)))
        print("textdistance.Identity().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Identity().distance(str1, str2)))

        print("textdistance.matrix({}, {}) = {}.".format(
            str1, str2, textdistance.matrix(str1, str2)))
        print("textdistance.matrix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.distance(str1, str2)))
        print("textdistance.matrix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.similarity(str1, str2)))
        print("textdistance.matrix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_distance(str1, str2)))
        print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Matrix(mat=None,
                                    mismatch_cost=0,
                                    match_cost=1,
                                    symmetric=True,
                                    external=True).distance(str1, str2)))
예제 #13
0
    def scan(self, inputFile):
        """
        Scan a file and score it against a narrowed set of candidate licenses.

        The candidate set is the union of the licenses named by the n-gram
        guess and the licenses already reported by ``initial_match``. Each
        candidate is scored with the algorithm selected by ``self.simType``
        and kept when it reaches that algorithm's threshold (0.6 for cosine
        and dice, 0.9 for bigram cosine).

        :param inputFile: Input file path that needs to be scanned
        :return: List of dicts sorted by ``sim_score`` (highest first). The
            entries appended by this method have the keys below; entries
            coming from ``initial_match`` are passed through unchanged.

        +-------------+----------------------------------------------------------+
        | shortname   | Short name of the license                                |
        +-------------+----------------------------------------------------------+
        | sim_type    | Type of similarity from which the result is generated    |
        +-------------+----------------------------------------------------------+
        | sim_score   | Similarity score for the algorithm used mentioned above  |
        +-------------+----------------------------------------------------------+
        | description | Description/ comments for the similarity measure         |
        +-------------+----------------------------------------------------------+
        """
        processedData = super().loadFile(inputFile)
        matches = initial_match(self.commentFile, processedData,
                                self.licenseList)

        # Short names suggested by the n-gram guess; every guess carries an
        # iterable of short names under 'shortname'.
        ngram_guesses = [
            shortname
            for guess in self.__Ngram_guess(processedData)
            for shortname in guess['shortname']
        ]
        all_guesses = unique([match['shortname'] for match in matches])

        # Narrow the licenses for THIS scan only. The filtered frame is kept
        # in a local: assigning it back to self.licenseList would permanently
        # shrink the list seen by every later scan() on the same instance.
        known_names = self.licenseList.shortname
        candidates = self.licenseList[known_names.isin(ngram_guesses)
                                      | known_names.isin(all_guesses)]
        # sort_values/reset_index return new frames, so the result has to be
        # rebound for the ordering to take effect.
        candidates = candidates.sort_values('shortname').reset_index(drop=True)

        # Only one algorithm is active per call, so one result list suffices.
        scored_matches = []
        for shortname, license_text in zip(candidates['shortname'],
                                           candidates['processed_text']):

            if self.simType == self.NgramAlgo.cosineSim:
                # cosine similarity with unigram
                cosineSim = cosine_similarity(
                    wordFrequency(license_text.split(" ")),
                    wordFrequency(processedData.split(" ")))
                if cosineSim >= 0.6:
                    scored_matches.append({
                        'shortname': shortname,
                        'sim_type': 'CosineSim',
                        'sim_score': cosineSim,
                        'description': ''
                    })
                if self.verbose > 0:
                    print("Cosine Sim ", str(cosineSim), shortname)

            elif self.simType == self.NgramAlgo.diceSim:
                # dice similarity
                diceSim = textdistance.sorensen(license_text.split(" "),
                                                processedData.split(" "))
                if diceSim >= 0.6:
                    scored_matches.append({
                        'shortname': shortname,
                        'sim_type': 'DiceSim',
                        'sim_score': diceSim,
                        'description': ''
                    })
                if self.verbose > 0:
                    print("Dice Sim ", str(diceSim), shortname)

            elif self.simType == self.NgramAlgo.bigramCosineSim:
                bigram_cosine_sim = cosine_similarity(
                    wordFrequency(self.__bigram_tokenize(license_text)),
                    wordFrequency(self.__bigram_tokenize(processedData)))
                if bigram_cosine_sim >= 0.9:
                    scored_matches.append({
                        'shortname': shortname,
                        'sim_type': 'BigramCosineSim',
                        'sim_score': bigram_cosine_sim,
                        'description': ''
                    })
                    # NOTE(review): unlike the other two branches, verbose
                    # output here is only printed for accepted matches;
                    # kept as in the original -- confirm this is intended.
                    if self.verbose > 0:
                        print("Bigram Cosine Sim ", str(bigram_cosine_sim),
                              shortname)

        if scored_matches:
            matches = list(itertools.chain(matches, scored_matches))

        # Best match first; list.sort is stable, so equal scores keep the
        # initial-match-then-shortname order established above.
        matches.sort(key=lambda x: x['sim_score'], reverse=True)
        return matches