Example #1
    def __init__(self, spacy_model_name):
        self.parser = spacy.load(spacy_model_name)
        self.estimator = textdistance.Hamming()
        self.gen_func = lambda x: get_tense_distractors(
            x, self.estimator, self.parser)
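
A minimal usage sketch of the pieces this fragment wires together (the spaCy model name and the sample words are hypothetical; get_tense_distractors is defined elsewhere in the source module):

import spacy
import textdistance

parser = spacy.load('en_core_web_sm')  # model name is an assumption
estimator = textdistance.Hamming()
# Compare two surface forms of a verb, as the distractor generator might:
print(estimator.distance('walked', 'walks'))  # 2: one substitution plus one extra character
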
Example #2
# In[6]:


textdistance.hamming.normalized_distance('test', 'text')


# In[7]:


textdistance.hamming.normalized_similarity('test', 'text')


# In[8]:


textdistance.Hamming(qval=2).distance('test', 'text')


# In[9]:


hamming = textdistance.Hamming(external=False)
hamming('text', 'testit')
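

# `external=False` forces textdistance's pure-Python implementation instead of
# delegating to a faster third-party library when one is installed. With the
# default `truncate=False`, the length difference also counts as mismatches,
# so 'text' vs 'testit' scores 3 (one substitution plus two trailing characters).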


# # Levenshtein
# 
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[10]:
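

# (Sketch: the original cell body is not shown; a Levenshtein call in the same
# style as the Hamming cells above.)

textdistance.levenshtein.distance('test', 'text')
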
Example #3
def process_sudoku_reads(output_file,
                         fastqr1,
                         fastqr2,
                         fastqi1,
                         fastqi2,
                         bc_seed,
                         bc_length,
                         index_table_file,
                         is_zipped=True,
                         threshold_count=5,
                         merge_dist=1):

    assert len(fastqr1) == len(fastqr2)
    assert len(fastqr1) == len(fastqi1)
    assert len(fastqr1) == len(fastqi2)

    print("Loading index table")

    idx_table = pd.read_csv(index_table_file, index_col=None, sep="\t")
    allowed_i5 = idx_table[I5_COL].tolist()
    allowed_i7 = idx_table[I7_COL].tolist()

    print("Loaded {n} pools with {i5} i5 indices and {i7} i7 indices".format(
        n=idx_table.shape[0], i5=len(allowed_i5), i7=len(allowed_i7)))

    linker = LinkSudokuBC(bc_seed, bc_length, is_zipped=is_zipped)
    bcs = linker.parse_fastq_mp(fastqi1, fastqi2, fastqr1)

    print("Barcode extraction complete")

    # Only keep indices that are in the table
    for i5 in list(bcs.keys()):
        if i5 not in allowed_i5:
            del bcs[i5]
        else:
            for i7 in list(bcs[i5].keys()):
                if i7 not in allowed_i7:
                    del bcs[i5][i7]

    print("Index restriction to whitelist complete")

    hamming = textdistance.Hamming()

    outputs = []
    for i5 in list(bcs.keys()):

        i5_ref = bcs[i5]

        for i7 in list(i5_ref.keys()):
            i57_ref = bcs[i5][i7]

            try:
                group = idx_table.loc[(idx_table[I5_COL] == i5) &
                                      (idx_table[I7_COL] == i7),
                                      GROUP_COL].tolist()[0]
                pool = idx_table.loc[(idx_table[I5_COL] == i5) &
                                     (idx_table[I7_COL] == i7),
                                     LOC_COL].tolist()[0]
            except IndexError:
                continue

            print("Processing {g} {p}".format(g=group, p=pool))

            # Keep only BCs over a count threshold
            valid_bcs = []
            for bc in list(i57_ref.keys()):
                if i57_ref[bc] > threshold_count:
                    valid_bcs.append(bc)

            # Merge BCs whose Hamming distance is below merge_dist,
            # keeping the higher-count barcode of each close pair
            bc_merge_toward = set()
            bc_merge_away = set()
            merged_bcs = set()
            for i in range(len(valid_bcs)):
                bc = valid_bcs[i]
                end_check = False
                for opp in valid_bcs[i + 1:]:
                    if hamming.distance(bc, opp) < merge_dist:
                        end_check = True
                        if i57_ref[bc] > i57_ref[opp]:
                            bc_merge_toward.add(bc)
                            bc_merge_away.add(opp)
                        else:
                            bc_merge_toward.add(opp)
                            bc_merge_away.add(bc)

                if not end_check:
                    merged_bcs.add(bc)

            merged_bcs = merged_bcs.union(
                bc_merge_toward.difference(bc_merge_away))

            merged_bcs = list(merged_bcs)
            merged_counts = [i57_ref[bc] for bc in merged_bcs]

            output_df = pd.DataFrame(list(zip(merged_bcs, merged_counts)),
                                     columns=[BC_COL, COUNT_COL])
            output_df[GROUP_COL] = group
            output_df[LOC_COL] = pool
            outputs.append(output_df)

    outputs = pd.concat(outputs)
    outputs = outputs.reindex(labels=[GROUP_COL, LOC_COL, BC_COL, COUNT_COL],
                              axis=1)
    outputs.sort_values(by=[GROUP_COL, LOC_COL], inplace=True)
    outputs.reset_index(inplace=True, drop=True)
    outputs.to_csv("raw_" + output_file, sep="\t")

    pivoted = map_barcodes_to_wells(outputs)
    pivoted.to_csv(output_file, sep="\t")

    return outputs, pivoted
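
The merge step above collapses near-duplicate barcodes toward the higher-count one. A self-contained sketch of just that logic (the counts and merge_dist=2 are made up for illustration; with the function's default merge_dist=1, only identical barcodes merge):

import textdistance

def merge_barcodes(counts, merge_dist=2):
    # Mirror of the merge loop in process_sudoku_reads: collapse barcodes
    # within merge_dist Hamming distance onto the higher-count barcode.
    # Note the result is order-sensitive, just like the original loop.
    hamming = textdistance.Hamming()
    bcs = list(counts)
    toward, away, kept = set(), set(), set()
    for i, bc in enumerate(bcs):
        hit = False
        for opp in bcs[i + 1:]:
            if hamming.distance(bc, opp) < merge_dist:
                hit = True
                hi, lo = (bc, opp) if counts[bc] > counts[opp] else (opp, bc)
                toward.add(hi)
                away.add(lo)
        if not hit:
            kept.add(bc)
    return kept | (toward - away)

print(merge_barcodes({'AACGA': 3, 'AACGT': 50, 'TTTTT': 20}))
# -> {'AACGT', 'TTTTT'}: 'AACGA' is absorbed into the higher-count 'AACGT'
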
Example #4
def hamming(string1, string2):
    # Normalized Hamming distance in [0, 1] via the module-level shortcut.
    # (The original also built an unused textdistance.Hamming() instance.)
    return textdistance.hamming.normalized_distance(string1, string2)
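
For example, 'test' and 'text' differ at one of four positions:

print(hamming('test', 'text'))  # 0.25
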
Example #5
    def __init__(self, pa_preprocessor, name, qval=1):
        super().__init__(pa_preprocessor)

        self.time_log = []
        self.qval = qval
        self.textdistance_name = name

        # Edit-based
        if name == 'Hamming':
            self.similar_measure = textdistance.Hamming(qval=qval)
        elif name == 'DamerauLevenshtein':
            self.similar_measure = textdistance.DamerauLevenshtein(qval=qval)
        elif name == 'Levenshtein':
            self.similar_measure = textdistance.Levenshtein(qval=qval)
        elif name == 'Mlipns':
            self.similar_measure = textdistance.MLIPNS(qval=qval)
        elif name == 'Jaro':
            self.similar_measure = textdistance.Jaro(qval=qval)
        elif name == 'JaroWinkler':
            self.similar_measure = textdistance.JaroWinkler(qval=qval)
        elif name == 'StrCmp95':
            self.similar_measure = textdistance.StrCmp95()
        elif name == 'NeedlemanWunsch':
            self.similar_measure = textdistance.NeedlemanWunsch(qval=qval)
        elif name == 'Gotoh':
            self.similar_measure = textdistance.Gotoh(qval=qval)
        elif name == 'SmithWaterman':
            self.similar_measure = textdistance.SmithWaterman(qval=qval)

        # Token based
        elif name == 'Jaccard':
            self.similar_measure = textdistance.Jaccard(qval=qval)
        elif name == 'Sorensen':
            self.similar_measure = textdistance.Sorensen(qval=qval)
        elif name == 'Tversky':
            self.similar_measure = textdistance.Tversky()
        elif name == 'Overlap':
            self.similar_measure = textdistance.Overlap(qval=qval)
        elif name == 'Tanimoto':
            self.similar_measure = textdistance.Tanimoto(qval=qval)
        elif name == 'Cosine':
            self.similar_measure = textdistance.Cosine(qval=qval)
        elif name == 'MongeElkan':
            self.similar_measure = textdistance.MongeElkan(qval=qval)
        elif name == 'Bag':
            self.similar_measure = textdistance.Bag(qval=qval)

        # Sequence based
        elif name == 'LCSSeq':
            self.similar_measure = textdistance.LCSSeq(qval=qval)
        elif name == 'LCSStr':
            self.similar_measure = textdistance.LCSStr(qval=qval)
        elif name == 'RatcliffObershelp':
            self.similar_measure = textdistance.RatcliffObershelp(qval=qval)

        # Compression based
        elif name == 'ArithNCD':
            self.similar_measure = textdistance.ArithNCD(qval=qval)
        elif name == 'RLENCD':
            self.similar_measure = textdistance.RLENCD(qval=qval)
        elif name == 'BWTRLENCD':
            self.similar_measure = textdistance.BWTRLENCD()
        elif name == 'SqrtNCD':
            self.similar_measure = textdistance.SqrtNCD(qval=qval)
        elif name == 'EntropyNCD':
            self.similar_measure = textdistance.EntropyNCD(qval=qval)

        # Simple:
        elif name == 'Prefix':
            self.similar_measure = textdistance.Prefix(qval=qval)
        elif name == 'Postfix':
            self.similar_measure = textdistance.Postfix(qval=qval)
        elif name == 'Length':
            self.similar_measure = textdistance.Length(qval=qval)
        elif name == 'Identity':
            self.similar_measure = textdistance.Identity(qval=qval)
        elif name == 'Matrix':
            self.similar_measure = textdistance.Matrix()
        else:
            raise ValueError('Unknown textdistance measure: {}'.format(name))
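
The chain above can also be expressed as a name-to-constructor registry; a sketch with a few entries (not the original code; constructors that take no qval are wrapped accordingly):

import textdistance

MEASURES = {
    'Hamming': lambda qval: textdistance.Hamming(qval=qval),
    'Levenshtein': lambda qval: textdistance.Levenshtein(qval=qval),
    'Jaccard': lambda qval: textdistance.Jaccard(qval=qval),
    'StrCmp95': lambda qval: textdistance.StrCmp95(),  # no qval parameter
}

def make_measure(name, qval=1):
    try:
        return MEASURES[name](qval)
    except KeyError:
        raise ValueError('Unknown measure: {}'.format(name))
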
Example #6
import textdistance


def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    #--------------------
    # Edit-based.
    if True:
        print("textdistance.hamming({}, {}) = {}.".format(
            str1, str2, textdistance.hamming(str1, str2)))
        print("textdistance.hamming.distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.distance(str1, str2)))
        print("textdistance.hamming.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.similarity(str1, str2)))
        print("textdistance.hamming.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.normalized_distance(str1, str2)))
        print(
            "textdistance.hamming.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.hamming.normalized_similarity(str1, str2)))
        print(
            "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Hamming(qval=qval,
                                     test_func=None,
                                     truncate=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.mlipns({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns(str1, str2)))
        print("textdistance.mlipns.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.distance(str1, str2)))
        print("textdistance.mlipns.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.similarity(str1, str2)))
        print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_distance(str1, str2)))
        print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_similarity(str1, str2)))
        print(
            "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MLIPNS(threshold=0.25,
                                    maxmismatches=2,
                                    qval=qval,
                                    external=True).distance(str1, str2)))

        print("textdistance.levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein(str1, str2)))
        print("textdistance.levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.distance(str1, str2)))
        print("textdistance.levenshtein.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.similarity(str1, str2)))
        print("textdistance.levenshtein.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.levenshtein.normalized_distance(str1, str2)))
        print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.levenshtein.normalized_similarity(str1, str2)))
        print(
            "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Levenshtein(qval=qval,
                                         test_func=None,
                                         external=True).distance(str1, str2)))

        print("textdistance.damerau_levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein(str1, str2)))
        print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein.distance(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.damerau_levenshtein.similarity(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.DamerauLevenshtein(qval=qval,
                                                test_func=None,
                                                external=True).distance(
                                                    str1, str2)))

        print("textdistance.jaro({}, {}) = {}.".format(
            str1, str2, textdistance.jaro(str1, str2)))
        print("textdistance.jaro.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.distance(str1, str2)))
        print("textdistance.jaro.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.similarity(str1, str2)))
        print("textdistance.jaro.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_distance(str1, str2)))
        print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaro(long_tolerance=False,
                                  qval=qval,
                                  external=True).distance(str1, str2)))

        print("textdistance.jaro_winkler({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler(str1, str2)))
        print("textdistance.jaro_winkler.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.distance(str1, str2)))
        print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.similarity(str1, str2)))
        print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.jaro_winkler.normalized_distance(str1,
                                                                   str2)))
        print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.jaro_winkler.normalized_similarity(str1, str2)))
        print(
            "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.JaroWinkler(long_tolerance=False,
                                         winklerize=True,
                                         qval=qval,
                                         external=True).distance(str1, str2)))

        print("textdistance.strcmp95({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95(str1, str2)))
        print("textdistance.strcmp95.distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.distance(str1, str2)))
        print("textdistance.strcmp95.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.similarity(str1, str2)))
        print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.normalized_distance(str1, str2)))
        print(
            "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.strcmp95.normalized_similarity(str1, str2)))
        print(
            "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.StrCmp95(long_strings=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.needleman_wunsch({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch(str1, str2)))
        print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.distance(str1, str2)))
        print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.similarity(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_distance(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.NeedlemanWunsch(gap_cost=1.0,
                                             sim_func=None,
                                             qval=qval,
                                             external=True).distance(
                                                 str1, str2)))

        print("textdistance.gotoh({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh(str1, str2)))
        print("textdistance.gotoh.distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.distance(str1, str2)))
        print("textdistance.gotoh.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.similarity(str1, str2)))
        print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_distance(str1, str2)))
        print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_similarity(str1, str2)))
        print(
            "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Gotoh(gap_open=1,
                                   gap_ext=0.4,
                                   sim_func=None,
                                   qval=qval,
                                   external=True).distance(str1, str2)))

        print("textdistance.smith_waterman({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman(str1, str2)))
        print("textdistance.smith_waterman.distance({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.distance(str1, str2)))
        print("textdistance.smith_waterman.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.similarity(str1, str2)))
        print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.smith_waterman.normalized_distance(str1, str2)))
        print(
            "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.smith_waterman.normalized_similarity(str1, str2)))
        print(
            "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.SmithWaterman(gap_cost=1.0,
                                           sim_func=None,
                                           qval=qval,
                                           external=True).distance(str1,
                                                                   str2)))

    #--------------------
    # Token-based.
    if False:
        print("textdistance.jaccard({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard(str1, str2)))
        print("textdistance.jaccard.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.distance(str1, str2)))
        print("textdistance.jaccard.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.similarity(str1, str2)))
        print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.normalized_distance(str1, str2)))
        print(
            "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.jaccard.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaccard(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.sorensen({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen(str1, str2)))
        print("textdistance.sorensen.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.distance(str1, str2)))
        print("textdistance.sorensen.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.similarity(str1, str2)))
        print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.normalized_distance(str1, str2)))
        print(
            "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sorensen.normalized_similarity(str1, str2)))
        print(
            "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Sorensen(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.sorensen_dice({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice(str1, str2)))
        print("textdistance.sorensen_dice.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.distance(str1, str2)))
        print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.similarity(str1, str2)))
        print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_distance(str1, str2)))
        print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_similarity(str1,
                                                                   str2)))
        #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2)))

        print("textdistance.tversky({}, {}) = {}.".format(
            str1, str2, textdistance.tversky(str1, str2)))
        print("textdistance.tversky.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.distance(str1, str2)))
        print("textdistance.tversky.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.similarity(str1, str2)))
        print("textdistance.tversky.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.normalized_distance(str1, str2)))
        print(
            "textdistance.tversky.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tversky.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tversky(qval=qval,
                                     ks=None,
                                     bias=None,
                                     as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.overlap({}, {}) = {}.".format(
            str1, str2, textdistance.overlap(str1, str2)))
        print("textdistance.overlap.distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.distance(str1, str2)))
        print("textdistance.overlap.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.similarity(str1, str2)))
        print("textdistance.overlap.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.normalized_distance(str1, str2)))
        print(
            "textdistance.overlap.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.overlap.normalized_similarity(str1, str2)))
        print(
            "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Overlap(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1.
        print("textdistance.tanimoto({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto(str1, str2)))
        print("textdistance.tanimoto.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.distance(str1, str2)))
        print("textdistance.tanimoto.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.similarity(str1, str2)))
        print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.normalized_distance(str1, str2)))
        print(
            "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tanimoto.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tanimoto(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.cosine({}, {}) = {}.".format(
            str1, str2, textdistance.cosine(str1, str2)))
        print("textdistance.cosine.distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.distance(str1, str2)))
        print("textdistance.cosine.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.similarity(str1, str2)))
        print("textdistance.cosine.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_distance(str1, str2)))
        print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_similarity(str1, str2)))
        print(
            "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Cosine(qval=qval, as_set=False,
                                    external=True).distance(str1, str2)))

        print("textdistance.monge_elkan({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan(str1, str2)))
        print("textdistance.monge_elkan.distance({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.distance(str1, str2)))
        print("textdistance.monge_elkan.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.similarity(str1, str2)))
        print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.monge_elkan.normalized_distance(str1, str2)))
        print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.monge_elkan.normalized_similarity(str1, str2)))
        print(
            "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MongeElkan(
                    algorithm=textdistance.DamerauLevenshtein(),
                    symmetric=False,
                    qval=qval,
                    external=True).distance(str1, str2)))

        print("textdistance.bag({}, {}) = {}.".format(
            str1, str2, textdistance.bag(str1, str2)))
        print("textdistance.bag.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.distance(str1, str2)))
        print("textdistance.bag.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.similarity(str1, str2)))
        print("textdistance.bag.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_distance(str1, str2)))
        print("textdistance.bag.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_similarity(str1, str2)))
        print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.Bag(qval=qval).distance(str1, str2)))

    #--------------------
    # Sequence-based.
    if False:
        print("textdistance.lcsseq({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq(str1, str2)))
        print("textdistance.lcsseq.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.distance(str1, str2)))
        print("textdistance.lcsseq.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.similarity(str1, str2)))
        print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_distance(str1, str2)))
        print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2)))
        #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2)))
        print("textdistance.LCSSeq().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LCSSeq().distance(str1, str2)))

        print("textdistance.lcsstr({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr(str1, str2)))
        print("textdistance.lcsstr.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.distance(str1, str2)))
        print("textdistance.lcsstr.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.similarity(str1, str2)))
        print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_distance(str1, str2)))
        print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2)))
        print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.LCSStr(qval=qval).distance(str1, str2)))

        print("textdistance.ratcliff_obershelp({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp(str1, str2)))
        print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.ratcliff_obershelp.similarity(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_similarity(
                    str1, str2)))
        print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RatcliffObershelp().distance(str1, str2)))

    #--------------------
    # Compression-based.
    if False:
        print("textdistance.arith_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd(str1, str2)))
        print("textdistance.arith_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.distance(str1, str2)))
        print("textdistance.arith_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.similarity(str1, str2)))
        print(
            "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.arith_ncd.normalized_distance(str1, str2)))
        print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.arith_ncd.normalized_similarity(str1, str2)))
        #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2)))
        print("textdistance.ArithNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ArithNCD().distance(str1, str2)))

        print("textdistance.rle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd(str1, str2)))
        print("textdistance.rle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.distance(str1, str2)))
        print("textdistance.rle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.similarity(str1, str2)))
        print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.rle_ncd.normalized_similarity(str1, str2)))
        print("textdistance.RLENCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RLENCD().distance(str1, str2)))

        print("textdistance.bwtrle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd(str1, str2)))
        print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.distance(str1, str2)))
        print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.similarity(str1, str2)))
        print(
            "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.bwtrle_ncd.normalized_distance(str1, str2)))
        print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.bwtrle_ncd.normalized_similarity(str1,
                                                                   str2)))
        print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.BWTRLENCD(terminator='\0').distance(str1,
                                                                   str2)))

        print("textdistance.sqrt_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd(str1, str2)))
        print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.distance(str1, str2)))
        print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.similarity(str1, str2)))
        print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sqrt_ncd.normalized_similarity(str1, str2)))
        print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.SqrtNCD(qval=qval).distance(str1, str2)))

        print("textdistance.entropy_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd(str1, str2)))
        print("textdistance.entropy_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.distance(str1, str2)))
        print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.similarity(str1, str2)))
        print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.entropy_ncd.normalized_distance(str1, str2)))
        print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.entropy_ncd.normalized_similarity(str1, str2)))
        print(
            "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.EntropyNCD(qval=qval, coef=1,
                                        base=2).distance(str1, str2)))

        print("textdistance.bz2_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd(str1, str2)))
        print("textdistance.bz2_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.distance(str1, str2)))
        print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.similarity(str1, str2)))
        print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.bz2_ncd.normalized_similarity(str1, str2)))
        print("textdistance.BZ2NCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.BZ2NCD().distance(str1, str2)))

        print("textdistance.lzma_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd(str1, str2)))
        print("textdistance.lzma_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.distance(str1, str2)))
        print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.similarity(str1, str2)))
        print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.lzma_ncd.normalized_similarity(str1, str2)))
        print("textdistance.LZMANCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LZMANCD().distance(str1, str2)))

        print("textdistance.zlib_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd(str1, str2)))
        print("textdistance.zlib_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.distance(str1, str2)))
        print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.similarity(str1, str2)))
        print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.zlib_ncd.normalized_similarity(str1, str2)))
        print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ZLIBNCD().distance(str1, str2)))

    #--------------------
    # Phonetic.
    if False:
        print("textdistance.mra({}, {}) = {}.".format(
            str1, str2, textdistance.mra(str1, str2)))
        print("textdistance.mra.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.distance(str1, str2)))
        print("textdistance.mra.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.similarity(str1, str2)))
        print("textdistance.mra.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_distance(str1, str2)))
        print("textdistance.mra.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_similarity(str1, str2)))
        print("textdistance.MRA().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.MRA().distance(str1, str2)))

        print("textdistance.editex({}, {}) = {}.".format(
            str1, str2, textdistance.editex(str1, str2)))
        print("textdistance.editex.distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.distance(str1, str2)))
        print("textdistance.editex.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.similarity(str1, str2)))
        print("textdistance.editex.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_distance(str1, str2)))
        print("textdistance.editex.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_similarity(str1, str2)))
        print(
            "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Editex(local=False,
                                    match_cost=0,
                                    group_cost=1,
                                    mismatch_cost=2,
                                    groups=None,
                                    ungrouped=None,
                                    external=True).distance(str1, str2)))

    #--------------------
    # Simple.
    if False:
        print("textdistance.prefix({}, {}) = {}.".format(
            str1, str2, textdistance.prefix(str1, str2)))
        print("textdistance.prefix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.distance(str1, str2)))
        print("textdistance.prefix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.similarity(str1, str2)))
        print("textdistance.prefix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_distance(str1, str2)))
        print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Prefix(qval=qval,
                                    sim_test=None).distance(str1, str2)))

        print("textdistance.postfix({}, {}) = {}.".format(
            str1, str2, textdistance.postfix(str1, str2)))
        print("textdistance.postfix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.distance(str1, str2)))
        print("textdistance.postfix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.similarity(str1, str2)))
        print("textdistance.postfix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.normalized_distance(str1, str2)))
        print(
            "textdistance.postfix.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.postfix.normalized_similarity(str1, str2)))
        #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2)))
        print("textdistance.Postfix().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Postfix().distance(str1, str2)))

        print("textdistance.length({}, {}) = {}.".format(
            str1, str2, textdistance.length(str1, str2)))
        print("textdistance.length.distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.distance(str1, str2)))
        print("textdistance.length.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.similarity(str1, str2)))
        print("textdistance.length.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_distance(str1, str2)))
        print("textdistance.length.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_similarity(str1, str2)))
        print("textdistance.Length().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Length().distance(str1, str2)))

        print("textdistance.identity({}, {}) = {}.".format(
            str1, str2, textdistance.identity(str1, str2)))
        print("textdistance.identity.distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.distance(str1, str2)))
        print("textdistance.identity.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.identity.similarity(str1, str2)))
        print("textdistance.identity.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.normalized_distance(str1, str2)))
        print(
            "textdistance.identity.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.identity.normalized_similarity(str1, str2)))
        print("textdistance.Identity().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Identity().distance(str1, str2)))

        print("textdistance.matrix({}, {}) = {}.".format(
            str1, str2, textdistance.matrix(str1, str2)))
        print("textdistance.matrix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.distance(str1, str2)))
        print("textdistance.matrix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.similarity(str1, str2)))
        print("textdistance.matrix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_distance(str1, str2)))
        print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Matrix(mat=None,
                                    mismatch_cost=0,
                                    match_cost=1,
                                    symmetric=True,
                                    external=True).distance(str1, str2)))
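
A standard entry point so the walkthrough runs as a script:

if __name__ == '__main__':
    simple_example()
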
Example #7
    def get_feature(self):
        query_len = self.querys_seg.apply(self.get_len)
        doc_len = self.doc_text_seg.apply(self.get_len)

        jaccard_sim = list(
            map(self.get_jaccard_sim, self.querys_seg, self.doc_text_seg))
        edit_distance = list(
            map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1),
                self.querys, self.doc_text))
        edit_jaro = list(
            map(lambda x, y: Levenshtein.jaro(x, y), self.querys,
                self.doc_text))
        edit_ratio = list(
            map(lambda x, y: Levenshtein.ratio(x, y), self.querys,
                self.doc_text))
        edit_jaro_winkler = list(
            map(lambda x, y: Levenshtein.jaro_winkler(x, y), self.querys,
                self.doc_text))
        hamming = list(
            map(
                lambda x, y: textdistance.Hamming(qval=None).
                normalized_distance(x, y), self.querys, self.doc_text))

        mht_sim, tf_mht_sim, cos_sim, tf_cos_sim, euc_sim, tf_euc_sim = self.get_tfidf_sim(
            self.querys_seg, self.doc_text_seg)
        gram_2_sim, gram_2_sim_ratio = self.get_n_grams(
            self.querys_seg, self.doc_text_seg, 2)
        gram_3_sim, gram_3_sim_ratio = self.get_n_grams(
            self.querys_seg, self.doc_text_seg, 3)

        bm25_group = self.get_bm25_group(
            self.data[['query_id', 'query_text', 'doc_text']])
        bm25_overall = list(
            map(self.get_bm25_overall, self.doc_ids, self.querys_seg))
        mat_cos_sim = list(
            map(lambda x, y: self.get_mat_cos_sim(x, y), self.querys_seg,
                self.doc_text_seg))
        query_vec = self.querys_seg.apply(lambda x: self.get_word_vec(x))
        doc_vec = self.doc_text_seg.apply(lambda x: self.get_word_vec(x))
        cos_mean_word2vec = list(map(self.get_cos_sim, query_vec, doc_vec))
        euc_mean_word2vec = list(
            map(self.get_euclidean_sim, query_vec, doc_vec))
        mhd_mean_word2vec = list(
            map(self.get_manhattan_distance, query_vec, doc_vec))
        # 'query_vec': query_vec, 'doc_vec': doc_vec,
        result = {
            'query_id': self.query_ids,
            'query_text': self.querys,
            'doc_id': self.doc_ids,
            'doc_text': self.doc_text,
            'relevence': self.relevences,
            'query_len': query_len,
            'doc_len': doc_len,
            'jaccard_sim': jaccard_sim,
            'edit_distance': edit_distance,
            'edit_jaro': edit_jaro,
            'edit_ratio': edit_ratio,
            'edit_jaro_winkler': edit_jaro_winkler,
            'hamming': hamming,
            'mht_sim': mht_sim,
            'tf_mht_sim': tf_mht_sim,
            'cos_sim': cos_sim,
            'tf_cos_sim': tf_cos_sim,
            'euc_sim': euc_sim,
            'tf_euc_sim': tf_euc_sim,
            'gram_2_sim': gram_2_sim,
            'gram_2_sim_ratio': gram_2_sim_ratio,
            'gram_3_sim': gram_3_sim,
            'gram_3_sim_ratio': gram_3_sim_ratio,
            'bm25_group': bm25_group,
            'bm25_overall': bm25_overall,
            'mat_cos_sim': mat_cos_sim,
            'cos_mean_word2vec': cos_mean_word2vec,
            'euc_mean_word2vec': euc_mean_word2vec,
            'mhd_mean_word2vec': mhd_mean_word2vec
        }

        return pd.DataFrame(result)
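
The hamming feature above passes qval=None, which makes textdistance tokenize on whitespace and compare word positions rather than characters. A quick illustration (sample strings are made up):

import textdistance

word_hamming = textdistance.Hamming(qval=None)
# One of three word positions differs -> 1/3.
print(word_hamming.normalized_distance('deep learning model',
                                       'deep neural model'))
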
Example #8
import textdistance as td


def metrics(x):
    a = x[4].strip()
    b = x[5].strip()

    al = a.lower()
    bl = b.lower()

    a_len = float(len(a))

    def tryit(x):
        # Some measures raise on degenerate input; score those pairs as 0.0.
        try:
            return x()
        except Exception:
            return 0.0

    def tempo(open_ch, close_ch, s):
        # Net balance of open_ch vs close_ch occurrences in s.
        return sum(1 if c == open_ch else (-1 if c == close_ch else 0)
                   for c in s)

    M = [
        x[3],
        tryit(lambda: td.bz2_ncd(a, b)),
        tryit(lambda: td.zlib_ncd(a, b)),
        tryit(lambda: td.prefix.normalized_similarity(a, b)),
        tryit(lambda: td.postfix.normalized_similarity(a, b)),
        tryit(lambda: td.matrix.normalized_similarity(a, b)),
        tryit(lambda: td.length.normalized_similarity(a, b)),
        tryit(lambda: td.Hamming().normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein().normalized_similarity(a, b)),
        tryit(
            lambda: td.DamerauLevenshtein(qval=2).normalized_similarity(a, b)),
        tryit(
            lambda: td.DamerauLevenshtein(qval=3).normalized_similarity(a, b)),
        tryit(
            lambda: td.DamerauLevenshtein(qval=4).normalized_similarity(a, b)),
        tryit(
            lambda: td.DamerauLevenshtein(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard().normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard().normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=2).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=3).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=4).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=5).normalized_similarity(al, bl)),
        tryit(lambda: td.Tversky().normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler().normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.StrCmp95().normalized_similarity(a, b)),
        tryit(lambda: td.StrCmp95().normalized_similarity(al, bl)),
        1.0 - (float(abs(tempo('(', ')', a) - tempo('(', ')', b))) / a_len),
        1.0 - (float(abs(tempo('[', ']', a) - tempo('[', ']', b))) / a_len),
        1.0 - (float(abs(tempo('{', '}', a) - tempo('{', '}', b))) / a_len),
        1.0 - (float(abs(tempo('<', '>', a) - tempo('<', '>', b))) / a_len)
    ]

    return '{} qid:{} {} # {}'.format(
        x[0], x[1], ' '.join(
            ['{}:{:.4f}'.format(k + 1, float(y)) for k, y in enumerate(M)]),
        x[2])
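
Each row produces one SVM-rank style line, "<label> qid:<id> 1:f1 2:f2 ... # <comment>". A hypothetical input row (field layout inferred from the indexing above: label, query id, comment, a precomputed feature, then the two strings):

row = ['1', '42', 'doc-7', 0.5, 'foo(bar)', 'foo(baz)']
print(metrics(row))  # e.g. "1 qid:42 1:0.5000 2:... # doc-7"
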
Example #9
def make_feature(data_or, vec_model):
    print('get features:')
    from gensim.models import Word2Vec
    vec_model = Word2Vec.load('pretrain_model/w2v_300.model')
    dictionary = corpora.Dictionary.load('temp_data/train_dictionary.dict')
    tfidf = models.TfidfModel.load("temp_data/train_tfidf.model")
    index = similarities.SparseMatrixSimilarity.load('temp_data/train_index.index')
    item_id_list = joblib.load('temp_data/paper_id.pkl')

    with open('temp_data/train_content.pkl', 'rb') as fr:
        corpus = pickle.load(fr)
    data = data_or.copy()

    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: np.nan if str(x) == 'nan' or len(x) < 9 else x)

    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: 'none' if str(x) == 'nan' or str(x).split(' ') == ['n', 'o', 'n', 'e'] else x)
    data['key_text_pre'] = data['key_text_pre'].fillna('none')
    data['description_text'] = data['description_text'].fillna('none')
    data['title_pro'] = data['title_pro'].fillna('none')
    data['description_text_pre'] = data['description_text_pre'].fillna('none')
    prefix = 'num_'
    
    # length of the key text, in tokens
    data[prefix + 'key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))

    # length of the description text, in tokens (appended feature)
    data[prefix + 'description_text_len'] = data['description_text'].apply(lambda x: len(x.split(' ')))

    # when the key text is very short (< 7 tokens), rebuild it from the description
    data.loc[data[prefix + 'key_text_len'] < 7, 'key_text_pre'] = data[data[prefix + 'key_text_len'] < 7][
        'description_text'].apply(
        lambda x: ' '.join(pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x)))).values

    # is the abstract empty?
    data[prefix + 'cate_pa_isnull'] = data['abstract_pre'].apply(lambda x: 1 if str(x) == 'none' else 0)

    # is the keywords field empty?
    data[prefix + 'cate_pkeywords_isnull'] = data['keywords'].apply(lambda x: 1 if str(x) == 'nan' else 0)


    # number of keywords that occur in the text (substring match)
    def get_num_key(x, y):
        if str(y) == 'nan':
            return -1
        num = 0
        for kw in y.strip(';').split(';'):
            if kw in x:
                num += 1
        return num
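    # Worked example (illustrative): get_num_key('deep learning methods',
    # 'deep;learning;graph;') -> 2, since 'deep' and 'learning' occur as
    # substrings of x but 'graph' does not; note this is substring matching,
    # not whole-token matching.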

    data[prefix + 'key_in_key_word_number'] = list(
        map(get_num_key, data['key_text_pre'], data['keywords']))
    # keyword hits divided by the total number of keywords
    data[prefix + 'key_in_key_word_number_rate'] = list(
        map(lambda x, y: 0 if x == -1 else x / len(y.strip(';').split(';')),
            data[prefix + 'key_in_key_word_number'], data['keywords']))

    # appended: the same pair of features against the raw description text
    data[prefix + 'key_in_key_word_number2'] = list(
        map(get_num_key, data['description_text'], data['keywords']))
    data[prefix + 'key_in_key_word_number2_rate'] = list(
        map(lambda x, y: 0 if x == -1 else x / len(y.strip(';').split(';')),
            data[prefix + 'key_in_key_word_number2'], data['keywords']))

    # counts and ratios of words shared between two text columns
    def get_num_common_words_and_ratio(merge, col):
        # merge data
        merge = merge[col]
        merge.columns = ['q1', 'q2']
        merge['q2'] = merge['q2'].apply(lambda x: 'none' if str(x) == 'nan' else x)

        q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values
        q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values

        q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values
        q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values

        q1_word_len_set = merge.q1.apply(lambda x: len(set(x.split(' ')))).values
        q2_word_len_set = merge.q2.apply(lambda x: len(set(x.split(' ')))).values

        result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))]
        result_ratio_q = [result[i] / q1_word_len[i] for i in range(len(q1_word_set))]
        result_ratio_t = [result[i] / q2_word_len[i] for i in range(len(q1_word_set))]

        result_ratio_q_set = [result[i] / q1_word_len_set[i] for i in range(len(q1_word_set))]
        result_ratio_t_set = [result[i] / q2_word_len_set[i] for i in range(len(q1_word_set))]

        return result, result_ratio_q, result_ratio_t, q1_word_len, q2_word_len, q1_word_len_set, q2_word_len_set, result_ratio_q_set, result_ratio_t_set
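    # Return order: common-word count, count/len(q1 tokens), count/len(q2 tokens),
    # token lengths of q1 and q2, unique-token lengths of q1 and q2, and the two
    # set-based overlap ratios.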

    data[prefix + 'common_words_k_pt'], \
    data[prefix + 'common_words_k_pt_k'], \
    data[prefix + 'common_words_k_pt_pt'], \
    data[prefix + 'k_len'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set'], \
    data[prefix + 'common_words_k_pt_pt_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'title_pro'])

    data[prefix + 'common_words_k_at'], \
    data[prefix + 'common_words_k_at_k'], \
    data[prefix + 'common_words_k_at_at'], \
    data[prefix + 'k_len'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set'], \
    data[prefix + 'common_words_k_at_at_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'abstract_pre'])

    # appended: the same shared-word features against the raw description text
    data[prefix + 'common_words_k_pt_2'], \
    data[prefix + 'common_words_k_pt_k_2'], \
    data[prefix + 'common_words_k_pt_pt_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set_2'], \
    data[prefix + 'common_words_k_pt_pt_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'title_pro'])

    data[prefix + 'common_words_k_at_2'], \
    data[prefix + 'common_words_k_at_k_2'], \
    data[prefix + 'common_words_k_at_at_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set_2'], \
    data[prefix + 'common_words_k_at_at_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'abstract_pre'])



    # Jaccard similarity (character-level: strings are treated as sets of characters)
    def jaccard(x, y):
        if str(y) == 'nan':
            y = 'none'
        x = set(x)
        y = set(y)
        return float(len(x & y) / len(x | y))
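    # Worked example (illustrative): jaccard('test', 'text') compares the
    # character sets {'t','e','s'} and {'t','e','x'}, giving |{t,e}| / |{t,e,s,x}|
    # = 2 / 4 = 0.5 -- i.e. character-level, not token-level, Jaccard.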

    data[prefix + 'jaccard_sim_k_pt'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa'] = list(
        map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['abstract_pre']))

    # appended; note jaccard_sim_k_pa2 pairs key_text_pre with description_text,
    # not description_text with abstract_pre
    data[prefix + 'jaccard_sim_k_pt2'] = list(map(lambda x, y: jaccard(x, y), data['description_text'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa2'] = list(
        map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['description_text']))

    # edit-distance features (python-Levenshtein)
    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_ratio'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['title_pro']))

    data[prefix + 'edict_distance_k_pa'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']),
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))

    # appended: the same edit-distance features against the description text
    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_ratio_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['title_pro']))

    data[prefix + 'edict_distance_k_pa_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']),
            data['abstract_pre']))
    data[prefix + 'edict_jaro_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # cosine similarity via gensim TF-IDF
    def get_sim(doc, corpus):
        corpus = corpus.split(' ')
        corpus_vec = [dictionary.doc2bow(corpus)]
        corpus_tfidf = tfidf[corpus_vec]
        featurenum = len(dictionary.token2id.keys())
        index_i = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
        doc = doc.split(' ')
        vec = dictionary.doc2bow(doc)
        vec_tfidf = tfidf[vec]
        sim = index_i.get_similarities(vec_tfidf)
        return sim[0]
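    # Note: get_sim builds a fresh one-document SparseMatrixSimilarity index
    # from y on every call (simple but slow, and it ignores the prebuilt index
    # loaded above); sim[0] is the TF-IDF cosine similarity of x against y.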

    data[prefix + 'sim'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'sim_pa'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))

    # appended: TF-IDF cosine features against the description text
    data[prefix + 'sim_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'sim_pa_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # hand-rolled counter / TF-IDF vector similarities
    def get_simlilary(query, title):
        def get_weight_counter_and_tf_idf(x, y):
            x = x.split()
            y = y.split()
            corups = x + y
            obj = dict(collections.Counter(corups))
            x_weight = []
            y_weight = []
            idfs = []
            for key in obj.keys():
                idf = 1
                w = obj[key]
                if key in x:
                    idf += 1
                    x_weight.append(w)
                else:
                    x_weight.append(0)
                if key in y:
                    idf += 1
                    y_weight.append(w)
                else:
                    y_weight.append(0)
                idfs.append(math.log(3.0 / idf) + 1)
            return [np.array(x_weight), np.array(y_weight), np.array(x_weight) * np.array(idfs),
                    np.array(y_weight) * np.array(idfs), np.array(list(obj.keys()))]

        weight = list(map(lambda x, y: get_weight_counter_and_tf_idf(x, y),
                          tqdm(query), title))
        x_weight_couner = []
        y_weight_couner = []
        x_weight_tfidf = []
        y_weight_tfidf = []
        words = []
        for i in weight:
            x_weight_couner.append(i[0])
            y_weight_couner.append(i[1])
            x_weight_tfidf.append(i[2])
            y_weight_tfidf.append(i[3])
            words.append(i[4])

        # Manhattan distance
        def mhd_simlilary(x, y):
            return np.linalg.norm(x - y, ord=1)

        mhd_simlilary_counter = list(map(lambda x, y: mhd_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        mhd_simlilary_tfidf = list(map(lambda x, y: mhd_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        # cosine similarity
        def cos_simlilary(x, y):
            return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

        cos_simlilary_counter = list(map(lambda x, y: cos_simlilary(x, y),
                                         x_weight_couner, y_weight_couner))
        cos_simlilary_tfidf = list(map(lambda x, y: cos_simlilary(x, y),
                                       x_weight_tfidf, y_weight_tfidf))

        # Euclidean distance (sum the squared differences, then take the root)
        def Euclidean_simlilary(x, y):
            return np.sqrt(np.sum((x - y) ** 2))

        Euclidean_simlilary_counter = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                               x_weight_couner, y_weight_couner))
        Euclidean__simlilary_tfidf = list(map(lambda x, y: Euclidean_simlilary(x, y),
                                              x_weight_tfidf, y_weight_tfidf))

        return mhd_simlilary_counter, mhd_simlilary_tfidf, cos_simlilary_counter, \
               cos_simlilary_tfidf, Euclidean_simlilary_counter, Euclidean__simlilary_tfidf
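    # Worked arithmetic (illustrative): with just the two documents, the
    # smoothed df per term is 2 (present in one) or 3 (present in both), so the
    # idf weight log(3/df) + 1 is ~1.405 or exactly 1.0 -- shared terms are
    # down-weighted. Note x_weight/y_weight store the combined count obj[key]
    # rather than per-document counts.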

    data[prefix + 'mhd_similiary'], data[prefix + 'tf_mhd_similiary'], \
    data[prefix + 'cos_similiary'], data[prefix + 'tf_cos_similiary'], \
    data[prefix + 'os_similiary'], data[prefix + 'tf_os_similiary'] = get_simlilary(data['key_text_pre'],data['title_pro'])


    data[prefix + 'mhd_similiary_pa'], data[prefix + 'tf_mhd_similiary_pa'], \
    data[prefix + 'cos_similiary_pa'], data[prefix + 'tf_cos_similiary_pa'], \
    data[prefix + 'os_similiary_pa'], data[prefix + 'tf_os_similiary_pa'] = get_simlilary(data['key_text_pre'],data['abstract_pre'])

    # similarity between mean-pooled word vectors

    def get_vec(x):
        vec = []
        for word in x.split():
            if word in vec_model:
                vec.append(vec_model[word])
        if len(vec) == 0:
            return np.nan
        else:
            return np.mean(np.array(vec), axis=0)
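    # Mean-pools the word vectors of a sentence; np.nan marks fully
    # out-of-vocabulary rows. The `word in vec_model` / `vec_model[word]`
    # access is the gensim < 4 API; with gensim >= 4 this would go through
    # vec_model.wv instead.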

    data['key_text_pre_vec'] = data['key_text_pre'].progress_apply(lambda x: get_vec(x))
    data['title_pro_vec'] = data['title_pro'].progress_apply(lambda x: get_vec(x))
    data['abstract_pre_vec'] = data['abstract_pre'].progress_apply(lambda x: get_vec(x))
    data['description_text_vec'] = data['description_text'].progress_apply(lambda x: get_vec(x))

    # cos
    data[prefix + 'cos_mean_word2vec'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['key_text_pre_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec'] = data[prefix + 'cos_mean_word2vec'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['key_text_pre_vec']), data['title_pro_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['title_pro_vec']))


    # cos
    data[prefix + 'cos_mean_word2vec_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa'] = data[prefix + 'cos_mean_word2vec_pa'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_pa'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))


    # appended: the same vector-similarity trio for the description text
    data[prefix + 'cos_mean_word2vec_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['description_text_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec_2'] = data[prefix + 'cos_mean_word2vec_2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['description_text_vec']), data['title_pro_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['description_text_vec']), data['title_pro_vec']))

    # cos
    data[prefix + 'cos_mean_word2vec_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa2'] = data[prefix + 'cos_mean_word2vec_pa2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)

    # Euclidean distance
    data[prefix + 'os_mean_word2vec_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['description_text_vec']), data['abstract_pre_vec']))

    # mhd
    data[prefix + 'mhd_mean_word2vec_pa2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else
    np.linalg.norm(x - y, ord=1), tqdm(data['description_text_vec']), data['abstract_pre_vec']))




    # n-gram (bigram) distance features
    data[prefix+'n_gram_sim'], data[prefix+'sim_numeber_rate'] = get_df_grams(data, 2, ['key_text_pre', 'title_pro'])
    data[prefix+'n_gram_sim_pa'], data[prefix+'sim_numeber_rate_pa'] = get_df_grams(data, 2, ['key_text_pre', 'abstract_pre'])

    # appended: bigram distance features against the description text
    data[prefix+'n_gram_sim_2'], data[prefix+'sim_numeber_rate_2'] = get_df_grams(data, 2, ['description_text', 'title_pro'])
    data[prefix+'n_gram_sim_pa_2'], data[prefix+'sim_numeber_rate_pa_2'] = get_df_grams(data, 2, ['description_text', 'abstract_pre'])

    
################################################# already done by Peng-ge (朋哥) ##################################
#     def apply_fun(df):
#         df.columns = ['d_id', 'key', 'doc']
#         df['d_id'] = df['d_id'].fillna('always_nan')
#         query_id_group = df.groupby(['d_id'])
#         bm_list = []
#         for name, group in tqdm(query_id_group):
#             corpus = group['doc'].values.tolist()
#             corpus = [sentence.strip().split() for sentence in corpus]
#             query = group['key'].values[0].strip().split()
#             bm25Model = BM25(corpus)
#             bmscore = bm25Model.get_scores(query)
#             bm_list.extend(bmscore)

#         return bm_list

#     data[prefix + 'bm25'] = apply_fun(data[['description_id', 'key_text_pre', 'title_pro']])
#     data[prefix + 'bm25_pa'] = apply_fun(data[['description_id', 'key_text_pre', 'abstract_pre']])

#     #append
#     data[prefix + 'bm25_2'] = apply_fun(data[['description_id', 'description_text', 'title_pro']])
#     data[prefix + 'bm25_pa_2'] = apply_fun(data[['description_id', 'description_text', 'abstract_pre']])


#     # get bm25
#     def get_bm25(p_id, query):
#         query = query.split(' ')
#         score = bm25Model.get_score(query, item_id_list.index(p_id))
#         return score

#     data[prefix + 'bm_25_all'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['key_text_pre']))
#     #append
#     data[prefix + 'bm_25_all_2'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['description_text']))
################################################# already done by Peng-ge (朋哥) ##################################
    data[prefix + 'Hamming_kt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_dt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    
    data[prefix + 'Hamming_ka'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_da'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
    
    data[prefix + 'Hamming_sim_kt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_dt'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    
    data[prefix + 'Hamming_sim_ka'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_da'] = list(map(lambda x, y: 
                                           textdistance.Hamming(qval=None).similarity(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
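    # With qval=None, textdistance's Hamming compares the raw sequences position
    # by position and counts the length difference as mismatches (truncate=False
    # pads the shorter input), e.g. distance('text', 'testit') == 3 and
    # normalized_distance == 3/6 = 0.5; similarity is max(len) - distance.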
   
    # word-level Levenshtein distance via dynamic programming
    def edit_distance(df, w1, w2):
        word1 = df[w1].split()
        word2 = df[w2].split()
        len1 = len(word1)
        len2 = len(word2)
        dp = np.zeros((len1 + 1, len2 + 1))
        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j

        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                delta = 0 if word1[i - 1] == word2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j - 1] + delta, min(dp[i - 1][j] + 1, dp[i][j - 1] + 1))
        return dp[len1][len2]
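    # Worked example (illustrative): edit_distance on 'a b c' vs 'a c' -> 1.0
    # (one word deletion); contrast with the character-level
    # Levenshtein.distance features above.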
    
    data[prefix + 'edit_distance_kt'] = data.apply(edit_distance, axis=1, 
                                                   args=('key_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_dt'] = data.apply(edit_distance, axis=1, 
                                                   args=('description_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_ka'] = data.apply(edit_distance, axis=1, 
                                                   args=('key_text_pre', 'abstract_pre'))
    data[prefix + 'edit_distance_da'] = data.apply(edit_distance, axis=1, 
                                                   args=('description_text_pre', 'abstract_pre'))
    
    def get_same_word_features(query, title):
        q_list = query.split()
        t_list = title.split()
        set_query = set(q_list)
        set_title = set(t_list)
        count_words = len(set_query.union(set_title))

        comwords = [word for word in t_list if word in q_list]
        comwords_set = set(comwords)
        unique_rate = len(comwords_set) / count_words

        same_word1 = [w for w in q_list if w in t_list]
        same_word2 = [w for w in t_list if w in q_list]
        same_len_rate = (len(same_word1) + len(same_word2)) / (len(q_list) + len(t_list))
        if len(comwords) > 0:
            com_index1 = len(comwords)
            same_word_q = com_index1 / len(q_list)
            same_word_t = com_index1 / len(t_list)

            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                com_index1 += sum(index_list)
            q_loc = com_index1 / (len(q_list) * len(comwords))
            com_index2 = len(comwords)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                com_index2 += sum(index_list)
            t_loc = com_index2 / (len(t_list) * len(comwords))

            same_w_set_q = len(comwords_set) / len(set_query)
            same_w_set_t = len(comwords_set) / len(set_title)
            word_set_rate = 2 * len(comwords_set) / (len(set_query) + len(set_title))

            com_set_query_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                if len(index_list) > 0:
                    com_set_query_index += index_list[0]
            loc_set_q = com_set_query_index / (len(q_list) * len(comwords_set))
            com_set_title_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                if len(index_list) > 0:
                    com_set_title_index += index_list[0]
            loc_set_t = com_set_title_index / (len(t_list) * len(comwords_set))
            set_rate = (len(comwords_set) / len(comwords))
        else:
            unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t, set_rate = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        return unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, word_set_rate, loc_set_q, loc_set_t, set_rate
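    # The 12 outputs cover: unique-overlap rate, shared-token length rate,
    # per-side shared-word ratios, average positions of shared words on each
    # side (q_loc/t_loc), set-based variants of those ratios and positions, and
    # the dedup rate len(set)/len(list) of the shared words; all are zero when
    # there is no overlap.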
    
    data[prefix+"unique_rate_kt"],data[prefix+"same_len_rate_kt"],data[prefix+"same_word_q_kt"],\
    data[prefix+"same_word_t_kt"],data[prefix+"q_loc_kt"],data[prefix+"t_loc_kt"],data[prefix+"same_w_set_q_kt"],data[prefix+"same_w_set_t_kt"],data[prefix+"word_set_rate_kt"],\
    data[prefix+"loc_set_q_kt"], data[prefix+"loc_set_t_kt"], data[prefix+"set_rate_kt"]= zip(
    *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"unique_rate_dt"],data[prefix+"same_len_rate_dt"],data[prefix+"same_word_q_dt"],\
    data[prefix+"same_word_t_dt"],data[prefix+"q_loc_dt"],data[prefix+"t_loc_dt"],data[prefix+"same_w_set_q_dt"],data[prefix+"same_w_set_t_dt"],data[prefix+"word_set_rate_dt"],\
    data[prefix+"loc_set_q_dt"], data[prefix+"loc_set_t_dt"], data[prefix+"set_rate_dt"]= zip(
    *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["title_pro"]), axis=1))

    data[prefix+"unique_rate_ka"],data[prefix+"same_len_rate_ka"],data[prefix+"same_word_q_ka"],\
    data[prefix+"same_word_t_ka"],data[prefix+"q_loc_ka"],data[prefix+"t_loc_ka"],data[prefix+"same_w_set_q_ka"],data[prefix+"same_w_set_t_ka"],data[prefix+"word_set_rate_ka"],\
    data[prefix+"loc_set_q_ka"], data[prefix+"loc_set_t_ka"], data[prefix+"set_rate_ka"]= zip(
    *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["abstract_pre"]), axis=1))
    
    data[prefix+"unique_rate_da"],data[prefix+"same_len_rate_da"],data[prefix+"same_word_q_da"],\
    data[prefix+"same_word_t_da"],data[prefix+"q_loc_da"],data[prefix+"t_loc_da"],data[prefix+"same_w_set_q_da"],data[prefix+"same_w_set_t_da"],data[prefix+"word_set_rate_da"],\
    data[prefix+"loc_set_q_da"], data[prefix+"loc_set_t_da"], data[prefix+"set_rate_da"]= zip(
    *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["abstract_pre"]), axis=1))

    
    
    # trigram distance features: symmetric-difference size and overlap rate
    def get_df_grams_3(train_sample, values, cols):
        def create_ngram_set(input_list, ngram_value=3):
            return set(zip(*[input_list[i:] for i in range(ngram_value)]))

        def get_n_gram(df, values=3):
            train_query = df.values
            train_query = [[word for word in str(sen).replace("'", '').split(' ')] for sen in train_query]
            train_query_n = []
            for input_list in train_query:
                train_query_n_gram = set()
                for value in range(3, values + 1):
                    train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value)
                train_query_n.append(train_query_n_gram)
            return train_query_n

        train_query = get_n_gram(train_sample[cols[0]], values)
        train_title = get_n_gram(train_sample[cols[1]], values)
        sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y),
                       train_query, train_title))
        sim_number_rate = list(map(lambda x, y: len(x & y) / len(x) if len(x) != 0 else 0,
                                   train_query, train_title))
        return sim, sim_number_rate

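    # Worked example (illustrative): create_ngram_set(['a','b','c','d'], 3)
    # -> {('a','b','c'), ('b','c','d')}; sim is the symmetric-difference size
    # |X| + |Y| - 2|X & Y| and sim_number_rate is |X & Y| / |X|.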
    data[prefix+'3_gram_sim'],data[prefix+'sim_numeber_rate_3']=get_df_grams_3(data,3,['key_text_pre','title_pro'])
    data[prefix+'3_gram_sim_pa'],data[prefix+'sim_numeber_rate_pa_3']=get_df_grams_3(data,3,['key_text_pre','abstract_pre'])

    # appended: trigram distance features against the description text
    data[prefix+'3_gram_sim_2'],data[prefix+'sim_numeber_rate_2_3']=get_df_grams_3(data,3,['description_text_pre','title_pro'])
    data[prefix+'3_gram_sim_pa_2'],data[prefix+'sim_numeber_rate_pa_2_3']=get_df_grams_3(data,3,['description_text_pre','abstract_pre'])
    
    
    # longest common contiguous word-sequence features via DP
    def get_son_str_feature(query, title):
        q_list = query.split()
        query_len = len(q_list)
        t_list = title.split()
        title_len = len(t_list)
        count1 = np.zeros((query_len + 1, title_len + 1))
        index = np.zeros((query_len + 1, title_len + 1))
        for i in range(1, query_len + 1):
            for j in range(1, title_len + 1):
                if q_list[i - 1] == t_list[j - 1]:
                    count1[i][j] = count1[i - 1][j - 1] + 1
                    index[i][j] = index[i - 1][j - 1] + j
                else:
                    count1[i][j] = 0
                    index[i][j] = 0
        max_count1 = count1.max()

        if max_count1 != 0:
            row = int(np.where(count1 == np.max(count1))[0][0])
            col = int(np.where(count1 == np.max(count1))[1][0])
            mean_pos = index[row][col] / (max_count1 * title_len)
            begin_loc = (col - max_count1 + 1) / title_len
            rows = np.where(count1 != 0.0)[0]
            cols = np.where(count1 != 0.0)[1]
            total_loc = 0
            for i in range(0, len(rows)):
                total_loc += index[rows[i]][cols[i]]
            density = total_loc / (query_len * title_len)
            rate_q_len = max_count1 / query_len
            rate_t_len = max_count1 / title_len
        else:
            begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len = 0, 0, 0, 0, 0, 0
        return max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len    
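    # DP over word sequences: count1[i][j] is the length of the common run
    # ending at q_list[i-1]/t_list[j-1], so max_count1 is the longest common
    # contiguous word sequence; the remaining outputs locate that run within
    # the title (begin/mean position) and normalize it by the two lengths.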

    data[prefix+"long_same_max_count1_kt"], data[prefix+"long_same_local_begin_kt"], data[prefix+"long_same_local_mean_kt"],data[prefix+"long_same_total_loc_kt"],\
    data[prefix+"long_same_density_kt"], data[prefix+"long_same_rate_q_len_kt"], data[prefix+"long_same_rate_t_len_kt"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"long_same_max_count1_dt"], data[prefix+"long_same_local_begin_dt"], data[prefix+"long_same_local_mean_dt"],data[prefix+"long_same_total_loc_dt"],\
    data[prefix+"long_same_density_dt"], data[prefix+"long_same_rate_q_len_dt"], data[prefix+"long_same_rate_t_len_dt"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["title_pro"]), axis=1))
    
    data[prefix+"long_same_max_count1_da"], data[prefix+"long_same_local_begin_da"], data[prefix+"long_same_local_mean_da"],data[prefix+"long_same_total_loc_da"],\
    data[prefix+"long_same_density_da"], data[prefix+"long_same_rate_q_len_da"], data[prefix+"long_same_rate_t_len_da"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["abstract_pre"]), axis=1))
    
    data[prefix+"long_same_max_count1_ka"], data[prefix+"long_same_local_begin_ka"], data[prefix+"long_same_local_mean_ka"],data[prefix+"long_same_total_loc_ka"],\
    data[prefix+"long_same_density_ka"], data[prefix+"long_same_rate_q_len_ka"], data[prefix+"long_same_rate_t_len_ka"]= zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["abstract_pre"]), axis=1))
    
    def q_t_common_words(query, title):
        query = set(query.split(' '))
        title = set(title.split(' '))
        return len(query & title)
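    # e.g. q_t_common_words('a b c', 'b c d') -> 2 (set intersection size)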
    
    data[prefix+'common_words_kt'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.title_pro), axis=1)
    data[prefix+'common_words_dt'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.title_pro), axis=1)
    data[prefix+'common_words_ka'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.abstract_pre), axis=1)
    data[prefix+'common_words_da'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.abstract_pre), axis=1)

    
    data['key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data['description_text_pre_len'] = data['description_text_pre'].apply(lambda x: len(x.split(' ')))
    data['title_pro_len'] = data['title_pro'].apply(lambda x: len(x.split(' ')))
    data['abstract_pre_len'] = data['abstract_pre'].apply(lambda x: len(x.split(' ')))
    
    
    data[prefix+'common_words_kt_rate_k'] = data[prefix+'common_words_kt'] / data['key_text_len']
    data[prefix+'common_words_kt_rate_t'] = data[prefix+'common_words_kt'] / data['title_pro_len']

    data[prefix+'common_words_dt_rate_d'] = data[prefix+'common_words_dt'] / data['description_text_pre_len']
    data[prefix+'common_words_dt_rate_t'] = data[prefix+'common_words_dt'] / data['title_pro_len']

    data[prefix+'common_words_ka_rate_k'] = data[prefix+'common_words_ka'] / data['key_text_len']
    data[prefix+'common_words_ka_rate_a'] = data[prefix+'common_words_ka'] / data['abstract_pre_len']

    data[prefix+'common_words_da_rate_d'] = data[prefix+'common_words_da'] / data['description_text_pre_len']
    data[prefix+'common_words_da_rate_a'] = data[prefix+'common_words_da'] / data['abstract_pre_len']

    
    
    
    
    # keep only the id columns and the generated num_ feature columns
    feat = ['description_id', 'paper_id']
    for col in data.columns:
        if re.match('num_', col) is not None:
            feat.append(col)

    data = data[feat]

    return data