def __init__(self, spacy_model_name):
    self.parser = spacy.load(spacy_model_name)
    self.estimator = textdistance.Hamming()
    self.gen_func = lambda x: get_tense_distractors(x, self.estimator, self.parser)
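# Hedged usage sketch: the enclosing class and get_tense_distractors are
# defined elsewhere in this project; the class name below is illustrative,
# not from the original.
#
#     gen = DistractorGenerator('en_core_web_sm')  # any installed spaCy model
#     distractors = gen.gen_func('She walks to school.')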
textdistance.hamming.normalized_distance('test', 'text')    # 0.25
textdistance.hamming.normalized_similarity('test', 'text')  # 0.75
textdistance.Hamming(qval=2).distance('test', 'text')

hamming = textdistance.Hamming(external=False)
hamming('text', 'testit')

# Levenshtein
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227
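# What qval changes, as a quick check: with qval=2 the strings are compared as
# aligned bigram sequences ('test' -> ['te', 'es', 'st'], 'text' -> ['te',
# 'ex', 'xt']), so two of the three positions differ.
import textdistance
print(textdistance.Hamming(qval=2).distance('test', 'text'))  # expected: 2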
def process_sudoku_reads(output_file, fastqr1, fastqr2, fastqi1, fastqi2,
                         bc_seed, bc_length, index_table_file, is_zipped=True,
                         threshold_count=5, merge_dist=1):
    assert len(fastqr1) == len(fastqr2)
    assert len(fastqr1) == len(fastqi1)
    assert len(fastqr1) == len(fastqi2)

    print("Loading index table")
    idx_table = pd.read_csv(index_table_file, index_col=None, sep="\t")
    allowed_i5 = idx_table[I5_COL].tolist()
    allowed_i7 = idx_table[I7_COL].tolist()
    print("Loaded {n} pools with {i5} i5 indices and {i7} i7 indices".format(
        n=idx_table.shape[0], i5=len(allowed_i5), i7=len(allowed_i7)))

    linker = LinkSudokuBC(bc_seed, bc_length, is_zipped=is_zipped)
    bcs = linker.parse_fastq_mp(fastqi1, fastqi2, fastqr1)
    print("Barcode extraction complete")

    # Only keep indices that are in the table
    for i5 in list(bcs.keys()):
        if i5 not in allowed_i5:
            del bcs[i5]
        else:
            for i7 in list(bcs[i5].keys()):
                if i7 not in allowed_i7:
                    del bcs[i5][i7]
    print("Index restriction to whitelist complete")

    hamming = textdistance.Hamming()
    outputs = []
    for i5 in list(bcs.keys()):
        i5_ref = bcs[i5]
        for i7 in list(i5_ref.keys()):
            i57_ref = bcs[i5][i7]
            try:
                group = idx_table.loc[(idx_table[I5_COL] == i5) &
                                      (idx_table[I7_COL] == i7), GROUP_COL].tolist()[0]
                pool = idx_table.loc[(idx_table[I5_COL] == i5) &
                                     (idx_table[I7_COL] == i7), LOC_COL].tolist()[0]
            except IndexError:
                continue
            print("Processing {g} {p}".format(g=group, p=pool))

            # Keep only BCs over a count threshold
            valid_bcs = []
            for bc in list(i57_ref.keys()):
                if i57_ref[bc] > threshold_count:
                    valid_bcs.append(bc)

            # Merge all BCs that are less than merge_dist hamming distance
            bc_merge_toward = set()
            bc_merge_away = set()
            merged_bcs = set()
            for i in range(len(valid_bcs)):
                bc = valid_bcs[i]
                end_check = False
                for opp in valid_bcs[i + 1:]:
                    if hamming.distance(bc, opp) < merge_dist:
                        end_check = True
                        if i57_ref[bc] > i57_ref[opp]:
                            bc_merge_toward.add(bc)
                            bc_merge_away.add(opp)
                        else:
                            bc_merge_toward.add(opp)
                            bc_merge_away.add(bc)
                if not end_check:
                    merged_bcs.add(bc)
            merged_bcs = merged_bcs.union(bc_merge_toward.difference(bc_merge_away))
            merged_bcs = list(merged_bcs)
            merged_counts = [i57_ref[bc] for bc in merged_bcs]

            output_df = pd.DataFrame(list(zip(merged_bcs, merged_counts)),
                                     columns=[BC_COL, COUNT_COL])
            output_df[GROUP_COL] = group
            output_df[LOC_COL] = pool
            outputs.append(output_df)

    outputs = pd.concat(outputs)
    outputs = outputs.reindex(labels=[GROUP_COL, LOC_COL, BC_COL, COUNT_COL], axis=1)
    outputs.sort_values(by=[GROUP_COL, LOC_COL], inplace=True)
    outputs.reset_index(inplace=True, drop=True)
    outputs.to_csv("raw_" + output_file, sep="\t")

    pivoted = map_barcodes_to_wells(outputs)
    pivoted.to_csv(output_file, sep="\t")
    return outputs, pivoted
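# A toy check of the merge rule above (illustrative only): with the default
# merge_dist=1, `hamming.distance(bc, opp) < merge_dist` is true only at
# distance 0, so 'ACGT' and 'ACGA' (Hamming distance 1) are kept as separate
# barcodes; merge_dist=2 would collapse them toward the higher-count one.
import textdistance
print(textdistance.Hamming().distance('ACGT', 'ACGA'))      # 1
print(textdistance.Hamming().distance('ACGT', 'ACGA') < 1)  # False -> not merged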
def hamming(string1, string2):
    # Normalized Hamming distance in [0, 1].
    return textdistance.hamming.normalized_distance(string1, string2)
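# Quick sanity check: 'test' vs 'text' differ at one of four positions.
print(hamming('test', 'text'))  # 0.25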
def __init__(self, pa_preprocessor, name, qval=1):
    super().__init__(pa_preprocessor)
    self.time_log = []
    self.qval = qval
    self.textdistance_name = name
    # Map a measure name to its textdistance class, grouped as in the
    # textdistance docs.
    measures = {
        # Edit-based.
        'Hamming': textdistance.Hamming,
        'DamerauLevenshtein': textdistance.DamerauLevenshtein,
        'Levenshtein': textdistance.Levenshtein,
        'Mlipns': textdistance.MLIPNS,
        'Jaro': textdistance.Jaro,
        'JaroWinkler': textdistance.JaroWinkler,
        'StrCmp95': textdistance.StrCmp95,
        'NeedlemanWunsch': textdistance.NeedlemanWunsch,
        'Gotoh': textdistance.Gotoh,
        'SmithWaterman': textdistance.SmithWaterman,
        # Token-based.
        'Jaccard': textdistance.Jaccard,
        'Sorensen': textdistance.Sorensen,
        'Tversky': textdistance.Tversky,
        'Overlap': textdistance.Overlap,
        'Tanimoto': textdistance.Tanimoto,
        'Cosine': textdistance.Cosine,
        'MongeElkan': textdistance.MongeElkan,
        'Bag': textdistance.Bag,
        # Sequence-based.
        'LCSSeq': textdistance.LCSSeq,
        'LCSStr': textdistance.LCSStr,
        'RatcliffObershelp': textdistance.RatcliffObershelp,
        # Compression-based.
        'ArithNCD': textdistance.ArithNCD,
        'RLENCD': textdistance.RLENCD,
        'BWTRLENCD': textdistance.BWTRLENCD,
        'SqrtNCD': textdistance.SqrtNCD,
        'EntropyNCD': textdistance.EntropyNCD,
        # Simple.
        'Prefix': textdistance.Prefix,
        'Postfix': textdistance.Postfix,
        'Length': textdistance.Length,
        'Identity': textdistance.Identity,
        'Matrix': textdistance.Matrix,
    }
    # These measures were constructed without a qval argument.
    no_qval = {'StrCmp95', 'Tversky', 'BWTRLENCD', 'Matrix'}
    if name in no_qval:
        self.similar_measure = measures[name]()
    else:
        self.similar_measure = measures[name](qval=qval)
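# Hedged usage sketch: pa_preprocessor and the superclass come from the
# surrounding project, so only the effect of the name lookup is shown here.
# name='Levenshtein' with qval=1 selects textdistance.Levenshtein(qval=1):
#
#     measure = textdistance.Levenshtein(qval=1)
#     measure.distance('test', 'text')  # -> 1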
def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    def show(name, instance=None):
        # Print the five standard textdistance interfaces for one algorithm,
        # then a distance from an explicitly parameterized instance.
        algo = getattr(textdistance, name)
        print("textdistance.{}({}, {}) = {}.".format(name, str1, str2, algo(str1, str2)))
        print("textdistance.{}.distance({}, {}) = {}.".format(
            name, str1, str2, algo.distance(str1, str2)))
        print("textdistance.{}.similarity({}, {}) = {}.".format(
            name, str1, str2, algo.similarity(str1, str2)))
        print("textdistance.{}.normalized_distance({}, {}) = {}.".format(
            name, str1, str2, algo.normalized_distance(str1, str2)))
        print("textdistance.{}.normalized_similarity({}, {}) = {}.".format(
            name, str1, str2, algo.normalized_similarity(str1, str2)))
        if instance is not None:
            print("textdistance.{}(...).distance({}, {}) = {}.".format(
                type(instance).__name__, str1, str2, instance.distance(str1, str2)))

    #--------------------
    # Edit-based.
    if True:
        show('hamming', textdistance.Hamming(qval=qval, test_func=None, truncate=False, external=True))
        show('mlipns', textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval=qval, external=True))
        show('levenshtein', textdistance.Levenshtein(qval=qval, test_func=None, external=True))
        show('damerau_levenshtein', textdistance.DamerauLevenshtein(qval=qval, test_func=None, external=True))
        show('jaro', textdistance.Jaro(long_tolerance=False, qval=qval, external=True))
        show('jaro_winkler', textdistance.JaroWinkler(long_tolerance=False, winklerize=True,
                                                      qval=qval, external=True))
        show('strcmp95', textdistance.StrCmp95(long_strings=False, external=True))
        show('needleman_wunsch', textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None,
                                                              qval=qval, external=True))
        show('gotoh', textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None,
                                         qval=qval, external=True))
        show('smith_waterman', textdistance.SmithWaterman(gap_cost=1.0, sim_func=None,
                                                          qval=qval, external=True))

    #--------------------
    # Token-based.
    if False:
        show('jaccard', textdistance.Jaccard(qval=qval, as_set=False, external=True))
        show('sorensen', textdistance.Sorensen(qval=qval, as_set=False, external=True))
        # show('sorensen_dice', textdistance.SorensenDice())
        show('sorensen_dice')
        show('tversky', textdistance.Tversky(qval=qval, ks=None, bias=None,
                                             as_set=False, external=True))
        show('overlap', textdistance.Overlap(qval=qval, as_set=False, external=True))
        # This is identical to the Jaccard similarity coefficient and the
        # Tversky index for alpha=1 and beta=1.
        show('tanimoto', textdistance.Tanimoto(qval=qval, as_set=False, external=True))
        show('cosine', textdistance.Cosine(qval=qval, as_set=False, external=True))
        show('monge_elkan', textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(),
                                                    symmetric=False, qval=qval, external=True))
        show('bag', textdistance.Bag(qval=qval))

    #--------------------
    # Sequence-based.
    if False:
        # show('lcsseq', textdistance.LCSSeq(qval=qval, test_func=None, external=True))
        show('lcsseq', textdistance.LCSSeq())
        show('lcsstr', textdistance.LCSStr(qval=qval))
        show('ratcliff_obershelp', textdistance.RatcliffObershelp())

    #--------------------
    # Compression-based.
    if False:
        # show('arith_ncd', textdistance.ArithNCD(base=2, terminator=None, qval=qval))
        show('arith_ncd', textdistance.ArithNCD())
        show('rle_ncd', textdistance.RLENCD())
        show('bwtrle_ncd', textdistance.BWTRLENCD(terminator='\0'))
        show('sqrt_ncd', textdistance.SqrtNCD(qval=qval))
        show('entropy_ncd', textdistance.EntropyNCD(qval=qval, coef=1, base=2))
        show('bz2_ncd', textdistance.BZ2NCD())
        show('lzma_ncd', textdistance.LZMANCD())
        show('zlib_ncd', textdistance.ZLIBNCD())

    #--------------------
    # Phonetic.
    if False:
        show('mra', textdistance.MRA())
        show('editex', textdistance.Editex(local=False, match_cost=0, group_cost=1,
                                           mismatch_cost=2, groups=None, ungrouped=None,
                                           external=True))

    #--------------------
    # Simple.
    if False:
        show('prefix', textdistance.Prefix(qval=qval, sim_test=None))
        # show('postfix', textdistance.Postfix(qval=qval, sim_test=None))
        show('postfix', textdistance.Postfix())
        show('length', textdistance.Length())
        show('identity', textdistance.Identity())
        show('matrix', textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1,
                                           symmetric=True, external=True))
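# Minimal entry point (not part of the original excerpt) so the demo above
# can be run as a script:
if __name__ == '__main__':
    simple_example()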
def get_feature(self):
    query_len = self.querys_seg.apply(self.get_len)
    doc_len = self.doc_text_seg.apply(self.get_len)
    jaccard_sim = list(map(self.get_jaccard_sim, self.querys_seg, self.doc_text_seg))
    # Edit-distance family (python-Levenshtein).
    edit_distance = list(map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1),
                             self.querys, self.doc_text))
    edit_jaro = list(map(lambda x, y: Levenshtein.jaro(x, y), self.querys, self.doc_text))
    edit_ratio = list(map(lambda x, y: Levenshtein.ratio(x, y), self.querys, self.doc_text))
    edit_jaro_winkler = list(map(lambda x, y: Levenshtein.jaro_winkler(x, y),
                                 self.querys, self.doc_text))
    hamming = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y),
                       self.querys, self.doc_text))
    mht_sim, tf_mht_sim, cos_sim, tf_cos_sim, euc_sim, tf_euc_sim = \
        self.get_tfidf_sim(self.querys_seg, self.doc_text_seg)
    gram_2_sim, gram_2_sim_ratio = self.get_n_grams(self.querys_seg, self.doc_text_seg, 2)
    gram_3_sim, gram_3_sim_ratio = self.get_n_grams(self.querys_seg, self.doc_text_seg, 3)
    bm25_group = self.get_bm25_group(self.data[['query_id', 'query_text', 'doc_text']])
    bm25_overall = list(map(self.get_bm25_overall, self.doc_ids, self.querys_seg))
    mat_cos_sim = list(map(lambda x, y: self.get_mat_cos_sim(x, y),
                           self.querys_seg, self.doc_text_seg))
    query_vec = self.querys_seg.apply(lambda x: self.get_word_vec(x))
    doc_vec = self.doc_text_seg.apply(lambda x: self.get_word_vec(x))
    cos_mean_word2vec = list(map(self.get_cos_sim, query_vec, doc_vec))
    euc_mean_word2vec = list(map(self.get_euclidean_sim, query_vec, doc_vec))
    mhd_mean_word2vec = list(map(self.get_manhattan_distance, query_vec, doc_vec))

    # 'query_vec': query_vec, 'doc_vec': doc_vec,
    result = {
        'query_id': self.query_ids,
        'query_text': self.querys,
        'doc_id': self.doc_ids,
        'doc_text': self.doc_text,
        'relevence': self.relevences,
        'query_len': query_len,
        'doc_len': doc_len,
        'jaccard_sim': jaccard_sim,
        'edit_distance': edit_distance,
        'edit_jaro': edit_jaro,
        'edit_ratio': edit_ratio,
        'edit_jaro_winkler': edit_jaro_winkler,
        'hamming': hamming,
        'mht_sim': mht_sim,
        'tf_mht_sim': tf_mht_sim,
        'cos_sim': cos_sim,
        'tf_cos_sim': tf_cos_sim,
        'euc_sim': euc_sim,
        'tf_euc_sim': tf_euc_sim,
        'gram_2_sim': gram_2_sim,
        'gram_2_sim_ratio': gram_2_sim_ratio,
        'gram_3_sim': gram_3_sim,
        'gram_3_sim_ratio': gram_3_sim_ratio,
        'bm25_group': bm25_group,
        'bm25_overall': bm25_overall,
        'mat_cos_sim': mat_cos_sim,
        'cos_mean_word2vec': cos_mean_word2vec,
        'euc_mean_word2vec': euc_mean_word2vec,
        'mhd_mean_word2vec': mhd_mean_word2vec,
    }
    return pd.DataFrame(result)
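# For intuition about the 'hamming' feature above: qval=None makes textdistance
# compare the raw character sequences, so 'test' vs 'text' differs at one of
# four positions.
import textdistance
print(textdistance.Hamming(qval=None).normalized_distance('test', 'text'))  # 0.25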
def metrics(x):
    a = x[4].strip()
    b = x[5].strip()
    al = a.lower()
    bl = b.lower()
    a_len = float(len(a))

    def tryit(f):
        # Some measures raise on empty or degenerate inputs; score those 0.0.
        try:
            return f()
        except Exception:
            return 0.0

    # Signed balance of a bracket pair within s: +1 per opener, -1 per closer.
    tempo = lambda open_ch, close_ch, s: \
        sum([1 if c == open_ch else (-1 if c == close_ch else 0) for c in s])

    M = [
        x[3],
        tryit(lambda: td.bz2_ncd(a, b)),
        tryit(lambda: td.zlib_ncd(a, b)),
        tryit(lambda: td.prefix.normalized_similarity(a, b)),
        tryit(lambda: td.postfix.normalized_similarity(a, b)),
        tryit(lambda: td.matrix.normalized_similarity(a, b)),
        tryit(lambda: td.length.normalized_similarity(a, b)),
        tryit(lambda: td.Hamming().normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Hamming(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein().normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.DamerauLevenshtein(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard().normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard().normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=2).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=3).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=4).normalized_similarity(al, bl)),
        tryit(lambda: td.Jaccard(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.Jaccard(qval=5).normalized_similarity(al, bl)),
        tryit(lambda: td.Tversky().normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.Tversky(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler().normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=2).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=3).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=4).normalized_similarity(a, b)),
        tryit(lambda: td.JaroWinkler(qval=5).normalized_similarity(a, b)),
        tryit(lambda: td.StrCmp95().normalized_similarity(a, b)),
        tryit(lambda: td.StrCmp95().normalized_similarity(al, bl)),
        1.0 - (float(abs(tempo('(', ')', a) - tempo('(', ')', b))) / a_len),
        1.0 - (float(abs(tempo('[', ']', a) - tempo('[', ']', b))) / a_len),
        1.0 - (float(abs(tempo('{', '}', a) - tempo('{', '}', b))) / a_len),
        1.0 - (float(abs(tempo('<', '>', a) - tempo('<', '>', b))) / a_len),
    ]
    # SVMlight-style output line: "<label> qid:<id> 1:<v1> 2:<v2> ... # <comment>".
    return '{} qid:{} {} # {}'.format(
        x[0], x[1],
        ' '.join(['{}:{:.4f}'.format(k + 1, float(y)) for k, y in enumerate(M)]),
        x[2])
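# Worked check of the bracket-balance features above: tempo counts +1 per
# opening and -1 per closing character, so 'f(g(x))' balances to 0 and
# 'f(g(x)' to 1, giving 1.0 - |0 - 1| / 7 ~= 0.857 for the '()' feature.
tempo = lambda open_ch, close_ch, s: sum(1 if c == open_ch else (-1 if c == close_ch else 0) for c in s)
a, b = 'f(g(x))', 'f(g(x)'
print(1.0 - abs(tempo('(', ')', a) - tempo('(', ')', b)) / float(len(a)))  # ~0.857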
def make_feature(data_or,vec_model): print('get features:') from gensim.models import Word2Vec vec_model = Word2Vec.load('pretrain_model/w2v_300.model') dictionary = corpora.Dictionary.load('temp_data/train_dictionary.dict') tfidf = models.TfidfModel.load("temp_data/train_tfidf.model") index = similarities.SparseMatrixSimilarity.load('temp_data/train_index.index') item_id_list = joblib.load('temp_data/paper_id.pkl') with open('temp_data/train_content.pkl','rb') as fr: corpus = pickle.load(fr) data = data_or.copy() data['abstract_pre'] = data['abstract_pre'].apply( lambda x: np.nan if str(x) == 'nan' or len(x) < 9 else x) data['abstract_pre'] = data['abstract_pre'].apply( lambda x: 'none' if str(x) == 'nan' or str(x).split(' ') == ['n', 'o', 'n', 'e'] else x) data['key_text_pre'] = data['key_text_pre'].fillna('none') data['description_text'] = data['description_text'].fillna('none') data['title_pro'] = data['title_pro'].fillna('none') data['description_text_pre'] = data['description_text_pre'].fillna('none') prefix = 'num_' # 长度 data[prefix + 'key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' '))) # 长度append data[prefix + 'description_text_len'] = data['description_text'].apply(lambda x: len(x.split(' '))) data.loc[data[prefix + 'key_text_len'] < 7, 'key_text_pre'] = data[data[prefix + 'key_text_len'] < 7][ 'description_text'].apply( lambda x: ' '.join(pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x)))).values # abstract是否为空 data[prefix + 'cate_pa_isnull'] = data['abstract_pre'].apply(lambda x: 1 if str(x) == 'none' else 0) # key_words是否为空 data[prefix + 'cate_pkeywords_isnull'] = data['keywords'].apply(lambda x: 1 if str(x) == 'nan' else 0) #描述在key_word中出现的次数 def get_num_key(x,y): if str(y)=='nan': return -1 y=y.strip(';').split(';') num=0 for i in y: if i in x: num+=1 return num data[prefix+'key_in_key_word_number']=list(map(lambda x,y: get_num_key(x,y),data['key_text_pre'],data['keywords'])) #描述在key_word中出现的次数/key_words的个数 data[prefix+'key_in_key_word_number_rate']=list(map(lambda x,y: 0 if x==-1 else x/len(y.strip(';').split(';')),data[prefix+'key_in_key_word_number'], data['keywords'])) #append data[prefix+'key_in_key_word_number2']=list(map(lambda x,y: get_num_key(x,y),data['description_text'],data['keywords'])) #描述在key_word中出现的次数/key_words的个数 data[prefix+'key_in_key_word_number2_rate']=list(map(lambda x,y: 0 if x==-1 else x/len(y.strip(';').split(';')),data[prefix+'key_in_key_word_number2'], data['keywords'])) # 描述在title出现单词的统计 def get_num_common_words_and_ratio(merge, col): # merge data merge = merge[col] merge.columns = ['q1', 'q2'] merge['q2'] = merge['q2'].apply(lambda x: 'none' if str(x) == 'nan' else x) q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values q1_word_len_set = merge.q1.apply(lambda x: len(set(x.split(' ')))).values q2_word_len_set = merge.q2.apply(lambda x: len(set(x.split(' ')))).values result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))] result_ratio_q = [result[i] / q1_word_len[i] for i in range(len(q1_word_set))] result_ratio_t = [result[i] / q2_word_len[i] for i in range(len(q1_word_set))] result_ratio_q_set = [result[i] / q1_word_len_set[i] for i in range(len(q1_word_set))] result_ratio_t_set = [result[i] / q2_word_len_set[i] for i in range(len(q1_word_set))] return result, result_ratio_q, 
result_ratio_t, q1_word_len, q2_word_len, q1_word_len_set, q2_word_len_set, result_ratio_q_set, result_ratio_t_set data[prefix + 'common_words_k_pt'], \ data[prefix + 'common_words_k_pt_k'], \ data[prefix + 'common_words_k_pt_pt'], \ data[prefix + 'k_len'], \ data[prefix + 'pt_len'], \ data[prefix + 'k_len_set'], \ data[prefix + 'pt_len_set'], \ data[prefix + 'common_words_k_pt_k_set'], \ data[prefix + 'common_words_k_pt_pt_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'title_pro']) data[prefix + 'common_words_k_at'], \ data[prefix + 'common_words_k_at_k'], \ data[prefix + 'common_words_k_at_at'], \ data[prefix + 'k_len'], \ data[prefix + 'at_len'], \ data[prefix + 'k_len_set'], \ data[prefix + 'at_len_set'], \ data[prefix + 'common_words_k_at_k_set'], \ data[prefix + 'common_words_k_at_at_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'abstract_pre']) #append data[prefix + 'common_words_k_pt_2'], \ data[prefix + 'common_words_k_pt_k_2'], \ data[prefix + 'common_words_k_pt_pt_2'], \ data[prefix + 'k_len_2'], \ data[prefix + 'pt_len'], \ data[prefix + 'k_len_set_2'], \ data[prefix + 'pt_len_set'], \ data[prefix + 'common_words_k_pt_k_set_2'], \ data[prefix + 'common_words_k_pt_pt_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'title_pro']) data[prefix + 'common_words_k_at_2'], \ data[prefix + 'common_words_k_at_k_2'], \ data[prefix + 'common_words_k_at_at_2'], \ data[prefix + 'k_len_2'], \ data[prefix + 'at_len'], \ data[prefix + 'k_len_set_2'], \ data[prefix + 'at_len_set'], \ data[prefix + 'common_words_k_at_k_set_2'], \ data[prefix + 'common_words_k_at_at_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'abstract_pre']) # Jaccard 相似度 def jaccard(x, y): if str(y) == 'nan': y = 'none' x = set(x) y = set(y) return float(len(x & y) / len(x | y)) data[prefix + 'jaccard_sim_k_pt'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['title_pro'])) data[prefix + 'jaccard_sim_k_pa'] = list( map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['abstract_pre'])) #append data[prefix + 'jaccard_sim_k_pt2'] = list(map(lambda x, y: jaccard(x, y), data['description_text'], data['title_pro'])) data[prefix + 'jaccard_sim_k_pa2'] = list( map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['description_text'])) # 编辑距离 print('get edict distance:') data[prefix + 'edict_distance_k_pt'] = list( map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']), data['title_pro'])) data[prefix + 'edict_jaro'] = list( map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['title_pro'])) data[prefix + 'edict_ratio'] = list( map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['title_pro'])) data[prefix + 'edict_jaro_winkler'] = list( map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['title_pro'])) data[prefix + 'edict_distance_k_pa'] = list( map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['key_text_pre']), data['abstract_pre'])) data[prefix + 'edict_jaro_pa'] = list( map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['abstract_pre'])) data[prefix + 'edict_ratio_pa'] = list( map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['abstract_pre'])) data[prefix + 'edict_jaro_winkler_pa'] = list( map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['abstract_pre'])) #append print('get edict distance:') data[prefix + 'edict_distance_k_pt_2'] = 
list( map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']), data['title_pro'])) data[prefix + 'edict_jaro_2'] = list( map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['title_pro'])) data[prefix + 'edict_ratio_2'] = list( map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['title_pro'])) data[prefix + 'edict_jaro_winkler_2'] = list( map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['title_pro'])) data[prefix + 'edict_distance_k_pa_2'] = list( map(lambda x, y: Levenshtein.distance(x, y) / (len(x)+1), tqdm(data['description_text']), data['abstract_pre'])) data[prefix + 'edict_jaro_pa_2'] = list( map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['abstract_pre'])) data[prefix + 'edict_ratio_pa_2'] = list( map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['abstract_pre'])) data[prefix + 'edict_jaro_winkler_pa_2'] = list( map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['abstract_pre'])) #余弦相似度 def get_sim(doc, corpus): corpus = corpus.split(' ') corpus_vec = [dictionary.doc2bow(corpus)] corpus_tfidf = tfidf[corpus_vec] featurenum = len(dictionary.token2id.keys()) index_i = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum) doc = doc.split(' ') vec = dictionary.doc2bow(doc) vec_tfidf = tfidf[vec] sim = index_i.get_similarities(vec_tfidf) return sim[0] data[prefix + 'sim'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['title_pro'])) data[prefix + 'sim_pa'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['abstract_pre'])) #append data[prefix + 'sim_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['title_pro'])) data[prefix + 'sim_pa_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['abstract_pre'])) # tfidf def get_simlilary(query, title): def get_weight_counter_and_tf_idf(x, y): x = x.split() y = y.split() corups = x + y obj = dict(collections.Counter(corups)) x_weight = [] y_weight = [] idfs = [] for key in obj.keys(): idf = 1 w = obj[key] if key in x: idf += 1 x_weight.append(w) else: x_weight.append(0) if key in y: idf += 1 y_weight.append(w) else: y_weight.append(0) idfs.append(math.log(3.0 / idf) + 1) return [np.array(x_weight), np.array(y_weight), np.array(x_weight) * np.array(idfs), np.array(y_weight) * np.array(idfs), np.array(list(obj.keys()))] weight = list(map(lambda x, y: get_weight_counter_and_tf_idf(x, y), tqdm(query), title)) x_weight_couner = [] y_weight_couner = [] x_weight_tfidf = [] y_weight_tfidf = [] words = [] for i in weight: x_weight_couner.append(i[0]) y_weight_couner.append(i[1]) x_weight_tfidf.append(i[2]) y_weight_tfidf.append(i[3]) words.append(i[4]) # 曼哈顿距离 def mhd_simlilary(x, y): return np.linalg.norm(x - y, ord=1) mhd_simlilary_counter = list(map(lambda x, y: mhd_simlilary(x, y), x_weight_couner, y_weight_couner)) mhd_simlilary_tfidf = list(map(lambda x, y: mhd_simlilary(x, y), x_weight_tfidf, y_weight_tfidf)) # 余弦相似度 def cos_simlilary(x, y): return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) cos_simlilary_counter = list(map(lambda x, y: cos_simlilary(x, y), x_weight_couner, y_weight_couner)) cos_simlilary_tfidf = list(map(lambda x, y: cos_simlilary(x, y), x_weight_tfidf, y_weight_tfidf)) # 欧式距离 def Euclidean_simlilary(x, y): return np.sqrt(np.sum(x - y) ** 2) Euclidean_simlilary_counter = 
list(map(lambda x, y: Euclidean_simlilary(x, y), x_weight_couner, y_weight_couner)) Euclidean__simlilary_tfidf = list(map(lambda x, y: Euclidean_simlilary(x, y), x_weight_tfidf, y_weight_tfidf)) return mhd_simlilary_counter, mhd_simlilary_tfidf, cos_simlilary_counter, \ cos_simlilary_tfidf, Euclidean_simlilary_counter, Euclidean__simlilary_tfidf data[prefix + 'mhd_similiary'], data[prefix + 'tf_mhd_similiary'], \ data[prefix + 'cos_similiary'], data[prefix + 'tf_cos_similiary'], \ data[prefix + 'os_similiary'], data[prefix + 'tf_os_similiary'] = get_simlilary(data['key_text_pre'],data['title_pro']) data[prefix + 'mhd_similiary_pa'], data[prefix + 'tf_mhd_similiary_pa'], \ data[prefix + 'cos_similiary_pa'], data[prefix + 'tf_cos_similiary_pa'], \ data[prefix + 'os_similiary_pa'], data[prefix + 'tf_os_similiary_pa'] = get_simlilary(data['key_text_pre'],data['abstract_pre']) '词向量平均的相似度' def get_vec(x): vec = [] for word in x.split(): if word in vec_model: vec.append(vec_model[word]) if len(vec) == 0: return np.nan else: return np.mean(np.array(vec), axis=0) data['key_text_pre_vec'] = data['key_text_pre'].progress_apply(lambda x: get_vec(x)) data['title_pro_vec'] = data['title_pro'].progress_apply(lambda x: get_vec(x)) data['abstract_pre_vec'] = data['abstract_pre'].progress_apply(lambda x: get_vec(x)) data['description_text_vec'] = data['description_text'].progress_apply(lambda x: get_vec(x)) # cos data[prefix + 'cos_mean_word2vec'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)), tqdm(data['key_text_pre_vec']), data['title_pro_vec'])) data[prefix + 'cos_mean_word2vec'] = data[prefix + 'cos_mean_word2vec'].progress_apply( lambda x: np.nan if np.isnan(x).any() else x) # 欧式距离 data[prefix + 'os_mean_word2vec'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)), tqdm(data['key_text_pre_vec']), data['title_pro_vec'])) # mhd data[prefix + 'mhd_mean_word2vec'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['title_pro_vec'])) # cos data[prefix + 'cos_mean_word2vec_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)), tqdm(data['key_text_pre_vec']), data['abstract_pre_vec'])) data[prefix + 'cos_mean_word2vec_pa'] = data[prefix + 'cos_mean_word2vec_pa'].progress_apply( lambda x: np.nan if np.isnan(x).any() else x) # 欧式距离 data[prefix + 'os_mean_word2vec_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)), tqdm(data['key_text_pre_vec']), data['abstract_pre_vec'])) # mhd data[prefix + 'mhd_mean_word2vec_pa'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1), tqdm(data['key_text_pre_vec']), data['abstract_pre_vec'])) #append data[prefix + 'cos_mean_word2vec_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)), tqdm(data['description_text_vec']), data['title_pro_vec'])) data[prefix + 'cos_mean_word2vec_2'] = data[prefix + 'cos_mean_word2vec_2'].progress_apply( lambda x: np.nan if np.isnan(x).any() else x) # 欧式距离 data[prefix + 'os_mean_word2vec_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)), tqdm(data['description_text_vec']), data['title_pro_vec'])) # mhd data[prefix + 'mhd_mean_word2vec_2'] = list(map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1), tqdm(data['description_text_vec']), data['title_pro_vec'])) # cos data[prefix + 'cos_mean_word2vec_pa2'] = list(map(lambda x, y: np.dot(x, y) / 
    # cos
    data[prefix + 'cos_mean_word2vec_pa2'] = list(
        map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
            tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa2'] = data[prefix + 'cos_mean_word2vec_pa2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    # Euclidean distance
    data[prefix + 'os_mean_word2vec_pa2'] = list(
        map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
            tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    # mhd
    data[prefix + 'mhd_mean_word2vec_pa2'] = list(
        map(lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any()
            else np.linalg.norm(x - y, ord=1),
            tqdm(data['description_text_vec']), data['abstract_pre_vec']))

    # n-gram distance features
    data[prefix + 'n_gram_sim'], data[prefix + 'sim_number_rate'] = \
        get_df_grams(data, 2, ['key_text_pre', 'title_pro'])
    data[prefix + 'n_gram_sim_pa'], data[prefix + 'sim_number_rate_pa'] = \
        get_df_grams(data, 2, ['key_text_pre', 'abstract_pre'])
    # append
    data[prefix + 'n_gram_sim_2'], data[prefix + 'sim_number_rate_2'] = \
        get_df_grams(data, 2, ['description_text', 'title_pro'])
    data[prefix + 'n_gram_sim_pa_2'], data[prefix + 'sim_number_rate_pa_2'] = \
        get_df_grams(data, 2, ['description_text', 'abstract_pre'])

    ##################### BM25 features (already built elsewhere, by Peng) #####################
    # def apply_fun(df):
    #     df.columns = ['d_id', 'key', 'doc']
    #     df['d_id'] = df['d_id'].fillna('always_nan')
    #     query_id_group = df.groupby(['d_id'])
    #     bm_list = []
    #     for name, group in tqdm(query_id_group):
    #         corpus = group['doc'].values.tolist()
    #         corpus = [sentence.strip().split() for sentence in corpus]
    #         query = group['key'].values[0].strip().split()
    #         bm25Model = BM25(corpus)
    #         bmscore = bm25Model.get_scores(query)
    #         bm_list.extend(bmscore)
    #     return bm_list
    # data[prefix + 'bm25'] = apply_fun(data[['description_id', 'key_text_pre', 'title_pro']])
    # data[prefix + 'bm25_pa'] = apply_fun(data[['description_id', 'key_text_pre', 'abstract_pre']])
    # # append
    # data[prefix + 'bm25_2'] = apply_fun(data[['description_id', 'description_text', 'title_pro']])
    # data[prefix + 'bm25_pa_2'] = apply_fun(data[['description_id', 'description_text', 'abstract_pre']])
    #
    # # get bm25
    # def get_bm25(p_id, query):
    #     query = query.split(' ')
    #     score = bm25Model.get_score(query, item_id_list.index(p_id))
    #     return score
    # data[prefix + 'bm_25_all'] = list(map(lambda x, y: get_bm25(x, y),
    #                                       tqdm(data['paper_id']), data['key_text_pre']))
    # # append
    # data[prefix + 'bm_25_all_2'] = list(map(lambda x, y: get_bm25(x, y),
    #                                         tqdm(data['paper_id']), data['description_text']))
    ############################################################################################

    # Character-level Hamming distance / similarity (one shared instance instead
    # of re-instantiating textdistance.Hamming inside every call)
    hamming_measure = textdistance.Hamming(qval=None)
    data[prefix + 'Hamming_kt'] = list(map(lambda x, y: hamming_measure.normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_dt'] = list(map(lambda x, y: hamming_measure.normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_ka'] = list(map(lambda x, y: hamming_measure.normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_da'] = list(map(lambda x, y: hamming_measure.normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_kt'] = list(map(lambda x, y: hamming_measure.similarity(x, y),
                                               tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_dt'] = list(map(lambda x, y: hamming_measure.similarity(x, y),
                                               tqdm(data['description_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_ka'] = list(map(lambda x, y: hamming_measure.similarity(x, y),
                                               tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_da'] = list(map(lambda x, y: hamming_measure.similarity(x, y),
                                               tqdm(data['description_text_pre']), data['abstract_pre']))
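    # Quick illustrative check of the Hamming measures used above (toy strings;
    # 'karolin' and 'kathrin' differ in 3 of 7 positions):
    #   textdistance.Hamming(qval=None).normalized_distance('karolin', 'kathrin')  # -> 3/7
    #   textdistance.Hamming(qval=None).similarity('karolin', 'kathrin')           # -> 4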
    # Word-level edit (Levenshtein) distance via dynamic programming
    def edit_distance(df, w1, w2):
        word1 = df[w1].split()
        word2 = df[w2].split()
        len1 = len(word1)
        len2 = len(word2)
        dp = np.zeros((len1 + 1, len2 + 1))
        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j
        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                delta = 0 if word1[i - 1] == word2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j - 1] + delta,
                               min(dp[i - 1][j] + 1, dp[i][j - 1] + 1))
        return dp[len1][len2]

    data[prefix + 'edit_distance_kt'] = data.apply(edit_distance, axis=1,
                                                   args=('key_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_dt'] = data.apply(edit_distance, axis=1,
                                                   args=('description_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_ka'] = data.apply(edit_distance, axis=1,
                                                   args=('key_text_pre', 'abstract_pre'))
    data[prefix + 'edit_distance_da'] = data.apply(edit_distance, axis=1,
                                                   args=('description_text_pre', 'abstract_pre'))
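    # Hedged toy check of the DP edit distance above (the frame is made up,
    # not from the data):
    #   toy = pd.DataFrame({'a': ['the cat sat'], 'b': ['the dog sat']})
    #   toy.apply(edit_distance, axis=1, args=('a', 'b'))  # -> 1.0 (one word substituted)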
data[prefix+"loc_set_t_kt"], data[prefix+"set_rate_kt"]= zip( *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["title_pro"]), axis=1)) data[prefix+"unique_rate_dt"],data[prefix+"same_len_rate_dt"],data[prefix+"same_word_q_dt"],\ data[prefix+"same_word_t_dt"],data[prefix+"q_loc_dt"],data[prefix+"t_loc_dt"],data[prefix+"same_w_set_q_dt"],data[prefix+"same_w_set_t_dt"],data[prefix+"word_set_rate_dt"],\ data[prefix+"loc_set_q_dt"], data[prefix+"loc_set_t_dt"], data[prefix+"set_rate_dt"]= zip( *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["title_pro"]), axis=1)) data[prefix+"unique_rate_ka"],data[prefix+"same_len_rate_ka"],data[prefix+"same_word_q_ka"],\ data[prefix+"same_word_t_ka"],data[prefix+"q_loc_ka"],data[prefix+"t_loc_ka"],data[prefix+"same_w_set_q_ka"],data[prefix+"same_w_set_t_ka"],data[prefix+"word_set_rate_ka"],\ data[prefix+"loc_set_q_ka"], data[prefix+"loc_set_t_ka"], data[prefix+"set_rate_ka"]= zip( *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["abstract_pre"]), axis=1)) data[prefix+"unique_rate_da"],data[prefix+"same_len_rate_da"],data[prefix+"same_word_q_da"],\ data[prefix+"same_word_t_da"],data[prefix+"q_loc_da"],data[prefix+"t_loc_da"],data[prefix+"same_w_set_q_da"],data[prefix+"same_w_set_t_da"],data[prefix+"word_set_rate_da"],\ data[prefix+"loc_set_q_da"], data[prefix+"loc_set_t_da"], data[prefix+"set_rate_da"]= zip( *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["abstract_pre"]), axis=1)) def get_df_grams_3(train_sample,values,cols): def create_ngram_set(input_list, ngram_value=3): return set(zip(*[input_list[i:] for i in range(ngram_value)])) def get_n_gram(df, values=3): train_query = df.values train_query = [[word for word in str(sen).replace("'", '').split(' ')] for sen in train_query] train_query_n = [] for input_list in train_query: train_query_n_gram = set() for value in range(3, values + 1): train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value) train_query_n.append(train_query_n_gram) return train_query_n train_query = get_n_gram(train_sample[cols[0]], values) train_title = get_n_gram(train_sample[cols[1]], values) sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y), train_query, train_title)) sim_number_rate=list(map(lambda x, y: len(x & y)/ len(x) if len(x)!=0 else 0, train_query, train_title)) return sim ,sim_number_rate data[prefix+'3_gram_sim'],data[prefix+'sim_numeber_rate_3']=get_df_grams_3(data,3,['key_text_pre','title_pro']) data[prefix+'3_gram_sim_pa'],data[prefix+'sim_numeber_rate_pa_3']=get_df_grams_3(data,3,['key_text_pre','abstract_pre']) #append #n-gram距离相关 data[prefix+'3_gram_sim_2'],data[prefix+'sim_numeber_rate_2_3']=get_df_grams_3(data,3,['description_text_pre','title_pro']) data[prefix+'3_gram_sim_pa_2'],data[prefix+'sim_numeber_rate_pa_2_3']=get_df_grams_3(data,3,['description_text_pre','abstract_pre']) def get_son_str_feature(query, title): q_list = query.split() query_len = len(q_list) t_list = title.split() title_len = len(t_list) count1 = np.zeros((query_len + 1, title_len + 1)) index = np.zeros((query_len + 1, title_len + 1)) for i in range(1, query_len + 1): for j in range(1, title_len + 1): if q_list[i - 1] == t_list[j - 1]: count1[i][j] = count1[i - 1][j - 1] + 1 index[i][j] = index[i - 1][j - 1] + j else: count1[i][j] = 0 index[i][j] = 0 max_count1 = count1.max() if max_count1 != 0: row = int(np.where(count1 == np.max(count1))[0][0]) col = int(np.where(count1 == 
    # 3-gram set distance |A| + |B| - 2|A ∩ B|, plus the share of query 3-grams
    # that also occur in the title
    def get_df_grams_3(train_sample, values, cols):
        def create_ngram_set(input_list, ngram_value=3):
            return set(zip(*[input_list[i:] for i in range(ngram_value)]))

        def get_n_gram(df, values=3):
            train_query = df.values
            train_query = [[word for word in str(sen).replace("'", '').split(' ')]
                           for sen in train_query]
            train_query_n = []
            for input_list in train_query:
                train_query_n_gram = set()
                for value in range(3, values + 1):
                    train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value)
                train_query_n.append(train_query_n_gram)
            return train_query_n

        train_query = get_n_gram(train_sample[cols[0]], values)
        train_title = get_n_gram(train_sample[cols[1]], values)
        sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y),
                       train_query, train_title))
        sim_number_rate = list(map(lambda x, y: len(x & y) / len(x) if len(x) != 0 else 0,
                                   train_query, train_title))
        return sim, sim_number_rate

    data[prefix + '3_gram_sim'], data[prefix + 'sim_number_rate_3'] = \
        get_df_grams_3(data, 3, ['key_text_pre', 'title_pro'])
    data[prefix + '3_gram_sim_pa'], data[prefix + 'sim_number_rate_pa_3'] = \
        get_df_grams_3(data, 3, ['key_text_pre', 'abstract_pre'])
    # append
    # n-gram distance features
    data[prefix + '3_gram_sim_2'], data[prefix + 'sim_number_rate_2_3'] = \
        get_df_grams_3(data, 3, ['description_text_pre', 'title_pro'])
    data[prefix + '3_gram_sim_pa_2'], data[prefix + 'sim_number_rate_pa_2_3'] = \
        get_df_grams_3(data, 3, ['description_text_pre', 'abstract_pre'])
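    # Toy illustration of the 3-gram set distance (a hedged, made-up frame):
    #   toy = pd.DataFrame({'q': ['a b c d'], 't': ['a b c e']})
    #   get_df_grams_3(toy, 3, ['q', 't'])  # -> ([2], [0.5]):
    #   each side has two trigrams, and they share only ('a', 'b', 'c')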
    # Longest common run of words (longest common substring at word level),
    # plus positional statistics of the matching runs
    def get_son_str_feature(query, title):
        q_list = query.split()
        query_len = len(q_list)
        t_list = title.split()
        title_len = len(t_list)
        count1 = np.zeros((query_len + 1, title_len + 1))
        index = np.zeros((query_len + 1, title_len + 1))
        for i in range(1, query_len + 1):
            for j in range(1, title_len + 1):
                if q_list[i - 1] == t_list[j - 1]:
                    count1[i][j] = count1[i - 1][j - 1] + 1
                    index[i][j] = index[i - 1][j - 1] + j
                else:
                    count1[i][j] = 0
                    index[i][j] = 0
        max_count1 = count1.max()
        if max_count1 != 0:
            row = int(np.where(count1 == np.max(count1))[0][0])
            col = int(np.where(count1 == np.max(count1))[1][0])
            mean_pos = index[row][col] / (max_count1 * title_len)
            begin_loc = (col - max_count1 + 1) / title_len
            rows = np.where(count1 != 0.0)[0]
            cols = np.where(count1 != 0.0)[1]
            total_loc = 0
            for i in range(0, len(rows)):
                total_loc += index[rows[i]][cols[i]]
            density = total_loc / (query_len * title_len)
            rate_q_len = max_count1 / query_len
            rate_t_len = max_count1 / title_len
        else:
            begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len = 0, 0, 0, 0, 0, 0
        return max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len

    data[prefix + "long_same_max_count1_kt"], data[prefix + "long_same_local_begin_kt"], \
        data[prefix + "long_same_local_mean_kt"], data[prefix + "long_same_total_loc_kt"], \
        data[prefix + "long_same_density_kt"], data[prefix + "long_same_rate_q_len_kt"], \
        data[prefix + "long_same_rate_t_len_kt"] = zip(
            *data.apply(lambda line: get_son_str_feature(line["key_text_pre"],
                                                         line["title_pro"]), axis=1))
    data[prefix + "long_same_max_count1_dt"], data[prefix + "long_same_local_begin_dt"], \
        data[prefix + "long_same_local_mean_dt"], data[prefix + "long_same_total_loc_dt"], \
        data[prefix + "long_same_density_dt"], data[prefix + "long_same_rate_q_len_dt"], \
        data[prefix + "long_same_rate_t_len_dt"] = zip(
            *data.apply(lambda line: get_son_str_feature(line["description_text_pre"],
                                                         line["title_pro"]), axis=1))
    data[prefix + "long_same_max_count1_da"], data[prefix + "long_same_local_begin_da"], \
        data[prefix + "long_same_local_mean_da"], data[prefix + "long_same_total_loc_da"], \
        data[prefix + "long_same_density_da"], data[prefix + "long_same_rate_q_len_da"], \
        data[prefix + "long_same_rate_t_len_da"] = zip(
            *data.apply(lambda line: get_son_str_feature(line["description_text_pre"],
                                                         line["abstract_pre"]), axis=1))
    data[prefix + "long_same_max_count1_ka"], data[prefix + "long_same_local_begin_ka"], \
        data[prefix + "long_same_local_mean_ka"], data[prefix + "long_same_total_loc_ka"], \
        data[prefix + "long_same_density_ka"], data[prefix + "long_same_rate_q_len_ka"], \
        data[prefix + "long_same_rate_t_len_ka"] = zip(
            *data.apply(lambda line: get_son_str_feature(line["key_text_pre"],
                                                         line["abstract_pre"]), axis=1))

    # Number of distinct words shared by the two texts
    def q_t_common_words(query, title):
        query = set(query.split(' '))
        title = set(title.split(' '))
        return len(query & title)

    data[prefix + 'common_words_kt'] = data.apply(
        lambda index: q_t_common_words(index.key_text_pre, index.title_pro), axis=1)
    data[prefix + 'common_words_dt'] = data.apply(
        lambda index: q_t_common_words(index.description_text_pre, index.title_pro), axis=1)
    data[prefix + 'common_words_ka'] = data.apply(
        lambda index: q_t_common_words(index.key_text_pre, index.abstract_pre), axis=1)
    data[prefix + 'common_words_da'] = data.apply(
        lambda index: q_t_common_words(index.description_text_pre, index.abstract_pre), axis=1)

    data['key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data['description_text_pre_len'] = data['description_text_pre'].apply(lambda x: len(x.split(' ')))
    data['title_pro_len'] = data['title_pro'].apply(lambda x: len(x.split(' ')))
    data['abstract_pre_len'] = data['abstract_pre'].apply(lambda x: len(x.split(' ')))

    data[prefix + 'common_words_kt_rate_k'] = data[prefix + 'common_words_kt'] / data['key_text_len']
    data[prefix + 'common_words_kt_rate_t'] = data[prefix + 'common_words_kt'] / data['title_pro_len']
    data[prefix + 'common_words_dt_rate_d'] = data[prefix + 'common_words_dt'] / data['description_text_pre_len']
    data[prefix + 'common_words_dt_rate_t'] = data[prefix + 'common_words_dt'] / data['title_pro_len']
    data[prefix + 'common_words_ka_rate_k'] = data[prefix + 'common_words_ka'] / data['key_text_len']
    data[prefix + 'common_words_ka_rate_a'] = data[prefix + 'common_words_ka'] / data['abstract_pre_len']
    data[prefix + 'common_words_da_rate_d'] = data[prefix + 'common_words_da'] / data['description_text_pre_len']
    data[prefix + 'common_words_da_rate_a'] = data[prefix + 'common_words_da'] / data['abstract_pre_len']

    # Keep only the id columns plus the generated feature columns
    # (those whose names start with 'num_')
    feat = ['description_id', 'paper_id']
    for col in data.columns:
        if re.match('num_', col) is not None:
            feat.append(col)
    data = data[feat]
    return data
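# Illustrative values for the word-run and common-word helpers defined inside the
# feature builder above (hand-checked on toy strings; shown as comments only,
# since both helpers are local to that function):
#   get_son_str_feature('a b c d', 'x b c y')  # longest shared run 'b c' -> max_count1 = 2.0
#   q_t_common_words('a b c', 'b c d')         # shared set {'b', 'c'}    -> 2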