Example #1
 def testJD(self):
     s1 = 'i like to party'
     s2 = 'this party is started'
     s3 = 'this parte is started'
     s4 = 'this party wasstarted'
     self.assertEqual(distance.jaccard(s1, s1), 0)
     # the jaccard distance always lies in [0, 1]; s1 and s2 differ, so it is > 0
     self.assertGreater(distance.jaccard(s1, s2), 0)
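
Note: distance.jaccard treats its arguments as sets, so plain strings are compared as character sets. A minimal sketch (not part of the original test) of the values involved:

import distance

# identical strings share every character, so the distance is 0.0
print(distance.jaccard('party', 'party'))    # 0.0
# strings with no characters in common sit at the maximum distance 1.0
print(distance.jaccard('abc', 'xyz'))        # 1.0
# to compare at word level instead, pass token lists
print(distance.jaccard('i like to party'.split(),
                       'this party is started'.split()))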
Example #2
def main():
    b1 = np.array([50000, 4, 1], dtype=np.float64)
    b2 = np.array([15000, 1, 5], dtype=np.float64)
    b3 = np.array([60000, 3, 2], dtype=np.float64)
    b4 = np.array([25000, 5, 3], dtype=np.float64)
    b5 = np.array([78000, 4, 5], dtype=np.float64)
    # Cosine
    print("Cosine\nCosine B5, B4 : " + str(dis.cosine(b5, b4)))
    print("Cosine B5, B3 : " + str(dis.cosine(b5, b3)))
    print("Cosine B5, B2 : " + str(dis.cosine(b5, b2)))
    print("Cosine B5, B1 : " + str(dis.cosine(b5, b1)))
    #  Jaccard
    print("Jaccard\nJaccard B5, B4 : " + str(dis.jaccard(b5, b4)))
    print("Jaccard B5, B3 : " + str(dis.jaccard(b5, b3)))
    print("Jaccard B5, B2 : " + str(dis.jaccard(b5, b2)))
    print("Jaccard B5, B1 : " + str(dis.jaccard(b5, b1)))
    # Dice
    print("Dice\nDice B5, B4 : " + str(dis.dice(b5, b4)))
    print("Dice B5, B3 : " + str(dis.dice(b5, b3)))
    print("Dice B5, B2 : " + str(dis.dice(b5, b2)))
    print("Dice B5, B1 : " + str(dis.dice(b5, b1)))
    # Euclidean
    print("Euclidean\nEuclidean B5, B4 : " + str(dis.euclidean(b5, b4)))
    print("Euclidean B5, B3 : " + str(dis.euclidean(b5, b3)))
    print("Euclidean B5, B2 : " + str(dis.euclidean(b5, b2)))
    print("Euclidean B5, B1 : " + str(dis.euclidean(b5, b1)))
    # Manhattan
    print("Manhattan\nManhattan B5, B4 : " + str(dis.manhattan(b5, b4)))
    print("Manhattan B5, B3 : " + str(dis.manhattan(b5, b3)))
    print("Manhattan B5, B2 : " + str(dis.manhattan(b5, b2)))
    print("Manhattan B5, B1 : " + str(dis.manhattan(b5, b1)))
def extract_features(df):
    features = pd.DataFrame()

    print('extracting space-split sequence features...')

    df['q1_words'] = df.question1.map(space_split)
    df['q2_words'] = df.question2.map(space_split)

    features['str_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=1),
        axis=1)
    features['str_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=2),
        axis=1)
    features['str_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_words, r.q2_words), axis=1)
    #features['str_hamming'] = df.apply(lambda r: distance.hamming(r.q1_words, r.q2_words, normalized=True), axis=1)
    #features['str_sorensen'] = df.apply(lambda r: distance.jaccard(r.question1, r.question2), axis=1)

    print('extracting stemmed word sequence features...')

    df['q1_stems'] = df.question1.map(stem)
    df['q2_stems'] = df.question2.map(stem)

    features['stem_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=1),
        axis=1)
    features['stem_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=2),
        axis=1)
    features['stem_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_stems, r.q2_stems), axis=1)

    return features.fillna(.0)
def similarity_factor(s1, s2):
    """ Returns float number which corresponds to similarity order of two strings s1 and s2 """
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio()*100
    ng = ngram.NGram.compare(s1, s2, N=1)*100
    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = (1-distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(), jellyfish.metaphone(unicode(s2)).lower()))*100
    jac_soundex = (1-distance.jaccard(jellyfish.soundex(unicode(s1)).lower(), jellyfish.soundex(unicode(s2)).lower()))*100
    return mean([diffl, ng, fpr, jac_soundex, jac_metaphone]) if mean([diffl, ng, fpr]) < jac_soundex else mean([diffl, ng, fpr, jac_metaphone])
def compare_for_seniority_finding(s1, s2):
    """ Returns the input word if it is similar (according to corresponding algorithms) to some another word.
        s1 - main string, s2 - string from list for comparison
    """
    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = (1-distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(), jellyfish.metaphone(unicode(s2)).lower()))*100
    jac_soundex = (1-distance.jaccard(jellyfish.soundex(unicode(s1)).lower(), jellyfish.soundex(unicode(s2)).lower()))*100
    jac_mrc = (1-distance.jaccard(jellyfish.match_rating_codex(unicode(s1)).lower(), jellyfish.match_rating_codex(unicode(s2)).lower()))*100
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
Example #6
    def jaccard_preprocess(self, threshold):
        # track which two pairs have been tested
        test_edge_list = set([])
        test_vertices_list = set([])
        queue = deque([])
        for x in self.graph.vertices:
            queue.append(x)
            break
        while len(queue) != 0:
            cur = queue.popleft()
            if cur in test_vertices_list:
                continue

            test_vertices_list.add(cur)
            neighbors = self.graph.vertices_matrix[cur]
            for node in neighbors:
                if (node, cur) in test_edge_list or (cur,
                                                     node) in test_edge_list:
                    continue

                test_edge_list.add((cur, node))
                if node not in test_vertices_list:
                    queue.append(node)
                score = dis.jaccard(self.graph.vertices_matrix[cur],
                                    self.graph.vertices_matrix[node])
                if score < threshold:
                    self.graph.vertices_matrix[cur].remove(node)
                    self.graph.vertices_matrix[node].remove(cur)
                    if (cur, node) in self.graph.edges:
                        self.graph.edges.remove((cur, node))
                    else:
                        self.graph.edges.remove((node, cur))

                second_neighbors = self.graph.vertices_matrix[node]
                for second_node in second_neighbors:
                    if (cur, second_node) in test_edge_list or (
                            second_node, cur) in test_edge_list or (
                                cur, second_node) in self.graph.edges or (
                                    second_node, cur) in self.graph.edges:
                        continue
                    test_edge_list.add((cur, second_node))
                    second_score = dis.jaccard(
                        self.graph.vertices_matrix[cur],
                        self.graph.vertices_matrix[second_node])
                    if second_score > threshold:
                        self.graph.vertices_matrix[cur].append(second_node)
                        self.graph.vertices_matrix[second_node].append(cur)
                        self.graph.edges.add((cur, second_node))

        for vertex in list(self.graph.vertices):
            if len(self.graph.vertices_matrix[vertex]) < 1:
                self.graph.vertices.remove(vertex)

        output('after jaccard', len(self.graph.edges))
def similarity_factor(s1, s2):
    """ Returns float number which corresponds to similarity order of two strings s1 and s2 """
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = (1 - distance.jaccard(
        jellyfish.metaphone(unicode(s1)).lower(),
        jellyfish.metaphone(unicode(s2)).lower())) * 100
    jac_soundex = (1 - distance.jaccard(
        jellyfish.soundex(unicode(s1)).lower(),
        jellyfish.soundex(unicode(s2)).lower())) * 100
    return mean([diffl, ng, fpr, jac_soundex, jac_metaphone
                 ]) if mean([diffl, ng, fpr]) < jac_soundex else mean(
                     [diffl, ng, fpr, jac_metaphone])
def compare_for_seniority_finding(s1, s2):
    """ Returns the input word if it is similar (according to corresponding algorithms) to some another word.
        s1 - main string, s2 - string from list for comparison
    """
    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = (1 - distance.jaccard(
        jellyfish.metaphone(unicode(s1)).lower(),
        jellyfish.metaphone(unicode(s2)).lower())) * 100
    jac_soundex = (1 - distance.jaccard(
        jellyfish.soundex(unicode(s1)).lower(),
        jellyfish.soundex(unicode(s2)).lower())) * 100
    jac_mrc = (1 - distance.jaccard(
        jellyfish.match_rating_codex(unicode(s1)).lower(),
        jellyfish.match_rating_codex(unicode(s2)).lower())) * 100
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
Example #9
def calculateSimilarity_WithDistancePackage(createdSentence): #createdSentence is list of words

    levinDist = {}
    jaccardDist = {}
    bestValues = {}
    for i in range(len(X_train)):
        currSentence = X_train[i]
        sentence_str = [index_to_word[x] for x in currSentence[1:-1]]  # sentence_str is list of words
        #Levenshtein distance
        dist = distance.levenshtein(createdSentence, sentence_str)
        dist2 = distance.jaccard(createdSentence,sentence_str)
        #print(dist)
        if (dist>0):
            #print ("Distance Levinshtein: %f" % (dist))
            levinDist[i]=dist
        jaccardDist[i]=dist2
        #print ("Jaccard Distance: %f" % (dist2))

    #take best value
    levinMin = min(levinDist.values())
    jaccardMin = min(jaccardDist.values())

    print ("Best Distance Levinshtein: %f" % (levinMin))
    print ("Best Distance Jaccard: %f" % (jaccardMin))
    bestValues["Jaccard"]=jaccardMin
    bestValues["Levin"]=levinMin
    return bestValues
Example #10
def get_connected_components_jaccard_similarity(documents, jaccard_threshold=.2, field_type="text"):
    """
        Find the connected components of documents sharing the same n-gram based on a threshold for Jaccard similarity.
    """
    document_text = {}
    for k,v in documents.items():
        try:
            document_text[k] = v[field_type]
        except KeyError:
            pass
    G = nx.Graph()
    similarity = {}
    ads = list(document_text)
    G.add_nodes_from(ads)

    for i in range(0,len(ads)-1):
        a = []
        for j in range(i+1,len(ads)):
            similarity[(ads[i],ads[j])] =  round(distance.jaccard(document_text[ads[i]], document_text[ads[j]]),3)

    for k, v in similarity.items():
        if v <= jaccard_threshold:
            G.add_edge(k[0],k[1])

    connected_components = set()

    for i in G.nodes():
        connected_components.add(str(sorted(nx.node_connected_component(G, i))))

    return connected_components
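
A toy invocation of the function above (hypothetical document dicts, not from the original project); documents whose texts fall within the Jaccard threshold of one another end up in the same connected component:

import networkx as nx
import distance

docs = {
    "a": {"text": "cheap phones for sale"},
    "b": {"text": "cheap phone for sale!"},
    "c": {"text": "completely unrelated advert"},
}
print(get_connected_components_jaccard_similarity(docs, jaccard_threshold=0.3))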
Example #11
def find_new_centroid(centroid, tweets):
    if not tweets:
        return centroid
    distToAll = lambda x: np.mean(
        list(map(lambda y: distance.jaccard(x[1], y[1]), tweets)))
    tweetVal = np.argmin(list(map(distToAll, tweets)))
    return tweets[tweetVal]
Example #12
File: Main.py Project: R-Varun/LeagueBot
def censor(aString):
    for word in aString.split(" "):
        for bannedword in Constants.getBannedWords():
            if distance.jaccard(word, bannedword) < 0.1:
                return True

    return False
Example #13
def similarity_test(products, mallCount):
    df = pd.read_csv("/Users/kang/Downloads/programming/dev/food_for_LEO/config/crawling", encoding="CP949")

    # Get the ids and names of the products that appear in `products`
    # (assumes the crawling CSV has 'id' and 'name' columns)
    product_id = []
    product_name = []
    for product in products:
        row = df[df['id'] == product.id]
        product_id += list(row['id'])
        product_name += list(row['name'])

    for i in range(len(product_name)):
        product_name[i] = re.sub('[-=+,#/\?:^$@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'… ]+', '', product_name[i]).lower()
        product_name[i] = product_name[i].replace('유기농', '').replace('플러스', '')

    # id -> product name: build a dictionary so the product pk can be looked up
    product_dic = {}
    for i in range(len(product_name)):
        product_dic[product_id[i]] = product_name[i]

    # Measure similarity with Jaccard distance
    product_list = []
    while(len(product_name) != 0):
        similarity = {}
        for i in range(len(product_name)):
            jaccard = 1 - (distance.jaccard(product_name[0], product_name[i]))
            similarity[product_id[i]] = jaccard
        ranked = sorted(similarity.items(), key=operator.itemgetter(1), reverse=True)
        product_list += [[pk for pk, sim in ranked[0: 3]]]
        for i in range(len(product_list[-1])):
            index = product_id.index(product_list[-1][i])
            del product_id[index]
            del product_name[index]

    return product_list
Example #14
def summarize(text_to_summarize):
    stokens = tokenize(text_to_summarize)
 
    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string= s, name=e,stemmer=LEMMA)
            for e,s in enumerate(stokens) if len(s.split(" ")) > 7]
    
    linkgraph = []
    # STEP 2 and 3 happen interwovenly
    for doc in docs:
        for doc_copy in docs:
            if doc.name != doc_copy.name:
                # STEP 2 happens here
                wordset_a = [x[1] for x in doc.keywords()]
                wordset_b = [y[1] for y in doc_copy.keywords()]
                jacc_dist = distance.jaccard(wordset_a, wordset_b)
                if jacc_dist < 1:
                    linkgraph.append((str(doc.name), #index to sentence
                                      str(doc_copy.name),1-jacc_dist)) #dist. score
    # By the time we reach here, we'd have completed STEP 3
    
    # STEP 4
    #I referenced this SO post for help with pagerank'ing
    #http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D=nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    sort_pagerank = sorted(pagerank.items(),key=operator.itemgetter(1))
    sort_pagerank.reverse()
    top2 = sort_pagerank[:2]
    orderedtop2 = [int(x[0]) for x in top2]
    orderedtop2 = sorted(orderedtop2)
    return " ".join([ stokens[i] for i in orderedtop2 ])
def compute_similarity(X):
    """
    Compute similarity matrix with mean of 3 distances
    :param X: List of contracts ssdeep hashes
    :return: Similarity matrix
    """
    jaccard_matrix = pdist(X, lambda x, y: distance.jaccard(x[0], y[0]))
    np.savetxt("../data/jaccard_matrix.csv",
               np.asarray(squareform(jaccard_matrix)),
               delimiter=",")

    sorensen_matrix = pdist(X, lambda x, y: distance.sorensen(x[0], y[0]))
    np.savetxt("../data/sorensen_matrix.csv",
               np.asarray(squareform(sorensen_matrix)),
               delimiter=",")

    # normalized, so that the results can be meaningfully compared
    # method=1 means the shortest alignment between the sequences is taken as factor
    levenshtein_matrix = pdist(
        X, lambda x, y: distance.nlevenshtein(x[0], y[0], method=1))
    np.savetxt("../data/levenshtein_matrix.csv",
               np.asarray(squareform(levenshtein_matrix)),
               delimiter=",")

    mean_matrix = 1 - np.mean(np.array(
        [jaccard_matrix, sorensen_matrix, levenshtein_matrix]),
                              axis=0)
    np.savetxt("../data/similarity_matrix.csv",
               np.asarray(mean_matrix),
               delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix
Example #16
def get_connected_components_jaccard_similarity(documents,
                                                jaccard_threshold=.2,
                                                field_type="text"):
    """
        Find the connected components of documents sharing the same n-gram based on a threshold for Jaccard similarity.
    """
    document_text = {}
    for k, v in documents.items():
        try:
            document_text[k] = v[field_type]
        except KeyError:
            pass
    G = nx.Graph()
    similarity = {}
    ads = list(document_text)
    G.add_nodes_from(ads)

    for i in range(0, len(ads) - 1):
        a = []
        for j in range(i + 1, len(ads)):
            similarity[(ads[i], ads[j])] = round(
                distance.jaccard(document_text[ads[i]], document_text[ads[j]]),
                3)

    for k, v in similarity.items():
        if v <= jaccard_threshold:
            G.add_edge(k[0], k[1])

    connected_components = set()

    for i in G.nodes():
        connected_components.add(str(sorted(nx.node_connected_component(G,
                                                                        i))))

    return connected_components
Example #17
def summarize(text, sentence_count=5, language='english'):
    processor = LanguageProcessor(language)

    sentence_list = processor.split_sentences(text)
    wordset_list = map(processor.extract_significant_words, sentence_list)
    stemsets = [
        {processor.stem(word) for word in wordset}
        for wordset in wordset_list
    ]

    graph = Graph()
    pairs = combinations(enumerate(stemsets), 2)
    for (index_a, stems_a), (index_b, stems_b) in pairs:
        if stems_a and stems_b:
            similarity = 1 - jaccard(stems_a, stems_b)
            if similarity > 0:
                graph.add_edge(index_a, index_b, weight=similarity)

    ranked_sentence_indexes = list(pagerank(graph).items())
    if ranked_sentence_indexes:
        sentences_by_rank = sorted(
            ranked_sentence_indexes, key=itemgetter(1), reverse=True)
        best_sentences = map(itemgetter(0), sentences_by_rank[:sentence_count])
        best_sentences_in_order = sorted(best_sentences)
    else:
        best_sentences_in_order = range(min(sentence_count, len(sentence_list)))

    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
Example #18
 def similarFrequences(self, a, b):
     '''
     Decides if a and b have similar namespace frequency
     '''
     freqA = [line[1] for line in a]
     freqB = [line[1] for line in b]
     return distance.jaccard(freqA, freqB) >= self.config.get('similarity', 'freq')
Example #19
 def similarNamespaces(self, a, b):
     '''
     Decides if a and b have similar namespaces
     '''
     nsA = [line[0] for line in a]
     nsB = [line[0] for line in b]
     return distance.jaccard(nsA, nsB) >= self.config.get('similarity', 'ns')
Example #20
def str_jaccard(str1, str2):
    str1_list = str1.split(" ")
    str2_list = str2.split(" ")
    res = distance.jaccard(str1_list, str2_list)
    return res
def str_jaccard(str1, str2):
    str1_list = str1.split(" ")
    str2_list = str2.split(" ")
    res = distance.jaccard(str1_list, str2_list)
    return res
Example #22
def calculateSimilarity_WithDistancePackage(
        createdSentence):  #createdSentence is list of words

    levinDist = {}
    jaccardDist = {}
    bestValues = {}
    for i in range(len(X_train)):
        currSentence = X_train[i]
        sentence_str = [index_to_word[x] for x in currSentence[1:-1]
                        ]  # sentence_str is list of words
        #Levenshtein distance
        dist = distance.levenshtein(createdSentence, sentence_str)
        dist2 = distance.jaccard(createdSentence, sentence_str)
        #print(dist)
        if (dist > 0):
            #print ("Distance Levinshtein: %f" % (dist))
            levinDist[i] = dist
        jaccardDist[i] = dist2
        #print ("Jaccard Distance: %f" % (dist2))

    #take best value
    levinMin = min(levinDist.values())
    jaccardMin = min(jaccardDist.values())

    print("Best Distance Levinshtein: %f" % (levinMin))
    print("Best Distance Jaccard: %f" % (jaccardMin))
    bestValues["Jaccard"] = jaccardMin
    bestValues["Levin"] = levinMin
    return bestValues
def find_closest_known_questag(questag):
    """
    Returns the "closest" known questag to the existing questag.
    """
    
    # If the unknown questag is a substring of a known questag,
    # return the shortest known questag with that property.
    for known_questag in known_questags:
        if questag in known_questag:
            return known_questag
        
    # If a known questag is the substring of the unknown questag,
    # return the longest known questag with that property.
    for known_questag in reversed(known_questags):
        if known_questag in questag:
            return known_questag
        
    # Now find the known questag with the maximum Jaccard distance.
    max_jaccard_dist = 0
    closest_known_questag = known_questags[0]
    for known_questag in known_questags:
        dist = distance.jaccard(questag, known_questag)
        if dist > max_jaccard_dist:
            closest_known_questag = known_questag
            max_jaccard_dist = dist
            
    return closest_known_questag
Example #24
def RatingJaccard(revista):

    #Get path file
    dirname = os.path.dirname(__file__)
    loc = os.path.join(dirname, r'JCR2018.xlsx')

    #Initialize reader
    workbook = xlrd.open_workbook(loc)
    sheet = workbook.sheet_by_index(0)

    tuplas = []

    start_time = time()

    for i in range(sheet.nrows):
        valor = (sheet.cell_value(i,
                                  1), jaccard(revista, sheet.cell_value(i, 1)))
        tuplas.append(valor)

    final_time = time()
    execution_time = round(final_time - start_time, 2)

    tuplas.sort(key=lambda revista: revista[1])

    top_matches = tuplas[:10]

    result = (top_matches, execution_time)

    return result
def get_jaccard_score(group,name, chain, type):

    contacts= json.loads(open('equiv_contacts_dict_lsyozyme_pos_paratope_'+chain+'_'+type+'.json').read())
    non_bonded_list={}
    h_bonded_list={}
    for pdb in group:
        for k, v in contacts.items():
            if pdb in k:
                non_bonded_list[pdb]=[]
                for pos, res in v.items():
                    non_bonded_list[pdb].append(pos)
    d=non_bonded_list

    p={}
    key_list=[]
    for key, v in d.items():
        key_list.append(key)

    for k in key_list:
        p[k]=[]
        for i in key_list:
            if i!=k:
                while(len(d[k])!=len(d[i])):
                    if len(d[k])> len(d[i]):
                        d[i].append(0)
                    elif len(d[k])< len(d[i]):
                        d[k].append(0)
                p[k].append(1-distance.jaccard(set(d[k]),set(d[i])))
        dict_sum={}
        dict_avg={}
        for k, v in p.items():
            dict_sum[k]=0
            for e in v:
                dict_sum[k]+=e
            dict_avg[k]=(dict_sum[k]/len(p[k]))

    #with open('all_jaccard_scores.json', 'w') as ctr: #save the contact residues in a json file
    #json.dump(dict_avg, ctr)
    #p_sorted={k: v for k, v in sorted(dict_avg.items(), key=lambda x: x[1])}
    name_list=[]
    score_list=[]
    species_list=[]
    for k, v in dict_avg.items():
            name_list.append(k)
            score_list.append(v)
            species_list.append(species[k])
    df_groups=pd.DataFrame({"Group 1"})
    df= pd.DataFrame({'Epitope_similarity':score_list,
                      'PDB':name_list
                      })

    df.index = np.arange(1, len(df)+1)
    sorted=df.sort_values('PDB')
    #g=sns.scatterplot(y="Jaccard_score", x="PDB_codes", data=sorted, hue='species')
    #g.set(xticks=[])
    #plt.xticks(rotation=90)
    #plt.show()
    #g.get_figure().savefig(name+"_jaccard_scatter_plot.png")
    #df.to_excel(name+"_"+chain+"_"+type+"_jaccard_scores.xlsx")
    return df
Example #26
def similarity_test(products, mallCount):
    product_id = []
    product_name = []
    # Collect each product's pk and name
    for product in products:
        product_id += [product.pk]
        product_name += [product.name]

    for i in range(len(product_name)):
        product_name[i] = re.sub('[-=+,#/\?:^$@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'… ]+', '', product_name[i]).lower()
        product_name[i] = product_name[i].replace('유기농', '').replace('플러스', '').replace("eco", "에코")

    # id -> product name: build a dictionary so the product pk can be looked up
    product_dic = {}
    for i in range(len(product_id)):
        product_dic[product_id[i]] = product_name[i]

    # Measure similarity with Jaccard distance
    similarity = {}
    product_list = []
    while (len(product_name) != 0):
        for i in range(len(product_name)):
            jaccard = 1 - (distance.jaccard(product_name[0], product_name[i]))
            similarity[product_id[i]] = jaccard

        similarity = sorted(similarity.items(), key=operator.itemgetter(1), reverse=True)
        product_list += [[pk for pk, sim in similarity[0: mallCount] if sim >= 0.95]]

        for j in range(len(product_list[-1])):
            index = int(product_id.index(product_list[-1][j]))
            del product_id[index]
            del product_name[index]
            similarity = {}

    return product_list
Example #27
def dsd_sim_load():
    # Get all dsds
    dsds = db.dsds.find({})
    with open('dsd_data.json', 'w') as outfile:
        with open('dsd_data.csv', 'w') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',',
                                   quotechar='\"', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(["dsd_a", "dsd_b", "distance"])
            outfile.write("[")
            for pair in itertools.combinations(dsds, 2):
                a_components = [comp["o"] for comp in pair[0]["dsd"]["components"]]
                b_components = [comp["o"] for comp in pair[1]["dsd"]["components"]]
                a_uri = pair[0]["dsd"]["uri"]
                b_uri = pair[1]["dsd"]["uri"]
                a_id = pair[0]["_id"]
                b_id = pair[1]["_id"]
                dist = distance.jaccard(a_components, b_components)
                item = {"uri_a" : "<a href='/dsds/%s'>%s</a>" % (a_id, a_uri),
                        "uri_b" : "<a href='/dsds/%s'>%s</a>" % (b_id, b_uri),
                        "dist" : dist}
                outfile.write(json.dumps(item, outfile)+",")
                csvwriter.writerow([a_uri, b_uri, dist])
            outfile.write("]")

    return "OK"
Example #28
def title_similarity_np(row1, row2, method="difflib"):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row1[1], row2[1], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row1[1], row2[1])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row1[1], row2[1])
    return difflib.SequenceMatcher(None, row1[1], row2[1]).quick_ratio()
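
A hedged usage sketch for title_similarity_np above; the rows are made-up tuples whose index 1 holds the title string, as the function expects:

import difflib
import distance

row_a = (0, "samsung galaxy s7 edge 32gb")
row_b = (1, "samsung galaxy s7 edge 64gb")
# higher values mean more similar titles, whichever metric is chosen
for m in ("difflib", "levenshtein", "sorensen", "jaccard"):
    print(m, title_similarity_np(row_a, row_b, method=m))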
Example #29
    def jaccard(self, other):
        """
        Computes the jaccard similarity between this log and the other one.
        """
        a = [str(version) for version in self.iter_versions()]
        b = [str(version) for version in other.iter_versions()]

        return jaccard(a, b)
Example #30
    def jaccard(self, other):
        """
        Computes the jaccard similarity between this log and the other one.
        """
        a = [str(version) for version in self.iter_versions()]
        b = [str(version) for version in other.iter_versions()]

        return jaccard(a,b)
def calculate_similarity(e1, e2, s):
    '''
    Given 2 elements, calculate the similarity between them; the computation depends on the requested similarity type
    '''
    if s == 'edit distance':
        return editdistance.eval(e1, e2)
    elif s == 'jaccard':
        return distance.jaccard(e1, e2)
Example #32
def find_clusters(centroidList, tweets):
    clusters = [[] for x in range(len(centroidList))]
    for tweet in tweets:
        myclust = np.argmin(
            list(map(lambda x: distance.jaccard(x[1], tweet[1]),
                     centroidList)))
        clusters[myclust].append(tweet)
    return clusters
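
A minimal sketch (not from the original project) of how find_clusters above and find_new_centroid from Example #11 could drive a Jaccard-based clustering loop; it assumes tweets and centroids are (id, token_list) pairs:

def cluster_tweets(tweets, centroidList, iterations=10):
    # alternate k-means-style assignment and centroid-update steps
    clusters = find_clusters(centroidList, tweets)
    for _ in range(iterations):
        centroidList = [find_new_centroid(centroidList[i], clusters[i])
                        for i in range(len(centroidList))]
        clusters = find_clusters(centroidList, tweets)
    return centroidList, clusters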
Example #33
def readData(inputFile):
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

    with open(inputFile) as data_file:
        data = json.load(data_file)
    if "product" in data[0].keys():
        data = [
            e for e in data
            if e["product"] == "Cellphones" or e["product"] == "Tablets"
        ]

    results = {}

    data_by_brand = {}
    for entry in data:
        brand = standardize(entry["brand"])
        if brand not in data_by_brand.keys():
            data_by_brand[brand] = [entry]
        else:
            data_by_brand[brand].append(entry)

    for brand in data_by_brand.keys():
        results[brand] = []

        devices = sorted(data_by_brand[brand],
                         key=lambda k: standardize(k['name']))

        while len(devices) > 0:
            device = devices[0]
            del (devices[0])

            device["std_name"] = standardize(brand + " " + device["name"])

            stats = [
                locale.atoi((device["stats"])[i])
                for i in range(len(device["stats"]))
            ]
            del (device["stats"])
            device["hit"] = sum(stats)

            while len(devices) > 0:
                ref_device = devices[0]
                ref_name = standardize(ref_device["name"])

                d = distance.jaccard(ref_name.split(),
                                     device["std_name"].split())
                if d == 0:
                    del (devices[0])
                    if "another_name" not in device.keys():
                        device["another_name"] = [ref_device["name"]]
                    else:
                        device["another_name"].append(ref_device["name"])
                else:
                    break

            results[brand].append(device)

    return results
Example #34
def jaccard_algo(A, B):
    A = A.lower().split()
    B = B.lower().split()
    counter = 0
    for i in range(len(A)):
        for j in range(len(B)):
            if jaccard(A[i], B[j]) < .2:
                counter = counter + 1
    return counter
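
A small usage sketch for jaccard_algo; it counts cross-product word pairs whose character sets nearly coincide (Jaccard distance below 0.2), so the toy call below counts at least the three exactly repeated words:

from distance import jaccard

print(jaccard_algo("Deep Learning for NLP", "deep learning for text"))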
Example #35
def DL_Distance(str1, str2):
    print(str1, str2)
    print("distance 1: ", distance.nlevenshtein(str1, str2))
    print("distance 2: ", damerau_levenshtein_distance(str1, str2))
    dls = (damerau_levenshtein_distance(str1, str2) /
           max(len(str1), len(str2)))
    print("distance 3: ", dls)

    print("distance 4: ", distance.jaccard(str1, str2))
Example #36
def compute_sim(cluster_a, cluster_b):
    len_a = len(cluster_a)
    len_b = len(cluster_b)

    sim_matrix = np.zeros((len_a, len_b))
    for index_a, word_a in enumerate(cluster_a):
        for index_b, word_b in enumerate(cluster_b):
            sim_matrix[index_a][index_b] = distance.jaccard(word_a, word_b)
    return sim_matrix
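
A small usage sketch for compute_sim; the clusters are toy word lists, and each matrix entry is the character-set Jaccard distance between one word from each cluster:

import numpy as np
import distance

cluster_a = ["apple", "banana"]
cluster_b = ["apples", "bandana", "cherry"]
print(compute_sim(cluster_a, cluster_b))   # 2x3 matrix of distances in [0, 1]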
Example #37
 def _compute_vectorized(self, s1, s2):
     # calculate pair-wise jaccard distance
     s1 = list(s1)
     s2 = list(s2)
     sim = np.array(
         [distance.jaccard(s1[i], s2[i]) for i in range(len(s1))])
     min_dist = np.min(sim)
     sim = np.array([1 if x == min_dist and x > .8 else 0 for x in sim])
     return sim
def get_jaccard_cdr(group, name):
    """
    Compute Jaccard similarity scores for CDR sequences.
    """
    key_list=[]
    d=json.loads(open(name+"_binding.json").read())
    for key, v in d.items():
        if key in group:
            key_list.append(key)
    p={}
    for k in key_list:
        p[k]=[]
        for i in key_list:
            if i!=k:
                while(len(d[k])!=len(d[i])):
                    if len(d[k])> len(d[i]):
                        d[i].append(0)
                    elif len(d[k])< len(d[i]):
                        d[k].append(0)
                p[k].append(1-distance.jaccard(set(d[k]),set(d[i])))
        sc_sum=0
        dict_sum={}
        dict_avg={}
        for k, v in p.items():

            dict_sum[k]=0
            for e in v:
                dict_sum[k]+=e
            dict_avg[k]=(dict_sum[k]/len(p[k]))

    #with open('all_jaccard_scores.json', 'w') as ctr: #save the contact residues in a json file
    #json.dump(dict_avg, ctr)
    #p_sorted={k: v for k, v in sorted(dict_avg.items(), key=lambda x: x[1])}
    name_list=[]
    score_list=[]
    species_list=[]
    for k, v in dict_avg.items():
        name_list.append(k)
        score_list.append(v)
        species_list.append(species[k])
    df_groups=pd.DataFrame({"Group 1"})
    df= pd.DataFrame({name+'_Preference_similarity':score_list,
                      'PDB':name_list
                      })


    #df.sort_values("species", inplace=True)
    df.index = np.arange(1, len(df)+1)
    sorted=df.sort_values('PDB')
    #g=sns.scatterplot(y="Jaccard_score", x="PDB_codes", data=sorted, hue='species')
    #g.set(xticks=[])
    #plt.xticks(rotation=90)
    #plt.show()
    #g.get_figure().savefig(name+"_jaccard_scatter_plot.png")
    #df.to_excel(name+"_"+chain+"_"+type+"_jaccard_scores.xlsx")
    return df
Example #39
 def jacc_minR(self):
     all_dis = []
     small_list = Strings_comp(self).clear_data()
     for i in range(0, len(small_list)-1):
         cursor = small_list[i]          
         for j in range(i+1, len(small_list)):
             dis = distance.jaccard(cursor, small_list[j])   
             if (dis!=1.0):
                 all_dis.append(dis)
     return round(max(all_dis), 3)
Example #40
 def jacc(self):
     all_dis = []
     small_list = Strings_comp(self).clear_data()
     for i in range(0, len(small_list)-1):
         cursor = small_list[i]          
         for j in range(i+1, len(small_list)):
             dis = distance.jaccard(cursor, small_list[j])
             all_dis.append(dis)
     
     dict_v = Counter(all_dis).values()        
     return all_dis
Example #41
def get_features(raw_data):
    fet_data = pd.DataFrame()

    print "extracting count features..."
    fet_data["q_len"] = raw_data["query"].map(word_len)
    fet_data["t_len"] = raw_data["product_title"].map(word_len)
    fet_data["d_len"] = raw_data["product_description"].map(word_len)

    print "extracting basic distance features from q and t..."
    fet_data["nleven1"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=1), axis=1)
    fet_data["nleven2"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=2), axis=1)
    fet_data["sorensen"] = raw_data.apply(lambda x: distance.sorensen(x.q, x.t), axis=1)
    fet_data["jaccard"] = raw_data.apply(lambda x: distance.jaccard(x.q, x.t), axis=1)
    fet_data["ncd"] = raw_data.apply(lambda x: ncd(x.q, x.t), axis=1)

    print "extracting basic distance features from q_ex and t..."
    fet_data["sorensen_ex"] = raw_data.apply(lambda x: distance.sorensen(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["jaccard_ex"] = raw_data.apply(lambda x: distance.jaccard(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["ncd_ex"] = raw_data.apply(lambda x: ncd(get_uniq_words_text(x.q_ex), x.t), axis=1)

    return fet_data
Example #42
 def jacc_high_pair(self):
     all_dis = []
     pair = []
     max_jacc = Strings_comp(self).jacc_max()
     small_list = Strings_comp(self).clear_data()
     for i in range(0, len(small_list)-1):
         cursor = small_list[i]          
         for j in range(i+1, len(small_list)):
             dis = distance.jaccard(cursor, small_list[j])
             if (dis == max_jacc):
                 pair.append(cursor)
                 pair.append(small_list[j])
             if (dis!=0.0):
                 all_dis.append(dis)
     return str(pair[:2]).strip('[]').replace("'", "")
Example #43
def getPairFeatures(session):

  totalTime = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
  for i in range(len(session) - 1):
    for j in range(i + 1, len(session)):
      e1 = session[i]
      e2 = session[j]
      jaccard = 1.0 - distance.jaccard(e1[QUERY].split(), e2[QUERY].split())
      edit = 1.0 - distance.nlevenshtein(e1[QUERY].split(), e2[QUERY].split())
      timeDiff = ((e2[QTIME] - e1[QTIME]).total_seconds()) / totalTime * 1.0
      #normalized distance
      dist = (j - i) * 1.0 / len(session)
      urlMatch = -1
      if CLICKU in e1 and CLICKU in e2:
        urlMatch = 1.0 - distance.nlevenshtein(e1[CLICKU], e2[CLICKU])
      cosine = get_cosine(text_to_vector(e1[QUERY]), text_to_vector(e2[QUERY]))
      edgeScore = .20 * cosine + .20 * jaccard + .20 * edit + .15 * dist + .15 * timeDiff + .10 * urlMatch
      yield i, j, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
Example #44
 def distance(self, other, dist_between='characters'):
     """Compute the distance between this Scene and some other Scene
     on the basis of the Jaccard distance between their character or location sets
     or the union of their locations and characters."""
     if not isinstance(other, Scene):
         raise ValueError("Can't compare to %s" % type(other))
     if dist_between == 'characters':
         source, target = self.characters, other.characters
     elif dist_between == 'locations':
         source, target = self.locations, other.locations
     elif dist_between == 'both':
         source = self.characters.union(self.locations)
         target = other.characters.union(other.locations)
     else:
         raise ValueError(
             "The distance between %s cannot be computed." % dist_between)
     if not source or not target:
         return 1.0
     return distance.jaccard(source, target)
Example #45
def calcPrecision_2(expected,predicted):
	expectedSet = set(expected)
	precision = []	
	for i in [3,5,10]:
		predicted_truncate = predicted[:i]
		num = 0.0
		den = i*1.0
		for item in predicted_truncate:
			if item in expectedSet:
				num += 1.0
			else:
				for item2 in expectedSet:
					jac    = 1 - distance.jaccard(item, item2)
					#if jac > 0.8 or item in item2 or item2 in item:
					if jac > 1.6:
						num += 1.0
						break
		precision.append(num/den)
	return precision
Example #46
def calcRecall_2(expected,predicted):
	den = 1.0*len(expected)
	recall = []
	for i in [3,5,10]:
		predicted_truncate = predicted[:i]
		predictedSet = set(predicted_truncate)
		num = 0.0
		for item in expected:
			if item in predictedSet:
				num += 1.0
			else:
				for item2 in predictedSet:
					jac    = 1 - distance.jaccard(item, item2)
					#if jac > 0.8 or item in item2 or item2 in item:
					if jac > 1.6:
						num += 1.0
						break				
		recall.append(num/den)
	return recall
Example #47
def summarize(text, sentence_count=5, language='english'):
    stopwords = get_stopwords(language)
    sentence_list = tokenize.sent_tokenize(text, language)
    wordsets = [get_words(sentence, stopwords) for sentence in sentence_list]

    graph = Graph()
    pairs = combinations(enumerate(filter(None, wordsets)), 2)
    for (index_a, words_a), (index_b, words_b) in pairs:
        similarity = 1 - jaccard(words_a, words_b)
        if similarity > 0:
            graph.add_edge(index_a, index_b, weight=similarity)

    ranked_sentence_indexes = pagerank(graph).items()
    sentences_by_rank = sorted(
        ranked_sentence_indexes, key=itemgetter(1), reverse=True)
    best_sentences = map(itemgetter(0), sentences_by_rank[:sentence_count])
    best_sentences_in_order = sorted(best_sentences)

    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
Example #48
def summarize(text, sentence_count=2):
    sentence_list = tokenize(text)

    # each document's name is the sentence's original index
    # so that we can put them back together later
    docs = [Document(string=sentence, name=index, stemmer=LEMMA)
            for index, sentence in enumerate(sentence_list)]

    graph = Graph()
    for doc_a, doc_b in combinations(docs, 2):
        wordset_a = [x[1] for x in doc_a.keywords()]
        wordset_b = [y[1] for y in doc_b.keywords()]
        similarity = 1 - jaccard(wordset_a, wordset_b)
        if similarity > 0:
            graph.add_edge(doc_a.name, doc_b.name, weight=similarity)

    ranked_sentence_indexes = pagerank(graph).items()
    sentences_by_rank = sorted(
        ranked_sentence_indexes, key=itemgetter(1), reverse=True)
    best_sentences = map(itemgetter(0), sentences_by_rank[:sentence_count])
    best_sentences_in_order = sorted(best_sentences)

    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
Example #49
analyses_writer = shelve.open(os.path.join(os.pardir, ANALYSES_DIR, "support_analyses"))

all_models_sup_sets = []

for sup_info in all_sup_info:
    analysis_rec = []
    indx = 1
    sup_sets = []
    support_info = shelve.open(sup_info)
    for setting in SETTINGS:
        sup_sets.append(set(support_info[setting]))
        set1 = support_info[setting]
        for i in range(indx, len(SETTINGS)):
            set2 = support_info[SETTINGS[i]]
            if not (set2 == [] or set1 == []):
                analysis_rec.append(distance.jaccard(set1, set2))

            # UNKNOWN results:
            else:
                analysis_rec.append(float("nan"))

        indx += 1
    all_models_sup_sets.append(sup_sets)
    """
        later, you can read the 'support_analyses' file with shelve
        For X.lus, the key() is X.lus
        with a X.lus key, all pairwas Jaccard distances for X.lus can be retrieved in a list
        if you want to know all values for a particular model, you need to extract them with key = file_name
    """
    if analysis_rec != []:
        analyses_writer[sup_info[0 : len(sup_info) - 13]] = analysis_rec
Example #50
def writehaiku(trend, tweets):

    # Print preamble
    # print "Poet0: "

    # Create list of words in tweets
    allWords = []
    for tweet in tweets:
        allWords.extend(tweet.text.split())

    invalidWords = []
    for word in allWords:
        # remove URLs and twitter users and hashtags
        if sf.is_valid_url(word) or sf.is_twitter_user(word) or sf.is_hashtag(word):
            invalidWords.append(word)
        # remove non-alpha words
        if not word.isalpha():
            invalidWords.append(word)
        # remove words fewer than four characters or more than 25
        if len(word) < 4 or len(word) > 25:
            invalidWords.append(word)

    filteredWords = [word for word in allWords if word not in invalidWords]

    # logging.debug("Filtered wordlist is now: ")
    # logging.debug(filteredWords)

    # Get the list of unique words with their counts
    uniqueWords = Counter(filteredWords)

    # Get the most common words
    topWords = uniqueWords.most_common(5)

    # For top common filtered words, get phrases of length 5 containing them
    phrases = []
    for n in range(2, 7):
        for word in topWords:
            idx = n - 1
            try:
                while filteredWords[idx : len(filteredWords) - n].index(word[0]) >= 0:
                    idx = filteredWords[idx : len(filteredWords) - n].index(word[0]) + idx
                    for i in range(0, n):
                        phrases.append(" ".join(filteredWords[(idx - i) : (idx - i + n)]))
                    idx += 1
            except:
                idx = 0

    uniquePhrases = Counter(phrases)
    topPhrases = uniquePhrases.most_common(200)

    # Compute the syllable length for each phrase
    listPhrases = [list(phrase) for phrase in topPhrases]
    for phrase in listPhrases:
        phrase.append(sf.nsyllables(phrase[0]))

    # Use Jaccard similarity to choose top-tweeted phrases that are not similar to one another
    Phrase1 = ""
    Phrase2 = ""
    Phrase3 = ""

    # Populate the 7-syllable phrase first with the top-tweeted phrase of seven syllables
    for phrase in listPhrases:
        if phrase[2] == 7 and Phrase2 == "":
            Phrase2 = phrase[0]
            break

    # Get the 5-syllable phrases and compute their Jaccard distances from the 2nd phrase
    # Choose a popular one that is sufficiently dissimilar (distance >= 0.4)
    if Phrase2 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.4:
                if Phrase1 == "":
                    Phrase1 = phrase[0]

    # Get the 5-syllable phrases and compute their Jaccard distances from the 1st and 2nd phrases
    # Choose a popular one that is sufficiently dissimilar from both (distance >= 0.6)
    if Phrase2 != "" and Phrase1 != "":
        for phrase in listPhrases:
            if phrase[2] == 5 and d.jaccard(Phrase2, phrase[0]) >= 0.6 and d.jaccard(Phrase1, phrase[0]) >= 0.6:
                if Phrase3 == "":
                    Phrase3 = phrase[0]

    myHaiku = h.Haiku()

    # Construct the haiku
    if Phrase1 != "" and Phrase2 != "" and Phrase3 != "":
        myHaiku.length = len(Phrase1) + len(Phrase2) + len(Phrase3)
        myHaiku.text = [Phrase1, Phrase2, Phrase3]
        # return [[Phrase1, Phrase2, Phrase3], len(Phrase1) + len(Phrase2) + len(Phrase3)]

    return myHaiku
			
			if len(word) == 0:
				continue

			words_ = word.split('.')
			for words_i in words_:
				words = convert(words_i).split('_')
				for word_i in words:
					word_i = word_i.lower()
					simTags = difflib.get_close_matches(word_i, taglist)
					simTags = simTags[:20]
					#print word_i
					#print simTags
					for simTag in simTags:
						#simTag = simTags[0]
						jac    = 1 - distance.jaccard(word_i, simTag)
						
						#print simTag,jac
						if jac > 0.6:
							if simTag not in tempDict:
								tempDict[simTag] = 0.0
							tempDict[simTag] += jac
							sum_ += jac

		sorted_x = sorted(tempDict.items(), key=operator.itemgetter(1), reverse=True)
		sorted_x = sorted_x[:topK]
		
		# sum_ = 0.0
		# for pair in sorted_x:
		# 	sum_ += pair[1]
Example #52
 def findJacardDistance(self, qFeat):
   #print self.query, qFeat.query, distance.jaccard(self.query, qFeat.query)
   qJac = 1.0 - distance.jaccard(self.query, qFeat.query)
   #uJac = 1.0-distance.jaccard(self.urlDict.keys(), qFeat.urlDict.keys());
   #userJac = 1.0-distance.jaccard(self.userDict.keys(), qFeat.userDict.keys());
   return qJac  #, uJac, userJac);
Example #53
import operator
import pickle
import re
import sys

import distance
from gensim.models import doc2vec
import scipy.spatial.distance as ds


if __name__ == '__main__':
	surveyqs = pickle.load(open(sys.argv[1],'rb'))
	for surveyq in surveyqs:
		text = surveyq['body']
		text = re.sub('(\\n)|(\')|(/)|(\d+)', ' ', text)
		text = text.lower()
		tempDict = {}
		textWords = text.split()
		simTags = surveyq['predicted_by_Hybrid_Matching']
		for textWord in textWords:
			for simTag in simTags:
				jac    = 1 - distance.jaccard(textWord, simTag)
				if jac > 0.8:
					if simTag not in tempDict:
						tempDict[simTag] = 0.0
					tempDict[simTag] += jac
		
		if len(tempDict.keys()) < len(simTags):
			for simTag in simTags:
				if simTag not in tempDict.keys():
					tempDict[simTag] = 0.0
				tempDict[simTag] += 0.8*tempDict[simTag] + 0.2*(1.0/len(simTags))
		
		sorted_x = sorted(tempDict.items(), key=operator.itemgetter(1), reverse = True)

		summ = 0.0
		for item in sorted_x:
Example #54
    except:
        core_elements_all_models.append(set([]))
        pass

#
# Calculate pairwise Jaccard distance of each configuration from core
#
overall_dist = []
for indx, model in enumerate(all_models_sup_sets):
    dist = []
    denum = 12.0
    for i, conf in enumerate(model):
        if i > 0:
            if core_elements_all_models[indx] != set([]):
                if conf != set([]):
                    dist.append(distance.jaccard(conf, core_elements_all_models[indx]))
                else:
                    denum -= 1.0
            else:
                dist.append(float('nan'))
    # overall_dist[i] is the overall distance for model i
    overall_dist.append(sum(dist) / denum)
   
    
    
# a list of lists: sizes[i] holds the models whose size is between i and i+1 KB
sizes = []
for i in range(9):
    sizes.append([])
    
for indx, model in enumerate(all_sup_info):
Example #55
 for b in pc:
     if checked == 0:
         if a == b:
             checked += 1
             for v1 in gt[a]:
                 partials = []
                 levs = []
                 jacs = []
                 sors = []
                 for v2 in pc[b]:
                     v2 = str(v2).translate(None, string.punctuation)
                     v2 = str(v2).replace('\t',' ')
                     try:
                         partials.append((1-(fuzz.partial_ratio(v1, v2)/100.0)))
                         levs.append(distance.levenshtein(v1,v2, normalized=True))
                         jacs.append(distance.jaccard(v1, v2))
                         sors.append(distance.sorensen(v1, v2))
                     except UnicodeDecodeError:
                         partials.append(1)
                         levs.append(1)
                         jacs.append(1)
                         sors.append(1)
                 ls_partials.append(partials)
                 ls_levs.append(levs)
                 ls_jacs.append(jacs)
                 ls_sors.append(sors)
         else:
             pass
     else:
         pass
 # create distance score matrices with row index as hand coded titles and 
Example #56
#-*- coding: utf-8 -*-
import distance

# word similarity (character overlap)
word1="decisive"
word2="decicive"
sen1=["whatsapp","messenger"]
sen2=["whatsapp","facebook"]
print distance.jaccard(word1,word2)

# word similarity (edit distance)
print distance.levenshtein(word1,word2)
print distance.hamming(word1,word2,normalized=True)

# phrase/sentence similarity (edit distance)
print distance.levenshtein(sen1,sen2)

# compare one word against a group
tokens=["tubbemate","tubemmate","tube cas"]
print sorted(distance.ilevenshtein("tubemate", tokens))