Example #1
import nltk
import numpy as np

def jaccard_distance_candidates(mention_spans, concept_spans, n_jaccard):
    # For each mention, return the indices of the n_jaccard closest concepts
    # (smallest Jaccard distance between the token sets).
    candidate_indices = []
    for mention in mention_spans:
        distances = np.array([nltk.jaccard_distance(set(mention), set(concept))
                              for concept in concept_spans])
        # argpartition over the distances keeps the n_jaccard smallest values
        indices = np.argpartition(distances, n_jaccard - 1)[:n_jaccard].tolist()
        candidate_indices.append(indices)
    return candidate_indices
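A minimal usage sketch for the candidate generator (the mention/concept span lists and the n_jaccard value below are invented for illustration):

mention_spans = [["heart", "attack"], ["kidney", "stone"]]
concept_spans = [["heart", "failure"], ["myocardial", "infarction"],
                 ["heart", "attack"], ["renal", "calculus"]]
# Prints, for each mention, the indices of its two closest concept spans
print(jaccard_distance_candidates(mention_spans, concept_spans, n_jaccard=2))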
Example #2
def spell_correction_english(input_sentence):
    # Suggest a correction for each token: shortlist dictionary words by
    # character-bigram Jaccard distance, then break ties with edit distance.
    input_tokens = nltk.word_tokenize(input_sentence)
    words = nltk.corpus.words.words()
    corrected = ""
    for token in input_tokens:
        bg_input_word = set(nltk.ngrams(token, n=2))
        min_jaccard = 5
        suggested_words = []
        for word in words:
            bg_word = set(nltk.ngrams(word, n=2))
            jd = nltk.jaccard_distance(bg_input_word, bg_word)
            if jd < min_jaccard:
                suggested_words = [word]
                min_jaccard = jd
            elif jd == min_jaccard:
                suggested_words.append(word)
        print(suggested_words)
        min_edit_distance = 1000
        return_words = []
        for word in suggested_words:
            ed = nltk.edit_distance(token, word)
            if ed < min_edit_distance:
                return_words = [word]
                min_edit_distance = ed
            elif ed == min_edit_distance:
                return_words.append(word)

        corrected = corrected + " " + return_words[0]
    return corrected
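The comparison this corrector is built on, shown in isolation (word pair chosen arbitrarily):

import nltk

bigrams_a = set(nltk.ngrams("recieve", n=2))
bigrams_b = set(nltk.ngrams("receive", n=2))
# 0.0 means identical bigram sets, 1.0 means no bigrams in common
print(nltk.jaccard_distance(bigrams_a, bigrams_b))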
Example #3
    def add_jackard_distance(self):
        # Fill self.jackard_distance[i, j] with the Jaccard distance between the
        # word sets of every pair of rows in self.df (jaccard_distance is
        # assumed to be imported from nltk.metrics).
        for i in range(len(self.df)):
            head = self.df.loc[i]
            for j in range(len(self.df)):
                node = self.df.loc[j]
                self.jackard_distance[i, j] = jaccard_distance(
                    set(head.text.split()), set(node.text.split()))
Example #4
def token_sim(s1, s2):
    """Jaccard similarity between the word-token sets of two strings."""
    try:
        aset = set(nltk.word_tokenize(s1))
        dset = set(nltk.word_tokenize(s2))
        return 1.0 - nltk.jaccard_distance(aset, dset)
    except Exception:
        return 0
Example #5
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    recommendations = []
    for e in entries:
        distances = [(nltk.jaccard_distance(set(nltk.ngrams(e, n=4)),
                                            set(nltk.ngrams(a, n=4))), a)
                     for a in correct_spellings if a[0] == e[0] and len(a) > 2]
        recommendations.append(sorted(distances)[0][1])
    return recommendations  # Your answer here
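This answer_ten snippet (and the similar variants further down) comes from a spelling-recommender exercise; correct_spellings is not defined in the code shown. It is typically the NLTK words corpus, roughly:

from nltk.corpus import words
correct_spellings = words.words()  # requires nltk.download('words')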
Example #6
def nameCheck(str1, str2):
    # Two names are considered a match when their character-trigram sets have a
    # Jaccard distance of at most 0.75.
    tokens_1 = set(nltk.ngrams(str1, n=3))
    tokens_2 = set(nltk.ngrams(str2, n=3))
    distance = nltk.jaccard_distance(tokens_1, tokens_2)
    return distance <= 0.75
Example #7
File: test.py  Project: ZZCzyh/NS-CQA
def jaccard_similarity(s1, s2):
    if s1 is None or len(s1) == 0:
        return 0.0
    elif s2 is None or len(s2) == 0:
        return 0.0
    else:
        jd = nltk.jaccard_distance(set(s1), set(s2))
        return 1.0 - jd
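Note that set(s1) makes this a character-level comparison: only the distinct characters of each string matter, not their order or counts. For example:

print(jaccard_similarity("listen", "silent"))    # 1.0 -- identical character sets
print(jaccard_similarity("listen", "listened"))  # 6/7, since 'd' is the only extra distinct character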
Example #8
def process_input():

    test_jaccard = []
    test_edit = []
    train_jaccard = []
    train_edit = []

    test_scores = test_gs.readlines()
    train_scores = train_gs.readlines()

    global gs
    for score in test_scores:
        gs.append(float(score.rstrip('\n')))
    for score in train_scores:
        gs.append(float(score.rstrip('\n')))

    test_lines = test.readlines()
    train_lines = train.readlines()

    for line in test_lines:
        pair = line.split('\t')
        j_score = nltk.jaccard_distance(set(nltk.word_tokenize(pair[0])),
                                        set(nltk.word_tokenize(pair[1])))
        test_jaccard.append(j_score)
        edit_score = nltk.edit_distance(nltk.word_tokenize(pair[0]),
                                        nltk.word_tokenize(pair[1]))
        test_edit.append(edit_score)

    for line in train_lines:
        pair = line.split('\t')
        j_score = nltk.jaccard_distance(set(nltk.word_tokenize(pair[0])),
                                        set(nltk.word_tokenize(pair[1])))
        train_jaccard.append(j_score)
        edit_score = nltk.edit_distance(nltk.word_tokenize(pair[0]),
                                        nltk.word_tokenize(pair[1]))
        train_edit.append(edit_score)

    global jaccard
    jaccard = test_jaccard + train_jaccard
    global edit
    edit = test_edit + train_edit
Example #9
def match(csv, phrases):
    df = pd.read_csv(csv)
    indexes = []
    for item in df.iterrows():
        text = item[1]['comment']
        jaccards = float('inf')
        matched = ''
        tolerance = 0.0
        for phrase in phrases:
            distance = nltk.jaccard_distance(set(phrase), set(text))
            if distance < jaccards:
                # Length-difference tolerance for the best phrase so far
                tolerance = (abs(len(phrase) - len(text)) /
                             max(len(phrase), len(text))) * 0.8  # this factor can be tuned
                matched = phrase
                jaccards = distance
        if jaccards < tolerance:
            indexes.append(item[0])
    return indexes
Example #10
def content_distance_check(html1, html2):
    txt1 = extract_usable_text(html1)
    txt2 = extract_usable_text(html2)
    txt1 = re.sub(r'\s+', ' ', txt1)
    txt2 = re.sub(r'\s+', ' ', txt2)
    # print(txt1)
    # print(txt2)
    # Character-set Jaccard distance between the two extracted texts
    distance = jaccard_distance(set(txt1), set(txt2))
    return distance
Example #11
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    
    rcmd = []
    for entry in entries:
        spell_list = [spell for spell in correct_spellings if spell.startswith(entry[0]) and len(spell) > 2]
        dist_list = [nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)), set(nltk.ngrams(spell, n=4))) for spell in spell_list]
        min_idx = dist_list.index(min(dist_list))
        rcmd.append(spell_list[min_idx])
    return rcmd
Example #12
def jaccard_dist(test_code, train_code):
    tc = transform_to_ngram(test_code)
    trc = transform_to_ngram(train_code)
    try:
        return nltk.jaccard_distance(tc, trc)
    except Exception:
        # jaccard_distance raises ZeroDivisionError when both n-gram sets are empty
        print(test_code)
        print(train_code)
        raise
Example #13
def jaccardDistance(text1, text2):
    # word_tokenize returns a list of tokens; wrapping it in set() keeps only the
    # unique tokens, which is what jaccard_distance expects.
    set1 = set(nltk.word_tokenize(text1))
    set2 = set(nltk.word_tokenize(text2))
    return nltk.jaccard_distance(set1, set2)
Example #14
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    result = []
    import operator
    for entry in entries:
        spell_list = [spell for spell in correct_spellings if spell.startswith(entry[0]) and len(spell) > 2]
        distance_list = [(spell, nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)), set(nltk.ngrams(spell, n=4)))) for spell in spell_list]

        result.append(sorted(distance_list, key=operator.itemgetter(1))[0][0])
    
    return result
Example #15
    def compare_edge_labels(e_a_list, e_b_list):
        score = 0
        for ea in e_a_list:
            for eb in e_b_list:
                # score += GSA.compare_strings_glove(ea, eb)
                # score += max(0, (1 - nltk.edit_distance(ea, eb) / max(len(ea), len(eb))))
                score += max(0, (1 - nltk.jaccard_distance(set(ea), set(eb))))  # jaccard_distance with each character as an element
                # score += max(0, (1 - nltk.jaccard_distance(set(ea.split()), set(eb.split()))))  # jaccard_distance with each word as an element
                # score += GSA.compare_strings(ea, eb)
        return score / (1 + len(e_a_list))
Example #16
    def evaluate_individual_sentence(self, original_sentence,
                                     paraphrase) -> Dict:

        original_sentence_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(original_sentence))
        paraphrase_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(paraphrase))

        # BLEU score, computed over the token lists (sentence_bleu would treat
        # raw strings as character sequences)
        bleu_score = nltk.translate.bleu_score.sentence_bleu(
            [original_sentence_tokens], paraphrase_tokens)

        # Sentence embedding cosine similarity
        emb1 = self.model.encode(original_sentence)
        emb2 = self.model.encode(paraphrase)
        cos_sim = util.pytorch_cos_sim(emb1, emb2)

        # Levenshtein distance
        edit_distance = pylev.levenshtein(original_sentence_tokens,
                                          paraphrase_tokens)
        length = max(len(original_sentence_tokens), len(paraphrase_tokens))
        normalized_edit_distance = (length - edit_distance) / length

        # Jaccard distance between the two token sets
        jaccard = nltk.jaccard_distance(set(original_sentence_tokens),
                                        set(paraphrase_tokens))

        # Jaccard * cosine similarity
        jaccard_embedding_factor = jaccard * cos_sim.item()

        metrics = {
            'original_sentence': original_sentence,
            'paraphrase': paraphrase,
            'bleu_score': bleu_score,
            'normalized_original_sentence': normalize_spaces_remove_urls(original_sentence),
            'normalized_paraphrase': normalize_spaces_remove_urls(paraphrase),
            'embedding_cosine_similarity': cos_sim.item(),
            'edit_distance': edit_distance,
            'normalized_edit_distance': normalized_edit_distance,
            'jaccard': jaccard,
            'jaccard_embedding_factor': jaccard_embedding_factor
        }

        return metrics
Example #17
def text_to_code(text: str) -> Tuple[str, float]:
    text = text.strip()
    print(text)
    dists = [
        nltk.jaccard_distance(definition, set(text.split(" ")))
        for definition in definitions
    ]
    min_dist_index = np.argmin(dists)
    min_dist = dists[min_dist_index]
    code = codes[min_dist_index]
    return code, min_dist
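Here definitions and codes are module-level globals in the source project (the function also relies on import nltk, import numpy as np and from typing import Tuple). A hypothetical setup, with invented data, showing how it would be called:

definitions = [set("patient reports chest pain".split()),
               set("patient reports shortness of breath".split())]
codes = ["code_chest_pain", "code_dyspnea"]

code, dist = text_to_code("reports severe chest pain")
print(code, dist)  # picks the definition with the smallest Jaccard distance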
Example #18
def display_concept(i_txt_file, concept_chosen, beginning, filename):
    with open(os.path.join('Uploads', filename)) as csvfile:
        csvdata = csv.reader(csvfile)
        next(csvdata, None)
        for row in csvdata:
            if row[12] == beginning and row[1] == i_txt_file:
                hof = str2bool(row[6])
                negated = str2bool(row[10])
                location = row[8]
                concept_to_display = row[3]
                range_txt_display = row[13]
                cosine_similarity_value = combined_cosine_similarity(
                    concept_to_display.lower(), range_txt_display.lower())
                jaccard_dist = nltk.jaccard_distance(set(concept_to_display),
                                                     set(range_txt_display))

    if request.method == 'POST':

        if not request.form['correct'] or not request.form['location'] or not request.form['hof'] \
                or not request.form['negation']:
            flash('Please insert yes or no.')
            return redirect(request.url)
        else:
            correct_answ = request.form['correct']
            location_answ = request.form['location']
            hof_answ = request.form['hof']
            negation_answ = request.form['negation']
            db = get_db()
            db.execute(
                'INSERT INTO feedback (concept, negation, hof, location, correct_answ,\
                 hof_answ, location_answ, negation_answ, cosine_sim, jaccard_dist)'
                ' VALUES (?,?,?,?,?,?,?,?,?,?)',
                (str(concept_chosen), int(negated), int(hof), str(location),
                 str2int(correct_answ), str2int(hof_answ),
                 str2int(location_answ), str2int(negation_answ),
                 float(cosine_similarity_value), float(jaccard_dist)))
            db.commit()
            return redirect(
                url_for('note.sentence',
                        i_txt_file=i_txt_file,
                        filename=filename,
                        conc=concept_chosen))

    return render_template('note/concept_display.html',
                           hof=hof,
                           negated=negated,
                           range_txt=range_txt_display,
                           location=location,
                           concept=concept_to_display,
                           filename=filename,
                           cosine=cosine_similarity_value,
                           jaccard=jaccard_dist,
                           i_txt_file=i_txt_file,
                           concept_chosen=concept_chosen)
Example #19
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    result = []
    n = 4
    for entry in entries:
        words = [word for word in correct_spellings if word[0] == entry[0]]
        distances = [(nltk.jaccard_distance(set(nltk.ngrams(entry, n=n)),
                                            set(nltk.ngrams(match,
                                                            n=n))), match)
                     for match in words]
        result.append(sorted(distances)[0][1])
    return result
Example #20
def similitud():
    for i in comments:
        first = i['text']
        corr_f = []
        vars.append([first, i['value'], get_color(i['value'])])
        print(first)
        for j in comments:
            second = j['text']
            jd = nltk.jaccard_distance(set(first), set(second))
            corr_f.append(round(1 - jd, 2))
        corr.append(corr_f)
Example #21
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    spellings = []
    for entry in entries:
        entry_spellings = [x for x in correct_spellings if x[0] == entry[0]]
        jac_distance = [
            nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                  set(nltk.ngrams(x, n=4)))
            for x in entry_spellings
        ]
        spellings.append(entry_spellings[np.argmin(jac_distance)])
    return spellings
Example #22
def nearestNeighbourGenre(movie):
    closestMovie = None
    closestDistance = 1
    for m in movies.values():
        if (m.title != movie.title):
            distance = nltk.jaccard_distance(movie.lemmas, m.lemmas)
            if distance < closestDistance:
                closestDistance = distance
                closestMovie = m
    print(closestMovie.title)
    return closestMovie.genre
Example #23
def save_distance_word_mapping(word_list, embeddings, target_path_edit, target_path_jaccard):
    print(len(word_list))
    with open(target_path_edit, 'w', encoding='utf-8') as f:
        for word in word_list:
            distance_words = [(emb_word, nltk.edit_distance(emb_word, word)) for emb_word in embeddings if emb_word != word]
            new_word = min(distance_words, key=lambda t: t[1])
            f.write(str(word) + ' ' + str(new_word[0]) + ' ' + str(new_word[1]) + "\n")
    with open(target_path_jaccard, 'w', encoding='utf-8') as g:
        for word in word_list:
            distance_words = [(emb_word, nltk.jaccard_distance(set(emb_word), set(word))) for emb_word in embeddings if emb_word != word]
            new_word = min(distance_words, key=lambda t: t[1])
            g.write(str(word) + ' ' + str(new_word[0]) + ' ' + str(new_word[1]) + "\n")
Example #24
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):

    outcomes = []
    for entry in entries:
        spellings = [w for w in correct_spellings if w.startswith(entry[0])]
        distances = ((nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                            set(nltk.ngrams(word, n=4))), word)
                     for word in spellings)
        closest = min(distances)
        outcomes.append(closest[1])

    return outcomes
Example #25
def spellchecker(misspelled_word='caaar', n_gram=1):
    correct_words = [i for i in correct_spelling if i.startswith(misspelled_word[0]) and len(i) > 1]
    min_distance = 1
    min_word = ''
    for word in correct_words:
        distance = nltk.jaccard_distance(set(nltk.ngrams(misspelled_word, n=n_gram)),
                                         set(nltk.ngrams(word, n=n_gram)))

        if distance < min_distance:
            min_distance = distance
            min_word = word

    return min_word
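With n_gram=1 the comparison only looks at the set of letters used; a larger n-gram size usually discriminates better. A quick try (assuming correct_spelling is bound to the NLTK words list, as with correct_spellings above):

print(spellchecker('caaar', n_gram=1))  # letter-set comparison
print(spellchecker('caaar', n_gram=2))  # character bigrams give a finer-grained match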
Example #26
def calculate_similarity(docs_list, similarity_type, threshold):
    ''' Calculate vector similarity of all possible pairs in list '''
    results = list()
    counter = 0

    # Get all possible combinations of tweets that have same NER
    all_combinations = list(combinations(docs_list, 2))

    # Filter handles, hashtags, emoticons, etc.

    for tweet_pair in all_combinations:
        tweet_pair[0].filter("*")
        tweet_pair[1].filter("*")

        # Filter out pairs with exact sentences
        if tweet_pair[0].clean_text != tweet_pair[1].clean_text:

            # Filter out sentences shorter than 4 words
            if tweet_pair[0].tweet_len() > 3 and tweet_pair[1].tweet_len() > 3:

                # Filter out those combinations with excesive word number differences
                if abs(tweet_pair[0].tweet_len() - tweet_pair[1].tweet_len()) < 4:


                    if similarity_type == "jaccard":
                        settext1 = tweet_pair[0].word_set()
                        settext2 = tweet_pair[1].word_set()
                        d = jaccard_distance(settext1, settext2)

                    if similarity_type == "jaro_winkler":
                        d = 1 - distance.jaro_winkler_similarity(tweet_pair[0].clean_text, tweet_pair[1].clean_text)

                    if similarity_type == "levenshtein":
                        d = damerau_levenshtein_distance(tweet_pair[0].clean_text, tweet_pair[1].clean_text)

                    # Only keep those pairs whose distance is below the threshold
                    if d < threshold:

                        # Put in source sentences with more oov words and extra filter target
                        if tweet_pair[0].oov_words() > tweet_pair[1].oov_words():
                            bi_combination = tweet_pair[0].source_filter(), tweet_pair[1].target_filter()
                        else:
                            bi_combination = tweet_pair[1].source_filter(), tweet_pair[0].target_filter()

                        if bi_combination not in results:
                            results.append(bi_combination)
                            counter += 1


        sys.stdout.write("\rAdding combinations...")
        sys.stdout.flush()
    return results
Example #27
def jaccard_similarity():
    train_list=[]
    for community_key in community_users_hashtags_dict.keys():
        for community_key_inside in community_users_hashtags_dict.keys():
            try:
                print("==============================================")
                print(community_key,community_key_inside)
                # print(community_users_hashtags_dict[community_key].hashtags_list)
                # print( cosine_similarity(community_users_hashtags_dict[community_key].hashtags_list,tfidf_matrix_train))  # here the first element of tfidf_matrix_train is matched with other three elements
                jd_sent_1_2 = nltk.jaccard_distance(set(nltk.ngrams(community_users_hashtags_dict[community_key].hashtags_list, n=3)), set(nltk.ngrams(community_users_hashtags_dict[community_key_inside].hashtags_list, n=3)))
                print(jd_sent_1_2)
            except Exception as ex:
                print("General Exception:", ex)
Example #28
def jacc_trigramTOKEN(tupla1, tupla2):
    sent1 = concatenate_list_data(tupla1)
    sent2 = concatenate_list_data(tupla2)

    tokens1 = nltk.word_tokenize(sent1)
    tokens2 = nltk.word_tokenize(sent2)

    ng1_tokens = set(nltk.ngrams(tokens1, n=3))
    ng2_tokens = set(nltk.ngrams(tokens2, n=3))

    jd_sent_1_2 = 1 - nltk.jaccard_distance(ng1_tokens, ng2_tokens)
    vector = [jd_sent_1_2]
    return vector
Example #29
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):

    results = dict()
    for mistake in entries:
        words_to_check = [i for i in correct_spellings if i[0] == mistake[0]]
        results[mistake] = [
            (nltk.jaccard_distance(set(nltk.ngrams(mistake, n=4)),
                                   set(nltk.ngrams(word, n=4))), word)
            for word in words_to_check
        ]

    return [sorted(results[entry])[0][1] for entry in entries]
Example #30
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    
    recommend = []
    for entry in entries:
        # Match first letter. input_spell contains all words in correct_spellings with the same first letter.
        input_spell = [x for x in correct_spellings if x[0] == entry[0]]
        
        # Find the jaccard distance between the entry word and every word in correct_spellings with the same first letter.
        jaccard_dist = [nltk.jaccard_distance(set(nltk.ngrams(entry,n=4)), set(nltk.ngrams(x,n=4))) for x in input_spell]
        
        # Recommend the word in input_spell with the minimum Jaccard distance.
        recommend.append(input_spell[np.argmin(jaccard_dist)])
        
    return recommend # Your answer here
Example #31
    """ Tokenizes string s, removes stopwords, and returns a set of k-shingles
    """
    s, k, stopwords = args
    return kshinglize(s, k, stopwords)

def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS):
    """ Tokenizes string s, removes stopwords, and returns a set of k-shingles
    """
    s = s.strip().lower()
    tokens_raw = twokenize.tokenize(s)
    tokens = filterstopwords(tokens_raw, stopwords)
    return tokens_to_kshingles(tokens, k)
    
def _calculate_distance(pair):
    # pair is ((eid_a, shingles_a), (eid_b, shingles_b)); Python 3 no longer
    # supports tuple unpacking in the parameter list, so unpack explicitly.
    (eid_a, shingles_a), (eid_b, shingles_b) = pair
    if shingles_a and shingles_b:
        jd = nltk.jaccard_distance(shingles_a, shingles_b)
    else:
        # One of the elements has no shingles
        jd = 1.0
    return ((eid_a, eid_b), jd)

def build_distance_table(kshingles):
    """ Create a hash table of the Jaccard distance
        between all elements in the dict kshingles.
        kshingles = { id : set(shingle1, shingle2, ...) }

    """
    distance = ReversibleKeyDict()
    with closing(multiprocessing.Pool()) as pool:
        for (eid_a, eid_b), jd in pool.imap_unordered(_calculate_distance, itertools.combinations(kshingles.items(), 2), 250):
            distance[eid_a, eid_b] = jd
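A compact, self-contained illustration of the same shingle-and-distance idea, using plain whitespace tokens instead of the project's twokenize/stopword pipeline (sample documents invented):

import itertools
import nltk

def word_shingles(text, k=2):
    # k-shingles over whitespace tokens
    return set(nltk.ngrams(text.lower().split(), n=k))

docs = {1: "the quick brown fox", 2: "the quick red fox", 3: "lorem ipsum dolor sit"}
shingled = {eid: word_shingles(text) for eid, text in docs.items()}
for (a, sa), (b, sb) in itertools.combinations(shingled.items(), 2):
    jd = nltk.jaccard_distance(sa, sb) if (sa and sb) else 1.0
    print(a, b, round(jd, 2))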