def ngrams_similarity(s1, s2, filter_stop_words=True):
    # Tokenize the sentences into lower-case words
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    if filter_stop_words:
        tokenized_sentence_1 = [
            token for token in tokenized_sentence_1 if token not in stop_words
        ]
        tokenized_sentence_2 = [
            token for token in tokenized_sentence_2 if token not in stop_words
        ]

    grams_lst_1 = list(nltk.ngrams(tokenized_sentence_1, 2))
    grams_lst_2 = list(nltk.ngrams(tokenized_sentence_2, 2))
    if len(grams_lst_1) > 0 and len(grams_lst_2) > 0:
        sim2 = 1 - jaccard_distance(set(grams_lst_1), set(grams_lst_2))
    else:
        sim2 = 0

    grams_lst_1 = list(nltk.ngrams(tokenized_sentence_1, 3))
    grams_lst_2 = list(nltk.ngrams(tokenized_sentence_2, 3))
    if len(grams_lst_1) > 0 and len(grams_lst_2) > 0:
        sim3 = 1 - jaccard_distance(set(grams_lst_1), set(grams_lst_2))
    else:
        sim3 = 0

    grams_lst_1 = list(nltk.ngrams(tokenized_sentence_1, 4))
    grams_lst_2 = list(nltk.ngrams(tokenized_sentence_2, 4))
    if len(grams_lst_1) > 0 and len(grams_lst_2) > 0:
        sim4 = 1 - jaccard_distance(set(grams_lst_1), set(grams_lst_2))
    else:
        sim4 = 0

    return sim2, sim3, sim4
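# Usage sketch (not part of the original snippet): assumes `nltk` and
# `jaccard_distance` are imported as above and that `stop_words` is a set such
# as set(nltk.corpus.stopwords.words('english')).
s1 = "The quick brown fox jumps over the lazy dog"
s2 = "A quick brown fox leaped over the lazy dog"
sim2, sim3, sim4 = ngrams_similarity(s1, s2)
print(sim2, sim3, sim4)  # bigram, trigram and 4-gram Jaccard similarities, each in [0, 1]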
Example #2
def fun_1_5_2():
    def jacc_similarity(query, document):
        first = set(query).intersection(set(document))
        second = set(query).union(set(document))
        return len(first) / len(second)

    from nltk.metrics import jaccard_distance
    X = set([10, 20, 30, 40])
    Y = set([20, 30, 60])
    print(jaccard_distance(X, Y))
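# Worked check of the value printed above (illustrative): the intersection of
# X and Y is {20, 30} (2 elements) and the union has 5 elements, so the
# Jaccard distance is 1 - 2/5 = 0.6.
from nltk.metrics import jaccard_distance
X, Y = {10, 20, 30, 40}, {20, 30, 60}
print(jaccard_distance(X, Y), 1 - len(X & Y) / len(X | Y))  # both print 0.6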
Example #3
def compute_feature5(frases1, frases2, X_train_or_test):
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    feature = []

    for sent1, sent2 in zip(frases1, frases2):
        sent1 = preprocess(sent1, wnl, sw)
        sent2 = preprocess(sent2, wnl, sw)
        feature.append(jaccard_distance(set(sent1), set(sent2)))

    X_train_or_test = np.concatenate(
        (X_train_or_test, np.array(feature).reshape(len(feature), 1)), axis=1)
    return X_train_or_test
Example #4
 def print_absolute_agreement(
         cls,
         dataframe: pd.DataFrame,
         iaa_by_column_dict: Optional[Dict] = None) -> None:
     if iaa_by_column_dict is None:
         iaa_by_column_dict = cls.run_closed_class_jaccard_and_masi(
             dataframe)
     for column in cls.CLOSED_CLASS_COLUMNS:
         df = iaa_by_column_dict[column]['df']
         print(f"Interannotator agreement for {column}")
         annotator_list = dataframe.source_spreadsheet.unique()
         print(" \t" +
               "\t".join([str(annotator) for annotator in annotator_list]))
         for a1 in annotator_list:
             a1_vals = list(df[df.source_spreadsheet == a1][column])
             print(f"{a1}", end="\t")
             pairwise_agreements = []
             for a2 in annotator_list:
                 a2_vals = list(df[df.source_spreadsheet == a2][column])
                 agreement_sum = 0
                 for a1_val, a2_val in zip(a1_vals, a2_vals):
                     agreement_sum += 1 - jaccard_distance(a1_val, a2_val)
                 pairwise_agreements.append(agreement_sum /
                                            min(len(a1_vals), len(a2_vals)))
                 print(f"{pairwise_agreements[-1]:.2f}", end="\t")
             print(
                 f"\t{(sum(pairwise_agreements) - 1) / (len(pairwise_agreements) - 1):.2f}"
             )
         print()
         print()
Example #5
def jaccard_sim(word1, word2):
    set1 = set(word1)
    set2 = set(word2)

    coefficient = 1 - jaccard_distance(set1, set2)

    return coefficient
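# Usage sketch (not from the original source): the arguments are converted to
# sets of characters, so this measures character overlap between two words.
from nltk.metrics import jaccard_distance
print(jaccard_sim("night", "nacht"))  # {'n', 'h', 't'} shared out of 7 distinct chars -> 3/7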
Example #6
def jaccard_common(nominals):
    sents = brown.sents()
    sents_no_punct = []
    for sent in sents:
        sents_no_punct.append([
            ''.join(c for c in s if c not in string.punctuation) for s in sent
        ])
    sents_no_punct = [words for sent in sents_no_punct for words in sent]
    sents_no_punct = [word for word in sents_no_punct if word]
    five_grams = ngrams(sents_no_punct, 5)
    e1_words, e2_words = [], []
    for five_gram in five_grams:
        if nominals[0] in five_gram:
            for word in five_gram:
                if word != nominals[0]:
                    e1_words.append(word)
        elif nominals[1] in five_gram:
            for word in five_gram:
                if word != nominals[1]:
                    e2_words.append(word)
    e1_count = Counter(e1_words)
    e2_count = Counter(e2_words)
    e1_top = [word[0] for word in e1_count.most_common(4)]
    e2_top = [word[0] for word in e2_count.most_common(4)]
    return [e1_top, e2_top], jaccard_distance(set(e1_count.keys()),
                                              set(e2_count.keys()))
Example #7
def jaccard_POS(sen_1, sen_2):
    pos_1 = split_and_POS(sen_1)
    pos_2 = split_and_POS(sen_2)

    pos_1 = set(pos_1)
    pos_2 = set(pos_2)
    return jaccard_distance(pos_1, pos_2)
Example #8
def word_word_is_similarity(phrase_word_1, phrase_word_2):
  words_1 = Util.split_unicode_words(phrase_word_1)
  words_2 = Util.split_unicode_words(phrase_word_2)
  if len(words_1) == 0 or len(words_2) == 0:
    return False

  jaccard_dist = jaccard_distance(set(words_1), set(words_2))

  return jaccard_dist < 0.1
Example #9
def ngrams_word_for(sen_1, sen_2, n_grams):
  ngrams_1 = get_ngrams_for_sen(sen_1, n_grams)
  ngrams_2 = get_ngrams_for_sen(sen_2, n_grams)
  set_1 = set(ngrams_1)
  set_2 = set(ngrams_2)
  if len(set_1) == 0 or len(set_2) == 0:
    return 0.00001
  value = 1.00001 - jaccard_distance(set_1, set_2)

  return value
Example #10
    def score(self, lbl_types, ref_types, stemmed_word):
        """Gives the Jaccard distance between the two sets."""

        # Hack: ref 23643 is empty after applying the rules, as is the case "A*D" from the csv file
        if not ref_types:
            return 1

        if stemmed_word:
            ref_types = self.replace_stem(stemmed_word, ref_types, lbl_types)

        return jaccard_distance(lbl_types, ref_types)
Example #11
def jaccard_POS_ngrams(sen_1, sen_2, n_grams):
    pos_1 = split_and_POS(sen_1)
    pos_2 = split_and_POS(sen_2)

    pos_1 = get_ngrams_for(pos_1, n_grams)
    pos_2 = get_ngrams_for(pos_2, n_grams)

    pos_1 = set(pos_1)
    pos_2 = set(pos_2)
    return jaccard_distance(pos_1, pos_2)
Example #12
def jaccard_distance_chunk(D):
    '''
    Calculates the jaccard distance between lemmatized list pairs.
    '''
    if len(D) > 0:
        D[JACCARD_DISTANCE] = D.loc[:, Q_WORD_TOKENIZED].apply(
            lambda x: jaccard_distance(set(literal_eval(x[0])),
                                       set(literal_eval(x[1]))),
            axis=1)
    return D
Example #13
def getName(namesList, email):

    temp = []
    for name in namesList:
        if str(name).lower() != str(
                email[0]).lower() and str(name).lower() != str(
                    email[1]).lower():
            temp.append(name)
    namesList = temp

    sim = 0.0
    person = None
    for name in namesList:
        simn = 0.0
        division = 0
        for mail in email:
            if mail is not None:
                namemail = mail.split('@')
                namemail = str(namemail[0])
                char1_2 = set(ngrams(namemail, 2))
                char1_3 = set(ngrams(namemail, 3))
                char1_4 = set(ngrams(namemail, 4))
                char2_2 = set(ngrams(name, 2))
                char2_3 = set(ngrams(name, 3))
                char2_4 = set(ngrams(name, 4))
                char2_jd = 1.0 - jaccard_distance(char1_2, char2_2)
                char3_jd = 1.0 - jaccard_distance(char1_3, char2_3)
                char4_jd = 1.0 - jaccard_distance(char1_4, char2_4)
                simn += 0.2 * char2_jd + 0.5 * char3_jd + 0.3 * char4_jd
                division += 1
        if division != 0:
            simn /= division
            if simn > sim:
                sim = simn
                person = name
        else:
            person = None
            break
    #EXTRA
    if sim < 0.05:
        person = None
    #print sim
    return person
Example #14
def ne_simmilarity(s1, s2):
    sent1 = ner_transform(s1)
    sent2 = ner_transform(s2)
    # Compute similarity
    if len(sent1) > 0 and len(sent2) > 0:
        similarity = 1 - jaccard_distance(set(sent1), set(sent2))
        # Compute label of similarity
        return similarity
    else:
        return 0
Example #15
    def predict(self, data_frame, maximum=5):
        predicted = []
        for index, row in data_frame.iterrows():
            s1 = row['sentence0']
            s2 = row['sentence1']
            jaccard_similarity = (1 -
                                  jaccard_distance(set(s1), set(s2))) * maximum
            predicted.append(jaccard_similarity)

        return predicted
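# Usage sketch (illustrative; the enclosing class is not shown above, so
# `JaccardBaseline` is a hypothetical host for predict()):
# import pandas as pd
# df = pd.DataFrame({'sentence0': ['a cat sat on the mat'],
#                    'sentence1': ['a cat sat on a mat']})
# print(JaccardBaseline().predict(df, maximum=5))  # character-set Jaccard scaled to 0..5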
Example #16
def compute_feature10(frases1, frases2, X_train_or_test):
    feature = []

    for sent1, sent2 in zip(frases1, frases2):
        result1 = method(sent1)
        result2 = method(sent2)
        c = 1 - jaccard_distance(set(result1), set(result2))
        feature.append(c)

    X_train_or_test = np.concatenate(
        (X_train_or_test, np.array(feature).reshape(len(feature), 1)), axis=1)
    return X_train_or_test
Example #17
def validator(mappings_dict, client, index_theta_one, index_theta_two, datetime_from_tm_2, datetime_to_tm_1,
              number_of_topics):
    """
    pass
    """
    from sklearn.preprocessing import MinMaxScaler
    from nltk.metrics import jaccard_distance
    scaler = MinMaxScaler()
    scores = dict(zip(mappings_dict.keys(), [0] * len(mappings_dict)))
    scores_for_normalization = []
    for threshhold, map_dict in mappings_dict.items():
        cnt_matches_for_threshhold = 0
        for topic_parent, topic_childs_list in map_dict.items():

            theta_1 = search(client=client, index=index_theta_one,
                             query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                    'topic_id': topic_parent, 'topic_weight__gte': 0.05},
                             source=['document_es_id'],
                             start=0,
                             end=1000000,
                             get_scan_obj=True
                             )
            scanned_parent = set([elem.document_es_id for elem in theta_1])

            for topic_child in topic_childs_list:
                theta_2 = search(client=client, index=index_theta_two,
                                 query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                        'topic_id': topic_child, 'topic_weight__gte': 0.05},
                                 source=['document_es_id'],
                                 start=0,
                                 end=1000000,
                                 get_scan_obj=True
                                 )
                jaccard_score = 1 - jaccard_distance(scanned_parent, set([elem.document_es_id for elem in theta_2]))

                scores[threshhold] += jaccard_score
                cnt_matches_for_threshhold += 1
        try:
            avg_score = scores[threshhold] / cnt_matches_for_threshhold

            scores_for_normalization.append(avg_score)
            scores[threshhold] = [len(map_dict) / number_of_topics, avg_score]

        except ZeroDivisionError:
            scores[threshhold] = [len(map_dict) / number_of_topics, 0]

    scores_normalized = [score[0] for score in scaler.fit_transform(np.array(scores_for_normalization).reshape(-1, 1))]

    for i, items in enumerate(scores.items()):
        scores[items[0]] += [scores_normalized[i]]

    return scores
Example #18
def jaccard(sen_1, sen_2):
  keep_pos = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')

  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_1))
  sen_set_1 = set(word for word, pos in tagged_sent if pos in keep_pos)

  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_2))
  sen_set_2 = set(word for word, pos in tagged_sent if pos in keep_pos)

  jaccard_value = jaccard_distance(sen_set_1, sen_set_2)
  return jaccard_value
Example #19
 def jaccard_similarity_stemmer(self, config, sentence1, sentence2):
     """
     Computes stem unigram similarity.
     """
     tokens1 = [
         self.stemmer.stem(token) for token in word_tokenize(sentence1[1])
         if token not in self.punctuation_set
     ]
     tokens2 = [
         self.stemmer.stem(token) for token in word_tokenize(sentence2[1])
         if token not in self.punctuation_set
     ]
     return 1 - jaccard_distance(set(tokens1), set(tokens2))
Example #20
def cal_ngrams_by_jacc(wn_grams, ox_grams):

  matrix_similarity_jaccard = [[0 for x in range(len(ox_grams))] for x in range(len(wn_grams))]

  for iWnWord in range(len(wn_grams)):

    wn_set = set(wn_grams[iWnWord])

    for iDictWord in range(len(ox_grams)):

      dict_set = set(ox_grams[iDictWord])
      matrix_similarity_jaccard[iWnWord][iDictWord] = 1 - jaccard_distance(wn_set, dict_set)

  return matrix_similarity_jaccard
Example #21
def jaccard_similarity(s1, s2):
    try:
        tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
        tokenized_sentence_2 = nltk.word_tokenize(s2.lower())
    except Exception:
        print("Error: S1[%s] \n S2[%s]" % (s1, s2))
        return 0
    # Compute similarity
    if len(tokenized_sentence_1) > 0 and len(tokenized_sentence_2) > 0:
        similarity = 1 - jaccard_distance(set(tokenized_sentence_1),
                                          set(tokenized_sentence_2))
        return similarity
    else:
        return 0
Example #22
    def computeDistances(self):
        for k, v in sorted(self.keywords.items()):
            prev = None
            for tup in v:
                self.allItems.append((tup[0], tup[1], k))
                cnt = len(self.allItems) - 1
                if prev is not None:
                    self.allSeq.append([prev, cnt, k])
                prev = cnt

        n = len(self.allItems)
        self.dist = numpy.zeros(shape=(n, n))
        for i in range(n):
            for j in range(n):
                try:
                    self.dist[i, j] = jaccard_distance(set(self.allItems[i][1]),
                                                       set(self.allItems[j][1]))
                except ZeroDivisionError:
                    self.dist[i, j] = 0  # sys.maxint
Example #23
def get_dists(keyword):
    dists = []
    for word in words_preprocessed:
        dists.append({
            "edit_dist": edit_distance(word, keyword),
            "jaro_simi": jaro_similarity(word, keyword),
            "jaro_winkler_simi": jaro_winkler_similarity(word, keyword),
            "jaccard_dist": jaccard_distance(set(word), set(keyword)),
            "word": word,
            "keyword": keyword
        })
    return pd.DataFrame(dists).sort_values("edit_dist").iloc[0:3, :]
Example #24
def similarity_by_jaccard(ox_defis, wn_defis):

  keep_pos = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')
  matrix_similarity_jaccard = [[0 for x in range(len(ox_defis))] for x in range(len(wn_defis))]

  for iWnWord in range(len(wn_defis)):

    # POS-tag the WordNet definition, keep content words and lemmatize them
    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn_defis[iWnWord]))
    words = [wordnet_lemmatizer.lemmatize(word)
             for word, pos in tagged_sent if pos in keep_pos]
    wn_set = set(words)

    # word-word
    for iDictWord in range(len(ox_defis)):

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(ox_defis[iDictWord]))
      words = [wordnet_lemmatizer.lemmatize(word)
               for word, pos in tagged_sent if pos in keep_pos]
      dict_set = set(words)

      matrix_similarity_jaccard[iWnWord][iDictWord] = 1 - jaccard_distance(wn_set, dict_set)

  return matrix_similarity_jaccard
Example #25
def synsets_similarity(s1, s2):
    """
    Find the jaccard similarity between two sentences synsets using lesk algorithm
    to disambiguate words given their context.
    """
    lemmas_sentence_1, tagged_sentence_1 = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, tagged_sentence_2 = lemmatize_sentence(s2.lower())

    # Disambiguate words and create a list of synsets
    synsets_sentence_1 = []
    for (lemma, word_tag) in zip(lemmas_sentence_1, tagged_sentence_1):
        if lemma in stop_words:
            continue
        synset = lesk(lemmas_sentence_1, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_1.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_1.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))

    synsets_sentence_2 = []
    for (lemma, word_tag) in zip(lemmas_sentence_2, tagged_sentence_2):
        if lemma in stop_words:
            continue
        synset = lesk(lemmas_sentence_2, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_2.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_2.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))

    # Compute similarity
    if len(synsets_sentence_1) != 0 and len(synsets_sentence_2) != 0:
        similarity = 1 - jaccard_distance(set(synsets_sentence_1),
                                          set(synsets_sentence_2))
        return similarity
    else:
        return 0
Example #26
    def computeDistances(self):
        for k, v in sorted(self.keywords.items()):
            prev = None
            for tup in v:
                self.allItems.append((tup[0], tup[1], k))
                cnt = len(self.allItems) - 1
                if prev is not None:
                    self.allSeq.append([prev, cnt, k])
                prev = cnt

        n = len(self.allItems)
        self.dist = numpy.zeros(shape=(n, n))
        for i in range(0, n):
            for j in range(0, n):
                try:
                    self.dist[i,
                              j] = jaccard_distance(set(self.allItems[i][1]),
                                                    set(self.allItems[j][1]))
                except ZeroDivisionError:
                    self.dist[i, j] = 0  #sys.maxint
Example #27
 def number_overlap(self, config, sentence1, sentence2):
     """
     Computes the Jaccard distance between the sets of the cardinal numbers that appear in the two sentences.
     It tries to parse numbers expressed in words (for example 'two', 'three hundreds and forty-nine'), numbers using
     ',' (for example '16,432,970') and decimal numbers (for example '45,233.4123'). If the number can't be parsed,
     it's added as the corresponding string to the set.
     """
     numbers1 = []
     for i, word in enumerate(sentence1[2]):
         if sentence1[3][i] == 'CD':
             try:
                 numbers1.append(w2n.word_to_num(sentence1[2][i]))
             except ValueError:
                 try:
                     numbers1.append(
                         w2n.word_to_num(sentence1[2][i].replace(',', '')))
                 except ValueError:
                     try:
                         numbers1.append(
                             float(sentence1[2][i].replace(',', '')))
                     except ValueError:
                         numbers1.append(word)
     numbers2 = []
     for i, word in enumerate(sentence2[2]):
         if sentence2[3][i] == 'CD':
             try:
                 numbers2.append(w2n.word_to_num(sentence2[2][i]))
             except ValueError:
                 try:
                     numbers2.append(
                         w2n.word_to_num(sentence2[2][i].replace(',', '')))
                 except ValueError:
                     try:
                         numbers2.append(
                             float(sentence2[2][i].replace(',', '')))
                     except ValueError:
                         numbers2.append(word)
     try:
         return 1 - jaccard_distance(set(numbers1), set(numbers2))
     except ZeroDivisionError:
         return 0
Example #28
def dependency_similarity(s1, s2):
    """
    Find the Jaccard similarity between the dependency-parse triples of the two
    sentences, using the CoreNLP dependency parser.
    """
    parsed_sentence_1 = parser.raw_parse(s1)
    parsed_sentence_2 = parser.raw_parse(s2)

    tree1 = next(parsed_sentence_1)
    tree2 = next(parsed_sentence_2)

    triples1 = [t for t in tree1.triples()]
    triples2 = [t for t in tree2.triples()]

    # Compute similarity
    if len(triples1) != 0 and len(triples2) != 0:
        similarity = 1 - jaccard_distance(set(triples1), set(triples2))
        return similarity
    else:
        return 0
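# Setup sketch (an assumption, not shown in the original snippet): `parser` is
# expected to be an NLTK CoreNLP dependency parser backed by a running
# CoreNLP server, for example:
# from nltk.parse.corenlp import CoreNLPDependencyParser
# parser = CoreNLPDependencyParser(url='http://localhost:9000')
# print(dependency_similarity("The cat chased the mouse.",
#                             "A cat chased a mouse."))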
Example #29
def compute_feature8(frases1, frases2, X_train_or_test):
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    feature = []

    for sent1, sent2 in zip(frases1, frases2):
        sent1 = preprocess(sent1, wnl, sw)
        sent2 = preprocess(sent2, wnl, sw)

        trigrams1 = list(nltk.trigrams(sent1))
        trigrams2 = list(nltk.trigrams(sent2))
        if len(trigrams1) == 0 or len(trigrams2) == 0:
            feature.append(0)
        else:
            feature.append(jaccard_distance(set(trigrams1), set(trigrams2)))

    X_train_or_test = np.concatenate((X_train_or_test, np.array(feature).reshape(len(feature), 1)), axis=1)
    return X_train_or_test
Example #30
def lemmas_similarity(s1, s2, filter_stop_words=True):
    """
    Jaccard similarity between the lemmatized sentences.
    """
    # Tokenize the sentences into lower-case words
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    if filter_stop_words:
        tokenized_sentence_1 = [
            token for token in tokenized_sentence_1 if token not in stop_words
        ]
        tokenized_sentence_2 = [
            token for token in tokenized_sentence_2 if token not in stop_words
        ]

    tagged_sentence_1 = pos_tag(
        tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(
        tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    lemmas_sentence_1 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
    ]
    lemmas_sentence_2 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
    ]  # [LEMMA_1, ...]

    # Compute similarity
    if len(lemmas_sentence_1) > 0 and len(lemmas_sentence_2) > 0:
        similarity = 1 - jaccard_distance(set(lemmas_sentence_1),
                                          set(lemmas_sentence_2))
        # Compute label of similarity
        return similarity
    else:
        return 0
Example #31
def create_graph():  
    g = nx.Graph()
    xml_docs = Collection()
    xml_docs_subset = xml_docs.get_docs(author="Wau Holland")
    docs_no = len(xml_docs_subset)
    id_dict = dict()
    stems_dict = dict()
    doc_id = 1
    
    print "Put stems into a dict for each document (with an uniq id) ..."
    print "Create nodes with all the documents' relevant information ..."
    pb = ProgressBar(maxval=docs_no).start()
    
    for xml_doc in xml_docs_subset:
        pb.update(doc_id)
        id_dict[xml_doc.get_xml_filename()] = doc_id
        g.add_node(doc_id, 
                   id = xml_doc.get_id(),
                   rawlen = xml_doc.get_rawlen(),
                   subj = xml_doc.get_subj(),
                   author = xml_doc.get_author(),
                   date = xml_doc.get_date(),
                   words = xml_doc.get_words(),
                   uniq_stems = list(xml_doc.get_stems(uniq=True, 
                                                       relev=True)),
                   rawcontent = xml_doc.get_rawcontent()
                   )
        # It seems sometimes a list (-> set conversion) gets returned
        # ... ugly. XXX
        stems_dict[doc_id] = set(xml_doc.get_stems(uniq=True, relev=True))
        doc_id += 1
        
    print "Create undirected, weighted graph based on Jaccard similarity ..."
    no_of_edges = docs_no * (docs_no - 1) / 2
    pb = ProgressBar(maxval=no_of_edges).start()
    count = 1
    for doc_idx1 in stems_dict.keys():
        doc_idx2 = doc_idx1 + 1
        # Nothing left to compare
        if (doc_idx1 == docs_no):
            break
    
        while True:
            # print "Comparing: ", doc_idx1, doc_idx2
            
            # Find longer doc
            doc1_len, doc2_len = len(stems_dict[doc_idx1]), \
                                    len(stems_dict[doc_idx2])
            long_doc_len = max((doc1_len, doc2_len))
            short_doc_len = min((doc1_len, doc2_len))
            
            # In case a document has no useful stems to classify
            edge_weight = 0
            alias_coeff = 0
            if long_doc_len == 0 or short_doc_len == 0:
                pass
            else:
                alias_coeff = float(long_doc_len) / short_doc_len
            
                edge_weight = (1 - jaccard_distance(stems_dict[doc_idx1],
                                           stems_dict[doc_idx2])) \
                           * alias_coeff
                           
            print alias_coeff, edge_weight
            
            # Still redundant, only for testing
            if (edge_weight > 0.3):
                cluster_stems = stems_dict[doc_idx1].intersection(
                               stems_dict[doc_idx2])
                try: 
                    g.node[doc_idx1]['cluster_stems']
                except KeyError:
                    g.node[doc_idx1]['cluster_stems'] = cluster_stems
                else:
                    for stem in cluster_stems:
                        g.node[doc_idx1]['cluster_stems'].add(stem)
                try: 
                    g.node[doc_idx2]['cluster_stems']
                except KeyError:
                    g.node[doc_idx2]['cluster_stems'] = cluster_stems
                else:
                    for stem in cluster_stems:
                        g.node[doc_idx2]['cluster_stems'].add(stem)
            
            # To be made more flexible
            if edge_weight > 0.3:
                g.add_edge(doc_idx1, doc_idx2, weight=edge_weight)
            doc_idx2 += 1
            pb.update(count)
            count += 1
            if doc_idx2 > docs_no:
                break
    
    print "Draw graph showing possible clusters  ..."
    
    elarge = [(u,v) for (u,v,d) in g.edges(data=True) if d['weight'] > 0.4]
    emedium = [(u,v) for (u,v,d) in g.edges(data=True) 
              if d['weight'] > 0.2 and d['weight'] < 0.4]
    esmall = [(u,v) for (u,v,d) in g.edges(data=True) if d['weight'] <= 0.2]
    print "elarge: ", len(elarge)
    print "emedium: ", len(emedium)
    print "esmall: ", len(esmall)
       
    pos = nx.spring_layout(g, scale=20)
    #pos = nx.random_layout(g)
    
    dlarge = [n for n,d in g.degree_iter() if d >= 20]
    dmedium = [n for n,d in g.degree_iter() if d > 1 and d < 20]
    dsmall = [n for n,d in g.degree_iter() if d == 1]
    dnone = [n for n,d in g.degree_iter() if d == 0]
    print "dlarge: ", len(dlarge)
    print "dmedium: ", len(dmedium)
    print "dsmall: ", len(dsmall)
    print "dnone: ", len(dnone)
    
    # Draw nodes
    # nx.draw_networkx_nodes(g, pos, node_size=5, linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dlarge, node_size=20,
                           node_color='b',
                           linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dmedium, node_size=10,
                           node_color='g',
                           alpha=0.8, 
                           linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dsmall, node_size=5,
                           node_color='b',
                           alpha=0.2,
                           linewidths=0,
                           )
    nx.draw_networkx_nodes(g, pos, nodelist=dnone, node_size=5,
                           node_color='b',
                           alpha=0.2, 
                           linewidths=0)
    
    # Draw edges
    nx.draw_networkx_edges(g, pos, edgelist=elarge, width=0.4)
    nx.draw_networkx_edges(g, pos, edgelist=emedium, edge_color='g', 
                           alpha=0.8, width=0.2)
    nx.draw_networkx_edges(g, pos, edgelist=esmall, width=0.1,
                           alpha=0.1, edge_color='b')
    
    # Draw labels
    # nx.draw_networkx_labels(g, pos, font_size=1, font_family='sans-serif')
    
    plt.axis('off')
    plt.figure(1, figsize=(20,20))
    """
    print "Print PNG"
    plt.savefig("graph.png", dpi=600)
    """
    # plt.show()
    nx.write_yaml(g, get_graph_file())
    d3_js.export_d3_js(g)
Example #32
def jaccard_distance_char(s1, s2):
    w1 = set(s1)
    w2 = set(s2)
    return jaccard_distance(w1, w2)
Example #33
def jaccard_distance_word(s1, s2):
    w1 = set(s1.split(" "))
    w2 = set(s2.split(" "))
    return jaccard_distance(w1, w2)
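# Comparison sketch (illustrative, not from the original source): the two
# helpers above can give quite different values because one compares character
# sets and the other word sets.
from nltk.metrics import jaccard_distance
a, b = "the cat sat on the mat", "the dog sat on the mat"
print(jaccard_distance_char(a, b))  # character-level distance
print(jaccard_distance_word(a, b))  # word-level distance: 2 of 6 distinct words differ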
Example #34
def nGrams(d,inputfile):
    text.configure(state="normal")
    text.delete('1.0',END)
    e = inputfile+"/"
    fi = open(d+inputfile+".txt",'r')
    d += e
    fn = open(d+"ngrams.txt",'w')
    fi.readline()
    fn.write("char2\tchar3\tchar4\tword1\tword2\tword3\tlemma1\tlemma2\tlemma3\tposgm1\tposgm2\tposgm3\n")
    for line in fi.readlines():
        sents = line.split("\t")
        words_1 = sents[0].split()
        words_2 = sents[1].split()
        if len(words_1) < 3:
            words_1.append(".")
        if len(words_2) < 3:
            words_2.append(".")
        char1_2 = set(ngrams(sents[0],2))
        char1_3 = set(ngrams(sents[0],3))
        char1_4 = set(ngrams(sents[0],4))
        char2_2 = set(ngrams(sents[1],2))
        char2_3 = set(ngrams(sents[1],3))
        char2_4 = set(ngrams(sents[1],4))
        word1_1 = set(ngrams(words_1,1))
        word1_2 = set(ngrams(words_1,2))
        word1_3 = set(ngrams(words_1,3))
        word2_1 = set(ngrams(words_2,1))
        word2_2 = set(ngrams(words_2,2))
        word2_3 = set(ngrams(words_2,3))
        sent1 = nltk.pos_tag(words_1)
        sent2 = nltk.pos_tag(words_2)
        nouns = ['NN','NNS','NNP','NNPS']
        adj = ['JJ','JJR','JJS']
        adv = ['RB','RBR','RBS']
        verbs = ['VB','VBG','VBN','VBZ','VBP','VBD']
        all_pos = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','RB','RBR','RBS','VB','VBG','VBN','VBZ','VBP','VBD']
        posgm1 = []
        posgm2 = []
        
        for s in sent1:
            posgm1.append(s[1])
        for s in sent2:
            posgm2.append(s[1])
        
        pos = []
        for s in sent1:
            if s[1] not in all_pos:
                pos.append((s[0],'v'))
            if s[1] in nouns:
                pos.append((s[0],'n'))
            if s[1] in adj:
                pos.append((s[0],'a'))
            if s[1] in adv:
                pos.append((s[0],'r'))
            if s[1] in verbs:
                pos.append((s[0],'v'))
        sent1 = pos
        pos = []
        for s in sent2:
            if s[1] not in all_pos:
                pos.append((s[0],'v'))
            if s[1] in nouns:
                pos.append((s[0],'n'))
            if s[1] in adj:
                pos.append((s[0],'a'))
            if s[1] in adv:
                pos.append((s[0],'r'))
            if s[1] in verbs:
                pos.append((s[0],'v'))
        sent2 = pos
        lemma1 = []
        for s in sent1:
            lemma1.append(lemmatizer.lemmatize(s[0],s[1]))
        lemma2 = []
        for s in sent2:
            lemma2.append(lemmatizer.lemmatize(s[0],s[1]))
        
        print(sents[0], sents[1])
        lemma1_1 = set(ngrams(lemma1,1))
        lemma1_2 = set(ngrams(lemma1,2))
        lemma1_3 = set(ngrams(lemma1,3))
        lemma2_1 = set(ngrams(lemma2,1))
        lemma2_2 = set(ngrams(lemma2,2))
        lemma2_3 = set(ngrams(lemma2,3))
        posgm1_1 = set(ngrams(posgm1,1))
        posgm1_2 = set(ngrams(posgm1,2))
        posgm1_3 = set(ngrams(posgm1,3))
        posgm2_1 = set(ngrams(posgm2,1))
        posgm2_2 = set(ngrams(posgm2,2))
        posgm2_3 = set(ngrams(posgm2,3))
        char2_jd = 1.0 - jaccard_distance(char1_2,char2_2)
        char3_jd = 1.0 - jaccard_distance(char1_3,char2_3)
        char4_jd = 1.0 - jaccard_distance(char1_4,char2_4)
        word1_jd = 1.0 - jaccard_distance(word1_1,word2_1)
        word2_jd = 1.0 - jaccard_distance(word1_2,word2_2)
        word3_jd = 1.0 - jaccard_distance(word1_3,word2_3)
        lemma1_jd = 1.0 - jaccard_distance(lemma1_1,lemma2_1)
        lemma2_jd = 1.0 - jaccard_distance(lemma1_2,lemma2_2)
        lemma3_jd = 1.0 - jaccard_distance(lemma1_3,lemma2_3)
        posgm1_jd = 1.0 - jaccard_distance(posgm1_1,posgm2_1)
        posgm2_jd = 1.0 - jaccard_distance(posgm1_2,posgm2_2)
        posgm3_jd = 1.0 - jaccard_distance(posgm1_3,posgm2_3)
        text.insert(INSERT,"%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t\n" % (sents[0],sents[1],char2_jd,char3_jd,char4_jd,word1_jd,word2_jd,word3_jd,lemma1_jd,lemma2_jd,lemma3_jd,posgm1_jd,posgm2_jd,posgm3_jd))
        fn.write("%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % (char2_jd,char3_jd,char4_jd,word1_jd,word2_jd,word3_jd,lemma1_jd,lemma2_jd,lemma3_jd,posgm1_jd,posgm2_jd,posgm3_jd))
        #print "LCS of ",sent1,sent2,"is: ",lcs(sent1,sent2)/(1.0 * min_len)
    fi.close()
    fn.close()
    text.configure(state="disabled")
Example #35
def compareVietNetAndOxford(dict_VietNet, dict_Oxford):

    for WORD in dict_Oxford:

        if len(dict_Oxford[WORD]) == 0:
            continue

        # if WORD == "BA":
        # print "holyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyy"

        wn_words = wn.synsets(WORD, pos="n")
        if wn_words == None:
            continue

        if WORD == "baby":
            a = 1

        if dict_VietNet.has_key(WORD):

            arr_VietNet = dict_VietNet[WORD]
            arr_Oxford = dict_Oxford[WORD]

            matrix_similarity = [[0 for x in range(len(arr_Oxford))] for x in range(len(wn_words))]

            for iWn in range(len(wn_words)):

                definitionWn = wn.synset(wn_words[iWn].name()).definition()

                vietNet = {}
                for iVietNet in arr_VietNet:

                    levenshtein_vn_wn = Util.levenshtein(arr_VietNet[iVietNet]["d"], definitionWn)

                    if levenshtein_vn_wn < len(definitionWn) / 2.0:
                        vietNet = arr_VietNet[iVietNet]
                        break

                if not vietNet.has_key("tv"):
                    vietNet["tv"] = ""

                viet_net_tv = vietNet["tv"]

                for iOxford in range(len(arr_Oxford)):
                    oxford = arr_Oxford[str(iOxford)]

                    vietNet_tv = viet_net_tv

                    if not oxford.has_key("tv"):
                        continue
                    oxford_tv = oxford["tv"].encode("utf-8")

                    vietNet_tv = vietNet_tv.replace(";", "")
                    oxford_tv = oxford_tv.replace(";", "")
                    oxford_tv = oxford_tv.replace(",", "")
                    oxford_tv = oxford_tv.replace("/", " ")

                    arr_tv_oxford = set(oxford_tv.split(" "))
                    arr_tv_vietnet = set(vietNet_tv.split(" "))

                    jaccard = jaccard_distance(arr_tv_oxford, arr_tv_vietnet)
                    print arr_tv_vietnet
                    print arr_tv_oxford
                    print jaccard
                    matrix_similarity[iWn][iOxford] = 0
                    if jaccard < 0.95:
                        matrix_similarity[iWn][iOxford] = 1

                matrix_similarity[iWn].insert(0, viet_net_tv + "<>" + definitionWn.encode("utf-8"))

            print matrix_similarity
            # - - - - - - - - - - - - - - - - - - - - - - - - -
            # col
            # for i in range(len(dict_VietNet[WORD])):
            #   matrix_similarity[i].insert(0,dict_VietNet[WORD][i]["tv"] + "<>" + dict_VietNet[WORD][i]["d"]);

            # - - - - - - - - - - - - - - - - - - - - - - - - -
            # row
            arrRowDict = []
            arrRowDict.append(WORD)
            for i in range(len(dict_Oxford[WORD])):
                if not dict_Oxford[WORD][str(i)].has_key("tv"):
                    dict_Oxford[WORD][str(i)]["tv"] = "-"
                if not dict_Oxford[WORD][str(i)].has_key("d"):
                    dict_Oxford[WORD][str(i)]["d"] = "-"
                if dict_Oxford[WORD][str(i)]["d"] == None:
                    dict_Oxford[WORD][str(i)]["d"] = "-"

                arrRowDict.append(
                    dict_Oxford[WORD][str(i)]["tv"].encode("utf-8")
                    + "<>"
                    + dict_Oxford[WORD][str(i)]["d"].encode("utf-8")
                )

            FileProcess.append_to_excel_file(
                "Results/parameters/VN_Ox/" + "compare_VN_Ox_2_2.1.csv", arrRowDict, matrix_similarity
            )
Example #36
    def calculate_jaccard(self, s0, s1):
        lemms_0 = set([a.lower() for a in s0 if a])
        lemms_1 = set([a.lower() for a in s1 if a])

        jaccard_simmilarity = (1 - jaccard_distance(lemms_0, lemms_1))
        return jaccard_simmilarity
Example #37
 def unordered_content_distance(self, sentence):
     """Jaccard distance on (unordered) content words between `self` and
     `sentence`."""
     return jaccard_distance(set(self.content_words),
                             set(sentence.content_words))
Example #38
def jaccard_dist(row):
    return jaccard_distance(set(str(row['question1'])),
                            set(str(row['question2'])))
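# Usage sketch (illustrative): apply the row-wise helper above to a small
# pandas DataFrame; note that set(str(...)) compares characters, not words.
import pandas as pd
df = pd.DataFrame({'question1': ['how are you'], 'question2': ['how old are you']})
print(df.apply(jaccard_dist, axis=1))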
Example #39
def getJaccardDistance(a, b):
    try:
        jd = jaccard_distance(set(a), set(b))
    except ZeroDivisionError:
        jd = 0
    return jd
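# Edge-case sketch (illustrative): NLTK's jaccard_distance raises
# ZeroDivisionError when both sets are empty; this wrapper maps that to 0.
print(getJaccardDistance("abc", "abd"))  # 0.5: {'c', 'd'} differ out of {'a', 'b', 'c', 'd'}
print(getJaccardDistance("", ""))        # 0 instead of a ZeroDivisionError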
Example #40
def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):


  if WORD == "bank":
    asf = 0;
  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  # print "dict-word_synsets"
  # print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001;
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in range(len(arr_p_word)):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data

  wn_words = dict_words;
  wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words,WORD);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  dict_words = wn.synsets(WORD, pos = 'n');
  # print wn_words;
  dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words);

  print "sysnets -----------------------.----.-----.--.-"

  # matrix for similarity dict_words vs wn_words
  matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];

      for dict_synset in dict_words_synsets[iDictWord]:

        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets
        countwnNouns = 0.00000001;

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:

          p_max = dict_synset.path_similarity(wn_synset);
          if p_max != None:
            arr_p.append(p_max);

          # print p_max
          # - - - - - - - - - - - - - - - - - - - - - - - -

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count +=1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity_reverse]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  dict_words = wn_words;
  wn_words = wn.synsets(WORD, pos = 'n');

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
      matrix_similarity[iWnWord][iDictWord] /= 2;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # @brief:
  #

  matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  for iWnWord in range(len(wn_words)):

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()));
    words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

    # words = nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition());
    # print words
    for i in range(len(words)):
      words[i] = wordnet_lemmatizer.lemmatize(words[i]);
    wn_set = set(words);
    # wn_set = set(wn.synset(wn_words[iWnWord].name()).definition().split())
    # print wn_set

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] == None:
        matrix_similarity_jaccard[iWnWord][iDictWord] = 1;
        continue

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]));
      words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

      # words = nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]);
      # print words
      for i in range(len(words)):
        words[i] = wordnet_lemmatizer.lemmatize(words[i]);
      dict_set = set(words);
      # print
      # dict_set = set(dict_words[str(iDictWord)]["d"].encode('utf8').split());
      matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set,dict_set);


  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
      matrix_similarity[iWnWord][iDictWord] /= 12;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)
Example #41
 def uc_distance(self, sentence):
     """Jaccard distance on (unordered) content lemmas between `self` and
     `sentence`."""
     return jaccard_distance(set(self.content_lemmas),
                             set(sentence.content_lemmas))
Example #42
def _jaccard(sent1, sent2):
    sent1 = set(sent1)
    sent2 = set(sent2)
    return jaccard_distance(sent1, sent2)