Пример #1
0
 def path_similarity(self, word1, word2):
     """Accumulate WordNet path similarity over all synset pairs of two words.

     Every synset of ``word1`` is compared with every synset of ``word2``;
     the similarities are summed and the total is appended to ``self.path``.
     Pairs for which ``wn.path_similarity`` yields ``None`` (or 0) add
     nothing to the total.

     Args:
         word1: First word to look up in WordNet.
         word2: Second word to look up in WordNet.

     Returns:
         None. The result is recorded by appending to ``self.path``.
     """
     score = 0
     # Query the second word once instead of once per synset of the first.
     synsets2 = wn.synsets(word2)
     for syn1 in wn.synsets(word1):
         for syn2 in synsets2:
             # Compute the similarity once and reuse it.
             sim = wn.path_similarity(syn1, syn2)
             if sim:
                 score += sim
     self.path.append(score)
Пример #2
0
def checkFirstSentence_test5(paragraphID, paragraph):
    """Report whether the first sentence relates to the paragraph's tokens.

    The token list for ``paragraphID`` is taken from ``cache`` and, on a
    miss, obtained from ``getParagraphTokenIntersectionByID`` and stored.
    The check succeeds when a token occurs verbatim in ``paragraph[0]``, or
    when a word of ``paragraph[0]`` that has the same part of speech as a
    token has a WordNet path similarity to that token above 0.13.

    Args:
        paragraphID: Key used for ``cache``, ``POSCache`` and ``getSynset``.
        paragraph: Indexable whose first element is the first sentence
            (a string).

    Returns:
        True when a match is found, otherwise False.
    """
    if paragraphID in cache:
        tokens = cache[paragraphID]
    else:
        tokens = getParagraphTokenIntersectionByID(paragraphID)
        cache[paragraphID] = tokens

    # Cheap test first: literal occurrence of any token.
    for token in tokens:
        if token in paragraph[0]:
            return True

    # Fall back to a semantic comparison between same-POS words.
    for token in tokens:
        tokenSynset = getSynset(token, paragraphID)
        if not tokenSynset:
            continue
        partOfSpeechMap = POSCache[paragraphID]
        tokenPos = partOfSpeechMap[token]
        for word in paragraph[0].split(" "):
            word = word.strip().lower()
            if word not in partOfSpeechMap:
                continue
            if partOfSpeechMap[word] != tokenPos:
                continue
            wordSynset = getSynset(word, paragraphID)
            if not wordSynset:
                continue
            # Evaluate the similarity once; it is None when no path exists.
            similarity = wn.path_similarity(tokenSynset, wordSynset)
            if similarity and similarity > 0.13:
                return True
    return False
Пример #3
0
    def test_path_similarities(self):
        """Check that our similarity measures agree with NLTK's.

        Compares path, Wu-Palmer and Leacock-Chodorow similarity of
        'cat.n.1' against 'dog.n.1' and then against 'bus.n.1'.
        """
        from nltk.corpus import wordnet as nltk_wn

        names = ('cat.n.1', 'dog.n.1', 'bus.n.1')
        # Resolve every synset up front, NLTK's first and then ours.
        nltk_cat, nltk_dog, nltk_bus = [nltk_wn.synset(n) for n in names]
        our_cat, our_dog, our_bus = [our_wn.synset(n) for n in names]

        metrics = ('path_similarity', 'wup_similarity', 'lch_similarity')
        partners = ((nltk_dog, our_dog), (nltk_bus, our_bus))
        for nltk_other, our_other in partners:
            for metric in metrics:
                expected = getattr(nltk_wn, metric)(nltk_cat, nltk_other)
                actual = getattr(our_wn, metric)(our_cat, our_other)
                assert expected == actual
Пример #4
0
def get_synonym(tok_details):
    """Attach a synset to each token and flag tokens that have a synonym.

    Each entry of ``tok_details`` is a mutable list that is updated in place:
    index 0 is the word, index 1 a POS tag whose first character is mapped
    through ``Dic_pos`` (``None`` when unmapped), index 3 a numeric flag.
    The first synset for the word/POS (or ``None``) is appended to every
    entry.
    NOTE(review): the second pass reads the appended synset at index 4, so
    entries are presumably four elements long on entry; confirm with callers.

    Entries whose flag is 0 are switched to 2 when another entry's synset has
    a path similarity of at least 1 with theirs.

    Args:
        tok_details: List of per-token lists as described above.

    Returns:
        The same ``tok_details`` list, modified in place.
    """
    count = 0
    for tok in tok_details:
        tag_initial = tok[1][0]
        tok[1] = Dic_pos[tag_initial] if tag_initial in Dic_pos else None
        # Query WordNet once per token and reuse the result.
        synsets = wn.synsets(tok[0], tok[1])
        tok.append(synsets[0] if synsets else None)
        if tok[3] == 1:
            count += 1
    print(count)

    for i, tok in enumerate(tok_details):
        if tok[4] is None or tok[3] != 0:
            continue
        for j, other in enumerate(tok_details):
            if i == j or other[4] is None:
                continue
            # path_similarity is None when the synsets are not connected.
            similarity = wn.path_similarity(tok[4], other[4])
            if similarity is not None and similarity >= 1:
                tok[3] = 2
                print(tok, other)
                print(i, j)
                break
    return tok_details
Пример #5
0
def getSimilarity(w1, w2, method='max'):
    """Return an aggregate WordNet path similarity between two words.

    All synsets of ``w1`` are compared with all synsets of ``w2``. A pair
    without a connecting path (``path_similarity`` returns ``None``) counts
    as similarity 0.

    Args:
        w1: First word.
        w2: Second word.
        method: ``'max'`` for the best pairwise similarity, ``'mean'`` for
            the average over all pairs.

    Returns:
        The aggregated similarity, or 0 when either word has no synsets.

    Raises:
        ValueError: If ``method`` is neither ``'max'`` nor ``'mean'``.
    """
    if method not in ('max', 'mean'):
        raise ValueError("method must be 'max' or 'mean', got %r" % (method,))

    meanings_1 = wn.synsets(w1)
    meanings_2 = wn.synsets(w2)

    # Normalising None to 0 keeps max()/sum() valid on Python 3.
    similarities = [
        wn.path_similarity(m1, m2) or 0
        for m1 in meanings_1
        for m2 in meanings_2
    ]
    if not similarities:
        return 0

    if method == 'max':
        return max(similarities)
    return float(sum(similarities)) / len(similarities)
Пример #6
0
def similarity_by_path(sense1, sense2, option="path"):
  """Return a path-based WordNet similarity between two senses.

  Args:
    sense1: First synset.
    sense2: Second synset.
    option: Measure name, case-insensitive: "path"/"path_similarity",
      "wup"/"wupa"/"wu-palmer", or "lch"/"leacock-chodorow" (the historical
      spelling "leacock-chordorow" is still accepted).

  Returns:
    For "path": the larger of the two directional path similarities, or
    None when neither direction has a path. For Wu-Palmer: whatever
    ``wn.wup_similarity`` returns. For Leacock-Chodorow: 0 when the senses
    have different parts of speech, otherwise ``wn.lch_similarity``.
    None for an unrecognised option.
  """
  metric = option.lower()
  if metric in ("path", "path_similarity"):  # Path similarity
    # Score both directions (the original scored the same one twice) and
    # ignore None so max() stays valid on Python 3.
    scores = [
      score
      for score in (wn.path_similarity(sense1, sense2),
                    wn.path_similarity(sense2, sense1))
      if score is not None
    ]
    return max(scores) if scores else None
  if metric in ("wup", "wupa", "wu-palmer"):  # Wu-Palmer
    return wn.wup_similarity(sense1, sense2)
  if metric in ("lch", "leacock-chordorow", "leacock-chodorow"):  # Leacock-Chodorow
    # ``pos`` is a method in NLTK 3 but was an attribute in older releases.
    pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
    pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
    if pos1 != pos2:  # lch can't do diff POS
      return 0
    return wn.lch_similarity(sense1, sense2)
  return None
def wnsensesim(synset1, synset2, metric):
    """Return the WordNet similarity of two synsets for the given metric.

    Prints both synsets, then prints and returns the similarity. Only
    ``'path_similarity'`` is supported; any other metric prints a notice.

    Args:
        synset1: First synset.
        synset2: Second synset.
        metric: Name of the similarity measure.

    Returns:
        The result of ``wn.path_similarity`` for ``'path_similarity'``,
        otherwise None.
    """
    # print(...) with one argument behaves the same on Python 2 and 3.
    print("synset1:%r" % (synset1,))
    print("synset2:%r" % (synset2,))
    if metric == 'path_similarity':
        similarity = wn.path_similarity(synset1, synset2)
        print(similarity)
        return similarity
    # add more similarity measures e.g., jcn
    print("Unsupported wn similarity measure requested")
    return None
Пример #8
0
def similarity_by_path(sense1, sense2, option="path"):
    """Return a path-based WordNet similarity between two senses.

    Args:
        sense1: First synset.
        sense2: Second synset.
        option: Measure name, case-insensitive: "path"/"path_similarity",
            "wup"/"wupa"/"wu-palmer", or "lch"/"leacock-chodorow" (the
            historical spelling "leacock-chordorow" is still accepted).

    Returns:
        For "path": the larger of the two directional path similarities, or
        None when neither direction has a path. For Wu-Palmer: whatever
        ``wn.wup_similarity`` returns. For Leacock-Chodorow: 0 when the
        senses have different parts of speech, otherwise
        ``wn.lch_similarity``. None for an unrecognised option.
    """
    metric = option.lower()
    if metric in ("path", "path_similarity"):  # Path similarity
        # Score both directions (the original scored the same one twice)
        # and ignore None so max() stays valid on Python 3.
        forward = wn.path_similarity(sense1, sense2)
        backward = wn.path_similarity(sense2, sense1)
        scores = [s for s in (forward, backward) if s is not None]
        return max(scores) if scores else None
    if metric in ("wup", "wupa", "wu-palmer"):  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)
    if metric in ("lch", "leacock-chordorow", "leacock-chodorow"):
        # ``pos`` is a method in NLTK 3 but an attribute in older releases.
        pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
        pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
        if pos1 != pos2:  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
    return None
def most_similar_path(synsets_dict, verb):
    """Return the key of ``synsets_dict`` whose synset is closest to ``verb``.

    ``verb`` is represented by its first WordNet verb synset, and closeness
    is measured with ``wn.path_similarity``.

    Args:
        synsets_dict: Mapping from a name to a synset.
        verb: Word to compare against.

    Returns:
        The best-matching key, or an empty string when ``verb`` has no verb
        synset or no entry has a path to it.
    """
    verb_synsets = wn.synsets(verb, pos=wn.VERB)
    if not verb_synsets:
        # Nothing to compare against; use the same "no match" value.
        return str()
    verb_synset = verb_synsets[0]

    best_similarity = -1
    most_similar = str()
    for candidate, synset in synsets_dict.items():
        # None means "no path"; comparing it to a number fails on Python 3.
        similarity = wn.path_similarity(synset, verb_synset)
        if similarity is not None and similarity > best_similarity:
            best_similarity = similarity
            most_similar = candidate

    return most_similar
Пример #10
0
def similarity_by_path(sense1, sense2, option="path"):
  """Return a path-based WordNet similarity between two senses.

  Args:
    sense1: First synset.
    sense2: Second synset.
    option: Measure name, case-insensitive: "path"/"path_similarity",
      "wup"/"wupa"/"wu-palmer", or "lch"/"leacock-chodorow" (the historical
      spelling "leacock-chordorow" is still accepted).

  Returns:
    For "path": the larger of the two directional path similarities, or
    None when neither direction has a path. For Wu-Palmer: whatever
    ``wn.wup_similarity`` returns. For Leacock-Chodorow: 0 when the senses
    have different parts of speech, otherwise ``wn.lch_similarity``.
    None for an unrecognised option.
  """
  metric = option.lower()
  if metric in ("path", "path_similarity"):  # Path similarity
    # Score both directions (the original scored the same one twice) and
    # ignore None so max() stays valid on Python 3.
    candidates = (wn.path_similarity(sense1, sense2),
                  wn.path_similarity(sense2, sense1))
    scores = [score for score in candidates if score is not None]
    return max(scores) if scores else None
  if metric in ("wup", "wupa", "wu-palmer"):  # Wu-Palmer
    return wn.wup_similarity(sense1, sense2)
  if metric in ("lch", "leacock-chordorow", "leacock-chodorow"):  # Leacock-Chodorow
    # ``pos`` is a method in NLTK 3 but was an attribute in older releases.
    pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
    pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
    if pos1 != pos2:  # lch can't do diff POS
      return 0
    return wn.lch_similarity(sense1, sense2)
  # The Lin-similarity return that used to follow the lch branch was
  # unreachable and has been dropped; unknown options yield None as before.
  return None
Пример #11
0
    def find_exercise(self, sentence, dist_th=0.2):
        """Pick a word of ``sentence`` related to sport, exercise or practice.

        Nouns and verbs (tags containing 'NN' or 'VB') from
        ``self.postag(sentence)`` are scored by their highest path similarity
        (``simulate_root=False``) to the first synset of 'sport', 'exercise'
        and 'practice'. Words containing 'sport', 'exe' or 'practi' are
        skipped. Words scoring above ``dist_th`` become candidates. A
        sentence with exactly one tagged token yields that token.

        Returns:
            The word of the lowest-sorting ``[score, word]`` candidate, or
            None when there is no candidate.
        """
        anchors = [
            wn.synsets(name)[0] for name in ('sport', 'exercise', 'practice')
        ]

        tagged = self.postag(sentence)
        candidates = []

        if len(tagged) > 1:
            for word, tag in tagged:
                if any(stem in word for stem in ('sport', 'exe', 'practi')):
                    continue
                if 'NN' not in tag and 'VB' not in tag:
                    continue

                senses = wn.synsets(word)
                scores = []
                for anchor in anchors:
                    for sense in senses:
                        value = wn.path_similarity(sense,
                                                   anchor,
                                                   simulate_root=False)
                        if value is not None:
                            scores.append(value)

                best = max(scores) if len(scores) > 0 else 0
                if best > dist_th:
                    candidates.append([best, word])
        elif len(tagged) == 1:
            candidates.append((1.0, tagged[0][0]))

        if len(candidates) == 0:
            return None
        # The smallest entry is what sorting and taking index 0 would give.
        return min(candidates)[1]
Пример #12
0
def get_relation(syn1,syn2,sim_metric):
    """Return a WordNet similarity score between two synsets.

    Args:
        syn1: First synset.
        syn2: Second synset.
        sim_metric: "path", "lch" or "wup".

    Returns:
        The similarity score, or 0 when the measure yields nothing (no
        connecting path, or different parts of speech for "lch").

    Raises:
        ValueError: If ``sim_metric`` is not one of the supported names.
    """
    if sim_metric == "path":
        # https://stackoverflow.com/questions/20075335/is-wordnet-path-similarity-commutative
        # Take the smaller of the two directions. Uses the synset method,
        # like the other branches, instead of the corpus-reader wrapper.
        forward = syn1.path_similarity(syn2)
        backward = syn2.path_similarity(syn1)
        if forward is None or backward is None:
            # min() cannot order None against a number on Python 3.
            sim_score = 0
        else:
            sim_score = min(forward, backward)
    elif sim_metric == "lch":
        # Leacock-Chodorow is only defined within one part of speech.
        sim_score = syn1.lch_similarity(syn2) if syn1.pos() == syn2.pos() else 0
    elif sim_metric == "wup":
        sim_score = syn1.wup_similarity(syn2)
    else:
        raise ValueError("unsupported similarity metric: %r" % (sim_metric,))
    return sim_score if sim_score else 0
Пример #13
0
def get_relation(syn1, syn2, sim_metric):
    """Return a WordNet similarity score between two synsets.

    Args:
        syn1: First synset.
        syn2: Second synset.
        sim_metric: "path", "lch" or "wup".

    Returns:
        The similarity score, or 0 when the measure yields nothing (no
        connecting path, or different parts of speech for "lch").

    Raises:
        ValueError: If ``sim_metric`` is not one of the supported names.
    """
    if sim_metric == "path":
        # https://stackoverflow.com/questions/20075335/is-wordnet-path-similarity-commutative
        # Take the smaller of the two directions. Uses the synset method,
        # like the other branches, instead of the corpus-reader wrapper.
        directions = (syn1.path_similarity(syn2), syn2.path_similarity(syn1))
        if any(score is None for score in directions):
            # min() cannot order None against a number on Python 3.
            sim_score = 0
        else:
            sim_score = min(directions)
    elif sim_metric == "lch":
        # Leacock-Chodorow is only defined within one part of speech.
        if syn1.pos() == syn2.pos():
            sim_score = syn1.lch_similarity(syn2)
        else:
            sim_score = 0
    elif sim_metric == "wup":
        sim_score = syn1.wup_similarity(syn2)
    else:
        raise ValueError("unsupported similarity metric: %r" % (sim_metric,))
    return sim_score if sim_score else 0
Пример #14
0
def domain_range_measure(q_type, p_domain, p_range):
    """Score how well a question type matches a property's domain and range.

    The coarse question-type codes HUM, NUM, ENTY, DESC, ABBR and LOC are
    first mapped to plain words (DESC maps to the empty string); any other
    value is used unchanged.

    Args:
        q_type: Question type code or word.
        p_domain: Domain label of the property.
        p_range: Range label of the property.

    Returns:
        1 when the type equals both domain and range, 0.75 when it equals
        exactly one of them. Otherwise the best WordNet path similarity
        between the type and either label, promoted to 1 when both bests are
        equal and above 0.75.
    """
    type_words = {
        'HUM': "person",
        'NUM': "number",
        'ENTY': "entity",
        'DESC': "",
        'ABBR': "abbreviation",
        'LOC': "location",
    }
    q_type = type_words.get(q_type, q_type)

    matches_domain = q_type == p_domain
    matches_range = q_type == p_range
    if matches_domain and matches_range:
        return 1
    if matches_domain or matches_range:
        return 0.75

    f_syns = wn.synsets(q_type)

    def best_similarity(label_syns):
        """Return the highest path similarity between f_syns and label_syns."""
        best = 0.0
        for f in f_syns:
            for s in label_syns:
                sim = wn.path_similarity(f, s)
                # Skip unconnected pairs explicitly. The old bare ``except``
                # stopped the whole search at the first None on Python 3.
                if sim is not None and sim > best:
                    best = sim
        return best

    path_sim_d = best_similarity(wn.synsets(p_domain))
    path_sim_r = best_similarity(wn.synsets(p_range))

    if path_sim_d == path_sim_r and path_sim_d > 0.75:
        return 1
    return max(path_sim_d, path_sim_r)
Пример #15
0
def sentence_similarity(wordSense1, wordSense2, similarity_metric = 'path'):
    '''
    Average a WordNet similarity over all sense pairs of two sentences.

    Parameters:
        wordSense1 (list): senses extracted from the first sentence.
        wordSense2 (list): senses extracted from the second sentence.
        similarity_metric (str): 'path' for path similarity (the default) or
            'lcs' for Wu-Palmer similarity.
            See http://www.nltk.org/howto/wordnet.html

    Return:
        The summed similarity divided by the number of pairs (float), or 0
        when either list is empty. Pairs with no similarity add nothing to
        the sum but still count as pairs.

    Raises:
        ValueError: if both lists are non-empty and the metric is unknown.
    '''
    if len(wordSense1) == 0 or len(wordSense2) == 0:
        return 0

    # Select the measure once; it cannot change between pairs.
    if similarity_metric == 'path':
        measure = wn.path_similarity
    elif similarity_metric == 'lcs':
        measure = wn.wup_similarity
    else:
        raise ValueError('ERROR: given similarity metric is not defined.')

    accumulated = 0.0
    pair_count = 0.0
    for first in wordSense1:
        for second in wordSense2:
            pair_count += 1.0
            value = measure(first, second)
            if value:
                accumulated += value

    return accumulated / pair_count
Пример #16
0
 def disambiguateWordSenses(self,sentence,word):
     """Pick a sense of ``word`` for ``sentence`` and look up its scores.

     For every synset of ``word`` and every token of ``sentence``, the path
     similarities between that synset and the token's synsets are summed;
     the synset with the highest per-token sum wins. The winner's
     ``(pos, offset)`` is looked up in ``self.db`` for a
     ``(pos_score, neg_score)`` pair.

     Returns:
         Tuple ``(obj, pos, pos_score, neg_score)``, where ``obj`` is
         ``1.0 - (pos_score + neg_score)``. When no sense is chosen the
         result is ``(1.0, None, 0.0, 0.0)``.
     """
     top_score = 0.0
     chosen = None
     for candidate in wn.synsets(word):
         for token in nltk.word_tokenize(sentence):
             running = 0.0
             for token_sense in wn.synsets(token):
                 sim = wn.path_similarity(token_sense, candidate)
                 if sim is None:
                     continue
                 running += sim
                 if running > top_score:
                     top_score = running
                     chosen = candidate

     if not chosen:
         return 1.0, None, 0.0, 0.0

     pos = chosen.pos()
     key = (pos, chosen.offset())
     if key in self.db:
         pos_score, neg_score = self.db[key]
     else:
         pos_score, neg_score = 0.0, 0.0
     obj = 1.0 - (pos_score + neg_score)
     return obj, pos, pos_score, neg_score
def semantic_similarity(word1, word2, speech, measure):
  """
  Return the highest similarity over all sense pairs of two words.

  :param word1: First word in the pair of words
  :param word2: Second word in the pair of words
  :param speech: part of speech passed to the synset lookup, e.g. wn.NOUN
  :param measure: similarity measure ("path" = path ; "res" = Resnik ; "lin" = Lin)
  :return: The highest score found, or 0 when no pair produces a score
  :raises ValueError: if measure is not one of the three supported names
  """
  if measure not in ("path", "res", "lin"):
    raise ValueError("Not a valid similarity type \n Must be 'path'(path), 'res'(Resnik) or 'lin'(Lin)")

  def score(concept_a, concept_b):
    # Dispatch on the already-validated measure name.
    if measure == "path":
      return wn.path_similarity(concept_a, concept_b)
    if measure == "res":
      return wn.res_similarity(concept_a, concept_b, brown_ic)
    return wn.lin_similarity(concept_a, concept_b, brown_ic)

  best = 0
  senses_a = wn.synsets(word1, speech)
  senses_b = wn.synsets(word2, speech)
  for sense_a in senses_a:
    for sense_b in senses_b:
      value = score(sense_a, sense_b)
      # A pair without a score is skipped.
      if value is not None and value > best:
        best = value
  return best
Пример #18
0
    def distance_between_pairs(self, lemma_i, lemma_j, pos_i, pos_j):
        '''Computes path distance between a pair of words

        The distance is the reciprocal of the highest WordNet path
        similarity over all synset pairs of the two lemmas.

        Args:
          lemma_i: i-th word lemma.
          lemma_j: j-th word lemma.
          pos_i: i-th word part of speech tag.
          pos_j: j-th word part of speech tag.

        Returns:
          The minimal distance in the WordNet lexical tree d_path(i,j), or
          None when a tag has no WordNet equivalent, a lemma has no synsets,
          or no synset pair has a positive similarity.
        '''
        pos2wnpos = constants.pos2wnpos
        if pos_i not in pos2wnpos or pos_j not in pos2wnpos:
            return None

        # Look each lemma up once and reuse the lists in the pairing loop.
        synsets_i = wn.synsets(lemma_i, pos=pos2wnpos[pos_i],
                               lang=self.iso_lang)
        if not synsets_i:
            return None
        synsets_j = wn.synsets(lemma_j, pos=pos2wnpos[pos_j],
                               lang=self.iso_lang)
        if not synsets_j:
            return None

        max_similarity = 0.
        # TODO: consider language, maybe use other type of similarity
        for i_synset in synsets_i:
            for j_synset in synsets_j:
                pair_sim = wn.path_similarity(i_synset, j_synset)
                if pair_sim and pair_sim > max_similarity:
                    max_similarity = pair_sim

        if max_similarity == 0.:
            return None

        return 1. / max_similarity
Пример #19
0
def return_relationship_matrix(sentence1, sentence2, posGroup):
    """Build a similarity matrix between the words of two sentences.

    Row ``r`` / column ``c`` holds the score for ``sentence1[r]`` versus
    ``sentence2[c]``.

    * NOUNS / VERBS: WordNet path similarity of the two ``wordSense`` values
      (may be None).
    * SINGULARS: 1 when the lower-cased ``word`` values are equal, else 0.
    * Any other group: an empty list.
    """
    if posGroup == NOUNS or posGroup == VERBS:
        # Indirect measurement through the senses attached to each word.
        return [
            [
                wn.path_similarity(word_A.wordSense, word_B.wordSense)
                for word_B in sentence2
            ]
            for word_A in sentence1
        ]

    if posGroup == SINGULARS:
        # Direct, case-insensitive string match.
        return [
            [
                1 if word_A.word.lower() == word_B.word.lower() else 0
                for word_B in sentence2
            ]
            for word_A in sentence1
        ]

    return []
    def disambiguate_word_senses(self, sentence, word):
        """
        Guess which sense of ``word`` is meant in ``sentence``.

        For each synset of the word and each token of the sentence, the path
        similarities between that synset and all synsets of the token are
        summed; the synset with the largest per-token sum is selected.

        Args:
            sentence: String representation of the sentence
            word: String representation of the word

        Returns:
            The selected synset, or None when no sum is greater than zero.

        Example:
            disambiguate_word_senses('A cat is a good pet', 'cat')
        """
        top_score, chosen = 0.0, None
        for candidate in wn.synsets(word):
            for token in nltk.word_tokenize(sentence):
                token_score = 0.0
                for token_sense in wn.synsets(token):
                    sim = wn.path_similarity(token_sense, candidate)
                    if sim is not None:
                        token_score += sim
                if token_score > top_score:
                    top_score, chosen = token_score, candidate
        return chosen
Пример #21
0
    def disambiguate_word_senses(self, sentence, word):
        """
        Guess which sense of ``word`` is meant in ``sentence``.

        Each synset of the word is scored against every token of the
        sentence by summing its path similarity to all synsets of that
        token. The synset achieving the largest single-token sum is returned.

        Args:
            sentence: String representation of the sentence
            word: String representation of the word

        Returns:
            The selected synset, or None when no sum is greater than zero.

        Example:
            disambiguate_word_senses('A cat is a good pet', 'cat')
        """
        best_sum = 0.0
        best_sense = None
        for sense in wn.synsets(word):
            for token in nltk.word_tokenize(sentence):
                total = 0.0
                for other in wn.synsets(token):
                    similarity = wn.path_similarity(other, sense)
                    if similarity is None:
                        continue
                    total += similarity
                if total > best_sum:
                    best_sum = total
                    best_sense = sense
        return best_sense
Пример #22
0
def get_best_synset_pair(word_1, word_2):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.

    Only noun synsets are considered, and every WordNet call is made while
    holding ``lock``.

    Returns:
        A ``(synset_1, synset_2)`` tuple, or ``(None, None)`` when either
        word has no noun synsets or no pair has a connecting path.
    """
    with lock:
        synsets_1 = wn.synsets(word_1, pos=wn.NOUN)
        synsets_2 = wn.synsets(word_2, pos=wn.NOUN)

    if not synsets_1 or not synsets_2:
        return None, None

    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            with lock:
                sim = wn.path_similarity(synset_1, synset_2)
            # None means "no path"; comparing it to a float fails on
            # Python 3, so such pairs are skipped explicitly.
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair
Пример #23
0
def intersection(h, ref, wordnetTest=True):
    """Count how many items of ``h`` are matched by items of ``ref``.

    Exact matches consume one occurrence of the item in ``ref``. When
    ``wordnetTest`` is true, a non-tuple item that does not occur in ``ref``
    at all is also compared, via the first WordNet synset on each side, with
    every non-tuple item of ``ref``. Each comparison with a path similarity
    above 0.9 adds one to the count.

    Args:
        h: Iterable of hypothesis items (strings or tuples).
        ref: Iterable of reference items.
        wordnetTest: Enable the WordNet fallback for unmatched items.

    Returns:
        The number of matches.
    """
    def as_text(token):
        # Byte strings (Python 2 ``str``) are decoded; text is passed through.
        return token.decode("utf-8") if isinstance(token, bytes) else token

    refmap = {}
    for word in ref:
        refmap[word] = refmap.get(word, 0) + 1

    i = 0
    for run in h:
        if run in refmap:
            if refmap[run] > 0:
                i += 1
                refmap[run] -= 1
            continue
        if not wordnetTest or isinstance(run, tuple):
            continue

        # Use wordnet to match
        synsets = wordnet.synsets(as_text(run))
        if not synsets:
            continue
        for word in refmap:
            if isinstance(word, tuple):
                # Tuple entries have no WordNet lookup.
                continue
            s = wordnet.synsets(as_text(word))
            if not s:
                continue
            similarity = wordnet.path_similarity(synsets[0], s[0])
            # None (no path) cannot be compared with a float on Python 3.
            if similarity is not None and similarity > 0.9:
                i += 1
                refmap[word] -= 1
    return i
 def semantic_diff(a: object, b: object):
     """
     Return the semantic difference (1 - WordNet path similarity) of two terms.

     Results are memoised in ``DiffDataFrame.semantic_diff_dict`` under both
     argument orders, so a pair is only computed once. The synsets to compare
     are read from ``DiffDataFrame.sysnset_dict``.

     :param a: first term
     :type a: str or float for NaN value
     :param b: second term
     :type b: str or float for NaN value
     :return: 0 when both terms are NaN or the terms are equal, infinity when
         exactly one term is NaN, otherwise 1 - path similarity
     :rtype float
     """
     a_missing = isinstance(a, float) and np.isnan(a)
     b_missing = isinstance(b, float) and np.isnan(b)
     if a_missing and b_missing:
         return 0
     if a_missing or b_missing:
         return np.inf
     if a == b:
         return 0

     memo = DiffDataFrame.semantic_diff_dict
     # Either argument order may already have been stored.
     for key in ((a, b), (b, a)):
         if key in memo:
             return memo[key]

     similarity = wn.path_similarity(DiffDataFrame.sysnset_dict[a],
                                     DiffDataFrame.sysnset_dict[b])
     memo[(a, b)] = 1 - similarity
     memo[(b, a)] = 1 - similarity
     return 1 - similarity
def word_probability(word, tag, hashtag):
    """Estimate a smoothed weight of ``word`` for ``hashtag``.

    Each (word, tag) entry in the hashtag's bag of words contributes the
    squared path similarity between the first synset of ``word`` and the
    first synset of that entry, so similar words count as partial
    occurrences. One is added to the total, which is then divided by
    ``hashtag_frequency[hashtag] + unique_word_count``.

    Args:
        word: Word to score.
        tag: WordNet POS tag for ``word``; an empty string disables the
            similarity counting.
        hashtag: Key into ``bags_of_words`` and ``hashtag_frequency``.

    Returns:
        The resulting value.
    """
    hashtag_words = bags_of_words[hashtag]
    weighted_count = 0

    senses = wn.synsets(word, pos=tag) if tag != "" else []
    if len(senses) != 0:
        primary = senses[0]
        for training_word, training_tag in hashtag_words:
            # NOTE: only the first sense on each side is compared; this could
            # be made more thorough at some performance cost.
            training_senses = wn.synsets(training_word, pos=training_tag)
            if len(training_senses) == 0:
                continue
            similarity = wn.path_similarity(primary, training_senses[0])
            if similarity is not None:
                weighted_count += similarity * similarity

    numerator = 1 + weighted_count
    return numerator / (hashtag_frequency[hashtag] + unique_word_count)
Пример #26
0
 def __similarity(self, word, compareto):
     """Return a similarity score between two synsets.

     Jiang-Conrath similarity (using ``self.wordnet_ic``) is tried first;
     if that call raises, WordNet path similarity is used instead.

     Returns:
         The score, or None when the score is -1 (no path found) or the
         fallback itself returns None.
     """
     try:
         score = word.jcn_similarity(compareto, self.wordnet_ic, True)
     except Exception:
         # Deliberate best-effort fallback. ``Exception`` rather than a bare
         # ``except`` so KeyboardInterrupt/SystemExit still propagate.
         score = wordnet.path_similarity(word, compareto)
     if score == -1:
         score = None  # No path between the words was found
     return score
Пример #27
0
def get_best_synset_pair(word_1, word_2):
    """Return the synset pair of two words with the highest path similarity.

    Returns:
        ``(synset_1, synset_2)`` for the best-scoring pair, or
        ``(None, None)`` when either word has no synsets or no pair has a
        path similarity.
    """
    candidates_1 = wn.synsets(word_1)
    candidates_2 = wn.synsets(word_2)
    if not candidates_1 or not candidates_2:
        return None, None

    top_score = -1.0
    winner = (None, None)
    for left in candidates_1:
        for right in candidates_2:
            score = wn.path_similarity(left, right)
            # Pairs without a path (None) are ignored.
            if score is not None and score > top_score:
                top_score = score
                winner = (left, right)

    # Return the most similar pair among the synsets of the two words.
    return winner
Пример #28
0
def similarity_score(s1, s2):
    """Return the mean best path similarity from ``s1`` to ``s2``.

    For each synset in ``s1`` the highest path similarity to any synset in
    ``s2`` is taken; synsets with no comparable partner are ignored. The
    result is the average of those best values.

    Args:
        s1: Iterable of synsets.
        s2: Iterable of synsets.

    Returns:
        The normalised score, or 0.0 when no synset of ``s1`` has a
        comparable synset in ``s2``.
    """
    best_scores = []
    for syn1 in s1:
        # Collect scores per synset. The original kept one list across all
        # synsets, so each "best" was the running maximum seen so far.
        scores = [
            score
            for score in (wn.path_similarity(syn1, syn2) for syn2 in s2)
            if score is not None
        ]
        if scores:
            best_scores.append(max(scores))

    if not best_scores:
        # Nothing comparable; avoid dividing by zero.
        return 0.0
    return sum(best_scores) / len(best_scores)
Пример #29
0
def sentence_similarity(sentence1, sentence2):
    """Score the similarity of two sentences with WordNet, as a percentage.

    Both sentences are tokenised and POS-tagged, each tagged word is mapped
    to a synset with ``tagged_to_synset`` and falsy results are dropped. For
    each synset of the first sentence the best path similarity to the second
    sentence's synsets is taken; the mean of those bests times 100 is
    returned.

    Returns 0 as soon as a synset of the first sentence has no similarity to
    any synset of the second, and also (after printing 'oops') when the
    first sentence yields no synsets.
    """
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    found1 = [tagged_to_synset(*pair) for pair in tagged1]
    found2 = [tagged_to_synset(*pair) for pair in tagged2]
    synsets1 = [s for s in found1 if s]
    synsets2 = [s for s in found2 if s]

    total, matched = 0.0, 0
    for synset in synsets1:
        valid = []
        for other in synsets2:
            value = wn.path_similarity(synset, other)
            if value is not None:
                valid.append(value)
        if valid == []:
            return 0
        total += max(valid)
        matched += 1

    if matched == 0:
        print('oops')
        return 0
    return (total / matched) * 100
Пример #30
0
def syn(conw,candw):
    """Return the path similarity of the first "matching" synset pair.

    Synsets of the context word and the candidate word are compared by the
    last four characters of their name (for example ``'n.01'`` in
    ``'dog.n.01'``). The first pair with equal tags, scanning context
    synsets in the outer loop, is scored.
    NOTE(review): the tag includes the sense number, not only the part of
    speech; confirm whether POS-only matching was intended.

    Args:
        conw: Context word.
        candw: Candidate word.

    Returns:
        The path similarity of the first matching pair, or 0 when there is
        no match or no path.
    """
    def sense_tag(synset):
        # str(synset) looks like "Synset('dog.n.01')"; strip the wrapper and
        # keep the trailing four characters of the name.
        return str(synset)[8:-2][-4:]

    # find the synsets of the context word
    ssets = wn.synsets(conw)
    # synsets of the candidate word
    csets = wn.synsets(candw)

    # Real lists (not map objects) so the code also runs on Python 3.
    stags = [sense_tag(s) for s in ssets]
    ctags = [sense_tag(c) for c in csets]

    # Take the first pair whose tags match, without building every pair.
    match = next(
        ((k, l)
         for k, stag in enumerate(stags)
         for l, ctag in enumerate(ctags)
         if stag == ctag),
        None,
    )
    if match is None:
        return 0

    k, l = match
    similarity = wn.path_similarity(ssets[k], csets[l])
    return 0 if similarity is None else similarity
def spreadItemsIntoGroups(groupNames, groupItemList):
    """Assign each item to the group name(s) it is most similar to.

    For every item/group pair the best path similarity and the best
    Wu-Palmer similarity over all synset pairs are computed (missing scores
    count as 0); their product is the "mix" score. Per-item rankings are
    printed for inspection.

    Args:
        groupNames: Candidate group names.
        groupItemList: Items to distribute.

    Returns:
        A list with one ``(best_mix, item, [group names achieving best_mix])``
        tuple per item.
    """
    res = []
    for groupItem in groupItemList:
        # The item's synsets do not depend on the group; look them up once.
        groupItemSynsets = wordnet.synsets(groupItem)
        maxPsSims = []
        maxWupSims = []
        maxMixSims = []
        for groupName in groupNames:
            groupNameSynsets = wordnet.synsets(groupName)
            # Starting from 0 keeps words without synsets from crashing
            # max() on an empty list.
            bestPs = 0
            bestWup = 0
            for nameSyns, itemSyns in product(groupNameSynsets, groupItemSynsets):
                bestPs = max(bestPs, wordnet.path_similarity(nameSyns, itemSyns) or 0)
                bestWup = max(bestWup, wordnet.wup_similarity(nameSyns, itemSyns) or 0)
            maxPsSims.append((bestPs, groupItem, groupName))
            maxWupSims.append((bestWup, groupItem, groupName))
            maxMixSims.append((bestPs * bestWup, groupItem, groupName))
        print('     path:', sorted(maxPsSims, key=lambda item: item[0], reverse=True))
        print('     wup:', sorted(maxWupSims, key=lambda item: item[0], reverse=True))
        print('     mix:', sorted(maxMixSims, key=lambda item: item[0], reverse=True))
        print('path:', max(maxPsSims))
        print('wup:', max(maxWupSims))
        bestMix = max(maxMixSims)
        maxMix = bestMix[0]
        print('mix:', bestMix)
        res.append((maxMix, groupItem, [maxMixSim[2] for maxMixSim in maxMixSims if maxMixSim[0] == maxMix]))
    return res



#---------------------------------------------------------------------------------------------
     path: [(0.3333333333333333, 'Apple', 'Fruit'), (0.3333333333333333, 'Apple', 'Berry'), (0.25, 'Apple', 'Vegetable'), (0.2, 'Apple', 'Mushroom'), (0.125, 'Apple', 'Plant')]
     wup: [(0.9, 'Apple', 'Fruit'), (0.8571428571428571, 'Apple', 'Berry'), (0.8, 'Apple', 'Vegetable'), (0.75, 'Apple', 'Mushroom'), (0.6666666666666666, 'Apple', 'Plant')]
     mix: [(0.3, 'Apple', 'Fruit'), (0.2857142857142857, 'Apple', 'Berry'), (0.2, 'Apple', 'Vegetable'), (0.15000000000000002, 'Apple', 'Mushroom'), (0.08333333333333333, 'Apple', 'Plant')]
Пример #32
0
def get_category(text):
    """Guess the category of ``text`` from WordNet path similarity.

    The text is lower-cased, stripped of the tokens in ``stop`` and the
    characters in ``punc``, and lemmatised with ``lemma``. Each key of
    ``categories`` is used as a synset name and each value as the category
    label under which the score is stored. A category's score is the sum,
    over the remaining words, of the path similarity between the category
    synset and the word's first synset divided by the word's sense count,
    then divided by an adjusted word count.

    Returns:
        The label with the highest score when that score exceeds 0.048,
        otherwise None. None is also returned when the adjusted word count
        is zero.
    """
    stop_free = " ".join([i for i in text.lower().split() if i not in stop])
    punc_free = "".join([i for i in stop_free if i not in punc])
    normalized = [lemma.lemmatize(i) for i in punc_free.split()]
    score = {}
    for i in categories:
        # Start from the full word count and discount it word by word below.
        nb_word = len(normalized)
        score[categories[i]] = 0
        for j in normalized:
            try:
                # limit the impact of words with many different senses because it could lead to misinterpretation
                nb_sense = len(wn.synsets(j))
                # IndexError here means the word has no synsets; TypeError
                # means path_similarity returned None.
                score[categories[i]] += wn.path_similarity(wn.synset(i), wn.synsets(j)[0])/nb_sense
                # NOTE(review): under Python 2 this is integer division and
                # always subtracts 0 - confirm the intended interpreter.
                nb_word -= (nb_sense-1)/nb_sense
            except IndexError:  # word can not be found in wordnet
                nb_word -= 1    # in this case we should not count this word
            except TypeError:  # or no similarity is found
                pass           # in this case we count the word
        if nb_word == 0:  # if no word is considered interesting, the category cannot be defined
            return None
        score[categories[i]] /= nb_word  # nb_word is the same at each iteration
    # Pick the label with the highest score.
    maxScore = max(score, key=lambda x: score[x])
    # Scores at or below 0.048 are treated as "no category".
    if score[maxScore] > 0.048:
        return maxScore
    return None
Пример #33
0
def wn_pairwise_similarity(f, namer):
    """
    Get pairwise similarity of label given wordnet similarity

    The values of ``f`` are named with ``namer``, lower-cased, cleaned and
    mapped to synsets with ``get_synset``.  Path similarity is then computed
    for every unordered pair of distinct synsets.

    Returns:
        Tuple ``(mean, min, max)`` of the pairwise path similarities.
        ``(0., 0., 0.)`` when no label maps to a synset or no pair has a
        defined similarity; ``(1., 1., 1.)`` when all labels map to a single
        synset.
    """
    from nltk.corpus import wordnet as wn
    leaves = [namer(l).lower() for l in f.get_vals()]
    leaves = clean(leaves, to_underscore=True)

    # Drop labels without a synset and de-duplicate the rest.
    synsets = list(set(s for s in (get_synset(l) for l in leaves) if s is not None))
    if not synsets:
        return 0., 0., 0.
    if len(synsets) == 1:
        return 1., 1., 1.

    sims = []
    for a, b in itertools.combinations(synsets, 2):
        sim = wn.path_similarity(a, b)
        # path_similarity returns None when no path connects the synsets;
        # None would break np.mean/min/max, so such pairs are skipped.
        if sim is not None:
            sims.append(sim)
    if not sims:
        return 0., 0., 0.

    return np.mean(sims), min(sims), max(sims)
Пример #34
0
def dependency_parse(result, sentence, label):
    """Build a feature dict describing the base/target pair of a parse.

    ``target_search`` and ``base_search`` are run over ``result.nodes``.  When
    a base is found and the target has more than one element, both are passed
    through ``changePronoun`` and the WordNet path similarity of the second
    values it returns is stored (0.0 when WordNet yields None).  Otherwise a
    "not detected" record with empty fields is returned.
    """
    base = tree = None
    target, tar_index = target_search(result.nodes)
    if tar_index is not None:
        base, tree = base_search(tar_index, result.nodes)

    # Guard clause: nothing usable was found in the parse.
    if base is None or len(target) <= 1:
        return {
            "base": "",
            "target": "",
            "similarity": 0.0,
            "sentence": sentence,
            "tree_type": 0,
            "detected": False,
            "label": label
        }

    base, base_sense = changePronoun(base)  # check if it is a person name
    print(sentence, "---", base, target)
    target, target_sense = changePronoun(target)
    score = wn.path_similarity(base_sense, target_sense)
    return {
        "base": base,
        "target": target,
        "similarity": 0.0 if score is None else score,
        "sentence": sentence,
        "tree_type": tree,
        "detected": True,
        "label": label
    }  # add features here
Пример #35
0
def return_relationship_matrix(sentence1, sentence2, posGroup):
    """Return a matrix relating every word of ``sentence1`` to ``sentence2``.

    Row ``i`` / column ``j`` holds the relation between ``sentence1[i]`` and
    ``sentence2[j]``:

    * for ``NOUNS`` or ``VERBS`` the WordNet path similarity of the two
      ``wordSense`` attributes (may be None);
    * for ``SINGULARS`` 1 when the lower-cased ``word`` attributes are equal,
      0 otherwise.

    Any other ``posGroup`` yields an empty list.
    """
    if posGroup == NOUNS or posGroup == VERBS:
        return [
            [wn.path_similarity(left.wordSense, right.wordSense) for right in sentence2]
            for left in sentence1
        ]

    if posGroup == SINGULARS:
        return [
            [1 if left.word.lower() == right.word.lower() else 0 for right in sentence2]
            for left in sentence1
        ]

    return []
Пример #36
0
    def relevancy_score(desiredDoc):
        """Sum the path similarity of every keyword to the search word.

        The search word is looked up as its first noun sense
        (``searchWord + ".n.01"``).  For each entry of ``keywords`` the first
        element is taken as the word, mapped to its first noun synset and
        compared to the search synset.  Words without a noun synset, or with
        no similarity (WordNet returns None), contribute nothing.

        ``desiredDoc`` is not used by this function.

        Returns:
            The accumulated similarity score.
        """
        searchWordwn = wn.synset(searchWord + ".n.01")
        relevancyScore = 0
        # Cache per word; None marks "no noun synset" or "no similarity".
        memo = {}
        for keyword in keywords:
            currentWord = keyword[0]
            if currentWord not in memo:
                nounSynsets = wn.synsets(currentWord, pos=wn.NOUN)
                # Always store a fresh value so that a word without synsets
                # cannot inherit the score of the previous keyword.
                if nounSynsets:
                    memo[currentWord] = wn.path_similarity(searchWordwn, nounSynsets[0])
                else:
                    memo[currentWord] = None

            currentWordScore = memo[currentWord]
            if currentWordScore is not None:
                relevancyScore += currentWordScore

        return relevancyScore
Пример #37
0
    def get_best_synset_pair(self,
                             word1,
                             word2,
                             word1_tag=None,
                             word2_tag=None):
        """
        Find the pair of synsets (one per word) with the highest path similarity.

        Only pairs whose part of speech is identical and is not ``'s'`` are
        compared.  As soon as WordNet reports no similarity (None) for a
        compared pair, the search is abandoned and ``(None, None)`` is returned.

        Args:
            word1: source word
            word2: word compared to
            word1_tag: POS tag of word1
            word2_tag: POS tag of word2

        Returns:
            Tuple of (synset1, synset2), or (None, None) when nothing matched
        """
        candidates_1 = wn.synsets(word1, word1_tag)
        candidates_2 = wn.synsets(word2, word2_tag)
        if candidates_1 is None or candidates_2 is None:
            return None, None

        winner = (None, None)
        top_score = -1.0
        for first in candidates_1:
            for second in candidates_2:
                # Skip pairs with a satellite-adjective tag or mismatched POS.
                if first._pos == 's' or first._pos != second._pos:
                    continue
                score = wn.path_similarity(first, second)
                if score is None:
                    return None, None
                if score > top_score:
                    top_score, winner = score, (first, second)
        return winner
Пример #38
0
 def __similarity(self, word, compareto):
     """Return the similarity of two synsets, or None when there is no path.

     Jiang-Conrath similarity (using ``self.wordnet_ic``) is tried first; if
     that call raises, WordNet path similarity is used as a fallback.
     A score of -1 is translated to None.
     """
     try:
         score = word.jcn_similarity(compareto, self.wordnet_ic, True)
     except Exception:
         # Catch ordinary errors only, so KeyboardInterrupt/SystemExit still
         # propagate instead of being silently swallowed.
         score = wordnet.path_similarity(word, compareto)
     if score == -1:
         score = None  # No path between the words was found
     return score
def create_graphs(doc_list):
    """Render the comparison graphs for a list of documents.

    Falls back to ``default_document_list()`` when ``doc_list`` is None.
    Three sets of graphs are produced: the top senses against three fixed
    synsets ('phyl_eco_pol'), against the ``SYNSETS`` list with random
    colours ('handpicked'), and the top senses on their own.

    Each distance function is described by a tuple
    ``(reference value, short name, callable(sense_1, sense_2))``.
    """
    documents = default_document_list() if doc_list is None else doc_list

    reference = SYNSETS[0]
    distance_functions = [
        (wn.lch_similarity(reference, reference), 'lch', lambda a, b: wn.lch_similarity(a, b)),
        (1.0, 'lin', lambda a, b: wn.lin_similarity(a, b, CORPUS)),
        (10.636958516573292, 'res', lambda a, b: wn.res_similarity(a, b, CORPUS)),
        (wn.jcn_similarity(reference, reference, CORPUS), 'jcn', lambda a, b: wn.jcn_similarity(a, b, CORPUS)),
        (1.0, 'path', lambda a, b: wn.path_similarity(a, b)),
    ]

    # Every top sense paired with the name of the document it came from.
    all_senses = [(sense, doc.name) for doc in documents for sense in doc.top_senses()]

    fixed_targets = [wn.synset(name) for name in ["economy.n.01", "philosophy.n.02", "politics.n.01"]]
    create_against_graph('phyl_eco_pol', documents, all_senses, fixed_targets, distance_functions, ['r', 'b', 'g'])

    random_colors = [(random(), random(), random()) for _ in range(len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, SYNSETS, distance_functions, random_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
def get_score(tags, groups):
    """Return the average match score between tag texts and group names.

    For every group name and every tag, the tag scores 2.0 when the group
    name is a substring of the lower-cased tag text.  Otherwise the first
    noun senses of both are compared with WordNet path similarity, and the
    similarity is counted only when it is at least 0.3.  Tags that cannot be
    processed (for example words unknown to WordNet) are skipped.

    Returns:
        Sum of the counted scores divided by their number, or 0 when
        ``tags`` is None or nothing was counted.
    """
    if tags is None:
        return 0

    sscore = 0
    scount = 0
    for g in groups:
        # NOTE(review): the loop reads the global ``k.tags`` rather than the
        # ``tags`` parameter; kept as in the original - confirm with callers.
        for x in k.tags:
            try:
                text = str(x.text).lower()
                # check substring else calculate words similarity score
                if g in text:
                    sscore += 2.0
                    scount += 1
                else:
                    tag = wn.synset(text + '.n.01')
                    group = wn.synset(g + '.n.01')
                    sem = wn.path_similarity(group, tag)
                    # path_similarity returns None when no path exists.
                    if sem is not None and sem >= 0.3:
                        sscore += sem
                        scount += 1
            except Exception:
                # Best effort: a tag that cannot be looked up is ignored.
                continue

    if scount != 0:
        return sscore / scount
    return 0
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2; 0.0 when no synset of s1
        has a defined similarity to any synset of s2

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    sim_s = []
    for syn1 in s1:
        sim_v = []
        for syn2 in s2:
            val = wn.path_similarity(syn1, syn2)
            # path_similarity returns None when the synsets are not connected.
            if isinstance(val, float):
                sim_v.append(val)
        if sim_v:
            sim_s.append(max(sim_v))

    # Avoid ZeroDivisionError when nothing could be compared.
    if not sim_s:
        return 0.0
    return sum(sim_s) / len(sim_s)
Пример #42
0
def internal_word_max_WSD(sentence, word):
    """
    Auxiliary function for sem_wsd()

    Input: a sentence and a word in the sentence,
            sentence is a list of words, not a string

    Return: the synset (sense) of ``word`` whose summed path similarity to
            the senses of a single sentence word is the largest, or None
            when no sense scores above zero

    Derived from code at http://www.jaist.ac.jp/~s1010205/sitemap-2/styled-7/
    """
    chosen = None
    top = 0.0
    for candidate in wn.synsets(word):
        for token in sentence:
            # Accumulate the similarity of this sense to every sense of token.
            total = 0.0
            for other in wn.synsets(token):
                similarity = wn.path_similarity(other, candidate)
                if similarity is not None:
                    total += similarity
            if total > top:
                top, chosen = total, candidate
    return chosen
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2; 0.0 when no pair of
        synsets has a defined similarity

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    largest_similarity_values = []
    for syn1 in s1:
        similarity_values = []
        for syn2 in s2:
            simi_value = wn.path_similarity(syn1, syn2)
            # None means WordNet found no path between the two synsets.
            if simi_value is not None:
                similarity_values.append(simi_value)
        if similarity_values:
            largest_similarity_values.append(max(similarity_values))

    # Guard against dividing by zero when nothing was comparable.
    if not largest_similarity_values:
        return 0.0
    return sum(largest_similarity_values) / len(largest_similarity_values)
Пример #44
0
def findMaxPathSimilarity(ingredSynsets, foodSynsets):
    """Return the largest path similarity between the two synset lists.

    Every synset of ``ingredSynsets`` is compared with every synset of
    ``foodSynsets``.  Pairs for which WordNet reports no similarity (None)
    are ignored.  Returns 0 when either list is empty or no pair is
    comparable.
    """
    maxPathSimilarity = 0
    for synseta in ingredSynsets:
        for synsetb in foodSynsets:
            pathSim = wn.path_similarity(synseta, synsetb)
            # Comparing None with a number raises TypeError on Python 3.
            if pathSim is not None and pathSim > maxPathSimilarity:
                maxPathSimilarity = pathSim
    return maxPathSimilarity
Пример #45
0
def wnsensesim(synset1, synset2, metric):
    """Return the WordNet similarity of two synsets for the named metric.

    Args:
        synset1, synset2: the synsets to compare
        metric: one of 'path_similarity', 'lch_similarity', 'wup_similarity'

    Returns:
        The value returned by the corresponding ``wn`` function, or None
        (after printing a message) when ``metric`` is not supported.
    """
    # add more similarity measures here, e.g. jcn
    supported = ('path_similarity', 'lch_similarity', 'wup_similarity')
    if metric in supported:
        # The metric name is also the name of the wn function to call.
        return getattr(wn, metric)(synset1, synset2)

    # Parenthesised form works as a statement on Python 2 and a call on 3.
    print("Unsupported wn similarity measure requested")
    return None
Пример #46
0
 def word_similarity(self, word1, word2):
     """Return the highest path similarity between any senses of two words.

     All synsets of ``word1`` are compared with all synsets of ``word2``.
     Pairs without a defined similarity (WordNet returns None) are skipped.
     Returns 0 when either word has no synsets or no pair is comparable.
     """
     w1synsets = wn.synsets(word1)
     w2synsets = wn.synsets(word2)
     maxsim = 0
     for w1s in w1synsets:
         for w2s in w2synsets:
             current = wn.path_similarity(w1s, w2s)
             # None cannot be ordered against numbers on Python 3.
             if current is not None and current > maxsim:
                 maxsim = current
     return maxsim
Пример #47
0
def get_path_similarity_between_boy_and_dog():
    """
    Compute the WordNet path similarity of 'boy.n.01' and 'dog.n.01'.

    Returns
    -------
    The value returned by ``wn.path_similarity`` for the two synsets.
    """
    boy = wn.synset('boy.n.01')
    dog = wn.synset('dog.n.01')
    return wn.path_similarity(boy, dog)
Пример #48
0
def compare(a,b,min=0.31):
    """Return True if ``a`` has a meaning close to ``b``, False otherwise.

    Two words match when any pair of their synsets has a path similarity
    strictly greater than ``min``.  When either word has no synsets the
    words are compared for plain equality instead.

    Args:
        a, b: the words to compare
        min: similarity threshold (the name shadows the builtin but is kept
            because callers may pass it by keyword)
    """
    asyn = wn.synsets(a)
    bsyn = wn.synsets(b)
    if not asyn or not bsyn:
        return a == b

    for ax in asyn:
        for bx in bsyn:
            sim = wn.path_similarity(ax, bx)
            # Skip undefined similarities; None cannot be ordered on Python 3.
            if sim is not None and sim > min:
                return True
    return False
Пример #49
0
def word_similarity(word1, word2):
    """Return the highest path similarity between any senses of two words.

    Every synset of ``word1`` is compared with every synset of ``word2``;
    pairs for which WordNet returns None are ignored.  Returns 0 when either
    word is unknown to WordNet or no pair is comparable.
    """
    w1synsets = wn.synsets(word1)
    w2synsets = wn.synsets(word2)
    maxsim = 0
    for w1s in w1synsets:
        for w2s in w2synsets:
            current = wn.path_similarity(w1s, w2s)
            # Ordering None against a number raises TypeError on Python 3.
            if current is not None and current > maxsim:
                maxsim = current
    return maxsim
 def __init__(self, obs_corpus, target_corpus, metric="path", aggregation_mode_prev="", aggregation_mode=""):
     """Initialise the feature and select the WordNet similarity function.

     ``metric`` must be "path", "lch" or "wup"; the matching ``wn``
     similarity is stored as ``self.metric_func``.  Any other value raises
     ValueError after the parent initialiser has run.
     """
     super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
     self.metric = metric
     known_metrics = (
         ("path", lambda syn1, syn2: wn.path_similarity(syn1, syn2)),
         ("lch", lambda syn1, syn2: wn.lch_similarity(syn1, syn2)),
         ("wup", lambda syn1, syn2: wn.wup_similarity(syn1, syn2)),
     )
     for label, scorer in known_metrics:
         if self.metric == label:
             self.metric_func = scorer
             break
     else:
         # No known metric matched.
         raise ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric)
Пример #51
0
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet

    Both words are looked up as their first noun sense ('<word>.n.01') and
    compared with path similarity (simulate_root disabled).

    Returns: float (the semantic score between word1 and word2), or 0 when a
    word cannot be looked up or no similarity is defined
    '''
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        score = wn.path_similarity(w1, w2, simulate_root=False)
    except Exception:
        # Lookup failures count as "no similarity"; unlike a bare except this
        # lets KeyboardInterrupt/SystemExit propagate.
        return 0
    # path_similarity returns None when the synsets are not connected.
    return 0 if score is None else score
Пример #52
0
 def wnpath(self, target, neighbor):
     r"""Return the best path_similarity between
     `target` and `neighbor`.

     Synsets are looked up with ``self.args.wordnet_pos_tag``.  Returns 0
     when either word has no synsets, and treats an undefined similarity
     (None from WordNet) as 0.
     """
     synsetsT = wn.synsets(target, self.args.wordnet_pos_tag)
     synsetsN = wn.synsets(neighbor, self.args.wordnet_pos_tag)
     if not synsetsT:
         return 0  # XXX no synsets for `target`
     if not synsetsN:
         return 0  # XXX no synsets for `neighbor`
     # Replace None *before* taking the maximum: max() over a mix of None
     # and floats raises TypeError on Python 3.
     return max(
         wn.path_similarity(sT, sN) or 0
         for sT in synsetsT
         for sN in synsetsN
     )
Пример #53
0
def get_path_similarity_between_girl_and_girl():
    """
    Compute the WordNet path similarity of 'girl.n.01' with itself.

    Returns
    -------
    The value returned by ``wn.path_similarity`` for the two synsets.
    """
    first_girl = wn.synset('girl.n.01')
    second_girl = wn.synset('girl.n.01')
    return wn.path_similarity(first_girl, second_girl)
Пример #54
0
 def wordNetSimilarity(self, term1, term2):
     """Return the path similarity of the first synsets of two terms.

     See http://www.nltk.org/howto/wordnet.html

     Returns 0 when a term has no synsets, when the lookup fails (a message
     is printed in that case) or when WordNet defines no similarity.
     """
     sim = None
     try:
         wn_term1 = wn.synsets(term1)[0]
         wn_term2 = wn.synsets(term2)[0]
         sim = wn.path_similarity(wn_term1, wn_term2)
     except Exception:
         # Best effort: report and fall through to the default of 0, while
         # still letting KeyboardInterrupt/SystemExit propagate.
         print("Error computing similarity.")
     if not sim:
         sim = 0
     return sim
 def extract_feature(self, sent):
     """Return the feature vector of a sentence.

     The first ``self.n_bow`` entries are 1/0 flags telling whether
     ``self.BOW[i]`` occurs among the whitespace-separated words of ``sent``.
     The following ``self.n_verbs`` entries hold, for each ``self.VERBS[i]``,
     the highest path similarity to the first verb sense of any token tagged
     'VB' in the sentence (0 when there is no such token or no similarity is
     defined).
     """
     feature = [0] * (self.n_bow+self.n_verbs)
     verbs = [w for w, pos in self.tagger.tag(word_tokenize(sent)) if pos == 'VB']
     words = set(sent.split())
     # range() works on both Python 2 and 3 (xrange is Python 2 only).
     for i in range(self.n_bow):
         feature[i] = 1 if self.BOW[i] in words else 0

     # Verb slots already hold 0, so there is nothing to do without verbs.
     if verbs and self.n_verbs > 0:
         # The synset lookups do not depend on i; do them once.
         verb_synsets = [wn.synset(v+'.v.01') for v in verbs]
         for i in range(self.n_verbs):
             best = 0
             for verb_synset in verb_synsets:
                 sim = wn.path_similarity(self.VERBS[i], verb_synset)
                 # None (no path) cannot take part in a numeric maximum.
                 if sim is not None and sim > best:
                     best = sim
             feature[self.n_bow+i] = best
     return feature
Пример #56
0
 def meansimilarity(word):
     """Helper: mean path similarity of a word's first synset to its others.

     The first synset of ``word`` is compared with each of the remaining
     synsets; undefined similarities (None) contribute nothing.  The sum is
     divided by the total number of synsets, first one included.

     Returns 0.0 when the word has no synsets.
     """
     synsets = wn.synsets(word)
     if not synsets:
         # Avoid ZeroDivisionError for words unknown to WordNet.
         return 0.0
     sums = 0.0
     for other in synsets[1:]:
         ps = wn.path_similarity(synsets[0], other)
         if ps is not None:
             sums = sums + ps
     return sums / len(synsets)
Пример #57
0
def word_path_similarity(s1, s2):
    """Return the highest path similarity between two words.

    The comparison is case-insensitive.  Identical words score 1.0 without
    consulting WordNet.  Otherwise all synsets of both words are compared
    while holding ``wplock``; pairs without a defined similarity are
    ignored.  Returns 0 when no pair is comparable.
    """
    w1 = s1.lower()
    w2 = s2.lower()
    # Compare the lower-cased strings (not the bound ``lower`` methods) and
    # do so before taking the lock so the early return cannot leak it.
    if w1 == w2:
        return 1.0

    val = 0
    wplock.acquire()
    try:
        ss1 = wn.synsets(w1)
        ss2 = wn.synsets(w2)
        for t1 in ss1:
            for t2 in ss2:
                sim = wn.path_similarity(t1, t2)
                # max(None, number) raises TypeError on Python 3.
                if sim is not None and sim > val:
                    val = sim
    finally:
        # Always release, even if a WordNet call raises.
        wplock.release()
    return val
Пример #58
0
    def cmp_text_word_net(self, annotation, candidate, entire_annotation):
        """
        Compare the retrieved answer with the annotation using WordNet path distance.

        For every annotation token the best similarity to any candidate token
        is summed, and likewise for every candidate token against the
        annotation tokens.  The total is normalised by the number of tokens
        (with synsets) on both sides.

        THIS IS VERY SLOW, RESULTS ARE NOT CACHED

        :param annotation: The correct Answer
        :type annotation: String
        :param candidate: The retrieved Answer
        :type candidate: [String, String]
        :param entire_annotation: not used by this method

        :return: Float score, or -1 when the annotation is None/'NULL',
                 -2 when no answer was extracted, -3 when one side has no
                 token known to WordNet
        """

        # Value comparison: identity (``is``) on a string literal only works
        # by accident of interning.
        if annotation is None or annotation == 'NULL':
            # annotation is NULL
            return -1
        elif candidate is None:
            # no answer was extracted
            return -2

        # fetch synsets for both answers
        self._lock.acquire()
        try:
            syn_a = [wordnet.synsets(t) for t in word_tokenize(annotation)]
            syn_b = [wordnet.synsets(t[0]) for t in candidate]
        finally:
            self._lock.release()

        # drop tokens without synsets
        syn_a = [syn for syn in syn_a if len(syn) > 0]
        syn_b = [syn for syn in syn_b if len(syn) > 0]

        if not syn_a or not syn_b:
            # no synsets were found for one of the answers!
            return -3

        score = 0
        # best similarity seen so far for every candidate token
        max_b = [0] * len(syn_b)

        self._lock.acquire()
        try:
            for synsets_a in syn_a:
                max_a = 0
                for j, synsets_b in enumerate(syn_b):
                    # None (no path) counts as 0
                    sim = max([wordnet.path_similarity(a, b) or 0
                               for a, b in product(synsets_a, synsets_b)] or [0])
                    max_a = max(sim, max_a)
                    max_b[j] = max(max_b[j], sim)
                score += max_a
            score += sum(max_b)
        finally:
            # release even if a WordNet call raises
            self._lock.release()

        # Normalise by the total number of tokens; the parentheses matter,
        # otherwise len(syn_b) is added to the quotient instead.
        return score / (len(syn_a) + len(syn_b))
Пример #59
0
def wnsim(synset1, synset2, method='all'):
    """Return WordNet similarity of two synsets given by name or word.

    An argument matching ``<lemma>.<pos>.<number>`` is resolved with
    ``wn.synset``; anything else goes through ``wn_synset``.  When either
    resolves to None, 0 is returned.

    ``method`` selects the measure: 'lin', 'res', 'jcn' (using ``wn_ic``),
    'wup', 'path' or 'lch' return a single value; 'all' returns a list of
    ``(name, value)`` tuples in that order.  Any other value returns None.
    """
    name_pattern = re.compile(r'^.+\..+\.\d+$')

    def resolve(label):
        # Fully qualified synset names are looked up directly.
        if name_pattern.match(label):
            return wn.synset(label)
        return wn_synset(label)

    s1 = resolve(synset1)
    s2 = resolve(synset2)
    if s1 is None or s2 is None:
        return 0

    # Lazily evaluated so that only the requested measure is computed.
    measures = [
        ('lin', lambda: wn.lin_similarity(s1, s2, wn_ic)),
        ('res', lambda: wn.res_similarity(s1, s2, wn_ic)),
        ('jcn', lambda: wn.jcn_similarity(s1, s2, wn_ic)),
        ('wup', lambda: wn.wup_similarity(s1, s2)),
        ('path', lambda: wn.path_similarity(s1, s2)),
        ('lch', lambda: wn.lch_similarity(s1, s2)),
    ]

    if method == 'all':
        return [(label, compute()) for label, compute in measures]
    for label, compute in measures:
        if method == label:
            return compute()
    return None
def get_best_synset_pair(word_1, word_2):
    """Return the pair of synsets with the highest path similarity.

    Every synset of ``word_1`` is compared with every synset of ``word_2``;
    pairs for which WordNet defines no similarity (None) are skipped.

    Returns:
        Tuple ``(synset_1, synset_2)``, or ``(None, None)`` when either word
        has no synsets or no pair is comparable.
    """
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if not synsets_1 or not synsets_2:
        return None, None

    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            sim = wn.path_similarity(synset_1, synset_2)
            # Comparing None with a float raises TypeError on Python 3.
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair