Example #1
def get_lesk_answers(senseval_data):
    time_start = time.clock()

    # Getting answers from lesk algorithms
    original_lesk_answers = {}
    simple_lesk_answers = {}
    adapted_lesk_answers = {}
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, word = phrase["headword"]
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        for word_id, word in sentence_data["test_words"].iteritems():
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        sys.stdout.write(".")
    lesk_answers_list = []
    lesk_answers_list.append((original_lesk_answers, "original lesk"))
    lesk_answers_list.append((simple_lesk_answers, "simple lesk"))
    lesk_answers_list.append((adapted_lesk_answers, "adapted lesk"))

    time_end = time.clock()
    print "\nlesk took " + str(time_end - time_start) + " seconds"
    return lesk_answers_list
Example #3
def getCommonWord(str1, str2):
    # str1 = "Some scoff at the notion that movies do anything more than entertain ."
    # str2 = "Some are wrong ."
    ml1 = str1.lower().split()
    ml1 = ml1[2:]
    ml1[0] = ml1[0][0].upper() + ml1[0][1:]
    ml2 = str2.lower().split()
    ml2 = ml2[2:]
    ml2[0] = ml2[0][0].upper() + ml2[0][1:]
    print ml1
    print ml2
    ml3 = set(ml1).intersection(ml2)
    # print ml3
    file = open("stpwrds.txt", "r+")
    stpwrds = [
        x.rstrip('\n').rstrip(')') for x in file.readlines() if x.strip()
    ]
    file.close()
    ml4 = set(ml3).difference(stpwrds)
    # print ml4
    ml4 = list(ml4)
    print ml4
    i = 0
    while i < len(ml4):
        if simple_lesk(str1, ml4[i]) != simple_lesk(str2, ml4[i]):
            # deleting shifts the next element into slot i, so only advance otherwise
            del ml4[i]
        else:
            i += 1

    t2 = nltk.pos_tag(ml4)
    ml = []
    for x in t2:
        if x[1][0] == 'V':
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'N':
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'P' and x[1][1] == 'R':
            ml.append(x[0])
    for x in t2:
        if x[1] == "CD":
            ml.append(x[0])
    for x in t2:
        if x[1][0] == 'F':
            ml.append(x[0])
    for x in t2:
        if x[0] not in ml:
            ml.append(x[0])
    # print t2
    # print ml

    print "After WSD and POS reordering:\n", ml
    i = 0
    while i < len(ml):
        ml[i] = ml[i].lower()
        i += 1

    return ml
Example #4
    def test_simple_lesk_default(self):
        bank_sents = [('I went to the bank to deposit my money', 'depository_financial_institution.n.01'),
                      ('The river bank was full of dead fishes', 'bank.n.01')]

        plant_sents = [('The workers at the industrial plant were overworked', 'plant.n.01'),
                       ('The plant was no longer bearing flowers', 'plant.v.01')]
        for sent, synset_name in bank_sents:
            self.assertEqual(simple_lesk(sent,'bank').name(), synset_name)
        for sent, synset_name in plant_sents:
            self.assertEqual(simple_lesk(sent,'plant').name(), synset_name)
Example #6
 def wordSenseDisambiguation(self, sentence):
     # removing the disambiguity by getting the context
     pos = self.identifyWordsForComparison(sentence)
     sense = []
     for p in pos:
         sense.append(simple_lesk(sentence, p[0], pos=p[1][0].lower()))
     return set(sense)
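
The identifyWordsForComparison helper is not shown above. As a rough, self-contained sketch of the same idea (the function name and the Penn-to-WordNet mapping below are assumptions, not the original project's code), nouns and verbs can be tagged with nltk and handed to simple_lesk together with WordNet-style POS letters:

import nltk
from pywsd.lesk import simple_lesk

def disambiguate_nouns_and_verbs(sentence):
    # Tag the sentence and keep only nouns and verbs,
    # mapping Penn Treebank tags to WordNet POS letters.
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    penn_to_wn = {"N": "n", "V": "v"}
    senses = set()
    for word, tag in tagged:
        wn_pos = penn_to_wn.get(tag[0])
        if wn_pos is None:
            continue
        synset = simple_lesk(sentence, word, pos=wn_pos)
        if synset is not None:  # simple_lesk returns None when no sense fits
            senses.add(synset)
    return senses

print(disambiguate_nouns_and_verbs("I went to the bank to deposit my money"))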
Example #7
def simple_lesk_algo():
    sent = 'How much deposit i can deposit to my deposit?'
    ambiguous = 'deposit'
    answer = simple_lesk(sent, ambiguous, pos='v')
    print (answer)
    # prints the verb sense chosen for 'deposit' (pos='v'), not the noun 'bank' sense
    print (answer.definition())
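
simple_lesk also accepts nbest=True (Example #9 below relies on this) to return the ranked candidate synsets rather than a single best guess; a minimal sketch along the same lines:

from pywsd.lesk import simple_lesk

sent = 'How much deposit i can deposit to my deposit?'
# nbest=True returns a list of candidate synsets instead of one synset
for synset in simple_lesk(sent, 'deposit', pos='v', nbest=True):
    print(synset, synset.definition())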
Example #8
 def get_event_guesses(self,link):
     tokens = link['sentence']
     pos_tokens = nltk.pos_tag(tokens)
     guess, guess_backup, guess_all = [], [], []
     for i in range(len(tokens)):
         guess_all.append(i)
         word = tokens[i].lower()
         word_lem_n = Word(word).lemmatize('n')
         word_lem_v = Word(word).lemmatize('v')
         lesk_syn = simple_lesk((" ").join(tokens), word)
         if ((word == '.\n') or (word in stopwords.words('english'))):
             continue #EOL or stop word; ignore
         guess_backup.append(i)
         if ('VB' in pos_tokens[i][1] or \
              word in noun_events or word_lem_n in noun_events or word_lem_v in noun_events): #verb then add it in
             guess.append(i)
         elif ('NN' in pos_tokens[i][1] and lesk_syn):
             lesk_list = lesk_syn.name().split('.')
             if ((len(lesk_list) == 3) and ([lesk_list[0],int(lesk_list[2])] in noun_senses)):
                 guess.append(i) #word disamb is in noun sense list matching sense
     # if (len(guess)==0):
     #     # print('no guesses made here')
     #     # print('link sentence:> ', tokens)
     #     # print('pos tags:> ', pos_tokens)
     #     # print('actual event:> ', tokens[link['start_index']:link['end_index']+1])
     #     # print('link:> ', link)
     #     # print()
     #     # print()
     #     return guess_backup
     return guess_all
Example #9
 def disambiguateWordSenses3(self,sentence,word,stanfordPOS):        #disambiguation with simple_lesk
     print word,stanfordPOS
     result_list=simple_lesk(sentence,word,nbest=True)         #result is a list of synsets of word
     print result_list
     result = None
     print word,stanfordPOS
     if result_list:
         for ss in result_list:
             pos=ss.pos()
             if (pos == u's'):
                 pos = u'a'
             if pos == stanfordPOS:
                 result  = ss
                 print "matched"
                 break
     if result:
         pos = result.pos()
         if (pos == u's'):
             pos = u'a'
         offset = result.offset()
         pos_score=0.0
         neg_score=0.0
         if (pos, offset) in self.db:
      #       print word,pos,offset
             pos_score, neg_score = self.db[(pos, offset)]
         obj = 1.0-(pos_score+neg_score)
         #print "%%%%%%%%%%"
         #print pos_score,neg_score, obj
     else:
         obj=1.0
         pos=None
         pos_score=0.0
         neg_score=0.0
     return obj,pos,pos_score,neg_score
Example #10
def wsd():
    definition = ""
    if request.method == "POST":
        word = request.form["word"]
        sentence = request.form["sentence"]
        if word and sentence and word.lower() in sentence.lower():
            definition = simple_lesk(sentence, word).definition()
    return render_template("main.html", definition=definition)
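
This route assumes a Flask app and a main.html template defined elsewhere in that project. A minimal self-contained variant might look like the sketch below; render_template_string and the inline form are stand-ins so no template file is needed, and the None check guards against simple_lesk finding no sense:

from flask import Flask, request, render_template_string
from pywsd.lesk import simple_lesk

app = Flask(__name__)

PAGE = """
<form method="post">
  <input name="word" placeholder="word">
  <input name="sentence" placeholder="sentence">
  <button type="submit">Disambiguate</button>
</form>
<p>{{ definition }}</p>
"""

@app.route("/", methods=["GET", "POST"])
def wsd():
    definition = ""
    if request.method == "POST":
        word = request.form["word"]
        sentence = request.form["sentence"]
        if word and sentence and word.lower() in sentence.lower():
            synset = simple_lesk(sentence, word)
            if synset is not None:
                definition = synset.definition()
    return render_template_string(PAGE, definition=definition)

if __name__ == "__main__":
    app.run(debug=True)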
Example #11
def provide_synset(word, context):
    try:
        answer = simple_lesk(context, word, pos='n')
        return answer
    except IndexError:
        #print("PYWSD DOES NOT LIKE THIS WORD BECAUSE OF A BUG:")
        #print(word)
        return None
Example #12
        def get_semantic_score_with_context(token, nlp_review):

            word = token.lower_  # get lowercased token text
            position = token.i  # token index in the Doc (token.idx is a character offset and would break the slice below)
            pos = posTag_to_wordNetTag(
                token.pos_
            )  # get POS of token, for better word sense disambiguation

            # define how many tokens around the token of interest we look at
            num_surrounding_words = 10
            # careful if there are less then num_surrounding_words before our token or after our token
            leftmost_word_idx = max(0, position - num_surrounding_words)
            rightmostword_idx = min(len(nlp_review),
                                    position + num_surrounding_words)
            surrounding_text = nlp_review[
                leftmost_word_idx:rightmostword_idx].text

            # determine word with the closest sense in WordNet
            #     print(word,"....",surrounding_text,pos)
            try:
                word_with_closest_sense = simple_lesk(surrounding_text,
                                                      word,
                                                      pos=pos)
            except:
                word_with_closest_sense = simple_lesk(surrounding_text, word)
            #     print(word,pos,word_with_closest_sense)
            # find the sentiment score to the word we found in wordnet
            if word_with_closest_sense:
                sentiword = swn.senti_synset(word_with_closest_sense.name())

                sent_scores = {
                    "objective": sentiword.obj_score(),
                    "positive": sentiword.pos_score(),
                    "negative": sentiword.neg_score()
                }

                sentiment = max(sent_scores, key=sent_scores.get)

                return sentiment
            else:
                return 'no_sentiment_assigned'
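
The SentiWordNet lookup at the end can be tried in isolation; a minimal sketch, assuming nltk's wordnet and sentiwordnet corpora are installed (the example sentence is illustrative only):

from nltk.corpus import sentiwordnet as swn
from pywsd.lesk import simple_lesk

synset = simple_lesk("I went to the bank to deposit my money", "bank", pos="n")
if synset is not None:
    sentiword = swn.senti_synset(synset.name())
    scores = {
        "objective": sentiword.obj_score(),
        "positive": sentiword.pos_score(),
        "negative": sentiword.neg_score(),
    }
    # the label with the highest score, as in get_semantic_score_with_context
    print(synset, max(scores, key=scores.get))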
Example #13
    def disambiguate_lesk(self, sentence, ambiguous, pos):
        """
            @param sentence : I went to the bank to deposit my money

            @param ambiguous : bank

            @param pos : n

            @return : Synset('depository_financial_institution.n.01')
        """
        from pywsd.lesk import simple_lesk
        return simple_lesk(sentence, ambiguous, pos)
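
Called without the wrapper class, the method body reduces to a direct pywsd call; a minimal sketch matching the docstring's example:

from pywsd.lesk import simple_lesk

sentence = "I went to the bank to deposit my money"
synset = simple_lesk(sentence, "bank", pos="n")
print(synset)               # expected: Synset('depository_financial_institution.n.01')
print(synset.definition())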
Example #14
 def wrapper(doc):
     head_word = func(doc)
     head_word_synset = simple_lesk(str(doc), str(head_word))
     if not head_word_synset:
         return ""
     max_similarity = -1
     max_class_synset = ""
     for category in FINE_CLASSES_SYNSETS:
         class_synset = wn.synset(category)
         similarity = wn.path_similarity(head_word_synset, class_synset)
         if similarity and similarity > max_similarity:
             max_class_synset = class_synset
             max_similarity = similarity
     return max_class_synset
 def disambiguate_word_senses(self, sentence):
     """
         Disambiguating word senses for nouns and verbs using the LESK algorithm
     """
     # Extract nouns and verbs
     pos_tags = self.extract_nouns_and_verbs(sentence)
     sense = []
     for tag in pos_tags:
         # Fetch correct synset for each tag based on surrounding context
         disambiguated_term = simple_lesk(sentence,
                                          tag[0],
                                          pos=tag[2][0].lower())
         if disambiguated_term is not None:
             sense.append(disambiguated_term)
     return set(sense)
Example #16
    def predictwsd() -> Response:  # pylint: disable=unused-variable
        """make a prediction using the specified model and return the results"""
        if request.method == "OPTIONS":
            return Response(response="", status=200)

        data = request.get_json()
        sentence = data["sentence"]
        word = data["word"]
        answer = simple_lesk(sentence, word)
        hypernym = ""
        for synset in answer.hypernyms():
            lemma_list = synset.lemmas()
            hypernym = lemma_list[0].name()
            break
        log_blob = {"hypernym": hypernym}

        return jsonify(log_blob)
def get_event_guesses(link, nsenses, nevents, punctuation):
    tokens = link['sentence']
    cv_events = list(link['caevo_event'])
    guess, guess2, guess3 = [], [], []  #backups to avoid empty return
    for i in range(len(tokens)):
        guess3.append(i)
        word = tokens[i].rstrip('\n').lower().replace("'",
                                                      "").replace("\"", "")
        # print('main word:> ', word)
        wordS = Word(word).stem()
        wordLN = Word(word).lemmatize('n')
        wordLV = Word(word).lemmatize('v')
        if (word in punctuation or word in stopwords.words('english')):
            continue
        guess2.append(i)  #all words except punctuations & stop words

        #1. check with caevo events
        for cv in cv_events:
            if (word == cv.decode('utf8').rstrip('\n').lower().replace(
                    "'", "").replace("\"", "")):
                guess.append(i)
                continue

        #2. check with noun events
        n, nstem, nlemn, nlemv = nevents['nevents'], nevents[
            'nevents_stem'], nevents['nevents_lem_n'], nevents['nevents_lem_v']
        if (word in n or word in nstem or word in nlemn or word in nlemv or \
            wordS in n or wordS in nstem or wordS in nlemn or wordS in nlemv or \
            wordLN in n or wordLN in nstem or wordLN in nlemn or wordLN in nlemv or \
            wordLV in n or wordLV in nstem or wordLV in nlemn or wordLV in nlemv):
            guess.append(i)
            continue

        #3. check with noun senses
        n, nstem, nlemn, nlemv = nsenses['nsenses'], nsenses[
            'nsenses_stem'], nsenses['nsenses_lem_n'], nsenses['nsenses_lem_v']
        lesk_syn = simple_lesk((" ").join(tokens), word)
        if (lesk_syn):
            lesk_list = lesk_syn.name().split('.')
            if (len(lesk_list) == 3):
                lsen = int(lesk_list[2])
                if ((word, lsen) in n or (wordS, lsen) in n or (wordLN, lsen) in n or (wordLV, lsen) in n or \
                    (wordS,lsen) in nstem or (wordLN,lsen) in nlemn or (wordLV,lsen) in nlemv):
                    guess.append(i)
                    continue
    return guess
Example #18
 def wrapper(doc):
     head_word = func(doc)
     hypernyms = []
     if head_word:
         # print("question: " + str(doc))
         # print("head word: " + str(head_word) + " pos=" + str(head_word.pos_))
         synset = simple_lesk(str(doc), str(head_word))
         if synset:
             unvisited_hypernyms = synset.hypernyms()
             for i in range(5):
                 for hypernym in unvisited_hypernyms:
                     unvisited_hypernyms = unvisited_hypernyms + hypernym.hypernyms()
                     unvisited_hypernyms.remove(hypernym)
                     hypernyms.append(hypernym)
             hypernyms.append(synset)
             # print(str(hypernyms))
     return hypernyms
Example #19
    def get_synonyms(self, sentence, word):
        from pywsd.lesk import simple_lesk

        synonyms = set()

        if isinstance(sentence, str):
            sentence = sentence.decode('utf-8')

        if isinstance(word, str):
            word = word.decode('utf-8')

        synset = simple_lesk(sentence, word)
        if synset is not None:
            for synonym in synset.lemma_names():
                synonyms.add(synonym.replace('_', ' '))

        # for idx, synset in enumerate(wordnet.synsets(word)):
        #     for synonym in synset.lemma_names():
        #         synonyms.add(synonym.replace('_', ' '))

        return list(synonyms)
    def tagging(self):

        if len(self.question1) <= 0 or len(self.question2) <= 0:
            return None
        else:
            stemmer1 = SnowballStemmer("english")
            stemmer2 = SnowballStemmer("english")
            #for self.w in :
        self.words1 = pos_tag(word_tokenize(self.question1))
        for i in range(0, len(self.words1)):
            #To create a list of lists instead of a read only tuple
            self.words1stem.append([])
            self.words1stem[i].append(stemmer1.stem((self.words1[i])[0]))
            self.words1stem[i].append((self.words1[i])[1])

        for word in self.words1stem:
            print word

            print(simple_lesk(self.question1, word[0]))

    #print self.answer1
    #if self.w.lower() not in stops:
    #print stemmer1.stem(self.w)
    #self.words1.append(stemmer.stem(self.w))
    #self.words1.append(nltk.pos_tag(self.w))
    #temp=stemmer1.stem(self.w)

    #print WORD1[0]
    #self.answer1.append(simple_lesk(self.question1,WORD1[0],WORD1[1]))
    #answer = simple_lesk
        #self.words1pos=nltk.pos_tag(self.words1)
        #print self.answer1
        #words1=stem(words1)
        for self.w2 in self.question2.split():
            if self.w2.lower() not in stops:
                #self.words2.append(stemmer2.stem(self.w2))
                #self.words2.append(self.w2)
                WORD2 = nltk.pos_tag(stemmer2.stem(self.w2))
Example #21
def get_synset(metode, word, text):
    # each method name dispatches to the matching pywsd lesk variant
    synset = ""
    if metode == "original_lesk":
        synset = original_lesk(text, word)
    elif metode == "simple_lesk":
        synset = simple_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "cosine_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "path")
    # elif metode == "wup" :
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "lin" :
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "res" :
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
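
With the branches aligned to the functions they name (as above), a call could look like the following; the imports are assumed to come from pywsd.lesk, which this snippet does not show:

from pywsd.lesk import original_lesk, simple_lesk, adapted_lesk, cosine_lesk

print(get_synset("simple_lesk", "bank", "I went to the bank to deposit my money"))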
Example #22
async def extract_wsd(request, target):
    """
    $ curl -d '{"sents":"The sheet is twenty centimeters."}' \
        -H "Content-Type: application/json" -X POST \
        localhost:1700/en/wsd/default | json

    :param request:
    :return:
    """
    from pywsd import disambiguate
    from pywsd.similarity import max_similarity as maxsim
    from pywsd.lesk import simple_lesk

    rd = request.json
    sents = rd['sents']

    extract_syn = lambda r: (r[0], r[1].name(), r[1].definition())

    def extract_sents():
        rs = disambiguate(sents)
        return [extract_syn(r) for r in rs if r[1]]

    fn_map = {
        'default':
        lambda: extract_sents(),
        'maxsim':
        lambda: [
            extract_syn(r) for r in disambiguate(sents,
                                                 algorithm=maxsim,
                                                 similarity_option='wup',
                                                 keepLemmas=False) if r[1]
        ],
        'lesk':
        lambda: simple_lesk(sents, rd['word']),
    }
    result = fn_map[target]() if target in fn_map else []
    return json(result)
Example #23
from pywsd.utils import has_synset

simplelesk_answer = []
adaptedlesk_answer = []
cosinelesk_answer = []

print "\nSentence Context Disambiguation\n============================== \n"

raw_sentence="Some people are happy this sentence."
words = nltk.word_tokenize(raw_sentence)
print "\nChecking synsets of each word . . .\n==========================================\n"
print(disambiguate(raw_sentence))
print "\nDisambiguating your sentence word by word using Simple Lesk algorithm. Hold on. \n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"    
        simplelesk_answer.append(eachword)
""""
        
print "\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
Example #24
from string import punctuation

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize, penn2morphy
from pywsd.allwords_wsd import disambiguate

"""
This module tests for consistency between using disambiguate() and
calling the individual WSD functions directly.
"""

for sentence in brown.sents()[:100]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Uses POS info when WSD-ing.
    _, poss = zip(*pos_tag(word_tokenize(sentence)))
    tagged_sent =  disambiguate(sentence, prefersNone=True, keepLemmas=True)

    for word_lemma_semtag, pos in zip(tagged_sent, poss):
        word, lemma, semtag = word_lemma_semtag
        if semtag is not None:
            # Changes POS to morphy POS
            pos = penn2morphy(pos, returnNone=True)
            # WSD on lemma
            assert simple_lesk(sentence, lemma, pos=pos) == semtag
Example #25
sent_id = {}
for i in data:
    i = i.split()
    print(i)
    sent_id[i[0]] = i[1]
file.close()
file = open("raw_sent2.txt", "r")

data = file.read()

data = data.split("\n")
sent_dict = {}
counter = 1
for i in data:
    sent_dict["hom_" + str(counter)] = i
    counter += 1
print(len(sent_id))
print(len(sent_dict))
synsets = {}
list = []
for i in sent_id:
    print sent_dict[i], sent_id[i]
    synsets[sent_id[i]] = simple_lesk(sent_dict[i], sent_id[i])
    list.append((sent_id[i], simple_lesk(sent_dict[i], sent_id[i])))

file = open("interpretation.txt", "w")
for i in list:
    file.write(str(i))
    file.write("\n")

print list
Example #26
def predict(sent, ambiguous):
    try:
        return simple_lesk(sent, ambiguous)
    except:
        return None
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''


    for i in get_pos_tags(sentences[0]):
        try:

            dict[i[0]] = simple_lesk(sentences[0], i[0], get_wordnet_pos(i[1]))



        except IndexError:
            print sentences[0]
            dict[i[0]] = simple_lesk(sentences[0], i[0])

            continue

    flag = 0
    s=[]
    for i in trigrams:

        for j in get_pos_tags(i):
            try:
def RecursiveGlossOverlap_Classify(text):
	definitiongraphedges=defaultdict(list)
	definitiongraphedgelabels=defaultdict(list)
	
	#---------------------------------------------------------------------------------
	#2.Compute intrinsic merit (either using linear or quadratic overlap)
	#---------------------------------------------------------------------------------
	tokenized = nltk.word_tokenize(text)
	fdist1 = FreqDist(tokenized)
	stopwords = nltk.corpus.stopwords.words('english')
	stopwords = stopwords + [u' ',u'or',u'and',u'who',u'he',u'she',u'whom',u'well',u'is',u'was',u'were',u'are',u'there',u'where',u'when',u'may', u'The', u'the', u'In',u'in',u'A',u'B',u'C',u'D',u'E',u'F',u'G',u'H',u'I',u'J',u'K',u'L',u'M',u'N',u'O',u'P',u'Q',u'R',u'S',u'T',u'U',u'V',u'W',u'X',u'Y',u'Z']
	puncts = [u' ',u'.', u'"', u',', u'{', u'}', u'+', u'-', u'*', u'/', u'%', u'&', u'(', ')', u'[', u']', u'=', u'@', u'#', u':', u'|', u';',u'\'s']
	#at present tfidf filter is not applied
	#freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
	freqterms1 = [w.decode("utf-8") for w in fdist1.keys() if w not in stopwords and w not in puncts]
	
	current_level = 1
	nodewithmaxparents = ''
	noofparents = 0
	maxparents = 0
	relatedness = 0
	first_convergence_level = 1
	tokensofthislevel = []
	convergingterms = []
	convergingparents = []
	tokensofprevlevel = []
	prevlevelsynsets = []
	commontokens = []
	vertices = 0
	edges = 0
	overlap = 0
	iter = 0
	from nltk.corpus import wordnet as wn

	#recurse down to required depth and update intrinsic merit score
	#relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2) also called convergence factor
	while current_level < 3:
		#crucial - gather nodes which converge/overlap (have more than 1 parent)
		if current_level > 1:
			print current_level
			for x in freqterms1:
				for y in parents(x,prevlevelsynsets):
					ylemmanames=y.lemma_names()
					#for yl in ylemmanames:
					#	definitiongraphedges[x].append(yl)
					definitiongraphedges[x].append(ylemmanames[0])
					definitiongraphedgelabels[x + " - " + ylemmanames[0]].append(" is a subinstance of ")
					definitiongraphedgelabels[ylemmanames[0] + " - " + x].append(" is a superinstance of ")
						
			convergingterms = [w for w in freqterms1 if len(parents(w,prevlevelsynsets)) > 1]
			for kw in freqterms1:
				convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
			for kw in freqterms1:
				noofparents = len(parents(kw, prevlevelsynsets))
				if noofparents > maxparents:
					maxparents = noofparents
					nodewithmaxparents = kw
		for keyword in freqterms1:
			#WSD - invokes Lesk's algorithm adapted to recursive gloss overlap- best_matching_synset() 
			#disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
			if use_pywsd_lesk:
				disamb_synset = simple_lesk(" ".join(freqterms1), keyword)
			elif use_nltk_lesk:
				disamb_synset = lesk(freqterms1, keyword)
			else:
				disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
			prevlevelsynsets = prevlevelsynsets + [disamb_synset]
			if len(wn.synsets(keyword)) != 0:
				disamb_synset_def = disamb_synset.definition()
				tokens = nltk.word_tokenize(disamb_synset_def) 
				fdist_tokens = FreqDist(tokens)
				#at present frequency filter is not applied
				#if keyword in convergingterms:
				tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
		listcount = len(tokensofthislevel)
		setcount = len(set(tokensofthislevel))
		overlap =  listcount-setcount
		if overlap > 0 and iter == 0 :
			first_convergence_level = current_level
			iter = 1
		#choose between two relatedness/convergence criteria :- 
		#1) simple linear overlap or 2) zipf distributed quadratic overlap
		#relatedness = relatedness + len(convergingparents)*overlap 
		relatedness = relatedness + overlap + len(convergingparents)
		#relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1) 
		#find out common tokens of this and previous level so that same token does not get grasped again - 	
		#relatedness must be increased since repetition of keywords in two successive levels is a sign of 
		#interrelatedness(a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and edges 					#corresponding to common tokens
		commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
		tokensofthislevel = set(tokensofthislevel).difference(commontokens)
		relatedness = relatedness + len(commontokens)
		#decrease the vertices count to address common tokens removed above - edges should remain same since they 
		#would just point elsewhere
		vertices = vertices + setcount - len(commontokens)
		edges = edges + listcount
		current_level = current_level + 1
		freqterms1 = set(tokensofthislevel)
		tokensofprevlevel = tokensofthislevel
		tokensofthislevel = []
	
	intrinsic_merit = vertices*edges*relatedness / first_convergence_level

	print definitiongraphedges

	nxg=nx.DiGraph()
	pos=nx.spring_layout(nxg)
	#pos=nx.shell_layout(nxg)
	#pos=nx.random_layout(nxg)
	#pos=nx.spectral_layout(nxg)
	#nx.draw_graphviz(nxg,prog="neato")
	for k,v in definitiongraphedges.iteritems():
                for l in v:
                        nxg.add_edge(k,l)
                        nxg.add_edge(l,k)
	#nx.draw_networkx(nxg)
	#plt.show()

	nxg.remove_edges_from(nxg.selfloop_edges())
	#print "Core number =",nx.core_number(nxg)
	sorted_core_nxg=sorted(nx.core_number(nxg).items(),key=operator.itemgetter(1), reverse=True)
	print "Core number (sorted) :",sorted_core_nxg
	print "============================================================================================================="
	print "Unsupervised Classification based on top percentile Core numbers of the definition graph(subgraph of WordNet)"
	print "============================================================================================================="
	no_of_classes=len(nx.core_number(nxg))
	top_percentile=0
	max_core_number=0
	max_core_number_class=""
	for n in sorted_core_nxg:
		print "This document belongs to class:",n[0],",core number=",n[1]
		if top_percentile < no_of_classes*0.50:
			top_percentile+=1
		else:	
			break
		if n[1] > max_core_number:
			max_core_number=n[1]
			max_core_number_class=n[0]
	print "	max_core_number",max_core_number

	print "==================================================================="
	print "Betweenness Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	bc=nx.betweenness_centrality(nxg)
	sorted_bc=sorted(bc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_bc 

	print "==================================================================="
	print "Closeness Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	cc=nx.closeness_centrality(nxg)
	sorted_cc=sorted(cc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_cc 

	print "==================================================================="
	print "Degree Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	dc=nx.degree_centrality(nxg)
	sorted_dc=sorted(dc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_dc 
	
	print "==================================================================="
	print "Page Rank of the vertices of RGO Definition Graph (a form of Eigenvector Centrality)"
	print "==================================================================="
	sorted_pagerank_nxg=sorted(nx.pagerank(nxg).items(),key=operator.itemgetter(1),reverse=True)
	print sorted_pagerank_nxg
	return (sorted_core_nxg, sorted_pagerank_nxg)
Example #29
def get_def(word, context, lang):

    #job = json.loads(injob.text)
    #lang = job.lang
    #context = job.context
    #word = job.word

    # remove non alphanumeric chars
    context = remove_notalpha(context)
    doc = nlp(context)
    if lang != 'eng':
        #call for translation to proper lang
        getstr = "https://glosbe.com/gapi/translate?from=" + lang + "&dest=eng&format=json&phrase=" + word + "&pretty=true"
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc)
    else:
        for token in doc:
            if word == token.text:
                word = token
                break

    # do two separate lesks
    answer = simple_lesk(context, word.text, pos_convert(word.pos_))
    cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))

    # find what we hope is the better answer
    if (check_def(context, cosans.definition()) > check_def(
            context, answer.definition())):
        answer = cosans

    sense = str(answer)
    sense = sense.split("'")[1].split(".")

    if ((sense[0] != word.lemma_ or int(sense[2]) > 4)
            and word.pos_ != 'PROPN'):
        try:
            answer = wn.synset(word.lemma_ + '.' + pos_convert(word.pos_) +
                               '.01')
        except Exception:
            pass

    if lang != 'eng':
        if lang == 'spa':
            lang = 'es'
        if lang == 'arb':
            lang = 'ar'
        #this should use the spa or arb word given
        if len(indef['tuc']) > 0:
            meaning = ""
            for tuc in indef['tuc']:
                try:
                    if tuc['phrase']['text'] == word.lemma_:
                        esptemp = ""
                        for m in tuc['meanings']:
                            if m['language'] == lang and len(
                                    m['text']) > len(meaning):
                                meaning = m['text']
                except KeyError:
                    pass
    else:
        # needs to look for beginning of sentence
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif answer:
            meaning = answer.definition()
    return meaning
Example #30
#importing libraries
import nltk
import re
from pywsd.lesk import simple_lesk

#Downloading the stopwords and other popular NLTK data
nltk.download('popular')

#Taking the user input
sent = input("Enter the sentence")

#Tokenizing the input into words
sent2 = nltk.word_tokenize(sent)

#Tagging parts of speech
tagged_word = nltk.pos_tag(sent2)
print(tagged_word)
#Ask the user which word they want to disambiguate and collect its occurrences with their POS tags
ambiguous = input('Enter the word you want to disambiguate:')
sense_word_list = []
for i in range(len(tagged_word)):
    if tagged_word[i][0].lower() == ambiguous:
        synset = simple_lesk(sent, tagged_word[i][0],
                             tagged_word[i][1][0].lower())
        sense = synset.lemmas()[0].name()
        sense = re.sub(r"_", " ", sense)
        print("Sense of", tagged_word[i][0], "is: ", sense)
        print("Definition of", tagged_word[i][0], "is:", synset.definition())
Example #31
def get_def(injob):
    lang = injob['language']
    context = injob['context'].lower()
    word = injob['word'].lower()
    # make proper names into iso standard
    if lang == 'English':
        lang = 'eng'
    if lang == 'Spanish':
        lang = 'spa'
    if lang == 'Arabic':
        lang = 'arb'
    if lang == 'French':
        lang = 'fra'

    # remove non alphanumeric chars

    doc = nlp(context)

    if lang != 'eng':
        if lang == 'fra':
            stoken = flp(word)
        if lang == 'spa':
            stoken = slp(word)
        for token in stoken:
            print(token.lemma_)
            word = token.lemma_.lower()
        # call for translation to proper lang
        getstr = "https://glosbe.com/gapi/translate?from="+ lang + "&dest=eng&format=json&phrase=" + word + "&pretty=true"
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc, lang)
        if isinstance(word, str):
            return word
    else:
        for token in doc:
            if word == token.text:
                word = token
                break
    if word and (word.is_stop or word.text == 'I'):
        if lang != 'eng':
            return find_def(indef, lang, word)
        else:
            if word.text == 'I':
                response = "Singular first person pronoun."
            else:
                try:
                    a = o.get_info_about_word(word.lemma_).json()
                except Exception:
                    a = o.get_info_about_word(word.text).json()
                response = a['results'][0]['lexicalEntries'][0][
                    'entries'][0]['senses'][0]['definitions'][0]
            return response

    if word:
        # do two separate lesks
        answer = simple_lesk(context, word.text,
                             pos_convert(word.pos_))
        cosans = cosine_lesk(context, word.text,
                             pos_convert(word.pos_))

        # find what we hope is the better answer
        if(check_def(context, cosans.definition()) >
           check_def(context, answer.definition())):
            answer = cosans

        sense = str(answer)
        sense = sense.split("'")[1].split(".")

        if ((sense[0] != word.lemma_ or
             int(sense[2]) > 4) and word.pos_ != 'PROPN'):
            try:
                answer = wn.synset(word.lemma_ + '.' +
                                   pos_convert(word.pos_) +
                                   '.01')
            except Exception:
                pass

        # probably broken now; the stemmer had problems with capitalization
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif lang != 'eng' and len(indef['tuc']) > 0:
            # this should use the spa or arb word given
            meaning = find_def(indef, lang, word)
        elif answer:
            meaning = answer.definition()

        if meaning:
            print("meaning: " + meaning)
            return meaning
        elif lang == 'eng':
            return "Sorry, I don't know that definition:("
        elif lang == 'spa':
            return "Lo siento, no sé esa definición:("
        elif lang == 'fra':
            return "Désolé, je ne connais pas cette définition:("
    elif lang == 'eng':
        return "Sorry, I don't know that definition:("
    elif lang == 'spa':
        return "Lo siento, no sé esa definición:("
    elif lang == 'fra':
        return "Désolé, je ne connais pas cette définition:("
Example #32
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using
        the original sentence as context and
        different lesk algorithms from the nltk
        and pywsd packages.

        Algorithm choices are: 1. nltk's lesk,
        2. pywsd simple_lesk, 3. pywsd adapted_lesk,
        4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string, pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], combined_word_string,
                                                             find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], combined_word_string,
                                                            find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []
    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
Example #33
            imp_pos.append("n")    
        elif (i[1][1] in pa):
            imp_pos.append("a")
        elif (i[1][1] in pv):
            imp_pos.append("v")        
        elif (i[1][1] in pav):
            imp_pos.append("r")
        else:
            imp_pos.append("none")
            
'''
imp_synset_definition = []
for i in range(len(imp_key)):
    # imp_synset_definition.append(simple_lesk(sent, imp_key[i],pos=imp_pos[i]).definition())
    imp_synset_definition.append(
        simple_lesk(sent, imp_key[i], pos=None).definition())

dic_for_context = {}
kw_extractor = yake.KeywordExtractor(lan="en", n=1, windowsSize=2, top=10)
imp_key_from_definition = []
for i in range(len(imp_key)):
    temp = []
    a = kw_extractor.extract_keywords(imp_synset_definition[i])
    for j in range(len(a)):
        imp_key_from_definition.append(a[j][1])
        temp.append(a[j][1])
    dic_for_context[imp_key[i]] = temp
'''
Weight Distribution 
'''
'''
Example #34
#
# Copyright (C) 2014 alvations
# URL:
# For license information, see LICENSE.md

bank_sents = ['I went to the bank to deposit my money',
'The river bank was full of dead fishes']

plant_sents = ['The workers at the industrial plant were overworked',
'The plant was no longer bearing flowers']

print "======== TESTING simple_lesk ===========\n"
from pywsd.lesk import simple_lesk
print "#TESTING simple_lesk() ..."
print "Context:", bank_sents[0]
answer = simple_lesk(bank_sents[0],'bank')
print "Sense:", answer
try: definition = answer.definition() 
except: definition = answer.definition # Using older version of NLTK.
print "Definition:", definition
print

print "#TESTING simple_lesk() with POS ..."
print "Context:", bank_sents[1]
answer = simple_lesk(bank_sents[1],'bank','n')
print "Sense:", answer
try: definition = answer.definition() 
except: definition = answer.definition # Using older version of NLTK.
print "Definition:", definition
print
Example #35
# for s in sentence:
#    answers = disambiguate(s, adapted_lesk, keepLemmas=False)

# what I was doing before
context = remove_notalpha(context)
# lemmatize the context
doc = nlp(context)
for token in doc:
    print("")
    if ' ' not in token.text and not token.is_stop and token.pos_ != '-PRON-':
        try:
            con = context.split()  # split the context into words
            for word in con:
                if word == token.text:
                    context = token.text + " "
            answer = simple_lesk(context, token.text, pos_convert(token.pos_))
            print(answer)
            if not answer:
                continue
        except Exception:
            continue

        sense = split_syn(answer)
        print(sense[0] + " " + token.lemma_)
        if ((sense[0] != token.lemma_ or int(sense[2]) > 4)
                and token.pos_ != 'PROPN'):
            try:
                cosans = cosine_lesk(context, token.text,
                                     pos_convert(token.pos_))
                if (check_def(context, cosans.definition()) > check_def(
                        context, answer.definition())):
Example #36
bank_sents = ['I went to the bank to deposit my money',
'The river bank was full of dead fishes']

plant_sents = ['The workers at the industrial plant were overworked',
'The plant was no longer bearing flowers']

print "======== TESTING simple_lesk ===========\n"
from pywsd.lesk import simple_lesk
print "#TESTING simple_lesk() ..."
print "Context:", bank_sents[0]
answer = simple_lesk(bank_sents[0],'bank')
print "Sense:", answer
definition = answer.definition() 
#except: definition = answer.definition # Using older version of NLTK.
print "Definition:", definition
print ''

print "#TESTING simple_lesk() with POS ..."
print "Context:", bank_sents[1]
answer = simple_lesk(bank_sents[1],'bank','n')
print "Sense:", answer
definition = answer.definition() 
#except: definition = answer.definition # Using older version of NLTK.
print "Definition:", definition
print

print "#TESTING simple_lesk() with POS and stems ..."
print "Context:", plant_sents[0]
answer = simple_lesk(plant_sents[0],'plant','n', True)
print "Sense:", answer
definition = answer.definition()