Example #1
	def wordnetLinking(self):
		
		for trp in self.triples:
			sub, rel, obj 		  = trp['triple']
			raw_sub, raw_rel, raw_obj = trp['raw_triple']
			sub_id, rel_id, obj_id 	  = self.ent2id[sub], self.rel2id[rel], self.ent2id[obj]

			for sentence in trp['src_sentences']:
				# sent = [wrd.lower() for wrd in sentence.split()]
				sent = sentence.split()

				''' 92 is the length of list returned by dir when lesk is successful '''
				self.ent2wnet[sub_id] = self.ent2wnet.get(sub_id, set())
				res = lesk(sent, raw_sub) 
				if len(dir(res)) == 92: self.ent2wnet[sub_id].add(res.name())

				self.ent2wnet[obj_id] = self.ent2wnet.get(obj_id, set())
				res = lesk(sent, raw_obj) 
				if len(dir(res)) == 92: self.ent2wnet[obj_id].add(res.name())

				self.rel2wnet[rel_id] = self.rel2wnet.get(rel_id, set())
				res = lesk(sent, raw_rel) 
				if len(dir(res)) == 92: self.rel2wnet[rel_id].add(res.name())

		self.setHeading('Wordnet Entity Clusters')
		self.printCluster(self.ent2wnet, self.id2ent, 'm2ol')

		# for ent in self.ent_list: self.ent2wnet[self.ent2id[ent]] = [ele.name() for ele in lesk(ent)]
		# for rel in self.rel_list: self.rel2wnet[self.rel2id[rel]] = [ele.name() for ele in wordnet.synsets(rel)]

		self.setHeading('Wordnet Relation Clusters')
		self.printCluster(self.rel2wnet, self.id2rel, 'm2ol')
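Note on the check above: testing len(dir(res)) == 92 is a brittle way to detect whether lesk succeeded. nltk.wsd.lesk returns a WordNet Synset on success and None otherwise, so a plain None check (or an isinstance test) is the robust equivalent; a minimal standalone sketch:

from nltk.wsd import lesk
from nltk.corpus.reader.wordnet import Synset

sent = 'He sat on the bank of the river'.split()
res = lesk(sent, 'bank')
if isinstance(res, Synset):   # equivalent to: if res is not None
    print(res.name(), '-', res.definition())
else:
    print('no sense found')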
Example #2
def caseC(question_string, question_string_pos_tagged,
          noun_phrases_in_question, likely_sentence_string,
          likely_sentence_string_tokenized, noun_phrases_in_likely_sentence):
    second_word_in_question_tag = question_string_pos_tagged[1][1]
    if (second_word_in_question_tag in noun_tags):
        second_word_in_question_supersense = lesk(
            question_string, question_string_pos_tagged[1][0], 'n')
        second_word_in_question_supersense = second_word_in_question_supersense.lexname()
        super_sense_matches_in_likely_sentence = []
        for word in likely_sentence_string_tokenized:
            temp_supersense = lesk(likely_sentence_string, word, 'n')
            if (temp_supersense is None):
                continue
            temp_supersense = temp_supersense.lexname()
            if (temp_supersense == second_word_in_question_supersense):
                super_sense_matches_in_likely_sentence.append(word)
        if (len(super_sense_matches_in_likely_sentence) == 1):
            return super_sense_matches_in_likely_sentence[0]
        if (len(super_sense_matches_in_likely_sentence) > 1):
            first_np_in_likely_sentence = noun_phrases_in_likely_sentence[0]
            answer = closest_phrase_absolute(
                likely_sentence_string, first_np_in_likely_sentence,
                super_sense_matches_in_likely_sentence)
            return answer
        return None
    else:
        return None
Example #3
    def createAllNyms(self,sentence):
        wordTokens = word_tokenize(sentence)
        pos = pos_tag(wordTokens)

        hyper = {}
        hypo = {}
        mero = {}
        holo = {}

        index = 0
        for tokenOrig in wordTokens:
            #print(tokenOrig)
            if(pos[index][1] in self.wordnet_tag_map):
                token = lesk(wordTokens, tokenOrig, self.wordnet_tag_map[pos[index][1]])
            else:
                token = lesk(wordTokens, tokenOrig)
            index = index+1
            #print(token)
            hyper[token] = []
            hypo[token] = []
            mero[token] = []
            holo[token] = []
            if (token):
                if token.hypernyms():
                    hyper[token] = token.hypernyms()
                if token.hyponyms():
                    hypo[token] = token.hyponyms()
                mero[token] = token.part_meronyms()
                holo[token] = token.part_holonyms()

        return hyper, hypo, mero, holo
Example #4
def simplified_lesk(word, sent3, sent1, sent2):
    a = lesk(sent1, 'bank', 'n')
    b = lesk(sent2, 'bank', 'n')
    c = lesk(sent3, 'bank', 'n')
    if a == b and b == c:
        best_sense = b
    elif a == b and a == c:
        best_sense = a
    elif b == c and a == c:
        best_sense = c
    max_overlap = 0
    count_final = 0

    for word in sent3:
        count = 0
        if word in sent1:
            count = count + 1
        if count > count_final:
            max_overlap = 1

    for word in sent3:
        count = 0
        if word in sent2:
            count = count + 1
        if count > count_final:
            max_overlap = 2

    return max_overlap
Example #5
def my_lesk(tagged_strings, desired_word):
    """
    @tagged_strings: the words as a sequence of strings in the format 'word_pos'
    @desired_word: the word we want disambiguated
    Returns: - synset returned by lesk with part of speech (more accurate)
             - synset returned by lesk without pos specified if no pos (less accurate)
             - None if lesk returns nothing
    """
    normal_string = ''
    desired_tag = ''
    for tagged_string in tagged_strings:
        word, tag = tagged_string.rsplit("_", 1)

        # Reject non-ASCII characters
        try:
            word = word.decode('ascii')
        except (UnicodeDecodeError, UnicodeEncodeError):
            continue

        if word == desired_word:
            desired_tag = tag

        normal_string += word + ' '

    # ignore proper nouns and punctuation
    if desired_tag == 'NNP' or desired_tag == 'NNPS' or desired_tag in string.punctuation:
        return None

    # if the POS can be resolved to a wordnet POS, call lesk with POS
    # else call lesk without POS
    wn_pos = reduce_pos_tagset(desired_tag)
    if not wn_pos:
        return lesk(normal_string,desired_word)
    else:
        return lesk(normal_string, desired_word, wn_pos)
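reduce_pos_tagset is not shown in this snippet; it is assumed to collapse a Penn Treebank tag into one of WordNet's POS constants. A minimal sketch of such a helper (a plausible reconstruction, not the original), mirroring the wordnet_tag_map / get_wordnet_pos helpers assumed by other examples on this page:

from nltk.corpus import wordnet as wn

def reduce_pos_tagset(penn_tag):
    # Collapse a Penn Treebank tag to a WordNet POS ('n', 'v', 'a', 'r'); None if unmapped.
    mapping = {'NN': wn.NOUN, 'VB': wn.VERB, 'JJ': wn.ADJ, 'RB': wn.ADV}
    return mapping.get(penn_tag[:2])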
Example #6
    def createAllNyms(self, sentence):
        wordTokens = word_tokenize(sentence)
        pos = pos_tag(wordTokens)

        hyper = {}
        hypo = {}
        mero = {}
        holo = {}

        index = 0
        hyperd = {}
        hypod = {}
        merod = {}
        holod = {}
        for tokenOrig in wordTokens:
            #print(tokenOrig)
            if (pos[index][1] in self.wordnet_tag_map):
                token = lesk(wordTokens, tokenOrig,
                             self.wordnet_tag_map[pos[index][1]])
            else:
                token = lesk(wordTokens, tokenOrig)
            index = index + 1
            #print(token)
            hyper[token] = []
            hypo[token] = []
            mero[token] = []
            holo[token] = []

            if (token):
                print("")
                print(tokenOrig)
                print(token)
                if token.hypernyms():
                    #hyper.append(token.hypernyms())
                    hyperd[token] = token.hypernyms()
                    print("hypernyms")
                    print(token.hypernyms())
                    print("")
                else:
                    print("there are no hypernyms")
                if token.hyponyms():
                    #hypo[token] = token.hyponyms()
                    hypod[token] = token.hyponyms()
                    print("hyponyms")
                    print(token.hyponyms())
                    print("")
                else:
                    print("there are no hyponyms")
                #mero[token] = token.part_meronyms()
                merod[token] = token.part_meronyms()
                print("meronyms")
                print(token.part_meronyms())
                print("")
                #holo[token] = token.part_holonyms()
                holod[token] = token.part_holonyms()
                print("holonyms")
                print(token.part_holonyms())
                print("")

        return hyperd, hypod, merod, holod
Example #7
def lesk_similarity(triple_1, triple_2):
    triple_1 = triple_1.lower().split(' ')
    triple_2 = triple_2.lower().split(' ')
    triple_1 = [
        wnl.lemmatize(w.replace('^-1', '')) for w in triple_1 if w not in stop
    ]
    triple_2 = [
        wnl.lemmatize(w.replace('^-1', '')) for w in triple_2 if w not in stop
    ]

    triple_1 = [x for x in triple_1 if x != '']
    triple_2 = [x for x in triple_2 if x != '']

    count = 0
    for wrd_1 in triple_1:
        if count == 1:
            break
        for wrd_2 in triple_2:
            if wrd_1 == wrd_2 or wrd_1 in wrd_2 or wrd_2 in wrd_1:
                count += 1
                index_1 = triple_1.index(wrd_1)
                index_2 = triple_2.index(wrd_2)
                break
        if count == 0:
            return 0

    syn_1 = lesk(triple_1, triple_1[index_1])
    syn_2 = lesk(triple_2, triple_2[index_2])
    if syn_1 == syn_2:
        return 1
    else:
        return 0
Example #8
def wsd(sent, subj, obj):
    '''
    Performs word sense disambiguation with the lesk() method provided by WordNet and returns the
    synsets of the disambiguated words.
    :param sent: sentence from the corpus;
    :param subj: subject of the sentence;
    :param obj: object of the sentence;
    :return: synsets for the subject and the object.
    '''
    possible_subj = ["i", "you", "he", "she", "it", "we", "they"]
    sing_subj = ["i", "he", "she", "it"]
    plural_subj = ["we", "you", "they"]

    if subj in possible_subj or subj is None:
        if subj in plural_subj:
            ris = wn.synsets('people')[0]
        else:
            ris = wn.synsets('person')[0]
    elif subj is not None:
        ris = lesk(sent, subj)
        if ris is None:
            ris = wn.synsets('people')[0]
    else:
        ris = None
    if obj is not None:
        ris1 = lesk(sent, obj)
        if ris1 is None:
            #ris1 = wn.synsets('thing')[0]
            ris1 = wn.synsets('food')[0]
    else:
        ris1 = wn.synsets('food')[0]
        #ris1 = wn.synsets('thing')[0]

    #print("Ris ", ris, "Ris1 ", ris1)
    return ris, ris1
Example #9
def compute_supersense(tuple, sentence):
    if tuple[2] in personal_pronoun_tags:
        return 'noun.person' if tuple[0] != 'it' else 'noun.entity'
    if tuple[0] == 'who':
        return 'noun.person'
    if tuple[0] == 'what':
        return 'noun.entity'
    # Ex. in sentence: interest in how what people eat affects their health
    # ('what', 'obj', 'WP') what sense returns no sense
    # force to entity
    if tag_to_wnpos_map[tuple[2]] is not None:
        # get the list of WordNet POS mapped to the Penn tag and compute a synset with lesk for each of them
        possible_syns_by_pos = [
            x for x in [
                lesk(sentence, tuple[0], pos=pos)
                for pos in tag_to_wnpos_map[tuple[2]]
            ] if x is not None
        ]
        # use lesk again to disambiguate among the synsets found
        filler_sense = lesk(sentence, tuple[0], synsets=possible_syns_by_pos)
    else:
        filler_sense = lesk(sentence, tuple[0])

    filler_supersense = filler_sense.lexname() if filler_sense is not None else None
    return filler_supersense
Example #10
 def __setup_lesk(cls):
     try:
         lesk(word_tokenize('This device is used to jam the signal'), 'jam')
     except LookupError:
         nltk.download('punkt')
         nltk.download('wordnet')
         nltk.download('omw-1.4')
         lesk(word_tokenize('This device is used to jam the signal'), 'jam')
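With the corpora in place, lesk can be called directly. A minimal usage sketch (the sense actually chosen depends on the installed WordNet data):

from nltk.tokenize import word_tokenize
from nltk.wsd import lesk

sense = lesk(word_tokenize('This device is used to jam the signal'), 'jam')
if sense is not None:
    print(sense.name())         # a Synset name such as 'jam.v.01' (exact sense may vary)
    print(sense.definition())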
Example #11
    def assign_word_senses(self, word, partOfSpeech, sentenceIndex):

        if partOfSpeech == VERBS:
            wordSense = wn.synsets(word.word, pos=wn.VERB)
            if wordSense:
                word.wordSense = lesk(self.sentences[sentenceIndex], word.word, 'v')

        elif partOfSpeech == NOUNS:
            wordSense = wn.synsets(word.word, pos=wn.NOUN)
            if wordSense:
                word.wordSense = lesk(self.sentences[sentenceIndex], word.word, 'n')
Example #12
def disambiguate():
    """Returns the best synset if a word is ambiguous"""
    lesks = []
    nouns = o_tag()
    text = one_line()
    for noun in nouns:
        if len(wn.synsets(noun[0])) > 1:
            if lesk(text, noun[0], 'n'):
                lesks.append((noun[0], lesk(text, noun[0], 'n')))
        elif lesk(text, noun[0], 'n'):
            lesks.append((noun[0], wn.synsets(noun[0])))
    return lesks
Example #13
def apply_lesk(offset_sentic_dict):
    print('Applying Lesk algorithm started...')
    # making another offset_sentic_dict for rest of concepts and concat it to the existed one.
    # input: offset_sentic_dict["00044455-n"] = ['0.1', '0.1', '0.1', '0.1', #joy', '#surprise', 'positive', '0.726', 'appearance', 'start', 'casus_belli', 'beginning', 'egress'] // semantics might not included
    # output: last_offset_sentic_dict["00044455-n"] = ['0.1', '0.1', '0.1', '0.1', '#joy', '#surprise', 'positive', '0.726', 'appearance', 'start', 'casus_belli', 'beginning', 'egress']

    with open('vocabulary/affectnet_dict.pkl', 'rb') as f:
        aff_fr_dict = pickle.load(f)
    fr_offset_dict = dict()

    deleted_offset = []

    #direct mapped words were deleted before
    for word, value in senticnet.items():
        context = word
        found = False
        for i in range(8, 13):
            context += senticnet[word][i] + ' '
        try:
            synset = lesk(context, word)
            offset = str(synset.offset()).zfill(8) + '-' + synset.pos()
            found = True
        except AttributeError:
            # not found
            # sequentially, because it is arranged by c. similarity
            for v in value[8:13]:
                try:
                    synset = lesk(context, v)
                    offset = str(synset.offset()).zfill(8) + '-' + synset.pos()
                    found = True
                except AttributeError:
                    continue
        if found == False:
            continue
        # Direct mapped offset is not considered
        if offset in offset_sentic_dict:
            continue
        if offset in deleted_offset:
            continue

        if offset not in offset_sentic_dict:
            offset_sentic_dict[offset] = value
            fr_offset_dict[offset] = aff_fr_dict[word]
        else:
            if offset_sentic_dict[offset][6] != value[6]:
                del offset_sentic_dict[offset]
                deleted_offset.append(offset)
                continue
            else:
                offset_sentic_dict[offset] = weighted_sentic(offset_sentic_dict[offset], fr_offset_dict[offset], value, aff_fr_dict[word])
                fr_offset_dict[offset] += aff_fr_dict[word]

    return offset_sentic_dict
Example #14
def wsd(listOfWords, word, pos = None):
	
	if pos == None:
		syn = lesk(listOfWords, word)
		return [syn] if isSynset(syn) else []
	else:
		syns = []
		for x in pos:
			syn = lesk(listOfWords, word, x)
			if isSynset(syn):
				syns.append(syn)
		return syns
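isSynset is not defined in this snippet; a plausible one-line helper, assuming it simply type-checks the value returned by lesk:

from nltk.corpus.reader.wordnet import Synset

def isSynset(obj):
    # lesk returns a Synset on success and None on failure
    return isinstance(obj, Synset)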
Example #15
def disambiguate():
    """Returns the best synset if a word is ambiguous"""
    lesks = []
    nouns = o_tag()
    text = one_line()
    for noun in nouns:
        if len(wn.synsets(noun[0])) > 1:
            if lesk(text, noun[0], 'n'):
                lesks.append((noun[0], lesk(text, noun[0], 'n')))
        elif lesk(text, noun[0], 'n'):
            lesks.append((noun[0], wn.synsets(noun[0])))
    return lesks
Example #16
    def batch_matrix(self, dataset, begin_idx, end_idx):
        indice = range(begin_idx, end_idx)
        r = np.zeros((end_idx - begin_idx, self.sequence_length, self.sequence_length, 5))
        premise = np.array([dataset[i]['sentence1_binary_parse_index_sequence'] for i in indice])
        hypothesis = np.array([dataset[i]['sentence2_binary_parse_index_sequence'] for i in indice])
        premise_def, hypothesis_def = np.zeros((2, end_idx - begin_idx, self.sequence_length, self.sequence_length))
        mask_p, mask_h = np.ones((2,end_idx - begin_idx, self.sequence_length))
        mask_p_def, mask_h_def = np.ones((2,end_idx - begin_idx, self.sequence_length, self.sequence_length))
        labels = np.array([dataset[i]['label'] for i in indice])
        genres = np.array([dataset[i]['genre'] for i in indice])
        for i in range(begin_idx, end_idx):
            pre, hyp = dataset[i]['sentence1'].split()[:self.sequence_length], dataset[i]['sentence2'].split()[:self.sequence_length]
            mask_p[i - begin_idx, :len(pre)] = 1
            mask_h[i - begin_idx, :len(hyp)] = 1

            for i_p, p in enumerate(pre):
                # lesk: find the most possible sysnet of word p from word p and whole sentence pre
                tmp = lesk(pre, p)
                if tmp is not None:
                    # get p's definition
                    tmp = tmp.definition().strip('\'()').split()
                else:
                    continue
                # turn definition words into their corresponding vocabulary indices
                premise_def[i - begin_idx][i_p][:len(tmp)] = ([word_indices[i] if i in word_indices else 0 for i in tmp])[:self.sequence_length]
                mask_p_def[i - begin_idx][i_p][:len(tmp)] = 1
            
            for i_h, h in enumerate(hyp):   
                tmp = lesk(hyp, h)
                if tmp is not None:
                    tmp = tmp.definition().strip('\'()').split()
                else:
                    continue
                hypothesis_def[i - begin_idx][i_h][:len(tmp)] = ([word_indices[i] if i in word_indices else 0 for i in tmp])[:self.sequence_length]
                mask_h_def[i - begin_idx][i_h][:len(tmp)] = 1
            
            for i_p, p in enumerate(pre):
                p_syn = lesk(pre, p)
                if p_syn is None:
                    continue
                for i_h, h in enumerate(hyp):
                    h_syn = lesk(hyp, h)
                    if h_syn is None:
                        continue
                    r[i - begin_idx][i_p][i_h][0] = is_synonym(p_syn, h_syn)
                    r[i - begin_idx][i_p][i_h][1] = is_antonym(p_syn, h_syn)
                    r[i - begin_idx][i_p][i_h][2] = is_hypernym([p_syn], h_syn, 1)
                    r[i - begin_idx][i_h][i_p][3] = is_hypernym([p_syn], h_syn, 1)
                    r[i - begin_idx][i_p][i_h][4] = is_co_hypernym(p_syn, h_syn)
        return r, premise, hypothesis, premise_def, hypothesis_def ,labels, mask_p, mask_h, mask_p_def, mask_h_def, genres
Example #17
def sentiment_analysis_wsd(text_n_tagged_text):
    pos_tagged_text = text_n_tagged_text[0]
    text = text_n_tagged_text[1]
    pos_arr = []
    neg_arr = []
    subj_arr = []

    for obj in pos_tagged_text:
        if return_pos_sentiwordnet(obj[1]) == 0:
            continue
        pos = return_pos_sentiwordnet(obj[1])

        if lesk(text, obj[0], pos):
            syn = lesk(text, obj[0], pos)
            polarity = polarity_score_1(syn)
            subj = subjectivity_score_1(syn)
        else:
            polarity = polarity_score_2(obj[0], pos)
            subj = subjectivity_score_2(obj[0], pos)

        subj_arr.append(subj)
        if polarity > 0.0:
            pos_arr.append(polarity)
        elif polarity < 0.0:
            neg_arr.append(polarity)
        else:
            continue
        #print pos_arr
    if np.array(pos_arr).size == 0:
        pos_mean_score = 0.0
    else:
        pos_mean_score = round(np.mean(np.array(pos_arr)), 1)
    if np.array(neg_arr).size == 0:
        neg_mean_score = 0.0
    else:
        neg_mean_score = round(np.mean(np.array(neg_arr)), 1)
    subj_mean_score = round(np.mean(np.array(subj_arr)), 1)
    temp_neg_score = neg_mean_score * -1.0

    #if (pos_mean_score,neg_mean_score,subj_mean_score):
    #return (pos_mean_score,neg_mean_score,subj_mean_score)
    #else:
    #return (0,0.0)
    if pos_mean_score > temp_neg_score:
        return ('1', pos_mean_score + neg_mean_score, subj_mean_score)
    elif pos_mean_score < temp_neg_score:
        return ('-1', pos_mean_score + neg_mean_score, subj_mean_score)
    else:
        return ('0', 0.0, subj_mean_score)
Example #18
def get_semantic_features(tagged_tok, line):
    '''
    return features like synonyms, hypernyms, hyponyms, meronyms, holonymns
    extracted from each word of sentence
    '''
    lemma_sen = set()
    hyper_sen = set()
    hypo_sen = set()
    mero_sen = set()
    holo_sen = set()
    for word, tag in tagged_tok:
        if tag[:2] in WN_TAG_LIST:
            sense = lesk(line, word, pos=WN_TAG_LIST.get(tag[:2]))
            if not sense:
                continue
            for lem in sense.lemmas():
                lemma_sen.add(lem.name())
            for hyper in sense.hypernyms()[:30]:
                hyper_sen.add(hyper.name())
            for hypo in sense.hyponyms()[:30]:
                hypo_sen.add(hypo.name())
            for mero in sense.part_meronyms()[:30]:
                mero_sen.add(mero.name())
            for holo in sense.member_holonyms()[:30]:
                holo_sen.add(holo.name())
    return (' '.join(lemma_sen), ' '.join(hyper_sen), ' '.join(hypo_sen),
            ' '.join(mero_sen), ' '.join(holo_sen))
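A small usage sketch for get_semantic_features, assuming WN_TAG_LIST maps two-letter Penn tag prefixes to WordNet POS letters (for example {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}):

from nltk import pos_tag, word_tokenize

line = 'The committee approved the new budget'
tagged_tok = pos_tag(word_tokenize(line))
lemmas, hypernyms, hyponyms, meronyms, holonyms = get_semantic_features(tagged_tok, line)
print(lemmas)
print(hypernyms)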
Example #19
def getFeatures(tokensTagged, line):
    lemmaS = set()
    hyperS = set()
    hypoS = set()
    meroS = set()
    holoS = set()

    for word, tag in tokensTagged:
        if tag[:2] in nltkWnMap:  # and tag != 'NNP':
            sense = lesk(line, word, pos=nltkWnMap.get(tag[:2]))
            # sense = lesk(line, word)
            if not sense:
                continue
            for lem in sense.lemmas():
                lemmaS.add(lem.name())
            for hyper in sense.hypernyms()[:featureMaxLimit]:
                hyperS.add(hyper.name())
            for hypo in sense.hyponyms()[:featureMaxLimit]:
                hypoS.add(hypo.name())
            for mero in sense.part_meronyms()[:featureMaxLimit]:
                meroS.add(mero.name())
            for holo in sense.member_holonyms()[:featureMaxLimit]:
                holoS.add(holo.name())
    return (' '.join(lemmaS), ' '.join(hyperS), ' '.join(hypoS),
            ' '.join(meroS), ' '.join(holoS))
Example #20
def word_synonyms(word, sentence):
    synset = lesk(sentence, word)
    if synset is None:
        return None
    lemmas = synset.lemmas()
    lemmas = [str(lemma.name()) for lemma in lemmas]
    return lemmas
Example #21
    def __subtree__(self, row):
        """
        Find subtree pattern like X ->R<- Y and generation heuristic based score 
        where 
        X: Subject
        R:Relation
        Y:Object
        
        """
        corpus = row["corpus"]
        output = []
        tokens = row["tokens"]

        # iterate through all the tokens in the input sentence
        for i, sent in enumerate(corpus):
            r = ''
            for tok in sent:
                # # extract subject
                #     if tok.dep_.find("subjpass") == True:
                #         y = tok.text

                # # extract object
                #     if tok.dep_.endswith("obj") == True:
                #         x = tok.text
                # # extract relation
                if tok.dep_ == "ROOT":
                    r = lesk(tokens[i], tok.text, get_wordnet_pos(tok.tag_))
                    if r is not None:
                        r = r.name()
                    break
            output.append(r)

        return output
Example #22
    def get_replacement(self, token_with_tag, tokens):
        word = token_with_tag[0]
        synset_lock = Lock() 
        synset_lock.acquire()
        synset = lesk(tokens, word)
        synset_lock.release()

        if synset is None:
            return None
        
        lemmas = synset.lemmas()
         # find a replacement
        replacement = None
        for lemma in lemmas:
            lemma_name = lemma.name()
            lemma_pos = lemma.synset().pos()
            if (lemma_pos == 'n') and lemma_name != word:
                replacement = lemma_name
                break
        
        if replacement is None:
            #replacement = word
            return None

        return replacement.replace("_", " ")
Example #23
def canonicalise(extractions):
    for ext in extractions:
        ext = extractions[ext]

        # relation synsets
        rel_synsets = set([])
        rel_root = set([])
        if ext['object']:
            sentence = ext['subject'] + ' ' + ext['relation'] + ext['object']
        else:
            sentence = ext['subject'] + ' ' + ext['relation']
        doc = nlp(ext['relation'])
        for token in doc:
            if token.pos_ == 'VERB' and token.text not in [
                    'will', 'shall', 'may', 'must', 'can', 'could'
            ]:
                try:
                    rel_synsets.add(lesk(sentence, token.text, 'v').name())
                    rel_root.add(token.lemma_)
                except:
                    print(
                        'ERROR:',
                        token.lemma_,
                    )
        ext['rel_synsets'] = list(rel_synsets)

        # entity canonicalisation
        ext['subject'] = entity_canonicalisation(ext['subject'])
        ext['object'] = entity_canonicalisation(ext['object'])
        for m in ext['modifiers'] + ext['subject_modifiers']:
            m['m_obj'] = entity_canonicalisation(m['m_obj'])

    return extractions
Example #24
def findCategories(tokens, tags, nouns, verbs):
    for word in tokens:
        if not lesk(tokens, word):
            continue
        if lesk(tokens, word).pos() == 'n':
            category = lesk(tokens, word).lexname()
            if category not in nouns.keys():
                nouns[category] = 1
            else:
                nouns[category] += 1
        elif lesk(tokens, word).pos() == 'v':
            category = lesk(tokens, word).lexname()
            if category not in verbs.keys():
                verbs[category] = 1
            else:
                verbs[category] += 1
Example #25
def get_tokens_POS(sentence_token_complexity_pairs):
    stemmer = SnowballStemmer(LANGUAGE)
    tokens_POS = [(sentence, lesk(sentence, str(token)), complexity) for sentence, token, complexity in sentence_token_complexity_pairs]

    for i in range(len(tokens_POS)):
        sentence, token, complexity = tokens_POS[i]
        if token is None:
            word = sentence_token_complexity_pairs[i][1]
            token = lesk(sentence, stemmer.stem(word))

            if token is None:
                tokens_POS[i] = (sentence, word, complexity)
            else:
                tokens_POS[i] = (sentence, token, complexity)

    return [(sentence, token, complexity) for sentence, token, complexity in tokens_POS]
Example #26
def transform_tag(tag, word, words):
    synset = lesk(words, word, "n")
    if synset:
        if tag == "ORGANIZATION" or tag == "PERSON":
            return tag[:3]
        elif tag == "LOCATION":
            paths = synset.hypernym_paths()
            for path in paths:
                for synset in path:
                    name = synset.name()
                    if "city" in name or "town" in name:
                        return "CIT"
                    elif "country" in name or "state" in name:
                        return "COU"
            return "NAT"
        elif tag == "MISC":
            paths = synset.hypernym_paths()
            for path in paths:
                for synset in path:
                    name = synset.name()
                    if "animal" in name:
                        return "ANI"
                    elif "sport" in name:
                        return "SPO"
                    elif "entertainment" in name:
                        return "ENT"
            return ""
        else:
            return ""
    else:
        return ""
Example #27
def query_expanded(list_of_words, weight, word_similarity = None):
	"""
	Expand the query using various word relations (synonyms, hypernyms or hyponyms)
	"""
	count = 0
	expanded_query = []
	for x in list_of_words:	
		expanded_query.extend([x for i in range(weight)])
		# # WSD
		syn = lesk(list_of_words, x)
		try:
			for l in syn.lemmas() :
				# if(count<3):
				if l.name() not in expanded_query:
					expanded_query.append(l.name())
					count+=1
			# for hyp in syn.hypernyms():
			# 	for hyp_lemma in hyp.lemmas():
			# 		if hyp_lemma.name() not in expanded_query:
			# 			expanded_query.append(hyp_lemma.name())
			# 			count+=1
			# for hyp in syn.hyponyms():
			# 	for hyp_lemma in hyp.lemmas():
			# 		if hyp_lemma.name() not in expanded_query:
			# 			expanded_query.append(hyp_lemma.name())
			# 			count+=1
		except:
			pass

	return expanded_query
Example #28
def ExtendText(fileName, tagger=PerceptronTagger()):
    with io.open(fileName, 'r') as w:
        text = TextBlob(w.read(), pos_tagger=tagger)
        extended_text = []
        for sent in text.sentences:
            for word in sent.pos_tags:
                #word = "bank"
                penn_tags = ['JJ', 'NN', 'V']
                extending = False
                for tag in penn_tags:
                    if tag in word[1]:
                        extending = True
                        pos = tag[0].lower()
                        try:
                            l = lesk(sent.string, word[0].lower(), pos)
                            syns = l._lemma_names
                            for syn in syns:
                                extended_text.append(syn)
                            break
                        except:
                            extended_text.append(word[0].lower())
                if not extending:
                    extended_text.append(word[0].lower())
        extended_text = ' '.join([
            word for word in extended_text if word not in cachedStopWords
        ]).lstrip()
        return extended_text
Example #29
def main(questionLoc, categoriesLoc, featuresLoc):
    categories, subCategories = [],[]
    categories, subCategories = getCategories(categoriesLoc)
    questions = loadQuestions(questionLoc) 
    features = getFeatureLists(featuresLoc)
    finalString = ''
    
    for i in range(len(questions)):
        disambigSense = None
        whWord = features[i][0]
        headWord = features[i][1]
        if headWord == 'null':
            headWord = None
        label = features[i][2]
        label = convertPOS(label)
        question = questions[i].strip()
        uni = GetUnigram(question)
        bi = GetBigram(question)
        tri = GetTrigram(question)
        wordShape = GetWordShape(uni)
        if headWord != None and ':' not in headWord:
            disambigSense = wsd.lesk(question, headWord, pos=label)
            if disambigSense:
                directHypernym = getHypernym(question, 5, disambigSense)
                indirectHypernym = mostSimilarCategory(disambigSense, subCategories)
        print whWord, headWord#, disambigSense, wordShape, uni, bi, tri
Example #30
def desambiguar(men):
    # Replace the characters so they do not cause problems
    global cont
    
    # Split the text into sentences
    frases = sentence_tokenizer.tokenize(men)
    
    for frase in frases:
        
        nfrase = frase  # new sentence
        
        # Process the sentence
        doc = nlp(frase)
        cont = 0
        # Look for the ambiguous words in the sentence
        for token in doc:
            cont = nfrase.find(token.text,cont)
            
            # Try to disambiguate
            synset = lesk(doc, token.text)
            if synset and len(wn.synsets(token.text)) > 1:
                nfrase = creaPildora(token.text, synset.name(), nfrase)
            else:
                cont += len(token.text)
        men = men.replace(frase,nfrase,1)
    
    return men
Example #31
def synonym_paraphrase(words, xpos):
    verb_labels = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adj_labels = ['JJ', 'JJR', 'JJS']
    noun_labels = ['NN', 'NNP', 'NNS', 'NNPS']
    adv_labels = ['RB', 'RBR', 'RBS']
    synonyms = {}
    for i in range(len(words)):
        ### get pos tag
        pos_tag = None
        if (xpos[i] in verb_labels):
            pos_tag = 'v'
        if (xpos[i] in adj_labels):
            pos_tag = 'a'
        if (xpos[i] in noun_labels):
            pos_tag = 'n'
        if (xpos[i] in adv_labels):
            pos_tag = 'r'
        if (pos_tag == None):
            continue
        ### get morphy of word
        morphy = wn.morphy(words[i], pos_tag)
        if (morphy == None):
            continue
        ### get all synonyms
        meaning = lesk(' '.join(words), morphy, pos_tag)
        if (str(meaning) == 'None'):
            continue
        syns = meaning.lemma_names()
        synonyms[morphy] = syns

    return synonyms
Example #32
def obtener_mejor_definicion(tokens, sustantivo):
    definiciones = encontrar_definicion(sustantivo)
    if len(definiciones) == 1:
        return definiciones[0]['definicion']
    else:
        synset = lesk(tokens, sustantivo, 'n')
        return synset.definition()
Example #33
def mostcommonsyns(row):
	#print "Evaluating new row"
	record, stop_words = row
	text = record['text']
	#print "------------------\n\n", text
	#print "hi len: ", len(text)
	stopWords = get_stopwords()
	sentList = nltk.sent_tokenize(text)
	#print "hi2"
	wordsInSentsPos = [nltk.pos_tag(nltk.word_tokenize(s)) for s in sentList]
	#print "hi3"
	wordsInSentsWnPos = [[(w[0],penn2morphy(w[1])) for w in s if w[0].lower() not in stopWords] for s in wordsInSentsPos]
	#print "h4"
	#the above returns a list of sentences where each sentence is a list of
	#(word-as-string, pos tag) tuples. Stop words are removed here because pos_tag
	#uses grammatical structure but lesk does not. 
	#This would also be the place to lemmatize, which will help lesk out.
	synsetsList = [lesk(s,w[0],w[1]) for s in wordsInSentsWnPos for w in s]
	#print "go"
	#print synsetsList, "\n\n"
	#return [3,2,2,2]

	#res = FreqDist([x for x in synsetsList if x is not None])
	res = [s.name() for s in synsetsList if s is not None]
	return res
Example #34
def main():
	wikiDict = pickle.load(open('wikis.pickle','rb'))
	amountList = []
	for value in wikiDict.values():
		value = re.sub(r'\[[0-9]*\]',"",value)
		text = sent_tokenize(value)
		for sent in text:
			tokenized = word_tokenize(sent)
			pos = pos_tag(tokenized)
			for token, tag in pos:

				if (tag == 'NNPS' or tag == 'NNP' or tag == 'NNS' or tag == 'NN'):
					tag = "n"
					if len(wn.synsets(token, tag)) > 1:
						print(token, lesk(sent, token, tag))
						print("All possible senses:")
						n = 0
						for ss in wn.synsets(token, tag):
							print(ss, ss.definition())
							n += 1
						amountList.append(int(n))
						#print()
	#print(amountList)
	
	c2 = Counter(amountList)
	print(c2)
Example #35
def ExtendText(fileName,tagger=PerceptronTagger()):
	with io.open(fileName, 'r') as w:
		text = TextBlob(w.read(), pos_tagger=tagger)
		extended_text = []
		for sent in text.sentences:		
			for word in sent.pos_tags:
				#word = "bank"
				penn_tags = ['JJ','NN','V']
				extending = False
				for tag in penn_tags:
					if tag in word[1]:
						extending = True
						pos = tag[0].lower()
						try:
							l = lesk(sent.string, word[0].lower(), pos)
							syns = l._lemma_names
							for syn in syns:
								extended_text.append(syn)
							break
						except:
							extended_text.append(word[0].lower())
				if not extending:
					extended_text.append(word[0].lower())
		extended_text = ' '.join([word for word in extended_text if word not in cachedStopWords]).lstrip()
		return extended_text
Example #36
def abstraction_score(text, uselesk = False):
    """Takes in a list of sentences, tags the parts of speech from the word tokenized sentence,
    looks for part of speech beginning with N (a type of noun) returns the mean abstraction score by
    calling calculate_scores.

    Optional, uses the default lesk algorithm from NLTK
    """
    nouns = []

    for sent in text:
        # keep each noun paired with the sentence it came from, so lesk sees the right context
        nouns.append([(token, sent) for token, pos in pos_tag(word_tokenize(sent)) if pos.startswith('N')])
#         print nouns
    nouns = [item for sublist in nouns for item in sublist]

    scores=[]
    for i, noun_sent in nouns:
        if uselesk:
            y = lesk(noun_sent, i)
            if y is not None:
                #print i, calculate_scores(y)
                scores.append(calculate_scores(y))
        else:
            by_word=[]
            synsets = wn.synsets(i, "n")
            for synset in synsets:
                by_word.append(calculate_scores(synset))

            absword = np.mean(by_word)
            scores.append(absword)
            #print absword
    scores = np.array(scores)
    #print scores
    return scores[~np.isnan(scores)].mean()
Example #37
 def analyser(doc):
     expand_tokens = []
     expand_Lemms = []
     synsets = []
     # apply the preprocessing and tokenzation steps
     sent_text = nltk.sent_tokenize(doc)
     doc_clean = self.build_preprocessor()(doc)
     tokens = self.build_tokenizer()(doc_clean)
     # use CountVectorizer's _word_ngrams built in method
     # to remove stop words and extract n-grams        
     n_grams = list(set(self._word_ngrams(tokens,self.get_stop_words())))
     for x in n_grams:
         my_regex = r"\b(?=\w)" + re.escape(x) + r"\b(?!\w)"
         matched_sent = [s for s in sent_text if len(re.findall(my_regex, s, re.IGNORECASE)) > 0]
         expand_tokens.append(x)
         expand_Lemms.append(matched_sent)
     Expanded_Token_Lemmd = pd.DataFrame({'Word': expand_tokens,'Sentences': expand_Lemms})
     for i in range(0, len(Expanded_Token_Lemmd.index)):
         for row_len in range(0, len(Expanded_Token_Lemmd.iloc[i]['Sentences'])):
             x = lesk(Expanded_Token_Lemmd.iloc[i]['Sentences'][row_len],Expanded_Token_Lemmd.iloc[i]['Word'].lower().replace(' ','_'))
             if not x:
                 synsets.append(Expanded_Token_Lemmd.iloc[i]['Word'].lower())
             else:
                 synsets.append(str(x)[8:-2])
     return(synsets)
Example #38
def lexsense(word, context='') :
    if context:
        sense = lesk(context,word,pos=wn.NOUN);
        if sense:
            return {sense}

    return set(wn.synsets(word, pos=wn.NOUN))
Example #39
    def calculate_probable_synset(self, sentence: List[str]):
        if self.probable_synset:
            return

        self.probable_synset = lesk(sentence,
                                    self.token.text,
                                    pos=_token_pos_to_nltk_pos(self.token))
Example #40
def main(file):

    # Stuff for unicode decode errors
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Get the text of the  URL
    text = getText(file[1])
    tokens = nltk.word_tokenize(text)

    # POS tag words
    taggedWords = nltk.pos_tag(tokens)

    # Filter all nouns
    nouns = [(word, tag) for word, tag in taggedWords if tag.startswith("N")]

    # Set values for the answers
    amountOfPolysemousWords = 0
    amountOfSenses = 0
    listofsenses = []

    # Get wordnet synsets
    for word, tag in nouns:

        # Get the amount of senses
        senses = len(wordnet.synsets(word, "n"))

        # Check if the word is polysemous
        if senses > 1:

            # Count the polysemous words and senses
            amountOfPolysemousWords = amountOfPolysemousWords + 1
            amountOfSenses = amountOfSenses + senses

            listofsenses.append(senses)

    # Answer for question 1
    print("For this file, there are {} polysemous word".format(amountOfPolysemousWords))

    # Answer for question 3
    averageSenses = amountOfSenses/amountOfPolysemousWords
    print("For this file, the average senses are {} per polysemous word".format(averageSenses))

    # Answer for question 4
    result = Counter(listofsenses)
    print(result)

    # Answer for question 5
    words = ["cars", "quantity", "carbon", "states", "change", "life"]
    pos = "n"
    textObject = nltk.Text(tokens)
    for sent in sent_tokenize(text):
        for word in words:
            context = textObject.concordance(word)
            print("\n\n" + str(context))
            print ("\n\n The result of algorithm is: " + str(lesk(sent, word, pos)))
            print("\n\n All possible senses for " + word + ":")
            for ss in wordnet.synsets(word, "n"):
                print(ss, ss.definition())
Example #41
def disambiguate(context, word, pos):
    """
    Word sense disambiguation using Lesk algorithm
    @context: a string containing the word whose meaning we want to disambiguate
    @word: the word we want to disambiguate
    @pos: the part of speech
    Returns: the Synset of the most likely meaning of the word
    """
    return lesk(context, word, pos)
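A minimal usage sketch for this wrapper; the context is passed pre-tokenized, which is the form nltk's lesk expects:

context = 'I went to the bank to deposit my money'.split()
sense = disambiguate(context, 'bank', 'n')
if sense is not None:
    print(sense.name(), sense.definition())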
Example #42
def leskify(file):
    with(open(file, 'r')) as f:
        contents = f.read()
        words, describingWord = wordify(file)
        meaning = {}
        for word in words:
            meaning[word] = lesk(contents, word)
            # meaning.append((word, lesk(contents, word)))
        return meaning, describingWord
Example #43
def define_func(nick,match,target):
	word=match.group('word')
	sentence=match.group('sentence')
	meaning=match.group('meaning')
	lang=match.group('lang')	
	sentiment=match.group('sentiment')!=None
	if sentence:
		separator=re.compile("[.\s,]+")
		words=re.split(separator,sentence)
		for w in words:
			synsets=wn.synsets(w)
			if synsets!=[]:
				word=w
				break
		if synsets!=[]:
			synset=lesk(words,word)
			define_synset(nick,word,synset,target,lang,sentiment)
		else:
			mb.tell(nick+": what the flying f**k does that even mean",target)		
		return
	if word:
		synsets=wn.synsets(word)
		if synsets==[]:
			mb.tell(nick+': no idea what "'+word+'" is. probably something gay. like you',target)
		elif len(synsets)==1:
			define_synset(nick,word,synsets[0],target,lang,sentiment)
		else:
			choose=[]
			temp_choose=[]
			for synset in synsets:
				name=None
				for hypernym in synset.hypernyms():
					for lemma in hypernym.lemmas():
						name=lemma.name()
						break
					if name:
						break
				if not name:
					for lemma in synset.lemmas():
						name=lemma.name()
						break
				temp_name=name
				if name in temp_choose:
					name=name+"("+str(temp_choose.count(name))+")"
				if meaning:
					if meaning.upper()==name.upper():
						define_synset(nick,word,synset,target,lang,sentiment)
						return
				temp_choose.append(temp_name)
				choose.append(name)		
			words="|".join([re.escape(x).replace("_","[_\s]+") for x in choose])
			message=", ".join(choose)
			pattern="^(?:murderb[o0]t[,\s:!]+)?(?:(?:(?:(?:who|what)(?:\s+am|\s+is|\s+are|'s|'re|'m|s|re)(?:\s+a|\s+the)?)|(?:define))\s+)?(?:"+word+"\s+)?(?:as\s+in\s+)?(?P<word>{0})$".format(words);
			clarify=re.compile(pattern,flags=re.IGNORECASE)
			response={'nick':nick,'func':clarify_func,'pattern':clarify,'param':{'sent':sentiment,'lang':lang,'synsets':synsets,'words':[x.upper().replace("_","") for x in choose],'word':word},'target':target}
			mb.responses['define']=response
			mb.tell(nick+": "+word+" as in "+message+"?",target)
	return
Example #44
def extract_nouns_info(sentence):
    nouns_info = []

    tokens = word_tokenize(sentence)
    tagged_words = pos_tag(tokens)
    for (word, tag) in tagged_words:
        if('NN' in tag):
            pos = 'n'
            wsd = lesk(tokens, word, pos)
            if wsd is None:
                # lesk found no sense for this noun; skip it
                continue
            syns = wsd.lemma_names()

            nouns_info.append( (word, wsd, syns) )

    return nouns_info
Example #45
def lexclass(word, context=''):
    rslt = set()

    if context:
        sense = lesk(context,word,pos=wn.NOUN);
        if sense:
            rslt.add(sense.lexname())
            return rslt

    sets = wn.synsets(word, pos=wn.NOUN)
    for s in sets:
        rslt.add(s.lexname())

    return rslt
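A short usage sketch; lexname() strings are WordNet supersenses such as 'noun.possession' or 'noun.object' (the sense picked in context depends on the installed WordNet data):

print(lexclass('bank'))   # supersenses of every noun sense of "bank"
print(lexclass('bank', 'He sat on the bank of the river'.split()))   # supersense of the sense lesk picks in context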
Example #46
	def disambiguate(self, wsd_instance):
		"""
		Disambiguates the given instance, returning the predicted lemma sense
		key.

		@param wsd_instance - WSDInstance to disambiguate
		@return Sense key as a string if a sense could be found, None
			otherwise
		"""

		syn = lesk(wsd_instance.context, wsd_instance.lemma, 'n')
		if syn is not None:
			return to_sense_key(syn)
		
		return None
Example #47
    def getWordSentimentTuple(self, word, pos, wordlist):
        if wordlist != "sentiwordnet":
            raise InvalidDictionaryException("Invalid dictionary " + wordlist + \
                    "please use sentiwordnet")
        else:
            simplePOS = convertPOSTagToSimplePOS(pos)
            if pos:
                wordSense = lesk(self.tokens, word, simplePOS)
                if wordSense: 
                   sentiSynsetWord = swn.senti_synset(wordSense.name())
                   if sentiSynsetWord:
                       return (sentiSynsetWord.pos_score(), 
                               sentiSynsetWord.neg_score(),
                               sentiSynsetWord.obj_score())

        return (0, 0, 0)
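Here swn is assumed to be nltk.corpus.sentiwordnet and convertPOSTagToSimplePOS a Penn-to-WordNet mapping like the helpers above. A minimal sketch of the same lesk-then-SentiWordNet hand-off outside the class (requires the sentiwordnet corpus):

from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk

tokens = 'The food was good and the service was fast'.split()
sense = lesk(tokens, 'good', 'a')
if sense is not None:
    senti = swn.senti_synset(sense.name())
    print(senti.pos_score(), senti.neg_score(), senti.obj_score())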
Example #48
def synonym_picker3(word, wpos, sentence, npos):
    #print(word, pos, sentence)
    #print(word, " ", npos)
    start_synset = lesk(sentence, word, wpos)
    synonyms = []
    #start_synset.res_similarity()
    if (start_synset):
        #print(start_synset.lemmas())
        for l in start_synset.lemmas():
            if l.name() not in synonyms and l.name() != word:
                synonyms.append(l.name().replace('_', ' '))
        if len(synonyms) > 0:
            return random.choice(list(synonyms))
        else:
            return word
    else:
        return word
Example #49
 def analyze(self, doc):
     res = []
     for sentence in self.normalizer.sent_tokenize(doc):
         tagged_sentence = self.tagger.tag(self.normalizer.split_and_normalize(sentence))
         lemmatized_doc = []
         for w, pos in tagged_sentence:
             try:
                 pos_ = pos[:1]
                 wn_postag = self.translation_dict[pos_]
             except KeyError:
                 wn_postag = None
             if wn_postag:
                 lemmatized_doc.append(self.lem.lemmatize(w, wn_postag))
         for w in lemmatized_doc:
             sense = wsd.lesk(lemmatized_doc, w)
             if sense:
                 res.append(sense.name())
     return res
Example #50
def get_Candidate_Frequency_from_wordnet(word, tag, context):
    wordnet_tag = get_wordnet_pos(tag)
    sent = list(context)
    #print("context is")
    #print(sent)
	#syns = lesk(sent, word, wordnet_tag)
    syns = lesk(sent, word)

    res = []
    if syns:
        for l in syns.lemmas():
            if l:
                lemma_name = str(l.name())
                st = LancasterStemmer()
                if ((st.stem(word) != st.stem(lemma_name))):
                    res.append(lemma_name)


    candidate_list = list(set(res))
    #print("for word: " + word)
    #print(candidate_list)

    '''
    if candidate_list:
        for c in candidate_list:
            allsyns1 = set(ss for ss in wordnet.synsets(c))
            print(allsyns1)
            allsyns2 = set(ss for ss in wordnet.synsets(word))
            print(allsyns2)
            best = max((wordnet.wup_similarity(s1, s2) or 0, s1, s2) for s1, s2 in product(allsyns1, allsyns2))
            print(best)
        '''


    '''
    for c in candidate_list:
        wordsFromList1 = wordnet.synsets(word)
        wordsFromList2 = wordnet.synsets(c)
        if wordsFromList1 and wordsFromList2:  # Thanks to @alexis' note
            s = wordsFromList1[0].wup_similarity(wordsFromList2[0])
            similarity_list.append(s)
    '''
    return candidate_list
Example #51
def main():
    path = "group9/"
    dirs = ["p34", "p35"]
    number_of_ss = 0
    synsets = []

    for directory in dirs:
        for directory2 in os.listdir(path+directory):
            for filename in os.listdir(path+directory+"/"+directory2):
                if filename.endswith(".tok.off.pos.ent"):
                    with open(os.path.join(path, directory+"/"+directory2, filename), 'r') as fname:
                        output = open(os.path.join(path, directory+"/"+directory2, "output_disambiguation.txt"), 'w')

                        ambiguous_words = []
                        ambiguous_lines = []
                        text_words = []


                        for line in fname:
                            split_line = line.split()
                            text_words.append(split_line)
                            l = line.split()
                            if l[4] == "NN" or l[4] == "NNP":
                                if len(wordnet.synsets(l[3], "n")) > 1:
                                    number_of_ss += len(wordnet.synsets(l[3], "n"))
                                    synsets.append(len(wordnet.synsets(l[3], "n")))
                                    ambiguous_words.append((l[2], l[3]))
                        for word in ambiguous_words:
                            start = int(str(word[0][0]) + "001")
                            end = start + 999
                            lines = []
                            for l in text_words:
                                if start <= int(l[2]) <= end:
                                    lines.append(l[3])
                            ambiguous_lines.append(lines)
                        for i in range(len(ambiguous_words)):
                            ss = lesk(ambiguous_lines[i], ambiguous_words[i][1], "n")
                            outputwrite = str((ss, ss.definition())) + "\n"
                            output.write(outputwrite)
    c = Counter(synsets)

    print(sorted(c.items(), key=lambda pair: pair[0], reverse=True))
    print(number_of_ss)
Example #52
def mostSimilarCategory(theSense, theCategories):
    maxValue = 0
    catName = ''
    posList = None
    for cat in theCategories:
        for key, values in cat.iteritems():
            posList = nltk.pos_tag(values)
            index = 0
            contextSentence = ' '.join(values)
            for word in values:#this is each word in the category     
                catSense = wsd.lesk(contextSentence, word)
                if catSense:
                    similarityValue = wn.path_similarity(catSense, theSense)
                    if similarityValue!=None:
                        if  similarityValue > maxValue:
                            maxValue = similarityValue
                            catName = key
                index += 1
    return catName
Example #53
def getSenseLocs(words, sentence):
    senseLocs = {}
    token_sent = nltk.word_tokenize(sentence)
    positions = matcher.getPositions(words, token_sent)
    tagged = [(word, get_wordnet_pos(pos)) for word, pos in nltk.pos_tag(token_sent)]
    for key, value in positions.items():
        
        word = tagged[value][0]
        pos = tagged[value][1]
        #print(word)
        #print(pos)
        syns = wordnet.synsets(word, pos=pos)
        #print(syns)
        #print(token_sent)
        if len(syns) > 0:
            sense = lesk(token_sent, word, pos)
            if sense:
                senseLocs[str(pos)+"_senseLoc"] = syns.index(sense)
    return senseLocs
Example #54
def combineTags(sentence):
	""" Adds the tags to the testfile """
	#pbar = ProgressBar()
	refDict=defaultdict(list)
	sentList = []
	#for sentence in taggedText:
	for i, words in enumerate(sentence):
		if i != 0:
			prevword = sentence[i-1][0]
			prevtag = sentence[i-1][1]
		else:
			prevword=prevtag=''
		if words[1] == prevtag:
			newTuple = (newTuple[0]+' '+words[0],words[1])
			sentList.pop()
			sentList.append(newTuple)
			refDict[i+1,words].append(newTuple)

		else: 
			newTuple = (words[0],words[1])
			sentList.append(newTuple)
			refDict[i+1,words].append(newTuple)

	for i, words in enumerate(sentList):
		if i != 0 and i < len(sentList)-1:
			sent=sentList[i-1][0],words[0],sentList[i+1][0]
			if words[1]!= 'O':
				mwords=words[0].replace(' ','_')
				if len(wn.synsets(mwords, 'n')) > 1:
					leskDec=lesk(word_tokenize(' '.join(sent)), mwords, 'n')
					for value in refDict.values():
						if value[0] == words and len(value) < 2:
							value.append([mwords,leskDec,leskDec.definition()])
				else:
					for ss in wn.synsets(mwords, 'n'):
						for value in refDict.values():
							if value[0] == words and len(value) < 2:
								value.append([mwords,ss, ss.definition()])
	return refDict
Example #55
import sys
import pprint
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from collections import Counter
meaning = []

# with open('resume.txt', 'r') as resume:
#     for line in resume:
#         #pprint.pprint(line.split())
#         # tokens = nltk
#         pprint.pprint(1)

if __name__ == '__main__':
    job_des_words = []
    resume_words = []
    if(3 == len(sys.argv)):
        script, job_des_file, resume_file = sys.argv

        with(open(job_des_file)) as jdf:
            description = jdf.read()
            job_des_words = description.split(" ")
        with(open(resume_file)) as rf:
            resume = rf.read()
            resume_words = resume.split(" ")
        print(job_des_words[0])
        print(lesk(description, job_des_words[0]))
    else:
        print("No input present, exiting.")
Example #56
def disAmbi(sents, ambi_nouns):
    noun_dict = dict()
    for noun in ambi_nouns:
        noun_dict[noun] = lesk(sents, noun, "n")
    return noun_dict
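A minimal usage sketch, assuming the sentence has already been tokenized the way the caller of disAmbi expects:

from nltk.tokenize import word_tokenize

sents = word_tokenize('He cast his line from the bank of the river')
print(disAmbi(sents, ['bank', 'line']))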
Example #57
def similarity(string1,string2):

        #split the string into sentences and sentences into words
        sentences1=[d for d in re.split('\.\W',string1)]
        sentences2 = [d for d in re.split(r'\.\W', string2)]

        # split further on '|' separators and flatten
        sentences1 = [d for e in (s.split('|') for s in sentences1) for d in e]
        sentences2 = [d for e in (s.split('|') for s in sentences2) for d in e]

        # stop word list (computed but not applied below)
        stop = stopwords.words('english')

        # strip punctuation, newlines and tabs
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        sentences1 = [regex.sub('', c).replace('\n', '').replace('\t', '') for c in sentences1]
        sentences2 = [regex.sub('', c).replace('\n', '').replace('\t', '') for c in sentences2]

        totalsimilarity = 0
        for sent1 in sentences1:

            sentence1 = sent1.split()

            for k, ktag in nltk.pos_tag(sentence1):

                maxsimilarity = 0

                # disambiguate the word in its sentence context; if Lesk finds
                # nothing, fall back to the last WordNet synset for the word
                syn1 = lesk(sentence1, k)
                if syn1 is None:
                    synsets_k = wn.synsets(k)
                    if synsets_k:
                        syn1 = synsets_k[-1]

                if syn1 is not None:

                    for sent2 in sentences2:

                        sentence2 = sent2.split()

                        for j, jtag in nltk.pos_tag(sentence2):

                            sim = 0

                            syn2 = lesk(sentence2, j)
                            if syn2 is None:
                                synsets_j = wn.synsets(j)
                                if synsets_j:
                                    syn2 = synsets_j[-1]

                            if syn2 is not None:
                                ps = syn1.path_similarity(syn2)
                                ws = syn1.wup_similarity(syn2)
                                ls = 0
                                # lch_similarity is only defined for synsets with the same POS
                                if syn1.name().split('.')[1] == syn2.name().split('.')[1]:
                                    ls = syn1.lch_similarity(syn2)
                                if ps is not None:
                                    sim = sim + ps
                                if ws is not None:
                                    sim = sim + ws
                                if ls is not None:
                                    sim = sim + ls
                                if maxsimilarity < sim:
                                    maxsimilarity = sim

                # accumulate the best score found for this word of sentence 1
                totalsimilarity = totalsimilarity + maxsimilarity

        return totalsimilarity
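The loop above scores every word pair across the two sentence sets by disambiguating each word with Lesk and summing path, Wu-Palmer and (for same-POS synsets) Leacock-Chodorow similarities, keeping the best match per word. A minimal, self-contained sketch of that per-pair scoring follows; the helper name pair_similarity and the fallback to the first listed synset are illustrative choices, not taken from the original code.

from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
from nltk.wsd import lesk

def pair_similarity(context1, word1, context2, word2):
    # Hypothetical helper: combined WordNet similarity of two words in context.
    syn1 = lesk(context1, word1) or next(iter(wn.synsets(word1)), None)
    syn2 = lesk(context2, word2) or next(iter(wn.synsets(word2)), None)
    if syn1 is None or syn2 is None:
        return 0.0
    score = (syn1.path_similarity(syn2) or 0.0) + (syn1.wup_similarity(syn2) or 0.0)
    if syn1.pos() == syn2.pos():
        try:
            # lch_similarity is only defined for synsets sharing a part of speech
            score += syn1.lch_similarity(syn2) or 0.0
        except WordNetError:
            pass
    return score

# e.g. pair_similarity("deposit money in the bank".split(), "bank",
#                      "the bank approved the loan".split(), "bank")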
Пример #58
0
def wiki_lookup(search_pass, tag_pass):
    """
    This function looks up a word or bigram (plus its tag) on Wikipedia and returns the best matching results.
    :param search_pass: the word or bigram to look up
    :param tag_pass: the tag that belongs to search_pass
    :return: a list of 3 elements: up to 3 links, padded with "-"
    """

    search = search_pass
    tag = tag_pass
    search_lower = search.lower()

    # These tags will return one link
    tagcheck = ["COUNTRY", "STATE", "CITY", "TOWN", "NATURAL_PLACE", "PERSON", "ORGANISATION", "ANIMAL", "SPORT"]

    # The link returned for "president" is often wrong, so skip linking it altogether.
    if search_lower != "president":
        # If the search contains just one word.
        if len(search.split(" ")) == 1:
            # Try to get synset of the search, if not possible set synset to None
            try:
                search_syn = wordnet.synsets(search, pos="n")[0]
                search_syn = str(search_syn)
            except IndexError:
                search_syn = None
        # If the search contains multiple words, replace the spaces with _
        else:
            search_clean = search.split(" ")
            search_clean = "_".join(search_clean)
            syn = wordnet.synsets(search_clean, pos="n")
            if len(syn) == 0:
                search_syn = None
            else:
                search_syn = str(syn[0])

        wiki_results = []
        url_list = []
        result_syns = []
        to_return = []


        # These tags wont be added to the wiki lookup
        if tag != "NATURAL_PLACE" and tag != "ANIMAL" and tag != "ENTERTAINMENT" and tag != "COUNTRY" and tag != "CITY":
            search = search+" "+tag
            search_results = wikipedia.search(search)
        else:
            search_results = wikipedia.search(search)

        # If search results are found.
        if len(search_results) != 0:
            # Get a summary of all the results found.
            for result in search_results:
                try:
                    wiki_results.append([result, wikipedia.summary(result, sentences=2)])
                except wikipedia.exceptions.DisambiguationError as e:
                    # DisambiguationError exposes the candidate page titles via e.options
                    for result_e in e.options:
                        try:
                            wiki_results.append([result_e, wikipedia.summary(result_e, sentences=2)])
                        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                            pass
                except wikipedia.exceptions.PageError:
                    pass
            # Cleanup the search results, so a synset can be created
            for result in wiki_results:
                result_words = result[0].split(" ")
                if len(result_words) >= 1:
                    # Cleanup the search results
                    result_clean = "_".join(result_words)
                    # Lookup the synset of the search result, using the summary of the search word
                    ss = lesk(result[1], result_clean, "n")
                    try:
                        if ss is None:
                            result.append("-")
                        else:
                            result.append(str(ss))
                            result_syns.append(str(ss))
                    except AttributeError:
                        result.append("-")
                        result_syns.append("-")

                else:
                    result.append("-")

                # Create a url for all the search results
                page = wikipedia.page(result[0])
                result.append(page.url)
                url_list.append(page.url)

            print(search, search_results, url_list)

            # If a synset was found, compare the found synset with the synset of the search.
            if search_syn is not None:
                if search_syn in result_syns:
                    for result in wiki_results:
                        if result[2] == search_syn:
                            to_return = [result[3], "-", "-"]
                # Else return the first link
                else:
                    to_return = [url_list[0], "-", "-"]
            # If the tag is in the list with tags that return one link, return one link
            elif tag in tagcheck:
                to_return = [url_list[0], "-", "-"]
            # Else return up to 3 links, if possible
            else:
                if len(url_list) >= 3:
                    to_return = [url_list[0], url_list[1], url_list[2]]
                elif len(url_list) == 2:
                    to_return = [url_list[0], url_list[1], "-"]
                else:
                    to_return = [url_list[0], "-", "-"]
        else:
            to_return = ["-", "-", "-"]
    else:
        to_return = ["-", "-", "-"]

    return to_return
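A hedged usage sketch for wiki_lookup; the entity and tag below are made up for illustration, and the call assumes the wikipedia package plus the NLTK WordNet data are installed.

# Illustrative call only -- inputs are not from the original data set.
links = wiki_lookup("Mount Everest", "NATURAL_PLACE")
print(links)  # a 3-element list: one or more URLs, padded with "-"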
def RecursiveGlossOverlap_Classify(text):
	definitiongraphedges=defaultdict(list)
	definitiongraphedgelabels=defaultdict(list)
	
	#---------------------------------------------------------------------------------
	#2.Compute intrinsic merit (either using linear or quadratic overlap)
	#---------------------------------------------------------------------------------
	tokenized = nltk.word_tokenize(text)
	fdist1 = FreqDist(tokenized)
	stopwords = nltk.corpus.stopwords.words('english')
	stopwords = stopwords + [u' ',u'or',u'and',u'who',u'he',u'she',u'whom',u'well',u'is',u'was',u'were',u'are',u'there',u'where',u'when',u'may', u'The', u'the', u'In',u'in',u'A',u'B',u'C',u'D',u'E',u'F',u'G',u'H',u'I',u'J',u'K',u'L',u'M',u'N',u'O',u'P',u'Q',u'R',u'S',u'T',u'U',u'V',u'W',u'X',u'Y',u'Z']
	puncts = [u' ',u'.', u'"', u',', u'{', u'}', u'+', u'-', u'*', u'/', u'%', u'&', u'(', ')', u'[', u']', u'=', u'@', u'#', u':', u'|', u';',u'\'s']
	#at present tfidf filter is not applied
	#freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
	freqterms1 = [w.decode("utf-8") for w in fdist1.keys() if w not in stopwords and w not in puncts]
	
	current_level = 1
	nodewithmaxparents = ''
	noofparents = 0
	maxparents = 0
	relatedness = 0
	first_convergence_level = 1
	tokensofthislevel = []
	convergingterms = []
	convergingparents = []
	tokensofprevlevel = []
	prevlevelsynsets = []
	commontokens = []
	vertices = 0
	edges = 0
	overlap = 0
	iter = 0
	from nltk.corpus import wordnet as wn

	#recurse down to required depth and update intrinsic merit score
	#relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2) also called convergence factor
	while current_level < 3:
		#crucial - gather nodes which converge/overlap (have more than 1 parent)
		if current_level > 1:
			print current_level
			for x in freqterms1:
				for y in parents(x,prevlevelsynsets):
					ylemmanames=y.lemma_names()
					#for yl in ylemmanames:
					#	definitiongraphedges[x].append(yl)
					definitiongraphedges[x].append(ylemmanames[0])
					definitiongraphedgelabels[x + " - " + ylemmanames[0]].append(" is a subinstance of ")
					definitiongraphedgelabels[ylemmanames[0] + " - " + x].append(" is a superinstance of ")
						
			convergingterms = [w for w in freqterms1 if len(parents(w,prevlevelsynsets)) > 1]
			for kw in freqterms1:
				convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
			for kw in freqterms1:
				noofparents = len(parents(kw, prevlevelsynsets))
				if noofparents > maxparents:
					maxparents = noofparents
					nodewithmaxparents = kw
		for keyword in freqterms1:
			#WSD - invokes Lesk's algorithm adapted to recursive gloss overlap- best_matching_synset() 
			#disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
			if use_pywsd_lesk:
				disamb_synset = simple_lesk(" ".join(freqterms1), keyword)
			elif use_nltk_lesk:
				disamb_synset = lesk(freqterms1, keyword)
			else:
				disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
			prevlevelsynsets = prevlevelsynsets + [disamb_synset]
			if len(wn.synsets(keyword)) != 0:
				disamb_synset_def = disamb_synset.definition()
				tokens = nltk.word_tokenize(disamb_synset_def) 
				fdist_tokens = FreqDist(tokens)
				#at present frequency filter is not applied
				#if keyword in convergingterms:
				tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
		listcount = len(tokensofthislevel)
		setcount = len(set(tokensofthislevel))
		overlap =  listcount-setcount
		if overlap > 0 and iter == 0 :
			first_convergence_level = current_level
			iter = 1
		#choose between two relatedness/convergence criteria :- 
		#1) simple linear overlap or 2) zipf distributed quadratic overlap
		#relatedness = relatedness + len(convergingparents)*overlap 
		relatedness = relatedness + overlap + len(convergingparents)
		#relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1) 
		#find out common tokens of this and previous level so that the same token does not get grasped again -
		#relatedness must be increased since repetition of keywords in two successive levels is a sign of
		#interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings).
		#Remove vertices and edges corresponding to common tokens
		commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
		tokensofthislevel = set(tokensofthislevel).difference(commontokens)
		relatedness = relatedness + len(commontokens)
		#decrease the vertices count to address common tokens removed above - edges should remain same since they 
		#would just point elsewhere
		vertices = vertices + setcount - len(commontokens)
		edges = edges + listcount
		current_level = current_level + 1
		freqterms1 = set(tokensofthislevel)
		tokensofprevlevel = tokensofthislevel
		tokensofthislevel = []
	
	intrinsic_merit = vertices*edges*relatedness / first_convergence_level

	print definitiongraphedges

	nxg=nx.DiGraph()
	pos=nx.spring_layout(nxg)
	#pos=nx.shell_layout(nxg)
	#pos=nx.random_layout(nxg)
	#pos=nx.spectral_layout(nxg)
	#nx.draw_graphviz(nxg,prog="neato")
	for k,v in definitiongraphedges.iteritems():
                for l in v:
                        nxg.add_edge(k,l)
                        nxg.add_edge(l,k)
	#nx.draw_networkx(nxg)
	#plt.show()

	nxg.remove_edges_from(nxg.selfloop_edges())
	#print "Core number =",nx.core_number(nxg)
	sorted_core_nxg=sorted(nx.core_number(nxg).items(),key=operator.itemgetter(1), reverse=True)
	print "Core number (sorted) :",sorted_core_nxg
	print "============================================================================================================="
	print "Unsupervised Classification based on top percentile Core numbers of the definition graph(subgraph of WordNet)"
	print "============================================================================================================="
	no_of_classes=len(nx.core_number(nxg))
	top_percentile=0
	max_core_number=0
	max_core_number_class=""
	for n in sorted_core_nxg:
		print "This document belongs to class:",n[0],",core number=",n[1]
		if top_percentile < no_of_classes*0.50:
			top_percentile+=1
		else:	
			break
		if n[1] > max_core_number:
			max_core_number=n[1]
			max_core_number_class=n[0]
	print "	max_core_number",max_core_number

	print "==================================================================="
	print "Betweenness Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	bc=nx.betweenness_centrality(nxg)
	sorted_bc=sorted(bc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_bc 

	print "==================================================================="
	print "Closeness Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	cc=nx.closeness_centrality(nxg)
	sorted_cc=sorted(cc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_cc 

	print "==================================================================="
	print "Degree Centrality of Recursive Gloss Overlap graph vertices"
	print "==================================================================="
	dc=nx.degree_centrality(nxg)
	sorted_dc=sorted(dc.items(),key=operator.itemgetter(1),reverse=True)
	print sorted_dc 
	
	print "==================================================================="
	print "Page Rank of the vertices of RGO Definition Graph (a form of Eigenvector Centrality)"
	print "==================================================================="
	sorted_pagerank_nxg=sorted(nx.pagerank(nxg).items(),key=operator.itemgetter(1),reverse=True)
	print sorted_pagerank_nxg
	return (sorted_core_nxg, sorted_pagerank_nxg)
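A hedged driver sketch for the classifier above (written to match the Python 2 style of the example); it assumes the module-level helpers the function relies on, such as parents(), best_matching_synset() and the use_pywsd_lesk / use_nltk_lesk flags, are defined elsewhere in the original file.

# Illustrative only -- the sample text is made up.
if __name__ == "__main__":
	sample_text = "Graph centrality measures rank the vertices of a network by importance."
	core_numbers, pageranks = RecursiveGlossOverlap_Classify(sample_text)
	print("Top classes by core number: %s" % (core_numbers[:3],))
	print("Top vertices by PageRank: %s" % (pageranks[:3],))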