def lowest_common_hypernym(fr):
    """
    Returns the lowest common hypernym of the two mentions (based on WordNet).
    Again assuming that the last word = head word, and that it represents the phrase.
    Also considering only the first sense.
    """
    try:

        i_final=wn.morphy(re.sub(r"\W", r"",fr.i_token.split('_')[-1]))
        j_final=wn.morphy(re.sub(r"\W", r"",fr.j_token.split('_')[-1]))

        if i_final is None or j_final is None:
            return "lowest_common_hypernym={}".format(False)

        if _is_pronoun(i_final) or _is_pronoun(j_final):
            return "lowest_common_hypernym={}".format(False)

        i_synsets=wn.synsets(i_final)
        j_synsets=wn.synsets(j_final)

        lowest_common_hypernym=i_synsets[0].lowest_common_hypernyms(j_synsets[0])[0]

        return "lowest_common_hypernym={}".format(lowest_common_hypernym)

    except wn_error:
        return "lowest_common_hypernym={}".format(False)
Example #2
	def get_similarity(self,word1,word2):
		'''Compute word similarity using the WordNet semantic dictionary.'''
		'''
		print 'before stemmed:',word1
		print 'after stemmed:',wn.morphy(word1.lower())
		print 'before stemmed:',word2
		print 'after stemmed:',wn.morphy(word2.lower())
		'''
		#stemmed word
		if wn.morphy(word1.lower()) != None :
			word1 = wn.morphy(word1.lower())
		if wn.morphy(word2.lower()) != None :
			word2 = wn.morphy(word2.lower()) 
		word1_synsets = wn.synsets(word1)
		#print word1_synsets
		word2_synsets = wn.synsets(word2)
		#print word2_synsets
		sim = 0

		for syn1 in word1_synsets:
			w1 = wn.synset(syn1.name())
			for syn2 in word2_synsets:
				w2 = wn.synset(syn2.name())
				tmp = w1.path_similarity(w2)
				#print tmp,syn1.name(),syn2.name()
				if tmp is not None and tmp > sim:
					sim = tmp
		return sim
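The same pairwise scan written as a standalone helper, for reference (a hedged sketch; example words chosen for illustration). path_similarity can return None across parts of speech, so the guard matters:

from nltk.corpus import wordnet as wn

def max_path_similarity(word1, word2):
    best = 0
    for s1 in wn.synsets(word1):
        for s2 in wn.synsets(word2):
            score = s1.path_similarity(s2)
            if score is not None and score > best:
                best = score
    return best

print(max_path_similarity('car', 'automobile'))  # 1.0, since both share the synset car.n.01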
Example #3
def expand_queries(file):
    '''
    For each term in a query, takes the first synset of the word from wordnet and adds all synonyms of that synset
    '''
    file = open(file)
    for sentence in file:
        sentence = sentence.strip()
        if sentence.find('<text>') != -1:
            query = sentence[sentence.find('>')+1: sentence.rfind('<')]
            additions = ''
            updated_q = nltk.pos_tag(nltk.wordpunct_tokenize(query.lower()))
            full_q = query
            for word, pos in updated_q:
               if word not in stopwords.words('english'):
                   looking_for = str(word)+'.'+str(get_wordnet_pos(pos))+'.01'                   
                   synsets = wn.synsets(word)
                   if looking_for in str(synsets):
                       new_words = wn.synset(looking_for).lemma_names() #was .definition
                       for new_word in new_words:
                           if new_word.lower() != word.lower():
                               full_q = full_q +' '+ str(new_word)
                   else:
                       if wn.morphy(word) != None:
                           word = wn.morphy(word)
                           looking_for = str(word)+'.'+str(get_wordnet_pos(pos))+'.01'
                           print(str(looking_for) + ' THIS IS WORD')
                           synsets = wn.synsets(word)
                           if looking_for in str(synsets):
                               new_words = wn.synset(looking_for).lemma_names() #was .definition
                               for new_word in new_words:
                                   if new_word.lower() != word.lower():
                                       full_q = full_q +' '+ str(new_word)
            print(query + ' ' + full_q)
def preprocessWords(lst):
  index = 0
  while index < len(lst):
    word = lst[index].lower()
    if word not in reservedWordList:
      #special handling from java code
      if word == 'financial':
        lst[index] = 'finance'
      #avoid _id is a word in dscrp
      if word == '_id':
        lst[index] = 'id'
      # keep only words that have a VERB or NOUN reading; it is unclear which part of speech wn.morphy(word) picks when several apply
      # if wn.morphy(word, wn.VERB) and wn.morphy(word, wn.NOUN) and wn.morphy(word, wn.VERB) !=  wn.morphy(word, wn.NOUN):
      # print word, wn.morphy(word, wn.VERB), wn.morphy(word, wn.NOUN), wn.morphy(word)
      if wn.morphy(word, wn.VERB) or wn.morphy(word, wn.NOUN):
        if wn.morphy(word) != word:
          lst[index] = wn.morphy(word)
          word = lst[index]
        elif wn.morphy(PorterStemmer().stem(word)):
          lst[index] = PorterStemmer().stem(word)
          word = lst[index]
      else:
        del lst[index]
        continue
      if len(word) == 1 or word in stopWordList or word.isdigit():
        del lst[index]
        continue
    index += 1
  return lst
def subclass(feats):
    if string_match(feats).endswith("False"):
        try:
            result = False
            i_clean = wn.morphy(feats.i_cleaned.lower(), wn.NOUN)
            i_synsets = wn.synsets(i_clean)
            j_clean = wn.morphy(feats.j_cleaned.lower(), wn.NOUN)
            j_synsets = wn.synsets(j_clean)
            def get_common_hypernym(i_synset,j_synset):
                i_hypernyms = i_synset.hypernyms()
                j_hypernyms = j_synset.hypernyms()
                if len(i_hypernyms) == 0:
                    i_synset = i_synset.instance_hypernyms()[0]
                if len(j_hypernyms) == 0:
                    j_synset = j_synset.instance_hypernyms()[0]
                subc = i_synset.common_hypernyms(j_synset)
                return (i_synset in subc) or (j_synset in subc)

            for synset in i_synsets:
                for syn in j_synsets:
                    result = get_common_hypernym(synset,syn)
                    if result: break
                if result:break
            return "subclass={}".format(result)
        except Exception:  # wn_error, or a None lemma from morphy
            return "subclass={}".format(False)

    else:
        return "subclass={}".format(False)
def same_hypernym(fr):
    """
    True if the two mentions have the same hypernym in WordNet.
    In multiword mentions, considering only the last word (I'm assuming last word=head).
    Not considering pronouns.
    Most of the logic was borrowed from Julia's WN function in the coref project - thank you.
    """

    try:

        i_final=wn.morphy(re.sub(r"\W", r"",fr.i_token.split('_')[-1]))
        j_final=wn.morphy(re.sub(r"\W", r"",fr.j_token.split('_')[-1]))

        if i_final is None or j_final is None:
            return "same_hypernym={}".format(False)

        if _is_pronoun(i_final) or _is_pronoun(j_final):
            return "same_hypernym={}".format(False)

        i_synsets=wn.synsets(i_final)
        j_synsets=wn.synsets(j_final)

        for i_synset in i_synsets:
            i_hypernym_set=set(i_synset.hypernyms())
            for j_synset in j_synsets:
                j_hypernym_set=set(j_synset.hypernyms())
                if i_hypernym_set.intersection(j_hypernym_set):
                    return "same_hypernym={}".format(True)

        return "same_hypernym={}".format(False)

    except wn_error:
        return "same_hypernym={}".format(False)
 def _wnbase(self):
     if self.postag == 'n':
         return wn.morphy(self.lemma, wn.NOUN)
     elif self.postag == 'v':
         return wn.morphy(self.lemma, wn.VERB)
     elif self.postag == 'a':
         return wn.morphy(self.lemma, wn.ADJ)
     return None
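The POS argument restricts morphy to a single part of speech, which is what _wnbase relies on; a quick illustration (words chosen for demonstration):

from nltk.corpus import wordnet as wn

print(wn.morphy('denied', wn.VERB))  # 'deny'
print(wn.morphy('geese', wn.NOUN))   # 'goose'
print(wn.morphy('geese', wn.VERB))   # None: no verb reading exists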
def ApplyBNB(doc_tokens, classes_postings, condprob, prior, vocabulary, selected_features):
    ## Assumes global dictionaries defined: stop_words, names, negation_words
    global stop_words, names, negation_words
    scores = dict()
    for c in classes_postings:
        scores[c] = 0  # math.log(prior[c])

        negation_found = False
        adverb_found = False
        adverb_condprob = 0.0
        doc_features = []
        for t in doc_tokens:
            t = t.lower()

            if constants.LA and t in negation_words:
                negation_found = True
                continue

            if t in stop_words:
                continue

            if t in names:
                continue

            isAdj = wn.morphy(t, wn.ADJ) is not None
            isNoun = wn.morphy(t, wn.NOUN) is not None
            isVerb = wn.morphy(t, wn.VERB) is not None
            isAdv = wn.morphy(t, wn.ADV) is not None

            if constants.LA and negation_found:
                negation_found = False
                continue

            t = process_word(t)

            if t not in vocabulary:
                continue
            if constants.FEATURE_SELECTION is not None and t not in selected_features[c]:
                continue

            doc_features.append(t)

        vocab = vocabulary
        if constants.FEATURE_SELECTION is not None:
            vocab = selected_features[c]

        for t in vocab:  # use the selected feature set when feature selection is enabled
            if t in doc_features:
                scores[c] += math.log(condprob[t][c])
            else:
                scores[c] += math.log(1.0 - condprob[t][c])

    diff = math.fabs(scores["0"] - scores["1"])

    return (scores, diff)
Example #9
File: te.py  Project: AllanRamsay/COMP34411
def getRoot(w, tag=False):
    if tag == False:
        for tag in ['v', 'n', 'a', 'r']:
            r = wordnet.morphy(w, tagequiv(tag))
            if r:
                return r
        return w
    try:
        return wordnet.morphy(w, tag)
    except:
        return w
Example #10
def get_synonyms_as_set(input_word):
    if input_word is None:
        return set()

    synonyms = set()
    synSets = wn.synsets(input_word)
    for syn in synSets:
        for lemma_name in syn.lemma_names():
            if wn.morphy(lemma_name) is not None:
                synonyms.add(str(wn.morphy(lemma_name).encode('utf-8').decode('ascii','ignore')))
    return synonyms
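The synonym set above is just the union of lemma_names() over every synset of the word; a compact equivalent (example word chosen for illustration):

from nltk.corpus import wordnet as wn

synonyms = {name for syn in wn.synsets('small') for name in syn.lemma_names()}
print(sorted(synonyms))  # includes e.g. 'little' alongside 'small'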
Example #11
def getGroup(count, word, threshold, wordsSeen, groups):
    word = "".join(l for l in word if l not in string.punctuation)
    best = 0
    group = word
    
    #searchForExisting
    if word in wordsSeen:
        return wordsSeen.get(word)
    
    #get synset of word
    if(wn.synsets(word)):
        wordSyn = wn.synsets(word)[0]
    elif wn.morphy(word):
        # morphy returns a lemma string, so look up its synsets rather than indexing the string
        wordSyn = wn.synsets(wn.morphy(word))[0]
    else:
        #no synset; use word
        wordsSeen.update({word: group})
        
        if group in groups:
            newValue = groups.get(group)
            newValue.update([word])
            groups.update({group: newValue})
        else:
            newValue = set()
            newValue.update([word])
            groups.update({group: newValue})
            wordsSeen.update({word: group}) 
        return word
    
    #compare to each group
        # is there a way to compare one word to many words?
    for super_word in count.keys():
        #get synset of group being tested against
        comparisons = groups.get(super_word)
        sim = nSim(wordSyn, comparisons)
        
        if(sim >= threshold and sim > best):
            group = super_word
            best = sim
            
    wordsSeen.update({word: group})
    if group in groups:
        newValue = groups.get(group)
        newValue.update([word])
        groups.update({group: newValue})
    else:
        newValue = set()
        newValue.update([word])
        groups.update({group: newValue})
    wordsSeen.update({word: group}) 
    
    return group
Example #12
def chunktaged(tokens, tagged, word):
    '''
    Extract the meaningful chunk (phrase) from the sentence.
    Also can be imagined as a phrase detection.

    PARAMETER LIST:
    tokens is a list of the words in the sentence:
    ['I', 'previously', 'booked', 'the', 'nice', 'flight', '.']
    tagged is a list of tuples consisting of word and POS:
    [('I', 'PRP'), ('previously', 'RB'), ('booked', 'VBD'), ('the', 'DT'), ('nice', 'JJ'), ('flight', 'NN'), ('.', '.')]
    word is what we look up for:
    'booked'

    The return value should be a phrase like 'turn_on' or just the origin word.

    # the rules as our knowledge:
    # 1, consecutive nouns
    # 2, verb before a preposition
    '''

    word_index = tokens.index(word)
    
    if tagged[word_index][1] in pos_map:
        word_pos = pos_map[tagged[word_index][1]]
    else:
        return word

    if (word_pos == 'VERB' and (wn.morphy(word, wn.VERB) != None)):
        word = wn.morphy(word, wn.VERB)
    elif (word_pos == 'NOUN' and (wn.morphy(word, wn.NOUN) != None)):
        word = wn.morphy(word, wn.NOUN)
    
    if word_index == len(tokens) - 1:
        return word

    if tagged[word_index + 1][1] in pos_map:
        next_word_pos = pos_map[tagged[word_index + 1][1]]
    else:
        return word

    if (word_pos == 'VERB' and next_word_pos == 'PP') or \
       (word_pos == 'NOUN' and next_word_pos == 'NOUN'):
        possible_chunk = word + '_' + tokens[word_index+1]
        # in case the consecutive Noun is not a phrase
        if wn.synsets(possible_chunk) == []:
            return word
        else:
            return possible_chunk
    else:
        return word
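The final check works because WordNet stores lexicalised multiword expressions with underscores; a quick probe (words chosen for illustration):

from nltk.corpus import wordnet as wn

print(bool(wn.synsets('ice_cream')))    # True: a real multiword entry
print(bool(wn.synsets('nice_flight')))  # False: not a lexicalised phrase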
Example #13
    def get_roots(sentence):
        roots = []
        for idx, token in enumerate(sentence.clean_tokens):
            if sentence.tokens_pos[idx] == "VB":
                root = wn.morphy(token, wn.VERB)
            else:
                root = wn.morphy(token)

            if root is None:
                root = token

            roots.append(root)

        return roots
Example #14
def main():
    punIn = input("Pun File: ")  # get it it's a pun on "punning" hah hah
    f = open(punIn, "r")
    for line in f:
        posList = POSCheck(line)  # returns a list of words that stood out in the POS tagging
        hList = homophoneCheck(line)  # returns a list of homophones, along with the original word from the sentence
        print (posList)
        print (hList)
        extText = POSextract(line)  # returns a list with all of the important words extracted
        print (extText)
        hiscore = 0
        highSim = []
        for word in extText:
            for i in range(0, len(hList)):
                hSim = conceptCheck(word, hList[i])
                if hSim == []:
                    continue
                elif hSim[2] > hiscore:
                    highSim = hSim
                    hiscore = highSim[2]
            for a in range(0, len(hList)):
                mword = wn.morphy(word)
                if mword:
                    hMorphSim = conceptCheck(mword, hList[a])
                    if hMorphSim == []:
                        continue
                    elif hMorphSim[2] > hiscore:
                        highSim = hMorphSim
                        hiscore = highSim[2]
                else:
                    break
            for j in range(0, len(posList)):
                pSim = conceptCheck(word, posList[j])
                if pSim == []:
                    continue
                elif pSim[2] > hiscore:
                    highSim = pSim
                    hiscore = highSim[2]
            for b in range(0, len(posList)):
                mword = wn.morphy(word)
                if mword:
                    pMorphSim = conceptCheck(mword, posList[b])
                    if pMorphSim == []:
                        continue
                    elif pMorphSim[2] > hiscore:
                        highSim = pMorphSim
                        hiscore = highSim[2]
                else:
                    break
            print (highSim)
Example #15
 def simple_pos_morphy(self, word):
     """
     Helper function for the simpletextprocess function. It does not
     run PoS tagging first.
     @param word: the raw word before morph
     @type word: C{string}
     """
     morphied = wn.morphy(word, wn.NOUN)
     if morphied != None:
         return morphied
     else:
         morphied = wn.morphy(word, wn.VERB)
         if morphied != None:
             return morphied
     return word
Example #16
def get_antonyms_as_set(input_word):
    if input_word is None:
        return set()

    antonyms = set()
    synonyms = wn.synsets(input_word)
    
    for syn in synonyms:
        lemmas = syn.lemmas()
        
        for lem in lemmas:
            for ant in lem.antonyms():
                if wn.morphy(ant.name()) is not None:
                    antonyms.add(str(wn.morphy(ant.name()).encode('utf-8').decode('ascii', 'ignore')))
    return antonyms
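Antonymy is a lemma-level relation in WordNet, which is why the loop goes through syn.lemmas() rather than the synset itself; a minimal sketch (word chosen for illustration):

from nltk.corpus import wordnet as wn

antonyms = {ant.name() for syn in wn.synsets('good')
            for lem in syn.lemmas() for ant in lem.antonyms()}
print(antonyms)  # includes e.g. 'bad' and 'evil'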
Example #17
File: Parser.py  Project: jcccf/cs4740
 def __load_lesk_vector(self, dicty, window_size=100):
   # print example.word, example.pos, example.target
   # print example.senses
 
   # Generate WordSets of surrounding words
   other_sets = []
   words = self.words_window(window_size)
   for word, pos in words:
     # print word, pos
     baseword = wn.morphy(word)
     if baseword is not None:
       pos = penn_to_wn(pos)
       synsets = wn.synsets(baseword, pos=pos) if pos is not None else wn.synsets(baseword)
       for synset in synsets:
          other_sets.append(WordSet(synset.definition()))
 
   # for sety in other_sets:
   #   print sety.words
 
   # Loop through possible wordsets and note counts:
   counts = []
   for sense in self.all_senses(dicty):
     curr_set = WordSet(sense.gloss)
     # print curr_set.words
     counts.append(curr_set.overlap(other_sets))
   # print counts
 
   # Normalize and return
   countfirsts = [count[0] for count in counts]
   countfirsts_max = max(countfirsts)
   if countfirsts_max > 0:
     return [float(count)/countfirsts_max for count in countfirsts]
   else:
     return [0.0 for count in countfirsts]
Example #18
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []
    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return([])

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print ("Unexpected error:{0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)

                    nltk.download()
                    sys.exit(2)

    return result
Example #19
def wordnet_formatted(word):
    word = word.lower()
    words = search_wordnet(word)
    response_data = []
    for found_word in words:
        word_data = dict()
        word_data['label'] = found_word.lemma
        word_data['desc'] = str(found_word.definition)
        word_data['sensenum'] = found_word.sensenum
        response_data.append(word_data)

        synsetid = found_word.lemma + '.' + str(found_word.pos) + '.' + str(found_word.sensenum)
        word_data['synset'] = synsetid

    stem = wn.morphy(word)
    if stem is not None and stem != word:
        # print '*' + str(stem) + '*'

        word_stem_words = search_wordnet(stem)
        for found_word in word_stem_words:
            word_data = dict()
            word_data['label'] = found_word.lemma
            word_data['desc'] = str(found_word.definition)
            word_data['sensenum'] = found_word.sensenum
            response_data.append(word_data)

            synsetid = found_word.lemma + '.' + str(found_word.pos) + '.' + str(found_word.sensenum)
            word_data['synset'] = synsetid

    return response_data
Example #20
File: feeds.py  Project: stephegn/rss
def get_words(text):
    text = text.lower()
    words = word_tokenize(text)
    taggedWords = nltk.pos_tag(words)

    #j=0;
    word_list = []
    #stemming
    for index, item in enumerate(taggedWords):
        if 'VB' in item[1]:
            pos = wn.VERB
            #print item[1]+'vb'
        elif 'JJ' in item[1]:
            #print item[1]+'aj'
            pos = wn.ADJ
        else:
            #print item[1]+'noun'
            pos = wn.NOUN
        #morphy needs to take the pos... Fix this!
        test = wn.morphy(item[0], pos)
        #word_list[j]=wn.morphy(w)
        if(test is None):
            word_list.append(item[0])
        else:
            word_list.append(test)
        #j=j+1
    return word_list
Example #21
    def find_candidates(self, property_subtree):
        if not isinstance(property_subtree, ParentedTree):
            raise AttributeError

        candidates = set(self.__get_property_string_forms(property_subtree))

        new_candidates = set()
        for candidate in candidates:
            for label in self.__fetch_from_wikibase(candidate):
                new_candidates.add(label)
        candidates.update(new_candidates)

        new_candidates = set()
        for candidate in candidates:
            new_candidates.update(self.__fetch_synonyms_and_hypernyms(candidate))
        candidates.update(new_candidates)

        new_candidates = set()
        for candidate in candidates:
            for POS in [wordnet.ADJ, wordnet.ADV, wordnet.NOUN, wordnet.VERB]:
                morphy = wordnet.morphy(candidate, POS)
                if morphy is not None:
                    new_candidates.add(morphy)
        candidates.update(new_candidates)

        return candidates
def __get_sem_class__(token):
    token = re.sub(r'[`]','',token)
    per_pronouns = ["she", "he", "they", "you", "we",
                    "i", "them", "her", "him", "us", "who","whom"]
    if token.lower() in per_pronouns:
        sem_class = "PER"
    else:
        sem_class = "PER" #it will crash with NNP that are not GPE, so probably people
        try:
            token_clean = wn.morphy(token.lower(), wn.NOUN)
            token_sense = ""+token_clean+".n.01"
            token_synset = wn.synset(token_sense)
            for synset in SEM_CLASSES.keys():
                hypernyms = token_synset.hypernyms()
                if len(hypernyms) == 0: #need to get the instance
                    token_synset = token_synset.instance_hypernyms()[0]

                if synset in token_synset.common_hypernyms(synset):
                    sem_class = SEM_CLASSES[synset]
                    break
                else:
                    sem_class = "OTHER"
        except Exception:  # wn_error, or a None lemma from morphy
            pass
    return sem_class
def __set_has_homework_or_assignment(text: str, replacement_text: str, word_list: list):
    """
    Checks if the text contains synonyms to homework, and replaces words with 'has_homework'

    Arguments:
        text (str): Text to check for "homework words"
        replacement_text (str): Text to replace "homework" words with
        word_list (list): List of words to use as comparison against the text

    Returns:
        str: Text with replaced "homework words", or the original text

    """
    word_set = set()
    tokenized_text = nltk.word_tokenize(text)
    # loop through all the words to see if it contains homework or its synonyms
    for word in tokenized_text:
        word_lem = wordnet.morphy(word, wordnet.NOUN)
        if (word_lem is not None) and (word_lem in word_list):
            word_set.add(word)
    # convert to list and sort based on length
    word_set = list(word_set)
    word_set.sort(key=len, reverse=True)
    # replace those words, if any, with the replacement text
    for word in word_set:
        text = text.replace(word, replacement_text)
    return text
Example #24
def guessFromGrammarStructs(meaning_generator):
	# NUM_HAIKUS = 400
	# with open('haikus.json') as haikus_file:
	# 	dataset = json.load(haikus_file)
	# pos_counter = tokenize_dataset(dataset, haikus_limit=NUM_HAIKUS)
	
	for x in range(1, 10):
		# pos_tags = []
		# for i in xrange(3):
		#  	grammar_struct = pick_random_structure(pos_counter)
		#  	print(grammar_struct)
		#  	pos_tags += list(grammar_struct)	
		#  	pos_tags += ['\n']

		pos_tags = ['DT', 'JJ', 'NN','\n',
		'NNS', 'VBG','\n',
		'JJ', 'JJ','NN',
		]	
		

		haiku = ''
		seedWord = 'nature'
		oldWords = [meaning_generator.random_word('NN')]
		for postag in pos_tags:
			lastWord = wordFromPOStag(postag,seedWord,oldWords,meaning_generator)
			oldWords += [wn.morphy(lastWord)]
			haiku += lastWord+' '
		print(haiku)
		print("")
Example #25
 def document_extraction(self, document):
     features = {}
     words_in_doc = get_tokens(document)
     for word in list(self.words.keys())[:min(len(self.words), self.amount_of_words)]:
         word = wn.morphy(word) or word
         features['contains(%s)' % word] = (word in words_in_doc)
     return features
Example #26
def clean_file(text, entities):
    clean = []
    recognised = 0
    dirty = []
    ctext = clean_text(text)
    tokens = nltk.word_tokenize(ctext)
    unusual = unusual_words(tokens)
    for token in tokens:
        ltoken = token.lower().strip()
        if ltoken in PUNCTUATION:
            clean.append(token)
            recognised += 1
        else:
            if len(ltoken) > 1 or ltoken in ['a', 'i']:
                if ltoken in unusual:
                    stem = wn.morphy(ltoken)
                    if stem:
                        clean.append(token)
                        recognised += 1
                    else:
                        if ltoken in entities:
                           clean.append(token)
                           recognised += 1
                        else:
                            dirty.append(token)
                            clean.append('[?]')
                else:
                    clean.append(token)
                    recognised += 1
            else:
                dirty.append(token)
    return {'clean': clean, 'dirty': dirty, 'recognised': recognised, 'total': len(tokens)}
Example #27
def generate_line(pos_tags,word_dump, inspiration,meaning_generator):
    words = []
    for tag in pos_tags:
        word = generate_word(tag, pos_tags,words, word_dump, inspiration,meaning_generator)
        words.append(word)
        word_dump.append(wn.morphy(word))
    return words
def statement_stemmer(stmt):    
    """
    str -> [list of strs]

    Return list of stemmed words from a single statement string.
    """
    stmt_stems = []

    #only stem statements with strings (used in statement_corpus() below)
    if type(stmt) is float:
        return
    
    #stmt_text_str = rev_text.encode('utf-8')  ##deprecated.
    stmt_text_str = stmt.lower()
    
    
    sentence_list = nltk.sent_tokenize(stmt_text_str)
    
    for sent in sentence_list:
        word_list = nltk.word_tokenize(sent)
        for word in word_list:  #compare with WordNet Corpus
            wn_word = wordnet.morphy(word)
            if wn_word is not None:
                wn_stem = PorterStemmer().stem(wn_word)
                stmt_stems.append(wn_stem)
    return stmt_stems
Example #29
def nominalise(verb):
  # to be used when turning verbs to noun forms when extracting simple co-occ.
  stem = STP.stem(verb)
  # going with pure stem
  for suff in SUFFIXES:
    noun_form = wn.morphy(stem+suff,wn.NOUN)
    if noun_form != None:
      return noun_form
  # trying with the last character of the stem doubled
  stem += stem[-1]
  for suff in SUFFIXES:
    noun_form = wn.morphy(stem+suff,wn.NOUN)
    if noun_form != None:
      return noun_form
  # returning None if nothing works
  return None
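The idea is to stem the verb and probe WordNet with candidate noun suffixes until morphy accepts one; a standalone sketch (the module's STP and SUFFIXES are not shown here, so the stemmer and suffix list below are stand-ins):

from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer

def nominalise_sketch(verb, suffixes=('ment', 'ion', 'tion', 'ation', 'er')):
    stem = PorterStemmer().stem(verb)
    for suffix in suffixes:
        noun_form = wn.morphy(stem + suffix, wn.NOUN)
        if noun_form is not None:
            return noun_form
    return None

print(nominalise_sketch('develop'))  # 'development'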
Example #30
def search_files_by_noun(noun, folder_path):
    """ Search files that contain noun
    :param noun: searching noun
    :param folder_path: folder with files
    :return: list of the files which contain noun
    """

    # get general form of the noun
    noun = wordnet.morphy(noun, wordnet.NOUN)
    if noun is None:
        raise ValueError("Input word isn't a noun")

    # check that directory exists
    if not os.path.exists(folder_path):
        raise IOError('Path "' + folder_path + '" doesn\'t exist')
    if not os.path.isdir(folder_path):
        raise IOError('"' + folder_path + '" isn\'t a directory')

    # get file list
    file_list = os.listdir(folder_path)
    file_list = [os.path.join(folder_path, f) for f in file_list]
    file_list = [f for f in file_list if os.path.isfile(f)]
    file_list = [f for f in file_list if mimetypes.guess_type(f)[0] and mimetypes.guess_type(f)[0].startswith("text/")]

    # search word in the file
    process_pool = Pool()
    find_fun_args = [(noun, f_val) for f_val in file_list]
    file_list_contain_word = process_pool.map(_find_noun, find_fun_args)
    res_files = [os.path.basename(f_val) for f_val, mark in zip(file_list, file_list_contain_word) if mark]

    return res_files
Example #31
def morphy_stem(word):
    """
    Simple stemmer
    """
    stem = wn.morphy(word)
    if stem:
        return stem.lower()
    else:
        return word.lower()
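morphy only undoes inflection for forms WordNet recognises and returns None otherwise, which is why morphy_stem falls back to word.lower() (examples chosen for illustration):

from nltk.corpus import wordnet as wn

print(wn.morphy('dogs'))    # 'dog'
print(wn.morphy('asdfgh'))  # None -> morphy_stem returns 'asdfgh'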
def get_lemma(word):
    """
	lemmatization des mots (pour le NLP)
	"""
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
Example #33
def get_lemma(word):
    #nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
Example #34
 def morphy(multi_word):
     morphyed_words = []
     for word in multi_word.split("_"):
         morphyed_word = wn.morphy(word, wn.VERB)
         if morphyed_word == None:
             morphyed_word = word
             # print word,"not found in WN"
         morphyed_words.append(morphyed_word)
     r = "_".join(morphyed_words)
     return r
Example #35
def pos(a, p):
    ret = set()
    for i in a:
        m = wn.morphy(i)
        if m is None:
            continue
        for ss in wn.synsets(m):
            if ss.pos() == p:
                ret.add(i)
    return list(ret)
Example #36
File: 4_parse.py  Project: anbnyc/xword
def getWordnetCandidates(clue, length):
    clue = re.sub('[' + string.punctuation + ']', '', clue.lower())
    if wn.morphy(clue) is not None and clue != wn.morphy(clue):
        ## TODO reverse the morph transform
        morphedclue = wn.morphy(clue)
        morph = clue.replace(morphedclue, '')
        clue = morphedclue
        print("morph-", morph)
    synsets = wn.synsets(clue)
    names = functools.reduce(lambda x, y: x + y.lemma_names(), synsets, [])
    for syn in synsets:
        if syn.hyponyms():
            for hyposet in syn.hyponyms():
                names += [lemma.name() for lemma in hyposet.lemmas()]
        if syn.hypernyms():
            for hyperset in syn.hypernyms():
                names += [lemma.name() for lemma in hyperset.lemmas()]
    names = list({x for x in names if len(x) == length})
    return names
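The candidate pool is the union of lemma names from the clue's synsets plus their direct hyponyms and hypernyms, filtered by answer length; roughly the same expansion in a few lines (illustrative word and length):

from nltk.corpus import wordnet as wn

names = set()
for syn in wn.synsets('dog'):
    for related in [syn] + syn.hyponyms() + syn.hypernyms():
        names.update(lemma.name() for lemma in related.lemmas())
print(sorted(n for n in names if len(n) == 6))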
Example #37
    def lemmatize_word(word):
        # make words lower  example: Python =>python
        word = word.lower()

        # lemmatize  example: cooked=>cook
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma
Example #38
	def preproc_word(self, w, pos=None):
		if pos == 'j': pos = 'a'
		w = re.sub(r'[\$,\{\}\[\]\(\)`\'\":;!\?\.]', '', w).lower()
		w = re.sub(r'\-', '_', w) # hyphen -> underscore
		if w == 'an': w = 'a' # dirty hack....
		if w == 'oclock': w = 'o\'clock'
		if w.isdigit(): w = '<NUM>'
		wp = wn.morphy(w, pos=pos)
		if wp is None: wp = w
		return wp
def get_synsets(text):
    words = get_words(text)

    synsets = []

    for word in words:
        #print(word)
        word_synsets = wn.synsets(word, NOUN)

        if word_synsets == []:
            morphied_word = wn.morphy(word, NOUN)

            if morphied_word is not None:
                word_synsets = wn.synsets(morphied_word, NOUN)

        synsets += word_synsets

    return synsets
def notNeutral(word, pos):
    morphy = wn.morphy(word, pos)
    if morphy is not None:
        if morphy in lexiconDict or morphy in disgustingWords:
            return True

    if word in lexiconDict or word in disgustingWords:
        return True
    else:
        return False
Example #41
def suitable_word(seedWord, oldWords, POStag, meaning_generator):
    for x in range(1, 3):
        for word in reversed(
                oldWords + [seedWord]
        ):  # will say, the seedword and then last word is most significant
            newWord = meaning_generator.associate(word, POStag)
            if newWord != None and wn.morphy(newWord) not in oldWords:
                return newWord
    # couldn't find any, let's just return random word
    return meaning_generator.random_word(POStag)
Example #42
def printStuff(s):
    line = 1
    print('Stemmer:')
    for i in range(len(s)):
        if s[i - 1] == ';' or (s[i] != ';' and isfloat(s[i - 1]) == True):
            line = line + 1
        if isfloat(s[i]) == True:
            print(s[i] + ' DOUBLE ' + str(line))
        elif isOP(s[i]) == True:
            print(s[i] + ' OP ' + str(line))
        else:
            s[i] = s[i].lower()
            if isLegal(s[i]) and wordnet.morphy(s[i]) is not None:
                print(s[i] + ' STRING ' + str(line) + ' ' +
                      wordnet.morphy(s[i]))
            else:
                print(s[i] + ' STRING ' + str(line))
    print('ENDFILE')
  def score_words(self):
    '''
    generates self.word_scores and self.min_word_scores
    for the markov chain calculation
    '''
    g = nx.DiGraph()
    added = set()
    for n in self.ranked_nodes:
      
      score = self.ranked_nodes[n]
      triple = json.loads(n)
      #sdistance = abs(TextBlob(triple[0][0] + ' ' + triple[2][0]).sentiment.polarity - self.overall_sentiment);
      score = score #/ sdistance;
      a = wn.morphy(triple[0][0]) or triple[0][0]
      b = wn.morphy(triple[2][0]) or triple[2][0]
      if a not in added:
        g.add_node(a, pos=triple[0][1], weight=score)
        added.add(a)
      if b not in added:
        g.add_node(b, pos=triple[2][1], weight=score)
        added.add(b)
    for n in self.ranked_nodes:
      score = self.ranked_nodes[n]
      triple = json.loads(n)
      a = wn.morphy(triple[0][0]) or triple[0][0]
      b = wn.morphy(triple[2][0]) or triple[2][0]
      relationship = triple[1]
      g.add_edge(a, b, dep=relationship, weight=score)

    nx.draw(g, with_labels = True)
    plt.show()

    o = g.out_degree()
    i = g.in_degree()
    t = {}
    for n in o:
      t[n] = o[n] + i[n]
    s = sorted(t, key=t.get)
    pprint(s)

    ranked = nx.pagerank(g)
    self.word_scores = ranked
    self.min_word_score = min(self.word_scores.values())
Example #44
def get_pun_token(pun):
    """
    Return the pun word's ID and a set containing the pun token and its lemma
    """
    for wordID, word in pun.items():
        try:
            if word['ispun']:
                return wordID, set([word['token'], wn.morphy(word['token'])])
        except:
            pass
Example #45
File: rte_classify.py  Project: ekaf/nltk
    def _lemmatize(word):
        """
        Use morphy from WordNet to find the base form of verbs.
        """
        from nltk.corpus import wordnet as wn

        lemma = wn.morphy(word, pos=wn.VERB)
        if lemma is not None:
            return lemma
        return word
Example #46
def word_in_awl(word):
    global awl_words
    if word in string.punctuation:
        return 0
    lemma = wn.morphy(word)
    if lemma is None:
        lemma = word
    if lemma in awl_words:
        return 1
    return 0
def get_lemma(word):
    """Return lemmatized word from the input text.
    Keyword arguments:
    word -- the line(s) of text to be lemmatized.
    """
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
Example #48
def valid_en_word(word):
    global all_words
    if word in string.punctuation:
        return False
    lemma = wn.morphy(word)
    if lemma is None:
        lemma = word
    if lemma in all_words:
        return True
    return False
Example #49
    def findword(self, word):
        if word.isdigit():
            w = word
        else:
            w = wn.morphy(word.strip())

        if w in self.dictionary:
            return self.dictionary[w]

        return set()
    def _get_similarity_wordnet_2word(self, word1, word2):
        '''
		print 'before stemmed:',word1
		print 'after stemmed:',wn.morphy(word1.lower())
		print 'before stemmed:',word2
		print 'after stemmed:',wn.morphy(word2.lower())
		'''
        #stemmed word
        if wn.morphy(word1.lower()) != None:
            word1 = wn.morphy(word1.lower())
        if wn.morphy(word2.lower()) != None:
            word2 = wn.morphy(word2.lower())

        key1 = '(%s,%s)' % (word1, word2)
        key2 = '(%s,%s)' % (word2, word1)

        if key1 in self.sim_2word:
            return self.sim_2word[key1]
        if key2 in self.sim_2word:
            return self.sim_2word[key2]

        word1_synsets = wn.synsets(word1)
        #print word1_synsets
        word2_synsets = wn.synsets(word2)
        #print word2_synsets
        sim = 0

        for syn1 in word1_synsets:
            w1 = wn.synset(syn1.name())
            for syn2 in word2_synsets:
                w2 = wn.synset(syn2.name())
                tmp = w1.path_similarity(w2)
                #print tmp,syn1.name(),syn2.name()
                if tmp is not None and tmp > sim:
                    sim = tmp
                if sim == 1.0:
                    break
            if sim == 1.0:
                break
        self.sim_2word[key1] = sim
        self.sim_2word[key2] = sim
        return sim
def get_synsets(text):
    tokens = nlp(text)

    synsets = []

    for t in tokens:
        if t.pos_ == "NOUN":
            word = t.text
            word_synsets = wn.synsets(word, NOUN)

            if word_synsets == []:
                morphied_word = wn.morphy(word, NOUN)

                if morphied_word is not None:
                    word_synsets = wn.synsets(morphied_word, NOUN)

            synsets += word_synsets

    return synsets
Example #52
def lesk(word1, sentence):
    bestsense = None
    maxoverlap = 0
    word = wordnet.morphy(word1) or word1  # fall back to the surface form when morphy finds no base form
  
    for sense in wordnet.synsets(word):
        overlap = calculateOverlap(sense,sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = sense
    return bestsense
Example #53
def morphy_stem(word):
    """
    Simple stemmer
    """
    # Morphy returns the base form of a word, ie, dogs -> dog
    # unknown 'stem' returns word.lower()
    stem = wn.morphy(word)
    if stem:
        return stem.lower()
    else:
        return '0'
Example #54
def stemWord(words_list):
    result = []
    for word in words_list:
        word = word.strip()
        word = word.lower()
        aa = wn.morphy(word)
        if aa is None:  # morphy can return None for some words, so keep the original word
            result.append(word)
        else:
            result.append(aa)
    return result
Example #55
    def __init__(self, word, part_of_speech, lookuptable):
        self.part_of_speech = morphy_tag[
            part_of_speech] if part_of_speech in morphy_tag else wordnet.NOUN
        self.word = wordnet.morphy(word, self.part_of_speech)  #Lemmatization

        self.synset = listify(wordnet.synsets(
            word, pos=self.part_of_speech)) if self.word else None
        self.orphan = not self.synset
        self.db = lookuptable
        self.lemmatizer = WordNetLemmatizer()
        self.kernel = {}
Example #56
def nounify(adj_word):
    set_of_related_nouns = set()

    for lemma in wn.lemmas(wn.morphy(adj_word, wn.ADJ), pos="a"):
        for related_form in lemma.derivationally_related_forms():
            for synset in wn.synsets(related_form.name(), pos=wn.NOUN):
                if wn.synset('person.n.01') in synset.closure(
                        lambda s: s.hypernyms()):
                    set_of_related_nouns.add(synset)

    return set_of_related_nouns
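derivationally_related_forms() is likewise defined on lemmas, not synsets; a short probe of the relation nounify walks (word chosen for illustration):

from nltk.corpus import wordnet as wn

for lemma in wn.lemmas(wn.morphy('stronger', wn.ADJ), pos='a'):
    print(lemma.name(), [rel.name() for rel in lemma.derivationally_related_forms()])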
Example #57
File: parse_job.py  Project: xxyzz/WordDumb
def find_lemma(start, text, lemmas, ll_conn, is_kfx):
    from nltk.corpus import wordnet as wn

    for match in re.finditer(r'[a-zA-Z\u00AD]{3,}', text):
        lemma = wn.morphy(match.group(0).replace('\u00AD', '').lower())
        if lemma in lemmas:
            if is_kfx:
                index = start + match.start()
            else:
                index = start + len(text[:match.start()].encode('utf-8'))
            insert_lemma(ll_conn, (index, ) + tuple(lemmas[lemma]))
Example #58
def find_lemma(start, text, lemmas, ll_conn):
    from nltk.corpus import wordnet as wn

    bytes_str = isinstance(text, bytes)
    pattern = b'[a-zA-Z]{3,}' if bytes_str else r'[a-zA-Z]{3,}'
    for match in re.finditer(pattern, text):
        word = match.group(0).decode('utf-8') if bytes_str else match.group(0)
        lemma = wn.morphy(word.lower())
        if lemma in lemmas:
            insert_lemma(ll_conn, (start + match.start(),) +
                         tuple(lemmas[lemma]))
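A tiny end-to-end check of the token scan (sample sentence chosen for illustration; the lemmas table and ll_conn are project-specific and not reproduced here):

import re
from nltk.corpus import wordnet as wn

text = "The geese were flying south"
for match in re.finditer(r'[a-zA-Z]{3,}', text):
    print(match.group(0), '->', wn.morphy(match.group(0).lower()))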
Example #59
 def get_lemma(self, word):
     """
     Create a lemmatized version of a word
     :param word:
     :return:
     """
     lemma = wn.morphy(word)
     if lemma is None:
         return word
     else:
         return lemma
Example #60
def get_lemma(word):
    """
    The lemmas corresponding to a word
    :param word: a single word from the dictionary
    :return: the lemma
    """
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma