Example No. 1
def process_verb(verb):
    verb = verb.rstrip('\n')  # remove trailing newline
    with open('youtube_setof_verbs.txt') as f:
        verb_dict = f.read().split('\n')
    
    max_score = 0
    finl_verb = (verb, '<>')
    verb_list = re.findall('[A-Z][^A-Z]*', verb)
    
    for prob_verb in verb_list:
        if prob_verb.endswith('ing'):
            prob_verb = prob_verb[:-3]  # strip 'ing' from the verb
            if prob_verb.lower() == 'cutt':
                prob_verb = 'cut'
        if wn.synsets(prob_verb):
            try:
                v1 = wn.synset(prob_verb + '.v.01')
                for yout_verb in verb_dict:
                    if yout_verb != '':
                        # if wn.synsets(yout_verb):
                        v2 = wn.synset(yout_verb + '.v.01')
                        score = v1.wup_similarity(v2)
                        if score > max_score:
                            finl_verb = (prob_verb, yout_verb)
                            max_score = score
            except:
                # entries without a '.v.01' sense land here
                finl_verb = (prob_verb, '<>')
                
    # print finl_verb, max_score
    return finl_verb[1]
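For reference, a minimal standalone sketch of the wup_similarity lookup this example relies on (assumes NLTK is installed and the WordNet corpus has been fetched via nltk.download('wordnet')):

from nltk.corpus import wordnet as wn

v1 = wn.synset('run.v.01')    # first verb sense of 'run'
v2 = wn.synset('walk.v.01')   # first verb sense of 'walk'
print(v1.wup_similarity(v2))  # a score in (0, 1]; higher means more similar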
Example No. 2
def process_subj(subj, flag):
    if flag == 1:
        with open('youtube_setof_subjects.txt') as f:
            subj_dict = f.read().split('\n')
    elif flag == 2:
        with open('youtube_setof_objects.txt') as f:
            subj_dict = f.read().split('\n')
    
    max_score = 0
    finl_subj = (subj, '<>')
    subj_list = subj.split(',')

    if len(subj_list) == 1:
        return subj
    for prob_subj in subj_list:
        prob_subj = prob_subj.strip()
        if wn.synsets(prob_subj):
            try:
                v1 = wn.synset(prob_subj + '.n.01')
                for yout_subj in subj_dict:
                    if yout_subj != '':
                        v2 = wn.synset(yout_subj + '.n.01')
                        score = v1.wup_similarity(v2)
                        if score > max_score:
                            finl_subj = (prob_subj, yout_subj)
                            max_score = score
            except:
                # entries without a '.n.01' sense land here
                finl_subj = (prob_subj, '<>')
                
    # print finl_subj, max_score
    return finl_subj[1]
Example No. 3
def preprocess_docs():
    stopwords = nltk.corpus.stopwords.words('english')
    corpus = list(filtered_corpus())
    counter = 0
    for train, topic, title, text in corpus:
        if counter % 10 == 0:
            print "%.2f %%\r" % (counter * 100.0 / len(corpus),),
            sys.stdout.flush()
        counter += 1
        text = [i for i in nltk.word_tokenize(title) if i.lower() not in stopwords]  # note: only the title tokens are scored
        buf = []
        for word in text:
            synsets = wn.synsets(word)
            grain = []
            wheat = [] 
            for s in synsets:
                grain.append(s.path_similarity(wn.synset('grain.n.08')))
                wheat.append(s.path_similarity(wn.synset('wheat.n.02')))

            grain = [i for i in grain if i is not None]
            wheat = [i for i in wheat if i is not None]

            if len(grain) == 0:
                grain = 0
            else:
                grain = sum(grain) * 1.0 / len(grain)
            if len(wheat) == 0:
                wheat = 0
            else:
                wheat = sum(wheat) * 1.0 / len(wheat)
            buf.append((word, grain, wheat))
        yield train, topic, buf
    print ""
Example No. 4
def get_score(tags, groups):
    sscore = 0
    scount = 0
    illegal_word = 0

    if tags is not None:
        for g in groups:
            for x in tags:
                try:
                    # substring match scores highest; otherwise fall back to
                    # WordNet path similarity between the tag and the group
                    if g in str(x.text).lower():
                        sscore += 2.0
                        scount += 1
                    else:
                        tag = wn.synset(str(x.text).lower() + '.n.01')
                        group = wn.synset(g + '.n.01')
                        sem = wn.path_similarity(group, tag)
                        if sem is not None and sem >= 0.3:
                            sscore += sem
                            scount += 1
                except:
                    illegal_word += 1
    if scount != 0:
        return sscore / scount
    else:
        return 0
Example No. 5
def getSenseSimilarity(worda, wordb):
	"""
	Find similarity between the word senses of two words.
	"""
	wordasynsets = wn.synsets(worda)
	wordbsynsets = wn.synsets(wordb)
	synsetnamea = [wn.synset(syns.name()) for syns in wordasynsets]
	synsetnameb = [wn.synset(syns.name()) for syns in wordbsynsets]

	for sseta, ssetb in [(sseta, ssetb) for sseta in synsetnamea for ssetb in synsetnameb]:
		pathsim = sseta.path_similarity(ssetb)
		wupsim = sseta.wup_similarity(ssetb)
		if pathsim is not None:
			print "Path Sim Score: ", pathsim, " WUP Sim Score: ", wupsim, "\t", sseta.definition(), "\t", ssetb.definition()
Example No. 6
def probability(tokens, category, dictionary, total):
	if category == "sense":
		total_score = 0
		dic = dictionary
		if len(tokens) == 0:
			return 0
		for token in tokens:
			for dict_sense in dic:
				score = wn.synset(token).path_similarity(wn.synset(dict_sense))
				if score is not None:
					total_score += score * dic[dict_sense]
		return total_score / len(tokens)
	else:
		p = 0
		dic = dictionary
		total_instances = total
		for token in tokens:
			if token in dic:
				token_prob = dic[token]
			else:
				token_prob = 0
			curr = token_prob / float(total_instances)
			p += curr
		return p
Example No. 7
def get_similar_words(word):
    lemmas_noun, hypernyms_noun, lemmas_verb, hypernyms_verb = [], [], [], []
    try:
        lemmas_noun =  [str(lemma.name()) for lemma in wn.synset(word + '.n.01').lemmas()]    
    except WordNetError:
        pass

    try:
        hypernyms_noun = [str(lemma.name()).split('.')[0] for lemma in wn.synset(word + '.n.01').hypernyms()]    
    except WordNetError:
        pass

    if len(lemmas_noun) == 0 and len(hypernyms_noun) == 0:
        # only try verbs if there are no similar nouns
        try:
            lemmas_verb =  [str(lemma.name()) for lemma in wn.synset(word + '.v.01').lemmas()]    
        except WordNetError:
            pass

        try:
            hypernyms_verb = [str(lemma.name()).split('.')[0] for lemma in wn.synset(word + '.v.01').hypernyms()]    
        except WordNetError:
            pass
    
    similar_words = lemmas_noun + hypernyms_noun + lemmas_verb + hypernyms_verb
    # filter words which are not purely alphabets (there will be words with underscore)
    # this is because if we want to process such words like "domestic_animal", we have to 
    # implement 2-grams search which is not done here
    pattern = re.compile('^[a-zA-Z]+$')
    return filter(lambda x: pattern.match(x) and x != word, similar_words)
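For context, what lemmas() and hypernyms() return for a first noun sense (values from WordNet 3.0; exact lists can vary by version):

from nltk.corpus import wordnet as wn

dog = wn.synset('dog.n.01')
print([l.name() for l in dog.lemmas()])     # ['dog', 'domestic_dog', 'Canis_familiaris']
print([s.name() for s in dog.hypernyms()])  # ['canine.n.02', 'domestic_animal.n.01']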
Example No. 8
def expand_queries(file):
    '''
    For each term in a query, takes the first synset of the word from wordnet and adds all synonyms of that synset
    '''
    f = open(file)
    for sentence in f:
        sentence = sentence.strip()
        if sentence.find('<text>') != -1:
            query = sentence[sentence.find('>')+1: sentence.rfind('<')]
            additions = ''
            updated_q = nltk.pos_tag(nltk.wordpunct_tokenize(query.lower()))
            full_q = query
            for word, pos in updated_q:
               if word not in stopwords.words('english'):
                   looking_for = str(word)+'.'+str(get_wordnet_pos(pos))+'.01'                   
                   synsets = wn.synsets(word)
                   if looking_for in str(synsets):
                       new_words = wn.synset(looking_for).lemma_names()
                       for new_word in new_words:
                           if new_word.lower() != word.lower():
                               full_q = full_q +' '+ str(new_word)
                   else:
                       if wn.morphy(word) != None:
                           word = wn.morphy(word)
                           looking_for = str(word)+'.'+str(get_wordnet_pos(pos))+'.01'
                           print str(looking_for) + ' THIS IS WORD'
                           synsets = wn.synsets(word)
                           if looking_for in str(synsets):
                               new_words = wn.synset(looking_for).lemma_names()
                               for new_word in new_words:
                                   if new_word.lower() != word.lower():
                                       full_q = full_q +' '+ str(new_word)
            print query + ' '+ full_q
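wn.morphy, used as the fallback above, maps an inflected form to a base form known to WordNet, or returns None (a sketch; the inputs are illustrative):

from nltk.corpus import wordnet as wn

print(wn.morphy('dogs'))   # 'dog'   (detachment rule)
print(wn.morphy('geese'))  # 'goose' (exception list)
print(wn.morphy('xyzzy'))  # None: unknown to WordNet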
Example No. 9
	def get_similarity(self,word1,word2):
		'''Compute word similarity using the WordNet semantic dictionary.'''
		'''
		print 'before stemmed:',word1
		print 'after stemmed:',wn.morphy(word1.lower())
		print 'before stemmed:',word2
		print 'after stemmed:',wn.morphy(word2.lower())
		'''
		#stemmed word
		if wn.morphy(word1.lower()) != None :
			word1 = wn.morphy(word1.lower())
		if wn.morphy(word2.lower()) != None :
			word2 = wn.morphy(word2.lower()) 
		word1_synsets = wn.synsets(word1)
		#print word1_synsets
		word2_synsets = wn.synsets(word2)
		#print word2_synsets
		sim = 0

		for syn1 in word1_synsets:
			w1 = wn.synset(syn1.name())
			for syn2 in word2_synsets:
				w2 = wn.synset(syn2.name())
				tmp = w1.path_similarity(w2)
				#print tmp, syn1.name(), syn2.name()
				if tmp is not None and tmp > sim:
					sim = tmp
		return sim
Example No. 10
def print_other_lexical_rel():
    good1 = wn.synset('good.a.01')
    wn.lemmas('good')
    print("Antonyms of 'good': " + str(good1.lemmas()[0].antonyms()))
    print("")
    print("Entailment of 'walk': " + str(wn.synset('walk.v.01').entailments()))
    print("")
Example No. 11
    def overlapCount(self, sentence):
        #set count to be one so we can guess in case there are no sentences with overlap
        count = 1

        sWiki = TextBlob(self.arrayToString(sentence))
        sVerbs = self.getVerbs(sWiki)

        #compare verbs for similarities and based on wordnet's similarity score
        #if they're exactly the same, they'll score 1
        for sverb in sVerbs:
            synv = wn.synset(sverb + '.v.01')
            for qverb in self.questionVerbs:
                synq = wn.synset(qverb + '.v.01')
                sim = synv.path_similarity(synq)
                if sim is not None:
                    count += sim

        #remove stop words from sentence AFTER we've gotten POS tags
        s = self.removeStopWords(sentence)
        sLower = self.removeStopWords(sentence.lower())

        for word in self.qList:
            if word in s:
                count += 1
            elif word.lower() in sLower:
                count += 0.1
        return count
Example No. 12
 def compare(self, word1, word2):
     tmp1 = wn.synsets(word1)[0].name()
     tmp2 = wn.synsets(word2)[0].name()
     w1 = wn.synset(tmp1)
     w2 = wn.synset(tmp2)
     val = w1.wup_similarity(w2)
     return val
Example No. 13
def is_ingredient(word):
    """
    Return True if the word is an ingredient, False otherwise.

    >>> is_ingredient('milk')
    True
    >>> is_ingredient('blackberries')
    True
    >>> is_ingredient('Canada')
    False
    >>> is_ingredient('breakfast')
    False
    >>> is_ingredient('dish')
    False
    """
    reject_synsets = ['meal.n.01', 'meal.n.02', 'dish.n.02', 'vitamin.n.01']
    reject_synsets = set(wordnet.synset(w) for w in reject_synsets)
    accept_synsets = ['food.n.01', 'food.n.02']
    accept_synsets = set(wordnet.synset(w) for w in accept_synsets)
    for word_synset in wordnet.synsets(word, wordnet.NOUN):
        all_synsets = set(word_synset.closure(lambda s: s.hypernyms()))
        all_synsets.add(word_synset)
        for synset in reject_synsets:
            if synset in all_synsets:
                return False
        for synset in accept_synsets:
            if synset in all_synsets:
                return True
    return word in wordlists.ingredients
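The closure() call above walks the transitive hypernym chain; a minimal sketch of the check is_ingredient performs for a single word ('milk' here, matching the doctest):

from nltk.corpus import wordnet as wn

milk = wn.synset('milk.n.01')
ancestors = set(milk.closure(lambda s: s.hypernyms()))
food = set([wn.synset('food.n.01'), wn.synset('food.n.02')])
print(bool(ancestors & food))  # True, per the is_ingredient doctest above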
Example No. 14
	def ontoList(self, synset):
		# things to pick from
		if self.pos == 'v':
			ln = wn.synset(synset).lexname().split('.')[1]
			hyper = self.lemmatize(self.getHypernyms(synset))
			definition = self.getDefinition(synset)
			lemmas = self.lemmatize(self.getLemmas(synset))
			examples = self.getExamples(synset)
			strings = [string.replace("_", " ") for string in self.getFrameStrings(synset)]
			hypo = self.lemmatize(self.getHyponyms(synset))  
			ontologyList = [strings, ln, lemmas, examples, hypo, definition, hyper]
		else:
			ln = wn.synset(synset).lexname().split('.')[1]
			hyper = self.lemmatize(self.getHypernyms(synset))
			definition = self.getDefinition(synset)
			lemmas = self.lemmatize(self.getLemmas(synset))
			examples = self.getExamples(synset)
			hypo = self.lemmatize(self.getHyponyms(synset)) 
			ontologyList = [ln, lemmas, examples, hypo, definition, hyper]

		returnList = list()
		for o in ontologyList:
			if o:
				returnList.append(o)
		return returnList
Example No. 15
def calculate_and_write_edge_weigthings_for_synsets(synset_filenames_dict, file_name):
  max_co_occurrence = calculate_max_co_occurrence(synset_filenames_dict)
  edge_weigthings_for_synsets = dict()
  how_many_added = 0
  how_many_done = 0
  how_many_to_do = len(synset_filenames_dict.keys()) * (len(synset_filenames_dict.keys())-1)
  write_edge_weightings_to_file(dict(), file_name)

  for synset1, filenames1 in synset_filenames_dict.iteritems():
    for synset2, filenames2 in synset_filenames_dict.iteritems():
      if synset1 < synset2:
        how_many_done += 1
        #if (synset1.name, synset2.name) not in similarity_histogram:
        similarity = wn.synset(synset1).lch_similarity(wn.synset(synset2))
        co_occurence = len(set(synset_filenames_dict[synset1]).intersection(set(synset_filenames_dict[synset2])))
        normalized_co_occurrence = co_occurence / float(max_co_occurrence)  # avoid Python 2 integer division
        if similarity < 2.0:
          similarity = 0
        if normalized_co_occurrence < 0.4:
          normalized_co_occurrence = 0
        edge_weighting = similarity + 4*normalized_co_occurrence
        if edge_weighting != 0:
          edge_weigthings_for_synsets[(synset1, synset2)] = edge_weighting
          how_many_added += 1
        if how_many_added > 1000:
          print_status("Done with " + str(how_many_done) + " von " + str(how_many_to_do) + "\n")
          write_edge_weightings_to_file(edge_weigthings_for_synsets, file_name, append_to_file=True)
          edge_weigthings_for_synsets = dict()
          how_many_added = 0
  write_edge_weightings_to_file(edge_weigthings_for_synsets, file_name, append_to_file=True)
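A note on the 2.0 cutoff above: lch_similarity is only defined between synsets of the same part of speech and, unlike path or Wu-Palmer scores, is not bounded by 1 (for WordNet 3.0 nouns it tops out around 3.6):

from nltk.corpus import wordnet as wn

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.lch_similarity(cat))  # roughly 2.03 on WordNet 3.0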
Example No. 16
def get_message(message_parser):
  message_split =  message_parser.split("|")
  mobile_number = message_split[0]
  need_synonyms = ["require", "want", "motivation", "motive", "ask", "call for", "demand", "involve", "necessitate", "need", "postulate", "take", "indigence", "pauperism", "pauperization", "penury"]
  supply_synonyms = ["issue", "furnish", "provide", "render", "add", "append", "cater", "ply", "provision", "supplying", "afford", "yield", "commit", "consecrate", "dedicate", "devote", "spring", "springiness", "impart", "leave", "pass on", "ease up", "give way", "move over", "render", "feed", "generate", "return", "throw", "chip in", "contribute", "kick in", "grant", "pay", "break", "cave in", "collapse", "fall in", "founder", "hand", "pass", "reach", "turn over", "have", "hold", "make", "establish", "open", "apply", "gift", "present", "sacrifice"]
  tokens = nltk.word_tokenize(message_split[1])
  need = len(set(tokens) & set(need_synonyms)) > 0
  need_json = {"need": True} if need else {"supply": True}
  need_json.update({"number": mobile_number})
  tagged_tokens = nltk.pos_tag(tokens)
  for i in range(len(tagged_tokens)):
    if tagged_tokens[i][1] == 'CD':
      current_count = get_integer(tagged_tokens[i][0])
    elif  tagged_tokens[i][1] == 'DT':
      current_count = 1
    elif  tagged_tokens[i][1] in ['NNS','NN']:
      if tagged_tokens[i][0] in ["cups", "cup", "packets","packet","bottle", "bottles", "bundle","bundles","packages", "package", need_synonyms, supply_synonyms]:
          continue
      current_category = tagged_tokens[i][0]
      c = wn.synsets(current_category)
      food = wn.synset('food.n.01')
      water = wn.synset('water.n.01')
      food = food.wup_similarity(c[0])
      water = water.wup_similarity(c[0])
      current_category = "food" if food > water else "water"
      # default the count if no CD/DT token set one
      try:
        current_count = current_count
      except NameError:
        current_count = 1
      if current_count is None:
        current_count = 1
      print current_count
      need_json.update({current_category: current_count})
      current_count = None
  return need_json
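The food-vs-water decision above, in isolation (a sketch; 'rice' is an illustrative token):

from nltk.corpus import wordnet as wn

c = wn.synsets('rice')[0]
food_score = wn.synset('food.n.01').wup_similarity(c)
water_score = wn.synset('water.n.01').wup_similarity(c)
print("food" if food_score > water_score else "water")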
Example No. 17
def define(word, Webster, bestdef, changed, old_topic, new_topic):
	"""Defines a word, if desired by the user, and if the topic has changed."""
	import answer
	if ((Webster != "") and (not changed)):	return (False, Webster)
	if (Webster == ""):
		answer.write("The word " + word + " was not defined under the topic " + old_topic + ".")
	else:
		asked = ask.getPlay("The word " + word + " was defined under the topic " + old_topic + " as " + Webster + ".\nDo you want this meaning to carry over to the new topic " + new_topic + "?  ")
		if yes(asked):
			return (False, Webster)
	undone = True
	dno = 1
	while (undone):
		if (dno == bestdef):	dno += 1
		string = word + ".n." + str(dno)
		try:
			if (dno < len(wordnet.synsets(word, pos = wordnet.NOUN))):
				asked = ask.getPlay("Does " + wordnet.synset(string).definition + " work for your usage of " + word + "?  ")
				undone = not yes(ask)
				newdef = wordnet.synset(string).definition
				dno += 1
			else:
				newdef = ask.getPlay("Then how would you define " + word + "?  ")
				undone = False
		except(Exception):
			newdef = ask.getPlay("How would you define " + word + "?  ")
			undone = False
	return (True, newdef)
Example No. 18
def similarity(word1, word2, tag):
    obj1 = wn.synset(word1 + "."+ tag+".01")
    obj2 = wn.synset(word2 + "."+ tag+".01")
    #print(obj1)
    brown_ic = wordnet_ic.ic('ic-brown.dat')    # information content from the Brown corpus
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')  # information content from SemCor (unused below)
    value = obj1.res_similarity(obj2, brown_ic)
    return value
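res_similarity needs an information-content dictionary, which is why the ic files are loaded above (requires nltk.download('wordnet_ic'); the value is from the NLTK similarity howto):

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.res_similarity(cat, brown_ic))  # roughly 7.91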
Example No. 19
 def wsd(self,sent,target, tag=None):
     if tag is None:
         self.scoring(sent, target)
     else:
         self.scoring(sent, target,tag)
     sense = self.getGreedyBestSenses(10)
     print wordnet.synset(sense).definition()
     return sense
Example No. 20
def get_relative_similarity(a, b):
    '''
        Returns path similarity between two words a and b.
        Used for merging two clusters
    '''
    x=wn.synset("%s.n.01"%a)
    y=wn.synset("%s.n.01"%b)
    return x.path_similarity(y)
Example No. 21
def Sim2(text1, text2) :
    
    stop = stopwords.words('english')
    
    text1=regexpProcessing(text1)
    text2=regexpProcessing(text2)
    
    # strip and lower-case both texts
    TEXT1 = text1.strip().lower()
    TEXT2 = text2.strip().lower()
    
    token1 = generateTokens(TEXT1)
    token2 = generateTokens(TEXT2)
    
    t1List=[]
    for tok1 in token1:
        word1 = Word(tok1)
        w1=word1.spellcheck()
        correctw=w1[0][0]
        confidence = w1[0][1]
        
        if (confidence > 0.8) and (correctw not in stop):
            t1List.append(correctw)
            
            
    t2List=[]
    for tok2 in token2:
        word2 = Word(tok2)
        w2=word2.spellcheck()
        correctw=w2[0][0]
        confidence = w2[0][1]
        
        if (confidence > 0.8) and (correctw not in stop):
            t2List.append(correctw)

    # assumption: the undefined 'CosineSimilarity' the original returned is
    # taken to mean the cosine of the two spell-corrected token-count vectors
    from collections import Counter
    import math

    counts1, counts2 = Counter(t1List), Counter(t2List)
    dot = sum(counts1[w] * counts2[w] for w in set(counts1) & set(counts2))
    norm1 = math.sqrt(sum(c * c for c in counts1.values()))
    norm2 = math.sqrt(sum(c * c for c in counts2.values()))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot / (norm1 * norm2)
Example No. 22
 def get_threshold(w1, w2):
     if w1 == w2 or w1 == w2 + "s" or w1 == w2 + "es":
         return 1
     else:
         try:
             syn1 = wordnet.synset(w1 + ".n.01")
             syn2 = wordnet.synset(w2 + ".n.01")
             return syn1.wup_similarity(syn2)
         except WordNetError:
             return 0
Example No. 23
def senseRange(word, pos):
    '''Given an English word and its POS tag (as required for WordNet), return the number of senses it has.'''
    sense = 0
    while True:
        try:
            # sense numbers start at 01, so probe sense + 1
            wn.synset("%s.%s.%02d" % (word, pos, sense + 1))
        except nltk.corpus.reader.wordnet.WordNetError:
            return sense
        else:
            sense += 1
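Since lemma sense numbers are contiguous from 01, the same count can be read off directly; a sketch of the equivalence:

from nltk.corpus import wordnet as wn

word, pos = 'dog', 'n'
print(len(wn.synsets(word, pos)))  # the count senseRange(word, pos) converges to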
Example No. 24
def buildCategoryForest(category):
  treeList = []
  hypo = lambda s:s.hyponyms()
  
  treeList.append(getEte2Tree(wn.synset('travel.n.01').tree(hypo)))
  treeList.append(getEte2Tree(wn.synset('travel.v.03').tree(hypo)))
  treeList.append(getEte2Tree(wn.synset('travel.v.04').tree(hypo)))
  treeList.append(getEte2Tree(wn.synset('travel.v.05').tree(hypo)))
  treeList.append(getEte2Tree(wn.synset('travel.v.06').tree(hypo)))
  return treeList
Example No. 25
def get_path_similarity_between_boy_and_dog():
    """
    Computes the path similarity between "boy" and "dog".

    Returns
    -------
    A float.
    """

    return wn.path_similarity(wn.synset('boy.n.01'), wn.synset('dog.n.01'))
Example No. 26
def findVerbRelatedToNoun(noun):
    threshold = random.uniform(0.2, 0.5)
    nounSynset = wn.synset(noun + ".n.01")
    while True:
        verb = random.choice(verbs)
        verbSynset = wn.synset(verb + ".v.01")
        threshold -= 0.01

        if (nounSynset.path_similarity(verbSynset) > threshold or verbSynset.path_similarity(nounSynset) > threshold):
            return verb
Example No. 27
def findTwoRelatedNouns():
    threshold = 0.8
    while True:
        noun1 = random.choice(nouns)
        noun2 = random.choice(nouns)
        word1 = wn.synset(noun1 + ".n.01")
        word2 = wn.synset(noun2 + ".n.01")
        sim = word1.wup_similarity(word2)
        if noun1 != noun2 and sim > threshold:
            print noun1 + " " + noun2 + " " + str(sim)
            return [noun1, noun2]
Example No. 28
 def determineSynonym(first, second):
     if len(wordnet.synsets(first)) == 0 or len(wordnet.synsets(second)) == 0:
         return 0
     else:
         w1 = wordnet.synset(wordnet.synsets(first)[0].name())
         w2 = wordnet.synset(wordnet.synsets(second)[0].name())
         similarity_ratio = w1.wup_similarity(w2)
         if similarity_ratio is not None:
             return similarity_ratio
         else:
             return 0
Example No. 29
 def __init__(self, train=False):
     self.tagger = PerceptronTagger()
     self.model = None
     # BOW: triangle, rectangle, circle, hand
     # verbs: draw, wave, rotate
     self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand']
     self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')]
     self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS)
     if train: self.train_svm()
     else: self.load_model()
     return
Example No. 30
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet
    Returns: float (the semantic score between word1 and word2)
    '''
    try:
        w1 = wn.synset('%s.n.01'%(word1))
        w2 = wn.synset('%s.n.01'%(word2))
        return wn.path_similarity(w1,w2,simulate_root = False)
    except:
        return 0
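For nouns, path_similarity works even without a simulated root (they share a real top node); simulate_root=False mainly matters for verbs, where the call can return None and semantic_score then propagates it to the caller (a sketch):

from nltk.corpus import wordnet as wn

# nouns share the 'entity.n.01' root, so a float comes back here
w1, w2 = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(wn.path_similarity(w1, w2, simulate_root=False))  # ~0.2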