def validate(self, tag, media):
    # check with wordnet: if there is a synset, continue, else return -1
    # check the category and decide whether to verify
    # use the Google Vision API to verify
    # result: 1 -> good (keep image); 0 -> bad (discard image); -1 -> cannot validate (keep)
    tag = singularize(tag).lower()
    synset = wordnet.synsets(tag, pos=NOUN)
    if not synset:
        return -1
    category = synset[0].lexname
    if self.VALIDATE_CATEGORY == 'all':
        pass
    elif category in self.VALIDATE_CATEGORY:
        pass  # do not return yet
    else:
        return -1  # not 'all' and cannot be validated
    img = requests.get(media)
    gImage = google.cloud.vision.types.Image(content=img.content)
    response = self.vision_client.label_detection(image=gImage)
    # keep only detected labels with a good probability (score)
    labels = [d.description for d in response.label_annotations if d.score > 0.9]
    if tag in labels:
        return 1
    # compare synonyms: check if any synonym is among the detected labels
    synonyms = synset[0].synonyms
    for synonym in synonyms:
        if singularize(synonym).lower() in labels:
            return 1
    return 0
def unify_query(query):
    """ a peek of sorts ..
    param: list of Word objs
    return: synset entry from wn
    """
    # build the WordNet query from the Word objects
    query = build_wn_query(query)
    print 'wordnet query: {0}'.format(query)
    s = wordnet.synsets(singularize(query), pos=wordnet.NOUN)
    if len(s) == 0:
        # this is a bit hacky: it's based on the assumption that, if the lookup fails,
        # the query may be a two-word NN, i.e. 'thrill ride' fails, 'ride' doesn't
        print 'no entry for {0}..'.format(query)
        s = wordnet.synsets(singularize(query.split()[1]), pos=wordnet.NOUN)
        if len(s) == 0:
            print 'no entry for {0}'.format(query.split()[1])
    return s
def getKeywords(self):
    """ Extract keywords using POS tagging
    :return: Query keywords
    """
    nouns = []
    if len(self.sentences) == 1:
        s = re.sub('[' + string.punctuation + ']', '', self.sentences[0])
        self.r.extract_keywords_from_text(s)
        rp = self.r.get_ranked_phrases()
        for n in rp:
            tokens = nltk.tokenize.word_tokenize(n)
            if len(tokens) == 1:
                item, tag = nltk.pos_tag(tokens)[0]
                if 'NN' in tag:
                    if len(item) > 1:
                        if singularize(item) not in nouns and pluralize(item) not in nouns:
                            nouns.append(item)
            else:
                nouns.append(n)
        return nouns
    for s in self.sentences:
        s = re.sub('[' + string.punctuation + ']', '', s)
        tokens = nltk.tokenize.word_tokenize(s)
        tagged = nltk.pos_tag(tokens)
        final_nouns = []
        for item, t in tagged:
            if 'NN' in t:
                if len(item) > 1:
                    if singularize(item) not in final_nouns and pluralize(item) not in final_nouns:
                        final_nouns.append(item)
        nouns.append(final_nouns)
    return nouns
def getAnonymizationStructure(self, words):
    # deal with first word
    if singularize(words[0].lower()) in self.ind.stop_words or singularize(words[0].lower()) in self.ents:
        words[0] = words[0].lower()
    self.normalizeTables(words)
    spans = {}
    i = 0
    while i < len(words):
        for w in range(3, 0, -1):
            at_most_2 = {}
            span = ' '.join(words[i:i + w]).replace('#', ' ')
            exact = True
            if self.allInitCaps(span):
                exact = False
            if not self.allInitCaps(span):
                continue
            (docs, typs) = self.ind.getKey(span, exact=exact, case=(len(span) <= 2))
            if docs:
                spans[i] = {'width': w, 'docs': [], 'types': []}
                for j in range(0, len(docs)):
                    if at_most_2.setdefault(typs[j], 0) < 2:  # typs[j] not in spans[i]['types']
                        spans[i]['types'].append(typs[j])
                        spans[i]['docs'].append(docs[j])
                        at_most_2[typs[j]] += 1
                i = i + w - 1  # we have finished the span
                break
        i += 1
    return spans
def __init__(self, doc_name, n):
    self.freq = {}
    path = "C:/Users/ARKAZA KUMARI/Desktop/Mini Project/Mini Project/Source Code/" + doc_name
    line = ""
    with open(path) as f:
        line = f.readline()
    line = line.split(" ")
    line = [word.lower() for word in line if word not in ["", " "]]
    stop_words = set(stopwords.words('english'))
    new_stopwords = ['also', 'may', 'must', 'since', 'could', 'whether']
    new_stopwords_list = stop_words.union(new_stopwords)
    filtered_words = [word for word in line if word not in new_stopwords_list]
    writepath = "C:/Users/ARKAZA KUMARI/Desktop/Mini Project/Mini Project/Source Code/words" + str(n) + ".txt"
    with open(writepath, 'w') as f:
        for i in filtered_words:
            # print(i)
            i = re.sub(r'[^\w]', '', i)
            count = self.freq.get(singularize(i), 0)
            self.freq[singularize(i)] = count + 1
            # f("%s\n" % i)
            f.write(singularize(i) + "\n")
        f.close()
def singularize(self, word):
    '''Given the base form of a noun, return its singular form (nouns only).

    Args:
        word (str): base form of a noun

    Raises:
        ValueError: if no base form can be found, or the base form is not in the vocabulary

    Returns:
        str: singular form of the noun
    '''
    if word in self._word2index:
        # module-level singularize(), not this method
        return singularize(word)
    try:
        base_form_word = lemma(word)
    except Exception:
        raise ValueError("Cannot find base form for '{}'".format(word))
    if base_form_word in self._word2index:
        return singularize(base_form_word)
    raise ValueError(
        "Found the base form for '{}': '{}', but even the base form is not in the vocabulary"
        .format(word, base_form_word))
def inject(self, title, word_pair):
    for (i, slot), word in zip(title.slots, word_pair):
        word = word.replace("_", " ").title()
        if slot == 'NOUN':
            title.inject(singularize(word), slot, i)
        elif slot == 'NOUNS':
            title.inject(pluralize(singularize(word)), slot, i)
        else:
            title.inject(word, slot, i)
def approx_match(label, gold_label, use_includes=False):
    if label == gold_label:
        return True
    # Approximate matching strategy from Zesch and Gurevych (2009).
    # Following their human validation test, we implement the MORPH and
    # INCLUDES matching strategies.
    singularized_label_tokens = [singularize(token) for token in label.split()]
    singularized_gold_label_tokens = [singularize(token) for token in gold_label.split()]
    if use_includes:
        return contains_sublist(singularized_label_tokens, singularized_gold_label_tokens)
    else:
        return singularized_label_tokens == singularized_gold_label_tokens
def pluralizationError(text, nlp, correctFlag=False):
    '''
    Purpose: To check for pluralization errors. Additionally, it returns the corrected sentence.
    Parameters:
        text: string
            A string of text - a single sentence or a paragraph.
        correctFlag: boolean
            True or False
    Returns:
        count: integer
        text: Corrected sentence. (If correctFlag is True)
    '''
    doc = nlp(text)
    count = 0
    text = ""
    for s in doc.sentences:
        for i in range(len(s.words)):
            if (i != len(s.words) - 1) and (s.words[i].xpos == "NN" or s.words[i].xpos == "NNP"):
                if s.words[i + 1].xpos in ["VB", "VBP"]:
                    count += 1
                    text += pluralize(s.words[i].text) + " "
                else:
                    text += s.words[i].text + " "
            elif (i != len(s.words) - 1) and (s.words[i].xpos == "NNS" or s.words[i].xpos == "NNPS"):
                if s.words[i + 1].xpos == "VBZ":
                    text += singularize(s.words[i].text) + " "
                else:
                    text += s.words[i].text + " "
            elif (i != len(s.words) - 1) and s.words[i].xpos == "CD":
                if s.words[i].text == "1" or s.words[i].text == "one":
                    if s.words[i + 1].xpos == "NNS" or s.words[i + 1].xpos == "NNPS":
                        count += 1
                        s.words[i + 1].text = singularize(s.words[i + 1].text)
                    text += s.words[i].text + " "
                else:
                    if s.words[i + 1].xpos == "NN" or s.words[i + 1].xpos == "NNP":
                        count += 1
                        s.words[i + 1].text = pluralize(s.words[i + 1].text)
                    text += s.words[i].text + " "
            else:
                text += s.words[i].text + " "
    if correctFlag:
        return count, text
    return count
def get_related_noun_or_not(noun, d=True):
    w = wordnet.synsets(noun)
    if w:
        w = w[0]
        w1 = w.hyponyms()
        w2 = w.hypernyms()
        if w1 + w2:
            nw = random.choice(w1 + w2)
            if nw and nw.senses:
                return nw.senses[0]
    elif wordnet.synsets(singularize(noun)) and d:
        # recurse on the singular form, but only once (d=False)
        return get_related_noun_or_not(singularize(noun), False)
    return noun
def get_document_topics(doc, name):
    lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    englishStopWords = get_stopwords('english', name)
    text = [singularize(word) for word in doc.lower().split()
            if singularize(word) not in englishStopWords and word.isalpha() and len(word) > 1]
    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    document_topics = lda.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.05)
    if len(document_topics) > 0:
        primary_topic_tuple = max(document_topics, key=lambda x: x[1])
        topic_terms = lda.show_topic(primary_topic_tuple[0])
        print topic_terms
        return document_topics, topic_terms
    else:
        return [], ''
def get_related_or_not(word, d=True, pos='NN'):
    w = wordnet.synsets(word, pos=pos)
    if w:
        w = w[0]
        w1 = w.hyponyms()
        w2 = w.hypernyms()
        if w1 + w2:
            nw = random.choice([w] + w1 + w2)
            if nw and nw.senses:
                return nw.senses[0]
    elif wordnet.synsets(singularize(word)) and d:
        # recurse on the singular form, but only once (d=False)
        return get_related_or_not(singularize(word), False, pos)
    return word
def word_normalize(text):
    normal_form = parse_text(text)
    # tags for detecting part of speech
    Nouns_tags = ["NNS", "NNPS"]
    Adjective_tags = ["JJR", "JJS"]
    Verb_tags = ["VBD", "VBG", "VBN", "VBP", "VBZ"]
    buff_string = []  # buffer list for the normalized words
    # single pass over the parsed words: plural nouns are singularized,
    # comparative/superlative adjectives are reduced to their basic form,
    # and inflected verbs are lemmatized to the infinitive
    # (singularize and lemma are the standard functions from pattern.en)
    for word in normal_form:
        if word.tag in Nouns_tags:
            # make the noun singular
            buff_string.append(str(singularize(word.string, pos=NOUN)))
        elif word.tag in Adjective_tags:
            # make the basic adjective form
            buff_Adj = str(singularize(word.string, pos=ADJECTIVE))
            buff_string.append(not_comperative_superlative(buff_Adj))
        elif word.tag in Verb_tags:
            # verb to infinitive form
            buff_string.append(lemma(str(word.string)))
        else:
            buff_string.append(str(word.string))
    return buff_string
def wordListPrint(fileName):
    """ Remove all character names, plurality, and stop words """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words("english")
    for word in corpusBuilder(fileName):
        word = word.strip(".:,()?!;[]")
        word = singularize(word)  # singularize returns a new string, so assign it back
        if word.lower() not in characterList and word.lower() not in swords and len(word) > 1:
            print "%s\t%s" % (word.lower(), 1)
        else:
            continue
def stem_word(word):
    try:
        if word.endswith("s"):
            if singularize(word) in nltk_words:
                return singularize(word)
            else:
                return word
        if word.endswith("d") or word.endswith('ing'):
            if conjugate(word) in nltk_words:
                return conjugate(word)
            else:
                return word
    except:
        return word
    return word
def wordListPrint(fileName):
    """ Remove all character names, plurality, and stop words """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words('english')
    for word in corpusBuilder(fileName):
        word = word.strip('.:,()?!;[]')
        word = singularize(word)  # assign the singularized form back
        if word.lower() not in characterList and word.lower() not in swords and len(word) > 1:
            print '%s\t%s' % (word.lower(), 1)
        else:
            continue
def wordListCleaner(fileName):
    """ Remove all character names, plurality, and stop words """
    wordList = []
    characterList = characterBuilder(fileName)
    swords = stopwords.words('english')
    for word in corpusBuilder(fileName):
        word = word.strip('.:,()?!;[]')
        word = singularize(word)  # assign the singularized form back
        if word.lower() not in characterList and word.lower() not in swords and len(word) > 1:
            wordList.append(word.lower())
        else:
            continue
    return wordList
def inject_sub_nn(sent_i, e_config):
    target_indices = []
    for i, w_i in enumerate(sent_i):
        if w_i['tag'] in ('NN', 'NNS'):
            target_indices.append(i)
    if target_indices:
        target_index = target_indices[random.randint(0, len(target_indices) - 1)]
        target_token = sent_i[target_index]['form']
        target_tag = sent_i[target_index]['tag']
        new_token = ""
        new_tag = ""
        if target_tag == "NN":
            new_token = pluralize(target_token)
            new_tag = "NNS"
        elif target_tag == "NNS":
            new_token = singularize(target_token)
            new_tag = "NN"
        else:
            raise ValueError("unexpected tag: " + target_tag)
        sent_i[target_index]['form'] = str(new_token)
        sent_i[target_index]['tag'] = new_tag
        sent_i[target_index]['ctag'] = new_tag
    else:
        pass
    return sent_i
def process(line):
    # replace some known utf-8 chars with ascii
    line = re.sub("\xe2\x80\x99", "x", line)  # U+2019 (right single quotation mark)
    line = re.sub("\xe2\x80\x93", "-", line)  # U+2013 (EN-DASH)
    # remove the rest of the non-ascii chars
    line = re.sub(r'[^\x00-\x7F]+', ' ', line)
    sentences = nltk.tokenize.sent_tokenize(line)
    # print('---------------')
    tags = set()
    for sentence in sentences:
        # words
        all_words = [singularize(w).capitalize() for w in nltk.tokenize.word_tokenize(sentence)]
        words = {remove_nonalpha(w).lower() for w in all_words if accept_word(w)}
        # search solr
        for word in words:
            # print(word)
            tags.update(query(word))
        # bigrams
        all_bigrams = nltk.bigrams(all_words)
        bigrams = {b for b in all_bigrams if accept_word(b[0]) and accept_word(b[1])}
        for bigram in bigrams:
            b = '%s_%s' % (remove_nonalpha(bigram[0]), remove_nonalpha(bigram[1]))
            b = b.lower()
            # print('>>>>>>>>> %s' % b)
            tags.update(query(b))
    return ",".join(tags).encode('utf-8')
def conceptnet_relatedness(subject, candidates, object):
    base_score = call_cp_api(subject, object)
    pred_subject = subject
    # print(base_score)
    # Is there any other label in the ranking making more sense?
    for o_class, confidence in candidates.items():
        f_class, _ = formatlabel(o_class)
        if f_class == subject:
            continue  # Skip the object itself
        score = call_cp_api(f_class, object)
        if score > base_score:
            base_score = score
            pred_subject = o_class
    print("CONCEPTNET: Within the ranking, the most likely subject is %s" % pred_subject)
    if singularize(pred_subject) == pred_subject:
        # Re-format back for evaluation
        pred_subject = pluralize(pred_subject)
    pred_subject = reverse_map(pred_subject)
    return pred_subject.replace('_', '-'), base_score
def perturb(self, word, tag):
    res = ""
    # perturb verb
    if 'V' in tag:
        vs = pe.lexeme(word)
        res = choice(vs)
        while (res == word or len(res) > len(word)) and (vs[0] != word):
            res = choice(vs)
        if vs[0] == word:
            res = vs[1]
    # perturb plural/singular noun
    if 'NNS' == tag:
        res = pe.singularize(word)
        if res == word:
            res = word[:-1]
    if len(res) > 0:
        return (res, word, (0, len(res)))
    else:
        # if the perturbed result is empty, we just randomly remove some chars from the word
        removeLen = randint(1, min(len(word) - 1, 3))
        lenw = len(word)
        removestart = lenw - removeLen
        return (word[:removestart] + word[removestart + removeLen:], word, (0, lenw - removeLen))
def do_flower(self, i, j):
    """Process finding a flower and possibly doing something with it"""
    # Get a random color and flower name
    color = random.choice(self.JSON['colors'])['color']
    flower = singularize(random.choice(self.JSON['flowers']))
    # Print them
    self.TEMP += "There was a beautiful " + color + " " + flower + " there. "
    self.TEMP += "It smelled like " + pluralize(random.choice(self.JSON['fruits'])) + "."
    # Put a square on the map to mark the flower
    self.IMAGE.filledRectangle((i * 15 + 4, j * 15 + 4), (i * 15 + 11, j * 15 + 10), self.COLORS['purple'])
    # Is the narrator keeping this flower?
    if random.randrange(100) < 10:
        self.TEMP += " I picked it"
        if self.FLOWERS:
            self.TEMP += " and added it to the rest of my bouquet"
        self.TEMP += "."
        self.FLOWERS.append({'color': color, 'flower': flower})
    # Does the narrator eat this flower instead?
    elif random.randrange(100) < 5:
        self.TEMP += " For some reason I ate it. It tasted " + random.choice(self.TASTES) + "."
    self.TEMP += "\n"
    self.THEN = False
def transform(term, term_modified):
    if term == 'VBZ' or term == 'VBP' or term == 'VBN' or term == 'VBG' or term == 'VBD':
        return lemmatizer.lemmatize(''.join(term_modified), 'v')
    elif term == 'NNS':
        return singularize(''.join(term_modified))
    else:
        return term_modified
def pos_all(word):
    rlist = []
    _rtense = ('infinitive', 'present', 'past', 'future')
    _rperson = (1, 2, 3)
    _rnumber = ('singular', 'plural')
    _rmood = ('indicative', 'imperative', 'conditional', 'subjunctive')
    _raspect = ('imperfective', 'perfective', 'progressive')
    for rtense in _rtense:
        for rperson in _rperson:
            for rnumber in _rnumber:
                for rmood in _rmood:
                    for raspect in _raspect:
                        item = conjugate(word, tense=rtense, person=rperson, number=rnumber,
                                         mood=rmood, aspect=raspect, negated=False)
                        if item not in rlist:
                            rlist.append(item)
    print bcolors.Magenta + "All pos of " + word
    print_list(rlist, 4)
    print "Singular : " + singularize(word) + "  Plural : " + pluralize(word)
    print "Comparative : " + comparative(word) + "  Superlative : " + superlative(word)
def replace_sql(sql, select_clause, from_clause, where_clause):
    """Perform replacement on skeleton SQL"""
    sql = sql.substitute(columns=select_clause, tables=from_clause, where=where_clause)
    # Build a GROUP BY clause if the SELECT has a COUNT
    group_by = ''
    print str(select_clause.find('COUNT')) + ' :: ' + select_clause
    if select_clause.find('COUNT') >= 0:
        group_by = ' GROUP BY ' + singularize(from_clause.strip().split(' ')[0]) + '_type\n'
    sql = sql + group_by
    # Build an ORDER BY clause 50% of the time
    order_by = ''
    if random.choice(range(1, 100)) < 50:
        if select_clause.split(' ')[0] == '*':
            order_by = ' ORDER BY ' + random.choice(from_clause.strip().split(' ')[0:]) + '_name'
        elif select_clause.split(' ')[0] != 'COUNT(*)':
            order_by = ' ORDER BY ' + select_clause.split(' ')[0].replace(',', '')
    if len(order_by):
        order_by = order_by + random.choice(['', ' ASC', ' DESC'])
    # But only attach the ORDER BY if it keeps us under 140 characters
    if len(sql + order_by) < 140:
        sql = sql + order_by
    return sql.strip() + ';'
def cleanData(data_matrix):
    printable = set(string.printable)
    prepositions = ["is", "a", "at", "the", "which", "on ", "to"]
    for line in data_matrix:
        line[1] = line[1].replace("UPDATE 5-", "")
        line[1] = line[1].replace("UPDATE 1-", "")
        line[1] = line[1].replace("UPDATE ", "")
        line[1] = line[1].replace("UPDATE: ", "")
        line[1] = line[1].replace("Companies", "")
        line[1] = line[1].replace("Insight - ", "")
        line[1] = line[1].replace(" - Quick Facts", "")
        line[1] = line[1].replace(" ...", "")
        line[1] = filter(lambda x: x in printable, line[1])
        line[1] = line[1].lower()
        line[1] = line[1].translate(None, string.punctuation)
        # for prep in prepositions:
        #     line[1] = line[1].replace(prep, "")
        sentence_array = nltk.word_tokenize(line[1])
        # for pr in prepositions:
        #     try:
        #         sentence_array.remove(pr)
        for i in range(len(sentence_array)):
            # sentence_array[i] = str(WordNetLemmatizer().lemmatize(sentence_array[i], 'v'))
            sentence_array[i] = str(singularize(sentence_array[i]))
        line.append(sentence_array)
def change_pluralization(token):
    singularForm = singularize(token)
    pluralForm = pluralize(token)
    if token == singularForm:
        return pluralForm
    else:
        return singularForm
def getSynonyms(word, part):
    synonyms = []
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part == "VBD" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[3])
            elif part == "VBG" and len(lexeme(syn)) > 0:
                synonyms.append(lexeme(syn)[0])
            elif part == "VBN" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[-1])
            elif part == "VBZ" and len(lexeme(syn)) > 1:
                synonyms.append(lexeme(syn)[1])
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
def updateTerms(self, line, w2vmodel):
    list_term = line.split('_')
    list_result = []
    whitelist = set(['win', 'won', 'most', 'biggest', 'largest', 'fastest'])
    blacklist = set(['give', 'also'])
    stoplist = set(stopwords.words('english'))
    for term in list_term:
        if term in blacklist:
            continue
        if term not in whitelist and term in stoplist:
            continue
        # find a form of the term that is in the word2vec vocabulary
        lem = lemma(term)
        sing = singularize(term)
        if term in w2vmodel.vocab:
            list_result.append(term)
        elif lem in w2vmodel.vocab:
            list_result.append(lem)
        elif sing in w2vmodel.vocab:
            list_result.append(sing)
    return list_result
def conjugate_noun(noun, pos):
    if pos == "NNS" or pos == "NNPS":
        return str(ptn.pluralize(noun))
    elif pos == "NN" or pos == "NNP":
        return str(ptn.singularize(noun))
    else:
        return noun
def synonyms(data):
    augment_n = 10
    data_dict = dict((key, [val]) for val, key, _ in data)
    is_plural = lambda word: singularize(word) != word
    stops = set(stopwords.words('english') + ['l'])
    for disease in data:
        for _ in range(augment_n):
            new_facts_list = []
            for fact in disease[0]:
                new_fact = fact[:]
                for k, word in enumerate(fact):
                    if word not in stops:
                        syn = wordnet.synsets(word)
                        if syn:
                            random_syn = syn[0]
                            random_lemma = random.choice(random_syn.lemma_names())
                            random_lemma = pluralize(random_lemma) if is_plural(word) else random_lemma
                            random_lemma = random_lemma.lower()
                            random_lemma = random_lemma.replace('_', ' ')
                            random_lemma = random_lemma.replace('-', ' ')
                            if ' ' in random_lemma:
                                continue
                            new_fact[k] = random_lemma
                new_facts_list.append(new_fact)
            # print new_facts_list
            data_dict[disease[1]].append(new_facts_list[:])
    return data_dict
def phrase_search(q, positional_index):
    q = q.strip("'")
    q = q.strip()  # to remove white space in the phrase query
    phrase_query = []
    for val in q.split(" "):
        phrase_query.append(singularize(val))
    combine_doc = {}
    for index in range(0, len(phrase_query)):
        if len(combine_doc) == 0:
            combine_doc = positional_index[str(phrase_query[index])][1]
        else:
            match = {}
            print(positional_index[phrase_query[index]][1])
            for key, value in combine_doc.items():
                for key1, value2 in positional_index[phrase_query[index]][1].items():
                    print("1")
                    if key == key1:
                        print("2")
                        for position in value:
                            for position2 in value2:
                                if position + 1 == position2:
                                    match[key] = set()
                                    match[key].add(position2)
            combine_doc = match
    relevent_docs = set()
    for keys in combine_doc:
        relevent_docs.add(int(keys))
    return relevent_docs
def _isplural(w):
    word = w.lower()
    singula = singularize(word)
    if singula == word:
        return False
    else:
        return True
def getdata():
    inp = raw_input('Enter the topic:')
    while 1:
        try:
            topic = wikipedia.page(inp)
            content1 = topic.content  # fetches content from the wikipedia webpage in the form of text
            break
        except wikipedia.exceptions.DisambiguationError as e:
            c = 1
            for i in e.options:
                print str(c) + '.' + i
                c += 1
            choice = input('Enter your choice:')
            inp = e.options[choice - 1]
            topic = wikipedia.page(inp)
            content1 = topic.content
            break
    ## 1=re.sub("[\(\[].*?[\)\]]", "", summ)
    content1 = content1.encode('ascii', 'ignore')
    content1 = content1.lower()
    tokens = nltk.word_tokenize(content1)
    tagged = nltk.pos_tag(tokens)
    freqdic = {}
    for i in tagged:
        word = singularize(i[0])
        # iterates through the text and filters out nouns and the various forms of nouns;
        # makes sure the dictionary doesn't contain the topic word itself, pronouns or numbers
        if i[1] in ['NN', 'NNS', 'NNP', 'NNPS', 'FW'] and not (word in inp.lower().split()) and word.isalpha():
            if word in freqdic:
                freqdic[word] += 1
            else:
                freqdic[word] = 1
    return freqdic
def conjugate_noun(noun, pos):
    if pos == "NNS" or pos == "NNPS":
        return str(pluralize(noun))
    elif pos == "NN" or pos == "NNP":
        return str(singularize(noun))
    else:
        return noun
def process_agent_output(answer_template, noun, nouns, noun_topics, answer_sentiment):
    agent_output = answer_template.answer
    temp_nouns = nouns
    # print(agent_output, nouns, noun_topics, (nouns))
    if answer_template.fetch_count > 0 and noun_topics != None and len(noun_topics) > 0:
        # print(noun_topics)
        if question_sentiment in sentiment_opt_pos:
            temp_nouns = topic_favorites[noun_topics[0]]
            # like_memory.loc[like_memory['sentiment'] > 0.5 && like_memory['topic'] == noun_topics[0]].sample().subject
        elif question_sentiment in sentiment_opt_neg:
            temp_nouns = topic_dislike[noun_topics[0]]
    sing_noun = singularize(noun)
    plural_noun = pluralize(noun)
    if sing_noun in temp_nouns:
        temp_nouns.remove(sing_noun)
    elif plural_noun in temp_nouns:
        temp_nouns.remove(plural_noun)
    # replace nouns
    for i in range(1, answer_template.fetch_count + 1):
        temp = "noun_" + str(i)
        agent_output = agent_output.replace(wildcards[temp], temp_nouns[i - 1])
    if answer_template.use_noun:
        agent_output = agent_output.replace(wildcards["noun"], noun)
    if answer_template.use_sentiment:
        agent_output = agent_output.replace(wildcards["sentiment"], question_sentiment)
        agent_output = agent_output.replace(wildcards["agent_sentiment"], answer_sentiment)
    # print(agent_output)
    return agent_output
def _transform_word(self, word, pos, less, more):
    """transforms a word to be less less and more more

    :param word: word to transform
    :type word: str
    :param pos: part of speech of the word
    :type pos: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list
    :returns: transformed word
    :rtype: str
    """
    new_word = self._get_similar_word(word, less, more)
    new_pos = en.tag(new_word)[0][1]
    if (pos[:2] != new_pos[:2]) or word == new_word:
        return word
    # handle noun
    if pos.startswith('NN'):
        # pluralization
        if pos.endswith('S') and not new_pos.endswith('S'):
            new_word = en.pluralize(new_word)
        elif not pos.endswith('S') and new_pos.endswith('S'):
            new_word = en.singularize(new_word)
        # capitalization
        if word[0].isupper():
            new_word = new_word[0].upper() + new_word[1:]
        else:
            new_word = new_word.lower()
    # handle verb
    elif pos.startswith('VB'):
        tense, person, number = en.tenses(word)[0][:3]
        # conjugation
        conjugated = en.conjugate(new_word, tense=tense, person=person, number=number, parse=False)
        if conjugated is not None:
            new_word = conjugated
    # remove underscores for joint words
    new_word = new_word.replace('_', ' ')
    return new_word
def trimSentence(word_POS):
    sentence_array = []
    for word in word_POS:
        # if word[1] == "IN":
        #     # do nothing
        #     pass
        # elif word[1] == "TO":
        #     pass
        # elif word[1] == "$":
        #     pass
        # elif word[1] == "CD":
        #     pass
        # elif word[1] == "CC":
        #     pass
        # elif word[1] == ":":
        #     pass
        # elif word[0] == "%":
        #     pass
        # elif word[0] == "pct" or word[0] == "percent":
        #     pass
        # elif word[0] == "second":  #######
        #     pass
        # elif word[0] == "wo":
        #     sentence_array.append("will")
        # elif word[0] == "n't":
        #     sentence_array.append("not")
        # if it's a verb, add the base of that verb
        if word[1] in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
            base = WordNetLemmatizer().lemmatize(word[0], 'v')
            sentence_array.append(base)
        else:
            # otherwise add the singularized word
            sentence_array.append(singularize(word[0]))
    return sentence_array
def inject(self, title, word_pair):
    for i, cat in title.get_slots('NP'):
        if cat == 'plural':
            title.inject(pluralize(word_pair[0]).capitalize(), 'NP')
        else:
            title.inject(singularize(word_pair[0]).capitalize(), 'NP')
    for i, cat in title.get_slots('ADJ'):
        title.inject(word_pair[1].capitalize(), 'ADJ')
def getPluralSingular(w):
    word = w
    plural = isplural(word)
    if plural:
        word = singularize(word)
    else:
        word = pluralize(word)
    return word
def pluralize_singularize(word, prev_word):
    if "thing" in word:
        print word, prev_word
    if "these" in prev_word:
        return pluralize(word)
    elif "this" in prev_word:
        return singularize(word)
    else:
        return word
def xproto_singularize(field):
    try:
        # The user has set a singular, as an exception that cannot be handled automatically
        singular = field['options']['singular']
        singular = unquote(singular)
    except KeyError:
        singular = en.singularize(field['name'])
    return singular
def xproto_singularize_pluralize(field):
    try:
        # The user has set a plural, as an exception that cannot be handled automatically
        plural = field['options']['plural']
        plural = unquote(plural)
    except KeyError:
        plural = en.pluralize(en.singularize(field['name']))
    return plural
def key_set(full_word):
    words = []
    # hack for 'class' etc.
    if singularize(full_word) == full_word or full_word.endswith('ss'):
        plural = pluralize(full_word)
        words = [full_word, plural]
    else:
        words = [singularize(full_word), full_word, pluralize(singularize(full_word))]
    for w in words[:]:
        # if not already plural-like
        if not w.endswith('s'):
            suffix = 's'
            if any([w.endswith(suf) for suf in ['x', 'z', 'ch', 'sh']]):
                suffix = 'es'
            words.append('%s%s' % (w, suffix))
    tup = tuple(sorted(list(set(words))))
    return tup
def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for sg, pl in Datasheet.load(os.path.join("corpora", "celex-wordforms-en.csv")):
        if en.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.95)
    print "pattern.en.singularize()"
def c2nn(x):
    # takes a chunk as input and returns the nouns / noun phrases found in the chunk
    a = []
    ss = ""
    # print chunk.string
    for word in x:
        if str(word.type)[0] == "N" and checkword(word.string):
            if ss == "":
                ss = singularize(word.string)
            else:
                ss = ss + " " + singularize(word.string)
        else:
            if ss != "":
                a.append(ss)
                ss = ""
            # print word.string + " " + word.type
    if ss != "":
        a.append(ss)
    # print a
    return a
def set_ingredient_tokens(current_recipe):
    for item in current_recipe.ingredients:
        quantity_conversion = {'quarter': 0.25, 'eighth': 0.125, 'half': 0.5,
                               '1/4': 0.25, '1/8': 0.125, '1/3': 0.333,
                               '2/3': 0.667, '3/4': 0.75, '1/2': 0.5,
                               '1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0,
                               '6': 6.0, '7': 7.0, 'lots': 3.0, '8': 8.0, '9': 9.0,
                               '5-6': 5.5, 'a': 1.0, 'few': 2.0, 'scant': 1.0,
                               'pinch': 0.125, 'pinches': 0.25, '4-': 4.0, 'to': 0.0,
                               'tablespoon': 1.0, 'teaspoon': 1.0, 'couple': 2.0}
        # set 'dumb' quantity by assuming the first token is the quantity
        prelim_quantity = nltk.tokenize.word_tokenize(item.source_line)[0]
        # EAFP!
        try:
            prelim_quantity = float(prelim_quantity)
        except ValueError:
            print "Can't convert :: " + prelim_quantity
            # pass to conversion dictionary lookup
            try:
                prelim_quantity = quantity_conversion[prelim_quantity]
            except KeyError:
                print KeyError("No conversion value found : " + prelim_quantity)
                # need to flag here for note in UI
                prelim_quantity = 0
            else:
                item.quantity = prelim_quantity
        item.quantity = prelim_quantity
        filterList = ['tsp', 'tsps', 'tbsps', 'tbsp', 'tablespoon',
                      'tablespoons', 'teaspoon', 'teaspoons', 'cup',
                      'cups', 'bowl', 'pint', 'quart', 'mg', 'g', 'gram',
                      'grams', 'ml', 'oz', 'ounce', 'ounces']
        item.measure = ' '.join([word for word in item.source_line.split(" ") if word in filterList])
        new_source_line = ' '.join([word for word in item.source_line.split(" ") if word not in filterList])
        sentence = parsetree(new_source_line, chunks=True, lemmata=True)
        for s in sentence:
            # filter all the NP (noun phrases) into a chunk list
            chunk_list = [singularize(chunk.string) for chunk in s.chunks if chunk.type == 'NP']
            search_term = chunk_list[0]
            search_term = "".join([i for i in search_term if i != '/'])
            search_term = ''.join([i for i in search_term if not i.isdigit()])
            item.search_term = search_term
    return current_recipe
def get_singular_form_of_word(word):
    """
    Get singular form of the word.

    Args:
        word (str): keyword.

    Returns:
        (str): singular form of the word.

    TODO: Or convert to base form of the words.
    """
    return singularize(word)
def make_thesaurus_lesk(file_path):
    """
    Returns dict of counters 'thesaurus', where
        thesaurus[synset] = { word1: 4, word2: 8, word3: 1, ... }
    """
    thesaurus = defaultdict(lambda: Counter())
    with open(file_path, "r") as f:
        f = f.read().split()
        for i, word_and_tag in enumerate(f):
            word, tag = word_and_tag.rsplit("_", 1)
            # Reject non-ASCII characters
            try:
                word = word.decode("ascii")
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
            # look at a window of 9 words each time lesk is called
            window = [i - WINDOW, i + WINDOW]
            if i < WINDOW:
                window = [i, i + 2 * WINDOW]
            elif i >= len(f) - WINDOW:
                window = [i - 2 * WINDOW, i]
            synset = lesk.my_lesk(f[window[0]:window[1]], word)
            # if lesk can decide on a meaning for that word, add
            # that meaning, i.e., that synset, to thesaurus
            if not synset:
                continue
            # if word is verb, only add present tense to thesaurus
            if tag[0] == "V":
                word_tenses = tenses(word.lower())
                if "inf" in word_tenses or "1sg" in word_tenses or "2sg" in word_tenses or "3sg" in word_tenses:
                    thesaurus[str(synset)].update([word.lower()])
            elif tag[0] == "N":
                synset_name = synset.name().split(".")[0]
                if synset_name == pluralize(synset_name):
                    thesaurus[str(synset)].update([pluralize(word.lower())])
                else:
                    thesaurus[str(synset)].update([singularize(word.lower())])
            else:
                thesaurus[str(synset)].update([word.lower()])
    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    thesaurus = _add_mappings(map_file, thesaurus)
    return thesaurus
def docs2corpus(docs, name, isNew):
    print '>> converting documents to corpus...'
    numDocs = len(docs)
    englishStopWords = get_stopwords('english', name)
    # texts = [[word for word in doc.lower().split()
    #           if word not in englishStopWords and word.isalpha() and len(word) > 1] for doc in docs]
    texts = [[singularize(word) for word in doc.lower().split()
              if singularize(word) not in englishStopWords and word.isalpha() and len(word) > 1]
             for doc in docs]
    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print len(texts)
    if isNew:
        dictionary = generate_dictionary(texts, name, numDocs)  # uncomment for new corpus
    else:
        dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    if isNew:
        gensim.corpora.MmCorpus.serialize(name + '.mm', corpus)  # store to disk, for later use
    return corpus, dictionary
def getIngredientNames(index):
    # get = request.GET
    # index = int(get.get('index'))
    #
    # # from recipes.views import *
    # getIngredientNames(8279)
    #
    urlBase = 'http://cooking.nytimes.com/recipes/'
    while index < 2000000:
        url = urlBase + str(index)
        print index
        index += 1
        try:
            req = urllib2.Request(url.encode("utf8"), headers={'accept': '*/*', 'User-Agent': "Magic Browser"})
            html = urllib2.urlopen(req, timeout=10)
        except:
            continue
        soup = BeautifulSoup(html, "html5lib")
        ingredients = soup.select('.ingredient-name span')
        for i in ingredients:
            i = i.text.lower()
            if not 'nutritional information' in i:
                if ' and ' in i:
                    i = i.split(' and ')
                elif ' or ' in i:
                    i = i.split(' or ')
                elif ', ' in i:
                    i = i.split(', ')
                else:
                    i = [i]
                for part in i:
                    if 'our' in part:
                        Ingredient.objects.get_or_create(name=part)
                    else:
                        if part != singularize(part):
                            print part, singularize(part)
                        Ingredient.objects.get_or_create(name=singularize(part))
    print 'DONE'
def custom_similarity(word, synsets, pos=None):
    word = singularize(word.lower())
    similarities = []
    if pos:
        word_synsets = wordnet.synsets(word, pos=pos)
    else:
        word_synsets = wordnet.synsets(word)
    for i in synsets:
        for j in word_synsets:
            try:
                similarities.append(wordnet.similarity(i, j))
            except Exception, e:
                pass
def tagLemma(self, word_old):
    # print tag(word_old)
    for word, pos in tag(word_old):
        if pos == "NNS":  # plurals
            x = singularize(word)
        elif pos in ["VB", "VBG", "VBZ", "VBP", "VBD", "VBN", "MD"]:
            # verbs to infinitive
            x = conjugate(word, INFINITIVE)
            # To-Do: fix this
            if x:  # sometimes conjugating raises an error
                x = x
            else:
                x = word
        else:
            x = word
    return x
def fullQuery(sentence):
    new_str = ""
    for word in sentence.words:
        if word.string in ['places', 'locations', 'spots']:
            continue
        new_word = singularize(word.string) if word.type == "NNS" else word.string
        new_str += new_word + " "
    singularized_sentence = parsetree(new_str, relations=True, lemmata=True)
    m = search('{JJ? NN+} IN {JJ? NN+}', singularized_sentence)
    query = {}
    if len(m) > 0:
        query["term"] = m[0].group(1).string
        query["location"] = m[0].group(2).string
    return query