def p2():
    """Generate a long scene-setting sentence from random words.

    Each slot of the template is filled with a random pick from the
    module-level lists ``ns`` and ``adjs``; the ``nouns*`` slots are passed
    through ``pluralize``. The sentence is then optionally rewritten by
    ``look_around_you``/``get_lost`` or by ``suddenly``.

    Returns:
        The value of ``capitalize`` applied to the finished sentence.
    """
    # "amidst" was misspelt "admist" in the emitted text.
    template = "the {place} was filled with the {adjective1} {noun1} of {nouns1}, and when the {adjective2} {adjective3} {noun2} stirred amidst the {nouns2} of the {noun3}, there came through the {adjective4} {noun4} the {adjective5} {noun5} of the {noun6}, or the more {adjective6} {noun7} of the {adjective7} {noun8}."
    # Keyword arguments are evaluated left to right, so the sequence of
    # random draws is the same as before.
    p = template.format(
        place=random.choice(ns),
        adjective1=random.choice(adjs),
        noun1=random.choice(ns),
        nouns1=pluralize(random.choice(ns)),
        adjective2=random.choice(adjs),
        adjective3=random.choice(adjs),
        noun2=random.choice(ns),
        nouns2=pluralize(random.choice(ns)),
        noun3=random.choice(ns),
        adjective4=random.choice(adjs),
        noun4=random.choice(ns),
        adjective5=random.choice(adjs),
        noun5=random.choice(ns),
        noun6=random.choice(ns),
        adjective6=random.choice(adjs),
        noun7=random.choice(ns),
        adjective7=random.choice(adjs),
        noun8=random.choice(ns),
    )
    # Two independent draws: the second is only made when the first fails.
    if random.random() > 0.6:
        p = random.choice([look_around_you(p), get_lost(p)])
    elif random.random() > 0.8:
        p = suddenly(p)
    return capitalize(p)
def make_noun_string(np, plural=False):
    """Render a noun phrase given as a sequence ``(modifier, noun, ...)``.

    Mass nouns are joined as-is, indefinite pronouns are returned bare,
    quantifier modifiers force a plural noun, and everything else gets a
    randomly chosen shape (plural, possessive, ``referenced`` or "the ...").

    Args:
        np: sequence of strings; ``np[0]`` is the modifier, ``np[1]`` the noun.
        plural: when true, force the plural shape for ordinary nouns.

    Returns:
        The rendered phrase as a string.
    """
    modifier = np[0]
    head = np[1]

    # common mass nouns are never pluralised or given a determiner
    if head in ('data', 'information', 'children', 'people', 'stuff', 'equipment'):
        return ' '.join(np).strip()

    # indefinite pronouns stand alone, without the modifier
    if head.lower().startswith(('every', 'any', 'some')) or head in ('nothing', 'nobody'):
        return head

    if modifier in ('many', 'few', 'several', 'various', 'multiple', 'fewer', 'more'):
        return modifier + ' ' + pluralize(head)

    die_roll = random.random()
    if die_roll < 0.15 or plural:
        return ' '.join((modifier, pluralize(head))).strip()
    if die_roll < 0.25:
        possessive = random.choice(('his', 'her', 'their', 'your'))
        return possessive + ' ' + ' '.join(np).strip()
    # NOTE: this is a fresh draw, not a comparison against die_roll.
    if random.random() < 0.45:
        return referenced(' '.join(np).strip())
    return 'the ' + ' '.join(np).strip()
def getKeywords(self):
    """
    Extract keywords using POS tagging

    With exactly one sentence, the sentence's ranked phrases (from
    ``self.r``) are used: single-token phrases are kept only when tagged as
    nouns, multi-token phrases are kept whole. Otherwise each sentence is
    POS-tagged directly and one list of nouns is produced per sentence.

    :return: Query keywords (a flat list for one sentence, else a list of
             per-sentence lists)
    """
    punctuation_class = '[' + string.punctuation + ']'

    def is_fresh_noun(token, tag, seen):
        # A noun tag, longer than one character, and neither its singular
        # nor its plural form already collected.
        return ('NN' in tag
                and len(token) > 1
                and singularize(token) not in seen
                and pluralize(token) not in seen)

    if len(self.sentences) == 1:
        keywords = []
        cleaned = re.sub(punctuation_class, '', self.sentences[0])
        self.r.extract_keywords_from_text(cleaned)
        for phrase in self.r.get_ranked_phrases():
            words = nltk.tokenize.word_tokenize(phrase)
            if len(words) != 1:
                keywords.append(phrase)
                continue
            token, tag = nltk.pos_tag(words)[0]
            if is_fresh_noun(token, tag, keywords):
                keywords.append(token)
        return keywords

    per_sentence = []
    for sentence in self.sentences:
        cleaned = re.sub(punctuation_class, '', sentence)
        tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(cleaned))
        found = []
        for token, tag in tagged:
            if is_fresh_noun(token, tag, found):
                found.append(token)
        per_sentence.append(found)
    return per_sentence
def generate_refexs(self, answer_list):
    """
    Given all of the possible answers, generate the referring expressions
    to store in dictionary.

    For every answer ``aa`` the text before ``"_("`` is taken as the answer
    string. Each word of it, the whole string, their plural forms and
    ``"the <answer>"`` are added (lowercased) to ``self._refex_lookup[aa]``
    and counted in ``self._refex_count``.

    :param answer_list: iterable of answer strings
    """
    # TODO: Make referring expression data-driven
    for aa in answer_list:
        ans = aa.split("_(")[0]
        lookup = self._refex_lookup[aa]

        # each word and plural form of each word
        for jj in ans.split():
            for refex in (jj.lower(), pluralize(jj).lower()):
                lookup.add(refex)
                # Count under the same lowercased key that is stored in the
                # lookup; previously the un-lowercased form was counted, so
                # the two tables disagreed for capitalised words.
                self._refex_count[refex] += 1

        # answer and plural form, then THE answer
        for refex in (ans.lower(), pluralize(ans).lower(), "the %s" % ans.lower()):
            self._refex_count[refex] += 1
            lookup.add(refex)
def pluralize(self, word):
    '''
    Given base-form of the word, return back plural form of the word
    (For Noun only)

    Args:
        word (str): base-form of the word

    Raises:
        ValueError: The vocabulary does not contain the base-form
        ValueError: Can not find the base-form of the given word

    Returns:
        str: plural form of the word
    '''
    if word in self._word2index:
        return pluralize(word)

    # Only the lemma lookup is guarded. Previously a bare ``except:`` also
    # caught the "not in vocabulary" ValueError raised below and replaced
    # its message with the generic one.
    try:
        base_form_word = lemma(word)
    except Exception:
        raise ValueError(
            "Can not found base-form for '{}'".format(word))

    if base_form_word in self._word2index:
        return pluralize(base_form_word)
    raise ValueError(
        "Found the base-form for '{}': '{}'. But even the base-form not in vocabulary"
        .format(word, base_form_word))
def make_thesaurus(file_path):
    """
    Returns dict of counters 'thesaurus'.

    Every non-title line of ``file_path`` is run through ``parse`` and the
    tagged words in the first element of ``parsed.split()`` are visited.
    For each accepted word, ``thesaurus[word][word]`` is incremented, and
    for each synonym from ``wn.get_synonyms(word)`` the counter stored under
    the (number/tense-adjusted) synonym name is incremented for ``word``.
    Finally ``_add_mappings`` is applied with the mapping file derived from
    ``file_path``.
    """
    thesaurus = defaultdict(lambda: Counter())

    with open(file_path, "r") as corpus:
        for line in corpus:
            # Ignore repeated book title headers
            if _is_title(line):
                continue

            parsed = parse(line)
            for tagged_word in parsed.split()[0]:
                word = tagged_word[0].strip().lower()
                pos = tagged_word[1][0]  # first letter of the POS tag

                # Reject non-ASCII characters
                try:
                    word = word.decode("ascii")
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Reject whitespace-only tokens
                if re.match("^[\s]*$", word):
                    continue

                # Count the word under its own entry
                thesaurus[word][word] += 1

                # Count the word under each of its synonyms
                for syn in wn.get_synonyms(word):
                    syn = syn.name().split(".")[0]
                    key = syn
                    if pos == "N":
                        # plural word -> plural synonym, else leave as is
                        if word == pluralize(word):
                            key = pluralize(syn)
                    elif pos == "V":
                        # conjugate the synonym to the word's first tense
                        word_tenses = tenses(word)
                        if word_tenses:
                            key = conjugate(syn, tense=word_tenses[0][0])
                    thesaurus[key][word] += 1

    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    return _add_mappings(map_file, thesaurus)
def find_noun_form(original_form, original_lemma, new_lemma):
    """
    Give ``new_lemma`` the same grammatical number as ``original_form``.

    The original form counts as plural when it equals the pluralised
    ``original_lemma``; in that case the pluralised ``new_lemma`` is
    returned, otherwise ``new_lemma`` is returned unchanged.
    """
    was_plural = pluralize(original_lemma) == original_form
    return pluralize(new_lemma) if was_plural else new_lemma
def p3():
    """Generate a "seated in ..., surrounded by ..." sentence.

    Uses the module-level ``PRONOUN`` and ``VERB`` plus random picks from
    ``ns``; returns the sentence passed through ``capitalize``.
    """
    template = "{pronoun} seated in {noun}, surrounded by {nouns1} and {nouns2}."
    # Dict literals evaluate in order, so the random draws keep their order.
    fields = {
        "pronoun": capitalize(PRONOUN) + " " + VERB,
        "noun": referenced(random.choice(ns)),
        "nouns1": pluralize(random.choice(ns)),
        "nouns2": pluralize(random.choice(ns)),
    }
    return capitalize(template.format(**fields))
def fix_rejects(rejects, ingredient_book):
    """Try to map the words of rejected ingredient strings onto known names.

    Args:
        rejects: iterable of ingredient strings that could not be matched.
        ingredient_book: mapping whose keys are the known ingredient names.

    Returns:
        A list with one repaired string per reject. Each word of a reject is
        compared, as a whole-word pattern in singular and plural form,
        against the known names; the first name that matches is added to the
        result, and the word itself is kept when an earlier name did not
        match. Duplicate words are removed with ``unique_list``.
    """
    # The known-name list does not depend on the reject, so build it once
    # instead of once per reject.
    known = []
    for true_ing in ingredient_book.keys():  # real ings from bbc
        lowered = true_ing.lower()
        # re.escape: names are literal text, not patterns. Characters such
        # as "(" or "+" used to raise re.error or match unintended text.
        if re.search(r'\b{0}\b'.format(re.escape(true_ing)), lowered):
            known.append(lowered)
        elif re.search(r'\b{0}\b'.format(re.escape(pluralize(true_ing))), lowered):
            known.append(pluralize(lowered))

    fixed = []
    for r in rejects:
        full_name = []
        for s in r.split():  # word in rejected ingredient
            single_re = re.compile(r'\b{0}\b'.format(re.escape(s)))
            plural_re = None  # built lazily, only if the singular misses
            for i in known:
                candidate = i.lower()
                if single_re.search(candidate):
                    full_name.append(i)
                    break
                if plural_re is None:
                    plural_re = re.compile(
                        r'\b{0}\b'.format(re.escape(pluralize(s))))
                if plural_re.search(candidate):
                    full_name.append(i)
                    break
                # no match against this name: keep the word itself (once)
                if s not in full_name:
                    full_name.append(s)
        joined = ' '.join(full_name)
        fixed.append(' '.join(unique_list(joined.split())))
    return fixed
def make_thesaurus_lesk(file_path):
    """
    Returns dict of counters 'thesaurus', where
    thesaurus[synset] = { word1: 4, word2: 8, word3: 1, ... }

    The file is read as whitespace-separated ``word_TAG`` tokens. For each
    token, ``lesk.my_lesk`` is asked for a synset using a slice of
    neighbouring tokens; when it returns one, a form of the word is counted
    under ``str(synset)``. ``_add_mappings`` is applied at the end.
    """
    thesaurus = defaultdict(lambda: Counter())

    with open(file_path, "r") as handle:
        tokens = handle.read().split()
    total = len(tokens)

    for i, word_and_tag in enumerate(tokens):
        word, tag = word_and_tag.rsplit("_", 1)

        # Reject non-ASCII characters
        try:
            word = word.decode("ascii")
        except (UnicodeDecodeError, UnicodeEncodeError):
            continue

        # Context slice handed to lesk; shifted at either end of the text
        if i < WINDOW:
            start, stop = i, i + 2 * WINDOW
        elif i >= total - WINDOW:
            start, stop = i - 2 * WINDOW, i
        else:
            start, stop = i - WINDOW, i + WINDOW
        synset = lesk.my_lesk(tokens[start:stop], word)

        # Only words lesk could assign a meaning to are recorded
        if not synset:
            continue

        lowered = word.lower()
        if tag[0] == "V":
            # verbs: only present-tense forms are recorded
            word_tenses = tenses(lowered)
            if not any(t in word_tenses for t in ("inf", "1sg", "2sg", "3sg")):
                continue
            entry = lowered
        elif tag[0] == "N":
            # nouns: follow the grammatical number of the synset name
            synset_name = synset.name().split(".")[0]
            if synset_name == pluralize(synset_name):
                entry = pluralize(lowered)
            else:
                entry = singularize(lowered)
        else:
            entry = lowered
        thesaurus[str(synset)][entry] += 1

    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    return _add_mappings(map_file, thesaurus)
def pluralizationError(text, nlp, correctFlag=False):
    '''
    Purpose: To check for pluralization error.
             Additionally, it returns corrected sentence.

    Parameters:
        text: string
            A string of text-single or a paragraph.
        nlp: callable
            Called as ``nlp(text)``; the result must expose ``.sentences``,
            each with ``.words`` whose items have ``.text`` and ``.xpos``.
        correctFlag: boolean
            True or False

    Returns:
        count: integer
        text: Corrected sentence. (If correctFlag is True)
    '''
    doc = nlp(text)
    count = 0
    pieces = []
    for s in doc.sentences:
        words = s.words
        last = len(words) - 1
        for i, word in enumerate(words):
            # Read the text now: a preceding numeral may have rewritten it.
            token = word.text
            if i != last:
                following = words[i + 1]
                if word.xpos in ("NN", "NNP"):
                    # singular noun followed by a plural-agreeing verb
                    if following.xpos in ("VB", "VBP"):
                        count += 1
                        token = pluralize(word.text)
                elif word.xpos in ("NNS", "NNPS"):
                    # plural noun followed by a 3rd-person-singular verb
                    if following.xpos == "VBZ":
                        # This correction was applied but never counted.
                        count += 1
                        token = singularize(word.text)
                elif word.xpos == "CD":
                    # numerals fix the number of the noun that follows; the
                    # corrected noun is emitted on the next iteration
                    if word.text in ("1", "one"):
                        if following.xpos in ("NNS", "NNPS"):
                            count += 1
                            following.text = singularize(following.text)
                    elif following.xpos in ("NN", "NNP"):
                        count += 1
                        following.text = pluralize(following.text)
            pieces.append(token + " ")
    corrected = "".join(pieces)
    if correctFlag:
        return count, corrected
    return count
def get_all_names(ings, ingredient_book):
    """Split recipe ingredient lines into known names and leftover text.

    Args:
        ings: iterable of ingredient strings (lowercased before matching).
        ingredient_book: mapping whose keys are the known ingredient names.

    Returns:
        ``[names, desc_and_prep, rejected]`` where ``names[k]`` is the known
        name found in the k-th matched line, ``desc_and_prep[k]`` is that
        line split around the name, and ``rejected`` holds the lines in
        which no known name occurs.
        e.g. [ ['white sugar'], [[u'1 cup', u'']], [] ]
    """
    ings = [ing.lower() for ing in ings]

    known = []
    for true_ing in ingredient_book.keys():  # real ings from bbc
        lowered = true_ing.lower()
        # re.escape: names are literal text, not patterns. Characters such
        # as "(" or "+" used to raise re.error or match unintended text.
        if re.search(r'\b{0}\b'.format(re.escape(true_ing)), lowered):
            known.append(lowered)
        elif re.search(r'\b{0}\b'.format(re.escape(pluralize(true_ing))), lowered):
            known.append(pluralize(lowered))

    # Drop duplicates but keep first-seen order, then try names with more
    # words first so "white sugar" wins over "sugar".
    seen = set()
    unique = []
    for name in known:
        if name not in seen:
            seen.add(name)
            unique.append(name)
    unique.sort(key=lambda x: len(x.split()), reverse=True)

    names = []
    desc_and_prep = []
    rejected = []
    for i in ings:
        for db_ingredient in unique:
            if db_ingredient in i.lower():
                names.append(db_ingredient)
                desc_and_prep.append(i.split(db_ingredient))
                break
        else:
            # no known name occurs anywhere in this line
            rejected.append(i)
    return [names, desc_and_prep, rejected]
def p10():
    """Generate a "beneath ... feet" sentence from random words.

    Slots are filled from the module-level ``POS_PRONOUN``/``NAME``,
    ``adjs``, ``ns``, ``verbed`` and ``qualities``; the two ``nouns*`` slots
    are pluralised. Returns the sentence passed through ``capitalize``.
    """
    template = "beneath {pronoun2} feet, the {adjective1} {noun1} {verbed} with the {adjective2} {quality} of {nouns1} and {nouns2}."
    # Dict literals evaluate in order, so the random draws keep their order.
    fields = {
        "pronoun2": random.choice([POS_PRONOUN, NAME + "\'s"]),
        "adjective1": random.choice(adjs),
        "noun1": random.choice(ns),
        "verbed": random.choice(verbed),
        "adjective2": random.choice(adjs),
        "quality": random.choice(qualities),
        "nouns1": pluralize(random.choice(ns)),
        "nouns2": pluralize(random.choice(ns)),
    }
    return capitalize(template.format(**fields))
def p15():
    """Generate a "the X consisted of ..." sentence from random words.

    Slots are filled from the module-level ``ns`` and ``adjs`` plus a random
    integer from 3 to 9; with probability 0.25 the sentence is passed
    through ``look_around_you``. Returns the result of ``capitalize``.
    """
    # "interrupted" was misspelt "interupted" in the emitted text.
    template = "the {noun1} consisted of {noun2}-like {adjective} {nouns1}, interrupted at {number} points by {nouns2}."
    p = template.format(
        noun1=random.choice(ns),
        noun2=random.choice(ns),
        adjective=random.choice(adjs),
        nouns1=pluralize(random.choice(ns)),
        number=str(random.randint(3, 9)),
        nouns2=pluralize(random.choice(ns)),
    )
    if random.random() < 0.25:
        p = look_around_you(p)
    return capitalize(p)
def p16():
    """Generate a "the X was ..., divided into ..." sentence.

    Slots are filled from the module-level ``ns``, ``adjs``, ``quants`` and
    ``colors``; two slots are passed through ``referenced`` and two through
    ``pluralize``. Returns the sentence passed through ``capitalize``.
    """
    template = "the {noun1} was {adjective1} {noun2}, divided into {adjective2} {nouns1} by {quant} of {adjective3} {color} {nouns2}."
    # Dict literals evaluate in order, so the random draws keep their order.
    fields = {
        "noun1": random.choice(ns),
        "adjective1": referenced(random.choice(adjs)),
        "noun2": random.choice(ns),
        "adjective2": random.choice(adjs),
        "nouns1": pluralize(random.choice(ns)),
        "quant": referenced(random.choice(quants)),
        # NOTE(review): this slot is named adjective3 but draws from the
        # noun list `ns` - confirm whether `adjs` was intended.
        "adjective3": random.choice(ns),
        "color": random.choice(colors),
        "nouns2": pluralize(random.choice(ns)),
    }
    return capitalize(template.format(**fields))
def generate_phrase_2():
    '''Return a phrase and its entropy (in bits) of the form
       (# adj noun) (adverb verb) (adjective noun punctuation)

    E.g.,
       17 MODERATE TRAYS At once live outed wORTH bOSSES

    The entropy total accumulates one term per random choice made while
    building the phrase.
    '''
    pools = [ADJECTIVES, NOUNS, ADVERBS, TRANSITIVE_VERBS,
             ADJECTIVES, NOUNS, TERMINAL_PUNCTUATION]
    entropy = sum(log(len(pool), 2) for pool in pools)

    # The tense is drawn before the words, as in the original ordering.
    verb_form = [random_item_from_list([PAST, PRESENT]), 3, PLURAL]
    picks = [random_item_from_list(pool) for pool in pools]

    # Only the verb (slot 3) is conjugated; one bit for past-vs-present.
    picks[3] = conjugate(picks[3], *verb_form)
    entropy += 1

    # Both nouns are pluralised.
    picks[1] = pluralize(picks[1])
    picks[5] = pluralize(picks[5])

    # Entropy of the leading number.
    entropy += log(997, 2)

    # One of four casings per word: two bits each.
    for idx, item in enumerate(picks):
        rnd = randint(4)
        if rnd == 1:
            picks[idx] = item.capitalize()
        elif rnd == 2:
            picks[idx] = item.upper()
        elif rnd == 3:
            picks[idx] = item[0] + item[1:].upper()
        entropy += 2

    phrase = ('%i %s %s %s %s %s %s%s' % tuple([randint(997) + 2] + picks)).replace('_', ' ')

    # Insert two random symbols into the sentence, one after the other
    for _ in range(2):
        insert_point = randint(len(phrase) + 1)
        entropy += log(len(phrase) + 1, 2) + log(len(SYMBOLS), 2)
        phrase = phrase[:insert_point] + random_item_from_list(SYMBOLS) + phrase[insert_point:]

    return phrase, entropy
def p8():
    """Generate a "the walls of the X were ..." sentence.

    Slots are filled from the module-level ``ns``, ``adjs`` and ``quants``;
    with probability 0.25 the sentence is rewritten by ``look_around_you``
    or ``get_lost``. Returns the result of ``capitalize``.
    """
    template = "the walls of the {noun1} were metre-high {quants} of {adjective1} {noun2}, like the {adjective2} {nouns} of a {noun3}."
    # Dict literals evaluate in order, so the random draws keep their order.
    fields = {
        "noun1": random.choice(ns),
        "quants": pluralize(random.choice(quants)),
        "adjective1": random.choice(adjs),
        "noun2": random.choice(ns),
        "adjective2": random.choice(adjs),
        "nouns": pluralize(random.choice(ns)),
        "noun3": random.choice(ns),
    }
    p = template.format(**fields)
    if random.random() < 0.25:
        p = random.choice([look_around_you(p), get_lost(p)])
    return capitalize(p)
def tryPOS(word, p, target):
    """Try to turn ``word`` (tagged ``p``) into a form tagged ``target``.

    Returns the converted word, or None when no conversion rule applies or
    the comparative/superlative result is empty or contains a space.
    """
    # target tag is contained in the current tag (and is not RB/DT/RP)
    if target in p and target not in ['RB', 'DT', 'RP']:
        if target in ('PRP', 'WP'):
            # reverse lookup: find the key whose value is this word
            table = PRPD if target == 'PRP' else WPD
            return next((k for k in table if table[k] == word), None)
        return wn.morphy(word)

    # possessive pronouns
    if target == 'PRP$' and p == 'PRP':
        return PRPD.get(word)
    if target == 'WP$':
        return WPD.get(word)

    # nouns
    if p == 'NN':
        return word if target == 'NNP' else pluralize(word)
    if p == 'NNP':
        return pluralize(word)

    # verbs: pick the tense matching the target tag, if any
    if 'VB' in p:
        t = ''
        if target == 'VBD':
            t = PAST
        elif target == 'VBP':
            t = INFINITIVE
        elif target == 'VBZ':
            t = PRESENT
        elif target == 'VBN':
            t = PAST + PARTICIPLE
        elif target == 'VBG':
            t = PARTICIPLE
        if t:
            return conjugate(word, tense=t)

    # adjectives / adverbs
    ret = ''
    if target in ('JJR', 'RBR'):
        ret = comparative(word)
    if target in ('JJS', 'RBS'):
        ret = superlative(word)
    if ret and ' ' not in ret:
        return ret
    return None  # default
def pos_all(word):
    """Print every distinct conjugation of ``word`` and its other inflections.

    ``conjugate`` is called for each combination of tense, person, number,
    mood and aspect; the distinct results are printed with ``print_list``,
    followed by the singular/plural and comparative/superlative forms.
    """
    from itertools import product

    tense_names = ('infinitive', 'present', 'past', 'future')
    persons = (1, 2, 3)
    numbers = ('singular', 'plural')
    # 'imperative' and 'subjunctive' were misspelt ('imperitive',
    # 'subjuntive'), so those two moods were never actually requested.
    moods = ('indicative', 'imperative', 'conditional', 'subjunctive')
    aspects = ('imperfective', 'perfective', 'progressive')

    rlist = []
    # product() varies the rightmost factor fastest, i.e. the same order as
    # the equivalent nested loops.
    for rtense, rperson, rnumber, rmood, raspect in product(
            tense_names, persons, numbers, moods, aspects):
        item = conjugate(word, tense=rtense, person=rperson, number=rnumber,
                         mood=rmood, aspect=raspect, negated=False)
        if item not in rlist:
            rlist.append(item)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(bcolors.Magenta + "All pos of " + word)
    print_list(rlist, 4)
    print("Singular : " + singularize(word) + " Plural : " + pluralize(word))
    print("Comparative : " + comparative(word) + " Superlative : " + superlative(word))
def do_flower(self, i, j):
    """Process finding a flower and possibly doing something with it.

    Appends the narration to ``self.TEMP``, draws a marker on
    ``self.IMAGE`` at grid cell ``(i, j)``, may append the flower to
    ``self.FLOWERS``, and sets ``self.THEN`` to False.
    """
    # Random colour and (singular) flower name
    color = random.choice(self.JSON['colors'])['color']
    flower = singularize(random.choice(self.JSON['flowers']))

    # Narrate the find
    self.TEMP += "There was a beautiful " + " ".join((color, flower)) + " there. "
    scent = pluralize(random.choice(self.JSON['fruits']))
    self.TEMP += "It smelled like " + scent + "."

    # Mark the flower on the map
    left, top = i * 15 + 4, j * 15 + 4
    right, bottom = i * 15 + 11, j * 15 + 10
    self.IMAGE.filledRectangle((left, top), (right, bottom), self.COLORS['purple'])

    if random.randrange(100) < 10:
        # The narrator keeps this flower
        picked = " I picked it"
        if self.FLOWERS:
            picked += " and added it to the rest of my bouquet"
        self.TEMP += picked + "."
        self.FLOWERS.append({'color': color, 'flower': flower})
    elif random.randrange(100) < 5:
        # ...or eats it instead (second, independent draw)
        self.TEMP += " For some reason I ate it. It tasted " + random.choice(self.TASTES) + "."

    self.TEMP += "\n"
    self.THEN = False
def conceptnet_relatedness(subject, candidates, object):
    """Pick the candidate label that scores highest against ``object``.

    ``call_cp_api(subject, object)`` gives the starting score; each label in
    ``candidates`` (formatted via ``formatlabel``) is scored the same way
    and replaces the prediction when strictly better.

    Returns:
        ``(label, score)`` with underscores in the label replaced by ``-``.
    """
    best_score = call_cp_api(subject, object)
    best_subject = subject

    # Is there any other label in the ranking making more sense?
    for o_class, _confidence in candidates.items():
        f_class, _ = formatlabel(o_class)
        if f_class == subject:
            continue  # Skip the object itself
        score = call_cp_api(f_class, object)
        if score > best_score:
            best_score, best_subject = score, o_class

    print("CONCEPTNET: Within the ranking, the most likely subject is %s" % best_subject)

    if singularize(best_subject) == best_subject:
        # Re-format back for evaluation
        best_subject = reverse_map(pluralize(best_subject))
    return best_subject.replace('_', '-'), best_score
def get_plural_box(cat, nodes, max_boxes=False, new_label=None):
    """Get plural box for given category/label.

    Merges the bounding boxes of ``nodes`` into one enclosing box and
    formats it as ``"<label>-pl ; (xmin, ymin, xmax, ymax)"``.

    Args:
        cat: category name, used as the label unless ``new_label`` is given.
        nodes: objects with ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``ID``.
        max_boxes: used for limiting number of nodes used per category; when
            truthy only nodes with ``int(node.ID) <= max_boxes`` are merged.
        new_label: optional label to use instead of ``cat``.

    Returns:
        The bounding-box line as a string.

    Raises:
        ValueError: if no node is selected (``min``/``max`` of an empty
            sequence).
    """
    # Select the nodes once instead of filtering four separate times.
    if max_boxes:
        selected = [node for node in nodes if int(node.ID) <= max_boxes]
    else:
        selected = list(nodes)

    # Get new (merged) coordinates
    ymax = max(node.ymax for node in selected)
    ymin = min(node.ymin for node in selected)
    xmax = max(node.xmax for node in selected)
    xmin = min(node.xmin for node in selected)
    coordinates = [xmin, ymin, xmax, ymax]

    # The pluralised category was computed but never used; the "-pl" suffix
    # is what marks the box as plural, so that call has been dropped.
    label = new_label if new_label else cat
    return label + '-pl ; (' + ", ".join(str(c) for c in coordinates) + ')'
def process(statement,database_name = DATABASE_NAME):
    ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming"
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"

      already encountering a statement like "There is a game engine Unity3d" gives us trouble
      seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization
      which doesn't really work for things like CTO as a category of items, hmm

      Example of the NLTK experiment referred to above (Python 2 syntax):
          sent = "There is a game engine Unreal Engine".split()
          print nltk.ne_chunk(nltk.pos_tag(sent))

      Falls back to ``regexMatch`` when ``extract`` yields no result or when
      no table noun / identifier can be found in the parsed statement.
    '''
    # The separate parse()/split() pass that used to run here was discarded
    # immediately: extract() returns its own parsed sentence.
    s, result = extract(statement)
    if not result:
        return regexMatch(statement, database_name)

    noun_matches = search('(NN)+', s)
    # this pulls in adjectives, but there's supposed to be a better fix coming
    ident_matches = search('(JJ|NNPS|NNP)+', s)
    if not noun_matches or not ident_matches:
        # Previously an IndexError; use the same fallback as a failed extract.
        return regexMatch(statement, database_name)

    table = pluralize(noun_matches[0].string.replace(' ', '_'))
    ident = ident_matches[0].string
    name = ident_matches[1].string if len(ident_matches) > 1 else ident
    return newTable(table, ident, name, database_name)
def do_flower(self, i, j):
    """Process finding a flower and possibly doing something with it.

    Appends the narration to ``self.TEMP``, draws a marker on
    ``self.IMAGE`` at grid cell ``(i, j)``, may append the flower to
    ``self.FLOWERS``, and sets ``self.THEN`` to False.
    """
    # Random colour and (singular) flower name
    color = random.choice(self.JSON['colors'])['color']
    flower = singularize(random.choice(self.JSON['flowers']))

    # Narrate the find
    self.TEMP += "There was a beautiful " + " ".join((color, flower)) + " there. "
    scent = pluralize(random.choice(self.JSON['fruits']))
    self.TEMP += "It smelled like " + scent + "."

    # Mark the flower on the map
    left, top = i * 15 + 4, j * 15 + 4
    right, bottom = i * 15 + 11, j * 15 + 10
    self.IMAGE.filledRectangle((left, top), (right, bottom), self.COLORS['purple'])

    if random.randrange(100) < 10:
        # The narrator keeps this flower
        picked = " I picked it"
        if self.FLOWERS:
            picked += " and added it to the rest of my bouquet"
        self.TEMP += picked + "."
        self.FLOWERS.append({'color': color, 'flower': flower})
    elif random.randrange(100) < 5:
        # ...or eats it instead (second, independent draw)
        self.TEMP += " For some reason I ate it. It tasted " + random.choice(self.TASTES) + "."

    self.TEMP += "\n"
    self.THEN = False
def process_agent_output(answer_template, noun, nouns, noun_topics,
                         answer_sentiment):
    """Fill the wildcards of an answer template.

    Reads ``question_sentiment``, ``sentiment_opt_pos``,
    ``sentiment_opt_neg``, ``topic_favorites``, ``topic_dislike`` and
    ``wildcards`` from module scope.

    Args:
        answer_template: object with ``answer``, ``fetch_count``,
            ``use_noun`` and ``use_sentiment`` attributes.
        noun: the noun the answer is about.
        nouns: candidate nouns used when the question sentiment is in
            neither sentiment option list.
        noun_topics: topics of ``noun``; only the first one is used.
        answer_sentiment: text substituted for the agent-sentiment wildcard.

    Returns:
        The template's answer text with its wildcards replaced.
    """
    agent_output = answer_template.answer

    if (answer_template.fetch_count > 0 and noun_topics is not None
            and len(noun_topics) > 0):
        if question_sentiment in sentiment_opt_pos:
            source = topic_favorites[noun_topics[0]]
        elif question_sentiment in sentiment_opt_neg:
            source = topic_dislike[noun_topics[0]]
        else:
            source = nouns
        # Work on a copy: removing the noun used to mutate the caller's list
        # or the shared topic list, shrinking it on every call.
        temp_nouns = list(source)

        # Do not offer the noun being asked about as one of the answers.
        sing_noun = singularize(noun)
        plural_noun = pluralize(noun)
        if sing_noun in temp_nouns:
            temp_nouns.remove(sing_noun)
        elif plural_noun in temp_nouns:
            temp_nouns.remove(plural_noun)

        # replace nouns
        for i in range(1, answer_template.fetch_count + 1):
            agent_output = agent_output.replace(wildcards["noun_" + str(i)],
                                                temp_nouns[i - 1])

    if answer_template.use_noun:
        agent_output = agent_output.replace(wildcards["noun"], noun)
    if answer_template.use_sentiment:
        agent_output = agent_output.replace(wildcards["sentiment"],
                                            question_sentiment)
        agent_output = agent_output.replace(wildcards["agent_sentiment"],
                                            answer_sentiment)
    return agent_output
def p13():
    """Generate a "the sound that the X made ..." sentence.

    Slots are filled from the module-level ``ns``, ``adjs`` and ``verbed``;
    with probability 0.25 the sentence is passed through
    ``look_around_you``. Returns the result of ``capitalize``.
    """
    template = "the sound that the {nouns1} made was {adjective1} and {adjective2}; {adjective3} {adjective4} {nouns2} so {adjective5} that they were almost {verbed1} rather than {verbed2}."
    # Dict literals evaluate in order, so the random draws keep their order.
    fields = {
        "nouns1": pluralize(random.choice(ns)),
        "adjective1": random.choice(adjs),
        "adjective2": random.choice(adjs),
        "adjective3": random.choice(adjs),
        "adjective4": random.choice(adjs),
        "nouns2": pluralize(random.choice(ns)),
        "adjective5": random.choice(adjs),
        "verbed1": random.choice(verbed),
        "verbed2": random.choice(verbed),
    }
    p = template.format(**fields)
    if random.random() < 0.25:
        p = look_around_you(p)
    return capitalize(p)
def conjugate_noun(noun, pos):
    """Return ``noun`` in the grammatical number implied by the tag ``pos``.

    Plural tags (NNS, NNPS) give the pluralised noun, singular tags
    (NN, NNP) the singularised noun, both converted with ``str``; any other
    tag returns ``noun`` unchanged.
    """
    if pos in ("NNS", "NNPS"):
        return str(ptn.pluralize(noun))
    if pos in ("NN", "NNP"):
        return str(ptn.singularize(noun))
    return noun
def conjugate_noun(noun, pos):
    """Return ``noun`` in the grammatical number implied by the tag ``pos``.

    Plural tags (NNS, NNPS) give the pluralised noun, singular tags
    (NN, NNP) the singularised noun, both converted with ``str``; any other
    tag returns ``noun`` unchanged.
    """
    if pos in ("NNS", "NNPS"):
        return str(pluralize(noun))
    if pos in ("NN", "NNP"):
        return str(singularize(noun))
    return noun
def synonyms(data):
    """Augment each record's facts by swapping words for WordNet lemmas.

    Args:
        data: iterable of ``(facts, key, _)`` records, where ``facts`` is a
            list of token lists.

    Returns:
        A dict mapping each key to a list that starts with the original
        facts and is followed by 10 augmented copies. In a copy, each
        non-stopword with at least one synset is replaced by a random lemma
        of its first synset (pluralised when the word is plural), unless the
        lemma is a multi-word expression.
    """
    augment_n = 10
    data_dict = dict((key, [val]) for val, key, _ in data)
    stops = set(stopwords.words('english') + ['l'])

    def is_plural(word):
        # ``!=`` replaces the Python-2-only ``<>`` operator.
        return singularize(word) != word

    for disease in data:
        for _ in range(augment_n):
            new_facts_list = []
            for fact in disease[0]:
                new_fact = fact[:]
                for k, word in enumerate(fact):
                    if word in stops:
                        continue
                    syn = wordnet.synsets(word)
                    if not syn:
                        continue
                    random_lemma = random.choice(syn[0].lemma_names())
                    if is_plural(word):
                        random_lemma = pluralize(random_lemma)
                    random_lemma = random_lemma.lower()
                    random_lemma = random_lemma.replace('_', ' ')
                    random_lemma = random_lemma.replace('-', ' ')
                    # skip multi-word lemmas; keep the original token
                    if ' ' in random_lemma:
                        continue
                    new_fact[k] = random_lemma
                new_facts_list.append(new_fact)
            data_dict[disease[1]].append(new_facts_list[:])
    return data_dict
def getInflections(key):
    """Return the inflections of ``key`` that occur in ``wordlist``.

    The candidates are every form from ``lexeme(key)`` plus
    ``pluralize(key)``, filtered against the module-level ``wordlist``.

    Args:
        key: the word to inflect; non-alphabetic keys are skipped.

    Returns:
        A set of inflected forms. It is empty when ``key`` is not purely
        alphabetic or when the inflection helpers fail.
    """
    if not key.isalpha():
        return set()

    try:
        try:
            # Throw-away first call. NOTE(review): presumably a workaround
            # for the first lexeme() call failing - confirm before removing.
            lexeme(key)
        except Exception:
            pass
        # lexeme() returns a list: update() adds each form. The former
        # add(list) always raised TypeError (unhashable), which the bare
        # except swallowed, so the result was always empty.
        inflections = set(lexeme(key))
        inflections.add(pluralize(key))  # add plural inflection
        inflections.intersection_update(wordlist)
    except Exception:
        # Best effort, as before: a failing helper yields no inflections.
        return set()
    return inflections
def getSynonyms(word, part):
    """Return synonyms of ``word`` inflected to match the POS tag ``part``.

    Synonyms come from ``dictionary.synonym`` (looked up by lemma for verb
    tags). Multi-word synonyms are skipped. Verb synonyms are conjugated
    via ``lemma``/``lexeme`` and noun synonyms are singularised or
    pluralised; anything else is used unchanged.

    Args:
        word: the word to look up.
        part: its POS tag (e.g. "VBD", "NNS").

    Returns:
        A list of distinct synonyms (order unspecified), or ``[word]`` when
        the dictionary has no entry.
    """
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]

    lexeme_tags = ("VBD", "VBG", "VBN", "VBZ")
    found = set()
    for syn in synList:
        if " " in syn:
            continue
        if part == "VB" or part == "VBP":
            found.add(lemma(syn))
            continue

        # Compute the lexeme once per synonym, and only for tags that use
        # it. Assumed order: [base, 3rd-singular, gerund, past, ...,
        # past participle] - TODO confirm against the pattern.en docs.
        forms = lexeme(syn) if part in lexeme_tags else []
        if part == "VBD" and len(forms) > 3:
            found.add(forms[3])
        elif part == "VBG" and len(forms) > 2:
            # Index 0 (the base form) was returned here before, which is
            # not a gerund.
            found.add(forms[2])
        elif part == "VBN" and len(forms) > 3:
            found.add(forms[-1])
        elif part == "VBZ" and len(forms) > 1:
            found.add(forms[1])
        elif part == "NN" and syn[-2:] != "ss":
            found.add(singularize(syn))
        elif part == "NNS":
            found.add(pluralize(syn))
        else:
            found.add(syn)
    return list(found)
def change_pluralization(token):
    """Flip the grammatical number of ``token``.

    A token equal to its own singular form is returned pluralised; any
    other token is returned singularised.
    """
    singular = singularize(token)
    plural = pluralize(token)
    return plural if token == singular else singular
def process(statement, database_name=DATABASE_NAME):
    ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming"
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"

      already encountering a statement like "There is a game engine Unity3d" gives us trouble
      seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization
      which doesn't really work for things like CTO as a category of items, hmm

      Example of the NLTK experiment referred to above (Python 2 syntax):
          sent = "There is a game engine Unreal Engine".split()
          print nltk.ne_chunk(nltk.pos_tag(sent))

      Falls back to ``regexMatch`` when ``extract`` yields no result or when
      no table noun / identifier can be found in the parsed statement.
    '''
    # The separate parse()/split() pass that used to run here was discarded
    # immediately: extract() returns its own parsed sentence.
    s, result = extract(statement)
    if not result:
        return regexMatch(statement, database_name)

    noun_matches = search('(NN)+', s)
    # this pulls in adjectives, but there's supposed to be a better fix coming
    ident_matches = search('(JJ|NNPS|NNP)+', s)
    if not noun_matches or not ident_matches:
        # Previously an IndexError; use the same fallback as a failed extract.
        return regexMatch(statement, database_name)

    table = pluralize(noun_matches[0].string.replace(' ', '_'))
    ident = ident_matches[0].string
    name = ident_matches[1].string if len(ident_matches) > 1 else ident
    return newTable(table, ident, name, database_name)
def inject_sub_nn(sent_i, e_config):
    """Flip the number of one randomly chosen common noun in a sentence.

    Args:
        sent_i: list of token dicts with at least ``'form'`` and ``'tag'``.
        e_config: unused by this function.

    Returns:
        ``sent_i`` itself. If it contains a token tagged NN or NNS, one such
        token has been modified in place: its ``'form'`` is pluralised or
        singularised and ``'tag'``/``'ctag'`` are set to the opposite tag.
    """
    noun_positions = [i for i, w_i in enumerate(sent_i)
                      if w_i['tag'] in ('NN', 'NNS')]
    if not noun_positions:
        return sent_i

    target_index = noun_positions[random.randint(0, len(noun_positions) - 1)]
    target = sent_i[target_index]
    target_token = target['form']
    target_tag = target['tag']

    if target_tag == "NN":
        new_token, new_tag = pluralize(target_token), "NNS"
    elif target_tag == "NNS":
        new_token, new_tag = singularize(target_token), "NN"
    else:
        # not reachable: only NN/NNS positions are collected above
        raise

    target['form'] = str(new_token)
    target['tag'] = new_tag
    target['ctag'] = new_tag
    return sent_i
def _transform_word(self, word, pos, less, more):
    """transforms a word to be less less and more more

    The replacement from ``self._get_similar_word`` is only used when its
    tag (from ``en.tag``) shares the first two characters with ``pos`` and
    it differs from ``word``. Nouns are then matched in number and
    capitalization, verbs in conjugation.

    :param word: word to transform
    :type word: str
    :param pos: part of speech of the word
    :type pos: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list

    :returns: transformed word
    :rtype: str
    """
    new_word = self._get_similar_word(word, less, more)
    new_pos = en.tag(new_word)[0][1]

    if (pos[:2] != new_pos[:2]) or word == new_word:
        return word

    # handle noun
    if pos.startswith('NN'):
        # pluralization
        if pos.endswith('S') and not new_pos.endswith('S'):
            new_word = en.pluralize(new_word)
        elif not pos.endswith('S') and new_pos.endswith('S'):
            new_word = en.singularize(new_word)

        # capitalization
        if word[0].isupper():
            new_word = new_word[0].upper() + new_word[1:]
        else:
            new_word = new_word.lower()

    # handle verb
    elif pos.startswith('VB'):
        # An empty tense list used to raise IndexError on ``[0]``; the
        # replacement is now left unconjugated in that case.
        known_tenses = en.tenses(word)
        if known_tenses:
            tense, person, number = known_tenses[0][:3]
            # conjugation
            conjugated = en.conjugate(new_word,
                                      tense=tense,
                                      person=person,
                                      number=number,
                                      parse=False)
            if conjugated is not None:
                new_word = conjugated

    # remove underscores for joint words
    return new_word.replace('_', ' ')
def regexMatch(statement,database_name = DATABASE_NAME):
    """Handle "There is a <type> <ident> called <name>" statements by regex.

    On a match, ``newTable`` is called with the pluralised type, the
    identifier and the name; otherwise the statement is handed to
    ``processNewAspect``.
    """
    match = re.search(r'There is an? ([\w]+) ([\s\w]+) called ([\s\w]+)\.?',statement)
    if not match:
        return processNewAspect(statement,database_name)

    kind, ident, name = match.group(1), match.group(2), match.group(3)
    return newTable(pluralize(kind),ident,name,database_name)
def inject(self, title, word_pair):
    """Fill the NP and ADJ slots of ``title`` from ``word_pair``.

    ``word_pair[0]`` is injected, capitalized, into every NP slot
    (pluralised when the slot's category is 'plural', singularised
    otherwise); ``word_pair[1]`` is injected, capitalized, into every ADJ
    slot.
    """
    head, adjective = word_pair[0], word_pair[1]

    for _index, cat in title.get_slots('NP'):
        inflect = pluralize if cat == 'plural' else singularize
        title.inject(inflect(head).capitalize(), 'NP')

    for _index, cat in title.get_slots('ADJ'):
        title.inject(adjective.capitalize(), 'ADJ')
def is_plural(word):
    """Tell whether ``word`` is plural (case-insensitively).

    Pronouns are answered from ``PRONOUN_PLURAL``/``PRONOUN_SINGULAR``; any
    other word is plural when it equals the plural of its lemma.
    """
    lowered = word.lower()
    if lowered in PRONOUN_PLURAL:
        return True
    if lowered in PRONOUN_SINGULAR:
        return False
    return pluralize(lemma(lowered)) == lowered
def getTags(self):
    """
    Extract possible tags from the text using RAKE

    For every sentence the ranked phrases from ``self.r`` are collected
    (and the scored phrases appended to ``self.phraseScore``): single-token
    phrases are kept only when tagged as nouns, multi-token phrases are kept
    whole.

    :return: Tag set (a flat list for exactly one sentence, otherwise one
             list per sentence)
    """
    def tags_for(sentence):
        cleaned = re.sub('[' + string.punctuation + ']', '', sentence)
        self.r.extract_keywords_from_text(cleaned)
        ranked = self.r.get_ranked_phrases()
        self.phraseScore.append(self.r.get_ranked_phrases_with_scores())

        found = []
        for phrase in ranked:
            words = nltk.tokenize.word_tokenize(phrase)
            if len(words) != 1:
                found.append(phrase)
                continue
            item, tag = nltk.pos_tag(words)[0]
            # keep nouns longer than one character whose singular and
            # plural forms have not been collected yet
            if ('NN' in tag and len(item) > 1
                    and singularize(item) not in found
                    and pluralize(item) not in found):
                found.append(item)
        return found

    if len(self.sentences) == 1:
        return tags_for(self.sentences[0])
    return [tags_for(sentence) for sentence in self.sentences]
def getPluralSingular(w):
    """Return the opposite grammatical number of ``w``.

    A word that ``isplural`` reports as plural is singularized; any other
    word is pluralized.
    """
    return singularize(w) if isplural(w) else pluralize(w)
def pluralize_singularize(word,prev_word):
    """Make ``word`` agree in number with a preceding demonstrative.

    :param word: the word to inflect
    :param prev_word: the word before it; "these" triggers pluralization,
                      "this" triggers singularization (substring checks)
    :return: the inflected word, or ``word`` unchanged when ``prev_word``
             contains neither demonstrative
    """
    if "thing" in word:
        # Debug trace kept from the original. It is written as a
        # single-argument print call so the module parses on Python 3 and
        # still prints "word prev_word" on Python 2.
        print("%s %s" % (word, prev_word))
    if "these" in prev_word:
        return pluralize(word)
    if "this" in prev_word:
        return singularize(word)
    return word
def pl(word, num=2):
    '''Return ``word`` as is for a count of 1, otherwise its plural.

    Words listed in PLURAL_EXCEPTIONS use the plural stored there; all
    others are pluralized as nouns.
    '''
    if num == 1:
        return word
    if word in PLURAL_EXCEPTIONS:
        return PLURAL_EXCEPTIONS[word]
    return pluralize(word, pos='NOUN')
def shapeNoun(noun,posTag):
    """
    Reshape a base noun to match its POS tag.

    The noun is assumed to be in singular form. It is pluralized for the
    plural tags 'NNS' and 'NNPS' and returned untouched for any other tag.
    """
    if posTag in ('NNS', 'NNPS'):
        return pt.pluralize(noun)
    return noun
def key_set(full_word):
    """Return a sorted tuple of singular/plural spellings of ``full_word``.

    The candidates are the word itself, its library singular/plural forms
    and, for every candidate not ending in 's', a naively suffixed plural
    ('es' after x, z, ch or sh, otherwise 's'). Duplicates are removed.
    """
    singular = singularize(full_word)
    # hack for class etc: words ending in 'ss' are treated as singular
    # even when singularize() would alter them.
    if singular == full_word or full_word.endswith('ss'):
        candidates = [full_word, pluralize(full_word)]
    else:
        candidates = [singular, full_word, pluralize(singular)]

    naive_plurals = []
    for candidate in candidates:
        if candidate.endswith('s'):
            # Already looks plural; no suffixed variant is added.
            continue
        ending = 'es' if candidate.endswith(('x', 'z', 'ch', 'sh')) else 's'
        naive_plurals.append(candidate + ending)

    return tuple(sorted(set(candidates + naive_plurals)))
def word_denormalize(word, part_of_speech, word_form):
    """Return the classical plural of a noun, or "" when not applicable.

    Pluralization happens only when ``part_of_speech`` is in Nouns_tags and
    ``word_form`` equals '?'. Every other combination yields an empty
    string.

    NOTE(review): the original author's comments leave open what
    ``word_form`` is meant to carry; '?' is the only value acted on here.
    """
    # Nouns: from singular to plural.
    wants_plural = part_of_speech in Nouns_tags and word_form == '?'
    if not wants_plural:
        return ""
    return pluralize(word, pos=NOUN, classical=True)
def shapeNoun(noun, posTag):
    """
    Return the noun in the grammatical number its POS tag calls for.

    ``noun`` is assumed to be singular already: the tags 'NNS' and 'NNPS'
    yield its plural, every other tag returns it unchanged.
    """
    wants_plural = posTag == 'NNS' or posTag == 'NNPS'
    return pt.pluralize(noun) if wants_plural else noun
def xproto_pluralize(field):
    """Return the plural name to use for ``field``.

    An explicit ``field['options']['plural']`` wins and is passed through
    ``unquote``. When that lookup raises KeyError the plural is derived
    automatically from ``field['name']``.
    """
    try:
        # The user has set a plural, as an exception that cannot be
        # handled automatically.
        return unquote(field['options']['plural'])
    except KeyError:
        return en.pluralize(field['name'])
def p7():
    """Generate one random "were detaching from" sentence.

    Slots are filled with random picks from the adjs, ns, verbing and
    quants word lists. Afterwards the sentence is passed through
    ``meanwhiler`` when a first random draw is below 0.25, otherwise
    through ``suddenly`` when a second, separate draw is below 0.5. The
    result is returned capitalized.
    """
    template = "{adjective1} {nouns1} were detaching from the {adjective2} {nouns2} and {nouns3}, {verbing} in {adjective3} {quants}."
    # The entries are evaluated top to bottom, which keeps the order of
    # random draws identical to passing them as keyword arguments.
    fills = {
        'adjective1': random.choice(adjs),
        'nouns1': pluralize(random.choice(ns)),
        'adjective2': random.choice(adjs),
        'nouns2': pluralize(random.choice(ns)),
        'nouns3': pluralize(random.choice(ns)),
        'verbing': random.choice(verbing),
        'adjective3': random.choice(adjs),
        'quants': pluralize(random.choice(quants)),
    }
    sentence = template.format(**fills)
    if random.random() < 0.25:
        sentence = meanwhiler(sentence)
    elif random.random() < 0.5:
        sentence = suddenly(sentence)
    return capitalize(sentence)
def inject(self, title, word_pair):
    """Inject the words of ``word_pair`` into the slots of ``title``.

    Slots and words are paired positionally and extra items on either side
    are ignored. Underscores become spaces and each word is title-cased.
    'NOUN' slots then receive the singular form and 'NOUNS' slots the
    plural of the singular form. Other slots receive the word as is.
    """
    for (position, slot), raw_word in zip(title.slots, word_pair):
        text = raw_word.replace("_", " ").title()
        if slot == 'NOUN':
            text = singularize(text)
        elif slot == 'NOUNS':
            # Singularize first so the word is pluralized from its base.
            text = pluralize(singularize(text))
        title.inject(text, slot, position)
def regexMatch(statement, database_name=DATABASE_NAME):
    """Handle a statement, creating a table when it declares an entity.

    A statement containing "There is a/an <word> <words> called <words>"
    is forwarded to ``newTable`` with the pluralized first word as table
    name, followed by the second and third captures. Statements without
    that shape go to ``processNewAspect``.

    Returns the result of whichever handler was called.
    """
    pattern = r'There is an? ([\w]+) ([\s\w]+) called ([\s\w]+)\.?'
    hit = re.search(pattern, statement)
    if hit is None:
        return processNewAspect(statement, database_name)
    table_name = pluralize(hit.group(1))
    return newTable(table_name, hit.group(2), hit.group(3), database_name)
def build_from_clause(tables):
    """Build the comma-separated table list of a FROM clause.

    For each entry the first element of its ``hypernym`` is pluralized and
    appended, with ', ' between entries. An empty ``tables`` gives ''.
    """
    clause = ''
    for table in tables:
        name = format(pluralize(table.hypernym[0]))
        # The separator is added only once the clause has some content.
        clause = clause + ', ' + name if clause else name
    return clause
def p11():
    """Generate one random "was studded with" sentence.

    Nouns and the adjective are random picks from the ns and adjs word
    lists. When a random draw is below 0.25 the sentence is passed through
    ``look_around_you``. The result is returned capitalized.
    """
    # The draws happen in the same order as the placeholders were
    # originally filled, so a seeded generator gives the same sentence.
    first_noun = random.choice(ns)
    plural_noun = pluralize(random.choice(ns))
    second_noun = random.choice(ns)
    adjective = random.choice(adjs)
    third_noun = random.choice(ns)
    sentence = "the {noun1} was studded with an enormous number of {nouns}, flooding the {noun2} with {adjective} {noun3}.".format(
        noun1=first_noun,
        nouns=plural_noun,
        noun2=second_noun,
        adjective=adjective,
        noun3=third_noun,
    )
    if random.random() < 0.25:
        sentence = look_around_you(sentence)
    return capitalize(sentence)
def p12():
    """Generate one random "stood in the middle of" sentence.

    Nouns and the adjective are random picks from the ns and adjs word
    lists; two of the nouns go through ``referenced``. When a random draw
    is below 0.25 the sentence is passed through ``look_around_you``. The
    result is returned capitalized.
    """
    template = "{noun1} stood in the middle of the {noun2}, surrounded by {noun3} of {adjective} {nouns}."
    # Entries are evaluated top to bottom, matching the original draw order.
    fills = {
        'noun1': referenced(random.choice(ns)),
        'noun2': random.choice(ns),
        # No {verbed} placeholder exists in the template; the pick is kept
        # because it still consumes one random draw.
        'verbed': random.choice(verbed),
        'noun3': referenced(random.choice(ns)),
        'adjective': random.choice(adjs),
        'nouns': pluralize(random.choice(ns)),
    }
    sentence = template.format(**fills)
    if random.random() < 0.25:
        sentence = look_around_you(sentence)
    return capitalize(sentence)
def inflate(s):
    """ Returns an exaggerated string:
        inflate("I'm eating a burger") => "I'm eating hundreds of burgers".
    """
    # Parse the text into a tree of tagged words. Part-of-speech tagging
    # tells word types apart by context, e.g. "can" as noun (NN) or
    # verb (VB).
    # http://www.clips.ua.ac.be/pages/pattern-en#tree
    tree = parsetree(s)
    # Search the tree by part-of-speech tag: a determiner, optionally one
    # adjective, then one or more nouns. The curly braces mark the groups
    # that are read back with group() below.
    # http://www.clips.ua.ac.be/pages/pattern-search
    for hit in search("{DT} {JJ?} {NN+}", tree):
        # The quantity is drawn for every match, including skipped ones.
        replacement = choice(["dozens of ", "hundreds of ", "thousands of "])
        # Only "a"/"an" phrases are inflated: they denote something of
        # which many can exist. "the" may denote something unique, like
        # "the capital of Nairobi", which makes no sense as "hundreds of
        # capitals of Nairobi".
        if hit.group(1).string.lower() in ("a", "an"):
            adjective = hit.group(2)
            if adjective:
                # Carry the adjective over, if any.
                replacement += adjective.string + " "
            # Pluralize the group of nouns: "burger" becomes "burgers".
            replacement += pluralize(hit.group(3).string)
            s = s.replace(hit.group(0).string, replacement)
    return s