def handleBotInfo(sentence):
    name = [
        "Optimus... ah no, Optimist Prime :D",
        "I.am.the.legendary.Optimist.Prime B-)",
        "The most Optimist Prime! B-)",
        "You knew already *tsk tsk*"
    ]
    creator = [
        "It's a mystery :O",
        "Are you optimist enough to know? ;)",
        "You are among the few who I tell: All I know about my creator is the initials HT :)",
        "It remains a mystery to me even :(",
        "It was erased from my memory from the start :("
    ]
    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)
    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)
    m = search('who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return oneOf(creator)
    m = search('VP+ *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return oneOf(creator)
    m = search('who *+ creates|created|gave_birth *+ you', sentence)
    if len(m) > 0:
        return oneOf(creator)
    return "Can you guess? ;)"
def _extract_reporters(self):
    """ Extract the reporters and entities from those sentences of the text
        where a reported speech verb is used.
    """
    # search for those sentences with reported speech verbs
    sentences = [s for s in self.__tree if search('RPTVRB|según', s)]
    for s in sentences:
        s_str = s.string
        sent_nlp = self.nlp(s_str)
        verb = search('RPTVRB|según', s)[0].string
        shortest_dist = np.inf
        shortest_word = []
        for ent in sent_nlp.ents:
            # calculate distance
            dist = self._get_distance(verb, ent.text, s_str)
            # store all proper nouns in entities
            word = Word(s, ent.text, tag=None, index=s.id)
            self.__entities.append(word)
            # PER and ORG type entities closest to a reporter verb
            if ent.label_ in ["PER", "ORG"] and abs(dist) < shortest_dist:
                word = Word(s, ent.text, tag='NNP', index=s.id)
                shortest_dist = abs(dist)
                shortest_word = word
        # keep the closest entity only if it is within the allowed distance
        if shortest_word and shortest_dist < self._max_dist:
            self.__reporters.append(shortest_word)
def isAskingBotInformation(sentence):
    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return True
    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return True
    m = search('who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return True
    m = search('VP+ *+ your creator|dad|mom|father|mother', sentence)
    if len(m) > 0:
        return True
    m = search('who made|are|created|create|wrote|gave_birth|built you', sentence)
    if len(m) > 0:
        return True
    return False
def re_search(text, search_string, strict=False):
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
def process(statement, database_name=DATABASE_NAME):
    '''
    Allows us to create entities via statements like
    "There is a course CSCI4702 called Mobile Programming"
    and modify entities with statements like
    "CSCI4702 has a start date of Jan 31st 2013"

    already encountering a statement like "There is a game engine Unity3d" gives us trouble
    seems like we need named entity recognition to be able to extract types like that ... or perhaps
    rely on capitalization, which doesn't really work for things like CTO as a category of items, hmm

    >>> sent = "There is a game engine Unreal Engine".split()
    >>> print nltk.ne_chunk(nltk.pos_tag(sent))
    '''
    # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for
    # from "There is a game engine Unity3D" - although it does now with light=True setting,
    # but now it doesn't get the NNP in "There is a game engine Source"
    s = parse(statement, relations=True, lemmata=True, light=True)
    s = split(s)
    #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
    s, result = extract(statement)
    if result:
        #try:
        noun = search('(NN)+', s)[0].string
        table = pluralize(noun.replace(' ', '_'))
        # this pulls in adjectives, but there's supposed to be a better fix coming
        result = search('(JJ|NNPS|NNP)+', s)
        ident = result[0].string
        name = result[1].string if len(result) > 1 else ident
        #raise Exception(table+"; "+ident+"; "+name)
        return newTable(table, ident, name, database_name)
        #except:
        #    return regexMatch(statement, database_name)
    else:
        return regexMatch(statement, database_name)
def isGetNews(sentence):
    m = search('{VP} {VBG+? JJ+?} {news | information} about|on|regarding { *+ }', sentence)
    if len(m) > 0:
        if m[0].group(1).string.lower() in ['look', 'get', 'find', 'tell', 'show', 'fetch', 'search']:
            return True
    # Solve special case when "Get" at the beginning of sentence is recognized as
    # a proper noun
    m = search('get|find|look *+ news|information about|on|regarding', sentence)
    if len(m) > 0:
        return True
    return False
def re_search(text, search_string, strict=False):
    try:
        from pattern.search import STRICT, search
        from pattern.en import parsetree
    except ImportError:
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
def isYelp(sentence):
    verbs = findVerb(sentence)
    noun_phrases = findNounPhrase(sentence)
    # If match key verbs
    yelpVerbs = ['eat', 'drink', 'find', 'display', 'get']
    for verb in verbs:
        if verb.lower() in yelpVerbs:
            # a news request with a fetch verb is not a Yelp query
            # (but "news stand"/"newsstand" is a real place, so it stays allowed)
            if ("news" in noun_phrases or "information" in noun_phrases) \
                    and "news stand" not in noun_phrases \
                    and "newsstand" not in noun_phrases:
                return False
            yelpNouns = ['restaurant', 'food', 'drink', 'shop', 'store', 'bar', 'pub']
            for noun in yelpNouns:
                if noun in noun_phrases:
                    return True
    # If match question/command structure
    # "is there" + noun phrase
    if ("is there" in sentence.string or "are there" in sentence.string) \
            and noun_phrases != "":
        return True
    # noun phrase + "near by"
    nearby = nearBy(sentence)
    if noun_phrases != "" and nearby:
        return True
    # Sometimes Speech to Text misunderstands "find" as "fine"
    m = search('{fine|find|get|show|search} { *+ }', sentence)
    print m
    if len(m) > 0:
        return True
    return False
def find_all_matches_by_ziyu(text, the_pattern):
    tree = parsetree(text, lemmata=True)
    results = search(the_pattern, tree, STRICT)
    output = []
    for match in results:
        word_list = []
        for word in match:
            word_list.append(word.string)
        sentence = " ".join(word_list)
        output.append(sentence)
    # gen_num = 0
    # if len(output) > 0 and len(output) < 2:
    #     gen_num = 1
    # elif len(output) >= 2:
    #     gen_num = 2
    # random_number = []
    # while len(random_number) != gen_num:
    #     r = random.randint(0, len(output))
    #     if r not in random_number:
    #         random_number.append(r)
    # final_output = []
    # if len(output) > 0:
    #     print "have OUTPUT"
    #     print random_number
    #     for i in range(gen_num):
    #         print i
    #         final_output.append(output[random_number[i]])
    return output
def _extract_reporters(self):
    """ Extract the reporters and entities from those sentences of the text
        where a reported speech verb is used.
    """
    # search for those sentences with reported speech verbs
    sentences = [s for s in self.__tree if search('RPTVRB|según', s)]
    # search for proper nouns that are not locations
    pattern = Pattern.fromstring('!LOCATION|NNP+', STRICT, taxonomy=TAXONOMY)
    for s in sentences:
        matches = pattern.search(s)
        for m in matches:
            for w in m.words:
                # chunks with roles (SBJ, OBJ) connected to a reporter verb
                if (w.chunk.role is not None) and (w.chunk.verb.head.lemma in taxonomy):
                    if self._is_composed_noun(w):
                        self.__reporters.append(w.previous())
                    self.__reporters.append(w)
                # proper nouns not spotlighted as reported
                else:
                    if self._is_composed_noun(w):
                        self.__entities.append(w.previous())
                    self.__entities.append(w)
def getQueries(self):
    queries = []
    WP = 'who|what|when|where|why|how|which'
    patterns = [
        # Some verbs are mislabeled as nouns
        # When *+ is used next to NP, it swallows parts of the NP
        # Because of this, using {JJ|NN*+} to capture NPs in some cases
        # [NUMS] -> rearrange captured groups in order of NUMS
        # [(x, y)] -> conjugates x into the tense of y
        #   ex: [1, (2, 3)] -> "(First Group) (Second Group conjugated to tense of Third Group)"
        (WP + ' {be} {NP}',
         "queries.append((self.joinGroups(match[0], [2, 1]), 1, 'R'))"),
        (WP + ' {be} {NP} {VB*|NN}',
         "queries.append((self.joinGroups(match[0], [2, 1, 3]), 3, 'R'))"),
        (WP + ' {be} {NP} {VB*|NN} {*+}',
         "queries.append((self.joinGroups(match[0], [4, 2, 1, 3]), 4, 'R'))"),
        (WP + ' {do} {NP} {VB*|NN}',
         "queries.append((self.joinGroups(match[0], [2, (1, 3)]), 5, 'R'))"),
        (WP + ' {do} {NP} {VB*|NN} {*+}',
         "queries.append((self.joinGroups(match[0], [4, 2, (1, 3)]), 5, 'R'))"),
        (WP + ' {NP} {VB*|NN} {*+}',
         "queries.append((self.joinGroups(match[0], [1, 3, 2]), 3, 'R'))"),
        (WP + ' {VB*|NN} {JJ|NN*+} {*+}',
         "queries.append((self.joinGroups(match[0], [3, 2, 1]), 2, 'R'))"),
        (WP + ' {NP} {VB*|NN} {*+}',
         "queries.append((self.joinGroups(match[0], [2, 1, 3]), 4, 'L'))"),
        (WP + ' {VB*|NN} {JJ|NN*+} {*+}',
         "queries.append((self.joinGroups(match[0], [1, 2, 3]), 4, 'L'))")
    ]
    t = parsetree(self._q.strip('?'), lemmata=True)
    for p, c in patterns:
        match = search(p, t)
        if match:
            exec c
    return queries + [(self.getKeyWords(t), 1, 'A')] + [(self._q, 2, 'A')]
def compare_visualization(product_sku, compare_phrase):
    all_reviews = ReviewInfo.objects.all().filter(sku=product_sku)
    g = Graph()
    count = 0.0
    for e in all_reviews:
        s = e.comment.lower()
        s = plaintext(s)
        s = parsetree(s)
        #p = '{NP} (VP) faster than {NP}'
        p = '{NP} (VP) ' + compare_phrase + ' {NP}'
        for m in search(p, s):
            x = m.group(1).string  # NP left
            y = m.group(2).string  # NP right
            if x not in g:
                g.add_node(x)
            if y not in g:
                g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A
        count += 1.0
        print count / len(all_reviews), '\r'
    if len(g) > 0:
        g = g.split()[0]  # Largest subgraph.
        for n in g.sorted()[:80]:  # Sort by Node.weight.
            n.fill = (0, 0.5, 1, 0.75 * n.weight)
        g.export('static/compare_visualization', directed=True, weighted=2.0)
        return True
    else:
        return False
def get_pattern_data(search_param):
    twitter = Twitter(language='en')
    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))
    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A
    #if len(g) > 0:
    #    g = g.split()[0]  # Largest subgraph.
    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)
    g.export('data', directed=False, weighted=0.6)
def basicExtract(statement):
    #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
    s = Sentence(parse(statement, lemmata=True))
    m = search("There be DT {JJ? NN}", s)
    return m
def get_noun_phrases_fr_text(text_parsetree, print_output=0, phrases_num_limit=5, stopword_file=''):
    """ Method to return noun phrases in target text with duplicates.
        The phrases will be noun phrases, i.e. NP chunks.
        Has built-in stop words --> check the folder address for this.
        Args:
            text_parsetree (pattern.text.tree.Text): parsed tree of original text
        Kwargs:
            print_output (bool): 1 - print the results else do not print.
            phrases_num_limit (int): return the max number of phrases. If 0, return all.
        Returns:
            (list): list of the found phrases.
    """
    target_search_str = 'NP'  # noun phrases
    target_search = search(target_search_str, text_parsetree)  # only apply if the keyword is top freq: 'JJ?+ NN NN|NNP|NNS+'
    target_word_list = []
    for n in target_search:
        if print_output:
            print retrieve_string(n)
        target_word_list.append(retrieve_string(n))
    ## exclude the stop words.
    if stopword_file:
        with open(stopword_file, 'r') as f:
            stopword_list = f.read()
            stopword_list = stopword_list.split('\n')
        target_word_list = [n for n in target_word_list if n.lower() not in stopword_list]
    if len(target_word_list) >= phrases_num_limit and phrases_num_limit > 0:
        return target_word_list[:phrases_num_limit]
    else:
        return target_word_list
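# A minimal usage sketch for get_noun_phrases_fr_text() above. The sample sentence and the
# demo function name are hypothetical; it assumes pattern.en is installed and that the
# module's retrieve_string() helper is available alongside the function.
def demo_get_noun_phrases():
    from pattern.en import parsetree
    tree = parsetree("The quick brown fox jumps over the lazy dog.", lemmata=True)
    # Return at most three NP chunk strings found in the parse tree.
    return get_noun_phrases_fr_text(tree, phrases_num_limit=3)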
def get_phrases_contain_keyword(text_parsetree, keyword, print_output=0, phrases_num_limit=5):
    """ Method to return phrases in target text containing the keyword.
        The keyword is taken as a noun or NN|NP|NNS.
        The phrases will be noun phrases, i.e. NP chunks.
        Args:
            text_parsetree (pattern.text.tree.Text): parsed tree of original text
            keyword (str): can be a series of words separated by | eg "cat|dog"
        Kwargs:
            print_output (bool): 1 - print the results else do not print.
            phrases_num_limit (int): return the max number of phrases. If 0, return all.
        Returns:
            (list): list of the found phrases. (duplicates removed)
    """
    ## Regular expression matching.
    ## Interested in phrases containing the target word; assume the target noun is either adj or noun.
    target_search_str = 'JJ|NN|NNP|NNS?+ ' + keyword + ' NN|NNP|NNS?+'
    target_search = search(target_search_str, text_parsetree)  # only apply if the keyword is top freq: 'JJ?+ NN NN|NNP|NNS+'
    target_word_list = []
    for n in target_search:
        if print_output:
            print retrieve_string(n)
        target_word_list.append(retrieve_string(n))
    target_word_list_rm_duplicates = rm_duplicate_keywords(target_word_list)
    if len(target_word_list_rm_duplicates) >= phrases_num_limit and phrases_num_limit > 0:
        return target_word_list_rm_duplicates[:phrases_num_limit]
    else:
        return target_word_list_rm_duplicates
def htmlSearch(self, html, url):
    logger.debug(u"htmlSearch URL : %s" % url)
    logger.debug(u"html : %s" % html[:20])
    s = html.lower()
    s = plaintext(s)
    s = parsetree(s)
    # self.logSentences(s)
    # Execute a Regular Expression Search
    p = r'(NN)+'
    q = search(p, s)
    # self.logPOS(q)
    # Iterate over all the words in the POS
    logger.debug(u" q.Length=%d" % len(q))
    logger.debug(u" q[]=%s" % q)
    self.g, self.urlConcepts, self.wordConcepts = self.addNodes(self.g, q, url,
                                                                self.urlConcepts, self.wordConcepts)
    return self.urlConcepts, self.wordConcepts
def test_search_function(self):
    # Assert search() function.
    s = Sentence(parse("Go on Bors, chop his head off!"))
    m = search.search("PRP*? NN*", s)
    self.assertEqual(m[0].string, "Bors")
    self.assertEqual(m[1].string, "his head")
    print "pattern.search.search()"
def find_causal_matches(unicode_string, causal_pattern, pattern_order):
    # Description: Searches text string and returns all cause-effect
    #   relationships based on specified pattern.
    # Inputs: unicode_string, raw text in Unicode format for Python 3
    #   causal_pattern, regex defining specific causal statement pattern
    #   pattern_order, specifying which noun phrase is cause or effect
    # Outputs: List of causal tuples [(cause, effect), ...] or empty list []

    # Initialize causal_tuple_list as empty list
    causal_tuple_list = []
    # Convert string to Pattern parsed text (with POS tags)
    t = parsetree(unicode_string, lemmata=True)
    # possible_matches is a list of all Pattern matches, given text and pattern
    possible_matches = search(causal_pattern, t, lemmata=True)
    # Add causal matches as tuples (cause, effect) to causal_tuple_list
    # Note, if possible_matches == [], there are no matches
    if possible_matches != []:
        # Extract cause-effect tuples and add to causal_tuple_list
        causal_tuple_list = extract_cause_effect_tuple(possible_matches, pattern_order)
    final_causal_tuple_list = []
    for causal_tuple in causal_tuple_list:
        if (causal_tuple[0] in unicode_string) and (causal_tuple[1] in unicode_string):
            final_causal_tuple_list.append(causal_tuple)
    return final_causal_tuple_list
def features(sentence):
    stop = nltk.corpus.stopwords.words('english')
    #ptree = parsetree(sentence, relations=True, lemmata=True)
    ptree = parsetree(sentence)
    matches = search('NP', ptree)
    phrases = []
    for match in matches:
        filtered_np = [word for word in match if word.string.lower() not in stop]
        if len(filtered_np) > 0:
            phrases.append(filtered_np)
    #for sentence in ptree:
    #    for chunk in sentence.chunks:
    #        if chunk.type == 'NP':
    #            print [(w.string, w.type) for w in chunk.words]
    sentence_sentiment = 'NEU'
    sent_result = sentiment(sentence)
    sent = sent_result[0]
    if sent > .1:
        sentence_sentiment = 'POS'
    elif sent < -.1:
        sentence_sentiment = 'NEG'
    sentence_subjectivity = 'OBJ'
    if sent_result[1] > .5:
        sentence_subjectivity = 'SUB'
    features = {}
    features['NP'] = phrases
    features['SN'] = sentence_sentiment
    features['SUB'] = sentence_subjectivity
    return features
def get_noun_phrase_fr_title(self, title):
    """ Get the NP from title.
        Use for comparing to company names to extract specific news.
    """
    t = parsetree(title, lemmata=True)
    target_search = search('NP', t)
    return target_search[0].group(0).string
def extract_verbs(tree):
    verb_matches = search('to|you {VB*}', tree)
    phrases = list()
    for match in verb_matches:
        if match.group(1)[0].type in ('VBG', 'VBZ'):
            continue
        if match.group(1)[0].string == "dream":
            continue
        phrases.append(tree[match.group(1).start:])
    return phrases
def print_feature(sentence):
    ptree = parsetree(sentence)  # , relations=True, lemmata=True
    # It matches anything from food to cat food, tasty cat food, the tasty cat food, etc.
    t = parsetree('tasty cat food')
    matches = search('DT? RB? JJ? NN+', ptree)
    for match in matches:
        print match
    print '\n'
def test_group(self):
    # Assert Match groups.
    s = Sentence(parse("the big black cat eats a tasty fish"))
    m = search.search("DT {JJ+} NN", s)
    self.assertEqual(m[0].group(1).string, "big black")
    self.assertEqual(m[1].group(1).string, "tasty")
    # Assert nested groups (and syntax with additional spaces).
    m = search.search("DT { JJ { JJ { NN }}}", s)
    self.assertEqual(m[0].group(1).string, "big black cat")
    self.assertEqual(m[0].group(2).string, "black cat")
    self.assertEqual(m[0].group(3).string, "cat")
    # Assert chunked groups.
    m = search.search("NP {VP NP}", s)
    v = m[0].group(1, chunked=True)
    self.assertEqual(v[0].string, "eats")
    self.assertEqual(v[1].string, "a tasty fish")
    print "pattern.search.Match.group()"
def extract_keyphrases_from_doc_pattern(item, key):
    # build parsetree, extract NP's
    pt = parsetree(item[key])
    noun_phrases = search('NP', pt)
    # convert np matches to unicode list
    noun_phrases = [np.string for np in noun_phrases]
    # remove ones too short, lemmatize, etc..
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys
def measure_pattern_search():
    """ pattern
        JJ|NN* NN*
        DT? JJ|NN?+ NN
        DT? JJ|NN*+ NN*
    """
    global pattern_search_result  # make measure_pattern_search able to modify the value
    #print ('text_tree', text_tree)
    pattern_search_result = search(pattern_string, text_tree)
def taxonomy_normalize(sentence):
    bp_match = search('BEAUTY_PARTS', parsetree(sentence, lemmata=True))
    facial_match = search('MAKEUP', parsetree(sentence, lemmata=True))
    feet_match = search('FEET', parsetree(sentence, lemmata=True))
    body_match = search('BODY', parsetree(sentence, lemmata=True))
    matches = ''
    if len(bp_match) > 0:
        matches += 'BEAUTY_PARTS-'
    if len(facial_match) > 0:
        matches += 'MAKEUP-'
    if len(feet_match) > 0:
        matches += 'FEET-'
    if len(body_match) > 0:
        matches += 'BODY-'
    return matches
def extract_verb_phrases(tree):
    verb_phrase_matches = search('to|you {VP}', tree)
    phrases = list()
    if len(verb_phrase_matches) > 0:
        possible_matches = list()
        for match in verb_phrase_matches:
            if match.group(1)[0].string == "dream":
                continue
            phrases.append(tree[match.group(1).start:])
    return phrases
def add_keywords(self, phrase):
    sent = en.Sentence(en.parse(phrase))
    nouns = search('NN', sent)
    self.blackboard.pool.nouns.update(
        set(Word(en.singularize(n[0].string)) for n in nouns))
    adjs = search('JJ', sent)
    self.blackboard.pool.adjectives.update(
        set(Word(en.lemma(a[0].string)) for a in adjs))
    try:
        nps = search('NP', sent)
        for np in nps:
            self.blackboard.pool.epithets.update({
                Word(en.singularize(w.string), "NN"):
                    [Word(jj.string, "JJ") for jj in np if "JJ" in jj.tag]
                for w in np if "NN" in w.tag
            })
    except IndexError:
        pass
def nlp(bigram, sent):
    entity = []
    for tup in bigram:
        txt = " ".join(tup)
        #print txt
        m = search(txt, sent)
        if m:
            entity.append(txt)
            #print m
    return entity
def get_ngrams(description, lang='it'):
    """
    Analyze description and get relevant ngrams
    using an Italian POS tagger, looking for exact
    combinations of POS patterns
    """
    s = it_parsetree(description, relations=True, lemmata=True)
    if lang == "en":
        s = en_parsetree(description, relations=True, lemmata=True)
    matches = []
    ngrams = []
    for match in search("JJ NN", s):
        matches.append(match.constituents())
    for match in search("NN JJ", s):
        matches.append(match.constituents())
    for match in search("NN", s):
        matches.append(match.constituents())
    for match in matches:
        ngrams.append(" ".join([chunk.string for chunk in match]).encode("utf8"))
    return remove_uncorrect_tokens(ngrams)
def verbosInfinitivos(cadena):
    t = parsetree(cadena)
    verbos = search('VB*', t)
    #lis = verbos.match.string
    #print 'list: ', lis
    #print
    # can't I convert it to a list in one go??
    lista = []
    for match in verbos:
        lista.append((match.string, conjugate(match.string, INFINITIVE)))
    #print 'lista for: ', lista
    #print lista[3][1]
    return lista
def isAskingBotInformation(sentence):
    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return True
    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return True
    m = search('who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return True
    m = search('VP+ *+ your creator|dad|mom|father|mother', sentence)
    if len(m) > 0:
        return True
    m = search('who made|create|wrote|built you', sentence)
    if len(m) > 0:
        return True
    return False
def extract(statement):
    s = Sentence(parse(statement, lemmata=True))
    '''c1 = Constraint.fromstring("There be DT")
    c2 = Constraint.fromstring("NN+")
    c3 = Constraint.fromstring("(DT)")
    c4 = Constraint.fromstring("(RB) (JJ) NNP+")
    c5 = Constraint.fromstring("(call) (DT)")
    c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+")
    p = Pattern(sequence=[c1, c2, c3, c4, c5, c6])
    match = p.search(s)
    '''
    s = find_entities(s)
    # not sure about this "be" thing - happy to match plural (is/are)
    # but not sure about past tense ...
    match = search(MATCH_STRING_EXT, s)
    if not match:
        match = search(MATCH_STRING, s)
    #raise Exception(match)
    return s, match
def verbosInfinitivos(cadena):
    t = parsetree(cadena)
    verbos = search('VB*', t)
    print('verbos =', verbos)
    #lis = verbos.match.string
    #print ('list: ', lis)
    #print()
    # can't I convert it to a list in one go?? LAMBDA
    lista = []
    for match in verbos:
        lista.append((match.string, conjugate(match.string, INFINITIVE)))
    #print ('lista for: ', lista)
    #print (lista[3][1])
    return lista
def inflate(s):
    """ Returns an exaggerated string:
        inflate("I'm eating a burger") => "I'm eating hundreds of burgers".
    """
    # Part-of-speech tagging identifies word types in a text.
    # For example, "can" can be a noun (NN) or a verb (VB),
    # depending on the words surrounding it.
    # http://www.clips.ua.ac.be/pages/pattern-en#parser

    # A parse tree splits punctuation marks from words, tags words,
    # and constructs a nested tree of sentences that contain words.
    # http://www.clips.ua.ac.be/pages/pattern-en#tree
    t = parsetree(s)

    # We can use pattern.search to search for patterns inside a parse tree.
    # If you know what regular expressions are: this is similar,
    # only you can also search by part-of-speech tag.
    # This is very useful to retrieve syntactic structures, e.g.:
    # "any noun, optionally preceded by an adjective", or
    # "any conjugation of the verb to be".
    # http://www.clips.ua.ac.be/pages/pattern-search

    # The search pattern below means:
    # "any determiner (a, an, the), optionally followed by any adjective,
    # followed by one or more nouns".
    # The search will yield a list of matches.
    # We'll pluralize the nouns in each match, so that "burger" becomes "burgers", etc.
    # Note the curly braces {}.
    # We can retrieve the words inside it with match.group().
    for match in search("{DT} {JJ?} {NN+}", t):
        x = choice(["dozens of ", "hundreds of ", "thousands of "])
        # We'll only look at matches that start with "a" or "an".
        # This indicates an object or a thing of which many can exist.
        # If the match starts with "the", it might indicate something unique,
        # like "the capital of Nairobi". It doesn't make sense to transform
        # it into "hundreds of capitals of Nairobi".
        if match.group(1).string.lower() not in ("a", "an"):
            continue
        # Include the adjective, if any.
        if match.group(2):
            x += match.group(2).string
            x += " "
        # Pluralize the group of nouns.
        x += pluralize(match.group(3).string)
        s = s.replace(match.group(0).string, x)
    return s
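# A small usage sketch for inflate() above (assumes pattern.en is installed; the exact
# quantifier in the output varies because choice() picks one at random).
if __name__ == "__main__":
    print(inflate("I'm eating a burger"))  # e.g. "I'm eating hundreds of burgers"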
def handle_starttag(self, tag, attrs):
    for attr in attrs:
        a = attr[1]
        a = a.split('/')
        a = [stem(a[j]) for j in range(len(a))]
        a = '_'.join(str(e) for e in a)
        for i in attr:
            l = search('src', i)
            b1 = search('5g', a)
            b3 = search('imag', a)
            b4 = search('video', a)
            b5 = search('pdf', a)
            if not len(b1) == 0:
                if not len(b5) == 0:
                    if not attr[1][2:] in Pdfs[lien]:
                        Pdfs[lien].append(attr[1][2:])
                if not len(b3) == 0:
                    if not attr[1][2:] in Images[lien]:
                        Images[lien].append(attr[1][2:])
                if not len(b4) == 0:
                    if not attr[1][2:] in Videos[lien]:
                        Videos[lien].append(attr[1][2:])
            if not len(l) == 0 and not len(b1) == 0:
                if not attr[1][2:] in Images[lien]:
                    Images[lien].append(attr[1][2:])
def extractTrait(characterSentences):
    """
    Analyzes the sentences using the search module of pattern for adjectives.
    """
    print(1)
    characterTrait = defaultdict(list)
    for key, value in characterSentences.items():
        for x in value:
            #print(x)
            #t = parsetree(x)
            characterTrait[key].append(search('JJ', parsetree(str(x))))
            #print(search('JJ', parsetree(str(x))))
    return characterTrait
def learn(concept):
    """ Returns a list of properties for the given concept,
        collected from a "I think X is Y".
    """
    q = 'I think %s is *' % concept
    p = []
    g = Google(language='en', license=None)
    for i in range(10):
        for result in g.search(q, start=i, cached=True):
            m = plaintext(result.description)
            m = search(q, m)  # Use * as a wildcard.
            if m:
                p.append(m[0][-1].string)
    return [w for w in p if w in PROPERTIES]  # only handles known properties...
def test_parse_sentences(self):
    texts = [
        pattern.en.Text(pattern.en.parse("Mary had a little lamb and it was really gorgeous. None.", lemmata=True)),
        pattern.fr.Text(pattern.fr.parse("Mary avait un agneau et il etait vraiment sympa. Personne.", lemmata=True))
    ]
    nps = []
    for text in texts:
        for sentence in text:
            for match in search('NP', sentence):
                for word in match.words:
                    nps.append(word.lemma)
    self.assertEqual(nps, [u'mary', u'a', u'little', u'lamb', u'it', u'none',
                           u'mary', u'un', u'agneau', u'et', u'il', u'personne'])
def test_search():
    from pattern.search import search
    from pattern.en import parsetree
    t = parsetree('big white rabbit')
    print t
    print
    print search('JJ', t)  # all adjectives
    print search('NN', t)  # all nouns
    print search('NP', t)  # all noun phrases
def fullQuery(sentence):
    new_str = ""
    for word in sentence.words:
        if word.string in ['places', 'locations', 'spots']:
            continue
        new_word = singularize(word.string) if word.type == "NNS" else word.string
        new_str += new_word + " "
    singularized_sentence = parsetree(new_str, relations=True, lemmata=True)
    m = search('{JJ? NN+} IN {JJ? NN+}', singularized_sentence)
    query = {}
    if len(m) > 0:
        query["term"] = m[0].group(1).string
        query["location"] = m[0].group(2).string
    return query
def load_data(data_file):
    corpus_data = []
    corpus_target = []
    print "Reading data file: {}".format(data_file)
    corpus_file = open(data_file, "r")
    print "Importing data..."
    lines = []
    for line in corpus_file:
        entry = Entry()
        line_parts = line.split("\t")
        # data validity check
        assert len(line_parts) == 4
        entry.figurative = True if (line_parts[2] == "figuratively") else False
        # initial pre-process
        phrase = line_parts[1].decode('utf8').lower()
        sentences = remove_tags(line_parts[3].decode('utf8').lower())
        entry.phrase = wordpunct_tokenize(phrase)
        entry.phrase_lemma = [lemmatize(w) for w in entry.phrase]
        # clean up and parse sentence
        entry.sentences = sent_tokenize(sentences)
        entry.sentence = np.array([wordpunct_tokenize(x) for x in entry.sentences])
        #entry.pos = pos_tag(entry.sentence)
        entry.sentence = np.hstack(entry.sentence)
        entry.sentence_lemma = np.array([lemmatize(w) for w in entry.sentence])
        # find match of phrase (original strings)
        phrase_match = search(" ".join(entry.phrase_lemma), " ".join(entry.sentence_lemma))
        if len(phrase_match) > 0:
            # isolate context (remove phrase)
            context_select = np.ones(len(entry.sentence), dtype=np.bool)
            start = phrase_match[0].start
            stop = phrase_match[0].stop
            context_select[start:stop] = False
            entry.context = entry.sentence[context_select]
            entry.context_lemma = entry.sentence_lemma[context_select]
        else:
            #print u"phrase {} not found in sentence {}?".format(phrase, sentences)
            entry.context = entry.sentence
            entry.context_lemma = entry.sentence_lemma
        lines.append(entry)
    return lines
def adjectives(L):
    """Returns a list of adjectives present in input lists.

    >>> adjectives([['big', 'white', 'tall', 'dog'], ['bat', 'tall']])
    ['big', 'white', 'tall', 'tall']
    >>> adjectives([['march'], ['yes', 'i', 'know', 'its', 'almost', 'march']])
    []
    """
    adjs = []
    for l in range(len(L)):
        current_string = " ".join(L[l])
        parts_of_speech = parsetree(current_string)
        # Search the parsed string for adjectives
        for i in search("JJ", parts_of_speech):
            adjs.append(str(i.string))
    return adjs
def patternSearch(self, n=12, m=50):
    logger.info(u"patternSearch")
    proxyList = list()
    proxyList.append(u"3128")
    proxyList.append(u"206.217.138.154")
    logger.info(u"proxyList - %s" % proxyList)
    engine = Google(license=None, throttle=0.5, language=None)
    # engine = Bing(license=None, throttle=0.5, language=None)
    for i in range(n):
        logger.info(u"Search %d" % i)
        results = engine.search(self.searchTerm, start=i + 1, count=m, cached=False, proxy=proxyList)
        for r in results:
            logger.debug(u"Result=%s" % r.text)
            url = r.url
            logger.debug(u"URL=%s" % url)
            # if url[-4:] == ".com":
            #     continue
            s = r.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            # self.logSentences(s)
            # Execute a Regular Expression Search
            # p = r'(NN)+ (VB)+'
            p = r'(NN)+'
            q = search(p, s)
            # logPOS(q)
            # Iterate over all the words in the POS
            logger.debug(u" q.Length=%d" % len(q))
            logger.debug(u" q[]=%s" % q)
            self.g, self.urlConcepts, self.wordConcepts = \
                self.addNodes(self.g, q, url, self.urlConcepts, self.wordConcepts)
    return self.urlConcepts, self.wordConcepts
def extract_noun_phrases(body_part_name):
    stop = nltk.corpus.stopwords.words('english')
    filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))
    sentences = df[df['lemmas'] == body_part_name]['sentence'].tolist()
    phrases = []
    for sentence in sentences:
        ptree = parsetree(sentence)
        matches = search('NP', ptree)
        for match in matches:
            filtered_np = [word for word in match if word.string.lower() not in stop]
            if len(filtered_np) > 0:
                phrases.append((sentence, filtered_np))
    return pd.DataFrame(phrases, columns=['sentence', 'phrase'])