# NOTE: these snippets were collected from different projects; the imports
# below cover the pattern calls most of them share. Project-specific
# dependencies (nltk, sqlite3, BeautifulSoup, Flask, ...) and helpers
# (shift, shuffled, extract_cause_effect_tuple, regularise_keys, ...) are
# assumed to be available where the corresponding function is used.
import string

from pattern.en import (parsetree, parse, tag, conjugate, lexeme,
                        singularize, superlative, sentiment)
from pattern.search import search, STRICT


def print_feature(sentence):
    # 'DT? RB? JJ? NN+' matches anything from "food" to "cat food",
    # "tasty cat food", "the tasty cat food", etc.
    ptree = parsetree(sentence)  # relations=True, lemmata=True also possible
    matches = search('DT? RB? JJ? NN+', ptree)
    for match in matches:
        print(match)
    print('\n')

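# Minimal usage sketch for print_feature (hypothetical input; the exact
# matches depend on pattern's tagger):
#
#   print_feature('I bought the tasty cat food')
#
# should print one Match per span covered by 'DT? RB? JJ? NN+',
# e.g. "the tasty cat food".
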
def find_sentences(dealiased_book_path):
    """Given a text file, extracts chunk phrases and per-sentence chunks."""
    dealiased_book = open(dealiased_book_path, "r", encoding='utf8')
    book = dealiased_book.read()
    dealiased_book.close()
    sss1 = parsetree(book, relations=True, lemmata=True)
    chunk_phrases = []
    sentenced_chunks = []
    sent_chunks = []
    chunks = []
    STOP = stopwords.words('english') + list(string.punctuation)
    for sentence in sss1:
        sentenced_chunks.append(sentence.chunks)
        stchk = []
        for chunk in sentence.chunks:
            chnks = (chunk.type, [(w.string, w.type) for w in chunk.words])
            ch_str = [w.string for w in chunk.words
                      if len(w.string) > 2 and w.string not in STOP]
            chunk_phrases.append(' '.join(ch_str))
            chunks.append(chnks)
            stchk.append(chnks)
        sent_chunks.append(stchk)
    return chunk_phrases, sent_chunks

def find_causal_matches(unicode_string, causal_pattern, pattern_order):
    """Search a text string and return all cause-effect relationships
    matching the specified pattern.

    Inputs:  unicode_string, raw text in Unicode format for Python 3
             causal_pattern, regex defining a specific causal statement pattern
             pattern_order, specifying which noun phrase is cause or effect
    Output:  list of causal tuples [(cause, effect), ...] or empty list []
    """
    causal_tuple_list = []
    # Convert string to Pattern parsed text (with POS tags).
    t = parsetree(unicode_string, lemmata=True)
    # possible_matches is a list of all Pattern matches, given text and pattern.
    possible_matches = search(causal_pattern, t, lemmata=True)
    # Add causal matches as tuples (cause, effect) to causal_tuple_list.
    if possible_matches:
        causal_tuple_list = extract_cause_effect_tuple(possible_matches,
                                                       pattern_order)
    # Keep only tuples whose cause and effect both occur verbatim in the text.
    final_causal_tuple_list = []
    for causal_tuple in causal_tuple_list:
        if causal_tuple[0] in unicode_string and causal_tuple[1] in unicode_string:
            final_causal_tuple_list.append(causal_tuple)
    return final_causal_tuple_list

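# Hypothetical usage sketch for find_causal_matches. The pattern string uses
# pattern.search syntax, and extract_cause_effect_tuple (defined elsewhere)
# is assumed to order the matched noun phrases according to pattern_order,
# whose value here is invented for illustration:
#
#   causal_pattern = '{NP} (cause|causes|caused) {NP}'
#   find_causal_matches(u'Smoking causes cancer.', causal_pattern,
#                       'cause_first')
#   # -> e.g. [('Smoking', 'cancer')], subject to the tagger and the helper
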
def step_to_computer_adjs(string):
    try:
        parse = parsetree(string, relations=True)[0]
        return ' '.join(a.string for a in parse.adjectives if a.string.isalpha())
    except IndexError:
        return ''

def verbConjugate(lemma, rel, aan):
    # A comma was missing between "/r/MemberOf" and "/r/IsA" in the original,
    # which silently concatenated the two strings.
    relAvoid = ["/r/CapableOf", "/r/PartOf", "/r/MemberOf", "/r/IsA",
                "/r/HasA", "/r/TranslationOf", "/r/HasProperty"]
    if rel not in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb in ["to", "can"]:
                result = lemma
        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma
    else:
        result = lemma
    return result

def dialogue_act_features(self, post):
    # Tokenize and drop stopwords (the original named this list `stop`).
    tokens = nltk.word_tokenize(post)
    post = []
    for i in tokens:
        if i not in self.stopwords:
            post.append(i)
        else:
            pass  # reject_list.append(i)
    posts = ""
    for i in post:
        posts += i
        posts += " "
    processed = parsetree(posts, relations=True, lemmata=True)
    features = {}
    for sents in processed:
        x = sents
        for i in x.chunks:
            j = i.pos
            if j == "VP":
                tense = self.tenseses(i.string)
            else:
                tense = ""
            h = i.words
            for words in h:
                apd_str = str(words.lemma) + "-" + str(words.pos)  # + "-" + tense
                # if words.pos[:1] == "NN" or words.pos[:1] == "VP":
                features['word({})'.format(str(words.lemma))] = str(words.pos)
                # pos['features({})'.format(j)] = True
    return features

def get_pattern_data(search_param):
    twitter = Twitter(language='en')
    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))

    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

    # if len(g) > 0:
    #     g = g.split()[0]  # Largest subgraph.

    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)
    g.export('data', directed=False, weighted=0.6)

def retrieve_top_freq_noun_phrases_fr_file(target_file, phrases_num_limit,
                                           top_cut_off, saveoutputfile=''):
    """Retrieve the top-frequency words found in a file, limited to noun
    phrases only. Stopword filtering is active by default.

    Args:
        target_file (str): filepath as str.
        phrases_num_limit (int): the max number of phrases; if 0, return all.
        top_cut_off (int): return only the top x phrases.

    Kwargs:
        saveoutputfile (str): if not null, save to the target location.

    Returns:
        (list): just the top phrases.
        (list of tuple): phrases and frequency.
    """
    with open(target_file, 'r') as f:
        webtext = f.read()
    t = parsetree(webtext, lemmata=True)
    results_list = get_noun_phrases_fr_text(
        t, phrases_num_limit=phrases_num_limit,
        stopword_file=r'C:\pythonuserfiles\google_search_module_alt\stopwords_list.txt')
    # Get the frequency of the list of words; keep only the most common,
    # dropping inconsequential ones.
    counts = Counter(results_list)
    phrases_freq_list = counts.most_common(top_cut_off)
    most_common_phrases_list = [n[0] for n in phrases_freq_list]
    if saveoutputfile:
        with open(saveoutputfile, 'w') as f:
            for (phrase, freq) in phrases_freq_list:
                temp_str = phrase + ' ' + str(freq) + '\n'
                f.write(temp_str)
    return most_common_phrases_list, phrases_freq_list

def main():
    db_file = sys.argv[1]
    # Connect to the sqlite database.
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    # Get all the lines, and for each line fill the phrase table.
    line_list = c.execute("SELECT id, text FROM line ORDER BY id;").fetchall()
    print("Done with fetchall")
    phrase_id = 0
    for line in line_list:
        line_id, text = line
        # Parse the sentence and break it into phrases.
        res = parsetree(text, relations=True)
        sentence = res.sentences[0]
        phrases = sentence.phrases
        for phrase in phrases:
            # Keep only phrases of at least two words.
            if len(phrase.words) >= 2:
                text = unicode(phrase)  # str(phrase) on Python 3
                c.execute("INSERT INTO phrase (id, text, line_id) "
                          "VALUES (?, ?, ?)", (phrase_id, text, line_id))
                phrase_id += 1
    print(phrase_id)
    conn.commit()
    conn.close()
    print("Finished")

def match_help_to_element_NLP(elements, text):
    placeholder_elements_dict = get_pe_dict(elements)
    placeholders = placeholder_elements_dict.keys()
    t = parsetree(text)
    # pprint(t)
    for sen in t:
        chunks = [c for c in sen.chunks if c.type == 'NP']
        for chunk in chunks:
            words = [w for w in chunk.words if w.type.startswith('NN')]
            for w in words:
                for p in placeholders:
                    p_t = parsetree(p)
                    p_words = [i.string for i in p_t.words]
                    if w.string.lower() in p_words:
                        return placeholder_elements_dict[p]
    return None

def get_objects(text, verbs):
    """
    Given a passage of text and a list of verbs, yields all noun phrases
    which are at any point the object of any of those verbs.

    >>> list(get_objects("I'm explaining politics to my dad", ["explaining"]))
    ["politics"]
    >>> list(get_objects("I'm talking to my dad about politics", ["talking to"]))
    ["my dad"]
    """
    tree = parsetree(text)
    for sentence in tree:
        matching_verb_phrase_indices = [
            idx for idx, chunk in enumerate(sentence.chunks[:-1])
            if chunk.type == 'VP' and chunk.words[-1].string in verbs
        ]
        for idx in matching_verb_phrase_indices:
            chunk = sentence.chunks[idx + 1]
            if chunk.type == 'NP':
                if accept_noun_phrase(chunk):
                    yield " ".join([w.string for w in chunk.words])

def fix_caption(caption):
    # The original parameter was named `str`, shadowing the builtin.
    s = parsetree(caption, lemmata=True)
    string = ''
    for sentence in s:
        if "and a" in caption:
            string = caption + ' '
        else:
            for i, chunk in enumerate(sentence.chunks):
                if chunk.type == 'VP' and len(chunk) == 2:
                    verb = chunk[1].string
                    string += lexeme(verb)[1] + ' '
                else:
                    for j, w in enumerate(chunk.words):
                        if i == 0 and j == 0 and (w.string == 'a' or w.string == 'A'):
                            print('chunk', chunk)
                            pass
                        else:
                            string = string + w.string + ' '
    string = string[:1].upper() + string[1:-1]
    if string.startswith('A'):
        string = string[2].upper() + string[3:]
    if string.endswith('.'):
        string = string[:-1]
    return string

def dramatize(s):
    """ Returns a string with stronger adjectives:
        dramatize("This code is nice") => "This code is legendary"
    """
    x = []
    # A parse tree takes a string and returns a list of sentences,
    # where each sentence is a list of words, where each word is an
    # object with interesting attributes such as Word.tag.
    for sentence in parsetree(s):
        for word in sentence:
            replaced = False
            if word.tag == "JJ":
                # What's the polarity of this adjective?
                polarity = sentiment(word.string)[0]
                # Don't change neutral adjectives like "last", "political", ...
                if polarity != 0.0:
                    # Can we find an adjective in our dictionary
                    # with a more extreme polarity?
                    # Note: the shuffled() function takes a list
                    # and returns a new, randomly ordered list.
                    for w, p in shuffled(adjectives.items()):
                        if polarity >= 0 and p > polarity + 0.2 \
                        or polarity < 0 and p < polarity - 0.2:
                            x.append(w.lower())
                            replaced = True
                            break
            if not replaced:
                x.append(word.string)
    return " ".join(x)

def find_all_matches_by_ziyu(text, the_pattern):
    tree = parsetree(text, lemmata=True)
    results = search(the_pattern, tree, STRICT)
    output = []
    for match in results:
        word_list = [word.string for word in match]
        sentence = " ".join(word_list)
        output.append(sentence)
    # gen_num = 0
    # if len(output) > 0 and len(output) < 2:
    #     gen_num = 1
    # elif len(output) >= 2:
    #     gen_num = 2
    # random_number = []
    # while len(random_number) != gen_num:
    #     r = random.randint(0, len(output))
    #     if r not in random_number:
    #         random_number.append(r)
    # final_output = []
    # if len(output) > 0:
    #     print "have OUTPUT"
    #     print random_number
    #     for i in range(gen_num):
    #         print i
    #         final_output.append(output[random_number[i]])
    return output

def drivel(noun):
    """ Generates drivel by shifting nouns in the description of the
        shifted noun, and prepending random alliterative adjectives.
    """
    drivel = []
    description = shift(noun)[1]
    description = description.split(";")[0]
    for sentence in parsetree(description):
        for i, w in enumerate(sentence.words):
            w, tag = w.string, w.tag
            if tag in ("VBD", "VBZ"):
                w = conjugate(w, "infinitive")
                w = conjugate(w, "past")
            if tag == "NN":  # noun
                try:
                    w = shift(w)[0]
                    a = list(alliterate(w))
                    if a:
                        # Replace a preceding adjective with an alliterating
                        # one. (The original tested sentence.words[i], i.e.
                        # the noun itself, which can never be tagged JJ;
                        # i - 1 is the evident intent.)
                        if i > 0 and sentence.words[i - 1].tag == "JJ":
                            drivel.pop()
                        drivel.append(choice(a))
                except:
                    pass
            drivel.append(w)
    return " ".join(drivel)

def lemmatize(data):
    processed = ""
    for line in data:
        tree_data = parsetree(line, lemmata=True)
        for each in str(tree_data).split(' '):
            processed += " " + each.split('/')[-1]
    return processed.lstrip()

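# In the string form of a lemmatized parse tree each token is encoded as
# 'word/POS/chunk/.../lemma', so split('/')[-1] above keeps only the lemma.
# Hypothetical example (exact output depends on pattern's tagger):
#
#   lemmatize(['The cats were sleeping'])
#   # -> something like 'the cat be sleep'
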
def _parse(*args, **kw):
    # FIXME (workaround): the parser's language model may still be a path
    # string; load it into a Model before parsing.
    from pattern.text.en import parser
    if isinstance(parser.model, str):
        from pattern.text import Model
        parser.model = Model(path=parser.model)
    return parsetree(*args, **kw)

def verbConjugate(lemma, rel, aan):
    # Near-duplicate of the verbConjugate above; the same missing comma
    # between "/r/MemberOf" and "/r/IsA" is fixed here too.
    relAvoid = ["/r/CapableOf", "/r/PartOf", "/r/MemberOf", "/r/IsA",
                "/r/HasA", "/r/TranslationOf", "/r/HasProperty"]
    if rel not in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb == "to":
                result = lemma
        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma
    else:
        result = lemma
    return result

def chTitle(hi):
    htmlFile = open(APPPATH + 'static/output/' + hi + '.html', 'r')
    html = htmlFile.read()
    htmlFile.close()
    soup = BeautifulSoup(html)
    text = "\n".join([unicode(i) for i in soup.p.contents]).replace("<br/>", "\n")
    s = parsetree(text)
    nounPhrases = []
    for sentence in s:
        for chunk in sentence.chunks:
            if chunk.type == "NP":
                nounPhrases.append(chunk.string)
    selectNPs = rs([np for np in nounPhrases if "&" not in np], ri(1, 2))
    articles = ["a", "an", "the"]
    nps = []
    for np in selectNPs:
        if startsWithCheck(np, articles):
            nps.append(np)
        else:
            nps.append(a_or_an(np))
    if len(selectNPs) == 1:
        title = titlecase(nps[0])
    elif len(selectNPs) == 2:
        title = titlecase(" and ".join(nps))
    # elif len(selectNPs) == 3:
    #     title = titlecase("%s, %s, and %s" % tuple(nps))
    return title.encode('ascii', 'xmlcharrefreplace')

def re_search(text, search_string, strict=False):
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results

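# Illustrative use of re_search; '_demo_re_search' and the pattern 'VB* NP'
# (any verb form followed by a noun phrase) are hypothetical, not from the
# original code:
def _demo_re_search():
    for match in re_search('The cat chased the mouse.', 'VB* NP'):
        print(match.string)
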
def has_single_highlight(line):
    match = re.search(r'\'(.+?)\'', line)
    if match:
        quote = match.group()[1:-1]
        s = parsetree(quote, chunk=True, relations=True, lemmata=True)
        for sentence in s:
            rel = sentence.relations
            # The original set these flags inconsistently (assigning `vp`
            # where `vb` was tested, and overwriting `pnp` with True);
            # this keeps the evident intent.
            sbj = bool(rel.get("SBJ"))
            vb = bool(rel.get("VP"))
            obj = bool(rel.get("OBJ"))
            pnp = bool(sentence.pnp)
            if sbj and vb and (obj or pnp):
                # quotes.append(line)
                return 0
            else:
                return 1
    else:
        return 0

def getParseTrees(self):
    """Return parse trees of each sentence."""
    from pattern.en import parsetree
    if not self.parseTrees:
        self.parseTrees = [parsetree(sent)
                           for sent in self.getPlainSentences()]
    # The original ended with a bare `return`, returning None despite the
    # docstring; return the cached trees.
    return self.parseTrees

def acceptPatterns():
    original_content = request.form['drunk_text']
    text_content_array = original_content.split(' ')
    text_content = ''
    for s in text_content_array:
        text_content += superlative(s) + ' '
    s = parsetree(original_content, relations=True, lemmata=True)
    return repr(s)

def extract_keyphrases_from_doc_pattern(item, key):
    # Build a parse tree and extract the NPs.
    pt = parsetree(item[key])
    noun_phrases = search('NP', pt)
    # Convert the NP matches to a list of strings.
    noun_phrases = [np.string for np in noun_phrases]
    # Remove phrases that are too short, lemmatize, etc.
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys

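# Hypothetical call, assuming regularise_keys (defined elsewhere) cleans and
# deduplicates the phrase list:
#
#   doc = {'body': 'Noun phrase extraction with the pattern library.'}
#   extract_keyphrases_from_doc_pattern(doc, 'body')
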
def form_help_to_vec(p):
    t = parsetree(p)
    requirements = []
    # pprint(t)
    for sen in t:
        for chunk in sen.chunks:
            if chunk.type == "NP":
                # The original appended an undefined `vector` and returned an
                # undefined `final`; collecting the NP strings and returning
                # them is a minimal repair of the evident intent.
                requirements.append(chunk.string)
    return requirements

def __init__(self):
    # Create an instance of our clickbait detector.
    # skipGetNumbersSum will prevent getNumbersSum from summing anything that
    # could be a number in English; sometimes slow.
    skipGetNumbersSum = False
    # Set to 0 to prevent using the most common word list as features.
    topWordsFeatureCount = 100
    # Set to 0 to prevent using the most common word trigrams as features.
    topWordTrigramFeatureCount = 50
    self.clickbaitDetector = clickbaitDetector(skipGetNumbersSum,
                                               topWordsFeatureCount,
                                               topWordTrigramFeatureCount)
    # TODO: include a wide range of headlines, including some less vulgar.
    # For the unit tests, we need headlines and manually determined correct
    # values for each feature.
    testCase = ("RT The world's 10 most \"advanced sex\" dolls "
                "will soon be able to think and talk")
    # "getNumbersSum": 10.0 is excluded here since getNumbersSum may be
    # disabled.
    testValues = {
        "getWordCount": 15,
        "getHashTagsAndRTs": 1,
        "getQuestionMarks": 0,
        "getAtMentions": 0,
        "getCharLength": len(testCase),
        "getNNPLOCCount": 0,
        "getNNPPERSCount": 0,
        "getSwearCount": 2,
        "maxDistToQuote": 31,
        "maxDistToNNP": 0,
        "getNumbersSum": 0.0
    }
    patternParseTree = parsetree(testCase, tokenize=True, tags=True,
                                 chunks=True, relations=True, lemmata=True)
    strSentenceText = testCase
    lstSentPOS = []
    lstSentWords = []
    for sentence in patternParseTree:
        for chunk in sentence.chunks:
            for word in chunk.words:
                lstSentPOS.append(word.type)
                lstSentWords.append(word.string)
    # Call the unit tests here.
    try:
        self.assertMaxDistToNNP(lstSentPOS, lstSentWords, testValues)
        self.assertMaxDistToQuote(strSentenceText, testValues)
        self.assertGetWordCount(lstSentWords, testValues)
        self.assertGetHashTagsAndRTs(strSentenceText, lstSentWords, testValues)
        self.assertGetQuestionMarks(strSentenceText, testValues)
        self.assertGetAtMentions(strSentenceText, testValues)
        self.assertGetNumbersSum(lstSentWords, testValues)
        self.assertGetNNPPERSCount(lstSentPOS, testValues)
        self.assertGetSwearCount(lstSentWords, testValues)
        self.assertGetNNPLOCCount(lstSentPOS, testValues)
        self.assertGetCharLength(strSentenceText, testValues)
        print("Clickbait detector has passed all Unit Tests!")
    except AssertionError as e:
        # Print the "args" part of the AssertionError, showing expected values.
        print(e.args)
        exit(1)

def test_search():
    from pattern.search import search
    from pattern.en import parsetree
    t = parsetree('big white rabbit')
    print(t)
    print()
    print(search('JJ', t))  # all adjectives
    print(search('NN', t))  # all nouns
    print(search('NP', t))  # all noun phrases

def adjectives(s):
    """ Returns a list of adjectives in the given string. """
    a = set()  # set ~= list of unique values
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))

def get_parse_tree(sentence):
    # Collect the words of each sentence's chunks into one list per sentence.
    tree = []
    Tree = []
    for sentence in parsetree(sentence):
        for chunk in sentence.chunks:
            for word in chunk.words:
                tree.append(word)
        Tree.append(tree)
        tree = []
    return Tree

def getPatterns(self, query):
    cleaned = query.strip('?')
    p = [(cleaned, 3)]
    t = parsetree(query)[0]
    for chunk in t.chunks:
        if chunk.pos == 'NP':
            p.append((chunk.string, 2))
    for w in cleaned.split():
        p.append((w, 1))
    return p

def set_ingredient_tokens(current_recipe):
    for item in current_recipe.ingredients:
        quantity_conversion = {
            'quarter': 0.25, 'eighth': 0.125, 'half': 0.5,
            '1/4': 0.25, '1/8': 0.125, '1/3': 0.333, '2/3': 0.667,
            '3/4': 0.75, '1/2': 0.5,
            '1': 1.0, '2': 2.0, '3': 3.0, '4': 4.0, '5': 5.0,
            '6': 6.0, '7': 7.0, '8': 8.0, '9': 9.0,
            'lots': 3.0, '5-6': 5.5, 'a': 1.0, 'few': 2.0,
            'scant': 1.0, 'pinch': 0.125, 'pinches': 0.25, '4-': 4.0,
            'to': 0.0, 'tablespoon': 1.0, 'teaspoon': 1.0, 'couple': 2.0}
        # Set a 'dumb' quantity by assuming the first token is the quantity.
        prelim_quantity = nltk.tokenize.word_tokenize(item.source_line)[0]
        # EAFP!
        try:
            prelim_quantity = float(prelim_quantity)
        except ValueError:
            print("Can't convert :: " + prelim_quantity)
            # Fall back to the conversion dictionary lookup.
            try:
                prelim_quantity = quantity_conversion[prelim_quantity]
            except KeyError:
                print(KeyError("No conversion value found : " + prelim_quantity))
                # Need to flag here for a note in the UI.
                prelim_quantity = 0
            else:
                item.quantity = prelim_quantity
        item.quantity = prelim_quantity
        filterList = ['tsp', 'tsps', 'tbsps', 'tbsp', 'tablespoon',
                      'tablespoons', 'teaspoon', 'teaspoons', 'cup',
                      'cups', 'bowl', 'pint', 'quart', 'mg', 'g', 'gram',
                      'grams', 'ml', 'oz', 'ounce', 'ounces']
        item.measure = ' '.join([word for word in item.source_line.split(" ")
                                 if word in filterList])
        new_source_line = ' '.join([word for word in item.source_line.split(" ")
                                    if word not in filterList])
        sentence = parsetree(new_source_line, chunks=True, lemmata=True)
        for s in sentence:
            # Filter all the NPs (noun phrases) into a chunk list.
            chunk_list = [singularize(chunk.string) for chunk in s.chunks
                          if chunk.type == 'NP']
            search_term = chunk_list[0]
            search_term = "".join([i for i in search_term if i != '/'])
            search_term = ''.join([i for i in search_term if not i.isdigit()])
            item.search_term = search_term
    return current_recipe

def test_pattern():
    from pattern.search import Pattern
    from pattern.en import parsetree
    t = parsetree('Chuck Norris is cooler than Dolph.', lemmata=True)
    p = Pattern.fromstring('{NP} be * than {NP}')
    m = p.match(t)
    print(m.group(1))
    print(m.group(2))
    print(t)

def grammatical_tagging():
    sentence = "The white house is at the top of the hill"
    sentences = "The white house is at the top of the hill. My house is not"
    # The result is an array of tuples tagging each word (verbs, nouns, etc.).
    print(tag(sentence))
    print(parse(sentence))
    # pprint(parse(sentence))
    pprint(parsetree(sentences))

def selectWords(review):
    """A function that gets a review and selects the nouns, adjectives,
    verbs and exclamation marks."""
    review = parsetree(review, lemmata=True)[0]  # lemmatize the review
    # Select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks.
    review = [w.lemma for w in review
              if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    review = count(review)  # a dictionary of (word, count)
    return review