def test_document(self): # Assert Document properties. # Test with different input types. for constructor, w in ( (vector.Document, "The cats sit on the mat."), (vector.Document, ["The", "cats", "sit", "on", "the", "mat"]), (vector.Document, {"cat": 1, "mat": 1, "sit": 1}), (vector.Document, Text(parse("The cats sat on the mat."))), (vector.Document, Sentence(parse("The cats sat on the mat.")))): # Test copy. v = constructor( w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT") v = v.copy() # Test properties. self.assertEqual(v.name, "Cat") self.assertEqual(v.type, "CAT") self.assertEqual(v.count, 3) self.assertEqual(v.terms, {"cat": 1, "mat": 1, "sit": 1}) # Test iterator decoration. self.assertEqual(sorted(v.features), ["cat", "mat", "sit"]) self.assertEqual(sorted(v), ["cat", "mat", "sit"]) self.assertEqual(len(v), 3) self.assertEqual(v["cat"], 1) self.assertEqual("cat" in v, True) print("pattern.vector.Document")
def test_modality(self): # Assert -1.0 => +1.0 representing the degree of certainty. v = en.modality(en.Sentence(en.parse("I wish it would stop raining."))) self.assertTrue(v < 0) v = en.modality( en.Sentence(en.parse("It will surely stop raining soon."))) self.assertTrue(v > 0) # Assert the accuracy of the modality algorithm. # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data: # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1 # The baseline should increase (not decrease) when the algorithm is # modified. from pattern.db import Datasheet from pattern.metrics import test sentences = [] for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")): sentence = en.parse(sentence, chunks=False, light=True) sentence = en.Sentence(sentence) sentences.append((sentence, int(certain) > 0)) A, P, R, F = test( lambda sentence: en.modality(sentence) > 0.5, sentences) #print(A, P, R, F) self.assertTrue(A > 0.69) self.assertTrue(P > 0.72) self.assertTrue(R > 0.64) self.assertTrue(F > 0.68) print("pattern.en.modality()")
def update_with_sentences(self, user_input, text):
    combined_text = user_input + ' ' + text
    # Parse sentences (lazily doing user parsing twice).
    parsed_combined = parse(combined_text, chunks=False)
    parsed_user = parse(user_input, chunks=False)
    # Take out newlines.
    parsed_combined = parsed_combined.replace('\n', ' ')
    self.topic_state.update_with_sentences(parsed_combined)
    self.character_state.update_with_sentences(parsed_user)
def testParse2():
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True, split=True)
    for s in result:
        print s
        print "-----------"
    return
def testParse3():
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('The new Control Center design might not be final, or it might even go back to the old design. ',
                   tokenize=True, chunks=True, split=True)
    for s in result:
        print s
        print "-----------"
    return
def test_mood(self):
    # Assert imperative mood.
    v = en.mood(en.Sentence(en.parse("Do your homework!")))
    self.assertEqual(v, en.IMPERATIVE)
    # Assert conditional mood.
    v = en.mood(en.Sentence(en.parse("We ought to help him.")))
    self.assertEqual(v, en.CONDITIONAL)
    # Assert subjunctive mood.
    v = en.mood(en.Sentence(en.parse("I wouldn't do that if I were you.")))
    self.assertEqual(v, en.SUBJUNCTIVE)
    # Assert indicative mood.
    v = en.mood(en.Sentence(en.parse("The weather is nice today.")))
    self.assertEqual(v, en.INDICATIVE)
    print "pattern.en.mood()"
def testParse():
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True)
    for s in result.split():
        print s
    return
    # Unreachable debugging output, left below for reference.
    print type(result)
    print isinstance(result, unicode)
    print isinstance(result, basestring)
    print result.tags
def test_find_prepositions(self): # Assert preposition tag annotation (PP + NP). v = en.parser.find_prepositions([ ["", "", "NP"], ["", "", "VP"], ["", "", "PP"], ["", "", "NP"], ["", "", "NP"],]) self.assertEqual(v, [ ["", "", "NP", "O"], ["", "", "VP", "O"], ["", "", "PP", "B-PNP"], ["", "", "NP", "I-PNP"], ["", "", "NP", "I-PNP"]]) # Assert PNP's with consecutive PP's. v = en.parse("The cat was looking at me from up on the roof with interest.", prepositions=True) self.assertEqual(v, "The/DT/B-NP/O cat/NN/I-NP/O " \ "was/VBD/B-VP/O looking/VBG/I-VP/O " \ "at/IN/B-PP/B-PNP me/PRP/B-NP/I-PNP " \ "from/IN/B-PP/B-PNP up/IN/I-PP/I-PNP on/IN/I-PP/I-PNP the/DT/B-NP/I-PNP roof/NN/I-NP/I-PNP " \ "with/IN/B-PP/B-PNP interest/NN/B-NP/I-PNP " \ "././O/O" ) print "pattern.en.parser.find_prepositions()"
def myExtract(statement):
    s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    p = Pattern.fromstring('There be DT NN+')
    match = p.search(s)
    #raise Exception(match)
    return match
def test_search_function(self):
    # Assert search() function.
    s = Sentence(parse("Go on Bors, chop his head off!"))
    m = search.search("PRP*? NN*", s)
    self.assertEqual(m[0].string, "Bors")
    self.assertEqual(m[1].string, "his head")
    print "pattern.search.search()"
def test_findVerb(): from pattern.en import parse, Text, Sentence from pattern.en import pprint sent = "Bachelor's in Computer Science, Information Systems or a related study, is required." sent = 'I ate pizza.' sent = "Bachelor's in Computer Science is required." sent = "Bachelor 's Degree or 4 years equivalent professional experience ." sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ." sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ." sent = "Bachelor ’ s degree in Computer Science or equivalent" sent = "Bachelor ' s degree in Computer Science or equivalent" result = parse(sent, tokenize = True, # Tokenize the input, i.e. split punctuation from words. tags = True, # Find part-of-speech tags. ) pprint(result) # print type(result) # print result sen = Sentence(result) # for word in sen: # print word, word.type vlist = [ word.string for word in sen if word.type.startswith("V") ] print vlist
def add_modality(tdb):
    for tweet in tdb:
        s = parse(tweet[2], lemmata=True)
        s = Sentence(s)
        (form, score) = (mood(s), modality(s))
        tweet.extend((form, score))
    return tdb
def trainVectorizers(self, document):
    '''
    Train the vectorizers with a document that should be tokenized into sentences and words.

    **Warning: all listed items will be concatenated to a single matrix.**

    *Required Parameters*

    :param document: the document (text) or list of documents (file paths) to build the
        count and tf-idf vectorizers with (be as representative as possible)
    '''
    self.buildVectorizer('count')
    self.buildVectorizer('tfidf')
    if type(document) is str:
        self.__tfidf.fit(self.__vectorizer.fit_transform(document))
    else:
        sentences = []
        for doc in document:
            if os.path.exists(doc) is True:
                # NB: the original opened `document` (the whole list) instead of `doc`
                # and reset `sentences` on every iteration; both are fixed here.
                with open(doc, 'r') as fp:
                    sentences.extend([parse(x, tags=False, chunks=False).split(" ")
                                      for x in self.__sent_tokenizer.tokenize(fp.read())])
        if sentences:
            # The original guarded on an unused `uvecs` variable (always None), so the
            # tf-idf fit never ran; fit_transform mirrors the string branch above.
            self.__tfidf.fit(self.__vectorizer.fit_transform(sentences))
def setUp(self):
    # Parse sentences to test on.
    # Creating a Text creates Sentence, Chunk, PNP and Word.
    # Creating a Sentence tests Sentence.append() and Sentence.parse_token().
    self.text = "I'm eating pizza with a fork. What a tasty pizza!"
    self.text = en.Text(en.parse(self.text, relations=True, lemmata=True))
def test_match(self):
    # Assert Match properties.
    s = Sentence(parse("Death awaits you all with nasty, big, pointy teeth."))
    p = search.Pattern(sequence=[
        search.Constraint(tags=["JJ"], optional=True),
        search.Constraint(tags=["NN*"])])
    m = p.search(s)
    self.assertTrue(isinstance(m, list))
    self.assertEqual(m[0].pattern, p)
    self.assertEqual(m[1].pattern, p)
    self.assertEqual(m[0].words, [s.words[0]])
    self.assertEqual(m[1].words, [s.words[-3], s.words[-2]])
    # Assert constraint "NN*" links to "Death" and "teeth", and "JJ" to "pointy".
    self.assertEqual(m[0].constraint(s.words[0]), p[1])
    self.assertEqual(m[1].constraint(s.words[-3]), p[0])
    self.assertEqual(m[1].constraint(s.words[-2]), p[1])
    # Assert constraints "JJ NN*" link to chunk "pointy teeth".
    self.assertEqual(m[1].constraints(s.chunks[6]), [p[0], p[1]])
    # Assert Match.constituents() by constraint, constraint index and list of indices.
    self.assertEqual(m[1].constituents(), [s.chunks[6]])
    self.assertEqual(m[1].constituents(constraint=p[0]), [s.words[-3]])
    self.assertEqual(m[1].constituents(constraint=1), [s.words[-2]])
    self.assertEqual(m[1].constituents(constraint=(0, 1)), [s.chunks[6]])
    # Assert Match.string.
    self.assertEqual(m[1].string, "pointy teeth")
    print "pattern.search.Match"
def test_sentiment(self): # Assert < 0 for negative adjectives and > 0 for positive adjectives. self.assertTrue(en.sentiment("wonderful")[0] > 0) self.assertTrue(en.sentiment("horrible")[0] < 0) self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0) self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0) # Assert that :) and :( are recognized. self.assertTrue(en.sentiment(":)")[0] > 0) self.assertTrue(en.sentiment(":(")[0] < 0) # Assert the accuracy of the sentiment analysis (for the positive class). # Given are the scores for Pang & Lee's polarity dataset v2.0: # http://www.cs.cornell.edu/people/pabo/movie-review-data/ # The baseline should increase (not decrease) when the algorithm is modified. from pattern.db import Datasheet from pattern.metrics import test reviews = [] for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")): reviews.append((review, int(score) > 0)) A, P, R, F = test(lambda review: en.positive(review), reviews) self.assertTrue(A > 0.755) self.assertTrue(P > 0.760) self.assertTrue(R > 0.747) self.assertTrue(F > 0.754) # Assert the accuracy of the sentiment analysis on short text (for the positive class). # Given are the scores for Pang & Lee's sentence polarity dataset v1.0: # http://www.cs.cornell.edu/people/pabo/movie-review-data/ reviews = [] for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")): reviews.append((review, int(score) > 0)) A, P, R, F = test(lambda review: en.positive(review), reviews) self.assertTrue(A > 0.642) self.assertTrue(P > 0.653) self.assertTrue(R > 0.607) self.assertTrue(F > 0.629) print "pattern.en.sentiment()"
def process(statement,database_name = DATABASE_NAME): ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming" and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013" already encountering a statement like "There is a game engine Unity3d" gives us trouble seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization which doesn't really work for things like CTO as a category of items, hmm >>> sent = "There is a game engine Unreal Engine".split() >>> print nltk.ne_chunk(nltk.pos_tag(sent)) ''' # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for from "There is a game engine Unity3D" # although it does now with light=True setting, but now it doesn't get the NNP in "There is a game engine Source" s = parse(statement, relations=True, lemmata=True, light=True) s = split(s) #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s) s, result = extract(statement) if result: #try: noun = search('(NN)+', s)[0].string table = pluralize(noun.replace(' ','_')) result = search('(JJ|NNPS|NNP)+', s) # this pulls in adjectives, but there's supposed to be a better fix coming ident = result[0].string name = result[1].string if len(result) > 1 else ident #raise Exception(table+"; "+ident+"; "+name) return newTable(table,ident,name,database_name) #except: #return regexMatch(statement,database_name) else: return regexMatch(statement,database_name)
def tokenize_line(line): res = line.lower() for regexp, replacement in TOKEN_MAPPINGS: res = regexp.sub(replacement, res) sentence = parse(res,tokenize=True,tags=False, chunks=False, relations= False, lemmata=True).split() # initialize the Variables allowed_tags = re.compile('(NN|VB|JJ|RB)') stopwords = frozenset() min_length = 2 max_length = 15 result = [] # lemmatization of the words try: sentence = sentence[0] except IndexError: pass for token, tag, lemma in sentence: if min_length <= len(lemma) <= max_length and lemma not in stopwords: if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) res = result logging.info("That's how res looks %s" %res) return res
def getData(self, params): if self.now_cache is not None: if (self.now_cache + datetime.timedelta(minutes=5)) < datetime.datetime.now(): self.data_cache = None self.today_cache = None self.now_cache = None if self.data_cache is None: tweets = [] for cand in candidates: tweets.append({'tweets': api.user_timeline(cand['user'], count=20), 'name': cand['name'], 'party': cand['party']}) all_tweets = [] for tweet_data in tweets: name = tweet_data['name'] party = tweet_data['party'] for tweet in tweet_data['tweets']: all_tweets.append( {'Name': name, 'Tweet': tweet.text, 'Favorites': tweet.favorite_count, 'Retweets': tweet.retweet_count} ) dfs = pd.DataFrame(all_tweets) sentiments = [sentiment(tweet) for tweet in dfs['Tweet']] dfs['Polarity'] = [sent[0] for sent in sentiments] dfs['Subjectivity'] = [sent[1] for sent in sentiments] modal = [modality(Sentence(parse(tweet, lemmata=True))) for tweet in dfs['Tweet']] dfs['Certainty'] = modal today = date.strftime(datetime.datetime.now(), format='%m/%d/%Y, %H:%M') now = datetime.datetime.now() self.data_cache = dfs self.today_cache = today self.now_cache = now return self.data_cache
def basicExtract(statement):
    #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
    s = Sentence(parse(statement, lemmata=True))
    m = search("There be DT {JJ? NN}", s)
    return m
def get_parts(thetext, punctuation): # generate stopwords list & regexes for 2+ periods or 2+ dashes stop = stopwords.words('english') regex1=re.compile(r"\.{2,}") regex2=re.compile(r"\-{2,}") thetext=re.sub(regex1, ' ', thetext) thetext=re.sub(regex2, ' ', thetext) nouns=[] descriptives=[] for i,sentence in enumerate(parse(thetext, tokenize=True, lemmata=True).split()): nouns.append([]) descriptives.append([]) for token in sentence: if len(token[4]) >0: if token[1] in ['JJ', 'JJR', 'JJS']: if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1: continue descriptives[i].append(token[4]) elif token[1] in ['NN', 'NNS']: if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1: continue nouns[i].append(token[4]) out=zip(nouns, descriptives) nouns2=[] descriptives2=[] for n,d in out: if len(n)!=0 and len(d)!=0: nouns2.append(n) descriptives2.append(d) return nouns2, descriptives2
def get_word_types(words): """Determine the occurrences of pos types. Args: words (list): A list of words Returns: dict: The data and summary results. """ new_arr = [] for val in words: try: val = parse( val, encoding='utf-8', tokenize=False, light=False, tags=True, chunks=False, relations=False, lemmata=False) new_arr.append(val) except IndexError: continue return { 'data': new_arr, 'summary': None }
def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False): """ This function is only available when the optional 'pattern' package is installed. Use the English lemmatizer from `pattern` to extract tokens in their base form=lemma, e.g. "are, is, being" -> "be" etc. This is a smarter version of stemming. Only consider nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] """ if light: import warnings warnings.warn("The light flag is no longer supported by pattern.") # tokenization in `pattern` is weird; it gets thrown off by non-letters, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = u' '.join(tokenize(content, lower=True, errors='ignore')) parsed = parse(content, lemmata=True, collapse=False) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if 2 <= len(lemma) <= 15 and not lemma.startswith('_'): if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) return result
def gender_feature(text, feature_vect): """ Extract the gender features :param text: :param feature_vect: contains a bag of words and a list of bigrams :return: a dictionary which contains the feature and its computed value """ #sentence length and vocab features tokens = word_tokenize(text.lower()) sentences = sent_tokenize(text.lower()) words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences]) #bag_of_word features bag_dict = {} for bag in feature_vect[:29]: bag_dict[bag] = bag in tokens #bigrams features bigram_dict = {} for big in feature_vect[29:]: bigram_dict[big] = big in bigrams(tokens) #POS tagging features POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.'] tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split() simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s] freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag) d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict) return dict(dict(d, **bigram_dict), **freq_POS)
def run(o): """ STM is shortcuts to the short_term_memory operators """ STM_PATH = './bin/%s/brain/short_term_memory' % o.o['name'] WM_PLACES_PATH = './bin/%s/brain/working_memory/PLACES' % o.o['name'] import os, sys; mydirs = os.listdir( STM_PATH ) from pattern.en import parse, pprint, tag import shutil for word in mydirs: ignore = [".DS_Store",".gitignore","README.txt"] if word in ignore: continue #print word s = parse(word,tags=True) print s #pprint(s) tagged = s.split('/')[1] if tagged == "NNP-LOC": from_path = "%s/%s" % (STM_PATH,word) to_path = "%s/" % WM_PLACES_PATH os.system( "rsync -avrz %s %s" % (from_path,to_path) )
def findVerb(sent):
    result = parse(sent, tokenize=True, tags=True)
    sen = Sentence(result)
    # Collect the surface form of every verb in the sentence
    # (the original built this list twice; once is enough).
    vlist = [word.string for word in sen if word.type.startswith("V")]
    print vlist
    return vlist
def get_NNPs(text, counts=False):
    '''Extract proper nouns from text.

    :param text: Text to parse
    :type text: str
    :param counts: Return counts for each extracted NNP
    :type counts: bool
    :returns: list -- List containing either only the extracted NNP's or
        (NNP, count) -pairs sorted by count.
    '''
    parsed_text = parse(text).split()
    nnps = []
    for sent in parsed_text:
        for word in sent:
            if word[1].startswith('NNP'):
                nnps.append(word[0])
    ctr = Counter(nnps)
    if counts:
        ctri = ctr.items()
        ctri = sorted(ctri, key=lambda x: x[1], reverse=True)
    else:
        ctri = ctr.keys()
    return ctri
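# A minimal usage sketch for get_NNPs() (assumes pattern.en's parse and
# collections.Counter are imported as in the function above; the listed tags
# and ordering are illustrative, since they depend on the tagger):
text = "Barack Obama met Angela Merkel in Berlin. Obama later flew home."
print(get_NNPs(text))               # e.g. ['Barack', 'Obama', 'Angela', 'Merkel', 'Berlin']
print(get_NNPs(text, counts=True))  # e.g. [('Obama', 2), ('Barack', 1), ...]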
def getLemma(dico):
    # We go through every chapter.
    for chapter in dico:
        # We create the new part in it.
        dico[chapter]["lemma"] = {}
        # Then we get every sentence.
        for sentence in dico[chapter]["sentences"]:
            # We parse it.
            temp = parse(dico[chapter]["sentences"][sentence],
                         tokenize=True,
                         tags=True,
                         chunks=False,
                         relations=False,
                         lemmata=True,
                         default='NN',
                         light=True)
            # We split it and walk each (word, tag, lemma) triple once.
            # (The original nested loop iterated the parse twice and never
            # used its outer loop variable.)
            temp = temp.split()
            for parsed_sentence in temp:
                for triple in parsed_sentence:
                    key = sentence + " " + triple[0]
                    dico[chapter]["lemma"][key] = (triple[1], triple[2])
        # End of sentence loop.
    # End of chapter loop.
    # We return our data.
    return dico
def test_match(self): # Assert Constraint-Word matching. R = search.Constraint.fromstring S = lambda s: Sentence(parse(s, relations=True, lemmata=True)) W = lambda s, tag=None, index=0: search.Word(None, s, tag, index) for constraint, tests in ( (R("cat|dog"), [(W("cat"), 1), (W("dog"), 1), (W("fish"), 0)]), (R("cat*"), [(W("cats"), 1)]), (R("*cat"), [(W("tomcat"), 1)]), (R("c*t|d*g"), [(W("cat"), 1), (W("cut"), 1), (W("dog"), 1), (W("dig"), 1)]), (R("cats|NN*"), [(W("cats", "NNS"), 1), (W("cats"), 0)]), (R("^cat"), [(W("cat", "NN", index=0), 1),(W("cat", "NN", index=1), 0)]), (R("*|!cat"), [(W("cat"), 0), (W("dog"), 1), (W("fish"), 1)]), (R("my cat"), [(W("cat"), 0)]), (R("my cat"), [(S("my cat").words[1], 1)]), # "my cat" is an overspecification of "cat" (R("my_cat"), [(S("my cat").words[1], 1)]), (R("cat|NP"), [(S("my cat").words[1], 1)]), (R("dog|VP"), [(S("my dog").words[1], 0)]), (R("cat|SBJ"), [(S("the cat is sleeping").words[1], 1)]), (R("dog"), [(S("MY DOGS").words[1], 1)]), # lemma matches (R("dog"), [(S("MY DOG").words[1], 1)])): # case-insensitive for test, b in tests: self.assertEqual(constraint.match(test), bool(b)) # Assert Constraint-Taxa matching. t = search.Taxonomy() t.append("Tweety", type="bird") t.append("Steven", type="bird") v = search.Constraint.fromstring("BIRD", taxonomy=t) self.assertTrue(v.match(W("bird"))) self.assertTrue(v.match(S("tweeties")[0])) self.assertTrue(v.match(W("Steven"))) print "pattern.search.Constraint.match()"
def normalize(self, text): """Normalizes a given string by: * singularizing any plurals. * getting the base form of any verb * eliminating all capitals""" if self.language == self.LANG_EN: from pattern.en import parse elif self.language == self.LANG_NL: from pattern.nl import parse elif self.language == self.LANG_DE: from pattern.de import parse elif self.language == self.LANG_ES: from pattern.es import parse elif self.language == self.LANG_UNKNOWN: # Don't do any parsing. return text.lower() else: raise Exception("Unsupported language: %s" % repr(self.language)) parsed = parse(text, lemmata=True, chunks=False) parsed = [x for y in parsed.split() for x in y] # Flatten normalized = map(lambda w: w[-1], parsed) normalized = filter(lambda w: w not in string.punctuation, normalized) normalized = ' '.join(normalized) return normalized
def get_sentiment_bigrams(paths): bigrams_list = [] for path in paths: f = open(path, "r+") review_text = f.read() f.close() sentences = review_text.split(".") for sentence in sentences: tagged_sentence = parse(sentence.lower()).split(" ") sentence_bigrams = get_bigrams(tagged_sentence) bigrams_list = combine_lists(bigrams_list, sentence_bigrams) return bigrams_list
def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset()): """ This function is only available when the optional 'pattern' package is installed. Use the English lemmatizer from `pattern` to extract tokens in their base form=lemma, e.g. "are, is, being" -> "be" etc. This is a smarter version of stemming, taking word context into account. Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] >>> lemmatize('The study ranks high.') ['study/NN', 'rank/VB', 'high/JJ'] >>> lemmatize('The ranks study hard.') ['rank/NN', 'study/VB', 'hard/RB'] """ if light: import warnings warnings.warn("The light flag is no longer supported by pattern.") # tokenization in `pattern` is weird; it gets thrown off by non-letters, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = u(' ').join(tokenize(content, lower=True, errors='ignore')) parsed = parse(content, lemmata=True, collapse=False) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if 2 <= len(lemma) <= 15 and not lemma.startswith( '_') and lemma not in stopwords: if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) return result #endif HAS_PATTERN
def lemmatize(content): """ Use the English lemmatizer from `pattern` to extract tokens in their base form=lemma, e.g. "are, is, being" -> "be" etc. This is a smarter version of stemming, taking word context into account. Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). """ content = u' '.join(utils.tokenize(content, lower=True, errors='ignore')) parsed = parse(content, lemmata=True, collapse=False) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if 2 <= len(lemma) <= 15 and not lemma.startswith('_'): if utils.ALLOWED_TAGS.match(tag): result.append(lemma.encode('utf8')) return result
def test_group(self):
    # Assert Match groups.
    s = Sentence(parse("the big black cat eats a tasty fish"))
    m = search.search("DT {JJ+} NN", s)
    self.assertEqual(m[0].group(1).string, "big black")
    self.assertEqual(m[1].group(1).string, "tasty")
    # Assert nested groups (and syntax with additional spaces).
    m = search.search("DT { JJ { JJ { NN }}}", s)
    self.assertEqual(m[0].group(1).string, "big black cat")
    self.assertEqual(m[0].group(2).string, "black cat")
    self.assertEqual(m[0].group(3).string, "cat")
    # Assert chunked groups.
    m = search.search("NP {VP NP}", s)
    v = m[0].group(1, chunked=True)
    self.assertEqual(v[0].string, "eats")
    self.assertEqual(v[1].string, "a tasty fish")
    print("pattern.search.Match.group()")
def test_parse(self): # Assert parsed output with Penn Treebank II tags (slash-formatted). # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase. v = en.parser.parse("The black cat sat on the mat.") self.assertEqual(v, "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \ "sat/VBD/B-VP/O " + \ "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O" ) # 2) "the black cat" is the subject, "a fish" is the object. v = en.parser.parse("The black cat is eating a fish.", relations=True) self.assertEqual(v, "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \ "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \ "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O" ) # 3) "chasing" and "mice" lemmata are "chase" and "mouse". v = en.parser.parse("The black cat is chasing mice.", lemmata=True) self.assertEqual(v, "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \ "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \ "mice/NNS/B-NP/O/mouse ././O/O/." ) # 4) Assert unicode. self.assertTrue(isinstance(v, unicode)) # 5) Assert unicode for faulty input (bytestring with unicode characters). self.assertTrue(isinstance(en.parse("ø ü"), unicode)) self.assertTrue(isinstance(en.parse("ø ü", tokenize=True, tags=False, chunks=False), unicode)) self.assertTrue(isinstance(en.parse("ø ü", tokenize=False, tags=False, chunks=False), unicode)) self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode)) # 6) Assert optional parameters (i.e., setting all to False). self.assertEqual(en.parse("ø ü.", tokenize=True, tags=False, chunks=False), u"ø ü .") self.assertEqual(en.parse("ø ü.", tokenize=False, tags=False, chunks=False), u"ø ü.") # 7) Assert the accuracy of the English tagger. i, n = 0, 0 for sentence in open(os.path.join(PATH, "corpora", "tagged-en-penntreebank.txt")).readlines(): sentence = sentence.decode("utf-8").strip() s1 = [w.split("/") for w in sentence.split(" ")] s2 = [[w for w, pos in s1]] s2 = en.parse(s2, tokenize=False) s2 = [w.split("/") for w in s2.split(" ")] for j in range(len(s1)): if s1[j][1] == s2[j][1].split("-")[0]: i += 1 n += 1 #print float(i) / n self.assertTrue(float(i) / n > 0.945) print "pattern.en.parse()"
def corrupt_remove_prep(sent):
    # parse
    P = parse(sent)
    parsed = sum(P.split(), [])
    # chunk
    new = []
    for word in parsed:
        chunktag = word[3]
        if chunktag != 'B-PNP':
            new.append(word[0])
    corr = ' '.join(new)
    if sorted(tokenize(sent)) == sorted(tokenize(corr)):
        return None
    return corr
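# Minimal usage sketch for corrupt_remove_prep() (assumes pattern.en's parse and
# tokenize are imported as in the helper above; the exact output below is an
# illustration, since it depends on the tagger's chunking):
#
# >>> corrupt_remove_prep("The cat sat on the mat.")
# 'The cat sat the mat .'      # the token opening the PNP ("on") is dropped
# >>> corrupt_remove_prep("The cat slept.")
# None                         # nothing was removed, so the corruption is rejected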
def team_sentiment_analysis(stats): for s in stats.sentences: this_sentiment = sentiment(s) polarity = float("{0:.2f}".format(this_sentiment[0])) subjectivity = float("{0:.2f}".format(this_sentiment[1])) polarity_10 = float("{0:.1f}".format(this_sentiment[0])) subjectivity_10 = float("{0:.1f}".format(this_sentiment[1])) stats.polarity_counts[polarity] += 1 stats.subjectivity_counts[subjectivity] += 1 stats.polarity_counts_10s[polarity_10] += 1 stats.subjectivity_counts_10s[subjectivity_10] += 1 s = Sentence(parse(s, lemmata=True)) stats.mood_counts[mood(s)] += 1 rounded_modality = float("{0:.2f}".format(modality(s))) rounded_modality_10 = float("{0:.1f}".format(modality(s))) stats.modality_counts[rounded_modality] += 1 stats.modality_counts_10s[rounded_modality_10] += 1
def calculate_phrase_sentiment(self, phrases): # print "Rating phrases sentiment..." valence_list = [] arousal_list = [] for p in phrases: pol = sentiment(p)[0] sent = parse(p, lemmata=True) mod = modality(Sentence(sent)) print mod valence_list.append(10 * pol) arousal_list.append(5 * mod) valence = max(valence_list) arousal = max(arousal_list) print "Valence: " + str(valence) print "arousal: " + str(arousal) return ((valence, arousal))
def getBroken(word): global duplicateWordForms # print('in get broken. word: ', word) ''' returns word parsed into array of caps chars, word root, and POS tag if any''' if word.isspace(): return [word] returner = [] if word in words_to_ignore: return getCapsChars(word) + [word.lower()] patternParsedList = parse(word, relations=True, lemmata=True).split()[0] for patternParsed in patternParsedList: word, root, pos = postParse(patternParsed) capsChars = [] wordLower = word.lower() wordHasCaps = not word.islower() wordLower = word.lower() if wordHasCaps else word # print("wordHasCaps", wordHasCaps) # print(word, root, pos) if wordHasCaps: capsChars = getCapsChars(word) # if pos in unsupportedPoss: # returner += capsChars + [wordLower] if root == wordLower: ''' this means there are no POS tags we need to keep ''' returner += capsChars + [wordLower] else: if root in roots_to_ignore or "'" in word or "‘" in word or "’" in word: returner += capsChars + [ wordLower ] #was, were, am, are -- these words get tokenized/untokenized unreliably. :( else: useParsed = True key = root + pos if key in m_rootPos_word: value = m_rootPos_word[key] if value != wordLower: duplicateWordForms += [(key, value, wordLower)] useParsed = False if useParsed: m_rootPos_word[root + pos] = wordLower returner += capsChars + [root, pos] else: returner += capsChars + [wordLower] return returner
def test_sentiment(self): # Assert < 0 for negative adjectives and > 0 for positive adjectives. self.assertTrue(en.sentiment("wonderful")[0] > 0) self.assertTrue(en.sentiment("horrible")[0] < 0) self.assertTrue( en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0) self.assertTrue( en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0) # Assert that :) and :( are recognized. self.assertTrue(en.sentiment(":)")[0] > 0) self.assertTrue(en.sentiment(":(")[0] < 0) # Assert the accuracy of the sentiment analysis (for the positive class). # Given are the scores for Pang & Lee's polarity dataset v2.0: # http://www.cs.cornell.edu/people/pabo/movie-review-data/ # The baseline should increase (not decrease) when the algorithm is modified. from pattern.db import Datasheet from pattern.metrics import test reviews = [] for score, review in Datasheet.load( os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")): reviews.append((review, int(score) > 0)) from time import time t = time() A, P, R, F = test(lambda review: en.positive(review), reviews) #print A, P, R, F self.assertTrue(A > 0.753) self.assertTrue(P > 0.768) self.assertTrue(R > 0.725) self.assertTrue(F > 0.746) # Assert the accuracy of the sentiment analysis on short text (for the positive class). # Given are the scores for Pang & Lee's sentence polarity dataset v1.0: # http://www.cs.cornell.edu/people/pabo/movie-review-data/ reviews = [] for score, review in Datasheet.load( os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")): reviews.append((review, int(score) > 0)) A, P, R, F = test(lambda review: en.positive(review), reviews) #print A, P, R, F self.assertTrue(A > 0.654) self.assertTrue(P > 0.660) self.assertTrue(R > 0.636) self.assertTrue(F > 0.648) print "pattern.en.sentiment()"
def mood(sentence, **kwargs):
    """ Returns IMPERATIVE (command), CONDITIONAL (possibility),
        SUBJUNCTIVE (wish) or INDICATIVE (fact).
    """
    if isinstance(sentence, basestring):
        try:
            # A Sentence is expected but a string was given.
            # Attempt to parse the string on-the-fly.
            from pattern.en import parse, Sentence
            sentence = Sentence(parse(sentence))
        except ImportError:
            pass
    if imperative(sentence, **kwargs):
        return IMPERATIVE
    if conditional(sentence, **kwargs):
        return CONDITIONAL
    if subjunctive(sentence, **kwargs):
        return SUBJUNCTIVE
    else:
        return INDICATIVE
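# A minimal usage sketch for mood() (pattern.en exposes the same helper as
# pattern.en.mood); the expected results mirror the test_mood() assertions
# earlier in this collection:
from pattern.en import parse, Sentence, mood, IMPERATIVE, SUBJUNCTIVE

print(mood("Do your homework!") == IMPERATIVE)        # True: a raw string is parsed on the fly
print(mood(Sentence(parse("I wouldn't do that if I were you."))) == SUBJUNCTIVE)  # True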
def getPosTag(in_filename, out_filename):
    # Open files for input/output: fhnd[0] - in, fhnd[1] - out.
    fhnd = open_files(in_filename, out_filename)
    tagged_line = ""
    from pattern.en import parse
    for line in fhnd[0]:
        word_list = line.split()  # line to list
        for word in word_list:
            # Use the lowercase `relations` keyword expected by pattern.en.parse
            # (the original passed `Relations=False`, which parse() does not recognize).
            tagged_line += " " + parse(word, relations=False, lemmata=False)
        fhnd[1].write(tagged_line + "\n")
        tagged_line = ""
    return 0
def pos_counts(sentence, ngram=4):
    counts = {}
    sentence = " ".join(sentence.strip().split())
    words = sentence.split()
    if len(words) < ngram:
        return counts
    pos = parse(sentence, chunks=False).split()[0]
    # Keep only tokens whose tag is word-like (drops punctuation tags).
    pos = filter(lambda x: re.match(r'^[\w-]+$', x[1]) is not None, pos)
    pos = [x[1] for x in pos]
    # Count every POS n-gram. The original hard-coded 4 instead of `ngram`
    # and dropped the last window; both are fixed here.
    for w in range(len(pos) - ngram + 1):
        postag = "|".join(pos[w:w + ngram])
        counts[postag] = counts.get(postag, 0) + 1
    return counts
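# Minimal usage sketch for pos_counts() (assumes `from pattern.en import parse`
# and `import re`, as the function above does; the key shown is illustrative,
# since the exact tags depend on pattern's tagger):
counts = pos_counts("The quick brown fox jumps over the lazy dog.", ngram=4)
for key, n in sorted(counts.items()):
    print(key + " -> " + str(n))   # one line per POS 4-gram, e.g. "DT|JJ|JJ|NN -> 1"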
def test_search(self):
    # Assert one match containing all words.
    v = search.Pattern.fromstring("*+")
    v = v.search("one two three")
    self.assertEqual(v[0].string, "one two three")
    # Assert one match for each word.
    v = search.Pattern.fromstring("*")
    v = v.search("one two three")
    self.assertEqual(v[0].string, "one")
    self.assertEqual(v[1].string, "two")
    self.assertEqual(v[2].string, "three")
    # Assert all variations are matched (sentence starts with a NN* which must be caught).
    v = search.Pattern.fromstring("(DT) JJ?+ NN*")
    v = v.search(Sentence(parse("dogs, black cats and a big white rabbit")))
    self.assertEqual(v[0].string, "dogs")
    self.assertEqual(v[1].string, "black cats")
    self.assertEqual(v[2].string, "a big white rabbit")
    v = search.Pattern.fromstring("NN*")
    print "pattern.search.Pattern.search()"
def lemmatize(content, light=False, allowed_tags=re.compile('(NN|VB|JJ|RB)')): """ This function is only available when the optional 'pattern' package is installed. Use the English lemmatizer from `pattern` to extract tokens in their base form=lemma, e.g. "are, is, being" -> "be" etc. This is a smarter version of stemming. Only consider nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] From http://www.clips.ua.ac.be/pages/pattern-en#parser : The parser is built on a Brill lexicon of tagged words and rules to improve the tags context-wise. With light=False, it uses Brill's contextual rules. With light=True it uses Jason Wiener's simpler ruleset. This ruleset is 5-10x faster but also 25% less accurate. """ # tokenization in `pattern` is weird; it gets thrown off by non-letters, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = u' '.join(tokenize(content, lower=True, errors='ignore')) # use simpler, modified pattern.text.en.text.parser.parse that doesn't # collapse the output at the end: https://github.com/piskvorky/pattern parsed = parse(content, lemmata=True, collapse=False, light=light) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if 2 <= len(lemma) <= 15 and not lemma.startswith('_'): if allowed_tags.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) return result #endif HAS_PATTERN
def add_keywords(self, phrase):
    sent = en.Sentence(en.parse(phrase))
    nouns = search('NN', sent)
    self.blackboard.pool.nouns.update(
        set(Word(en.singularize(n[0].string)) for n in nouns))
    adjs = search('JJ', sent)
    self.blackboard.pool.adjectives.update(
        set(Word(en.lemma(a[0].string)) for a in adjs))
    try:
        nps = search('NP', sent)
        for np in nps:
            self.blackboard.pool.epithets.update({
                Word(en.singularize(w.string), "NN"):
                    [Word(jj.string, "JJ") for jj in np if "JJ" in jj.tag]
                for w in np if "NN" in w.tag
            })
    except IndexError:
        pass
def testTokenize():
    s = "I eat pizza with a fork."
    s = "B.S. in Computer Science, a related degree or its equivalent "
    s = "What's this? This is a book."
    s = "Bachelor's degree in Computer Science or equivalent"
    s = "Bachelor’s degree in Computer Science or equivalent"
    s = parse(s,
              tokenize=True,    # Tokenize the input, i.e. split punctuation from words.
              tags=False,       # Find part-of-speech tags.
              chunks=False,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
              relations=False,  # Find relations between chunks.
              lemmata=False,    # Find word lemmata.
              light=False)
    print s.split()
def sentiment(content):
    from pattern.en import parse, split, wordnet
    wordnet.sentiment.load()
    # POS tags whose words contribute to the score. Note the comma after 'VBG':
    # the original list implicitly concatenated 'VBG' and 'RB' into 'VBGRB',
    # so adverbs were never scored.
    relevant_types = ['JJ', 'VB', 'VBD', 'VBN', 'VBG', 'RB']
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))
    #return 1 if score >= 0 else -1
    return score
def extract(statement): s = Sentence(parse(statement, lemmata=True)) '''c1 = Constraint.fromstring("There be DT") c2 = Constraint.fromstring("NN+") c3 = Constraint.fromstring("(DT)") c4 = Constraint.fromstring("(RB) (JJ) NNP+") c5 = Constraint.fromstring("(call) (DT)") c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+") p = Pattern(sequence=[c1, c2, c3, c4, c5, c6]) match = p.search(s) ''' s = find_entities(s) # not sure about this "be" thing - happy to match plural (is/are) but not sure about past tense ... match = search(MATCH_STRING, s) #raise Exception(match) return s, match
def tagSentence(sent):
    result = parse(sent,
                   tokenize=True,    # Tokenize the input, i.e. split punctuation from words.
                   tags=True,        # Find part-of-speech tags.
                   chunks=False,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
                   relations=False,  # Find relations between chunks.
                   lemmata=False,    # Find word lemmata.
                   light=False)
    # pprint(result)
    array = str(result).split(" ")
    tokens = []
    posTags = []
    for a in array:
        b = a.split("/")
        tokens.append(b[0])
        posTags.append(b[1])
    # print tokens
    # print posTags
    return (tokens, posTags)
def lemmatize(content): """ Use the English lemmatizer from the `pattern` package to extract tokens in their base form (lemmas: "are, is, being"->"be" etc.). This is a smarter version of stemming. """ # tokenization in `pattern` is weird; it gets thrown off by non-letters, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. content = u' '.join(tokenize(content, lower=True, errors='ignore')) # use simpler, modified pattern.text.en.text.parser.parse that doesn't # collapse the output at the end: https://github.com/piskvorky/pattern parsed = parse(content, lemmata=True, collapse=False) result = [] for sentence in parsed: for token, tag, _, _, lemma in sentence: if 2 <= len(lemma) <= 15 and not lemma.startswith('_'): if ALLOWED_TAGS.match(tag): lemma += "/" + tag[:2] result.append(lemma.encode('utf8')) return result
def convert_pattern_format(text):
    """
    Text is parsed through pattern's parsing function into a standardized format.
    """
    parsed_text = []
    # Parse the text via pattern's parser.
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Pattern's tags for each word in the sentence are stored in a new Word object.
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # Each word is appended to a Sentence object.
            s.words.append(w)
        # Each Sentence object is appended to an array.
        parsed_text.append(s)
    return parsed_text
def test_parse(): from pattern.en import parse, Text, Sentence from pattern.en import pprint sent = "Experience with mobile application development a plus: iPhone/iPad, Android, or Blackberry." sent = "3+ years web software development experience." sent = "Bachelor's in Computer Science, Information Systems or a related study, is required." sent = 'I ate pizza.' sent = "Bachelor's in Computer Science is required." sent = "Bachelor 's Degree or 4 years equivalent professional experience ." sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ." sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ." sent = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ." result = parse(sent, tokenize = True, # Tokenize the input, i.e. split punctuation from words. tags = True, # Find part-of-speech tags. chunks = True, # Find chunk tags, e.g. "the black cat" = NP = noun phrase. relations = True, # Find relations between chunks. lemmata = True, # Find word lemmata. light = True) pprint(result)
def test_parse(self): # Assert parsed output with Penn Treebank II tags (slash-formatted). # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase. v = en.parser.parse("The black cat sat on the mat.") self.assertEqual(v, "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \ "sat/VBD/B-VP/O " + \ "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O" ) # 2) "the black cat" is the subject, "a fish" is the object. v = en.parser.parse("The black cat is eating a fish.", relations=True) self.assertEqual(v, "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \ "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \ "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O" ) # 3) "chasing" and "mice" lemmata are "chase" and "mouse". v = en.parser.parse("The black cat is chasing mice.", lemmata=True) self.assertEqual(v, "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \ "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \ "mice/NNS/B-NP/O/mouse ././O/O/." ) # 4) Assert unicode. self.assertTrue(isinstance(v, unicode)) # 5) Assert unicode for faulty input (bytestring with unicode characters). self.assertTrue(isinstance(en.parse("ø ü"), unicode)) self.assertTrue( isinstance( en.parse("ø ü", tokenize=True, tags=False, chunks=False), unicode)) self.assertTrue( isinstance( en.parse("ø ü", tokenize=False, tags=False, chunks=False), unicode)) self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode)) # 6) Assert optional parameters (i.e., setting all to False). self.assertEqual( en.parse("ø ü.", tokenize=True, tags=False, chunks=False), u"ø ü .") self.assertEqual( en.parse("ø ü.", tokenize=False, tags=False, chunks=False), u"ø ü.") print "pattern.en.parser.parse()"
def process(self, message): # print pattern_en.suggest(message) -- suggestions if message == ">!train": self.train() return "It is nice to learn new stuff." if message == ">!forget": memory.clear() return "I am reborn. So much free space :) maybe you will use files to store memory and not RAM..." if message == ">!load_page": if sessionId not in memory: response = "Hello! My name is Chad and I am passionate about music." response += "We can share our experiences and maybe we can get along." response += "Would you mind telling me your name first?" expect[sessionId] = "name" memory[sessionId] = dict() else: response = "Welcome back!" search.search("new songs") with open('results.json') as data_file: data = json.load(data_file) for i in range(10): if 'musicrecording' in data['items'][i]['pagemap']: mr = data['items'][i]['pagemap']['musicrecording'] which = random.randint(0, len(mr) - 1) if 'name' not in mr[which]: response += " Did you know that " + mr[which][ 'byartist'] + " has released a new song?" else: response += " You can check out this cool song, " + mr[which]['name'] + ", by " + \ mr[which]['byartist'] return response s = nlp.get_sentences(message) doc = spacy_nlp(message) for w in doc: print "(", w, w.dep_, w.pos_, w.head, ")" aiml_sent_type = [] aiml_responses = [] memory_responses = [] sentence_types = [] emotions = [] for sentence in s: sentence_type = self.instant_classifier.classify( dialogue_act_features(sentence)) sentence_types.append(sentence_type) polarity, subjective = pattern_en.sentiment(sentence) sent = pattern_en.parse(sentence, lemmata=True) sent = pattern_en.Sentence(sent) modality = pattern_en.modality(sent) mood = pattern_en.mood(sent) if polarity > 0.8: emotions.append("SUPER HAPPY") elif polarity > 0.3: emotions.append("GOOD SURPRISE") elif polarity < -0.4: emotions.append("FEAR") elif polarity > 0.4: emotions.append("COOL") elif polarity < -0.1: emotions.append("SAD") elif polarity < -0.7: emotions.append("ANGER") else: emotions.append("NEUTER") print sentence_type, polarity, subjective, modality, mood if sentence_type not in ["whQuestion", "ynQuestion"]: try: aiml_sent_type_res = self.kernel.respond( sentence_type, sessionId) except: aiml_sent_type_res = "" aiml_sent_type.append(aiml_sent_type_res) verbs_subj = set() sentence = sentence[0].upper() + sentence[1:] doc = spacy_nlp(sentence) for possible_subject in doc: if (possible_subject.dep == nsubj or possible_subject.dep == nsubjpass) and possible_subject.head.pos == VERB: verbs_subj.add((possible_subject, possible_subject.head)) try: aiml_response = self.kernel.respond(sentence, sessionId) except: aiml_response = "" aiml_responses.append(aiml_response) # MEMORY MODULE memory_msg = "" if sentence_type == "Statement": # insert into memory for i in verbs_subj: subjs = [] subjects = [i[0]] for tok in i[0].children: if tok.dep == conj: subjects.append(tok) for subj in subjects: predec = "" for tok in subj.children: if tok.dep_ == "poss" or tok.dep == amod: predec += tok.lower_ if len(predec) > 0: subjs.append(predec + " " + subj.lower_) else: subjs.append(subj.lower_) vb = i[1].lower_ if vb not in memory[sessionId]: memory[sessionId][vb] = dict() for subj in subjs: for c in i[1].children: if c.dep in [prep]: memory[sessionId][vb][subj] = c.lower_ + " " for c_prep in c.children: if c_prep.dep in [dobj, pobj, attr]: memory[sessionId][vb][ subj] += c_prep.text memory_responses.append( self.kernel.respond( "memorate", sessionId)) elif c.dep in [dobj, pobj, attr]: memory[sessionId][vb][subj] = c.text 
memory_responses.append( self.kernel.respond("memorate", sessionId)) elif sentence_type == "whQuestion": for i in verbs_subj: subjs = [] subjects = [i[0]] for tok in i[0].children: if tok.dep == conj: subjects.append(tok) for subj in subjects: predec = "" for tok in subj.children: if tok.dep_ == "poss" or tok.dep == amod: predec += tok.lower_ if len(predec) > 0: subjs.append(predec + " " + subj.lower_) else: subjs.append(subj.lower_) max_similarity = 0 verb = i[1].lower_ for j in memory[sessionId]: p_word = spacy_nlp(j) similarity = i[1].similarity(p_word[0]) if similarity > max_similarity: max_similarity = similarity verb = j if max_similarity > 0.5 and verb in memory[sessionId]: num_subjs = len(subjs) memory_msg = "" for subj in subjs: if subj in memory[sessionId][verb]: toks = nlp.tokenize_text(subj) memory_msg = "" for t in toks: if t in first_person: memory_msg += pron_translate[t] + " " else: memory_msg += t + " " num_subjs -= 1 if num_subjs > 2: memory_msg += ", " elif num_subjs == 1: memory_msg += "and " if len(memory_msg) > 0: memory_msg += verb + " " if num_subjs != len(subjs): memory_msg += memory[sessionId][verb][ subjs[-1]] + "." memory_responses.append(memory_msg) arr_response = [] for i in aiml_sent_type: if len(i) > 0: arr_response.append(i) for i in aiml_responses: if len(i) > 0: arr_response.append(i) for i in memory_responses: if len(i) > 0: arr_response.append(i) if len(arr_response) == 0: data = search.search(message) snip = data['items'][0]['snippet'] sents = nlp.get_sentences(snip) arr_response.append(sents[0]) response = "" for i in emotions: try: emoi = self.kernel.respond(i, sessionId) except: emoi = None if emoi is not None: if random.randint(0, 100) < 50: response += " " + emoi + "." break for res in arr_response: if len(res) > 1: response += res + " " # generic response, if no response restoks = nlp.tokenize_text(response) if len(restoks) == 0: idx = random.randint(0, len(sentence_types) - 1) try: aiml_response = self.kernel.respond(sentence_types[idx], sessionId) except: aiml_response = "" response += aiml_response # polarity, subjective = pattern_en.sentiment(response) # sent = pattern_en.parse(sentence, lemmata=True) # sent = pattern_en.Sentence(sent) # modality = pattern_en.modality(sent) # mood = pattern_en.mood(sent) # sentence_type = self.instant_classifier.classify(dialogue_act_features(response)) # print response, polarity, subjective, modality, mood return response
# It does not use modal verbs such as "could" and "would":
# "You could eat your dinner!" is not a command but a bubbly suggestion.
# We can create a pattern that scans for infinitive verbs (VB),
# and use "!" to exclude certain words:
# "!could|!would|!should|!to+ VB" = infinitive not preceded by modal or "to".
# This works fine except in one case: if the sentence starts with a verb.
# So we need a second rule "^VB" to catch this.
# Note that the example below contains a third rule: "^do|VB*".
# This catches all sentences that start with a "do" verb regardless of whether it is
# infinitive, because the parser sometimes tags infinitive "do" incorrectly.

def imperative(sentence):
    for p in ("!could|!would|!should|!to+ VB", "^VB", "^do|VB*"):
        m = match(p, sentence)
        # Exclude questions (and reuse the match instead of calling match() twice).
        if m and sentence.string.endswith((".", "!")):
            return True
    return False

for s in ("Just stop it!",
          "Look out!",
          "Do your homework!",
          "You should do your homework.",
          "Could you stop it.",
          "To be, or not to be."):
    s = parse(s)
    s = Sentence(s)
    print(s)
    print(imperative(s))
    print("")
def test_chunk_modifiers(self):
    # Assert list of nearby adjectives and adverbs with no role, for VP.
    v = en.Sentence(en.parse("Perhaps you should go."))
    self.assertEqual(v.chunk[2].modifiers, [v.chunk[0]])  # should <=> perhaps
    print("pattern.en.Chunk.modifiers")