Example #1
 def test_document(self):
     # Assert Document properties.
     # Test with different input types.
     for constructor, w in (
             (vector.Document, "The cats sit on the mat."),
             (vector.Document, ["The", "cats", "sit", "on", "the", "mat"]),
             (vector.Document, {"cat": 1, "mat": 1, "sit": 1}),
             (vector.Document, Text(parse("The cats sat on the mat."))),
             (vector.Document, Sentence(parse("The cats sat on the mat.")))):
         # Test copy.
         v = constructor(
             w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT")
         v = v.copy()
         # Test properties.
         self.assertEqual(v.name, "Cat")
         self.assertEqual(v.type, "CAT")
         self.assertEqual(v.count, 3)
         self.assertEqual(v.terms, {"cat": 1, "mat": 1, "sit": 1})
         # Test iterator decoration.
         self.assertEqual(sorted(v.features), ["cat", "mat", "sit"])
         self.assertEqual(sorted(v), ["cat", "mat", "sit"])
         self.assertEqual(len(v), 3)
         self.assertEqual(v["cat"], 1)
         self.assertEqual("cat" in v, True)
     print("pattern.vector.Document")
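For reference, a minimal standalone sketch of the pattern.vector.Document API exercised above (a sketch, assuming pattern 2.6+ is installed; the expected values mirror the test's assertions):

# Usage sketch (assumption: pattern.vector is importable as in the test above).
from pattern.vector import Document, LEMMA

doc = Document("The cats sit on the mat.",
               stemmer=LEMMA,     # reduce inflected forms to lemmas ("cats" -> "cat")
               stopwords=False,   # drop stopwords such as "the" and "on"
               name="Cat", type="CAT")
print(doc.terms)      # {"cat": 1, "sit": 1, "mat": 1}, per the assertions above
print(doc.count)      # 3
print("cat" in doc)   # True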
Example #2
 def test_modality(self):
     # Assert -1.0 => +1.0 representing the degree of certainty.
     v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
     self.assertTrue(v < 0)
     v = en.modality(
         en.Sentence(en.parse("It will surely stop raining soon.")))
     self.assertTrue(v > 0)
     # Assert the accuracy of the modality algorithm.
     # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is
     # modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     sentences = []
     for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")):
         sentence = en.parse(sentence, chunks=False, light=True)
         sentence = en.Sentence(sentence)
         sentences.append((sentence, int(certain) > 0))
     A, P, R, F = test(
         lambda sentence: en.modality(sentence) > 0.5, sentences)
     #print(A, P, R, F)
     self.assertTrue(A > 0.69)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.64)
     self.assertTrue(F > 0.68)
     print("pattern.en.modality()")
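A minimal sketch of the modality() call this test exercises (assuming pattern.en is installed); modality() takes a parsed Sentence and returns a float between roughly -1.0 and +1.0, where higher means greater certainty:

# Usage sketch (assumption: pattern.en is importable).
from pattern.en import parse, Sentence, modality

print(modality(Sentence(parse("It will surely stop raining soon."))))  # > 0: fairly certain
print(modality(Sentence(parse("I wish it would stop raining."))))      # < 0: a wish, low certainty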
Example #3
  def update_with_sentences(self, user_input, text):
    combined_text = user_input + ' ' + text
    # Parse sentences (lazily doing user parsing twice)
    parsed_combined = parse(combined_text, chunks = False)
    parsed_user = parse(user_input, chunks = False)
    # Take out new lines
    parsed_combined = parsed_combined.replace('\n', ' ')

    self.topic_state.update_with_sentences(parsed_combined)
    self.character_state.update_with_sentences(parsed_user)
Example #4
def testParse2():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True, split=True  )
   
    for s in result :
        print s
        print "-----------"
    return
Example #5
def testParse3():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('The new Control Center design might not be final, or it might even go back to the old design. ', tokenize=True, chunks=True, split=True  )
   
    for s in result :
        print s
        print "-----------"
    return
Example #6
 def test_mood(self):
     # Assert imperative mood.
     v = en.mood(en.Sentence(en.parse("Do your homework!")))
     self.assertEqual(v, en.IMPERATIVE)
     # Assert conditional mood.
     v = en.mood(en.Sentence(en.parse("We ought to help him.")))
     self.assertEqual(v, en.CONDITIONAL)
     # Assert subjunctive mood.
     v = en.mood(en.Sentence(en.parse("I wouldn't do that if I were you.")))
     self.assertEqual(v, en.SUBJUNCTIVE)
     # Assert indicative mood.
     v = en.mood(en.Sentence(en.parse("The weather is nice today.")))
     self.assertEqual(v, en.INDICATIVE)
     print "pattern.en.mood()"
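The same mood() checks as a standalone sketch, outside the unittest harness (assuming pattern.en is installed):

# Usage sketch (assumption: pattern.en is importable).
from pattern.en import parse, Sentence, mood

for text in ("Do your homework!",                  # imperative
             "We ought to help him.",              # conditional
             "I wouldn't do that if I were you.",  # subjunctive
             "The weather is nice today."):        # indicative
    print("%s -> %s" % (text, mood(Sentence(parse(text)))))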
Example #7
def testParse():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True )
        
    
    for s in result.split():
        print s
    return
    
    print type(result)
    print isinstance(result, unicode)
    print isinstance(result, basestring)
    print result.tags
Example #8
 def test_find_prepositions(self):
     # Assert preposition tag annotation (PP + NP).
     v = en.parser.find_prepositions([
         ["", "", "NP"],
         ["", "", "VP"],
         ["", "", "PP"],
         ["", "", "NP"], 
         ["", "", "NP"],])
     self.assertEqual(v, [
         ["", "", "NP", "O"], 
         ["", "", "VP", "O"], 
         ["", "", "PP", "B-PNP"], 
         ["", "", "NP", "I-PNP"], 
         ["", "", "NP", "I-PNP"]])
     # Assert PNP's with consecutive PP's.
     v = en.parse("The cat was looking at me from up on the roof with interest.", prepositions=True)
     self.assertEqual(v,
         "The/DT/B-NP/O cat/NN/I-NP/O " \
         "was/VBD/B-VP/O looking/VBG/I-VP/O " \
         "at/IN/B-PP/B-PNP me/PRP/B-NP/I-PNP " \
         "from/IN/B-PP/B-PNP up/IN/I-PP/I-PNP on/IN/I-PP/I-PNP the/DT/B-NP/I-PNP roof/NN/I-NP/I-PNP " \
         "with/IN/B-PP/B-PNP interest/NN/B-NP/I-PNP " \
         "././O/O"
     )
     print "pattern.en.parser.find_prepositions()"
Example #9
def myExtract(statement):

  s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
  p = Pattern.fromstring('There be DT NN+')
  match = p.search(s)
  #raise Exception(match)
  return match
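A hedged sketch of the 'There be DT NN+' pattern in isolation (assuming pattern.en and pattern.search are installed); with lemmata=True, the constraint "be" matches inflections such as "is" and "are":

# Usage sketch (assumption: pattern.en + pattern.search available; the input sentence is hypothetical).
from pattern.en import parse, Sentence
from pattern.search import Pattern

s = Sentence(parse("There is a course CSCI4702.", lemmata=True))
p = Pattern.fromstring("There be DT NN+")
m = p.search(s)
print(m)   # a list of Match objects; empty if the pattern did not match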
Example #10
 def test_search_function(self):
     # Assert search() function.
     s = Sentence(parse("Go on Bors, chop his head off!"))
     m = search.search("PRP*? NN*", s)
     self.assertEqual(m[0].string, "Bors")
     self.assertEqual(m[1].string, "his head")
     print "pattern.search.search()"
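The same search() call as a standalone sketch (grounded in the assertions above; assumes pattern.en and pattern.search are installed):

# Usage sketch (assumption: pattern.en + pattern.search available).
from pattern.en import parse, Sentence
from pattern.search import search

s = Sentence(parse("Go on Bors, chop his head off!"))
for m in search("PRP*? NN*", s):   # optional pronoun followed by a noun
    print(m.string)                # "Bors", then "his head", per the test above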
Example #11
def test_findVerb():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint 
    
   
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "Bachelor ’ s degree in Computer Science or equivalent"
    sent = "Bachelor ' s degree in Computer Science or equivalent"
       
    
    result = parse(sent,
         tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
             tags = True,  # Find part-of-speech tags.
            )
    pprint(result) 
    
  #  print type(result)
  #  print result         
    sen = Sentence(result)
  #  for word in sen:
 #       print word, word.type
    
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    print vlist
Example #12
def add_modality(tdb):
        for tweet in tdb:
                s = parse(tweet[2], lemmata=True)
                s = Sentence(s)
                (form, score) = (mood(s), modality(s))
                tweet.extend((form, score))
        return tdb
Example #13
 def trainVectorizers(self,document):
     '''
     Train the Vectorizers with a document that should be tokenized into sentences and words
     
     **Warning: All listed items will be concatenated to a single matrix**
     
     *Required Parameters*
     :param document: the document (text) or list of documents (file paths) to build count and tfidf vectorizers with (be as representative as possible)
     '''
     
     self.buildVectorizer('count')
     self.buildVectorizer('tfidf')
     
     if type(document) is str:
         self.__tfidf.fit(self.__vectorizer.fit_transform(document))
     else:
         uvecs=None
         sentences=[]
         for doc in document:
             if os.path.exists(doc) is True:
                 sentences=[]
                  with open(doc, 'r') as fp:
                     sentences.extend([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
     
         if uvecs is not None:
             self.__tfidf.fit(self.__vectorizer.fit(sentences))
Example #14
 def setUp(self):
     # Parse sentences to test on.
     # Creating a Text creates Sentence, Chunk, PNP and Word.
     # Creating a Sentence tests Sentence.append() and
     # Sentence.parse_token().
     self.text = "I'm eating pizza with a fork. What a tasty pizza!"
     self.text = en.Text(en.parse(self.text, relations=True, lemmata=True))
Example #15
 def test_match(self):
     # Assert Match properties.
     s = Sentence(parse("Death awaits you all with nasty, big, pointy teeth."))
     p = search.Pattern(sequence=[
         search.Constraint(tags=["JJ"], optional=True),
         search.Constraint(tags=["NN*"])])
     m = p.search(s)
     self.assertTrue(isinstance(m, list))
     self.assertEqual(m[0].pattern, p)
     self.assertEqual(m[1].pattern, p)
     self.assertEqual(m[0].words, [s.words[0]])
     self.assertEqual(m[1].words, [s.words[-3], s.words[-2]])
      # Assert constraint "NN*" links to "Death" and "teeth", and "JJ" to "pointy".
     self.assertEqual(m[0].constraint(s.words[ 0]), p[1])
     self.assertEqual(m[1].constraint(s.words[-3]), p[0])
     self.assertEqual(m[1].constraint(s.words[-2]), p[1])
      # Assert constraints "JJ NN*" link to chunk "pointy teeth".
     self.assertEqual(m[1].constraints(s.chunks[6]), [p[0], p[1]])
     # Assert Match.constituents() by constraint, constraint index and list of indices.
     self.assertEqual(m[1].constituents(), [s.chunks[6]])
     self.assertEqual(m[1].constituents(constraint=p[0]), [s.words[-3]])
     self.assertEqual(m[1].constituents(constraint=1), [s.words[-2]])
     self.assertEqual(m[1].constituents(constraint=(0,1)), [s.chunks[6]])
     # Assert Match.string.
     self.assertEqual(m[1].string, "pointy teeth")
     print "pattern.search.Match"
Example #16
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.755)
     self.assertTrue(P > 0.760)
     self.assertTrue(R > 0.747)
     self.assertTrue(F > 0.754)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.642)
     self.assertTrue(P > 0.653)
     self.assertTrue(R > 0.607)
     self.assertTrue(F > 0.629)
     print "pattern.en.sentiment()"
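A minimal sketch of sentiment() and positive() on raw strings (assuming pattern.en is installed); sentiment() returns a (polarity, subjectivity) pair and positive() thresholds the polarity:

# Usage sketch (assumption: pattern.en is importable).
from pattern.en import sentiment, positive

print(sentiment("wonderful"))   # polarity > 0
print(sentiment("horrible"))    # polarity < 0
print(sentiment(":)"))          # emoticons are scored as well (polarity > 0)
print(positive("A bad book. Really horrible."))  # False for clearly negative text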
Example #17
File: faq.py Project: VRDate/twss
def process(statement,database_name = DATABASE_NAME):
  ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming" 
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"
      
      already encountering a statement like "There is a game engine Unity3d" gives us trouble
      seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization
      which doesn't really work for things like CTO as a category of items, hmm
      
      >>> sent = "There is a game engine Unreal Engine".split()
      >>> print nltk.ne_chunk(nltk.pos_tag(sent))
      '''
  # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for from "There is a game engine Unity3D"
  # although it does now with light=True setting, but now it doesn't get the NNP in "There is a game engine Source"

  s = parse(statement, relations=True, lemmata=True, light=True) 
  s = split(s)

  #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
  s, result = extract(statement)
  if result:
    #try:
      noun = search('(NN)+', s)[0].string
      table = pluralize(noun.replace(' ','_'))
      result = search('(JJ|NNPS|NNP)+', s) # this pulls in adjectives, but there's supposed to be a better fix coming
      ident = result[0].string
      name = result[1].string if len(result) > 1 else ident
      #raise Exception(table+"; "+ident+"; "+name)
      return newTable(table,ident,name,database_name)
    #except:
      #return regexMatch(statement,database_name)
  else:
    return regexMatch(statement,database_name)
Example #18
def tokenize_line(line):
    res = line.lower()
    for regexp, replacement in TOKEN_MAPPINGS:
        res = regexp.sub(replacement, res)

    sentence = parse(res,tokenize=True,tags=False, chunks=False, relations= False, lemmata=True).split()

    # initialize the Variables
    allowed_tags = re.compile('(NN|VB|JJ|RB)')
    stopwords = frozenset()
    min_length = 2
    max_length = 15
    result = []

    # lemmatization of the words
    try:
        sentence = sentence[0]
    except IndexError:
        pass

    for token, tag, lemma in sentence:
        if min_length <= len(lemma) <= max_length and lemma not in stopwords:
            if allowed_tags.match(tag):
                lemma += "/" + tag[:2]
                result.append(lemma.encode('utf8'))
    res = result
    logging.info("That's how res looks %s" %res)
    return res
Example #19
 def getData(self, params):
     if self.now_cache is not None:
         if (self.now_cache + datetime.timedelta(minutes=5)) < datetime.datetime.now():
             self.data_cache = None
             self.today_cache = None
             self.now_cache = None
     if self.data_cache is None:
         tweets = []
         for cand in candidates:
             tweets.append({'tweets': api.user_timeline(cand['user'], count=20), 
                             'name': cand['name'], 
                             'party': cand['party']})
         all_tweets = []
         for tweet_data in tweets:
             name = tweet_data['name']
             party = tweet_data['party']
             for tweet in tweet_data['tweets']:
                 all_tweets.append( {'Name': name,
                                     'Tweet': tweet.text, 
                                     'Favorites': tweet.favorite_count, 
                                     'Retweets': tweet.retweet_count} )
         dfs = pd.DataFrame(all_tweets)
         sentiments = [sentiment(tweet) for tweet in dfs['Tweet']]
         dfs['Polarity'] = [sent[0] for sent in sentiments]
         dfs['Subjectivity'] = [sent[1] for sent in sentiments]
         modal = [modality(Sentence(parse(tweet, lemmata=True))) for tweet in dfs['Tweet']]
         dfs['Certainty'] = modal
         today = date.strftime(datetime.datetime.now(), format='%m/%d/%Y, %H:%M')
         now = datetime.datetime.now()
         self.data_cache = dfs
         self.today_cache = today
         self.now_cache = now
     return self.data_cache
Example #20
def basicExtract(statement):

  #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
  #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
  s = Sentence(parse(statement, lemmata=True))
  m = search("There be DT {JJ? NN}", s)
  return m
Example #21
def get_parts(thetext, punctuation):
    # generate stopwords list & regexes for 2+ periods or 2+ dashes
    stop = stopwords.words('english')
    regex1=re.compile(r"\.{2,}")
    regex2=re.compile(r"\-{2,}")
    thetext=re.sub(regex1, ' ', thetext)
    thetext=re.sub(regex2, ' ', thetext)
    nouns=[]
    descriptives=[]
    for i,sentence in enumerate(parse(thetext, tokenize=True, lemmata=True).split()):
        nouns.append([])
        descriptives.append([])
        for token in sentence:
            if len(token[4]) >0:
                if token[1] in ['JJ', 'JJR', 'JJS']:
                    if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    descriptives[i].append(token[4])
                elif token[1] in ['NN', 'NNS']:
                    if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    nouns[i].append(token[4])
    out=zip(nouns, descriptives)
    nouns2=[]
    descriptives2=[]
    for n,d in out:
        if len(n)!=0 and len(d)!=0:
            nouns2.append(n)
            descriptives2.append(d)
    return nouns2, descriptives2
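get_parts() above indexes token[1] (the POS tag) and token[4] (the lemma); the sketch below shows the per-token layout that parse(..., lemmata=True).split() produces, which is what those indices assume:

# Layout sketch (assumption: default parse() flags plus lemmata=True).
from pattern.en import parse

for sentence in parse("The black cats sat on the mat.", lemmata=True).split():
    for token in sentence:
        # token = [word, pos-tag, chunk-tag, pnp-tag, lemma]
        word, pos, chunk, pnp, lemma = token
        print("%s %s %s" % (word, pos, lemma))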
Example #22
def get_word_types(words):
    """Determine the occurrences of pos types.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results.
    """
    new_arr = []
    for val in words:
        try:
            val = parse(
                val,
                encoding='utf-8',
                tokenize=False,
                light=False,
                tags=True,
                chunks=False,
                relations=False,
                lemmata=False)
            new_arr.append(val)
        except IndexError:
            continue
    return {
        'data': new_arr,
        'summary': None
    }
Example #23
    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming. Only consider nouns, verbs, adjectives
        and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        """
        if light:
            import warnings
            warnings.warn("The light flag is no longer supported by pattern.")

        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result
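A usage sketch matching the doctest above (assuming this lemmatize is gensim's utils.lemmatize, which requires the optional pattern package):

# Usage sketch (assumption: gensim.utils.lemmatize is available and pattern is installed).
from gensim.utils import lemmatize

print(lemmatize('Hello World! How is it going?! Nonexistentword, 21'))
# expected, per the doctest above: ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']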
Example #24
def gender_feature(text, feature_vect):
    """
    Extract the gender features
    :param text:
    :param feature_vect: contains a bag of words and a list of bigrams
    :return: a dictionary which contains the feature and its computed value
    """
    #sentence length and vocab features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    #bag_of_word features
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    #bigrams features
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigrams(tokens)

    #POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)

    return dict(dict(d, **bigram_dict), **freq_POS)
Example #25
def run(o):
	""" STM is shortcuts to the short_term_memory operators """

	STM_PATH = './bin/%s/brain/short_term_memory' % o.o['name']
	
	WM_PLACES_PATH = './bin/%s/brain/working_memory/PLACES' % o.o['name']

	import os, sys;
	mydirs = os.listdir( STM_PATH )

	from pattern.en import parse, pprint, tag

	import shutil

	for word in mydirs:
		
		ignore = [".DS_Store",".gitignore","README.txt"]
		if word in ignore:
			continue

		#print word
		s = parse(word,tags=True)
		print s
		#pprint(s)

		tagged = s.split('/')[1]

		if tagged == "NNP-LOC":
			from_path = "%s/%s" % (STM_PATH,word)
			to_path = "%s/" % WM_PLACES_PATH
			
			os.system( "rsync -avrz %s %s" % (from_path,to_path) )
Example #26
def findVerb(sent):
    result = parse(sent,tokenize = True, tags = True, )
    sen = Sentence(result) 
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    print vlist
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    return vlist
Example #27
def get_NNPs(text, counts = False):
    '''Extract proper nouns from text.
    
    :param text: Text to parse
    :type text: str
    :param counts: Return counts for each extracted NNP
    :type counts: bool
    :returns: list -- List containing either only the extracted NNP's or (NNP, count) -pairs sorted by count.
    '''
    parsed_text = parse(text).split()
    nnps = [] 
    for sent in parsed_text:
        for word in sent:
            if word[1].startswith('NNP'):
                nnps.append(word[0])
                
    ctr = Counter(nnps)
    if counts:
        ctri = ctr.items()
        ctri = sorted(ctri, key = lambda x: x[1], reverse = True)
    else:
        ctri = ctr.keys()
    return ctri
    
    
Example #28
def getLemma(dico):

	#we get every chapter
	for chapter in dico:
		#We create the new part in it
		dico[chapter]["lemma"] = {}

		#Then we get every sentence
		for sentence in dico[chapter]["sentences"]:

			#We parse it
			temp = parse(dico[chapter]["sentences"][sentence], tokenize = True, tags = True, chunks = False, relations = False, lemmata = True, default = 'NN', light = True)

			#We split it
			temp = temp.split()
			for weird in temp:
				for temp2 in temp:
					for triple in temp2:
						key = sentence + " " + triple[0]
						dico[chapter]["lemma"][key] = (triple[1],triple[2])

				#print(each)

			#print(temp)
		#End of sentence loop

	#End of chapter loop

	#We return our data
	return dico
Example #29
 def test_match(self):
     # Assert Constraint-Word matching.
     R = search.Constraint.fromstring
     S = lambda s: Sentence(parse(s, relations=True, lemmata=True))
     W = lambda s, tag=None, index=0: search.Word(None, s, tag, index)
     for constraint, tests in (
       (R("cat|dog"),  [(W("cat"), 1), (W("dog"), 1), (W("fish"), 0)]),
       (R("cat*"),     [(W("cats"), 1)]),
       (R("*cat"),     [(W("tomcat"), 1)]),
       (R("c*t|d*g"),  [(W("cat"), 1), (W("cut"), 1), (W("dog"), 1), (W("dig"), 1)]),
       (R("cats|NN*"), [(W("cats", "NNS"), 1), (W("cats"), 0)]),
       (R("^cat"),     [(W("cat", "NN", index=0), 1),(W("cat", "NN", index=1), 0)]),
       (R("*|!cat"),   [(W("cat"), 0), (W("dog"), 1), (W("fish"), 1)]),
       (R("my cat"),   [(W("cat"), 0)]),
       (R("my cat"),   [(S("my cat").words[1], 1)]),  # "my cat" is an overspecification of "cat"
       (R("my_cat"),   [(S("my cat").words[1], 1)]),
       (R("cat|NP"),   [(S("my cat").words[1], 1)]),
       (R("dog|VP"),   [(S("my dog").words[1], 0)]),
       (R("cat|SBJ"),  [(S("the cat is sleeping").words[1], 1)]),
       (R("dog"),      [(S("MY DOGS").words[1], 1)]), # lemma matches
       (R("dog"),      [(S("MY DOG").words[1], 1)])): # case-insensitive
         for test, b in tests:
             self.assertEqual(constraint.match(test), bool(b))
     # Assert Constraint-Taxa matching.
     t = search.Taxonomy()
     t.append("Tweety", type="bird")
     t.append("Steven", type="bird")
     v = search.Constraint.fromstring("BIRD", taxonomy=t)
     self.assertTrue(v.match(W("bird")))
     self.assertTrue(v.match(S("tweeties")[0]))
     self.assertTrue(v.match(W("Steven")))
     print "pattern.search.Constraint.match()"
Example #30
    def normalize(self, text):
        """Normalizes a given string by:
            * singularizing any plurals.
            * getting the base form of any verb
            * eliminating all capitals"""

        if self.language == self.LANG_EN:
            from pattern.en import parse
        elif self.language == self.LANG_NL:
            from pattern.nl import parse
        elif self.language == self.LANG_DE:
            from pattern.de import parse
        elif self.language == self.LANG_ES:
            from pattern.es import parse
        elif self.language == self.LANG_UNKNOWN:
            # Don't do any parsing.
            return text.lower()
        else:
            raise Exception("Unsupported language: %s" % repr(self.language))

        parsed = parse(text, lemmata=True, chunks=False)
        parsed = [x for y in parsed.split() for x in y]  # Flatten
        normalized = map(lambda w: w[-1], parsed)
        normalized = filter(lambda w: w not in string.punctuation, normalized)
        normalized = ' '.join(normalized)
        return normalized
Example #31
def get_sentiment_bigrams(paths):
    bigrams_list = []

    for path in paths:
        f = open(path, "r+")
        review_text = f.read()
        f.close()

        sentences = review_text.split(".")

        for sentence in sentences:
            tagged_sentence = parse(sentence.lower()).split(" ")
            sentence_bigrams = get_bigrams(tagged_sentence)

            bigrams_list = combine_lists(bigrams_list, sentence_bigrams)

    return bigrams_list
Example #32
    def lemmatize(content,
                  allowed_tags=re.compile('(NN|VB|JJ|RB)'),
                  light=False,
                  stopwords=frozenset()):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming, taking word context into account.

        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        >>> lemmatize('The study ranks high.')
        ['study/NN', 'rank/VB', 'high/JJ']

        >>> lemmatize('The ranks study hard.')
        ['rank/NN', 'study/VB', 'hard/RB']

        """
        if light:
            import warnings
            warnings.warn("The light flag is no longer supported by pattern.")

        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith(
                        '_') and lemma not in stopwords:
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result


#endif HAS_PATTERN
Example #33
def lemmatize(content):
    """
	Use the English lemmatizer from `pattern` to extract tokens in
	their base form=lemma, e.g. "are, is, being" -> "be" etc.
	This is a smarter version of stemming, taking word context into account.

	Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    """
    content = u' '.join(utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if utils.ALLOWED_TAGS.match(tag):
                    result.append(lemma.encode('utf8'))
    return result
Example #34
 def test_group(self):
     # Assert Match groups.
     s = Sentence(parse("the big black cat eats a tasty fish"))
     m = search.search("DT {JJ+} NN", s)
     self.assertEqual(m[0].group(1).string, "big black")
     self.assertEqual(m[1].group(1).string, "tasty")
     # Assert nested groups (and syntax with additional spaces).
     m = search.search("DT { JJ { JJ { NN }}}", s)
     self.assertEqual(m[0].group(1).string, "big black cat")
     self.assertEqual(m[0].group(2).string, "black cat")
     self.assertEqual(m[0].group(3).string, "cat")
     # Assert chunked groups.
     m = search.search("NP {VP NP}", s)
     v = m[0].group(1, chunked=True)
     self.assertEqual(v[0].string, "eats")
     self.assertEqual(v[1].string, "a tasty fish")
     print("pattern.search.Match.group()")
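A compact sketch of the { } group syntax outside the test class (grounded in the assertions above; assumes pattern.en and pattern.search are installed):

# Usage sketch (assumption: pattern.en + pattern.search available).
from pattern.en import parse, Sentence
from pattern.search import search

s = Sentence(parse("the big black cat eats a tasty fish"))
for m in search("DT {JJ+} NN", s):
    print(m.group(1).string)   # "big black", then "tasty", per the test above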
Example #35
 def test_parse(self):
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase.
     v = en.parser.parse("The black cat sat on the mat.")
     self.assertEqual(v,
         "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \
         "sat/VBD/B-VP/O " + \
         "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
     )
     # 2) "the black cat" is the subject, "a fish" is the object.
     v = en.parser.parse("The black cat is eating a fish.", relations=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \
         "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \
         "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O"
     )
     # 3) "chasing" and "mice" lemmata are "chase" and "mouse".
     v = en.parser.parse("The black cat is chasing mice.", lemmata=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \
         "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \
         "mice/NNS/B-NP/O/mouse ././O/O/."
     )
     # 4) Assert unicode.
     self.assertTrue(isinstance(v, unicode))
     # 5) Assert unicode for faulty input (bytestring with unicode characters).
     self.assertTrue(isinstance(en.parse("ø ü"), unicode))
     self.assertTrue(isinstance(en.parse("ø ü", tokenize=True,  tags=False, chunks=False), unicode))
     self.assertTrue(isinstance(en.parse("ø ü", tokenize=False, tags=False, chunks=False), unicode))
     self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode))
     # 6) Assert optional parameters (i.e., setting all to False).
     self.assertEqual(en.parse("ø ü.", tokenize=True,  tags=False, chunks=False), u"ø ü .")
     self.assertEqual(en.parse("ø ü.", tokenize=False, tags=False, chunks=False), u"ø ü.")
     # 7) Assert the accuracy of the English tagger.
     i, n = 0, 0
     for sentence in open(os.path.join(PATH, "corpora", "tagged-en-penntreebank.txt")).readlines():
         sentence = sentence.decode("utf-8").strip()
         s1 = [w.split("/") for w in sentence.split(" ")]
         s2 = [[w for w, pos in s1]]
         s2 = en.parse(s2, tokenize=False)
         s2 = [w.split("/") for w in s2.split(" ")]
         for j in range(len(s1)):
             if s1[j][1] == s2[j][1].split("-")[0]:
                 i += 1
             n += 1
     #print float(i) / n
     self.assertTrue(float(i) / n > 0.945)
     print "pattern.en.parse()"
Example #36
 def getData(self, params):
     if self.now_cache is not None:
         if (self.now_cache +
                 datetime.timedelta(minutes=5)) < datetime.datetime.now():
             self.data_cache = None
             self.today_cache = None
             self.now_cache = None
     if self.data_cache is None:
         tweets = []
         for cand in candidates:
             tweets.append({
                 'tweets':
                 api.user_timeline(cand['user'], count=20),
                 'name':
                 cand['name'],
                 'party':
                 cand['party']
             })
         all_tweets = []
         for tweet_data in tweets:
             name = tweet_data['name']
             party = tweet_data['party']
             for tweet in tweet_data['tweets']:
                 all_tweets.append({
                     'Name': name,
                     'Tweet': tweet.text,
                     'Favorites': tweet.favorite_count,
                     'Retweets': tweet.retweet_count
                 })
         dfs = pd.DataFrame(all_tweets)
         sentiments = [sentiment(tweet) for tweet in dfs['Tweet']]
         dfs['Polarity'] = [sent[0] for sent in sentiments]
         dfs['Subjectivity'] = [sent[1] for sent in sentiments]
         modal = [
             modality(Sentence(parse(tweet, lemmata=True)))
             for tweet in dfs['Tweet']
         ]
         dfs['Certainty'] = modal
         today = date.strftime(datetime.datetime.now(),
                               format='%m/%d/%Y, %H:%M')
         now = datetime.datetime.now()
         self.data_cache = dfs
         self.today_cache = today
         self.now_cache = now
     return self.data_cache
Example #37
def corrupt_remove_prep(sent):
    # parse
    P = parse(sent)
    parsed = sum(P.split(), [])

    # chunk
    new = []
    for word in parsed:
        chunktag = word[3]
        if chunktag != 'B-PNP':
            new.append(word[0])

    corr = ' '.join(new)

    if sorted(tokenize(sent)) == sorted(tokenize(corr)):
        return None

    return corr
Example #38
def team_sentiment_analysis(stats):
	for s in stats.sentences:
		this_sentiment = sentiment(s)
		polarity = float("{0:.2f}".format(this_sentiment[0]))
		subjectivity = float("{0:.2f}".format(this_sentiment[1]))
		polarity_10 = float("{0:.1f}".format(this_sentiment[0]))
		subjectivity_10 = float("{0:.1f}".format(this_sentiment[1]))
		stats.polarity_counts[polarity] += 1
		stats.subjectivity_counts[subjectivity] += 1
		stats.polarity_counts_10s[polarity_10] += 1
		stats.subjectivity_counts_10s[subjectivity_10] += 1

		s = Sentence(parse(s, lemmata=True))
		stats.mood_counts[mood(s)] += 1
		rounded_modality = float("{0:.2f}".format(modality(s)))
		rounded_modality_10 = float("{0:.1f}".format(modality(s)))
		stats.modality_counts[rounded_modality] += 1
		stats.modality_counts_10s[rounded_modality_10] += 1
Example #39
    def calculate_phrase_sentiment(self, phrases):
        # print "Rating phrases sentiment..."
        valence_list = []
        arousal_list = []
        for p in phrases:
            pol = sentiment(p)[0]
            sent = parse(p, lemmata=True)
            mod = modality(Sentence(sent))
            print mod
            valence_list.append(10 * pol)
            arousal_list.append(5 * mod)

        valence = max(valence_list)
        arousal = max(arousal_list)

        print "Valence: " + str(valence)
        print "arousal: " + str(arousal)
        return ((valence, arousal))
Example #40
def getBroken(word):
    global duplicateWordForms
    #     print('in get broken. word: ', word)
    ''' returns word parsed into array of caps chars, word root, and POS tag if any'''
    if word.isspace():
        return [word]
    returner = []
    if word in words_to_ignore:
        return getCapsChars(word) + [word.lower()]
    patternParsedList = parse(word, relations=True, lemmata=True).split()[0]
    for patternParsed in patternParsedList:
        word, root, pos = postParse(patternParsed)
        capsChars = []
        wordLower = word.lower()
        wordHasCaps = not word.islower()
        wordLower = word.lower() if wordHasCaps else word
        #         print("wordHasCaps", wordHasCaps)
        #         print(word, root, pos)
        if wordHasCaps:
            capsChars = getCapsChars(word)
#         if pos in unsupportedPoss:
#             returner += capsChars + [wordLower]
        if root == wordLower:
            ''' this means there are no POS tags we need to keep '''
            returner += capsChars + [wordLower]
        else:
            if root in roots_to_ignore or "'" in word or "‘" in word or "’" in word:
                returner += capsChars + [
                    wordLower
                ]  #was, were, am, are -- these words get tokenized/untokenized unreliably. :(
            else:
                useParsed = True
                key = root + pos
                if key in m_rootPos_word:
                    value = m_rootPos_word[key]
                    if value != wordLower:
                        duplicateWordForms += [(key, value, wordLower)]
                        useParsed = False
                if useParsed:
                    m_rootPos_word[root + pos] = wordLower
                    returner += capsChars + [root, pos]
                else:
                    returner += capsChars + [wordLower]
    return returner
Example #41
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(
         en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(
         en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0]
         < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = []
     for score, review in Datasheet.load(
             os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
         reviews.append((review, int(score) > 0))
     from time import time
     t = time()
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print A, P, R, F
     self.assertTrue(A > 0.753)
     self.assertTrue(P > 0.768)
     self.assertTrue(R > 0.725)
     self.assertTrue(F > 0.746)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     reviews = []
     for score, review in Datasheet.load(
             os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print A, P, R, F
     self.assertTrue(A > 0.654)
     self.assertTrue(P > 0.660)
     self.assertTrue(R > 0.636)
     self.assertTrue(F > 0.648)
     print "pattern.en.sentiment()"
Example #42
def mood(sentence, **kwargs):
    """Returns IMPERATIVE (command), CONDITIONAL (possibility), SUBJUNCTIVE
    (wish) or INDICATIVE (fact)."""
    if isinstance(sentence, basestring):
        try:
            # A Sentence is expected but a string given.
            # Attempt to parse the string on-the-fly.
            from pattern.en import parse, Sentence
            sentence = Sentence(parse(sentence))
        except ImportError:
            pass
    if imperative(sentence, **kwargs):
        return IMPERATIVE
    if conditional(sentence, **kwargs):
        return CONDITIONAL
    if subjunctive(sentence, **kwargs):
        return SUBJUNCTIVE
    else:
        return INDICATIVE
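Because mood() parses a plain string on the fly (as shown above), it can be called directly on raw text; a small sketch, assuming pattern.en is installed:

# Usage sketch (assumption: pattern.en is importable; a bare string is parsed on the fly).
from pattern.en import mood

print(mood("Do your homework!"))           # imperative, per test_mood above
print(mood("The weather is nice today."))  # indicative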
Example #43
def getPosTag(in_filename, out_filename):    
     #open files for input/output, fhnd[0] - in, fhnd[1] - out
    fhnd = open_files(in_filename, out_filename)
    
    tagged_line = ""
    
    from pattern.en import parse
    
    for line in fhnd[0]:
        word_list = line.split() #line to list
        for word in word_list:
            tagged_line += " " + parse(word, relations=False, lemmata=False)
        
        fhnd[1].write(tagged_line + "\n")
        tagged_line = ""
        
        
      
    return 0
Example #44
def pos_counts(sentence, ngram=4):
	counts = {}
	sentence = " ".join(sentence.strip().split())
	words = sentence.split();
	if len(words) < ngram:
		return counts

	pos = parse(sentence, chunks=False).split()[0]
	pos = filter(lambda x: re.match('^[\w-]+$', x[1]) is not None, pos)
	pos = [x[1] for x in pos]

	for w in range(len(pos) - ngram + 1):
		postag = "|".join(pos[w:w + ngram])
		if postag in counts:
			counts[postag] = counts[postag] + 1
		else:
			counts[postag] = 1

	return counts
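A quick call sketch for pos_counts() as defined above; the keys are '|'-joined POS n-grams (the input sentence is hypothetical, and the exact keys depend on the tagger):

# Usage sketch (hypothetical input; exact keys and counts depend on the POS tagger).
counts = pos_counts("The black cat sat on the mat and the old dog slept on the rug.")
for key, n in sorted(counts.items()):
    print("%s\t%d" % (key, n))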
Example #45
 def test_search(self):
     # Assert one match containing all words.
     v = search.Pattern.fromstring("*+")
     v = v.search("one two three")
     self.assertEqual(v[0].string, "one two three")
     # Assert one match for each word.
     v = search.Pattern.fromstring("*")
     v = v.search("one two three")
     self.assertEqual(v[0].string, "one")
     self.assertEqual(v[1].string, "two")
     self.assertEqual(v[2].string, "three")
     # Assert all variations are matched (sentence starts with a NN* which must be caught).
     v = search.Pattern.fromstring("(DT) JJ?+ NN*")
     v = v.search(Sentence(parse("dogs, black cats and a big white rabbit")))
     self.assertEqual(v[0].string, "dogs")
     self.assertEqual(v[1].string, "black cats")
     self.assertEqual(v[2].string, "a big white rabbit")
     v = search.Pattern.fromstring("NN*")
     print "pattern.search.Pattern.search()"
Example #46
    def lemmatize(content,
                  light=False,
                  allowed_tags=re.compile('(NN|VB|JJ|RB)')):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming. Only consider nouns, verbs, adjectives
        and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        From http://www.clips.ua.ac.be/pages/pattern-en#parser :

            The parser is built on a Brill lexicon of tagged words and rules to
            improve the tags context-wise. With light=False, it uses Brill's contextual
            rules. With light=True it uses Jason Wiener's simpler ruleset. This
            ruleset is 5-10x faster but also 25% less accurate.

        """
        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        # use simpler, modified pattern.text.en.text.parser.parse that doesn't
        # collapse the output at the end: https://github.com/piskvorky/pattern
        parsed = parse(content, lemmata=True, collapse=False, light=light)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result


#endif HAS_PATTERN
Example #47
    def add_keywords(self, phrase):

        sent = en.Sentence(en.parse(phrase))
        nouns = search('NN', sent)
        self.blackboard.pool.nouns.update(
            set(Word(en.singularize(n[0].string)) for n in nouns))
        adjs = search('JJ', sent)
        self.blackboard.pool.adjectives.update(
            set(Word(en.lemma(a[0].string)) for a in adjs))

        try:
            nps = search('NP', sent)
            for np in nps:
                self.blackboard.pool.epithets.update({
                    Word(en.singularize(w.string), "NN"):
                    [Word(jj.string, "JJ") for jj in np if "JJ" in jj.tag]
                    for w in np if "NN" in w.tag
                })
        except IndexError:
            pass
Example #48
def testTokenize():
    s = "I eat pizza with a fork."

    s = "B.S. in Computer Science, a related degree or its equivalent "
    s = "What's this? This is a book."

    s = "Bachelor's degree in Computer Science or equivalent"
    s = "Bachelor’s degree in Computer Science or equivalent"

    s = parse(
        s,
        tokenize=True,  # Tokenize the input, i.e. split punctuation from words.
        tags=False,  # Find part-of-speech tags.
        chunks=
        False,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations=False,  # Find relations between chunks.
        lemmata=False,  # Find word lemmata.
        light=False)

    print s.split()
Example #49
def sentiment(content):
    from pattern.en import parse, split, wordnet
    wordnet.sentiment.load()
    relevant_types = [
        'JJ',
        'VB',
        'VBD',
        'VBN',
        'VBG',
        'RB',
    ]
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))
    #return 1 if score >= 0 else -1
    return score
Example #50
def extract(statement):

  s = Sentence(parse(statement, lemmata=True))

  '''c1 = Constraint.fromstring("There be DT")
  c2 = Constraint.fromstring("NN+")
  c3 = Constraint.fromstring("(DT)")
  c4 = Constraint.fromstring("(RB) (JJ) NNP+")
  c5 = Constraint.fromstring("(call) (DT)")
  c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+")
  p = Pattern(sequence=[c1, c2, c3, c4, c5, c6]) 
 
  match = p.search(s)
   '''
  s = find_entities(s)
   
   # not sure about this "be" thing - happy to match plural (is/are) but not sure about past tense ...
  match = search(MATCH_STRING, s)
  #raise Exception(match)
  return s, match
Example #51
def tagSentence(sent):
    result = parse(
        sent,
        tokenize=True,  # Tokenize the input, i.e. split punctuation from words.
        tags=True,  # Find part-of-speech tags.
        chunks=
        False,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations=False,  # Find relations between chunks.
        lemmata=False,  # Find word lemmata.
        light=False)
    #    pprint(result)

    array = str(result).split(" ")
    tokens = []
    posTags = []
    for a in array:
        b = a.split("/")
        tokens.append(b[0])
        posTags.append(b[1])

#  print tokens
#  print posTags
    return (tokens, posTags)
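For comparison, pattern.en also exposes a tag() helper that returns (token, POS) pairs directly (it is imported alongside parse in Example #25 above), which sidesteps the manual string splitting done in tagSentence(); a minimal sketch:

# Usage sketch (assumption: pattern.en.tag is available, as imported in Example #25).
from pattern.en import tag

for token, pos in tag("I eat pizza with a fork."):
    print("%s/%s" % (token, pos))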
Example #52
    def lemmatize(content):
        """
        Use the English lemmatizer from the `pattern` package to extract tokens in
        their base form (lemmas: "are, is, being"->"be" etc.).
        This is a smarter version of stemming.
        """
        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        # use simpler, modified pattern.text.en.text.parser.parse that doesn't
        # collapse the output at the end: https://github.com/piskvorky/pattern
        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if ALLOWED_TAGS.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result
Example #53
def convert_pattern_format(text):
    """
    Text is parsed through pattern's parsing function into a standardized format.
    """
    parsed_text = []
    # parse text via Pattern's parser
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Patterns tags for each word in the sentence are stored in a new Word-object
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # each word is appended to a Sentence-object
            s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
Example #54
def test_parse():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint 
    
   
    sent = "Experience with mobile application development a plus: iPhone/iPad, Android, or Blackberry."
    sent = "3+ years web software development experience."
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ."      
    
    result = parse(sent,
         tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
             tags = True,  # Find part-of-speech tags.
           chunks = True,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
          lemmata = True,  # Find word lemmata.
            light = True)
    pprint(result) 
Example #55
def getLemma(dico):

    #we get every chapter
    for chapter in dico:
        #We create the new part in it
        dico[chapter]["lemma"] = {}

        #Then we get every sentence
        for sentence in dico[chapter]["sentences"]:

            #We parse it
            temp = parse(dico[chapter]["sentences"][sentence],
                         tokenize=True,
                         tags=True,
                         chunks=False,
                         relations=False,
                         lemmata=True,
                         default='NN',
                         light=True)

            #We split it
            temp = temp.split()
            for weird in temp:
                for temp2 in temp:
                    for triple in temp2:
                        key = sentence + " " + triple[0]
                        dico[chapter]["lemma"][key] = (triple[1], triple[2])

                #print(each)

            #print(temp)
        #End of sentence loop

    #End of chapter loop

    #We return our data
    return dico
Example #56
 def test_parse(self):
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase.
     v = en.parser.parse("The black cat sat on the mat.")
     self.assertEqual(v,
         "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \
         "sat/VBD/B-VP/O " + \
         "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
     )
     # 2) "the black cat" is the subject, "a fish" is the object.
     v = en.parser.parse("The black cat is eating a fish.", relations=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \
         "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \
         "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O"
     )
     # 3) "chasing" and "mice" lemmata are "chase" and "mouse".
     v = en.parser.parse("The black cat is chasing mice.", lemmata=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \
         "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \
         "mice/NNS/B-NP/O/mouse ././O/O/."
     )
     # 4) Assert unicode.
     self.assertTrue(isinstance(v, unicode))
     # 5) Assert unicode for faulty input (bytestring with unicode characters).
     self.assertTrue(isinstance(en.parse("ø ü"), unicode))
     self.assertTrue(
         isinstance(
             en.parse("ø ü", tokenize=True, tags=False, chunks=False),
             unicode))
     self.assertTrue(
         isinstance(
             en.parse("ø ü", tokenize=False, tags=False, chunks=False),
             unicode))
     self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode))
     # 6) Assert optional parameters (i.e., setting all to False).
     self.assertEqual(
         en.parse("ø ü.", tokenize=True, tags=False, chunks=False),
         u"ø ü .")
     self.assertEqual(
         en.parse("ø ü.", tokenize=False, tags=False, chunks=False),
         u"ø ü.")
     print "pattern.en.parser.parse()"
Example #57
def get_word_types(words):
    """Determine the occurrences of pos types.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results.
    """
    new_arr = []
    for val in words:
        try:
            val = parse(val,
                        encoding='utf-8',
                        tokenize=False,
                        light=False,
                        tags=True,
                        chunks=False,
                        relations=False,
                        lemmata=False)
            new_arr.append(val)
        except IndexError:
            continue
    return {'data': new_arr, 'summary': None}
Example #58
    def process(self, message):
        # print pattern_en.suggest(message) -- suggestions
        if message == ">!train":
            self.train()
            return "It is nice to learn new stuff."
        if message == ">!forget":
            memory.clear()
            return "I am reborn. So much free space :) maybe you will use files to store memory and not RAM..."
        if message == ">!load_page":
            if sessionId not in memory:
                response = "Hello! My name is Chad and I am passionate about music."
                response += "We can share our experiences and maybe we can get along."
                response += "Would you mind telling me your name first?"
                expect[sessionId] = "name"
                memory[sessionId] = dict()
            else:
                response = "Welcome back!"
                search.search("new songs")
                with open('results.json') as data_file:
                    data = json.load(data_file)
                    for i in range(10):
                        if 'musicrecording' in data['items'][i]['pagemap']:
                            mr = data['items'][i]['pagemap']['musicrecording']
                            which = random.randint(0, len(mr) - 1)
                            if 'name' not in mr[which]:
                                response += " Did you know that " + mr[which][
                                    'byartist'] + " has released a new song?"
                            else:
                                response += " You can check out this cool song, " + mr[which]['name'] + ", by " + \
                                            mr[which]['byartist']
            return response

        s = nlp.get_sentences(message)

        doc = spacy_nlp(message)
        for w in doc:
            print "(", w, w.dep_, w.pos_, w.head, ")"

        aiml_sent_type = []
        aiml_responses = []
        memory_responses = []
        sentence_types = []
        emotions = []

        for sentence in s:
            sentence_type = self.instant_classifier.classify(
                dialogue_act_features(sentence))

            sentence_types.append(sentence_type)

            polarity, subjective = pattern_en.sentiment(sentence)
            sent = pattern_en.parse(sentence, lemmata=True)
            sent = pattern_en.Sentence(sent)
            modality = pattern_en.modality(sent)
            mood = pattern_en.mood(sent)

            # Map polarity to an emotion label, checking the most extreme
            # thresholds first so that every branch can fire.
            if polarity > 0.8:
                emotions.append("SUPER HAPPY")
            elif polarity > 0.4:
                emotions.append("COOL")
            elif polarity > 0.3:
                emotions.append("GOOD SURPRISE")
            elif polarity < -0.7:
                emotions.append("ANGER")
            elif polarity < -0.4:
                emotions.append("FEAR")
            elif polarity < -0.1:
                emotions.append("SAD")
            else:
                emotions.append("NEUTER")

            print(sentence_type, polarity, subjective, modality, mood)

            if sentence_type not in ["whQuestion", "ynQuestion"]:
                try:
                    aiml_sent_type_res = self.kernel.respond(
                        sentence_type, sessionId)
                except:
                    aiml_sent_type_res = ""
                aiml_sent_type.append(aiml_sent_type_res)

            verbs_subj = set()
            sentence = sentence[0].upper() + sentence[1:]
            doc = spacy_nlp(sentence)
            for possible_subject in doc:
                if (possible_subject.dep == nsubj or possible_subject.dep
                        == nsubjpass) and possible_subject.head.pos == VERB:
                    verbs_subj.add((possible_subject, possible_subject.head))

            try:
                aiml_response = self.kernel.respond(sentence, sessionId)
            except:
                aiml_response = ""
            aiml_responses.append(aiml_response)

            # MEMORY MODULE
            memory_msg = ""
            if sentence_type == "Statement":
                # insert into memory
                for i in verbs_subj:
                    subjs = []
                    subjects = [i[0]]
                    for tok in i[0].children:
                        if tok.dep == conj:
                            subjects.append(tok)

                    for subj in subjects:
                        predec = ""
                        for tok in subj.children:
                            if tok.dep_ == "poss" or tok.dep == amod:
                                predec += tok.lower_
                        if len(predec) > 0:
                            subjs.append(predec + " " + subj.lower_)
                        else:
                            subjs.append(subj.lower_)

                    vb = i[1].lower_
                    if vb not in memory[sessionId]:
                        memory[sessionId][vb] = dict()
                    for subj in subjs:
                        for c in i[1].children:
                            if c.dep in [prep]:
                                memory[sessionId][vb][subj] = c.lower_ + " "
                                for c_prep in c.children:
                                    if c_prep.dep in [dobj, pobj, attr]:
                                        memory[sessionId][vb][
                                            subj] += c_prep.text
                                        memory_responses.append(
                                            self.kernel.respond(
                                                "memorate", sessionId))
                            elif c.dep in [dobj, pobj, attr]:
                                memory[sessionId][vb][subj] = c.text
                                memory_responses.append(
                                    self.kernel.respond("memorate", sessionId))
            elif sentence_type == "whQuestion":
                for i in verbs_subj:
                    subjs = []
                    subjects = [i[0]]
                    for tok in i[0].children:
                        if tok.dep == conj:
                            subjects.append(tok)

                    for subj in subjects:
                        predec = ""
                        for tok in subj.children:
                            if tok.dep_ == "poss" or tok.dep == amod:
                                predec += tok.lower_
                        if len(predec) > 0:
                            subjs.append(predec + " " + subj.lower_)
                        else:
                            subjs.append(subj.lower_)

                    max_similarity = 0
                    verb = i[1].lower_
                    for j in memory[sessionId]:
                        p_word = spacy_nlp(j)
                        similarity = i[1].similarity(p_word[0])
                        if similarity > max_similarity:
                            max_similarity = similarity
                            verb = j
                    if max_similarity > 0.5 and verb in memory[sessionId]:
                        num_subjs = len(subjs)
                        memory_msg = ""
                        for subj in subjs:
                            if subj in memory[sessionId][verb]:
                                toks = nlp.tokenize_text(subj)
                                memory_msg = ""
                                for t in toks:
                                    if t in first_person:
                                        memory_msg += pron_translate[t] + " "
                                    else:
                                        memory_msg += t + " "
                                num_subjs -= 1
                                if num_subjs > 2:
                                    memory_msg += ", "
                                elif num_subjs == 1:
                                    memory_msg += "and "
                        if len(memory_msg) > 0:
                            memory_msg += verb + " "
                            if num_subjs != len(subjs):
                                memory_msg += memory[sessionId][verb][
                                    subjs[-1]] + "."
            memory_responses.append(memory_msg)

        arr_response = []

        for i in aiml_sent_type:
            if len(i) > 0:
                arr_response.append(i)

        for i in aiml_responses:
            if len(i) > 0:
                arr_response.append(i)

        for i in memory_responses:
            if len(i) > 0:
                arr_response.append(i)

        if len(arr_response) == 0:
            data = search.search(message)
            snip = data['items'][0]['snippet']
            sents = nlp.get_sentences(snip)
            arr_response.append(sents[0])

        response = ""

        for i in emotions:
            try:
                emoi = self.kernel.respond(i, sessionId)
            except:
                emoi = None
            if emoi is not None:
                if random.randint(0, 100) < 50:
                    response += " " + emoi + "."
                    break

        for res in arr_response:
            if len(res) > 1:
                response += res + " "

        # generic response, if no response
        restoks = nlp.tokenize_text(response)
        if len(restoks) == 0:
            idx = random.randint(0, len(sentence_types) - 1)
            try:
                aiml_response = self.kernel.respond(sentence_types[idx],
                                                    sessionId)
            except:
                aiml_response = ""
            response += aiml_response

        # polarity, subjective = pattern_en.sentiment(response)
        # sent = pattern_en.parse(sentence, lemmata=True)
        # sent = pattern_en.Sentence(sent)
        # modality = pattern_en.modality(sent)
        # mood = pattern_en.mood(sent)
        # sentence_type = self.instant_classifier.classify(dialogue_act_features(response))
        # print response, polarity, subjective, modality, mood

        return response
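The memory module above hinges on pulling (subject, verb) pairs out of spaCy's dependency parse before anything is stored or recalled. A standalone sketch of just that step, assuming an installed English spaCy model; the model name and helper name are illustrative, not part of the bot.

# Standalone sketch of the (subject, verb) extraction used above.
# Assumes an English spaCy model is installed; the model name is illustrative.
import spacy
from spacy.symbols import nsubj, nsubjpass, VERB

spacy_nlp = spacy.load("en_core_web_sm")

def subject_verb_pairs(text):
    pairs = set()
    for token in spacy_nlp(text):
        # Keep tokens that act as (passive) subjects of a verb.
        if token.dep in (nsubj, nsubjpass) and token.head.pos == VERB:
            pairs.add((token.text, token.head.lemma_))
    return pairs

# subject_verb_pairs("My brother plays guitar.")  # expected: {("brother", "play")}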
Пример #59
0
# It does not use modal verbs such as "could" and "would":
# "You could eat your dinner!" is not a command but a bubbly suggestion.

# We can create a pattern that scans for infinitive verbs (VB),
# and use "!" to exclude certain words:
# "!could|!would|!should|!to+ VB" = infinitive not preceded by modal or "to".
# This works fine except in one case: if the sentence starts with a verb.
# So we need a second rule "^VB" to catch this.
# Note that the example below contains a third rule: "^do|VB*".
# This catches all sentences that start with a "do" verb, regardless of whether it is infinitive,
# because the parser sometimes tags infinitive "do" incorrectly.


from pattern.en import parse, Sentence
from pattern.search import match


def imperative(sentence):
    for p in ("!could|!would|!should|!to+ VB", "^VB", "^do|VB*"):
        m = match(p, sentence)
        # Exclude questions: only sentences ending in "." or "!" count.
        if m and sentence.string.endswith((".", "!")):
            return True
    return False


for s in ("Just stop it!", "Look out!", "Do your homework!",
          "You should do your homework.", "Could you stop it.",
          "To be, or not to be."):
    s = parse(s)
    s = Sentence(s)
    print(s)
    print(imperative(s))
    print("")
Пример #60
0
 def test_chunk_modifiers(self):
     # Assert list of nearby adjectives and adverbs with no role, for VP.
     v = en.Sentence(en.parse("Perhaps you should go."))
     self.assertEqual(v.chunk[2].modifiers, [v.chunk[0]]) # should <=> perhaps
     print("pattern.en.Chunk.modifiers")