Example #1
File: frames.py Project: rsteckel/EDA
def print_feature(sentence):    
    ptree = parsetree(sentence) #, relations=True, lemmata=True)
    #It matches anything from food to cat food, tasty cat food, the tasty cat food, etc.
    t = parsetree('tasty cat food')
    matches = search('DT? RB? JJ? NN+', ptree)
    for match in matches:
        print match
    print '\n'
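The comment above describes pattern.search's chunk-pattern syntax: 'DT? RB? JJ? NN+' matches an optional determiner, adverb and adjective followed by one or more nouns. A minimal self-contained sketch on an assumed sample sentence (not from the original project):

from pattern.en import parsetree
from pattern.search import search

ptree = parsetree('I bought the tasty cat food at the store')
for match in search('DT? RB? JJ? NN+', ptree):
    print(match.string)  # e.g. 'the tasty cat food', 'the store'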
Example #2
def find_sentences(dealiased_book_path):
    """Given a text it extractes phrases and sentences."""
    dealiased_book = open(dealiased_book_path, "r", encoding='utf8')
    book = dealiased_book.read()
    dealiased_book.close()
    sss1 = parsetree(book, relations=True, lemmata=True)
    chunk_phrases = []
    sentenced_chunks = []
    sent_chunks = []
    chunks = []
    STOP = stopwords.words('english') + list(string.punctuation)
    for sentence in sss1:
        sentenced_chunks.append(sentence.chunks)
        stchk = []
        for chunk in sentence.chunks:
            chnks = (chunk.type, [(w.string, w.type) for w in chunk.words])
            ch_str = [
                w.string for w in chunk.words
                if len(w.string) > 2 and w.string not in STOP
            ]
            chunk_phrases.append(' '.join(ch_str))
            chunks.append(chnks)
            stchk.append(chnks)
        sent_chunks.append(stchk)

    return chunk_phrases, sent_chunks
Example #3
def find_causal_matches(unicode_string, causal_pattern, pattern_order):
    # Description: Searches text string and returns all cause-effect
    #              relationships based on specified pattern.
    # Inputs: unicode_string, raw text in Unicode format for Python 3
    #         causal_pattern, regex defining specific causal statement pattern
    #         pattern_order, specifying which noun phrase is cause or effect
    # Outputs: List of causal tuples [(cause, effect), ...] or empty list []

    # Initialize causal_tuple_list as empty list
    causal_tuple_list = []

    # Convert string to Pattern parsed text (with POS tags)
    t = parsetree(unicode_string, lemmata=True)

    # possible_matches is a list of all Pattern matches, given text and pattern
    possible_matches = search(causal_pattern, t, lemmata=True)

    # Add causal matches as tuples (cause, effect) to causal_tuple_list
    # Note, if possible_matches=[], there are no matches
    if possible_matches != []:
        # Extract cause-effect tuples and add to causal_tuple_list
        causal_tuple_list = extract_cause_effect_tuple(possible_matches,
                                pattern_order)

    final_causal_tuple_list = []

    for causal_tuple in causal_tuple_list:
        if (causal_tuple[0] in unicode_string) and (causal_tuple[1] in unicode_string):
            final_causal_tuple_list.append(causal_tuple)

    return(final_causal_tuple_list)
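For reference, a minimal self-contained sketch of how the match groups behave for a causal pattern in pattern.search; the sentence and pattern below are illustrative assumptions, and extract_cause_effect_tuple remains project-specific:

from pattern.en import parsetree
from pattern.search import search

t = parsetree(u'The virus causes a severe fever.')
for m in search('{NP} causes {NP}', t):
    print(m.group(1).string)  # candidate cause, e.g. 'The virus'
    print(m.group(2).string)  # candidate effect, e.g. 'a severe fever'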
Example #4
 def step_to_computer_adjs(string):
     try:
         parse = parsetree(string, relations=True)[0]
         return ' '.join(a.string for a in parse.adjectives
                         if a.string.isalpha())
     except IndexError:
         return ''
Example #5
def verbConjugate(lemma, rel, aan):
    relAvoid = [
        "/r/CapableOf", "/r/PartOf", "/r/MemberOf"
        "/r/IsA", "/r/HasA", "/r/TranslationOf", "/r/HasProperty"
    ]
    if not rel in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb in ["to", "can"]:
                result = lemma

        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma

    else:
        result = lemma

    return result
Example #6
 def dialogue_act_features(self, post):
     stop = nltk.word_tokenize(post)
     post = []
     for i in stop:
         if i not in self.stopwords:
             post.append(i)
         else:
             pass
             #regect_list.append(i)
     posts = ""
     for i in post:
         posts += i
         posts += " "
     processed = parsetree(posts, relations=True, lemmata=True)
     features = {}
     for sents in processed:
         x = sents
         for i in x.chunks:
             j = i.pos
             if j == "VP":
                 tense = self.tenseses(i.string)
             else:
                 tense = ""
             h = i.words
             for words in h:
                 apd_str = str(words.lemma) + "-" + str(words.pos)
                 #+ "-" +tense
                 #if words.pos[:1] == "NN" or words.pos[:1] == "VP":
                 features['word({})'.format(str(words.lemma))] = str(
                     words.pos)
             #pos['features({})'.format(j)] = True
         return features
Example #7
def get_pattern_data(search_param):

    twitter = Twitter(language='en')

    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))

    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

    #if len(g)>0:
    #   g = g.split()[0] # Largest subgraph.

    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)

    g.export('data', directed=False, weighted=0.6)
Example #8
def retrieve_top_freq_noun_phrases_fr_file(target_file, phrases_num_limit, top_cut_off, saveoutputfile = ''):
    """ Retrieve the top frequency words found in a file. Limit to noun phrases only.
        Stop word is active as default.
        Args:
            target_file (str): filepath as str.
            phrases_num_limit (int):  the max number of phrases. if 0, return all
            top_cut_off (int): for return of the top x phrases.
        Kwargs:
            saveoutputfile (str): if saveoutputfile not null, save to target location.
        Returns:
            (list) : just the top phrases.
            (list of tuple): phrases and frequency

    """
    with open(target_file, 'r') as f:
        webtext =  f.read()

    t = parsetree(webtext, lemmata=True)

    results_list = get_noun_phrases_fr_text(t, phrases_num_limit = phrases_num_limit, stopword_file = r'C:\pythonuserfiles\google_search_module_alt\stopwords_list.txt')

    #get the frequency of each phrase in the list
    counts = Counter(results_list)
    phrases_freq_list = counts.most_common(top_cut_off) #keep only the top phrases
    most_common_phrases_list = [n[0] for n in phrases_freq_list]

    if saveoutputfile:
        with open(saveoutputfile, 'w') as f:
            for (phrase, freq) in phrases_freq_list:
                temp_str = phrase + ' ' + str(freq) + '\n'
                f.write(temp_str)
            
    return most_common_phrases_list, phrases_freq_list
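A usage sketch with assumed arguments (the input file, and the get_noun_phrases_fr_text helper with its stop-word list path, come from the original project):

top_phrases, phrase_freqs = retrieve_top_freq_noun_phrases_fr_file('article.txt', phrases_num_limit=0, top_cut_off=20)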
Example #9
def main():
    db_file = sys.argv[1]
    # Connect to the sqlite database
    conn = sqlite3.connect(db_file)
    c = conn.cursor()

    #Get all the lines and for each line fill the phrases
    line_list = c.execute("SELECT id, text FROM line order by id;").fetchall()
    print "Done with fetchall"
    phrase_id = 0
    for line in line_list:
        line_id, text = line
        #Parse the sentence and break it into phrases
        res = parsetree(text, relations = True)
        #Identify the phrases
        sentence = res.sentences[0]
        phrases = sentence.phrases
        for phrase in phrases:
            #Count the length of the words
            if len(phrase.words) >= 2:
                text = unicode(phrase)
                c.execute("INSERT INTO phrase (id, text, line_id) VALUES \
                    (?, ?, ?)", (phrase_id, text, line_id))
                phrase_id += 1
    print phrase_id
    conn.commit()
    conn.close()
    print "Finished"
Example #10
def match_help_to_element_NLP(elements, text):
    placeholder_elements_dict = get_pe_dict(elements)
    placeholders = placeholder_elements_dict.keys()
    t = parsetree(text)
    # pprint(t)
    for sen in t:
        chunks = filter(lambda x: (x.type == 'NP'), sen.chunks)
        for chunk in chunks:
            words = filter(lambda x: (x.type.startswith('NN')), chunk.words)
            for w in words:
                for p in placeholders:
                    p_t = parsetree(p)
                    p_words = [i.string for i in p_t.words]
                    if w.string.lower() in p_words:
                        return (placeholder_elements_dict[p])
    return (None)
Example #11
def get_objects(text, verbs):
    """
    Given a passage of text and a list of verbs, yields all noun phrases
    which are at any point the object of any of those verbs.

    >>> list(get_objects("I'm explaining politics to my dad", ["explaining"]))
    ["politics"]

    >>> list(get_objects("I'm talking to my dad about politics", ["talking to"]))
    ["my dad"]
    """
    tree = parsetree(text)

    for sentence in tree:
        matching_verb_phrase_indices = [
            idx for idx, chunk in enumerate(sentence.chunks[:-1])
            if chunk.type == 'VP'
            and chunk.words[-1].string in verbs
        ]

        for idx in matching_verb_phrase_indices:
            chunk = sentence.chunks[idx+1]
            if chunk.type == 'NP':
                if accept_noun_phrase(chunk):
                    yield " ".join([w.string for w in chunk.words])
Example #12
def fix_caption(str):
    s = parsetree(str, lemmata=True)
    string = ''
    for sentence in s:
        if "and a" in str:
            string = str+' '
        else:
            for i, chunk in enumerate(sentence.chunks):
                if chunk.type == 'VP' and len(chunk) == 2:
                    verb = chunk[1].string
                    string += lexeme(verb)[1]+' '
                else:
                    for j, w in enumerate(chunk.words):
                        if i == 0 and j == 0 and (w.string == 'a' or w.string == 'A'):
                            print('chunk', chunk)
                            pass
                        else:
                            string = string + w.string+' '

    string = string[:1].upper() + string[1:-1]
    if string.startswith('A'):
      string = string[2].upper() + string[3:]
    if string.endswith('.'):
      string = string[:-1]
    return string
Example #13
def dramatize(s):
    """ Returns a string with stronger adjectives:
        dramatize("This code is nice") => "This code is legendary"
    """

    x = []

    # A parse tree takes a string and returns a list of sentences,
    # where each sentence is a list of words, where each word is an
    # object with interesting attributes such as Word.tag.
    for sentence in parsetree(s):
        for word in sentence:
            replaced = False
            if word.tag == "JJ":

                # What's the polarity of this adjective?
                polarity = sentiment(word.string)[0]

                # Don't change neutral adjectives like "last", "political", ...
                if polarity != 0.0:

                    # Can we find an adjective in our dictionary
                    # with a more extreme polarity?
                    # Note: the shuffled() function takes a list
                    # and returns a new, randomly ordered list.
                    for w, p in shuffled(adjectives.items()):
                        if polarity >= 0 and p > polarity + 0.2 \
                        or polarity <  0 and p < polarity - 0.2:
                            x.append(w.lower())
                            replaced = True
                            break
            if not replaced:
                x.append(word.string)

    return " ".join(x)
Example #14
def find_all_matches_by_ziyu(text, the_pattern):
    tree = parsetree(text, lemmata=True)
    results = search(the_pattern, tree, STRICT)
    output = []
    for match in results:
        word_list = []
        for word in match:
            word_list.append(word.string)
        sentence = " ".join(word_list)
        output.append(sentence)
    
    # gen_num = 0
    # if len(output) > 0 and len(output)<2:
    #     gen_num=1
    # elif len(output) >= 2:
    #     gen_num=2

    # random_number = []
    
    # while len(random_number) != gen_num:
    #     r = random.randint(0,len(output))
    #     if r not in random_number:
    #         random_number.append(r)

    # final_output = []

    # if len(output) > 0:
    #     print "have OUTPUT"
    #     print random_number
    #     for i in range(gen_num):
    #         print i
    #         final_output.append(output[random_number[i]])

    return output
Example #15
def drivel(noun):
    """ Generates drivel by shifting nouns in the description of the shifted noun,
        and prepending random alliterative adjectives.
    """
    drivel = []
    description = shift(noun)[1]
    description = description.split(";")[0]
    for sentence in parsetree(description):
        for i, w in enumerate(sentence.words):
            w, tag = w.string, w.tag
            if tag in ("VBD", "VBZ"):
                w = conjugate(w, "infinitive")
                w = conjugate(w, "past")
            if tag == "NN": # noun
                try:
                    w = shift(w)[0]
                    a = list(alliterate(w))
                    if a:
                        if i > 0 and sentence.words[i-1].tag == "JJ": # adjective
                            drivel.pop()
                        drivel.append(choice(a))
                except:
                    pass
            drivel.append(w)
    return " ".join(drivel)
Example #16
def lemmatize(data):
    processed = ""
    for line in data:
        tree_data = parsetree(line, lemmata=True)
        for each in str(tree_data).split(' '):
            processed += " " + each.split('/')[-1]
    return (processed.lstrip())
Example #17
def _parse(*args, **kw):  # FIXME (workaround)
    from pattern.text.en import parser
    if isinstance(parser.model, str):
        from pattern.text import Model
        parser.model = Model(path=parser.model)

    return parsetree(*args, **kw)
Example #18
File: __init__.py Project: jffng/lexograd
def verbConjugate(lemma, rel, aan):
    relAvoid = ["/r/CapableOf", "/r/PartOf", "/r/MemberOf",
                "/r/IsA", "/r/HasA", "/r/TranslationOf",
                "/r/HasProperty"]
    if not rel in relAvoid:
        s = parsetree(lemma, relations=True)
        try:
            vb = s[0].verbs[0].words[0].string
            result = lemma.replace(vb, conjugate(vb, "part"))
        except:
            result = lemma
        else:
            if vb == "to":
                result = lemma

        # if not aan:
        #     try:
        #         firstWord = s[0].chunks[0].words[0].string
        #         reconjugated = conjugate(firstWord, "part")
        #         result = lemma.replace(firstWord, reconjugated)
        #     except:
        #         result = lemma

    else:
        result = lemma
        
    return result
Example #19
def chTitle(hi):
    htmlFile = open(APPPATH+'static/output/'+hi+'.html', 'r')
    html = htmlFile.read()
    htmlFile.close()
    soup = BeautifulSoup(html)
    text = "\n".join([unicode(i) for i in soup.p.contents]).replace("<br/>", "\n")
    s = parsetree(text)
    nounPhrases = []
    for sentence in s:
        for chunk in sentence.chunks:
            if chunk.type == "NP":
                nounPhrases.append(chunk.string)
    selectNPs = rs([np for np in nounPhrases if not "&" in np], ri(1,2))

    articles = ["a", "an", "the"]

    nps = []

    for np in selectNPs:
        if startsWithCheck(np, articles):
            nps.append(np)
        else:
            nps.append(a_or_an(np))

    if len(selectNPs) == 1:
        title = titlecase(nps[0])
    elif len(selectNPs) == 2:
        title = titlecase(" and ".join(nps))
    # elif len(selectNPs) == 3:
    #     title = titlecase("%s, %s, and %s" % tuple(nps))

    return title.encode('ascii', 'xmlcharrefreplace')
Example #20
def re_search(text, search_string, strict=False):
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
Example #21
def has_single_highlight(line):
    match = re.search(r'\'(.+?)\'', line)
    if match:
        quote = match.group()[1:-1]
        s = parsetree(quote, chunk=True, relations=True, lemmata=True)
        for sentence in s:
            rel = sentence.relations
            pnp = sentence.pnp
            sbj = True
            vb = True
            obj = True

            if not rel.get("SBJ"):
                sbj = False
            if not rel.get("VP"):
                vb = False
            if not rel.get("OBJ"):
                obj = False
            if not pnp:
                pnp = False

            if sbj and vb and (obj or pnp):
                # quotes.append(line)
                return 0
            else:
                return 1
    else:
        return 0
Example #22
 def getParseTrees(self):
     """Return parse trees of each sentence."""
     from pattern.en import parsetree
     if not self.parseTrees:
         self.parseTrees = [
             parsetree(sent) for sent in self.getPlainSentences()
         ]
     return self.parseTrees
Example #23
def acceptPatterns():
    original_content = request.form['drunk_text']
    text_content_array = original_content.split(' ')
    text_content = ''
    for s in text_content_array:
        text_content += superlative(s) + ' '
    s = parsetree(original_content, relations=True, lemmata=True)
    return repr(s)
Example #24
def extract_keyphrases_from_doc_pattern(item, key):
    # build parsetree, extract NP's
    pt = parsetree(item[key])
    noun_phrases = search('NP', pt)
    # convert np matches to unicode list
    noun_phrases = [np.string for np in noun_phrases]
    # remove ones too short, lemmatize, etc..
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys
Example #25
def form_help_to_vec(p):
    t = parsetree(p)
    requirements = []
    # pprint(t)
    for sen in t:
        for i, chunk in enumerate(sen.chunks):
            if chunk.type == "NP":
                requirements.append(chunk.string)
    return (requirements)
Example #26
    def __init__(self):
        #create an instance of our clickbait detector
        #skipGetNumbersSum prevents getNumbersSum from summing anything that could be a number in English (sometimes slow)
        skipGetNumbersSum = False

        #set to 0 to prevent using most common word list as features
        topWordsFeatureCount = 100
        #set to 0 to prevent using most common word trigrams as features
        topWordTrigramFeatureCount = 50
        self.clickbaitDetector = clickbaitDetector(skipGetNumbersSum, topWordsFeatureCount, topWordTrigramFeatureCount)

        #TODO: include a wide range of headlines, including some less vulgar
        #for the unit tests, we need headlines and manually determined correct values for each feature
        testCase = "RT The world's 10 most \"advanced sex\" dolls will soon be able to think and talk"
        #"getNumbersSum" : 10.0 #this is excluded since getNumbersSum may be disabled
        testValues = {
            "getWordCount" : 15,
            "getHashTagsAndRTs" : 1,
            "getQuestionMarks": 0,
            "getAtMentions" : 0,
            "getCharLength" : len(testCase),
            "getNNPLOCCount" : 0,
            "getNNPPERSCount" : 0,
            "getSwearCount" : 2,
            "maxDistToQuote" : 31,
            "maxDistToNNP": 0,
            "getNumbersSum" : 0.0
        }

        patternParseTree = parsetree(testCase, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
        strSentenceText = testCase
        lstSentPOS = []
        lstSentWords = []
        for sentence in patternParseTree:
            for chunk in sentence.chunks:
                for word in chunk.words:
                    lstSentPOS.append(word.type)
                    lstSentWords.append(word.string)

        #call unit tests here
        try:
            self.assertMaxDistToNNP(lstSentPOS, lstSentWords, testValues)
            self.assertMaxDistToQuote(strSentenceText, testValues)
            self.assertGetWordCount(lstSentWords, testValues)
            self.assertGetHashTagsAndRTs(strSentenceText, lstSentWords, testValues)
            self.assertGetQuestionMarks(strSentenceText, testValues)
            self.assertGetAtMentions(strSentenceText, testValues)
            self.assertGetNumbersSum(lstSentWords, testValues)
            self.assertGetNNPPERSCount(lstSentPOS, testValues)
            self.assertGetSwearCount(lstSentWords, testValues)
            self.assertGetNNPLOCCount(lstSentPOS, testValues)
            self.assertGetCharLength(strSentenceText, testValues)
            print("Clickbait detector has passed all Unit Tests!")
        except AssertionError as e:
            print(e.args) #print the "args" part of assertionError, showing expected values
            exit(1)
Example #27
def test_search():  
    from pattern.search import search
    from pattern.en import parsetree
      
    t = parsetree('big white rabbit')
    print t
    print
    print search('JJ', t) # all adjectives
    print search('NN', t) # all nouns
    print search('NP', t) # all noun phrases
Example #28
def adjectives(s):
    """ Returns a list of adjectives in the given string.
    """
    a = set()  # set ~= list of unique values
    t = parsetree(s)
    for sentence in t:
        for word in sentence.words:
            if word.tag and word.tag == "JJ":
                a.add(word.string.lower())
    return list(sorted(a))
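A usage sketch for the helper above, on an assumed input sentence (exact output depends on pattern's tagger):

print(adjectives('The quick brown fox jumps over the lazy dog'))
# e.g. ['brown', 'lazy', 'quick']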
Example #29
def get_parse_tree(sentence):
    tree = []
    Tree = []
    for sentence in parsetree(sentence):
        for chunk in sentence.chunks:
            for word in chunk.words:
                tree.append(word)
            Tree.append(tree)
            tree = []
    return Tree
Example #30
 def getPatterns(self, query):
     cleaned = query.strip('?')
     p = [(cleaned, 3)]
     t = parsetree(query)[0]
     for chunk in t.chunks:
         if chunk.pos == 'NP':
             p.append((chunk.string, 2))
     for w in cleaned.split():
         p.append((w, 1))
     return p
Example #31
 def set_ingredient_tokens(current_recipe):
     for item in current_recipe.ingredients:
         quantity_conversion = {'quarter' : 0.25,'eighth' : 0.125,
                                 'half' : 0.5,'1/4' : 0.25,
                                 '1/8' : 0.125,'1/3' : 0.333,
                                 '2/3' : 0.667,'3/4' : 0.75,
                                 '1/2' : 0.5,'1' : 1.0,
                                 '2' : 2.0,'3' : 3.0,
                                 '4' : 4.0,'5' : 5.0,
                                 '6' : 6.0,'7' : 7.0, 'lots' : 3.0,
                                 '8' : 8.0,'9' : 9.0, '5-6' : 5.5,
                                 'a' : 1.0,'few' : 2.0, 'scant' : 1.0, 
                                 'pinch' : 0.125, 'pinches' : 0.25, 
                                 '4-' : 4.0, 'to' : 0.0, 'tablespoon' : 1.0, 
                                 'teaspoon' : 1.0, 'couple' : 2.0}
                 
         #set 'dumb' quantity by assuming the first item is quanity
         prelim_quantity = nltk.tokenize.word_tokenize(item.source_line)[0]
         
         #EAFP!
         try:
             prelim_quantity = float(prelim_quantity)
         except ValueError:
             print "Can't convert :: " + prelim_quantity
             pass  # pass to conversion dictionary lookup
             try:
                 prelim_quantity = quantity_conversion[prelim_quantity]
             except KeyError:
                 print KeyError("No conversion value found : " +  prelim_quantity)
                 #need to flag here for note in UI                    
                 prelim_quantity = 0
             else:
                 item.quantity = prelim_quantity
         
         item.quantity = prelim_quantity
     
         filterList = ['tsp', 'tsps', 'tbsps', 'tbsp', 'tablespoon', \
                       'tablespoons', 'teaspoon', 'teaspoons', 'cup', \
                       'cups', 'bowl', 'pint', 'quart', 'mg', 'g', 'gram',\
                       'grams', 'ml', 'oz', 'ounce', 'ounces' ] 
         
         item.measure = ' '.join([word for word in item.source_line.split(" ") if word in filterList])
         new_source_line = ' '.join([word for word in item.source_line.split(" ") if word not in filterList])                               
         sentence = parsetree(new_source_line, chunks=True, lemmata=True)
      
         for s in sentence:
             #filter all the NP (noun phrases) into a chunk list
             chunk_list = [singularize(chunk.string) for chunk in s.chunks if chunk.type =='NP']
             search_term = chunk_list[0]
             search_term = "".join([i for i in search_term if i != '/'])
             search_term = ''.join([i for i in search_term if not i.isdigit()])                
             
             item.search_term = search_term
 
     return current_recipe
Example #32
def test_pattern():

    from pattern.search import Pattern
    from pattern.en import parsetree

    t = parsetree('Chuck Norris is cooler than Dolph.', lemmata=True)
    p = Pattern.fromstring('{NP} be * than {NP}')
    m = p.match(t)
    print m.group(1)
    print m.group(2)
    print t
Example #33
def grammatical_tagging():
    sentence = "The white house is at the top of the hill"
    sentences = "The white house is at the top of the hill. My house is not"

    print(
        tag(sentence)
    )  # The result is an array of tuples tagging each word (verbs, nouns, etc.)
    print(parse(sentence))
    #pprint(parse(sentence))

    pprint(parsetree(sentences))
Example #34
def selectWords(review):
    '''
    a function that takes a review and selects the nouns, adjectives, verbs and exclamation marks
    '''
    review = parsetree(review, lemmata=True)[0]  #lemmatize the review
    #select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks
    review = [
        w.lemma for w in review if w.tag.startswith(('JJ', 'NN', 'VB', '!'))
    ]
    review = count(review)  #a dictionary of (word, count)
    return review
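A usage sketch for the function above, on an assumed review string (exact counts depend on pattern's tagger):

print(selectWords('The pizza was absolutely delicious and the service was friendly!'))
# a dict mapping selected lemmas to counts, e.g. {'pizza': 1, 'delicious': 1, ...}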