Example #1
def handleBotInfo(sentence):
    name = ["Optimus... ah no, Optimist Prime :D", "I.am.the.legendary.Optimist.Prime B-)", "The most Optimist Prime! B-)", "You knew already *tsk tsk*"]
    creator = ["It's a mystery :O", "Are you optimist enough to know? ;)", "You are among the few who I tell: All I know about my creator is the initials HT :)", "It remains a mystery to me even :(", "It was erased from my memory from the start :("]

    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)

    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)

    m = search('who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return oneOf(creator)

    m = search('VP+ *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return oneOf(creator)

    m = search('who *+ creates|created|gave_birth *+ you', sentence)
    if len(m) > 0:
        return oneOf(creator)

    return "Can you guess? ;)"
Example #2
    def _extract_reporters(self):
        """ Extract the reporters and entities from those sentence of the text
            where a reported speech verb is used.
        """
        # search for those sentences with reported speech verbs
        sentences = [s for s in self.__tree if search('RPTVRB|según', s)]

        for s in sentences:
            s_str = s.string
            sent_nlp = self.nlp(s_str)           
            verb = search('RPTVRB|según',s)[0].string
            shortest_dist = np.inf
            shortest_word = []
            for ent in sent_nlp.ents:
                # calculate distance
                dist = self._get_distance(verb, ent.text, s_str)
                # store all proper nouns in entities
                word = Word(s, ent.text, tag=None, index=s.id)
                self.__entities.append(word)
                # PER and ORG type entities closest to a reporter verb
                if ent.label_ in ["PER","ORG"] and abs(dist) < shortest_dist:
                    word = Word(s, ent.text, tag='NNP', index=s.id)
                    shortest_dist = abs(dist)
                    shortest_word = word
            if shortest_word and shortest_dist < self._max_dist:
                self.__reporters.append(shortest_word)
Example #3
def isAskingBotInformation(sentence):
    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return True

    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return True

    m = search(
        'who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy',
        sentence)
    if len(m) > 0:
        return True

    m = search('VP+ *+ your creator|dad|mom|father|mother', sentence)
    if len(m) > 0:
        return True

    m = search('who made|are|created|create|wrote|gave_birth|built you',
               sentence)
    if len(m) > 0:
        return True

    return False
Example #4
from pattern.en import parsetree
from pattern.search import STRICT, search

def re_search(text, search_string, strict=False):
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
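A hedged usage sketch: pattern strings combine part-of-speech tags, | alternation and ? optionality, so "JJ NN" finds adjective-noun pairs:

matches = re_search("The quick brown fox jumps over the lazy dog.", "JJ NN")
for m in matches:
    print(m.string)  # e.g. "brown fox", "lazy dog"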
Example #5
from pattern.en import parsetree
from pattern.search import STRICT, search

def re_search(text, search_string, strict=False):
    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
Example #6
def process(statement, database_name=DATABASE_NAME):
    ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming" 
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"
      
      already encountering a statement like "There is a game engine Unity3d" gives us trouble
      seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization
      which doesn't really work for things like CTO as a category of items, hmm
      
      >>> sent = "There is a game engine Unreal Engine".split()
      >>> print nltk.ne_chunk(nltk.pos_tag(sent))
      '''
    # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for from "There is a game engine Unity3D"
    # although it does now with light=True setting, but now it doesn't get the NNP in "There is a game engine Source"

    s = parse(statement, relations=True, lemmata=True, light=True)
    s = split(s)

    #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
    s, result = extract(statement)
    if result:
        #try:
        noun = search('(NN)+', s)[0].string
        table = pluralize(noun.replace(' ', '_'))
        result = search(
            '(JJ|NNPS|NNP)+', s
        )  # this pulls in adjectives, but there's supposed to be a better fix coming
        ident = result[0].string
        name = result[1].string if len(result) > 1 else ident
        #raise Exception(table+"; "+ident+"; "+name)
        return newTable(table, ident, name, database_name)
    #except:
    #return regexMatch(statement,database_name)
    else:
        return regexMatch(statement, database_name)
Example #7
File: faq.py Project: VRDate/twss
def process(statement,database_name = DATABASE_NAME):
  ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming" 
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"
      
      already encountering a statement like "There is a game engine Unity3d" gives us trouble
      seems like we need named entity recognition to be able to extract types like that ... or perhaps rely on capitalization
      which doesn't really work for things like CTO as a category of items, hmm
      
      >>> sent = "There is a game engine Unreal Engine".split()
      >>> print nltk.ne_chunk(nltk.pos_tag(sent))
      '''
  # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for from "There is a game engine Unity3D"
  # although it does now with light=True setting, but now it doesn't get the NNP in "There is a game engine Source"

  s = parse(statement, relations=True, lemmata=True, light=True) 
  s = split(s)

  #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
  s, result = extract(statement)
  if result:
    #try:
      noun = search('(NN)+', s)[0].string
      table = pluralize(noun.replace(' ','_'))
      result = search('(JJ|NNPS|NNP)+', s) # this pulls in adjectives, but there's supposed to be a better fix coming
      ident = result[0].string
      name = result[1].string if len(result) > 1 else ident
      #raise Exception(table+"; "+ident+"; "+name)
      return newTable(table,ident,name,database_name)
    #except:
      #return regexMatch(statement,database_name)
  else:
    return regexMatch(statement,database_name)
Example #8
def isGetNews(sentence):
    m = search('{VP} {VBG+? JJ+?} {news | information} about|on|regarding { *+ }', sentence)
    if len(m) > 0:
        if m[0].group(1).string.lower() in ['look', 'get', 'find', 'tell', 'show', 'fetch', 'search']:
            return True

    # Solve special case when "Get" at the beginning of sentence is recognized as 
    # a proper noun
    m = search('get|find|look *+ news|information about|on|regarding', sentence)
    if len(m) > 0:
        return True

    return False
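For illustration, a call that should come back True via one of the two patterns (a sketch, assuming pattern.en):

from pattern.en import parsetree

for s in parsetree("Get news about the election", lemmata=True):
    print(isGetNews(s))  # True, via the VP pattern or the sentence-initial "get" fallback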
Example #9
def re_search(text, search_string, strict=False):
    try:
        from pattern.search import STRICT, search
        from pattern.en import parsetree
    except:
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()

    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
Example #10
def re_search(text, search_string, strict=False):
    try:
        from pattern.search import STRICT, search
        from pattern.en import parsetree
    except:
        print('Please install pattern: pip install https://github.com/clips/pattern/archive/development.zip')
        sys.exit()

    tree = parsetree(text, lemmata=True)
    if strict:
        results = search(search_string, tree, STRICT)
    else:
        results = search(search_string, tree)
    return results
Example #11
def isYelp(sentence):
    verbs = findVerb(sentence)
    noun_phrases = findNounPhrase(sentence)
    # If match key verbs
    yelpVerbs = ['eat', 'drink', 'find', 'display', 'get']
    for verb in verbs:
        if verb.lower() in yelpVerbs:
            if "news" in noun_phrases or "information" in noun_phrases and "news stand" not in noun_phrases and "newsstand" not in noun_phrases:
                return False

            yelpNouns = ['restaurant', 'food', 'drink', 'shop', 'store', 'bar', 'pub']
            for noun in yelpNouns:
                if noun in noun_phrases:
                    return True

    # If match question/command structure:
    # "is there" / "are there" + noun phrase
    if ("is there" in sentence.string or "are there" in sentence.string) \
            and noun_phrases != "":
        return True
    
    # noun phrase + "near by"
    nearby = nearBy(sentence)
    if noun_phrases != "" and nearby:
        return True

    # Sometimes Speech-to-Text misunderstands "find" as "fine"
    m = search('{fine|find|get|show|search} { *+ }', sentence)
    print m
    if len(m) > 0:
        return True

    return False
Example #12
def find_all_matches_by_ziyu(text, the_pattern):
    tree = parsetree(text, lemmata=True)
    results = search(the_pattern, tree, STRICT)
    output = []
    for match in results:
        word_list = []
        for word in match:
            word_list.append(word.string)
        sentence = " ".join(word_list)
        output.append(sentence)
    
    # gen_num = 0
    # if len(output) > 0 and len(output)<2:
    #     gen_num=1
    # elif len(output) >= 2:
    #     gen_num=2

    # random_number = []
    
    # while len(random_number) != gen_num:
    #     r = random.randint(0,len(output))
    #     if r not in random_number:
    #         random_number.append(r)

    # final_output = []

    # if len(output) > 0:
    #     print "have OUTPUT"
    #     print random_number
    #     for i in range(gen_num):
    #         print i
    #         final_output.append(output[random_number[i]])

    return output
Example #13
    def _extract_reporters(self):
        """ Extract the reporters and entities from those sentence of the text
            where a reported speech verb is used.
        """
        # search for those sentences with reported speech verbs
        sentences = [s for s in self.__tree if search('RPTVRB|según', s)]
        # search for proper nouns that are not locations
        pattern = Pattern.fromstring('!LOCATION|NNP+',
                                     STRICT, taxonomy=TAXONOMY)

        for s in sentences:
            matches = pattern.search(s)

            for m in matches:
                for w in m.words:
                    # chunks with roles (SBJ, OBJ) connected to a reporter verb
                    if ((w.chunk.role is not None) and
                        (w.chunk.verb.head.lemma in taxonomy)):
                        if self._is_composed_noun(w):
                            self.__reporters.append(w.previous())
                        self.__reporters.append(w)
                    # proper nouns not spotlighted as reported
                    else:
                        if self._is_composed_noun(w):
                            self.__entities.append(w.previous())
                        self.__entities.append(w)
Example #14
 def getQueries(self):
     queries = []
     WP = 'who|what|when|where|why|how|which'
     patterns = [
             # Some verbs are mislabeled as nouns
             # When *+ is used next to NP, it swallows parts of the NP
             # Because of this, using {JJ|NN*+} to capture NPs in some cases
             # [NUMS] -> rearrange captured groups in order of NUMS
             # [(x, y)] -> conjugates x into the tense of y
             # ex: [1, (2, 3)] -> "(First Group) (Second Group conjugated to tense of Third Group)"
             (WP + ' {be} {NP}',
                 "queries.append((self.joinGroups(match[0], [2, 1]), 1, 'R'))"),
             (WP + ' {be} {NP} {VB*|NN}',
                 "queries.append((self.joinGroups(match[0], [2, 1, 3]), 3, 'R'))"),
             (WP + ' {be} {NP} {VB*|NN} {*+}',
                 "queries.append((self.joinGroups(match[0], [4, 2, 1, 3]), 4, 'R'))"),
             (WP + ' {do} {NP} {VB*|NN}',
                 "queries.append((self.joinGroups(match[0], [2, (1, 3)]), 5, 'R'))"),
             (WP + ' {do} {NP} {VB*|NN} {*+}',
                 "queries.append((self.joinGroups(match[0], [4, 2, (1, 3)]), 5, 'R'))"),
             (WP + ' {NP} {VB*|NN} {*+}',
                 "queries.append((self.joinGroups(match[0], [1, 3, 2]), 3, 'R'))"),
             (WP + ' {VB*|NN} {JJ|NN*+} {*+}',
                 "queries.append((self.joinGroups(match[0], [3, 2, 1]), 2, 'R'))"),
             (WP + ' {NP} {VB*|NN} {*+}',
                 "queries.append((self.joinGroups(match[0], [2, 1, 3]), 4, 'L'))"),
             (WP + ' {VB*|NN} {JJ|NN*+} {*+}',
                 "queries.append((self.joinGroups(match[0], [1, 2, 3]), 4, 'L'))")
             ]
     t = parsetree(self._q.strip('?'), lemmata=True)
     for p, c in patterns:
         match = search(p, t)
         if match:
             exec c
     return queries + [(self.getKeyWords(t), 1, 'A')] + [(self._q, 2, 'A')]
Example #15
def compare_visualization(product_sku, compare_phrase):
    all_reviews = ReviewInfo.objects.all().filter(sku=product_sku)
    g = Graph()

    count = 0.0
    for e in all_reviews :
        s = e.comment.lower() 
        s = plaintext(s)
        s = parsetree(s)
        #p = '{NP} (VP) faster than {NP}'
        p = '{NP} (VP) ' + compare_phrase + ' {NP}'
        for m in search(p, s):
            x = m.group(1).string # NP left
            y = m.group(2).string # NP right
            if x not in g:
                g.add_node(x)
            if y not in g:
                g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0,0,0,0.75)) # R,G,B,A
        count += 1.0
        print count/len(all_reviews), '\r'

    if len(g) > 0: 
        g = g.split()[0] # Largest subgraph.
        for n in g.sorted()[:80]: # Sort by Node.weight.
            n.fill = (0, 0.5, 1, 0.75 * n.weight)

        g.export('static/compare_visualization', directed=True, weighted=2.0)
        return True
    else: 
        return False
Example #16
def get_pattern_data(search_param):
   
   twitter = Twitter(language='en') 
   
   for tweet in twitter.search(search_param, cached=True):
      print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))
   

   g = Graph()
   for i in range(10):
      for result in twitter.search(search_param, start=i+1,count=50):
         s = result.text.lower() 
         s = plaintext(s)
         s = parsetree(s)
         p = '{NP} (VP) ' +search_param+ ' {NP}'
         for m in search(p, s):
            x = m.group(1).string # NP left
            y = m.group(2).string # NP right
            if x not in g:
               g.add_node(x)
            if y not in g:
               g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0,0,0,0.75)) # R,G,B,A

   #if len(g)>0:   
   #   g = g.split()[0] # Largest subgraph.

   for n in g.sorted()[:40]: # Sort by Node.weight.
      n.fill = (0, 0.5, 1, 0.75 * n.weight)

   g.export('data', directed=False, weighted=0.6)
Example #17
def basicExtract(statement):

  #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
  #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
  s = Sentence(parse(statement, lemmata=True))
  m = search("There be DT {JJ? NN}", s)
  return m
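An illustrative call (a sketch; group(1) holds the {JJ? NN} capture):

m = basicExtract("There is a little lamb")
if m:
    print(m[0].group(1).string)  # should print "little lamb"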
Example #18
def get_noun_phrases_fr_text(text_parsetree, print_output = 0, phrases_num_limit =5, stopword_file=''):
    """ Method to return noun phrases in target text with duplicates
        The phrases will be a noun phrases ie NP chunks.
        Have the in build stop words --> check folder address for this.
        Args:
            text_parsetree (pattern.text.tree.Text): parsed tree of orginal text

        Kwargs:
            print_output (bool): 1 - print the results else do not print.
            phrases_num_limit (int): return  the max number of phrases. if 0, return all.
        
        Returns:
            (list): list of the found phrases. 

    """
    target_search_str = 'NP' #noun phrases
    target_search = search(target_search_str, text_parsetree)# only apply if the keyword is top freq:'JJ?+ NN NN|NNP|NNS+'

    target_word_list = []
    for n in target_search:
        if print_output: print retrieve_string(n)
        target_word_list.append(retrieve_string(n))

    ## exclude the stop words.
    stopword_list = []
    if stopword_file:
        with open(stopword_file, 'r') as f:
            stopword_list = f.read().split('\n')

    target_word_list = [n for n in target_word_list if n.lower() not in stopword_list]

    if (len(target_word_list)>= phrases_num_limit and phrases_num_limit>0):
        return target_word_list[:phrases_num_limit]
    else:
        return target_word_list
Example #19
def get_pattern_data(search_param):

    twitter = Twitter(language='en')

    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))

    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

    #if len(g)>0:
    #   g = g.split()[0] # Largest subgraph.

    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)

    g.export('data', directed=False, weighted=0.6)
Example #20
def get_phrases_contain_keyword(text_parsetree, keyword, print_output = 0, phrases_num_limit =5):
    """ Method to return phrases in target text containing the keyword. The keyword is taken as an Noun or NN|NP|NNS.
        The phrases will be a noun phrases ie NP chunks.
        Args:
            text_parsetree (pattern.text.tree.Text): parsed tree of original text
            keyword (str): can be a series of words separated by | eg "cat|dog"

        Kwargs:
            print_output (bool): 1 - print the results else do not print.
            phrases_num_limit (int): return  the max number of phrases. if 0, return all.
        
        Returns:
            (list): list of the found phrases (duplicates removed).

    """
    ## Regular expression matching.
    ## interested in phrases containing the target word; assume the target is either an adjective or a noun
    target_search_str = 'JJ|NN|NNP|NNS?+ ' + keyword + ' NN|NNP|NNS?+'
    target_search = search(target_search_str, text_parsetree)# only apply if the keyword is top freq:'JJ?+ NN NN|NNP|NNS+'

    target_word_list = []
    for n in target_search:
        if print_output: print retrieve_string(n)
        target_word_list.append(retrieve_string(n))

    target_word_list_rm_duplicates = rm_duplicate_keywords(target_word_list)

    if (len(target_word_list_rm_duplicates)>= phrases_num_limit and phrases_num_limit>0):
        return target_word_list_rm_duplicates[:phrases_num_limit]
    else:
        return target_word_list_rm_duplicates
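A usage sketch, assuming the project's external helpers retrieve_string and rm_duplicate_keywords are importable:

from pattern.en import parsetree

t = parsetree("The big cat food sat next to the small dog bowl.", lemmata=True)
print(get_phrases_contain_keyword(t, "cat|dog"))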
Example #21
    def htmlSearch(self, html, url):
        logger.debug(u"htmlSearch URL : %s" % url)
        logger.debug(u"html : %s" % html[:20])
               
        s = html.lower()
        s = plaintext(s)
        s = parsetree(s)
        
        # self.logSentences(s)

        # Execute a Regular Expression Search
        p = r'(NN)+'
        q = search(p, s)

        # self.logPOS(q)

        # Iterate over all the words in the POS
        logger.debug(u"  q.Length=%d" % len(q))
        logger.debug(u"  q[]=%s" % q)
        
        self.g, self.urlConcepts, self.wordConcepts = self.addNodes(self.g, q, url,
                                                                    self.urlConcepts,
                                                                    self.wordConcepts)

        return self.urlConcepts, self.wordConcepts
Example #22
def get_phrases_contain_keyword(text_parsetree, keyword, print_output=0, phrases_num_limit=5):
    """ Method to return phrases in target text containing the keyword. The keyword is taken as an Noun or NN|NP|NNS.
        The phrases will be a noun phrases ie NP chunks.
        Args:
            text_parsetree (pattern.text.tree.Text): parsed tree of original text
            keyword (str): can be a series of words separated by | eg "cat|dog"

        Kwargs:
            print_output (bool): 1 - print the results else do not print.
            phrases_num_limit (int): return  the max number of phrases. if 0, return all.
        
        Returns:
            (list): list of the found phrases (duplicates removed).

    """
    ## Regular expression matching.
    ## interested in phrases containing the target word; assume the target is either an adjective or a noun
    target_search_str = "JJ|NN|NNP|NNS?+ " + keyword + " NN|NNP|NNS?+"
    target_search = search(
        target_search_str, text_parsetree
    )  # only apply if the keyword is top freq:'JJ?+ NN NN|NNP|NNS+'

    target_word_list = []
    for n in target_search:
        if print_output:
            print retrieve_string(n)
        target_word_list.append(retrieve_string(n))

    target_word_list_rm_duplicates = rm_duplicate_keywords(target_word_list)

    if len(target_word_list_rm_duplicates) >= phrases_num_limit and phrases_num_limit > 0:
        return target_word_list_rm_duplicates[:phrases_num_limit]
    else:
        return target_word_list_rm_duplicates
Example #23
 def test_search_function(self):
     # Assert search() function.
     s = Sentence(parse("Go on Bors, chop his head off!"))
     m = search.search("PRP*? NN*", s)
     self.assertEqual(m[0].string, "Bors")
     self.assertEqual(m[1].string, "his head")
     print "pattern.search.search()"
Example #24
def find_causal_matches(unicode_string, causal_pattern, pattern_order):
    # Description: Searches text string and returns all cause-effect
    #              relationships based on specified pattern.
    # Inputs: unicode_string, raw text in Unicode format for Python 3
    #         causal_pattern, regex defining specific causal statement pattern
    #         pattern_order, specifying which noun phrase is cause or effect
    # Outputs: List of causal tuples [(cause, effect), ...] or empty list []

    # Initialize causal_tuple_list as empty list
    causal_tuple_list = []

    # Convert string to Pattern parsed text (with POS tags)
    t = parsetree(unicode_string, lemmata=True)

    # possible_matches is a list of all Pattern matches, given text and pattern
    possible_matches = search(causal_pattern, t, lemmata=True)

    # Add causal matches as tuples (cause, effect) to causal_tuple_list
    # Note, if possible_matches=[], there are no matches
    if possible_matches != []:
        # Extract cause-effect tuples and add to causal_tuple_list
        causal_tuple_list = extract_cause_effect_tuple(possible_matches,
                                pattern_order)

    final_causal_tuple_list = []

    for causal_tuple in causal_tuple_list:
        if (causal_tuple[0] in unicode_string) and (causal_tuple[1] in unicode_string):
            final_causal_tuple_list.append(causal_tuple)

    return final_causal_tuple_list
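A hypothetical invocation; the concrete causal patterns, the pattern_order convention, and extract_cause_effect_tuple all come from the surrounding project:

causal_pattern = '{NP} cause|causes|caused {NP}'  # hypothetical pattern string
pairs = find_causal_matches(u"Heavy rain caused the flooding.",
                            causal_pattern,
                            pattern_order="cause_first")  # hypothetical flag value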
Example #25
def features(sentence):    
    stop = nltk.corpus.stopwords.words('english')
    
    #ptree = parsetree(sentence, relations=True, lemmata=True)
    ptree = parsetree(sentence)
    matches = search('NP', ptree)
    phrases = []
    for match in matches:
        filtered_np = [ word for word in match if word.string.lower() not in stop ]
        if len(filtered_np) > 0:
            phrases.append( filtered_np )
    
    #for sentence in ptree:
    #    for chunk in sentence.chunks:
    #        if chunk.type == 'NP':
    #            print [(w.string, w.type) for w in chunk.words]
    
    sentence_sentiment = 'NEU'
    sent_result = sentiment(sentence)
    sent = sent_result[0]
    if sent > .1:
        sentence_sentiment  ='POS'
    elif sent < -.1:
        sentence_sentiment  ='NEG'
    
    sentence_subjectivity = 'OBJ'
    if sent_result[1] > .5:
        sentence_subjectivity = 'SUB'
    
    features = {}
    features['NP'] = phrases
    features['SN'] = sentence_sentiment
    features['SUB'] = sentence_subjectivity
    
    return features
Example #26
    def get_noun_phrase_fr_title(self,title):
        """ Get the NP from title. Use for comparing to company names to extract specific news.

        """
        t = parsetree(title, lemmata=True)
        target_search = search('NP', t)
        return target_search[0].group(0).string
Example #27
def extract_verbs(tree):
	verb_matches = search('to|you {VB*}', tree)
	phrases = list()
	for match in verb_matches:
		if match.group(1)[0].type in ('VBG', 'VBZ'): continue
		if match.group(1)[0].string == "dream": continue
		phrases.append(tree[match.group(1).start:])
	return phrases
Example #28
def print_feature(sentence):    
    ptree = parsetree(sentence) #, relations=True, lemmata=True)
    #It matches anything from food to cat food, tasty cat food, the tasty cat food, etc.
    t = parsetree('tasty cat food')
    matches = search('DT? RB? JJ? NN+', ptree)
    for match in matches:
        print match
    print '\n'
Example #29
 def test_group(self):
     # Assert Match groups.
     s = Sentence(parse("the big black cat eats a tasty fish"))
     m = search.search("DT {JJ+} NN", s)
     self.assertEqual(m[0].group(1).string, "big black")
     self.assertEqual(m[1].group(1).string, "tasty")
     # Assert nested groups (and syntax with additional spaces).
     m = search.search("DT { JJ { JJ { NN }}}", s)
     self.assertEqual(m[0].group(1).string, "big black cat")
     self.assertEqual(m[0].group(2).string, "black cat")
     self.assertEqual(m[0].group(3).string, "cat")
     # Assert chunked groups.
     m = search.search("NP {VP NP}", s)
     v = m[0].group(1, chunked=True)
     self.assertEqual(v[0].string, "eats")
     self.assertEqual(v[1].string, "a tasty fish")
     print "pattern.search.Match.group()"
Example #30
def isGetNews(sentence):
    m = search(
        '{VP} {VBG+? JJ+?} {news | information} about|on|regarding { *+ }',
        sentence)
    if len(m) > 0:
        if m[0].group(1).string.lower() in [
                'look', 'get', 'find', 'tell', 'show', 'fetch', 'search'
        ]:
            return True

    # Solve special case when "Get" at the beginning of sentence is recognized as
    # a proper noun
    m = search('get|find|look *+ news|information about|on|regarding',
               sentence)
    if len(m) > 0:
        return True

    return False
Example #31
def extract_keyphrases_from_doc_pattern(item, key):
    # build parsetree, extract NP's
    pt = parsetree(item[key])
    noun_phrases = search('NP', pt)
    # convert np matches to unicode list
    noun_phrases = [np.string for np in noun_phrases]
    # remove ones too short, lemmatize, etc..
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys
Example #32
def measure_pattern_search():
    """ pattern 
      JJ|NN* NN*
      DT? JJ|NN?+ NN
      DT? JJ|NN*+ NN*
  """
    global pattern_search_result  #Make measure_me able to modify the value
    #print ('text_tree', text_tree)
    pattern_search_result = search(pattern_string, text_tree)
Example #33
def taxonomy_normalize(sentence):
    t = parsetree(sentence, lemmata=True)
    bp_match = search('BEAUTY_PARTS', t)
    facial_match = search('MAKEUP', t)
    feet_match = search('FEET', t)
    body_match = search('BODY', t)
    
    matches = ''
    
    if len(bp_match) > 0:
        matches += 'BEAUTY_PARTS-'     
    if len(facial_match) > 0:
        matches += 'MAKEUP-'              
    if len(feet_match) > 0:
        matches += 'FEET-'
    if len(body_match) > 0:
        matches += 'BODY-'

    return matches
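The uppercase classes above (BEAUTY_PARTS, MAKEUP, FEET, BODY) only match if they were registered in pattern's taxonomy beforehand, along these lines (a sketch with made-up members):

from pattern.search import taxonomy

for word in ("lipstick", "mascara", "eyeliner"):  # illustrative members
    taxonomy.append(word, type="MAKEUP")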
Example #34
def extract_verb_phrases(tree):
	verb_phrase_matches = search('to|you {VP}', tree)
	phrases = list()
	for match in verb_phrase_matches:
		if match.group(1)[0].string == "dream":
			continue
		phrases.append(tree[match.group(1).start:])
	return phrases
Example #35
    def add_keywords(self, phrase):

        sent = en.Sentence(en.parse(phrase))
        nouns = search('NN', sent)
        self.blackboard.pool.nouns.update(
            set(Word(en.singularize(n[0].string)) for n in nouns))
        adjs = search('JJ', sent)
        self.blackboard.pool.adjectives.update(
            set(Word(en.lemma(a[0].string)) for a in adjs))

        try:
            nps = search('NP', sent)
            for np in nps:
                self.blackboard.pool.epithets.update({
                    Word(en.singularize(w.string), "NN"):
                    [Word(jj.string, "JJ") for jj in np if "JJ" in jj.tag]
                    for w in np if "NN" in w.tag
                })
        except IndexError:
            pass
Example #36
def nlp(bigram, sent):
	entity = []

	for tup in bigram:
		txt = " ".join(tup)
		#print txt
		m = search(txt, sent)
		if m:
			entity.append(txt)
		#print m

	return entity
Example #37
def get_ngrams(description, lang='it'):
    """
    Analyze description and get relevant ngrams using an Italian POS tagger,
    looking for exact combinations of POS patterns
    """
    s = it_parsetree(description, relations=True, lemmata=True)
    if lang == "en":
        s = en_parsetree(description, relations=True, lemmata=True)

    matches = []
    ngrams = []
    for match in search("JJ NN", s):
        matches.append(match.constituents())
    for match in search("NN JJ", s):
        matches.append(match.constituents())
    for match in search("NN", s):
        matches.append(match.constituents())
    for match in matches:
        ngrams.append(" ".join([chunk.string
                                for chunk in match]).encode("utf8"))
    return remove_uncorrect_tokens(ngrams)
Example #38
def verbosInfinitivos(cadena):
	t = parsetree(cadena)
	verbos = search('VB*', t) 
	#lis=verbos.match.string
	#print 'list: ',lis
	#print # can't I convert it to a list in one go??
	lista =[]
	for match in verbos:
		lista.append((match.string , conjugate(match.string, INFINITIVE)))
	#print 'lista for: ',lista
	#print lista[3][1] 
	return lista
Example #39
def isAskingBotInformation(sentence):
    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return True

    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return True

    m = search('who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy', sentence)
    if len(m) > 0:
        return True

    m = search('VP+ *+ your creator|dad|mom|father|mother', sentence)
    if len(m) > 0:
        return True

    m = search('who made|create|wrote|built you', sentence)
    if len(m) > 0:
        return True

    return False
Example #40
def extract(statement):

  s = Sentence(parse(statement, lemmata=True))

  '''c1 = Constraint.fromstring("There be DT")
  c2 = Constraint.fromstring("NN+")
  c3 = Constraint.fromstring("(DT)")
  c4 = Constraint.fromstring("(RB) (JJ) NNP+")
  c5 = Constraint.fromstring("(call) (DT)")
  c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+")
  p = Pattern(sequence=[c1, c2, c3, c4, c5, c6]) 
 
  match = p.search(s)
   '''
  s = find_entities(s)
   
   # not sure about this "be" thing - happy to match plural (is/are) but not sure about past tense ...
  match = search(MATCH_STRING_EXT, s)
  if not match:
    match = search(MATCH_STRING, s)
  #raise Exception(match)
  return s, match
Example #41
def verbosInfinitivos(cadena):
    t = parsetree(cadena)
    verbos = search('VB*', t)
    print('verbos =', verbos)
    #lis=verbos.match.string
    #print ('list: ',lis)
    #print() # can't I convert it to a list in one go?? LAMBDA
    lista = []
    for match in verbos:
        lista.append((match.string, conjugate(match.string, INFINITIVE)))
    #print ('lista for: ',lista)
    #print (lista[3][1])
    return lista
Example #42
def inflate(s):
    
    """ Returns an exaggerated string:
        inflate("I'm eating a burger") => "I'm eating hundreds of burgers".
    """
    
    # Part-of-speech tagging identifies word types in a text.
    # For example, "can" can be a noun (NN) or a verb (VB),
    # depending on the words surrounding it.
    # http://www.clips.ua.ac.be/pages/pattern-en#parser
    
    # A parse tree splits punctuation marks from words, tags words,
    # and constructs a nested tree of sentences that contain words.
    # http://www.clips.ua.ac.be/pages/pattern-en#tree
    t = parsetree(s)
    
    # We can use pattern.search to search for patterns inside a parse tree.
    # If you know what regular expressions are: this is similar,
    # only you can also search by part-of-speech tag.
    # This is very useful to retrieve syntactic structures, e.g.:
    # "any noun, optionally preceded by an adjective", or
    # "any conjugation of the verb to be".
    # http://www.clips.ua.ac.be/pages/pattern-search
    
    # The search pattern below means:
    # "any determiner (a, an, the), optionally followed by any adjective,
    #  followed by one or more nouns".
    # The search will yield a list of matches.
    # We'll pluralize the nouns in each match, so that "burger" becomes "burgers", etc.
    # Note the curly braces {}.
    # We can retrieve the words inside it with match.group().
    for match in search("{DT} {JJ?} {NN+}", t):
        x = choice(["dozens of ", "hundreds of ", "thousands of "])
        
        # We'll only look at matches that start with "a" or "an".
        # This indicates an object or a thing of which many can exist.
        # If the match starts with "the", it might indicate something unique,
        # like "the capital of Nairobi". It doesn't make sense to transform
        # it into "hundreds of capitals of Nairobi".
        if match.group(1).string.lower() not in ("a", "an"):
            continue
        
        # Include the adjective, if any.
        if match.group(2):
            x += match.group(2).string
            x += " "
            
        # Pluralize the group of nouns.
        x += pluralize(match.group(3).string)
        s = s.replace(match.group(0).string, x)
    return s
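Expected behavior per the docstring (actual output depends on the tagger and on choice()):

print(inflate("I'm eating a tasty burger"))
# e.g. "I'm eating hundreds of tasty burgers"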
Example #43
        def handle_starttag(self, tag, attrs):

            for attr in attrs:

                a = attr[1]
                a = a.split('/')
                a = [stem(a[j]) for j in range(len(a))]

                a = '_'.join(str(e) for e in a)

                for i in attr:

                    l = search('src', i)

                    b1 = search('5g', a)

                    b3 = search('imag', a)
                    b4 = search('video', a)
                    b5 = search('pdf', a)

                    if not len(b1) == 0:

                        if not len(b5) == 0:

                            if not attr[1][2:] in Pdfs[lien]:
                                Pdfs[lien].append(attr[1][2:])

                        if not len(b3) == 0:
                            if not attr[1][2:] in Images[lien]:
                                Images[lien].append(attr[1][2:])

                        if not len(b4) == 0:
                            if not attr[1][2:] in Videos[lien]:
                                Videos[lien].append(attr[1][2:])

                    if not len(l) == 0 and not len(b1) == 0:
                        if not attr[1][2:] in Images[lien]:
                            Images[lien].append(attr[1][2:])
Example #44
def handleBotInfo(sentence):
    name = [
        "Optimus... ah no, Optimist Prime :D",
        "I.am.the.legendary.Optimist.Prime B-)",
        "The most Optimist Prime! B-)", "You knew already *tsk tsk*"
    ]
    creator = [
        "It's a mystery :O", "Are you optimist enough to know? ;)",
        "You are among the few who I tell: All I know about my creator is the initials HT :)",
        "It remains a mystery to me even :(",
        "It was erased from my memory from the start :("
    ]

    m = search('what *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)

    m = search('VP+ *+ your name', sentence)
    if len(m) > 0:
        return oneOf(name)

    m = search(
        'who *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy',
        sentence)
    if len(m) > 0:
        return oneOf(creator)

    m = search(
        'VP+ *+ your creator|dad|mom|father|mother|papa|mama|daddy|mommy',
        sentence)
    if len(m) > 0:
        return oneOf(creator)

    m = search('who *+ creates|created|gave_birth *+ you', sentence)
    if len(m) > 0:
        return oneOf(creator)

    return "Can you guess? ;)"
Example #45
def extractTrait(characterSentences):
    """
    Analyzes the sentences using the search module of pattern for adjectives.
    """
    characterTrait = defaultdict(list)
    for key, value in characterSentences.items():
        for x in value:
            #print(x)
            #t=parsetree(x)
            characterTrait[key].append(search('JJ', parsetree(str(x))))
            #print(search('JJ',parsetree(str(x))))

    return characterTrait
Example #46
def learn(concept):
    """ Returns a list of properties for the given concept,
        collected from a "I think X is Y".
    """
    q = 'I think %s is *' % concept
    p = []
    g = Google(language='en', license=None)
    for i in range(10):
        for result in g.search(q, start=i, cached=True):
            m = plaintext(result.description)
            m = search(q, m) # Use * as a wildcard.
            if m:
                p.append(m[0][-1].string)
    return [w for w in p if w in PROPERTIES] # only handles known properties...
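A hedged call: it needs network access for the Google engine and a predefined PROPERTIES list, e.g.:

PROPERTIES = ["sweet", "tasty", "delicious"]  # illustrative subset
print(learn("a cupcake"))                     # e.g. ["sweet", "delicious"]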
Example #47
def inflate(s):
    """ Returns an exaggerated string:
        inflate("I'm eating a burger") => "I'm eating hundreds of burgers".
    """

    # Part-of-speech tagging identifies word types in a text.
    # For example, "can" can be a noun (NN) or a verb (VB),
    # depending on the words surrounding it.
    # http://www.clips.ua.ac.be/pages/pattern-en#parser

    # A parse tree splits punctuation marks from words, tags words,
    # and constructs a nested tree of sentences that contain words.
    # http://www.clips.ua.ac.be/pages/pattern-en#tree
    t = parsetree(s)

    # We can use pattern.search to search for patterns inside a parse tree.
    # If you know what regular expressions are: this is similar,
    # only you can also search by part-of-speech tag.
    # This is very useful to retrieve syntactic structures, e.g.:
    # "any noun, optionally preceded by an adjective", or
    # "any conjugation of the verb to be".
    # http://www.clips.ua.ac.be/pages/pattern-search

    # The search pattern below means:
    # "any determiner (a, an, the), optionally followed by any adjective,
    #  followed by one or more nouns".
    # The search will yield a list of matches.
    # We'll pluralize the nouns in each match, so that "burger" becomes "burgers", etc.
    # Note the curly braces {}.
    # We can retrieve the words inside it with match.group().
    for match in search("{DT} {JJ?} {NN+}", t):
        x = choice(["dozens of ", "hundreds of ", "thousands of "])

        # We'll only look at matches that start with "a" or "an".
        # This indicates an object or a thing of which many can exist.
        # If the match starts with "the", it might indicate something unique,
        # like "the capital of Nairobi". It doesn't make sense to transform
        # it into "hundreds of capitals of Nairobi".
        if match.group(1).string.lower() not in ("a", "an"):
            continue

        # Include the adjective, if any.
        if match.group(2):
            x += match.group(2).string
            x += " "

        # Pluralize the group of nouns.
        x += pluralize(match.group(3).string)
        s = s.replace(match.group(0).string, x)
    return s
Example #48
  def test_parse_sentences(self):
    texts = [
      pattern.en.Text(pattern.en.parse("Mary had a little lamb and it was really gorgeous. None.",lemmata=True)),
      pattern.fr.Text(pattern.fr.parse("Mary avait un agneau et il etait vraiment sympa. Personne.",lemmata=True))
    ]

    nps = []
    
    for text in texts:
      for sentence in text:
        for match in search('NP', sentence):
          for word in match.words:
            nps.append(word.lemma)

    self.assertEqual(nps, [u'mary', u'a', u'little', u'lamb', u'it', u'none', u'mary', u'un', u'agneau', u'et', u'il', u'personne'])
Example #49
def test_search():  
    from pattern.search import search
    from pattern.en import parsetree
      
    t = parsetree('big white rabbit')
    print t
    print
    print search('JJ', t) # all adjectives
    print search('NN', t) # all nouns
    print search('NP', t) # all noun phrases
Example #50
def fullQuery(sentence):
    new_str = ""
    for word in sentence.words:
        if word.string in ['places', 'locations', 'spots']:
            continue
        new_word = singularize(
            word.string) if word.type == "NNS" else word.string
        new_str += new_word + " "
    singularized_sentence = parsetree(new_str, relations=True, lemmata=True)

    m = search('{JJ? NN+} IN {JJ? NN+}', singularized_sentence)
    query = {}
    if len(m) > 0:
        query["term"] = m[0].group(1).string
        query["location"] = m[0].group(2).string
    return query
Example #51
def fullQuery(sentence):
    new_str = ""
    for word in sentence.words:
        if word.string in ['places', 'locations', 'spots']:
            continue
        new_word = singularize(word.string) if word.type == "NNS" else word.string
        new_str += new_word + " "
    singularized_sentence = parsetree(new_str, relations=True, lemmata=True)

    
    m = search('{JJ? NN+} IN {JJ? NN+}', singularized_sentence)
    query = {}
    if len(m) > 0:
        query["term"] = m[0].group(1).string
        query["location"] = m[0].group(2).string
    return query
Example #52
def load_data(data_file):
    corpus_data = []
    corpus_target = []
    print "Reading data file: {}".format(data_file)
    corpus_file = open(data_file, "r")

    print "Importing data..."
    lines = []
    for line in corpus_file:
        entry = Entry()
        line_parts = line.split("\t")
        # data validity check
        assert len(line_parts) == 4
        entry.figurative = True if (line_parts[2] == "figuratively") else False
        # initial pre-process
        phrase     = line_parts[1].decode('utf8').lower()
        sentences  = remove_tags(line_parts[3].decode('utf8').lower())

        entry.phrase       = wordpunct_tokenize(phrase)
        entry.phrase_lemma = [lemmatize(w) for w in entry.phrase]

        # clean up and parse sentence
        entry.sentences = sent_tokenize(sentences)
        entry.sentence  = np.array([wordpunct_tokenize(x) for x in entry.sentences])
        #entry.pos       = pos_tag(entry.sentence)
        entry.sentence  = np.hstack(entry.sentence)
        entry.sentence_lemma = np.array([lemmatize(w) for w in entry.sentence])

        # find match of phrase (original strings)
        phrase_match = search(" ".join(entry.phrase_lemma),
                              " ".join(entry.sentence_lemma))
        if len(phrase_match) > 0:
            # isolate context (remove phrase)
            context_select = np.ones(len(entry.sentence), dtype=np.bool)
            start  = phrase_match[0].start
            stop   = phrase_match[0].stop
            context_select[start:stop] = False
            entry.context       = entry.sentence[context_select]
            entry.context_lemma = entry.sentence_lemma[context_select]
        else:
            #print u"phrase {} not found in sentence {}?".format(phrase, sentences)
            entry.context = entry.sentence
            entry.context_lemma = entry.sentence_lemma

        lines.append(entry)
    return lines
Example #53
def adjectives(L):
    """Returns a list of adjecives present in input lists.

    >>> adjectives([['big', 'white', 'tall', 'dog'], ['bat', 'tall']])
    ['big', 'white', 'tall', 'tall']
    >>> adjectives([['march'], ['yes', 'i', 'know', 'its', 'almost', 'march']])
    []
    """

    adjs = []
    for l in range(len(L)):
        current_string = " ".join(L[l])
        parts_of_speech = parsetree(current_string)     
        for i in search("JJ", parts_of_speech):     # Search the parsed string for adjectives
            adjs.append(str(i.string))

    return adjs
Example #54
 def patternSearch(self, n=12, m=50):
     logger.info(u"patternSearch")
  
     proxyList = list()
     proxyList.append(u"3128")
     proxyList.append(u"206.217.138.154")
     
     logger.info(u"proxyList - %s" % proxyList)
  
     engine = Google(license=None, throttle=0.5, language=None)
     # engine = Bing(license=None, throttle=0.5, language=None)
 
     for i in range(n):                
         logger.info(u"Search %d" % i)
         results = engine.search(self.searchTerm, start=i+1, count=m, cached=False, proxy=proxyList)
         
         for r in results:
             logger.debug(u"Result=%s" % r.text)
             url = r.url
             logger.debug(u"URL=%s" % url)
             
             # if url[-4:] == ".com":
             #    continue
                     
             s = r.text.lower()
             s = plaintext(s)
             s = parsetree(s)
 
             # self.logSentences(s)
 
             # Execute a Regular Expression Search
             # p = r'(NN)+ (VB)+'
             p = r'(NN)+'
             q = search(p, s)
 
             # logPOS(q)
 
             # Iterate over all the words in the POS
             logger.debug(u"  q.Length=%d" % len(q))
             logger.debug(u"  q[]=%s" % q)
 
             self.g, self.urlConcepts, self.wordConcepts = \
                 self.addNodes(self.g, q, url, self.urlConcepts, self.wordConcepts)
     
     return self.urlConcepts, self.wordConcepts
Example #55
def extract_noun_phrases(body_part_name):    
    stop = nltk.corpus.stopwords.words('english')    
    filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
    
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))
    
    sentences = df[ df['lemmas'] == body_part_name]['sentence'].tolist()
    
    phrases = []
    for sentence in sentences:
        ptree = parsetree(sentence)
        matches = search('NP', ptree)        
        for match in matches:
            filtered_np = [ word for word in match if word.string.lower() not in stop ]
            if len(filtered_np) > 0:
                phrases.append( (sentence, filtered_np) )
    
    return pd.DataFrame(phrases, columns=['sentence', 'phrase'])