from pattern.en import parsetree
from pattern.search import Pattern

def getDescription(article):
    # Parse the article's intro section and return the first "... is a/the ..." phrase.
    intro = parsetree(article.sections[0].string, lemmata=True)
    pattern = Pattern.fromstring('be DT *+')
    try:
        mat = pattern.match(intro)
        return mat.string
    except (TypeError, AttributeError):
        # match() returns None when nothing matches, so mat.string raises
        # AttributeError; fall back to a looser pattern without the determiner.
        pattern = Pattern.fromstring('be *+')
        return pattern.match(intro).string
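# A minimal usage sketch, assuming the article comes from pattern.web's Wikipedia
# engine (hypothetical query; WikipediaArticle exposes .sections, each with a
# .string attribute, which is all getDescription() relies on):
from pattern.web import Wikipedia

article = Wikipedia().search('Alan Turing')
print getDescription(article)  # e.g. "was an English mathematician ..."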
def _extract_reporters(self):
    """ Extract the reporters and entities from those sentences of the text
        where a reported speech verb is used.
    """
    # Search for the sentences that contain reported speech verbs.
    sentences = [s for s in self.__tree if search('RPTVRB|según', s)]
    # Search for proper nouns that are not locations.
    pattern = Pattern.fromstring('!LOCATION|NNP+', STRICT, taxonomy=TAXONOMY)
    for s in sentences:
        matches = pattern.search(s)
        for m in matches:
            for w in m.words:
                # Chunks with a role (SBJ, OBJ) connected to a reporter verb.
                if (w.chunk.role is not None) and (w.chunk.verb.head.lemma in taxonomy):
                    if self._is_composed_noun(w):
                        self.__reporters.append(w.previous())
                    self.__reporters.append(w)
                # Proper nouns not spotlighted as reporters.
                else:
                    if self._is_composed_noun(w):
                        self.__entities.append(w.previous())
                    self.__entities.append(w)
from pattern.en import Sentence, parse
from pattern.search import Pattern

def myExtract(statement):
    # Match existential phrases such as "There is a cat ...".
    s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    p = Pattern.fromstring('There be DT NN+')
    match = p.search(s)
    return match
def notneeded(word):
    # Return True if the word matches one of the removal patterns in rmpat
    # (a list of POS-tag pattern strings defined elsewhere in the script).
    print word,
    for pos in rmpat:
        p = Pattern.fromstring(pos)
        if p.scan(word):
            print " " + pos
            return True
    return False
def test_pattern():
    from pattern.search import Pattern
    from pattern.en import parsetree
    t = parsetree('Chuck Norris is cooler than Dolph.', lemmata=True)
    p = Pattern.fromstring('{NP} be * than {NP}')
    m = p.match(t)
    print m.group(1)
    print m.group(2)
    print t
from pattern.en import parsetree
from pattern.text.tree import Text
from pattern.search import Pattern

def pattern_match(pattern, sentence):
    # Accept either a raw string or an already-parsed Text.
    if not isinstance(sentence, Text):
        sentence = parsetree(sentence, lemmata=True)
    p = Pattern.fromstring(pattern)
    try:
        return p.match(sentence)
    except Exception:
        return None
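# A brief usage sketch: match() returns a Match object or None,
# and Match.string joins the matched words.
m = pattern_match('DT JJ NN', 'The black cat sat on the mat.')
if m is not None:
    print(m.string)  # "The black cat"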
def _extract_sources(self):
    """ Extract the well-known sources from the text. """
    # Search for well-known sources (taxonomy term SOURCE) in the tree.
    pattern = Pattern.fromstring('SOURCE', STRICT, taxonomy=TAXONOMY)
    for sentence in self.__tree:
        matches = pattern.search(sentence)
        for m in matches:
            for w in m.words:
                self.__sources.append(w)
def __init__(self, entity_type, expression, variables=None, negation=False,
             taxonomy=None, strict=False, exclude=None):
    self.entity_type = entity_type
    self.expression = expression
    self.variables = {} if variables is None else variables
    self.negation = False if negation is None else negation
    self.taxonomy = taxonomy
    self.strict = strict
    self.exclude = set() if exclude is None else set(exclude)
    self.pattern = Pattern.fromstring(expression, taxonomy=taxonomy, strict=strict)
def __init__(self, expression, type, config=None, negation=False,
             taxonomy=None, strict=False, exclude=None):
    self.expression = expression
    self.type = type
    self.taxonomy = taxonomy
    self.config = {'value': 1} if config is None else config
    self.negation = False if negation is None else negation
    self.pattern = Pattern.fromstring(expression, taxonomy=taxonomy, strict=strict)
    self.strict = strict
    self.exclude = set() if exclude is None else set(exclude)
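# A minimal instantiation sketch for the two constructors above.
# The class names EntityRule and ScoringRule are hypothetical: only the
# __init__ methods appear in the source, so treat this as an illustration
# of the call signatures, not as the actual API.
entity_rule = EntityRule(
    entity_type='flower',
    expression='(DT) (JJ) FLOWER',  # FLOWER is a taxonomy term.
    taxonomy=TAXONOMY)
scoring_rule = ScoringRule(
    expression='JJ+ NN',
    type='quality',
    config={'value': 2})
matches = entity_rule.pattern.search(parsetree('A white rose.', lemmata=True))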
s = Sentence(parse("When I sleep the big white rabbit will stare at my feet.")) m = search("rabbit stare at my", s) print s print m print # Why does this work? # The word "will" is included in the result, even if the pattern does not define it. # The pattern should break when it does not encounter "stare" after "rabbit." # It works because "will stare" is one verb chunk. # The "stare" constraint matches the head word of the chunk ("stare"), # so "will stare" is considered an overspecified version of "stare". # The same happens with the "rabbit" constraint: # this matches the overspecified chunk "the big white rabbit". p = Pattern.fromstring("rabbit stare at my", s) p.strict = True # Now it matches only what the pattern explicitly defines. m = p.search(s) print m print # Sentence chunks can be matched by tag (e.g. NP, VP, ADJP). # The pattern below matches anything from # "the rabbit gnaws at your fingers" to # "the white rabbit looks at the carrots": p = Pattern.fromstring("rabbit VP at NP", s) m = p.search(s) print m print if m:
print ''

taxonomy = PS.Taxonomy()  # PS = pattern.search, imported elsewhere in the script.
taxonomy.append('looks', type='perception')
taxonomy.append('appears', type='perception')

# Successive reassignments: only the last test sentence is kept in s.
s = "Kiko foreign glitter that looks great in the shade."
s = "I'm also thinking this polish would look amazing over black!"
s = "Oh this is a great brush. Fluffy soft bristles and works like a charm."

pattern = Pattern.fromstring('{SBJ?} * {PERCEPTION} * {JJ?} * {OBJ?} {OBJ?}',
                             taxonomy=taxonomy, strict=True)

#documents = [s]
for document in documents:
    parsed = parsetree(document, lemmata=True, relations=True)
    for sentence in parsed.sentences:
        matches = pattern.search(sentence)
        if matches:
            print sentence.string
            for match in matches:
                for c in match.constituents():
                    print c
            print ''
# Example of pattern: http://www.clips.ua.ac.be/pages/pattern
from pattern.web import Bing, plaintext
from pattern.en import Sentence, Chunk, parse
from pattern.search import Pattern
from pattern.graph import Graph, Node, Edge, export

g = Graph()
for i in range(1):
    print "--------------", i
    for r in Bing().search('"more important than"', start=i+1, count=50):
        s = plaintext(r.description.lower())
        print s
        s = Sentence(parse(s))
        print s
        p = Pattern.fromstring('NP (VP) more important than NP')
        for m in p.search(s):
            a = m.constituents(p[+0])[-1] # Left NP.
            b = m.constituents(p[-1])[+0] # Right NP.
            a = (isinstance(a, Chunk) and a.head or a).string
            b = (isinstance(b, Chunk) and b.head or b).string
            if a and b:
                if a not in g:
                    g.add_node(a, radius=5, stroke=(0,0,0,0.8))
                if b not in g:
                    g.add_node(b, radius=5, stroke=(0,0,0,0.8))
                g.add_edge(g[b], g[a], stroke=(0,0,0,0.6))

g = g.split()[0]           # Largest subgraph.
for n in g.sorted()[:40]:  # Sorted by Node.weight.
    n.fill = (0.0, 0.5, 1.0, 0.7 * n.weight)
sentence = " ".join(word_list) output.append(sentence) # print "\n", sentence, "\n\n" return output results = find_all_matches_by_ziyu(whole_text, 'VB JJ NNS|NN') print "RESULTS:" for result in results: print result sys.exit() # t = parsetree('Dolph Lundgren is cooler than Frank.', lemmata=True) # p = Pattern.fromstring('{VB} {TO} {VB} {IN} {NN}') p = Pattern.fromstring('{V*}') m = p.match(t) print m sys.exit() # NAME OF DRUG # take the title's first word, first three letters # combine with suffixes ["phrin","ytril","syn","xyzal","yrhil","nexx"] print('\n' * 4) print " "*10 + "NAME OF DRUG" + ":" # SHORT DESCRIPTION OF DRUG # example: 100% grass fed supplement for cultrual materialism
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import Pattern
from pattern.en import Sentence, parse

# Constraints wrapped in () are optional, matching one word or no word.
# Pattern.search() uses a "greedy" approach:
# it will attempt to include as many optional constraints as possible.

# The following pattern scans for words whose part-of-speech tag is NN (i.e. nouns).
# A preceding adjective, adverb or determiner is picked up as well.
p = Pattern.fromstring("(DT) (RB) (JJ) NN+")

for s in (
  "the cat",                                # DT NN
  "the very black cat",                     # DT RB JJ NN
  "tasty cat food",                         # JJ NN NN
  "the funny black cat",                    # DT JJ JJ NN
  "very funny",                             # RB JJ => no match, since there is no noun.
  "my cat is black and your cat is white"): # NN + NN
    s = Sentence(parse(s))
    m = p.search(s)
    print
    print s
    print m
    if m:
        for w in m[0].words:
            print w, "matches", m[0].constraint(w)

# Note: the above pattern could also be written as "(DT|RB|JJ)+ NN+"
# to include multiple adverbs/adjectives.
# By combining *, () and +, patterns can become quite complex.
# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# Putting them all in one pattern would make it somewhat unwieldy, e.g.:
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").
# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

print taxonomy.children("flower")
print taxonomy.parents("rose")
print taxonomy.classify("rose") # Yields the most recently added parent.
print

# Taxonomy terms can be included in a pattern:
p = Pattern([Constraint(taxa=["flower"])]) # or:
p = Pattern.fromstring("FLOWER")

s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print s
print m
print

from pattern.search import search
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print taxonomy.parents("chicken")
print taxonomy.children("animal", recursive=True)
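# A short follow-up sketch: taxonomy terms in a pattern match child terms
# recursively, so "ANIMAL" should also match "penguin" (penguin -> bird -> animal).
# This assumes taxonomy above is the global one from pattern.search,
# which patterns consult by default.
s = Sentence(parse("Penguins are funny animals.", lemmata=True))
print search("ANIMAL", s)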
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.search import Pattern
from pattern.en import Sentence, parse

# Constraints ending in + match one or more words.
# Pattern.search() uses a "greedy" approach:
# it will attempt to match as many words as possible.

# The following pattern means:
# one or more words starting with "t",
# followed by one or more words starting with "f".
p = Pattern.fromstring("t*+ f*+")
s = Sentence(parse("one two three four five six"))
m = p.search(s)
print s
print m
print
for w in m[0].words:
    print w, "matches", m[0].constraint(w)

# Pattern.fromstring("*") matches each word in the sentence.
# This yields a list with a Match object for each word.
print
print "* =>", Pattern.fromstring("*").search(s)

# Pattern.fromstring("*+") matches all words.
# This yields a list with one Match object containing all words.
print
print "*+ =>", Pattern.fromstring("*+").search(s)
from pattern.web import Bing, plaintext
from pattern.en import Sentence, parse
from pattern.search import Pattern
from pattern.db import Datasheet, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'          # Bing search query.
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[0]  # Right NP.
            d.append((
                a.string.lower(),
                b.string.lower()))
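# The comment above mentions saving the table for later processing.
# A short sketch using pattern.db (the filename is arbitrary):
pprint(d)                 # Pretty-print the collected pairs.
d.save("important.csv")   # Datasheet.save() writes the rows to a CSV file.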
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# This example demonstrates an interesting search pattern that mines for comparisons.
# Notice the use of the constraint "be".
# If the output from the parser includes word lemmas (e.g. "doing" => "do"),
# these will also be matched. Using "be" then matches "is", "being", "are", ...
# and if underspecification is used: "could be", "will be", "definitely was", ...
p = Pattern.fromstring("NP be (more) ADJP|ADVP than NP")

for s in (
  "the turtle was faster than the hare",
  "Arnold Schwarzenegger is more dangerous than Dolph Lundgren"):
    s = Sentence(parse(s, lemmata=True)) # Parse with lemmata.
    m = p.search(s)
    print s
    print
    print m
    print
    if m:
        print m[0].constituents()                     # Words grouped by chunk whenever possible.
        print m[0].constraints(chunk=s.chunks[0])     # The constraints that match the given chunk.
        print m[0].constituents(constraint=p[0])      # Constituents for the given constraint.
        print m[0].constituents(constraint=[0, 3, 5]) # Constituents for the given constraint indices.
        print
        print
        print
def process(self, results, pattern, download):
    risk_results = []
    body = ''
    for r in results:
        p = Pattern.fromstring(pattern)
        url = URL(r.url)
        s = Sentence(parse(r.description))
        p_search = p.search(s)
        if download == DownloadType.Full or (download == DownloadType.Dynamic and len(p_search) == 0):
            try:
                # The mimetype check and the download operation can throw 500+ and 400+
                # errors; in those cases we escape the exception gracefully without
                # halting the search.
                if url.mimetype == "text/html":
                    body = str(r.download(timeout=110, cached=True, proxy=None).encode("utf-8"))
                    body = plaintext(body)
            except:
                # There are cases where the plaintext function fails to extract just the
                # text. We catch that exception; however, the body is then left as HTML,
                # so a pattern search on it may not be reliable. Our choices are to skip
                # this search result entirely or to attempt to extract the pattern anyway.
                # For now we skip the result.
                continue
            p_search = p.search(Sentence(parse(body)))
        else:
            body = ''
        result = Result(url=None)
        result.url = url
        result.url_content = (body or "")
        result.query = r.query
        result.sentence = s
        risky_terms = []
        for m in p_search:
            rightNP = ''
            for chunk in m.constituents(p[-1]): # Right NP: join all NP elements in the list.
                rightNP += chunk.string + " "
            risky_terms.append(rightNP)
        if len(risky_terms) > 0:
            result.risky_terms = risky_terms
            risk_results.append(result)
    return risk_results
def __init__(self, expression, type, taxonomy=None):
    self.expression = expression
    self.type = type
    self.taxonomy = taxonomy
    self.pattern = Pattern.fromstring(expression, taxonomy=taxonomy)
s = parsetree("When I sleep the big white rabbit will stare at my feet.") m = search("rabbit stare at feet", s) print(s) print(m) print() # Why does this work? # The word "will" is included in the result, even if the pattern does not define it. # The pattern should break when it does not encounter "stare" after "rabbit." # It works because "will stare" is one verb chunk. # The "stare" constraint matches the head word of the chunk ("stare"), # so "will stare" is considered an overspecified version of "stare". # The same happens with "my feet" and the "rabbit" constraint, # which matches the overspecified chunk "the big white rabbit". p = Pattern.fromstring("rabbit stare at feet", s) # Now it matches only what the pattern explicitly defines (=no match). p.strict = True m = p.search(s) print(m) print() # Sentence chunks can be matched by tag (e.g. NP, VP, ADJP). # The pattern below matches anything from # "the rabbit gnaws at your fingers" to # "the white rabbit looks at the carrots": p = Pattern.fromstring("rabbit VP at NP", s) m = p.search(s) print(m) print()
from pattern.search import Pattern
from pattern.en import parsetree

t = parsetree('Chuck Norris is cooler than Dolph Lundgren.', lemmata=True)
p = Pattern.fromstring('{NP} be * than {NP}')
m = p.match(t)
print m.group(1)
print m.group(2)

from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('humidity')

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
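# For comparison, the NLTK objects above in use (standard NLTK API;
# the results shown are the usual outputs):
print lmtzr.lemmatize('humidity')  # 'humidity' (already a lemma)
print st.stem('maximum')           # 'maxim'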