def Chunk(self, sentence, node='NP', grammer=r"""
          NP: {<DT|PP\$>?<JJ>*<NN>}
              {<NNP>+}
          """):
    '''
    Takes POS-tagged sentences and returns a list of nouns and noun phrases;
    this is done by a form of regex matching included in the NLTK library.

    @param sentence: the POS-tagged sentences that are going to be chunked
    @param node='NP': the node label to chunk
    @param grammer: the chunking grammar (a regex over POS tags) to use
    @return: a list of the chunked phrases as strings.
    '''
    tmp = []
    cp = RegexpParser(grammer)
    for sent in sentence:
        for phrase in self.sub_leaves(cp.parse(sent), node):
            tmp.append(phrase)
    results = []
    for phrase in tmp:
        string = ""
        for (word, tag) in phrase:
            string = string + word + " "
        results.append(string[:-1])
    return results
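# A minimal standalone sketch (not part of the original class) showing what the
# default grammar above matches; it assumes the NLTK 'punkt' tokenizer and
# 'averaged_perceptron_tagger' data have already been downloaded.
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

demo_grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}
        {<NNP>+}
"""
demo_parser = RegexpParser(demo_grammar)
demo_tree = demo_parser.parse(pos_tag(word_tokenize("The quick brown fox saw John")))
for subtree in demo_tree.subtrees(lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))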
def ProcessWoeds(self, arr):
    tagged = pos_tag(arr)
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>}"""
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    return chunked
def get_chunks(tagged_sentences):
    master_list = []
    master_noun = []
    master_adj = []
    grammar = r"""
        CHUNK1: {<NN.*><.*>{0,3}<JJ.*>}    # Any noun, terminated with any adjective
        CHUNK2: {<JJ.*><.*>{0,3}<NN.*>}    # Any adjective, terminated with any noun
    """
    cp = RegexpParser(grammar)
    for sent in tagged_sentences:
        tree = cp.parse(sent)
        for subtree in tree.subtrees(
                filter=lambda t: t.label() in ['CHUNK1', 'CHUNK2']):
            if (str(subtree).find('NN') > 0 or str(subtree).find('NNS') > 0
                    or str(subtree).find('NNP') > 0) and (
                        str(subtree).find('JJ') > 0
                        or str(subtree).find('JJS') > 0
                        or str(subtree).find('JJR') > 0):
                nouns = [word for word, tag in subtree.leaves()
                         if tag in ['NN', 'NNS', 'NNP']]
                adjss = [word for word, tag in subtree.leaves()
                         if tag in ['JJ', 'JJR', 'JJS']]
                master_noun.extend([nouns])
                master_adj.extend([adjss])
    return [m[0] + ":" + n[0] for m, n in zip(master_noun, master_adj)]
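# Possible usage of get_chunks above (an illustrative sketch; it assumes the
# standard NLTK tokenizer and tagger data are available and that get_chunks is
# in scope):
from nltk import pos_tag, sent_tokenize, word_tokenize

text = "The fast car has a powerful engine."
tagged_sents = [pos_tag(word_tokenize(s)) for s in sent_tokenize(text)]
print(get_chunks(tagged_sents))  # noun:adjective pairs pulled from the matched chunks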
def parse_request(message):
    tagPatterns = [
        (r'(honda|toyota|ford|kia|hyundai|audi|bmw|opel|mitsubishi|mazda|skoda|skoda|subaru)$', 'VENDOR'),
        (r'([a-zA-Z0-9]+)$', 'MODEL'),
        (r'(от|для)$', 'PREP'),
        (r'(нах|бля|твою мать)$', 'PROFANITY'),
        (r'([а-яА-Я]+)$', 'PART_NAME'),
    ]
    tagger = nltk.RegexpTagger(tagPatterns)
    taggedRequest = tagger.tag(nltk.word_tokenize(message))

    chunker = RegexpParser(r'''
        S: {<CAR> <PREP>? <PART_NAME>}
        MODEL: {<MODEL>+}
        VENDOR: {<VENDOR>}
        CAR: {<VENDOR> <MODEL>}
        PROFANITY: {<PROFANITY>+}
        PART_NAME: {<PART_NAME>+}
    ''')
    tree = chunker.parse(taggedRequest)
    car = list(tree.subtrees(lambda t: t.label() == 'VENDOR'))

    parsed_request = {}
    # Hack with try/except
    try:
        parsed_request['vendor'] = list(
            tree.subtrees(lambda t: t.label() == 'VENDOR'))[0].leaves()[0][0]
    except Exception:
        parsed_request['vendor'] = None
    try:
        parsed_request['model'] = ' '.join([
            leave[0] for leave in list(
                tree.subtrees(lambda t: t.label() == 'MODEL'))[0].leaves()
        ])
    except Exception:
        parsed_request['model'] = None
    try:
        parsed_request['part_name'] = ' '.join([
            leave[0] for leave in list(
                tree.subtrees(lambda t: t.label() == 'PART_NAME'))[0].leaves()
        ])
    except Exception:
        parsed_request['part_name'] = None
    try:
        if len(list(tree.subtrees(lambda t: t.label() == 'PROFANITY'))):
            parsed_request['profanity'] = True
        else:
            parsed_request['profanity'] = False
    except Exception:
        parsed_request['profanity'] = False
    return parsed_request
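# A hedged usage sketch for parse_request above; the sample request string is
# hypothetical and NLTK's 'punkt' tokenizer data is assumed to be installed.
import nltk  # parse_request relies on nltk.word_tokenize and nltk.RegexpTagger

request = parse_request('фара для honda civic')
print(request['vendor'], request['model'], request['part_name'], request['profanity'])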
def generate_chunks(tagged_sent,
                    expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    chunks = []
    chunkParser = RegexpParser(expression)
    try:
        if len(tagged_sent) == 0:
            tree = Tree('S', [])
        else:
            tree = chunkParser.parse(tagged_sent, trace=0)
        for subtree in tree.subtrees():
            if subtree.label() == "CHUNK":
                chunks.append(subtree.leaves())
    except ValueError:
        chunks = []
    return chunks
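# A small usage sketch for generate_chunks above; the tagged sentence is
# hand-built with the lowercase tagset ('adj', 'n', 'prp') that the default
# expression expects, since no specific tagger is shown here.
hand_tagged = [('linda', 'adj'), ('casa', 'n'), ('de', 'prp'), ('campo', 'n')]
for leaves in generate_chunks(hand_tagged):
    print(' '.join(word for word, tag in leaves))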
class RegexpChunker(Chunker):
    """
    This n-gram tagger / chunker uses grammars to detect phrases.

    setupData: the grammar string
    """

    def __init__(self, setupData):
        super(RegexpChunker, self).__init__(setupData)
        self.chunker = RegexpParser(setupData)

    def tag(self, data):
        if self.fixer_function:
            data = self.fixer_function(data)
        iobs = None
        try:
            parsedTree = self.chunker.parse(data)
            print(parsedTree)
            iobs = tree2conlltags(parsedTree)
        except Exception:
            pass
        return iobs
def _chunker(self, tuple_sent):
    """Chunk base-phrases using chunking rules.

    Args:
        tuple_sent (list(tuple(str, str)))

    Returns:
        Tree('S', [Tree('CHUNK', [(str, str), (str, str)]), (str, str), ...]): chunked sentence
    """
    chunkTreeList = []
    chunker = RegexpParser(self._ChunkingRule(self._CHUNK_RULE_VXP_))
    chunk_struct = chunker.parse(tuple_sent)
    return chunk_struct
def find_keywords(text):
    """
    Extracts keywords from text.

    Args:
        text: A text fragment.

    Returns:
        A list containing the extracted keywords.
    """
    grammar = r'''
        KEYWORD: {<NNP><NNP>+}
                 {<NN.*><NN.*>+}
                 {<JJ>+<NN>+}
    '''
    parser = RegexpParser(grammar)

    sentences = []
    words = []
    keywords = []

    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        if not tokens:
            continue
        sentences.append(tokens)
        words += tokens

    collocations = find_collocations(words)

    for sentence in sentences:
        tree = parser.parse(pos_tag(sentence))
        for node in _select_nodes(tree, ['KEYWORD']):
            word = ' '.join(map(lambda p: p[0], node))
            if word in collocations:
                keywords.append(word)

    keywords = sorted(keywords, key=lambda k: len(k.split()), reverse=True)

    instances = {}
    for k in keywords:
        key = k
        for existing in instances.keys():
            if re.match(k, existing):
                key = existing
                break
        instances[key] = instances.get(key, 0) + 1

    # Sort by frequency; use a list so this also works on Python 3,
    # where dict.items() cannot be sorted in place.
    results = sorted(instances.items(), key=lambda item: int(item[1]), reverse=True)
    return [item[0] for item in results]
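# Standalone sketch of the KEYWORD grammar used above (find_collocations and
# _select_nodes are not shown in this snippet, so this just inspects the raw
# chunks); assumes the usual NLTK tokenizer and tagger data are installed.
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

kw_parser = RegexpParser(r'''
    KEYWORD: {<NNP><NNP>+}
             {<NN.*><NN.*>+}
             {<JJ>+<NN>+}
''')
tree = kw_parser.parse(pos_tag(word_tokenize("New York has busy subway stations")))
for subtree in tree.subtrees(lambda t: t.label() == 'KEYWORD'):
    print(subtree.leaves())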
def rule_based_reqs_chunk(tagged_reqs, ids):
    chunker = RegexpParser(ruleset)
    terms = []
    term_index = []
    for i, t in enumerate(tagged_reqs):
        s = chunker.parse(t)
        for c in s:
            if not isinstance(c, tuple):
                if c.label() == 'NP':
                    term = []
                    for tagged_word in c:
                        if (tagged_word[1] != 'DT') and (tagged_word[1] != 'PRP$'):
                            term = term + [tagged_word[0]]
                    terms.append(term)
                    term_index.append(i)
    return terms, term_index
def parse(self):
    """
    Parse the tokenized text with our grammar to collect the word groups
    that contain a named entity.
    """
    if self.own_tag:
        rp = RegexpParser(Parser.GRAMMAR_OWN_TAG)
    else:
        rp = RegexpParser(Parser.GRAMMAR)
    tree = rp.parse(self.tokens)
    for subtree in tree.subtrees():
        if subtree.label() == "S":
            continue
        self.tagged_nodes.append([subtree.label(), subtree.leaves()])
    print(self.tagged_nodes)
def additionalExtractions(dep_triples, tagged_sentence, svo_triples):
    if not svo_triples:
        return None
    grammar = "SmallNP: {(<CD.*>|<JJ.*>)<NN.*>+}"
    cp = RegexpParser(grammar)
    chunk = cp.parse(tagged_sentence)
    triple_array = []
    for subtree in chunk.subtrees():
        if subtree.label() == 'SmallNP':
            for triple in svo_triples:
                pos = subtree.leaves()
                loc1 = tag_index(pos, triple[0])
                if loc1 != -1:
                    triple_array.extend(chunk_triples(pos, loc1))
                loc2 = tag_index(pos, triple[2])
                if loc2 != -1:
                    triple_array.extend(chunk_triples(pos, loc2))
    return triple_array
def extractPossibleTerms(root, fileids):
    # get corpus
    # root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)

    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)

    # get terms
    terms = set()
    print(len(reader.sents()))
    i = 0
    for sent in reader.sents():
        i += 1
        if i % 100 == 0:
            print(i)
        tree = chunker.parse(pos_tag(sent))
        # exclude the sentence node; Tree.label() replaces the deprecated .node attribute
        for t in tree.subtrees(lambda t: t.label() != 'S'):
            terms.add(' '.join([el[0] for el in t]))
    return terms
def preprocessing(self, desc):
    # Strip punctuation that would confuse the tagger
    # (commas become spaces, everything else is removed)
    desc = desc.replace(",", " ")
    for ch in "!@#%(){}:`[]'*&^":
        desc = desc.replace(ch, "")
    print(desc)

    if "I/O" in desc:
        desc = desc.replace("I/O", "IO")
    desc = desc.replace("/", " and ")

    tokenized = nltk.word_tokenize(desc)
    posTag = nltk.pos_tag(tokenized)

    grammar = '''RB: {<RB> | <RBS> | <RBR>}'''
    chunker = RegexpParser(grammar)
    chunked = chunker.parse(posTag)
    print(chunked)

    # Remove the adverbs picked up by the RB chunks from the description
    for n in range(len(chunked)):
        if str(chunked[n]).startswith('(RB'):
            if n == 0:
                s = str(chunked[n]).split(" ")
                ss = s[1].split("/")
                removalWord = ss[0]
                desc = desc.replace(removalWord + " ", "")
            if n > 0 and n <= len(chunked):
                s = str(chunked[n]).split(" ")
                ss = s[1].split("/")
                removalWord = ss[0]
                desc = desc.replace(" " + removalWord, "")
    return desc
def exctract_ngrams(self, tagged_sent):
    '''
    Extract ngrams from the previously tagged sentence, given a list of chunk rules.

    Keyword arguments:
    @param tagged_sent: the POS-tagged sentence whose ngrams need to be extracted
    '''
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    ngrams = []
    for item in self.__leaves(tree):
        if not item == tagged_sent:
            probable_ngram = ' '.join(self.__stemmer.stem(word.lower())
                                      for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                ngrams.append(probable_ngram)
    return ngrams
def chunking_noun(document):
    # Get the words in the document
    words = word_tokenize(document)
    tagged = nltk.pos_tag(words)
    counts = Counter(tag for WORD, tag in tagged)
    counts = dict(counts)
    # print(counts)
    chunkGram = r"""PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"""
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    search_keywords = []
    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            search_keyword = ' '.join([x for x, y in tree.leaves()])
            search_keywords.append(search_keyword)
    # Keep only phrases of two or three words
    search_keywords = [w for w in search_keywords
                       if len(w.split(' ')) > 1 and len(w.split(' ')) <= 3]
    return search_keywords, tagged, counts
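# Possible call for chunking_noun above (requires the NLTK 'punkt' and
# perceptron-tagger data; the sample document text is made up):
keywords, tagged, counts = chunking_noun("Deep learning models need large training data sets")
print(keywords)  # two- or three-word noun phrases found in the document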
def get_noun_phrases(text_list, tagger):
    noun_phrases = []
    tagged_texts = [tagger.tag(text.split()) for text in text_list]
    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'
    chunkParser = RegexpParser(expression)
    for tagged_sent in tagged_texts:
        try:
            if len(tagged_sent) == 0:
                tree = Tree('S', [])
            else:
                tree = chunkParser.parse(tagged_sent, trace=0)
            for subtree in tree.subtrees():
                if subtree.label() == "NOUN_PHRASE":
                    noun_phrases.append([el[0] for el in subtree.leaves()])
        except ValueError:
            noun_phrases = []
    return noun_phrases
def extract_candidate_phrases(document_obj, parts_of_speech_re=DEFAULT_RE):
    '''
    :param document_obj: document from which you want to extract parts of speech (candidate phrases)
    :param parts_of_speech_re: regular expression describing the part-of-speech structure
    :return: dict mapping sentence id to the list of candidate phrases for that sentence
    '''
    candidate_phrases = {}
    # get sentences of the document
    sentences = document_obj.get_sentences()
    # for each sentence
    for sentence in sentences:
        sentence_id = sentence.get_sentence_id()
        # get tokens
        tokens_objs = sentence.get_tokens()
        # list of tuples with token and its pos
        token_pos_list = [(token_obj.get_token_str(), token_obj.get_token_pos())
                          for token_obj in tokens_objs]
        # create regex parser with regular expression of tags
        regex_parser = RegexpParser(parts_of_speech_re)
        sentence_regex_tree = regex_parser.parse(token_pos_list)
        # get all subtrees carrying the stage-marker label
        match_subtrees = sentence_regex_tree.subtrees(
            filter=lambda t: t.label() == STAGE_MARKER)
        sentence_candidate_phrases = []
        # add candidate phrases
        for subtree in match_subtrees:
            leaves_str = ' '.join(
                [leave_token_pos[0] for leave_token_pos in subtree.leaves()])
            sentence_candidate_phrases.append(leaves_str)
        candidate_phrases[sentence_id] = sentence_candidate_phrases
    return candidate_phrases
def get_search_tags(a, verbose=False):
    if verbose:
        print()
        print('-' * 100)
        print("\tRunning `get_search_tags`...")
        print('-' * 100)

    search_tag_parser = RegexpParser("STAG: {\
        (<RB>|<RBR>|<RBS>|<VB>|<VB[A-Z]>|<IN>|<CC>)\
        (<JJ>|<JJR>|<JJS>|<DT>)\
        (<NN>|<NNS>|<NNP>|<NNPS>)+\
        }")

    pos_tags = pos_tag(word_tokenize(a))
    if verbose:
        print("Part of Speech Tags:", pos_tags, '\n')

    data = search_tag_parser.parse(pos_tags)
    if verbose:
        print("Matched Search Tags:", data)

    return extract_tags(data)
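# get_search_tags depends on an extract_tags helper that is not shown here, so
# this sketch only prints the STAG subtrees matched by the same pattern
# (assumes the usual NLTK tokenizer and tagger data are installed):
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

stag_parser = RegexpParser(
    "STAG: {(<RB>|<RBR>|<RBS>|<VB>|<VB[A-Z]>|<IN>|<CC>)"
    "(<JJ>|<JJR>|<JJS>|<DT>)"
    "(<NN>|<NNS>|<NNP>|<NNPS>)+}")
tree = stag_parser.parse(pos_tag(word_tokenize("She bought the car yesterday")))
for subtree in tree.subtrees(lambda t: t.label() == 'STAG'):
    print(subtree.leaves())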
def extract_candidate_keywords(document):
    # Get the words in the document
    words = word_tokenize(document)
    # Chunk first to get 'Candidate Keywords'
    tagged = nltk.pos_tag(words)
    chunkGram = r"""PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"""
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    candidate_keywords = []
    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            candidate_keyword = ' '.join([x for x, y in tree.leaves()])
            candidate_keywords.append(candidate_keyword)
    candidate_keywords = [w for w in candidate_keywords
                          if len(w) > 3 and len(w.split(' ')) < 6]
    # print("Data XYZ:", candidate_keywords)
    return candidate_keywords
class NLTKChunker(PackProcessor):
    r"""A wrapper of the NLTK chunker."""

    def __init__(self):
        super().__init__()
        self.chunker = None

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""This defines a basic config structure for NLTKChunker."""
        config = super().default_configs()
        config.update({
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
            'token_component': None,
            'sentence_component': None,
        })
        return config

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(
                Sentence, components=self.configs.sentence_component):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               components=self.configs.token_component))
            tokens = [(token.text, token.pos) for token in token_entries]
            cs = self.chunker.parse(tokens)
            index = 0
            for chunk in cs:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    phrase.phrase_type = chunk.label()
                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('is', 'VBZ')
                    index += 1
class NLTKChunker(PackProcessor):
    r"""A wrapper of the NLTK chunker."""

    def __init__(self):
        super().__init__()
        self.chunker = None
        self.token_component = None

    # pylint: disable=unused-argument
    def initialize(self, resource: Resources, configs: HParams):
        self.chunker = RegexpParser(configs.pattern)

    @staticmethod
    def default_configs():
        r"""This defines a basic config structure for NLTKChunker."""
        return {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
        }

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               component=self.token_component))
            tokens = [(token.text, token.pos) for token in token_entries]
            cs = self.chunker.parse(tokens)
            index = 0
            for chunk in cs:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    kwargs_i = {"phrase_type": chunk.label()}
                    phrase.set_fields(**kwargs_i)
                    input_pack.add_or_get_entry(phrase)
                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('is', 'VBZ')
                    index += 1
class TreeChunker(ContextChunker):
    def __init__(self,
                 patterns: str,
                 loop: int = 1,
                 trace: int = 0,
                 attribute: str = 'pos',
                 apply_iob2: bool = True) -> None:
        self.__attribute = attribute
        self.__regex_parser = RegexpParser(patterns,
                                           root_label='',
                                           loop=loop,
                                           trace=trace)
        self.__apply_iob2 = apply_iob2

    def tag(self, context: Context) -> List[str]:
        tokens_to_chunk = [
            'NULL' if tk == '' else tk for tk in context.get(self.__attribute)
        ]
        chunk_struct = list(zip(context.get('tokens'), tokens_to_chunk))
        return self._traverse_tree(self.__regex_parser.parse(chunk_struct))

    def _traverse_tree(self, tree, is_subtree: bool = False):
        tags = []
        for i, subtree in enumerate(tree):
            if isinstance(subtree, nltk.tree.Tree):
                tags.extend(self._traverse_tree(subtree, True))
            else:
                tag = tree.label()
                if is_subtree:
                    index = ''
                    if self.__apply_iob2:
                        index = 'B-' if i == 0 else 'I-'
                    tag = f'{index}{tag}'
                tags.append(tag)
        return tags
class PostPatternStrategy(Strategy):
    """
    In some cases the phrases we want to detect depend on the words themselves
    rather than on their grammatical class. We can also be more precise if we
    can consider several levels of the tree at once, for example phrases and
    words together inside one rule as a single token. This strategy lets us
    look at the tree in height and width, so the grammars we write can be more
    precise and flexible.
    """

    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):
        cleanSentence = feature
        tree = None
        try:
            grammar_pattern_to_clean = r'_.*'  # separator between levels inside a single token
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            # The sentences to parse no longer use the POS tags, but the IOB tags and the words.
            wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            # Back from a tree to a list of tuples again.
            loc_tags = tree2conlltags(flatten_deeptree(tree))
            cleanSentence = cleanIobs(words, post, loc_tags,
                                      grammar_pattern_to_clean,
                                      modified_chunk_pattern, clean_pattern)
        except Exception:
            pass
        return cleanSentence
train_data = data[:4000]
test_data = data[4000:]
print(train_data[7])

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print(tagged_simple_sent)

# Chunking: group determiner + adjectives + noun into an NP
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

# Chinking: chunk everything, then remove (chink) verbs and prepositions
chink_grammar = """
NP: {<.*>+}        # chunk everything as NP
    }<VBD|IN>+{    # chink sequences of VBD and IN
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

tagged_sentence = tag(sentence)
print(tagged_sentence)

grammar = """
def GetPatternsTree(tagsList, pattern, patternName):
    gramaticalAnalyse = RegexpParser(pattern)
    tree = gramaticalAnalyse.parse(tagsList)
    patt = ExtractPhrases(tree, patternName)
    return patt
from tagged_article import TaggedArticle


def sanitizeTags(taggedList):
    sanitizedList = []
    for key, value in taggedList:
        if not value:
            value = 'NNP'
        sanitizedList.append((key, value))
    return sanitizedList


# Open the redis interface
redisInterface = RedisInterface()

# Prepare the chunker
chunker = RegexpParser(r'''
    Nouns: {<JJ.*>*<NN.*>*}
''')

# Print status
print('Analyzer ONLINE')

# Run the wait-execute loop
articleNumber = 0
while True:
    while not redisInterface.hasArticleData(articleNumber, 'article_data'):
        sleep(1)

    # Retrieve the tagged data from redis
    taggedArticleObject = redisInterface.getArticleData(articleNumber, 'article_data')
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]

# forth: the high-level RegexpParser interface
chunker = RegexpParser(r'''
    NP: {<DT><NN.*><.*>*<NN.*>}
        }<VB.*>{''')
print(chunker.parse(s))

# back: build the same chunker from low-level chunk/chink rules
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

chunker = RegexpChunkParser([ur, ir])
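# A possible continuation (not in the original snippet): apply the rule-based
# chunker built from ur and ir to the same sentence tree.
print(chunker.parse(Tree('S', s)))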
def chunk(self, posTaggedQuote):
    '''Holds the chunkers used by the condensed class'''
    quoteItemCondensedList = []  # Need to zero this out for testing, might take away later

    EMPChunker = RegexpParser(r"""
        EMP:                                                         #Emotion Phrase
        {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                    #Modal, verb, adjective, comma, conjunction
        {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                       #Modal, verb, adjective, conjunction
        {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                        #Verb, adjective, comma, conjunction
        {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                           #Verb, adjective, conjunction
        {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                           #Verb, adjective, conjunction
        {(<VBP>|<VB>|<VBZ>|<VBD>)<RB><JJ>}                           #Verb, adverb, adjective
        {<MD><RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                       #Modal, adverb, verb, adjective
        {<RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                           #Adverb, verb, adjective
        {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                           #Modal, verb, adjective
        {(<VBP>|<VB>|<VBZ>|<VBD>)<TO>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}   #Verb, "to", verb, adjective
        {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                               #Verb, adjective
        """)

    PRPHChunker = RegexpParser(r"""
        PRPH:                              #Preposition Phrase
        {<.*>*<PRP><.*>*<EMP>}             #Anything, <PRP>, anything, EMP chunk
        {<EMP><.*>*<PRP><.*>*}             #EMP chunk, anything, <PRP>, anything
        }<EMP>{                            #Chink at the EMP chunk, recursion!
        """)

    # This is going to have to be recursive, to chunk the entire phrase.
    # This section chunks and condenses: the EMP chunk becomes "EMP",
    # then sets the happy level of the condensed quoteItem.
    EMPChunked = EMPChunker.parse(posTaggedQuote)
    for piece in EMPChunked:
        if type(piece) != tuple:
            # self.quoteItemCondensedList.append((piece, 'EMP'))  # TESTING
            self.quoteItemCondensedList.append(('', 'EMP'))  # TESTING
        else:
            self.quoteItemCondensedList.append(piece)
    self.printCondensed()

    # Simulating the recursion, PRPH chunk next.
    # Want to chunk everything separately, then figure out the best recursive algorithm.
    newQuoteItemCondensedList = self.quoteItemCondensedList
    self.quoteItemCondensedList = []  # Clear the list to condense more
    PRPHChunked = PRPHChunker.parse(newQuoteItemCondensedList)
    for piece in PRPHChunked:
        if type(piece) != tuple:
            # self.quoteItemCondensedList.append((piece, 'PRPH'))  # TESTING
            self.quoteItemCondensedList.append(('', 'PRPH'))  # TESTING
        else:
            self.quoteItemCondensedList.append(piece)
    self.printCondensed()

    newQuoteItemCondensedList = self.quoteItemCondensedList
    self.quoteItemCondensedList = []  # Clear the list to condense more
    PRPHChunked = PRPHChunker.parse(newQuoteItemCondensedList)
    for piece in PRPHChunked:
        if type(piece) != tuple:
            # self.quoteItemCondensedList.append((piece, 'PRPH'))  # TESTING
            self.quoteItemCondensedList.append(('', 'PRPH'))  # TESTING
        else:
            self.quoteItemCondensedList.append(piece)
    self.printCondensed()
from util import sub_leaves

SINGLE_WORD_FREQ_CUT_OFF = 6

PATTERNS = r'''
    NP: {<CD|VBN>?<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''

PATTERNS_X = r'''
    NP: {<NN.*|JJ.*|CD>*<NN.*|VBG><CD>?}
        {<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''

PATTERNS_ALT = r'''
    NP: {<NN.*|JJ.*>*<NN.*><CC><NN.*|VBG><CD>?}
        {<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''

# ('2009', 'CD'), ('Grammy', 'NNP'), ('Awards', 'NNS')
NP_CHUNCKER = RegexpParser(PATTERNS)

EARLY_CANDIDATE_CUTOFF = 25
LATE_CANDIDATE_CUTOFF = 10


def extract_candidates(tagged_sentences):
    '''
    Returns three lists:
      - the candidate key concepts of the given document;
      - the candidate key concepts occurring early in the given document; and
      - the candidate key concepts occurring late in the given document.

    @param tagged_sentences: The POS tagged document.
    '''
    # print(tagged_sentences)
    candidates = []
    early = set([])
# Regex-based shallow parser.
# The Tree structures used to represent parsed sentences in NLTK get converted
# to ChunkString objects here.
# Create a RegexpParser object using chunking and chinking rules
# (classes ChunkRule and ChinkRule).
smple_sntnc = 'The brown fox is quick and he is jumping over the lazy dog'

# Create POS tagged tokens from the sample sentence
tagged_sentence = tag(smple_sntnc)
print(tagged_sentence)

# Create the shallow parser
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP: {<IN>}
VP: {<MD>?<VB.*>+}
"""
rc = RegexpParser(grammar)

# Shallow parse the sample sentence
c = rc.parse(tagged_sentence)
print(c)

# Evaluate parser performance on test data
print(rc.evaluate(test_data))
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

nltk.download('averaged_perceptron_tagger')

phrase = "I love Ice Cream. I also like steak"

tokenized_sentences = sent_tokenize(phrase)
tokenized_phrases = [
    word_tokenize(sentence) for sentence in tokenized_sentences
]
tagged_words = [pos_tag(phrase) for phrase in tokenized_phrases]
print(tagged_words)

grammar = r"""
NP: {<PRP|NN|NNP>}
"""
parser = RegexpParser(grammar)
results = [parser.parse(sentence) for sentence in tagged_words]
print(results)

results[0].draw()
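# Optional follow-up sketch: pull out the matched NP words as plain strings
# instead of drawing the first tree (uses the same parser and results as above).
for tree in results:
    for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
        print(' '.join(word for word, tag in subtree.leaves()))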
def __init__(self,grammar="",loop=2): super(PostPatternStrategy,self).__init__() self.postChunker=RegexpParser(grammar,loop) self.grammar=grammar self.loop=loop
class Chunker:
    def __init__(self):
        grammar = r'''
            R-DATE: {<IN><CD><TO><CD>}
            R-DATE: {<IN><CD><IN><CD>}
            R-DATE: {<JJ><CD><CC><CD>}
            FULL-DATE: {<IN><CD><NNP><CD>}
            FULL-DATE: <VB.*>{<CD><NNP><CD>}
            MONTH-DATE: {(<IN|DT>)?<NNP><CD>}
            NP: {<JJR><IN><CD><NNS>}
            NP: {<IN><CD><NNS>}
            NP: {<CD><IN><DT><CD><NNS>(<JJ>)?}
            DM_DATE: {<IN><CD><NNP>}(<,>|<NN.*>)
            DATE: {<IN>(<DT>)?<CD>}
            DT-DATE: {<DT><CD>}
            POS-DATE: <POS>{<CD>}
            V-DATE: {<IN|CD><JJ><CD>}
            DATE: (<,>)?{<CD>}<,>
            N-DATE: (<,>)?{((<.*DATE><,>)+)?<CD><CC><CD>}
            NN-LST: {<NN.*>(<,><NN.*>)+(<,>)?<CC><NN.*>}
            NP: {(<RP|IN|NN.*|.*DT|RB|JJ.*|RB.*|POS|``|"|''|FW|POS-DATE|CD|TO|WRB>)*<NN.*>(<TO>(<DT>)?<NN.*>)?(<RB>)?(<IN>)?(<JJ|RB|CD|DT|POS>)*}
            NP: {<P-DATE><NP>}
            NP: {<NP><NP>}
            NP: {<NP><,><NP><,>}
            CC-NP: {<NP>(<CC><NP>)+}
            PP: {((<PDT>)?<DT>)?(<RB|IN|WRB|WDT|TO|JJ|PRP>)*<PRP.*>(<MD>)?}
            PP: {<WP|WRB>}
            PP: {<IN><WDT>(<DT|RBR>)*}
            PP: <,>{<DT><JJ>}
            NP: {<NP><PP><NP>}
            P-NP: {<PP><NP>(<,><NP><,>)?}
            C-PP: {(<CD><PP>|<PP><CD>)}
            CC-P-NP: {<P-NP|PP><CC><NP>}
            NP: {<NP><,>((<,|CC>)*<.*NP>)*<,>}
            VP: {<VB.*><IN><TO><DT><VB.*>}
            VP: {<VB.*><RP>}
            VP: {(<IN|TO|VB.*|.*DT|RB|JJ|EX|MD>)*<VB.*>(<JJ>)?(<RB>(<TO|JJ|>)?)?}
            VP: {<IN><DT><VB.*>(<RB><TO>)?}
            VP: {<RB|VB.*|MD|TO>*<VB.*><RB|VB.*|MD|TO>*}
            VP: {<VP><IN>}
            VP: {<IN><VP>(<RP>)?<TO>}
            VP: {((<DT>)?<IN>)?<WDT><VP>}
            VP: {<IN><DT-DATE><VP>}
            Y-DATE: <JJ>{<CD>}
            VP: {<JJ>}<Y-DATE>
            CC-VP: {<VP><NP><CC><VP><NP>}
            CC-NP: <VP>{<NP>(<,><NP>)*<CC><NP>}
            D-NP : <VP>{<.*DATE><.*NP>}
            CLAUSE-P: <,|CC>{<VP><P-NP>}(<,>|<CC>|<.*DATE>)
            CLAUSE-NS: <,>(<CC>)?{(<VP><.*NP>)+}<,>
            CLAUSE-NS: <CC>{(<VP><.*NP>)+}
            CLAUSE: {<NP>(<VP><.*NP>|<CC-VP>)+(.*P-NP)?}
            CLAUSE-P: {<PP|P-NP>(<VP><.*NP>|<CC-VP>)+}
            CLAUSE-P: <,>{<PP|P-NP><VP>}<,>
            CLAUSE-P: <,>{<PP|P-NP><VP><CLAUSE>}
            CLAUSE: <CC>{<NP><VP><CLAUSE-P>}
            CLAUSE-NS: <,>{<VP><.*NP>}
            CLAUSE-OSL: <CLAUSE-P><CC><,>{<NP>}<,>
            CLAUSE-OSR: <,>{<NP>}<CLAUSE-P>
            CLAUSE: {<NP><CLAUSE-P>}
            D-CLAUSE-P: {<CLAUSE-P><.*DATE>}
            D-CLAUSE-P: <,>{<DATE><CLAUSE-P>}<,>
            D-CLAUSE-P: <,>{<CLAUSE-P><,><VP><.*DATE>}
            D-CLAUSE: {<CLAUSE><.*DATE>}
            D-CLAUSE: {<.*DATE><,><CLAUSE>}<,>
            CLAUSE-NS: {<VP><.*NP>}
            D-CLAUSE-NS: {<CLAUSE-NS><.*DATE>}
            D-CLAUSE-NS: {<VP><NP><.*DATE>}<,>
            D-CLAUSE-NS: <CC>{<.*DATE>(<,>)?<CLAUSE-NS>}
            D-CLAUSE-P: {<P-NP><VP><.*DATE>}
            D-CLAUSE-M-P: {<.*DATE><,><CLAUSE-P>((<,|CC>)+<CLAUSE-P>)+}
            D-CLAUSE-M: {<.*DATE><,><CLAUSE-P>(<,>(<CC>)?<CLAUSE-NS>)+}
            D-CC-CLAUSE: {<.*DATE><CLAUSE><,><CC><CLAUSE>}
            D-CLAUSE: {<.*NP><.*VP><.*DATE>}
            D-CLAUSE: <,>{<.*DATE><.*CLAUSE.*>}
            D-CLAUSE-P: {<CLAUSE-P>(<,>)?(<.*NP>)?<.*DATE>}
            D-CLAUSE-P-L: <D-CLAUSE-P>(<,|CC>)+{<NP>(<,><NP>)*<.*DATE>}
            D-CLAUSE-P: {<.*DATE><,><CLAUSE-P>}
            D-CLAUSE-NS: <.*CLAUSE.*>(<,|CC>)*{<.*DATE>(<,>)?<CLAUSE-NS>}
            DD-CLAUSE: {<D-CLAUSE.*>(<,|CC>)+(<RB>)?<.*DATE>}
            D-CLAUSE-P: {<.*DATE><CLAUSE-P>}(<,>)?
            D-CLAUSE-P: (<,>)?{<CLAUSE-P><CC><D-CLAUSE-NS>}
        '''
        self.chunker = RegexpParser(grammar, loop=1)
        self.exclude = {s for s in string.punctuation
                        if s not in [';', ':', '&', ',', ]}
        self.exclude.add('``')
        self.exclude.add("''")

    def prepare_sentence(self, s: list) -> list:
        s = [n for n in s if n[0] not in self.exclude]
        txt = [w[0] for w in s]
        pos = nltk.pos_tag(txt)
        return [(w, ps, net) for (w, ps), (_, net) in zip(pos, s)]

    @staticmethod
    def tree_label_fix(tree: nltk.tree.Tree) -> nltk.tree.Tree:
        for st in tree:
            if isinstance(st, nltk.tree.Tree):
                if bool(re.match(r'.*CLAUSE.*', st.label())):
                    if not bool(re.match('.*D-.*CLAUSE.*', st.label())):
                        leafs = st.leaves()
                        if any([n for n in leafs if n[2] == 'DATE']):
                            # Fixing the label of the tree
                            new_lbl = 'D-' + st.label()
                            st.set_label(new_lbl)
                            st.label()
                    else:
                        leafs = st.leaves()
                        if not any([n for n in leafs if n[2] == 'DATE']):
                            oldlbl = st.label()
                            new_lbl = re.sub(r'D-', '', oldlbl)
                            st.set_label(new_lbl)
        return tree

    def generate_tree(self, s: list) -> nltk.tree.Tree:
        # noinspection PyTypeChecker
        t1 = self.chunker.parse(s)
        return self.tree_label_fix(t1)
def __init__(self, grammar="", loop=2): super(PostPatternStrategy, self).__init__() self.postChunker = RegexpParser(grammar, loop) self.grammar = grammar self.loop = loop
# t = npc.parse(tmp_arr_pos[0])
print("Finished loading...")
# print(len(t))
# t.draw()
# help(t)

sentCount = 1
sentScore = []  # tuple with (Subj-Obj, Verb-P, )
totalS = []

print("Processing input...")
print("Number of sentences to process: ", len(arr_pos))

# Assemble the full grammar from the partial grammars (vp, prd, cls1, cls2)
# defined elsewhere in the script
for q in ["", vp, prd, cls1, cls2]:
    grammer += q
npc = RegexpParser(grammer)

print("\n\n")
for i in arr_pos:
    print("Reading sentence ", sentCount)
    sentCount += 1
    t = npc.parse(i)
    print(t)
    tmpVP = []
    tmpNP = []
    tmpPrd = []
    tmpCls = []
    x1 = ""
    for x in t:
        try:
            if x.label() == "VP":
farechunker = RegexpParser(r'''
    CARRIER: {<CODESHARE><CODESHARE><CODESHARE><CODESHARE>}
             {<CODESHARE><CODESHARE><CODESHARE>}
             {<CODESHARE><CODESHARE>}
             {<CODESHARE><NN>}
    ROUTE: {<ROUTE>}
    CABIN: {<CABIN>}
    RBD: {<BOOKING><CLASS>}
    CORPORATE_DISCOUNT: {<CORPORATE><DISCOUNT>}
                        {<EFFECTIVE><DISCOUNT>}
    AGENT_DISCOUNT: {<DISCOUNT>}
    FBC: {<FBC><VBD><TO><VBP><DISCOUNT>}
         {<FARE><BASIS>}
    TICKET_VALIDITY: {<TICKET><VALIDITY>}
    LOCATION: {<LOCATIONTYPE><NN><.*>}
    AIRLINE: {<CAT><PACIFIC><AIRWAYS><CITY>}
    CLIENT: <AIRLINE>{<.*><.*>}<TOURCODE>
''')
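# The fare grammar above operates on domain-specific tags rather than POS tags;
# this hand-tagged token list is purely hypothetical, just to show the call:
sample = [('CX', 'CODESHARE'), ('KA', 'CODESHARE'), ('economy', 'CABIN')]
print(farechunker.parse(sample))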
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]

# Create the UnigramTagger on top of the default tagger and train it with
# the tagged sentences from mac_morpho
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
    coment = open("default.txt", "r").read().replace("\n", " ")

# The text is converted into tokens
tokens = nltk.word_tokenize(coment.lower())

# Each token of the text is tagged
tags = etiq.tag(tokens)

# Create the regular-expression analyser containing the patterns of interest
analiseGramatical = RegexpParser(r"""
    PADRAO7: {<N><ADJ>}
    PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
    PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
    PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
    PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
    PADRAO5: {<ADV><V>}
    PADRAO6: {<V><ADV>}
    """)

# The analyser is then used to generate the pattern tree
arvore = analiseGramatical.parse(tags)

x = [ExtractPhrases(arvore, "PADRAO1"), ExtractPhrases(arvore, "PADRAO2"),
     ExtractPhrases(arvore, "PADRAO3"), ExtractPhrases(arvore, "PADRAO4"),
     ExtractPhrases(arvore, "PADRAO5"), ExtractPhrases(arvore, "PADRAO6"),
     ExtractPhrases(arvore, "PADRAO7")]

for aux in range(len(x)):
    print("PADRAO 0" + str(aux + 1) + str(x[aux]))