def __init__(self, nfeatures=100000, doclen=60):
    self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
    # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
    # self.grammar = r'KT: {<RB.>|<JJ.>}'
    self.chunker = RegexpParser(self.grammar)
    self.nfeatures = nfeatures
    self.doclen = doclen
def build_vocabulary(self):
    """
    Generate a list of candidate phrases from the documents, using POS
    tagging and chunking functionality of nltk.
    """
    stop_words = set(stopwords.words('english'))
    vocabulary = []
    for doc in self.documents:
        words = []
        candidates = []
        clean_doc = text_cleaner(doc)
        sentences = sent_tokenize(clean_doc)
        words.extend([word_tokenize(sentence) for sentence in sentences])
        tagged_words = pos_tag_sents(words)
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        # split into a private function
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
        vocabulary.append(candidates)
    vocabulary = list(chain(*vocabulary))
    vocabulary = list(np.unique(vocabulary))
    self.vocabulary = vocabulary
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        # (tuple-parameter lambdas are Python 2 only, so index the tuple instead)
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[2] != 'O'
            )
            if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            )
            if key
        ]

        for phrase in phrases:
            yield phrase
def generate_candidate(texts, method='phrase', remove_punctuation=True):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
            # sentence = re.sub(r'[^\w]', ' ', sentence)
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging
    words_.clear()

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        grammar = r'KT: {(<JJ><NN.*>)' \
                  r' | (<NN.*><NN.*>) ' \
                  r' | (<NN.*><NN.*><NN.*>) ' \
                  r'| (<JJ><JJ><NN.*>+)' \
                  r' | (<JJ><NN.*><NN.*>)' \
                  r' | (<NN.*><JJ><NN.*>) ' \
                  r'| (<NN.*><IN><NN.*>) ' \
                  r'| (<JJ><NN.*><IN><NN.*>) ' \
                  r'| (<NN.*><IN><JJ><NN.*>) ' \
                  r'| (<JJ><NN.*><IN><JJ><NN.*>) }'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
def buildchunkerlist(grammerlst, tagged):
    gtree = []
    for g in grammerlst:
        chunker = RegexpParser(g)
        OP = chunker.parse(tagged)
        if OP.height() >= 3:
            gtree.append(OP.subtrees(lambda t: t.height() == 2))
    return gtree
def parseRelatedFeature(sent, tagged):
    chunker = RegexpParser('''
    OP5: {<.*>+<NN>?<CD><.*>+<NN>?}
    ''')
    OP = chunker.parse(tagged)
    if OP.height() >= 3:
        for m in OP.subtrees(lambda t: t.height() == 2):
            for (word, tag) in m:
                if tag == "NN" and r3.match(word):
                    return True
def getConcepts(text):
    grammar = """
    CONCEPT: {(<DT>)?(<JJ>)?<NN|NNS>+}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if type(i) == Tree and i.label() == "CONCEPT":
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
    return current_chunk
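# Minimal usage sketch for getConcepts (hypothetical input sentence; assumes the
# nltk tokenizer and tagger data are already downloaded).
concepts = getConcepts("The quick brown fox jumped over the lazy dog near the old barn.")
print(concepts)  # e.g. ['brown fox', 'the lazy dog', 'the old barn'], depending on tagger output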
def vocab_gen(texts, bool_key):
    list_word = []
    vocabs = []
    word_write = ""
    phrase_write = ""
    pos_write = ""
    sentences = sent_tokenize(texts)
    sentence_write = "\n".join(sentences)
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        list_word.append(words)

    words_w_pos = pos_tag_sents(list_word)  # POS
    dumb = [j for sub in words_w_pos for j in sub]
    dumb = pos_tag_sents(dumb)
    dumb = [j for sub in dumb for j in sub]
    for i in dumb:
        pos_write += str(i)
        pos_write += "\n"

    # define grammar to pull out the phrases
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    all_tag = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_w_pos])
    for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
        vocabs_temp = ' '.join([word for (word, pos, chunk) in group])
        if bool_key == 'Phrase':
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == True:
                vocabs.append(vocabs_temp)
                phrase_write += vocabs_temp
                phrase_write += "\n"
        else:
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == False:
                vocabs.append(vocabs_temp)
                word_write += vocabs_temp
                word_write += "\n"

    update_file = open(vocabs_word_path, 'w')
    update_file.write(word_write)
    if bool_key == 'Phrase':
        update_file = open(vocabs_phrase_path, 'w')
        update_file.write(phrase_write)
    update_file = open(sentence_path, 'w')
    update_file.write(sentence_write)
    update_file = open(pos_path, 'w')
    update_file.write(pos_write)
    return vocabs
def extract_from_sentences(sentences, add_verbs=True, language="english"):
    """
    Processes Sentence objects to calculate contained Noun Phrases based on a
    given grammar and maps them to the sentences they occur in.

    :param sentences: A list of Sentence objects.
    :param add_verbs: Optional. Default: True. Whether or not verbs are to be
        added to the mapping.
    :param language: Optional. Default: English. The language of the sentences.
    :return: A dictionary mapping tokens to the sentence IDs of the sentences
        they appear in.
    """
    # produce the mapping of sentences to their contained (words, pos) tuples
    pos_dictionary = {}
    NP_GRAMMAR_COMPOUND = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*((<CC>|,)<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*)*}"
    for sentence in sentences:
        pos_dictionary[sentence.sentence_id] = [
            (token, tag) for token, tag in sentence.tokens.items()
        ]

    parser_cmp = RegexpParser(NP_GRAMMAR_COMPOUND)
    term2sentence_id = {}
    lemmatizer = WordNetLemmatizer()

    for sentence_id, pos_tagged_tokens in pos_dictionary.items():
        if add_verbs:
            # updating the inverse occurrence index with verbs
            for subject, tag in pos_tagged_tokens:
                # check if subject is tagged as a verb
                if tag.startswith("VB"):
                    verb = lemmatizer.lemmatize(subject, "v").lower()
                    if verb not in stopwords.words(language):
                        if verb not in term2sentence_id:
                            term2sentence_id[verb] = set()
                        term2sentence_id[verb].add(sentence_id)

        # trying to parse the sentence into a top-level chunk tree
        tree = parser_cmp.parse(pos_dictionary[sentence_id])

        # getting the top-level tree triples and decomposing the NPs
        cmp_triples, simple_trees = get_cooccurence([tree], ignore_stopwords=False, language=language)
        smp_triples, _ = get_cooccurence(simple_trees, ignore_stopwords=True, language=language)

        # updating the inverse occurrence index with NPs
        for subject, _, objecT in cmp_triples + smp_triples:
            if subject.lower() not in term2sentence_id:
                term2sentence_id[subject.lower()] = set()
            if objecT.lower() not in term2sentence_id:
                term2sentence_id[objecT.lower()] = set()
            term2sentence_id[subject.lower()].add(sentence_id)
            term2sentence_id[objecT.lower()].add(sentence_id)
    return term2sentence_id
def get_tokens(text):
    word_list = []
    voc = []
    voc_write = ''
    sent = sent_tokenize(text)
    word_single = word_tokenize(text)

    # mode 'w' creates the file if it does not exist, so no existence check is needed
    k = open('token_log.txt', 'w', encoding='UTF8')
    k.write(str(word_single))

    for i in sent:
        word = word_tokenize(i)
        words = list(map(lambda s: s.lower(), word))
        word_list.append(words)
    words_pos = pos_tag_sents(word_list)

    f = open('pos_log.txt', 'w', encoding='UTF8')
    f.write(str(words_pos))

    # note: the preposition tag is 'IN' (upper case); '<In>' would never match
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    tags = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_pos])
    for key, group in groupby(tags, lambda tag: tag[2] != 'O'):
        voc_temp = ' '.join([word for (word, pos, chunk) in group])
        if key is True and voc_temp not in stopwords.words(
                'english') and voc_temp != 'https':
            voc.append(voc_temp)
            voc_write += voc_temp
            voc_write += '\n'

    f = open('voc_log.txt', 'w', encoding='UTF8')
    f.write(voc_write)
    return voc
def getInstances(text):
    grammar = """
    PRE: {<NNS|NNP|NN|NP|JJ|UH>+}
    MID: {<DT|IN|POS|FW|-|NP|NPS|NN|NNS>+}
    INSTANCE: {(<DT+>)?(<JJ+>)?<PRE>(<MID><PRE>)?}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if type(i) == Tree and i.label() == "INSTANCE":
            # print(i.leaves())
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
    return current_chunk
def __init__(self, name, is_lazy, lazy_directory, debug, rule):
    """
    Constructor of the component.

    @param name: The name of the component.
    @type name: C{string}
    @param is_lazy: True if the component must load previous data, False if
        data must be computed even though they have already been computed.
    @type is_lazy: C{bool}
    @param lazy_directory: The directory used to store previously computed data.
    @type lazy_directory: C{string}
    @param debug: True if the component is in debug mode, else False. When the
        component is in debug mode, it will output each step of its processing.
    @type debug: C{bool}
    @param rule: The rule to parse NP chunks. It is expressed with POS tags.
    @type rule: C{string}
    """
    super(NPChunkExtractor, self).__init__(name, is_lazy, lazy_directory, debug)
    self.set_np_chunker(RegexpParser("NP: " + rule))
class KeyPhraseGenerator():
    """
    Extracts keyphrases from an input list of strings.
    """
    def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):
        self.chunker = RegexpParser(grammar)
        self.stopwords = stopwords

    def clean_text(self, txt):
        """
        Removes emoji and urls from text.
        """
        cleaned = cleaner.remove_emojis(txt)
        cleaned = cleaner.remove_urls(cleaned)
        return cleaned

    def clean_tagged_text(self, tagged_text):
        """
        Removes punctuation from tagged text.
        """
        punct_tagged = lambda word: all(
            unicat(char).startswith("P") and char != "," for char in word)
        cleaned = filter(lambda t: not punct_tagged(t[0]), tagged_text)
        return list(cleaned)

    def extract_keyphrases_single(self, txt):
        """
        Yields keyphrases for one piece of text.
        """
        for sent in txt:
            sent = self.clean_tagged_text(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != "O")
                if key
            ]
            for phrase in phrases:
                if phrase.lower() not in self.stopwords and len(phrase) > 2:
                    yield phrase

    def extract_keyphrases(self, txt_list):
        """
        Returns keyphrases for an input list of strings.
        """
        key_docs = []
        for txt in txt_list:
            tagged_doc = []
            txt = self.clean_text(txt)
            for sent in nltk.sent_tokenize(txt):
                tagged_doc.append(nltk.pos_tag(nltk.word_tokenize(sent)))
            key_docs.append(list(self.extract_keyphrases_single(tagged_doc)))
        return key_docs
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
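# Hedged usage sketch for generate_candidate: the module-level punct_re and
# stop_words referenced above are assumed to be defined roughly as follows.
import re
from nltk.corpus import stopwords

punct_re = re.compile(r'[^\w\s]')
stop_words = set(stopwords.words('english'))

text = "Keyphrase extraction pulls candidate noun phrases from raw text."
print(generate_candidate(text, method='phrase', remove_punctuation=True))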
def get_cooccurence(chunk_trees, ignore_stopwords=True, language="english"):
    """
    Parses a chunk tree and gets co-occurrence of terms.

    :param chunk_trees: Tree from the NLTK RegexpParser, generated over
        POS-tagged sentences using the provided grammar.
    :param ignore_stopwords: Optional. Default: True. Whether stopwords are to
        be ignored or not.
    :param language: Optional. Default: English. The language of the texts over
        which the chunk trees were generated.
    :return: A list of co-occurring tokens and a simple parse tree generated
        over the leaves of the chunks of the provided one.
    """
    triples = []
    simple_trees = []
    lemmatizer = WordNetLemmatizer()
    NP_GRAMMAR_SIMPLE = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+}"
    parser_simple = RegexpParser(NP_GRAMMAR_SIMPLE)
    for t in chunk_trees:
        entities = []
        for chunk in t:
            if isinstance(chunk, Tree) and chunk.label() == 'NP':
                # getting a tree for later processing of triples from the
                # simple noun phrases (if present)
                simple_trees.append(parser_simple.parse(chunk.leaves()))
                words = []
                for word, tag in chunk:
                    if (ignore_stopwords and word in stopwords.words(language)) or \
                            (not any(char.isalnum() for char in word)):
                        # do not process stopwords for simple trees, do not
                        # process purely non-alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lemmatizer.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lemmatizer.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append("_".join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, "close to", e2))
            triples.append((e2, "close to", e1))
    return triples, simple_trees
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only extracts plain
    n-grams automatically, so if we want a different format or a custom
    vocabulary, the vocabulary must be created explicitly.
    '''
    # grammar to extract the noun phrases
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'

    # set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        # helper to unpack the (word, pos, chunk) tuple
        return lambda args: f(*args)

    # tokenize and create pos tags per sentence, then get its IOB tags
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in postag_sents))

    # join B-NP and I-NP tags as one noun phrase, excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key
    ]

    # filter out terms shorter than three characters and pure punctuation
    all_nounphrases = [
        cand for cand in merged_nounphrase
        if len(cand) > 2 and not all(char in punct for char in cand)
    ]

    # select distinct noun phrases
    vocabulary = list(set(all_nounphrases))
    return vocabulary
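# Sketch of how the noun-phrase vocabulary might be used; 'stemmer' above is
# assumed to be a module-level NLTK stemmer such as PorterStemmer. The returned
# list can then be handed to a vectorizer that accepts a fixed vocabulary
# (e.g. TfidfVectorizer(vocabulary=...)).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
docs = ["Noun phrase extraction builds a custom vocabulary.",
        "A TF-IDF vectorizer can accept that fixed vocabulary."]
print(create_phrase_vocabulary(docs))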
def chunk_location_sent(pos_text, temp_text):
    list_of_locs = list()
    chunk_grammar = r"""
    LOC: {((<CD>?<NNP>+<CD>?)|(<CD>?<NN>+<CD>?))+}
    """
    chunker = RegexpParser(chunk_grammar)
    chunked_article = chunker.parse(pos_text)
    for subtree in chunked_article.subtrees():
        if subtree.label() == 'LOC':
            # print(subtree.pprint())
            NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
            # print("LOC: " + NNPs)
            list_of_locs.append(NNPs)
    # print("loc list:", list_of_locs)
    return list_of_locs
def getNounPhrases(self):
    featureSet = []
    # Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
    grammar = r"""
        NBAR: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP: {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    chunker = RegexpParser(grammar)

    for sentence in self.sentences:
        tokens = word_tokenize(sentence)
        if len(tokens) == 0:
            continue
        tagged = pos_tag(tokens)
        tree = chunker.parse(tagged)
        terms = []
        leafCollection = []
        # NLTK 3 renamed Tree.node to Tree.label()
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            leafCollection.append(subtree.leaves())

        for leaf in leafCollection:
            term = [w for w, t in leaf if len(w) > 2]
            phrase = ' '.join(term)
            terms.append(phrase)

        featureSet += terms

    self.convertToFeatureDist(featureSet)
    self.helperObject.saveAllFeaturesExtracted(featureSet)
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar  # use the passed-in grammar rather than the module default
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    )
                    if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
def chunk_name_sent(pos_text, temp_text):
    list_of_names = list()
    chunk_grammar = r"""
    NAME: {<NNP>+}
    """
    chunker = RegexpParser(chunk_grammar)
    chunked_article = chunker.parse(pos_text)
    # print("chunk:", chunked_article)
    for subtree in chunked_article.subtrees():
        if subtree.label() == 'NAME':
            # print(subtree.pprint())
            NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
            # print("NAME: " + NNPs)
            list_of_names.append(NNPs)
    # print("namelist: ", list_of_names)
    return list_of_names
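# Minimal sketch for chunk_name_sent: pos_text is a POS-tagged sentence;
# temp_text is not used by the function, so None is passed here.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("Marie Curie met Albert Einstein in Brussels."))
print(chunk_name_sent(tagged, None))  # e.g. ['Marie Curie', 'Albert Einstein', 'Brussels']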
def extract_words(nodetext, t2, doc, location):
    try:
        # tokenizer = RegexT(r'\w*[a-zA-Z]\w*')
        # return tokenizer.tokenize(nodetext)
        # except TypeError:
        #     return []
        grammar = "NP: {<JJ>*<NN>+}"
        phrases = []
        final_phrases = []
        for sent in sent_tokenize(nodetext):
            doc.add_sentence(Sentence(location, sent))
            tag_list = t2.tag(word_tokenize(sent))
            parser = RegexpParser(grammar)
            result = parser.parse(tag_list)
            for phrase in result:
                # NLTK 3 renamed Tree.node to Tree.label()
                if isinstance(phrase, NLTREE.Tree) and phrase.label() == "NP":
                    phrases.append("_".join([word for word, pos in phrase.leaves()]))
                    # n_phrase = "_".join([word for word, pos in phrase.leaves()])
                    # if any(c.isdigit() for c in n_phrase):
                    #     continue
                    # elif '.' in n_phrase:
                    #     continue
                    # else:
                    #     doc.add_word(Word(location, n_phrase, sent))
    except TypeError:
        return []

    for phrase in phrases:
        if any(c.isdigit() for c in phrase):
            continue
        elif '.' in phrase:
            continue
        else:
            final_phrases.append(phrase)
    return final_phrases
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar  # use the passed-in grammar rather than the module default
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    )
                    if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):
    self.chunker = RegexpParser(grammar)
    self.stopwords = stopwords
def apply_grammar(pos_words):
    grammar_parser = RegexpParser(GRAMMAR)
    return grammar_parser.parse(pos_words)
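# Usage sketch for apply_grammar; GRAMMAR is a module-level constant not shown
# in this snippet, assumed here to be a noun-phrase pattern like the KT grammar
# used in the other examples.
from nltk import pos_tag, word_tokenize

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
tree = apply_grammar(pos_tag(word_tokenize("Automatic keyword extraction of noun phrases")))
print(tree)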
def regex_chunk(self, tagged, pattern):
    pr = RegexpParser(pattern)
    chunked = [pr.parse(sent) for sent in tagged]
    return chunked
def tagChunk(self, taggedword, loops=2):
    # Chunking
    cp = RegexpParser(self.grammar, loop=loops)
    print('tagged word')
    print(taggedword)
    return cp.parse(taggedword)
rules = r"""
ADJ_1: {<ADJ> <INTERJ|break>* <ADJ>+}
ADJ_1: {<ADJ>}
DET: {<NUM_ORD|NUM|PRON_POSS|EGEN_GEN|N_GEN>}
DET2: {<PRON_DEMO|PRON_PERS>}
DET3: {<PRON_UBST>}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <N> <INTERJ|break>* <ADJ_1>}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <ADJ_1|DET|DET3> <INTERJ|break>* <N>+}
NP: {<DET2|PRON_1|DET3|DET> <INTERJ|break>* <DET>* <INTERJ|break>* <ADJ_1|DET3>+ <N>*}
NP: {<DET2|PRON_1|DET|DET3> <INTERJ|break>* <N>+}
NP: {<ADJ_1|DET3> <INTERJ|break>* <N>}
NP: {<PRON_1>}
NP: {<DET2> <INTERJ|break>* <DET3>}
NP: {<PRON_INTER_REL|EGEN|N|DET2|DET3>}
"""

parser = RegexpParser(rules)

tokenized = word_tokenize('I am a bird')
tags = pos_tag(tokenized)


def parse_sentences(data):
    chunked_sentences = []
    for s in data:
        chunked = parser.parse(s)
        chunked_sentences.append(chunked)
    return chunked_sentences


def IOB(list):
    return [
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures=100000, doclen=60):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of its keyphrases.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != 'O')
                if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx + 1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [
            lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()
        ]

    def transform(self, documents):
        docs = [list(self.extract_candidate_phrases(doc)) for doc in documents]
        lexicon = self.get_lexicon(docs)
        clipped = [list(self.clip(doc, lexicon)) for doc in docs]
        return sequence.pad_sequences(clipped, maxlen=self.doclen)
def __init__(self, grammar=GRAMMAR):
    self.grammar = grammar  # use the passed-in grammar rather than the module default
    self.chunker = RegexpParser(self.grammar)
# key: tuple(pronoun, sentence_num)  val: list(list(tuple(noun, pos))), noun not normalized
pronounsent_nounDict = defaultdict(list)

# grammar for tagging noun phrases and pronouns
# DT - determiners eg: The, a, an, my
# JJ - adjectives
# NN.* - any type of noun
# PRP - personal pronoun eg: He, she, I, We, they
grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}
             PR:{<PRP.*>}
          """
rp = RegexpParser(grammar)

count = 0
for s in listOfTaggedSents:
    # tree of chunked parts of the sentence;
    # ParentedTree is used to convert tagged words to a tree structure
    chunkedTree = ParentedTree.convert(rp.parse(s))
    neTree = ne_chunk(s)  # tree with named entity tags
    # print(chunkedTree)
    # chunkedTree.draw()
    # neTree.draw()
    for n in chunkedTree:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == 'NP':
from nltk.corpus import wordnet as wn
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from unicodedata import category as unicat
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS = frozenset(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])

grammar = GRAMMAR
chunker = RegexpParser(grammar)
tweet_tokenizer = TweetTokenizer()
labels = GOODLABELS


def normalize(sent):
    """
    Removes punctuation from a tokenized/tagged sentence and
    lowercases words.
    """
    sent = tweet_tokenizer.tokenize(sent)
    sent = [x for x in sent if not 'http' in x]
    is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
    sent = filter(lambda t: not is_punct(t[0]), sent)
    # sent = map(lambda t: (t[0].lower(), t[1]), sent)
    sent = map(lambda t: t.lower(), sent)
path_to_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar"
path_to_models_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar"
dependency_parser = StanfordDependencyParser(
    path_to_jar=path_to_jar_p, path_to_models_jar=path_to_models_jar_p)

from nltk.chunk.regexp import RegexpParser

grammar = '''
NP: {<DT>? <JJ>* <NN>*}   # NP
P:  {<IN>}                # Preposition
V:  {<V.*>}               # Verb
PP: {<P> <NP>}            # PP -> P NP
VP: {<V> <NP|PP>*}        # VP -> V (NP|PP)*
'''
reg_parser = RegexpParser(grammar)

parser = stanford.StanfordParser(
    model_path="/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/lexparser.sh"
)
lmtzr = WordNetLemmatizer()

# def file_len(fname):
#     with open(fname) as f:
#         for i, l in enumerate(f):
#             pass
#     return i + 1

# ********************** --Create read in fn-- ******************
# fname = "002-0.cex"
indir = '/Users/clairekelleher/Desktop/Thesis/Data/PItt_cookie_all_test'
def summarizer(tex, reduce_per):
    def norm(word, pos='x'):
        # normalizes all words except proper nouns
        word = word.lower()
        if pos not in ['NNP', 'NNPS']:
            wnl = WordNetLemmatizer()
            word = wnl.lemmatize(word)
        return word

    sentList = sent_tokenize(tex)  # list of all tokenized sentences
    # print(sentList)

    # a dictionary key: sentence_number, value: all nouns in the sentence (nouns are normalised)
    sentNounDict = defaultdict(list)
    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                sentNounDict[sentList.index(s)].append(norm(w, pos))
    # print(sentNounDict)

    # a dictionary key: (word, pos), value: all sentences it appears in (word is normalised)
    wordSentDict = defaultdict(list)
    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            wordSentDict[(norm(w, pos), pos)].append(sentList.index(s))
    # print(wordSentDict)

    # list of all nouns in the text
    listOfNouns = list(
        sorted(
            set([
                norm(w, pos) for s in sentList
                for w, pos in pos_tag(word_tokenize(s))
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']
            ])))
    # print(listOfNouns)

    # list of sentences of tokenized words with POS tags - list[tuple(w, pos)]
    listOfTaggedSents = []
    for s in sentList:
        l = [(n, pos) for n, pos in pos_tag(word_tokenize(s))]
        listOfTaggedSents.append(l)
    # print(listOfTaggedSents)

    mostSigNoun = []        # most recently encountered significant noun
    mostSigNounObject = []  # most recently encountered significant noun which is not a person
    mostSigNounPerson = []  # most recently encountered significant noun which has named entity as person

    # key: tuple(pronoun, sentence_num)  val: list(list(tuple(noun, pos))) (noun not normalized)
    pronounNounDict = defaultdict(list)

    # grammar for tagging noun phrases and pronouns
    grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}
                 PR:{<PRP.*>}
              """
    rp = RegexpParser(grammar)

    for s in listOfTaggedSents:
        begin = True
        chunkedTree = ParentedTree.convert(
            rp.parse(s))  # tree of chunked parts of the sentence
        neTree = ne_chunk(s)  # tree with named entity tags
        # print(chunkedTree)
        # chunkedTree.draw()
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    if begin == True:
                        mostSigNoun = [
                            w for w in n
                            if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                        ]
                        # print(mostSigNoun)
                        for ne in neTree:
                            if isinstance(ne, nltk.tree.Tree):
                                if ne[0] in mostSigNoun:
                                    if ne.label() == 'PERSON':
                                        mostSigNounPerson = []
                                        mostSigNounPerson.append(ne[0])
                                    else:
                                        mostSigNounObject = []
                                        mostSigNounObject.append(ne[0])
                        begin = False
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    # print(pron)
                    if pron in ['it', 'its']:  # for objects
                        if len(mostSigNounObject) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounObject)
                        else:  # if mostSigNounObject does not exist
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    else:
                        if len(mostSigNounPerson) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounPerson)
                        else:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    begin = False
                    # print(pronounNounDict)

                    # adding the nouns corresponding to the pronouns to
                    # sentNounDict and wordSentDict
                    for v1 in pronounNounDict[(pron,
                                               listOfTaggedSents.index(s))]:
                        for v11 in v1:  # it is a list of lists
                            sentNounDict[listOfTaggedSents.index(s)].append(
                                norm(v11[0], v11[1]))
                            wordSentDict[(norm(v11[0], v11[1]),
                                          v11[1])].append(
                                              listOfTaggedSents.index(s))
    # print(sentNounDict)
    # print(wordSentDict)
    # print(pronounNounDict)

    for key, val in sentNounDict.items():  # making sentNounDict values unique
        val = list(set(val))
        sentNounDict[key] = val
    # print(sentNounDict)

    # the following code calculates the distance between two phrases
    # a dict: key: (noun or noun(pronoun), sentence_num), value: position in the sentence from the beginning
    distance = defaultdict(int)
    for s in listOfTaggedSents:
        dist = 0
        chunkedTree = ParentedTree.convert(rp.parse(s))
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    tempNoun = [
                        w[0] for w in n
                        if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                    ]
                    for w in tempNoun:
                        distance[(norm(w), listOfTaggedSents.index(s))] = dist
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    tempNoun = pronounNounDict[(pron,
                                                listOfTaggedSents.index(s))]
                    for v1 in tempNoun:
                        for v11 in v1:
                            distance[(norm(v11[0], v11[1]),
                                      listOfTaggedSents.index(s))] = dist
            dist += 1
    # print(distance)

    # the following code assigns a relation factor between two nouns
    nounGraph = np.zeros((len(listOfNouns), len(listOfNouns)))
    for key, value in sentNounDict.items():
        for v1 in value:
            for v2 in value:
                d = 0
                if v2 != v1:
                    d = distance[v1, key] - distance[v2, key]
                nounGraph[listOfNouns.index(v1)][listOfNouns.index(
                    v2)] += float((100 / (abs(d) + 1)))
                # if nounGraph[listOfNouns.index(v1)][listOfNouns.index(v2)] >= 100:
                #     print(v1 + ' ' + v2 + " " + str(d))
    # print(nounGraph)

    # dict to hold noun priorities... key: noun (normalized), value: priority
    nounPriority = defaultdict(int)
    # dict to hold sentence priorities... key: sentence_num, value: priority
    sentencePriority = defaultdict(int)

    def calcNounPriority():
        # calculates the noun priority (sum of weights of all the edges
        # attached to this noun in the noun graph)
        total = 0
        i = 0
        for x in nounGraph:
            total = sum(x)
            nounPriority[listOfNouns[i]] = total
            i += 1
        # print(sorted(nounPriority.items(), key=lambda x: x[1], reverse=True))

    def calcSentPriority():
        # calculates sentence priority (sum of priorities of all nouns in the sentence)
        for key, value in sentNounDict.items():
            total = 0
            for n in value:
                total += nounPriority[n]
            sentencePriority[key] = total

    calcNounPriority()
    calcSentPriority()
    # print(sorted(sentencePriority.items(), key=lambda x: x[1], reverse=True))
    # for i in range(len(sentList)):
    #     print(str(i) + ' ' + sentList[i])

    reducingFactor = 0.9  # 10%
    summary = []  # list to hold the summary
    reduce_per = reduce_per / 100
    # print(reduce_per)
    for i in range(int(len(sentencePriority) * reduce_per)):
        summary.append(max(sentencePriority.items(), key=lambda x: x[1]))
        # print(summary)
        j = summary[-1][0]
        for n in sentNounDict[j]:
            # reduce the priority of all nouns in the picked sentence
            nounPriority[n] *= reducingFactor
        del sentNounDict[j]
        del sentencePriority[j]  # remove the picked sentence
        calcSentPriority()  # recalculate sentence priority
    # print("\n\n")

    i = 1
    s_list = []
    for s in sorted(summary):
        # print(i, sentList[s[0]])
        s_list.append(sentList[s[0]])
        i += 1
    return s_list
def tagChunk(self, taggedword, loops=2):
    # Chunking
    cp = RegexpParser(self.grammar, loop=loops)
    return cp.parse(taggedword)
'''------------------- POS Tagging ------------------------------------------'''
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger

train_sents = treebank.tagged_sents()[:3000]
tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)

'''------------------- Chunking with POS Tagging ----------------------------'''
chunker = RegexpParser(r'''
NP: {<DT>?<NN.*><VB.*><DT.*>?<NN.*>}
    {<DT>?<NN.*><IN><DT><NN.*>}
    {<NN.*><VB.*><NN.*>}
''')

chunker2 = RegexpParser(r'''
Phrase: {<JJ.*><NN.*>}
        {<RB><JJ>^<NN.*>}
        {<JJ><JJ>^<NN.*>}
        {<NN.*><JJ>^<NN.*>}
        {<RB.*><VB.*>}
''')

chunkerPOS = RegexpParser(r'''
def makeParser(grammar=r"""
        NP: {<JJ.*>?<NN.*>+}
        """):
    return RegexpParser(grammar)
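# Usage sketch: build a chunker with the default noun-phrase grammar and parse
# one POS-tagged sentence.
from nltk import pos_tag, word_tokenize

parser = makeParser()
print(parser.parse(pos_tag(word_tokenize("Shallow parsing extracts simple noun phrases."))))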