def extract_candidate_chunks(self, text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    """ Extract candidate chunks from the given text that match the grammar pattern """
    import itertools, nltk, string
    print('Inside extract_candidate_chunks ...')
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions, creating a
    # chunk.RegexpParser with 1 stage:
    #     RegexpChunkParser with 1 rule:
    #         <ChunkRule: '(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+'>
    # see parsing vs chunking @ http://nltk.sourceforge.net/doc/en/ch06.html
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    # tag each word with its most probable POS
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    # convert the chunk trees to IOB tags: each token gets one of three special
    # chunk tags, I (inside), O (outside), or B (begin)
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # 1. all_chunks holds (word, pos, chunk) triples; drop tokens whose IOB tag is O (outside)
    # 2. group consecutive B and I tokens into one chunk
    # 3. lowercase each chunk and join its B and I tokens
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    # return the chunks after removing stop words and pure punctuation
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
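# Hedged usage sketch for the chunk extractor above: a standalone condensation
# of the same pipeline on a made-up sentence (assumes the NLTK 'punkt',
# 'averaged_perceptron_tagger', and 'stopwords' data packages are installed).
import itertools, string
import nltk

sample = "Compatibility of systems of linear constraints over the set of natural numbers."
grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
chunker = nltk.chunk.regexp.RegexpParser(grammar)
tagged = nltk.pos_tag_sents(nltk.word_tokenize(s) for s in nltk.sent_tokenize(sample))
iob = list(itertools.chain.from_iterable(
    nltk.chunk.tree2conlltags(chunker.parse(ts)) for ts in tagged))
phrases = [' '.join(w for w, p, c in g).lower()
           for k, g in itertools.groupby(iob, lambda t: t[2] != 'O') if k]
print(phrases)  # e.g. ['compatibility of systems', 'linear constraints', ...]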
def extract_candidate_words(self, text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    """ Collect all the words that are eligible to be keywords """
    import itertools, nltk, string
    print('Inside extract_candidate_words')
    # exclude candidates that are stop words or entirely punctuation
    # 1. build the set of punctuation characters
    # 2. build the set of English stop words
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    # 1. split the text into sentences
    # 2. split each sentence into word tokens
    # 3. attach a POS tag to each word
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    # 1. iterate over the (word, tag) pairs and lowercase each word
    # 2. keep only words whose tag is in good_tags and that are not stop words
    # 3. also drop words made up entirely of punctuation
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    return candidates
def pipeline_pos(titles, descriptions, tags):
    def preprocess(inpt):
        return inpt

    # Create feature vectors of context and only keep images WITH context
    bar = Bar('Extracting features...', max=len(titles))
    pos_collection = []
    for i in range(len(titles)):
        # Collect the context: the title ...
        context = []
        title = preprocess(titles[i].split(' '))
        if title:
            context.append(title)
        # ... the description (one entry per sentence) ...
        for desc in sent_tokenize(descriptions[i]):
            desc = preprocess(desc.split(' '))
            if desc:
                context.append(desc)
        # ... and the tags
        ts = preprocess(tags[i])
        if ts:
            context.append(ts)
        pos = nltk.pos_tag_sents(context)
        pos = list(itertools.chain(*pos))
        pos_collection.append(pos)
        bar.next()
    bar.finish()
    return pos_collection
def readGenreBasedFilesAndTagWords(genre_to_file_list, meta_dict, tagger=None):
    for genre in genre_to_file_list:
        meta_dict_for_genre = meta_dict[genre]
        print('--------------------------------------------------------------')
        print('Number of Files in genre', genre, ':', len(meta_dict_for_genre))
        for genre_file_path, genre_file_name in genre_to_file_list[genre]:
            if genre_file_name not in meta_dict_for_genre:
                continue
            pos_tag_dict = dict()
            with open(genre_file_path) as f:
                filelines = f.readlines()
            tokens = [[word for word in line.split()] for line in filelines]
            pos_tagged_lines = []
            if tagger is not None:
                pos_tagged_lines = tagger.tag_sents(tokens)
            else:
                pos_tagged_lines = nltk.pos_tag_sents(tokens)
            for pos_tags in pos_tagged_lines:
                for word, tag in pos_tags:
                    if tag not in pos_tag_dict:
                        pos_tag_dict[tag] = 0.0
                    pos_tag_dict[tag] += 1.0
            # normalize the tag counts into relative frequencies
            total_tags = sum(pos_tag_dict.values())
            pos_tag_dict = {key: (pos_tag_dict[key] / total_tags) for key in pos_tag_dict}
            meta_dict_for_genre_file = meta_dict_for_genre[genre_file_name]
            meta_dict_for_genre_file[TAGS] = pos_tag_dict
        print('Genre', genre, 'Done')
        print('--------------------------------------------------------------')
def process(self):
    """
    process()
    Splits the assigned text into a list of words, normalizes them and creates a frequency distribution
    If no text is assigned - throws SourceNotAssigned exception
    Stores inside self.freq_dist a list of dicts: [{word, word_pos, word_freq}]
    word_pos is of database format (a, n, adv, v)
    Returns nothing
    """
    if not self._text:
        raise SourceNotAssigned
    pos_dict = {"J": [], "N": [], "R": [], "V": []}
    t = [nltk.word_tokenize(sent) for sent in self._text]
    words = [tt for sent in nltk.pos_tag_sents(t) for tt in sent if tt[1] in ACCEPTABLE_POS]
    for word in words:
        pos_dict[word[1][0]].append(normalize(word))
    del t, words
    self.freq_dist = []
    for key in pos_dict:
        for word, freq in nltk.FreqDist(pos_dict[key]).most_common():
            self.freq_dist.append(dict(word=word, word_pos=tag_to_pos[key], word_freq=freq))
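# For context, hedged stand-ins for the module-level names this method leans
# on; the docstring implies tag_to_pos maps Penn-tag initials to the database
# codes (a, n, adv, v). The exact originals live elsewhere in the project.
ACCEPTABLE_POS = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'VB', 'VBD', 'VBG', 'VBZ'}  # assumed
tag_to_pos = {'J': 'a', 'N': 'n', 'R': 'adv', 'V': 'v'}  # assumed from the docstring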
def extract_candidate_words(self, text):
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in self.good_tags and word.lower() not in self.stop_words
                  and not all(char in self.punctuations for char in word)]
    return candidates
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    return candidates
def extractTerms(self, doc):
    # doc = lambda doc: doc.decode('utf8', 'ignore')
    doc = doc.decode('utf-8')  # assumes `doc` arrives as UTF-8 encoded bytes
    sents = nltk.sent_tokenize(doc)
    words = (nltk.word_tokenize(sent) for sent in sents)
    tagged_sents = nltk.pos_tag_sents(words)
    chunker = nltk.chunk.regexp.RegexpParser(self.grammer)
    chunked_sents = (chunker.parse(tagged_sent) for tagged_sent in tagged_sents)
    conll_tags = (nltk.chunk.tree2conlltags(chunked_sent) for chunked_sent in chunked_sents)
    all_chunks = list(itertools.chain.from_iterable(conll_tags))
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    return candidates
def Extract_Candidate_Chunks(Text, Grammar):
    ## Defining a Chunker based on the Grammar we defined above.
    Chunker = nltk.chunk.regexp.RegexpParser(Grammar)
    ## Assigning POS Tags.
    Tagged_Sentences = nltk.pos_tag_sents(
        nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(Text))
    ## Gathering the Chunks made by the Chunker from the Tree created.
    Chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(Chunker.parse(Tagged_Sentence))
        for Tagged_Sentence in Tagged_Sentences))
    ## We will now join the chunk words into a single Chunked Phrase.
    Candidates = [' '.join(Word for Word, Pos, Chunk in Group).lower()
                  for Key, Group in itertools.groupby(Chunks, lambda Triple: Triple[2] != 'O')
                  if Key]
    return Candidates
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    # exclude candidates that are entirely punctuation
    punct = set(string.punctuation)
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    return [cand for cand in candidates if not all(char in punct for char in cand)]
def __extract_candidate_words(self, text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [self.__trim(word) for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in self.__stop_words
                  and not all(char in punct for char in word)]
    return candidates
def extract_candidate_words(text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS',
                                                 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])):
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    # print(candidates)
    return candidates
def char_recognition(self, char_number=20):
    tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
    self.entities = []
    entity_names = []
    if nltk.__version__[0] == '3':
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names3(tree))
    else:
        # nltk.batch_ne_chunk is the NLTK 2.x name for ne_chunk_sents
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=False)
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
    count = Counter([name for name in entity_names])
    for c in count.most_common(char_number):
        self.entities.append(c[0])
def batch_tag_sentences(message_dict):
    """
    Uses a more efficient way of tagging all sentences for a given
    message at once.
    """
    num_sentences = [len(page['sentences']) for page in message_dict['urls']]
    all_sentences = [word_tokenize(s['s_clean'])
                     for page in message_dict['urls']
                     for s in page['sentences']]
    all_tags = pos_tag_sents(all_sentences)
    for page_index, slice_length in enumerate(num_sentences):
        slice_start = sum(num_sentences[:page_index])
        slice_end = slice_start + slice_length
        for sentence_index, tags in enumerate(all_tags[slice_start:slice_end]):
            pos_tags = ['/'.join(b) for b in tags]
            message_dict['urls'][page_index]['sentences'][sentence_index]['pos_tags'] = ' '.join(pos_tags)
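# A minimal sketch of the message_dict shape this function expects, with
# hypothetical values (assumes `from nltk import word_tokenize, pos_tag_sents`):
message_dict = {
    'urls': [
        {'sentences': [{'s_clean': 'The cat sat on the mat.'},
                       {'s_clean': 'It purred.'}]},
        {'sentences': [{'s_clean': 'Dogs bark.'}]},
    ]
}
batch_tag_sentences(message_dict)
print(message_dict['urls'][0]['sentences'][0]['pos_tags'])
# e.g. "The/DT cat/NN sat/VBD on/IN the/DT mat/NN ./."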
def start(raw_text, user_sentence, length=10, is_debug=False):
    """
    Entry point.
    :param raw_text: the source text as a string
    :param user_sentence: a starting sentence, string, provided by the user
    :param length: how many words should we generate
    :return:
    """
    user_sentence = user_sentence.lower()
    sentences = process_raw_text(raw_text)
    lexica = [token for sent in sentences for token in sent]
    # When we create the bigrams for our matrices we process the whole text as a single list,
    # without splitting it into sentences. It causes a bug - if the last word of the list is
    # unique, no bigrams will ever start with this word. There might be a couple of ways to
    # circumvent this, but in order to avoid any unforeseen issues in the future I simply add
    # the first word of the text to the very end of the text, making sure every word appears
    # both on the left side and on the right side of some bigram. This might slightly skew the
    # statistics, but I am slightly randomizing the statistics myself in order to get more
    # unique results, so this should not be an issue.
    if lexica.index(sentences[-1][-1]) == len(lexica) - 1:
        lexica.append(lexica[0])
        sentences[-1].append(lexica[0])
    # tagged_word_pairs = nltk.pos_tag(lexica)  # the shortcut - it's a bit worse, but only a bit
    tagged_word_pairs = [token for sent in nltk.pos_tag_sents(sentences) for token in sent]  # a list of ['word', 'POS']
    # processing user input
    tokenized_user_input = nltk.word_tokenize(user_sentence)
    user_input_pairs = nltk.pos_tag(tokenized_user_input)
    if tokenized_user_input[-1] not in lexica:
        return "Error! Please try a different word - the last word of your sentence is not present in the original text."
    try:
        number_of_words = int(length)
    except ValueError:
        return "Error! Please make sure to input a number!"
    probability_matrices = (generate_word_word_matrix(lexica, is_debug),
                            generate_word_pos_matrix(tagged_word_pairs, is_debug),
                            generate_pos_pos_matrix(tagged_word_pairs, is_debug))
    output = generate(user_input_pairs, probability_matrices, number_of_words)
    # output = generate([("i", "PRP")], probability_matrices, 42)
    return " ".join([pair[0] for pair in output])
def extract_grammar_phrases(corpus, phrase_grammar_pattern):
    # build phrase list based on grammar pattern
    all_phrases = []
    grammar_pattern = nltk.chunk.regexp.RegexpParser(phrase_grammar_pattern)
    sentences = normalize_document(corpus, esc_html=False, expand_cont=False, lemmatize=False,
                                   tokenize=False, remove_special_char=False, remove_stop_words=False)
    for sentence in sentences:
        # POS tag sentences
        tagged_sentences = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])
        # extract phrases based on pattern
        phrases = [grammar_pattern.parse(tagged_sentence) for tagged_sentence in tagged_sentences]
        # extract (word, POS tag, chunk tag) triples
        wtc_sentences = [nltk.chunk.tree2conlltags(phrase) for phrase in phrases]
        flattened_phrases = list(itertools.chain.from_iterable(wtc_sentence for wtc_sentence in wtc_sentences))
        # get valid phrases based on chunk tags
        valid_phrases_tagged = [(status, [wtc for wtc in chunk])
                                for status, chunk in itertools.groupby(flattened_phrases,
                                                                       lambda wtc: wtc[2] != 'O')]
        valid_phrases = [' '.join(word.lower() for word, tag, chunk in wtc_group
                                  if word.lower() not in stopword_list)
                         for status, wtc_group in valid_phrases_tagged if status]
        all_phrases.append(valid_phrases)
    return all_phrases
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
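# Hedged usage sketch (hypothetical input text; NLTK data packages required):
doc = "Keyphrase extraction selects salient noun phrases from a source document."
print(extract_candidate_chunks(doc))
# e.g. ['keyphrase extraction', 'salient noun phrases', 'source document']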
def extract_candidate_words(text, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    return candidates
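# Hedged usage sketch for the word-level extractor (hypothetical input):
print(extract_candidate_words("Deep learning models need large annotated datasets."))
# e.g. ['deep', 'learning', 'models', 'large', 'datasets']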
def extract_candidate_chunks(text):
    import nltk, itertools, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    grammar = "KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
    chunker = nltk.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    # ==== method 1: walk the chunk trees and collect KT subtrees ======
    candidates_with_POS = []
    candidates = []
    tree_chunked_sents = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
    for tree in tree_chunked_sents:
        for subtree in tree.subtrees():
            if subtree.label() == 'KT':
                candidates_with_POS.append(subtree.leaves())
    for cand in candidates_with_POS:
        NP = []
        for word, pos in cand:
            NP.append(word.lower())
        candidates.append(" ".join(NP))

    # ==== method 2: go through the IOB tags ======
    # IOB_tagged_chunked_sents = [nltk.tree2conlltags(tree_chunked_sent)
    #                             for tree_chunked_sent in tree_chunked_sents]
    # all_chunks = list(itertools.chain.from_iterable(IOB_tagged_chunked_sents))
    #
    # # get all the NP chunks and exclude all the non-NP chunks
    # groups = []
    # for key, group in itertools.groupby(all_chunks, lambda x: x[2] != 'O'):
    #     if key:
    #         groups.append(list(group))
    #
    # # get all the candidates except stop words and punctuation
    # candidates = [" ".join(word for word, pos, chunk in group).lower()
    #               for group in groups]

    candidates = [candidate for candidate in candidates
                  if candidate not in stop_words and not all(char in punct for char in candidate)]
    return candidates
def generate_keyword(texts, method='phrase', remove_punctuation=False):
    """
    Generate keyword candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()
    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging
    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use word or phrase")
    return candidates
def extract_chunks(text_string, max_words=3, lemmatize=True):
    """
    Extract noun phrases using a regex grammar
    """
    # Any number of adjectives followed by any number of nouns and (optionally)
    # again any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)
    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))
    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O')
                  if key]
    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))
    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(
        filter(lambda l: l not in stop_words and not all(c in punct for c in l),
               candidates))
    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]
    print(candidates)
    return candidates
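# Hedged usage sketch (hypothetical text). Note the lemmatizer receives whole
# multi-word phrases, which WordNetLemmatizer passes through unchanged, so
# lemmatization effectively only touches single-word candidates.
chunks = extract_chunks("Neural networks learn hierarchical representations of the data.")
# e.g. ['neural networks', 'hierarchical representations', 'data']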
def get_topic_tokens(data):
    stopwords = set(nltk.corpus.stopwords.words())
    tokens = nltk.word_tokenize(data)
    tokens = [t.lower() for t in tokens]
    # find the sentence boundaries (period tokens)
    stop_list = []
    for i in range(len(tokens)):
        if tokens[i] == '.':
            stop_list.append(i)
    # slice the token list into sentences
    sents_list = []
    for s in zip([0] + stop_list[:-1], stop_list):
        sents_list.append(tokens[s[0]:s[1]])
    tag_token_sents = nltk.pos_tag_sents(sents_list)
    # keep alphabetic singular nouns that are not stop words
    nn_list = []
    for s in tag_token_sents:
        nns = [t[0] for t in s if t[0].isalpha() and t[1] == 'NN']
        nn_list.extend([t for t in nns if t not in stopwords and len(t) > 3])
    print(nn_list[:30])
    return nn_list
def make_wordcloud(word_count):
    twitter = TweetTokenizer(strip_handles=True, reduce_len=True)
    # twitter = word_tokenize()
    sentences_tag = []
    # strip special characters with a regex, then tokenize each title and collect the tokens
    for sentence in title_list:
        wd = re.sub("[-=+,·#/\?:^$.@*\"※~&%ㆍ!’』\\‘|\(\)\[\]\<\>`\'…》]", '', sentence)
        morph = twitter.tokenize(wd)
        sentences_tag.append(morph)
        print(morph)
        print('-' * 30)
    print(sentences_tag)
    print('\n' * 3)
    noun_adj_list = []
    # keep only nouns and adjectives
    tagged = nltk.pos_tag_sents(sentences_tag)
    for sentence1 in tagged:
        for word, tag in sentence1:
            if tag in ['NN', 'NNP', 'JJ']:
                print(word, tag)
                noun_adj_list.append(word)
    # count each token
    counts = Counter(noun_adj_list)
    tags = counts.most_common(word_count)
    print(tags)
    # create the word cloud
    # set font_path so that Korean characters render correctly
    wc = WordCloud(font_path='/Library/Fonts/NanumGothic.ttf', background_color='white',
                   width=800, height=600)
    print(dict(tags))
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
def extract_candidate_chunks(text, grammar=r'NP: {<JJ>*<NN>}', delimiter='_', stemmer=None):
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    if stemmer:
        stem = stemmer.stem
    else:
        stem = lambda x: x
    candidates = []
    for key, group in itertools.groupby(
            all_chunks, lambda word_pos_chunk_triple: word_pos_chunk_triple[2] != 'O'):
        if key:
            words = []
            for word, pos, chunk in group:
                try:
                    word = stem(word)
                except IndexError:
                    print("word unstemmable:", word)
                words.append(word)
            candidates.append(delimiter.join(words).lower())
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def get_instances(xml_file, key_file):
    def should_be_omitted(num_instances):
        return num_instances < 5

    tree = ET.parse(xml_file)
    xml_instances = tree.getroot().findall('.//instance')
    ids = [x.attrib['id'] for x in xml_instances]
    if should_be_omitted(len(ids)):
        return []
    heads = [x.find('context').text or '' for x in xml_instances]
    # the part of the context behind the <head> doesn't get included in head,
    # so we use the tail of the head to obtain it
    tails = [x.find('.//head').tail or '' for x in xml_instances]
    full_context = [(head + tail).split() for head, tail in zip(heads, tails)]
    pos_tags_of_full_context = nltk.pos_tag_sents(full_context)
    with open(key_file) as labels_file:
        labels = [line.split(' ')[2] for line in labels_file]
    # labels is now the sense in wordnet, in the senseval format,
    # but we should convert it into a numbered format, based on a key file (use SENSE_TO_INDEX)
    csv_instances = []
    for number, head, tail, label, pos_tags in zip(ids, heads, tails, labels, pos_tags_of_full_context):
        head_pos_tags = pos_tags[:len(head.split())]
        head_pos_tags = [x[1] for x in head_pos_tags]
        assert len(head_pos_tags) == len(head.split())
        tail_pos_tags = pos_tags[len(head.split()):]
        tail_pos_tags = [x[1] for x in tail_pos_tags]
        assert len(tail_pos_tags) == len(tail.split())
        instance = Instance(number, head, tail, head_pos_tags, tail_pos_tags, label)
        csv_instances.append(instance)
    return csv_instances
def pos_counts(text, pos_list):
    """Return the number of words in the text for each part of speech in pos_list

    >>> emma = nltk.corpus.gutenberg.raw('austen-emma.txt')
    >>> pos_counts(emma, ['DET', 'NOUN'])
    [14352, 32029]
    """
    temp = []
    answer = []  # lists for the collected tags and the final answer
    pos = [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]  # tokenize the text
    tagged = nltk.pos_tag_sents(pos, tagset="universal")  # tag each word
    for tag in tagged:
        for word in tag:
            temp.append(word[1])  # keep only the tag part
    posCounter = Counter(temp)
    for p in pos_list:
        answer.append(posCounter[p])  # count the occurrences of each requested POS
    return answer
def extract_candidate_chunks(self, text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    nltk.data.path.append('/home/guanhua/sunhongyu/iGitRepo/project/other/nltk_data')
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    tagged_sents = nltk.pos_tag_sents(sents, lang='eng')
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(
                      all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O')
                  if key]
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def extract_candidate_chunks(text, grammar=r'KT: {<NNP>+?}'):
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda x: x[2] != 'O')
                  if key]
    return candidates
def noun_phrases(text, *args, **kwargs):
    sentences = (nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    tagged_sentences = nltk.pos_tag_sents(sentences)
    # chunking based on https://www.nltk.org/book/ch07.html
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>*<JJ.*>*}"
    cp = nltk.RegexpParser(grammar)
    NPs = []  # noun phrases as Tree objects
    flat_noun_phrases = []  # noun phrases as plain strings
    for sentence_tree in cp.parse_sents(tagged_sentences):
        for subtree in sentence_tree.subtrees():
            if subtree.label() == "NP":
                NPs.append(subtree)
                np = ' '.join(w[0] for w in subtree.leaves())
                flat_noun_phrases.append(np)
                yield np
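# The function is a generator, so consume it lazily or via list()
# (hypothetical input text):
for np in noun_phrases("The quick brown fox jumps over the lazy dog."):
    print(np)  # e.g. 'The quick brown fox', then 'the lazy dog'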
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()
    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging
    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
def extract_candidate_chunks(text, typ='title'):
    """
    extract candidate chunks from the given text
    :param text: string: a single text
    :return: candidates: list, containing a series of candidate chunks
    """
    import nltk, itertools, string
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    if typ == 'title':
        grammar = "KT: {<JJ>* <NN.*>+}"
    else:
        grammar = "KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
    chunker = nltk.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    # ==== method 1: walk the chunk trees and collect KT subtrees ======
    candidates_with_POS = []
    candidates = []
    tree_chunked_sents = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
    for tree in tree_chunked_sents:
        for subtree in tree.subtrees():
            if subtree.label() == 'KT':
                candidates_with_POS.append(subtree.leaves())
    for cand in candidates_with_POS:
        NP = []
        for word, pos in cand:
            NP.append(word.lower())
        candidates.append(" ".join(NP))
    candidates = [candidate for candidate in candidates
                  if candidate not in stop_words and not all(char in punct for char in candidate)]
    return candidates
def extract_candidate_chunks(text, chunker, stop_words, punctuation):
    # tokenize, POS-tag, and chunk using regular expressions
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = [nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                  for tagged_sent in tagged_sents]
    all_chunks = list(itertools.chain.from_iterable(all_chunks))
    # join constituent chunk words into a single chunked phrase
    candidates = ['-'.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, group_func)
                  if key]
    return set([cand for cand in candidates
                if cand not in stop_words
                and not all(char in punctuation for char in cand)
                and 2 < len(cand) < 15])
def get_most_useful_pos_bigrams(count):
    """
    Computes the most frequently occurring POS tag bigrams.
    """
    filename = f'statistics/most_useful_pos_bigrams_{count}.json'
    if os.path.exists(filename):
        with open(filename) as _file:
            return [tuple(bigram) for bigram in json.load(_file)]
    bigram_counts_file = 'statistics/pos_bigram_counts.json'
    if os.path.exists(bigram_counts_file):
        with open(bigram_counts_file) as _file:
            counter = Counter({tuple(key): value for key, value in json.load(_file)})
    else:
        counter = Counter()
        for subreddit in tqdm(SUBREDDITS):
            with open(f'subreddits/{subreddit}.json') as _file:
                posts = json.load(_file)
            for post in tqdm(posts, desc=f'/r/{subreddit}'.ljust(20, ' ')):
                # pos_tag_sents expects tokenized sentences, so tokenize
                # each sentence before tagging
                tag_sents = pos_tag_sents(
                    word_tokenize(sent) for sent in sent_tokenize(post['content']))
                for sent in tag_sents:
                    for (token_a, pos_a), (token_b, pos_b) in zip(sent, sent[1:]):
                        counter[pos_a, pos_b] += 1
        with open(bigram_counts_file, 'w') as _file:
            json.dump(list(counter.items()), _file)
    results = [bigram for bigram, amount in counter.most_common(count)]
    results.sort()
    with open(filename, 'w') as _file:
        json.dump(results, _file)
    return results
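# Minimal sketch of the bigram-counting step in isolation (hypothetical text):
from collections import Counter
from nltk import pos_tag_sents, sent_tokenize, word_tokenize

counter = Counter()
tagged = pos_tag_sents(word_tokenize(s) for s in
                       sent_tokenize("Time flies. Fruit flies like a banana."))
for sent in tagged:
    for (_, pos_a), (_, pos_b) in zip(sent, sent[1:]):
        counter[pos_a, pos_b] += 1
print(counter.most_common(3))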
def predict():
    result = request.form
    with open('tokens.pkl', 'rb') as f:
        tokens = load(f)
    with open('tags.pkl', 'rb') as f:
        tags = load(f)
    with open('sent_tokens.pkl', 'rb') as f:
        sent_tokens = load(f)
    text = str(result['texto'])
    text = re.sub(r'\.+', ".", text).split('.')
    text = [re.sub(r'[^\w\s]', '', x).strip() for x in text]
    text = [x.strip() for x in text if x.strip()]
    new_data = DataFrame(text, columns=['Sentença'])
    new_data['tag'] = pos_tag_sents(new_data['Sentença'].apply(word_tokenize).tolist(), lang='pt')
    counts = []
    for k in range(len(new_data)):
        counts.append(Counter([j for i, j in new_data['tag'][k]]))
    dmm = DataFrame(counts).fillna(0)
    for i in range(len(tags)):
        if tags[i] not in dmm:
            dmm[tags[i]] = 0
    vec = CountVectorizer(vocabulary=tokens)
    dtm = DataFrame(vec.fit_transform(text).toarray(), columns=vec.get_feature_names())
    sent = dtm.loc[:, sent_tokens].apply(sum, axis=1)
    with open('classifier.pkl', 'rb') as f:
        classifier = load(f)
    prediction = classifier.predict(concat([dtm, dmm, sent], axis=1))
    proportion = 100 * sum(prediction == 'F') / len(prediction)
    new_data['classe'] = prediction
    return render_template('result.html', prediction=proportion, table=new_data.to_html())
def extract_candidate_chunks(self):
    grammar = r'KT: { (<NN.*>+ <JJ.*>?)|(<JJ.*>? <NN.*>+)}'
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(self.text))
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(
                      all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O')
                  if key]
    x = [cand for cand in candidates
         if cand not in stop_words and not all(char in punct for char in cand)]
    # keep single-word candidates only if they are alphanumeric and longer than
    # two characters; rebuild multi-word candidates from their alphanumeric parts
    data = []
    for i in range(0, len(x), 1):
        if len(x[i].split()) == 1:
            if re.match("^[A-Za-z0-9]*$", x[i]):
                if len(x[i]) > 2:
                    data.append(x[i])
        else:
            add = ""
            split = x[i].split()
            lenth = len(split)
            for i in range(0, lenth, 1):
                king = re.match("^[A-Za-z0-9]*$", split[i])
                if len(str(king)) > 2:
                    add = add + " " + split[i]
            data.append(add.strip())
    return data
def ner_recognize_string(string):
    sentences = nltk.sent_tokenize(string)
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sentence))
    tagged_sentences = nltk.pos_tag_sents(tokenized_sentences)
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)
    entities = set()
    for tree in chunked_sentences:
        for x in tree:
            if type(x) is tuple and "NNP" in x[1]:
                # print(x[0])
                entities.add(x[0])
            elif type(x) is nltk.tree.Tree:
                # print(string_from_tree(x))
                entities.add(string_from_tree(x))
    return entities
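# Hedged usage sketch (hypothetical input; requires NLTK's 'maxent_ne_chunker'
# and 'words' data, plus the external string_from_tree helper):
print(ner_recognize_string("Barack Obama visited Paris with Angela Merkel."))
# e.g. {'Barack Obama', 'Paris', 'Angela Merkel'}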
def getWordPattern(text):
    import itertools, nltk, string
    import pandas as pd
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punct = set(string.punctuation)
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # candidates = [[tag, word.lower()] for word, tag in tagged_words
    #               if not all(char in punct for char in word)]
    candidates = [[tag, word.lower()] for word, tag in tagged_words
                  if not all(char in punct for char in word)
                  and word.lower() not in stop_words]
    res = pd.DataFrame.from_records(candidates, columns=['tag', 'word'])
    return res
def extract_candidate_words(text, minlen=3, good_tags=set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string, re
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    clean_candidates = []
    for candidate in candidates:
        if re.search(r'[a-zA-Z0-9_]*\-*[a-zA-Z0-9_]*', candidate).group() == '':
            continue  # remove wonky candidates
        if len(candidate) >= minlen:
            clean_candidates.append(candidate)
    return clean_candidates
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    # ------ exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # print(punct)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # print(stop_words)
    sents = nltk.sent_tokenize(text)
    # print(sents)
    words = (nltk.word_tokenize(sent) for sent in sents)
    # print(words)
    tagged_sents = nltk.pos_tag_sents(words)
    # print(tagged_sents)
    # ------ tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    chunked_sents = (chunker.parse(tagged_sent) for tagged_sent in tagged_sents)
    # for chunked_sent in chunked_sents: print(chunked_sent)
    conll_tags = (nltk.chunk.tree2conlltags(chunked_sent) for chunked_sent in chunked_sents)
    # for conll_tag in conll_tags: print(conll_tag)
    all_chunks = list(itertools.chain.from_iterable(conll_tags))
    print(all_chunks)
    # ------ join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]
    return candidates
def vectorize(self, path_to_json):
    file = open(path_to_json, "r")
    text = json.load(file)["text"]
    tokenized_sentences = nltk.sent_tokenize(text)
    tokenized_words = [nltk.word_tokenize(sent) for sent in tokenized_sentences]
    pos_tagged_text = nltk.pos_tag_sents(tokenized_words)
    words = self.word_getter.get(text)
    tags = {}
    for sentence in pos_tagged_text:
        for tagged_word in sentence:
            if tagged_word[0] not in tags.keys():
                tags[tagged_word[0]] = []
            tags[tagged_word[0]].append(tagged_word[1])
    result = {}
    for word in words:
        result[word] = [0, 0, 0, 0, 0]  # noun, verb, foreign word, adverb (RB), cardinal digit
    for word, tag_list in tags.items():
        if word not in result.keys():
            continue
        for tag in tag_list:
            if tag.startswith("N"):
                result[word][0] = 1
            elif tag.startswith("V"):
                result[word][1] = 1
            elif tag == "FW":
                result[word][2] = 1
            elif tag.startswith("RB"):
                result[word][3] = 1
            elif tag == "CD":
                result[word][4] = 1
    return result
def select_candidates(text):
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
    stop_words = set(nltk.corpus.stopwords.words('english'))
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(
        nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(
        itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in tagged_sents))
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(
                      all_chunks, lambda wordposchunk: wordposchunk[2] != 'O')
                  if key]
    result = [cand for cand in candidates
              if cand not in stop_words and len(cand.split()) < 4]
    # result = list(dict.fromkeys(result))
    # if len(result) < 10:
    #     grammar = r'KT: {<DT>? <JJ>* (<NN>|<NP>|<PRN>)+}'
    #     chunker = nltk.chunk.regexp.RegexpParser(grammar)
    #     all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
    #                                                     for tagged_sent in tagged_sents))
    #     candidates = [' '.join(word for word, pos, chunk in group).lower()
    #                   for key, group in itertools.groupby(all_chunks, lambda wordposchunk: wordposchunk[2] != 'O')
    #                   if key]
    #     result = [cand for cand in candidates
    #               if cand not in stop_words and len(cand.split()) < 4
    #               and not all(char in punct for char in cand)]
    return list(dict.fromkeys(result))
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only extracts
    ngrams automatically; to use a different format or a custom vocabulary,
    the vocabulary must be built explicitly.
    '''
    # grammar to extract the noun phrases
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'
    # set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        # helper to unpack the (word, pos, chunk) tuple
        return lambda args: f(*args)

    # tokenize and POS-tag per sentence, then get the IOB tags
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in postag_sents))
    # join B-NP and I-NP tags into one noun phrase, excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key]
    # filter out terms shorter than three characters or made of punctuation
    all_nounphrases = [cand for cand in merged_nounphrase
                       if len(cand) > 2 and not all(char in punct for char in cand)]
    # select distinct noun phrases
    vocabulary = list(set(all_nounphrases))
    return vocabulary
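# Hedged usage sketch (hypothetical corpus). The function references `stemmer`
# as a module-level global, so something like a PorterStemmer must be defined:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
vocab = create_phrase_vocabulary(["Deep neural networks need big data sets."])
print(vocab)  # e.g. ['deep neural network', 'big data set']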
def preprocess(text):
    """
    Tag an English text with POS

    Args:
        text (str): The target text.

    Returns:
        pos_sents (list): A list of lists of tuples, containing each sentence
            with word-pos pairs.

    Examples:
        >>> text = "I am a loser. I don't have girlfriend."
        >>> preprocess(text)
        [
            [('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('loser', 'NN'), ('.', '.')],
            [('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('have', 'VB'), ('girlfriend', 'NN'), ('.', '.')]
        ]
    """
    sentences = nltk.sent_tokenize(text)
    seg_sents = [nltk.word_tokenize(sent) for sent in sentences]
    pos_sents = nltk.pos_tag_sents(seg_sents)
    return pos_sents
def pos_counts(text, pos_list):
    """Return the number of words in the text for each part of speech in pos_list

    >>> emma = nltk.corpus.gutenberg.raw('austen-emma.txt')
    >>> pos_counts(emma, ['DET', 'NOUN'])
    [14352, 32029]
    """
    result = []
    sentTokenized_text = nltk.sent_tokenize(text)
    tokenized_text = [nltk.word_tokenize(s) for s in sentTokenized_text]
    token_list = nltk.pos_tag_sents(tokenized_text, tagset="universal")
    pos_list_fd = nltk.FreqDist(
        [tag for tags in token_list for (word, tag) in tags])
    for word in pos_list:
        value = pos_list_fd[word]
        result.append(value)
    return result
def sentence_postag(reviewSentence):
    """
    POS-tag each sentence
    tokenize: nltk.word_tokenize, then split further on '.', '/', '-'
    """
    re_split = re.compile('[/.-]')
    tokenize = [nltk.word_tokenize(sent) for sent in reviewSentence]
    tokenize2 = []
    for sent in tokenize:
        sent_token = []
        for word in sent:
            if bool(re_split.search(word)):  # split words containing '/', '.', or '-'
                token = re_split.split(word)
                sent_token.extend(token)
            else:
                sent_token.append(word)
        sent_token = [word for word in sent_token if len(word) > 0]  # drop empty strings
        tokenize2.append(sent_token)
    tagged = nltk.pos_tag_sents(tokenize2)
    return tagged
def analize_text(text: str, *, exact_words: bool = False) -> tuple:
    sentences = array(split_into_sentences(text, True))
    if not len(sentences):
        print("Nothing found")
        return []
    tags = pos_tag_sents(map(word_tokenize, sentences))
    if not exact_words:
        lemmatized = lemmatize_sents(tags)
    else:
        lemmatized = tags
    chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                           "AN: {(<NPP>+<DT|NPP|JJ>*)+}\n "
                           "}<DT>+{\n "
                           "PH: {<[B-Z]+>+}\n "
                           "}<DT|CC|PRP|EX|WDT>+{")
    chunked = list(chunker.parse_sents(lemmatized))
    return (*setup_search_structure(chunked, tuple), sentences)
def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    for sentence in sentences:
        tagged_sents = nltk.pos_tag_sents([nltk.word_tokenize(sentence)])
        chunks = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
        wtc_sents = [nltk.chunk.tree2conlltags(chunk) for chunk in chunks]
        flattened_chunks = list(
            itertools.chain.from_iterable(wtc_sent for wtc_sent in wtc_sents))
        valid_chunks_tagged = [(status, [wtc for wtc in chunk])
                               for status, chunk in itertools.groupby(flattened_chunks,
                                                                      lambda wtc: wtc[2] != 'O')]
        valid_chunks = [' '.join(word.lower() for word, tag, chunk in wtc_group
                                 if word.lower() not in stopword_list)
                        for status, wtc_group in valid_chunks_tagged if status]
        all_chunks.append(valid_chunks)
    return all_chunks
def pos_tag(docs):
    """
    Args:
        docs ([[[str]]]): documents as lists of tokenized sentences
    """
    return map(lambda doc: nltk.pos_tag_sents(doc), docs)
# get unique/total ratio
ratios = [(float(len(set(words))) / float(len(words))) for words in tokens]
plt.scatter(years, ratios)
plt.show()

# Collocations
lower = [[word.lower() for word in words] for words in tokens]
bigram_measures = nltk.collocations.BigramAssocMeasures()
for i in range(len(years)):
    finder = BigramCollocationFinder.from_words(lower[i])
    finder.apply_freq_filter(2)
    print(years[i], finder.nbest(bigram_measures.pmi, 10))

# chunk text and extract entities
postags = [nltk.pos_tag_sents(entry) for entry in senttokens]
ne_tags = [nltk.ne_chunk_sents(pos, binary=True) for pos in postags]
ents = [extract_entities(tagged) for tagged in ne_tags]
entFreqs = [nltk.FreqDist(entry) for entry in ents]

# get freq dist of all entities
allentities = [item for sublist in ents for item in sublist]
allentfreq = nltk.FreqDist(allentities)

# make a list of the top 50 most frequent entities and prune individual docs
# to take out the filtered words
filtered, freq = zip(*allentfreq.most_common(50))
pruned = []
for entFreq in entFreqs:
    ents, freqs = zip(*entFreq.most_common(100))
    topEnts = [x for x in ents if x not in filtered]
    pruned.append(topEnts)
def tag_many(self, documents, tagset=None, **kwargs):
    """ POS-Tag many documents. """
    return pos_tag_sents((word_tokenize(d) for d in documents), tagset)
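# Equivalent standalone call, for reference (hypothetical documents; assumes
# `from nltk import pos_tag_sents, word_tokenize` as in the method above):
docs = ["The cat sat.", "Dogs bark loudly."]
print(pos_tag_sents(word_tokenize(d) for d in docs))
# e.g. [[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('.', '.')], ...]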
import itertools
import nltk
import operator
import pandas as pd
import re

import functions as f

df = f.load_data()

## tokenize each ingredient line of each recipe
sents = df['ingredient_txt'].map(lambda x: [nltk.word_tokenize(line) for line in x.split('\n')])

## remove first and last elements, which are empty lists.
sents = [x[1:-1] for x in sents]

tagged = [nltk.pos_tag_sents(x) for x in sents[0:1000]]

## trying named entity recognition.
nltk.ne_chunk(tagged[0])

## trying hand-coded identification of ingredients.
def seq(pos, x):
    return [t[pos] for t in x]

## split tokens and tags into separate lists.
tok_seq = [[seq(0, xx) for xx in x] for x in tagged]
tag_seq = [[seq(1, xx) for xx in x] for x in tagged]

## create mapping between flattened list of ingredients and recipe ids.
idx = {}
# Each document will also disregard anything said by the interviewer. The logic
# is that the interviewer's purpose is nothing more than to tease information out
# of the interview subject and thus would be repetitive or not data rich.
intervieweeTranscriptDict = dict()
with open(data_path) as data_file:
    csv_reader = csv.reader(data_file, delimiter=',', quotechar='"')
    for row in csv_reader:
        if row[1] == 'Interviewee':
            if row[0] in intervieweeTranscriptDict:
                intervieweeTranscriptDict[row[0]].extend(
                    [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(row[2])])
            else:
                intervieweeTranscriptDict[row[0]] = [
                    nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(row[2])]

intervieweeTranscriptDict = {interviewee: nltk.pos_tag_sents(doc)
                             for interviewee, doc in intervieweeTranscriptDict.items()}

# keep only content words (nouns, verbs, adjectives, adverbs) that are neither
# stop words nor punctuation
transcriptList = list()
for doc in intervieweeTranscriptDict.values():
    bufferDocList = list()
    stopwords = nltk.corpus.stopwords.words('english')
    for sentence in doc:
        for word, pos in sentence:
            posTest = nltk.tag.map_tag('en-ptb', 'universal', pos)
            if ((str.lower(word) not in stopwords and word not in set(string.punctuation))
                    and (posTest == 'NOUN' or posTest == 'VERB'
                         or posTest == 'ADJ' or posTest == 'ADV')):
                bufferDocList.append(str.lower(word))
    transcriptList.append(bufferDocList)
# *************END PREPROCESSING
def get_corpus_pos(dic):
    return {interviewee: nltk.pos_tag_sents(doc) for interviewee, doc in dic.items()}
def getPOSTags(self, article):
    articleSents = list(filter(bool, [line.lower().replace("<s>", "").replace("</s>", "").strip().split()
                                      for line in article.split("\n")]))
    postags = nltk.pos_tag_sents(articleSents)
    return postags