def missingCorpus(corpusdir):
    try:
        os.makedirs(corpusdir)
    except OSError:
        if not os.path.isdir(corpusdir):
            raise
    try:
        os.makedirs(corpusdir + '/ratings')
    except OSError:
        if not os.path.isdir(corpusdir + '/ratings'):
            raise
    hotel = json.load(open(data_path + file))
    stopset = hotelNameAddress(hotel)
    stopgroup = " ".join(stopset)
    with open(corpusdir + '/stopset.txt', 'w') as fout:
        fout.write(stopgroup)
    revNum = 0
    for review in hotel.get('Reviews'):
        revNum += 1
        contentOut = ""
        overall = review.get('Ratings').get('Overall')
        content = pos_tag_sents([word_tokenize(sentence)
                                 for sentence in sent_tokenize(review.get('Content'))])
        with open(corpusdir + '/ratings/OverallRating' + str(revNum) + '.txt', 'w') as fout:
            fout.write(overall)
        with codecs.open(corpusdir + '/Review' + str(revNum) + '.txt', 'w', encoding="utf-8") as fout:
            for sentence in content:
                for word, pos in sentence:
                    contentOut += word + "/" + pos + " "
                contentOut += '\n'
            fout.write(contentOut)
def twitter_token():
    from nltk.corpus import twitter_samples
    from nltk.tag import pos_tag_sents
    tweets = twitter_samples.strings('positive_tweets.json')
    tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
    tweets_tagged = pos_tag_sents(tweets_tokens)
    # Tag meanings: JJ = adjective, NN = singular noun, NNS = plural noun
    JJ_count = 0
    NN_count = 0
    for tweet in tweets_tagged:
        for word, tag in tweet:
            if tag == 'JJ':
                JJ_count += 1
            elif tag == 'NN':
                NN_count += 1
    print('Total number of adjectives = ', JJ_count)
    print('Total number of nouns = ', NN_count)
def nouns_verbs_to_POSTAG(sample):
    '''
    Same principle as to_POSTAG, but only applies the transformation to nouns and verbs.

    :param sample: full text sample
    :type sample: str
    Output: str
    '''
    # Convert sample into the input form needed by pos_tag_sents: list(list(str)).
    # Needed: sentence-tokenize, then word-tokenize.
    sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
    # POS-tagging
    POS = pos_tag_sents(sents_list)
    # Extract the POS tags and join them into one str to replace the original sample.
    # POS is list(list((word, tag), (word, tag), ...))
    POS_sents = []
    for sent_list in POS:
        word_pos_seq = []
        for word, tag in sent_list:
            if tag.startswith('NN') or tag.startswith('VB'):
                word_pos_seq.append(tag)
            else:
                word_pos_seq.append(word)
        # This will be a string like 'NNP VBZ the NN .'
        word_pos_seq = ' '.join(word_pos_seq)
        POS_sents.append(word_pos_seq)
    POS_sample = ' '.join(POS_sents)
    return POS_sample
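# A minimal usage sketch for nouns_verbs_to_POSTAG; assumes the tokenizers
# and tagger it relies on are importable and their NLTK data is downloaded:
from nltk import word_tokenize, sent_tokenize
from nltk.tag import pos_tag_sents

print(nouns_verbs_to_POSTAG("The dog chased a ball."))
# Nouns and verbs are replaced, everything else is kept,
# e.g. 'The NN VBD a NN .'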
def main():
    sentlist, dictlist = extractinfo()
    print('Info Extracted..')
    postaglist = pos_tag_sents([sent.split() for sent in sentlist],
                               tagset='universal')
    print('POS Tagging Done')
    data = makedata(postaglist, dictlist)
    printdata(data)
    vocabwords = makewordvocab()
def __init__(self, path):
    self.sentences = []
    with codecs.open(path, 'r', 'utf-8') as f:
        sentences = [word_tokenize(s) for s in sent_tokenize(f.read())]
    tuples = pos_tag_sents(sentences)
    for sentence in tuples:
        self.sentences.append({
            "words": [u[0] for u in sentence],
            "POS": [u[1] for u in sentence]
        })
def __init__(self, path=None, input_text=None):
    self.sentences = []
    raw_text = input_text
    if path is not None:
        with codecs.open(path, 'r', 'utf-8') as f:
            raw_text = f.read()
    sentences = [word_tokenize(s) for s in sent_tokenize(raw_text)]
    tuples = pos_tag_sents(sentences)
    for sentence in tuples:
        self.sentences.append({
            "words": [u[0] for u in sentence],
            "POS": [u[1] for u in sentence]
        })
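# A self-contained usage sketch for the constructor above; the enclosing
# class is not shown in the excerpt, so 'TaggedText' is a hypothetical name
# wrapping the same __init__:
import codecs
from nltk import word_tokenize, sent_tokenize
from nltk.tag import pos_tag_sents

class TaggedText:
    def __init__(self, path=None, input_text=None):
        self.sentences = []
        raw_text = input_text
        if path is not None:
            with codecs.open(path, 'r', 'utf-8') as f:
                raw_text = f.read()
        for sentence in pos_tag_sents([word_tokenize(s) for s in sent_tokenize(raw_text)]):
            self.sentences.append({"words": [u[0] for u in sentence],
                                   "POS": [u[1] for u in sentence]})

doc = TaggedText(input_text="NLTK tags sentences. It is easy.")
print(doc.sentences[0]["words"])  # ['NLTK', 'tags', 'sentences', '.']
print(doc.sentences[0]["POS"])    # e.g. ['NNP', 'VBZ', 'NNS', '.']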
def posTagging(orgFile, targetFile):
    count_org, count = 0, 0
    english_punctuations = ['(', '[', '<', ')', ']', '>', ',', '.', ':', ';',
                            '?', '!', '@', '#', '%', '$', '*', ' ', '', '\n']
    try:
        f = open(orgFile, 'r')
        lines = f.readlines()
    # except Exception as e:
    #     print(orgFile, "file decoding error\n", traceback.format_exc())
    except UnicodeDecodeError:
        print("Error:", orgFile, "file decoding error")
        return 2, 1
    else:
        f.close()
    ff = open(targetFile, 'w')
    for line in lines:
        # count words in the source text
        line_list = line.split()
        words_list = [i for i in line_list if i not in english_punctuations]
        count_org += len(words_list)
        for w in words_list:
            # handle tokens like Prob(source=l)
            if '(' in w and w[0] != '(' and w[-1] != '(':
                count_org += 1
            elif ')' in w and w[0] != ')' and w[-1] != ')' and w[-1] not in english_punctuations[6:12]:  # e.g. (2016),
                count_org += 1
        line_str, pre_word = '', ''
        res = pos_tag_sents([word_tokenize(i) for i in sent_tokenize(line)])
        for sents in res:  # each sentence
            for word_tuple in sents:  # each token in the sentence
                if word_tuple[0] in english_punctuations:
                    word = ''.join([' ' for i in english_punctuations[0:3] if word_tuple[0] == i]) + word_tuple[0]
                else:
                    if pre_word in [' ' + j for j in english_punctuations[0:3]]:
                        word = "_".join(word_tuple)
                    else:
                        word = ' ' + "_".join(word_tuple)
                    count += 1
                pre_word = word
                line_str += word
        ff.write(line_str)
        ff.write('\n')
    ff.close()
    if count == 0:
        print(orgFile, "total word count of file is 0")
        return 2, 1
    print("Total words in source:", count_org, " Total words after tagging:", count)
    return count_org, count
def pos_tag(self, tokens):
    return pos_tag_sents(tokens)

# data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
# preprocessor = Preprocessor()
# print(preprocessor.proccess_text(data))
# dataset = [
#     'Today was a bad day',
#     'I love running in the park',
#     'I used to have a cat when I was a kid'
# ]
# print(preprocessor.process_dataset(dataset))
def lemmatization(tokenized_sents):
    # lemmatization, using POS tags mapped to WordNet POS classes
    lem = WordNetLemmatizer()
    sents_pos_tag = tag.pos_tag_sents(tokenized_sents, lang="eng")
    wn_sents_pos_tag = []
    for sent in sents_pos_tag:
        wn_sents_pos_tag.append(
            [(word, convert_nltk2wn_pos(pos)) for word, pos in sent])
    lem_sents = []
    for sent in wn_sents_pos_tag:
        s = []
        for word, pos in sent:
            if pos is None:
                s.append(word)
            else:
                s.append(lem.lemmatize(word, pos))
        lem_sents.append(s)
    return lem_sents
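# convert_nltk2wn_pos is not defined in the excerpt; a minimal sketch of the
# usual Penn-Treebank-to-WordNet mapping such a helper performs (an assumption,
# not necessarily the original implementation):
from nltk.corpus import wordnet

def convert_nltk2wn_pos(penn_tag):
    # WordNetLemmatizer only accepts 'n', 'v', 'a', 'r'; anything else -> None
    if penn_tag.startswith('NN'):
        return wordnet.NOUN
    if penn_tag.startswith('VB'):
        return wordnet.VERB
    if penn_tag.startswith('JJ'):
        return wordnet.ADJ
    if penn_tag.startswith('RB'):
        return wordnet.ADV
    return None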
def transform1(self, X):
    # tokenize each tweet into words
    X_tokenized1 = [self.feature_fxn(t) for t in X]
    # POS-label every tokenized tweet
    tweets_tagged = pos_tag_sents(X_tokenized1)
    X_tokenized = []
    for i in range(len(tweets_tagged)):  # i indexes the tweet
        X_tokenized.append([])
        for j in range(len(tweets_tagged[i])):
            X_tokenized[i].append(tweets_tagged[i][j][1])
    X = []
    for tweet in X_tokenized:
        X.append([self.feature_map[w] for w in tweet if w in self.feature_map])
    X = sequence.pad_sequences(X, maxlen=self.max_len, padding='post')
    return X
def noun_freq_abstraction(sample, model=None):
    '''
    Combination of the above noun abstraction methods and get_word_freq.
    '''
    # if using the default tagger (i.e. the NLTK tagger)
    if model is None:
        sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
        # POS-tagging
        POS = pos_tag_sents(sents_list)
        # Extract the POS tags and join them into one str to replace the original sample.
        # POS is list(list((word, tag), (word, tag), ...))
        POS_sents = []
        for sent_list in POS:
            word_pos_seq = []
            for word, tag in sent_list:
                if tag.startswith('NN'):
                    freq = str(get_word_freq(word))
                    info = tag + '_' + freq
                    word_pos_seq.append(info)
                else:
                    word_pos_seq.append(word)
            # This will be a string like 'NNP_5.3 sees the NN_4.8 .'
            word_pos_seq = ' '.join(word_pos_seq)
            POS_sents.append(word_pos_seq)
        POS_sample = ' '.join(POS_sents)
    # if using, e.g., spaCy
    else:
        doc = model(sample)
        tokens = []
        for token in doc:
            if token.tag_.startswith('N'):
                freq = str(get_word_freq(token.text))
                info = token.tag_ + '_' + freq
                tokens.append(info)
            else:
                tokens.append(token.text)
        POS_sample = ' '.join(tokens)
    return POS_sample
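# get_word_freq is likewise not defined in the excerpt. Given the 'NNP_5.3'
# example in the comment above, it plausibly returns a Zipf-scale frequency;
# one possible stand-in using the third-party wordfreq package (an assumption,
# not necessarily the original helper):
from wordfreq import zipf_frequency

def get_word_freq(word):
    # Zipf scale runs from ~0 (vanishingly rare) to ~8 (extremely common)
    return zipf_frequency(word, 'en')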
def to_POSTAG(sample):
    '''
    Turns each sample into a long string of POS tags, abstracting away from the actual words.

    :param sample: full text sample
    :type sample: str
    Output: str
    '''
    # Convert sample into the input form needed by pos_tag_sents: list(list(str)).
    # Needed: sentence-tokenize, then word-tokenize.
    sents_list = [word_tokenize(sent) for sent in sent_tokenize(sample)]
    # POS-tagging
    POS = pos_tag_sents(sents_list)
    # Extract the POS tags and join them into one str to replace the original sample.
    POS_sents = []
    for sent_list in POS:
        pos_seq = [tag for word, tag in sent_list]
        # This will be a string like 'NNP VBZ DT NNP .'
        pos_seq = ' '.join(pos_seq)
        POS_sents.append(pos_seq)
    POS_sample = ' '.join(POS_sents)
    return POS_sample
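# Quick usage sketch for to_POSTAG (same NLTK prerequisites as the
# sketches above):
print(to_POSTAG("The cat sat on the mat."))
# -> 'DT NN VBD IN DT NN .'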
def get_koss(fn_k, my_env, sysdic, stopdic):
    # For Japanese-English pairs, this handles the English side.
    # s_mecab_ko = get_s_mecab(fn_k, 'ko', my_env)
    # koss = get_mrphs(s_mecab_ko, 'ko')
    # For English, do not run get_mrphs first.
    infile = open(fn_k, 'r', encoding='UTF-8')
    koss = []
    lines = infile.readlines()
    sents = []
    for l in lines:
        t = word_tokenize(l.strip())
        sents.append(t)
    lemmatizer = WordNetLemmatizer()
    tags = pos_tag_sents(sents)
    for sent in tags:
        tmp = []
        for w_pair in sent:
            word = w_pair[0].lower()  # lowercase every word first
            # if w_pair[1] in ["NN", "NNS"]:
            #     word = word.capitalize()
            postag = penn_to_wn(w_pair[1])
            if word in stopdic:
                continue
            if postag is None or postag == "v":
                # skip verbs and words with no usable WordNet POS
                # tmp.append(word)
                continue
            else:
                tmp.append(lemmatizer.lemmatize(word, postag))
        koss.append(tmp)
    infile.close()
    return koss
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

# Load tokenized data of tweets
tokens = twitter_samples.tokenized('positive_tweets.json')
# tag tokens
tagged_tokens = pos_tag_sents(tokens)
# print output of a couple of tagged tweets
print(tagged_tokens[:2])

# count nouns
noun_count = 0
for tweet in tagged_tokens:
    for pair in tweet:
        tag = pair[1]
        if tag == 'NN':
            noun_count += 1
print('Number of nouns: ', noun_count)
if 'train' in record_file:
    all_M_vals = [record[6] for record in records]
    M = max(all_M_vals)
    avg_M = sum(all_M_vals) // len(records)
    if use_avg_M_plus_mode:
        M_counts = Counter(all_M_vals)
        avg_M += max(M_counts.items(), key=operator.itemgetter(1))[0]
positions = create_pos_index(sentences_and_indexes, M, avg_M)
num_positions = len(positions) if fire_positions else 0
norm_sents = [normalize(M, sentence, avg_M)
              for sentence in sentences_and_indexes]
if fire_tagger:
    tagged_sents = pos_tag_sents(
        [[w if w is not None else 'none' for w in sent] for sent in norm_sents])
else:
    tagged_sents = None
if max_suffix_size == 0:
    word_lengths = [len(w) for w in words]
    suffix_length = sum(word_lengths) // num_words
else:
    suffix_length = max_suffix_size
num_char_embeddings = suffix_length * char_emb_dims
num_cats = cat_dims * suffix_length
len_token_vec = (num_words + num_positions + num_clusters + num_suffixes
                 + num_shapes + num_tags + num_embeddings
                 + num_char_embeddings + num_cats)
feat_val = ':1.0'
with open(out_file, 'w+') as lib_out:
    for i, sentence in enumerate(sentences_and_indexes):
def tag_POS(process_pos):
    return pos_tag_sents(process_pos)
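# Usage sketch: tag_POS expects a list of token lists (one per sentence),
# which is exactly what pos_tag_sents consumes:
from nltk import word_tokenize, sent_tokenize
from nltk.tag import pos_tag_sents

text = "I like NLTK. It tags quickly."
print(tag_POS([word_tokenize(s) for s in sent_tokenize(text)]))
# -> [[('I', 'PRP'), ('like', 'VBP'), ...], [('It', 'PRP'), ...]]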
    if not os.path.isdir(corpusdir):
        raise
try:
    os.makedirs(corpusdir + '/ratings')
except OSError:
    if not os.path.isdir(corpusdir + '/ratings'):
        raise
hotel = json.load(open(path + file))
stopset = hotelNameAddress(hotel)
stopgroup = " ".join(stopset)
with open(corpusdir + '/stopset.txt', 'w') as fout:
    fout.write(stopgroup)
revNum = 0
for review in hotel.get('Reviews'):
    revNum += 1
    contentOut = ""
    overall = review.get('Ratings').get('Overall')
    content = pos_tag_sents([word_tokenize(sentence)
                             for sentence in sent_tokenize(review.get('Content'))])
    with open(corpusdir + '/ratings/OverallRating' + str(revNum) + '.txt', 'w') as fout:
        fout.write(overall)
    with codecs.open(corpusdir + '/Review' + str(revNum) + '.txt', 'w', encoding="utf-8") as fout:
        for sentence in content:
            for word, pos in sentence:
                contentOut += word + "/" + pos + " "
            contentOut += '\n'
        fout.write(contentOut)
from nltk.corpus import gutenberg
from gensim.models import Word2Vec
#for POS (parts of speech)
from nltk.tag import pos_tag_sents

# Word2Vec expects an iterable of token lists, not a raw string, and
# pos_tag_sents tags token lists, not a trained model, so both calls
# are fed gutenberg.sents() here.
g_sents = gutenberg.sents()
g = Word2Vec(g_sents)
g_tagged = pos_tag_sents(g_sents)
g_len = len(g_tagged)
print(g_len)

#JJ_count = 0
#NN_count = 0
#for sent in g_tagged:
#    for pair in sent:
#        tag = pair[1]
#        if tag == 'JJ':
#            JJ_count += 1
#        elif tag == 'NN':
#            NN_count += 1
#print('Total number of adjectives = ', JJ_count)
#print('Total number of nouns = ', NN_count)
from nltk.corpus import gutenberg
#for POS (parts of speech)
from nltk.tag import pos_tag_sents

# gutenberg.sents() already yields one token list per sentence, which is the
# input format pos_tag_sents expects; tokenizing the file *name* was a bug.
emma_sents = gutenberg.sents('austen-emma.txt')
emma_tagged = pos_tag_sents(emma_sents)

JJ_count = 0
NN_count = 0
for words in emma_tagged:
    for pair in words:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1
print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)
# https://www.digitalocean.com/community/tutorials/how-to-work-with-language-data-in-python-3-using-the-natural-language-toolkit-nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
tweets_tagged = pos_tag_sents(tweets_tokens)
print(tweets_tagged)

JJ_count = 0
NN_count = 0
for tweet in tweets_tagged:
    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1
print('NN count==', NN_count)
print('JJ_count==', JJ_count)
ff = open('demo1.txt', 'w', encoding='UTF-8')
for line in lines:
    # count words in the source text
    line_list = line.split()
    words_list = [i for i in line_list if i not in english_punctuations]
    count_org += len(words_list)
    for w in words_list:
        # handle tokens like Prob(source=l)
        if '(' in w and w[0] != '(' and w[-1] != '(':
            print(w)
            count_org += 1
        elif ')' in w and w[0] != ')' and w[-1] != ')' and w[-1] not in english_punctuations[6:12]:  # e.g. (2016),
            print(w)
            count_org += 1
    line_str, pre_word = '', ''
    res = pos_tag_sents([word_tokenize(i) for i in sent_tokenize(line)])
    for sents in res:  # each sentence
        for word_tuple in sents:  # each token in the sentence
            if word_tuple[0] in english_punctuations:
                word = ''.join([' ' for i in english_punctuations[0:3] if word_tuple[0] == i]) + word_tuple[0]
            else:
                if pre_word in [' ' + j for j in english_punctuations[0:3]]:
                    word = "_".join(word_tuple)
                else:
                    word = ' ' + "_".join(word_tuple)
                count += 1
                # print("_".join(word_tuple))
            pre_word = word
            line_str += word
    ff.write(line_str)
    ff.write('\n')
def extract_entity_patterns(self, chunk={}):
    color_mapping = {
        'magenta': ['NN', 'NNS'],
        'green': ['NNP', 'NNPS'],
        'cyan': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        'yellow': ['JJ', 'JJR', 'JJS']
    }
    # reverse the color mapping: POS tag -> color
    # (items() replaces the Python 2-only iteritems())
    color_mapping = {v: k for k, values in color_mapping.items() for v in values}
    for entity, relations in chunk.items():
        cleaned_subject_entity_name = uri_rewriting.strip_cleaned_name(entity)
        subject_entity = uri_rewriting.strip_name(entity)
        for rel_ontology, values in relations.items():
            target_resources = values['resources']
            sentences = values['sentences']
            rel_ontology = rel_ontology.split('/')[-1]
            data = [{'entity': cleaned_subject_entity_name,
                     'relation': rel_ontology,
                     'resource': res,
                     'sentence': sent}
                    for res in target_resources
                    for sent in sentences
                    if sent.contains_any_link([res]) and res != entity]
            # remove needless sentence information based on relation facts
            # data = map(self.shorten_sentence, data)
            # POS-tag sentences
            for entry in data:
                sentence = entry['sentence']
                if sentence.number_of_tokens() > 50:
                    continue  # probably too long for the Stanford tokenizer
                resource = entry['resource']
                nl_sentence = sentence.as_string()
                relative_position = sentence.relative_pos
                entry['nl sentence'] = nl_sentence
                tokenized_sentences = [word_tokenize(nl_sentence)]
                pos_tagged_sentences = pos_tag_sents(tokenized_sentences).pop()
                object_addresses = sentence.addresses_of_link(resource)
                object_entity = uri_rewriting.strip_name(resource)
                pattern = self.pattern_extractor.extract_pattern(
                    nl_sentence, object_addresses, relative_position,
                    self.type_learning, subject_entity, object_entity)
                if pattern is not None:
                    values['patterns'].append(pattern)
                    entry['pattern'] = pattern
                # color sentence parts according to POS tag
                colored_sentence = [colored(word, color_mapping.setdefault(pos, 'white'))
                                    for word, pos in pos_tagged_sentences]
                colored_sentence = ' '.join(colored_sentence)
                # remove space before commas
                colored_sentence = re.sub(r' (.\[\d+m),', ',', colored_sentence)
                entry['colored_sentence'] = colored_sentence
            self.matches.extend(data)
from typing import List, Dict  # type hinting is important
from nltk.corpus import twitter_samples  # JSONs with tweets
from nltk.tag import pos_tag_sents  # part-of-speech tagger

# Each tweet is an item
pos_tweets: list = twitter_samples.strings('positive_tweets.json')
# Goal: count adjectives (descriptors) and nouns (things)
'''
1. Tokenization: breaking up a sequence of strings into words/phrases.
   Regardless, each piece is a token.
   * In this case the strings are being split at each space
'''
pos_tweets_tokens: list = twitter_samples.tokenized('positive_tweets.json')
# tuples with token and tag
pos_tweets_tagged: list = pos_tag_sents(pos_tweets_tokens)
'''
Tags:
JJ: adjective
NN: singular noun
NNS: plural noun
'''
# Let us count how many times these appear!
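# The snippet stops before the counting loop; a minimal completion in the
# same style as the other tweet-counting examples in this collection:
JJ_count: int = 0
NN_count: int = 0
for tweet in pos_tweets_tagged:
    for token, tag in tweet:
        if tag == 'JJ':
            JJ_count += 1
        elif tag in ('NN', 'NNS'):
            NN_count += 1
print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)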
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

JJ_count = 0
NN_count = 0
tweets_tagged = pos_tag_sents(tweets_tokens)
for tweet in tweets_tagged:
    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1
print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)
"data/FMFS_Module_8_Verified_Post_Verbatim_TScript.txt", } text = "" for fname in fnames2: inFile = open(fname, 'r') text += inFile.read() text = text.replace('’', "'") verbs = {} sents = sent_tokenize(text) # print(sents) for i, s in enumerate(sents): sents[i] = word_tokenize(s) taggedsents = pos_tag_sents(sents) # print(taggedsents) for s in taggedsents: # print(s) for w, t in s: if t in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']: # print(w) if (verbs.get(w.lower()) is not None): verbs[w.lower()] += 1 else: verbs[w.lower()] = 1 sorted_verbs = sorted(verbs.items(), key=operator.itemgetter(1)) sorted_verbs.reverse() i = 0
argparser.add_argument('file', help="text document")
args = argparser.parse_args()
stopwords = stopwords.words('english')
doc_path = os.path.splitext(args.file)[0]
tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)
buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''
tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
# materialize the filtered lists so they can be reused (filter() is lazy in Python 3)
clean_parags = [[tw for tw in p if tw[0] not in stopwords] for p in tagged_parags]
for i, p in enumerate(parags):
    buffer_tiled += p
    for word, tag in tagged_parags[i]:
        buffer_tiled_tagged += word + "/" + tag + ' '
        if word not in stopwords:
            if tag[0] == 'V':
                tag_abstract = 'verb'
            elif tag[0] == 'N':
                tag_abstract = 'noun'
            else:
                continue
            buffer_tiled_tagged_clean += word + ' ' + tag_abstract + '\n'
def tagText(text):
    sents = [word_tokenize(s) for s in sent_tokenize(text)]
    taggedSents = pos_tag_sents(sents)
    return taggedSents
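# Usage sketch for tagText (assumes the same nltk tokenize imports used above):
tagged = tagText("Time flies. Fruit flies like a banana.")
for sent in tagged:
    print(sent)
# e.g. [('Time', 'NNP'), ('flies', 'VBZ'), ('.', '.')]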
from nltk.tag import pos_tag_sents
import nltk
import os

file = open('sam2.txt', 'r')
texts = file.readlines()
text_tokenized = []
for txt in texts:
    [date, text] = txt.strip().split('@')
    text = text.lower()
    text_tokenized.append(nltk.word_tokenize(text))
pos = pos_tag_sents(text_tokenized)

journal = open('journal.txt', 'w')
journal.write('------------------------------------------------------------------------------------------\n')
journal.write('Date \t\t\t\t\tParticulars\t\t\t\t\t\t\t\t\tAmount(Rs.)\n')
journal.write('%-10s %-20s%20s' % ('', ' Dr.', 'Cr.\n'))
journal.write('==========================================================================================\n\n')
cash = open(os.path.join('ledgers', 'cash'), 'w')
cash.write('------------------------------------------------------------------------------------------\n')
cash.write('Date \t\t\t\t\tParticulars\t\t\t\t\t\t\t\t\tAmount(Rs.)\n')
cash.write('%-10s %-20s%20s' % ('', ' Dr.', 'Cr.\n'))
cash.write(
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 2 14:59:21 2017

@author: hp
"""
import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

JJCount = 0
NNCount = 0
tweets = twitter_samples.strings('positive_tweets.json')
"""Each tweet is stored as a string in the list variable 'tweets'."""
tweets_token = twitter_samples.tokenized('positive_tweets.json')
"""Each tweet string needs to be broken down into keywords, phrases, symbols, etc.; these are called tokens."""
tweets_tagged = pos_tag_sents(tweets_token)
for tweet in tweets_tagged:
    for pair in tweet:
        if pair[1] == 'NN':
            NNCount += 1
        elif pair[1] == 'JJ':
            JJCount += 1
print('Total Adjectives= ', JJCount)
print('Total Nouns= ', NNCount)
stopwords = stopwords.words("english") doc_path = os.path.splitext(args.file)[0] tt = TextTilingTokenizer() text = codecs.open(doc_path + ".txt", "r", "utf-8").read() parags = tt.tokenize(text) buffer_tiled = "" buffer_tiled_tagged = "" buffer_tiled_tagged_clean = "" tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags]) clean_parags = [filter(lambda taggedword: taggedword[0] not in stopwords, p) for p in tagged_parags] for i, p in enumerate(parags): buffer_tiled += p for word, tag in tagged_parags[i]: buffer_tiled_tagged += word + "/" + tag + " " if word not in stopwords: if tag[0] == "V": tag_abstract = "verb" elif tag[0] == "N": tag_abstract = "noun" else: continue buffer_tiled_tagged_clean += word + " " + tag_abstract + "\n"