def condensify(train):
    """
    Takes either a string or a list of strings.
    Returns a list of all summaries; for a string, returns a singleton list.
    """
    summ_list = []
    if isinstance(train, str):
        train = [train]
    for t in train:
        summ = []
        k = 0
        # corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        # print(matrix)
        u, sigma, vt = sparse.linalg.svds(matrix)
        (k, l) = vt.shape
        while k >= 1:
            if reuters.sents(t)[vt[k - 1].argmax()] not in summ:
                summ.append(reuters.sents(t)[vt[k - 1].argmax()])
            k -= 1
        v = []
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
        summ_list.append(summ)
    return summ_list
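# --- Hedged usage sketch for condensify() above (not from the original source). ---
# Assumes the imports the snippet relies on: nltk's reuters corpus, gensim's
# corpora/matutils, and scipy.sparse (scipy.sparse.linalg provides svds).
from nltk.corpus import reuters
from gensim import corpora, matutils
from scipy import sparse
import scipy.sparse.linalg  # noqa: F401  (makes sparse.linalg available)

sample_ids = reuters.fileids()[:2]          # summarize two documents by fileid
summaries = condensify(sample_ids)
for fid, summary in zip(sample_ids, summaries):
    print(fid, summary[:80])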
def import_reuters_flat_pos(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`, POS-tagging each sentence. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_flat_pos(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    tagger = nltk.data.load("./models/treebank_brill_aubt/treebank_brill_aubt.pickle")
    if not silent:
        total = len(reuters.sents())
        counter = 0
    root_handle = ds.insert("#reuters")
    for sent in reuters.sents():
        sent = tagger.tag(sent)
        norm = [nltk.tuple2str(t) for t in sent]
        sen_handle = ds.insert(norm)
        ds.link(root_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 100 == 0):
                print("importing %s of %s sentences..." % (counter, total), file=log)
def getSummaries(cfs, number_of_summaries):
    for n in range(number_of_summaries):
        # get a random article from the corpus
        article = random.choice(reuters.fileids())
        # make sure the article is of the appropriate length
        # I decided at least 5 sentences!
        while len(reuters.sents(article)) < 5:
            article = random.choice(reuters.fileids())
        length = len(reuters.sents(article)) // 2
        summary_sentences = cfs.summarize(article, length)
        print_summary(summary_sentences)
def load_reuters(self):
    sents = reuters.sents()
    print "Done loading reuters, cleaning..."
    # clean, etc...
    data = []
    for sentence in sents:
        x = []
        for word in sentence:
            if word in self.punctset:
                continue
            w = word.lower().strip(self.punctuation)
            ind = self.word2id.get(w, -1)
            if ind < 0:
                ind = len(self.word2id)
                self.word2id[w] = ind
            x.append(ind)
        data.append(x)
    print "Done cleaning reuters, vectorizing..."
    X = sparse.lil_matrix((
        len(self.word2id),
        len(data),
    ))
    for (j, dat) in enumerate(data):
        for i in dat:
            X[i, j] = 1
    return X
def language_mode():
    # Load the corpus from NLTK
    categories = reuters.categories()
    corpus = reuters.sents(categories=categories)
    print(f'top 3 corpus is:\n {corpus[:3]}')

    term_count = {}
    bigram_count = {}
    for doc in corpus:
        doc = ['<s>'] + doc
        for i in range(0, len(doc) - 1):
            # bigram: [i, i + 1]
            term = doc[i]
            bigram = doc[i:i + 2]
            if term in term_count:
                term_count[term] += 1
            else:
                term_count[term] = 1
            bigram = ' '.join(bigram)
            if bigram in bigram_count:
                bigram_count[bigram] += 1
            else:
                bigram_count[bigram] = 1
    print(f'term_count length is: \n {len(term_count)}')  # {'<s>': 54716, 'ASIAN': 12, 'EXPORTERS': 46, 'FEAR'
    print(f'bigram_count length is: \n {len(bigram_count)}')  # {'<s> ASIAN': 4, 'ASIAN EXPORTERS': 1, 'EXPORTERS FEAR': 1, 'FEAR DAMAGE': 1,
    return term_count, bigram_count
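# --- Hedged follow-up (not in the original): estimate an add-one-smoothed bigram
# probability from the counts returned by language_mode() above. ---
term_count, bigram_count = language_mode()
V = len(term_count)

def bigram_prob(w1, w2):
    # P(w2 | w1) with Laplace smoothing over the observed vocabulary
    return (bigram_count.get(f'{w1} {w2}', 0) + 1) / (term_count.get(w1, 0) + V)

print(bigram_prob('<s>', 'ASIAN'))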
def reuters_to_df(set_name, label_to_idx):
    data = [x for x in reuters.fileids() if set_name in x]
    # collect all data to create df from
    all_texts = [
        " ".join([" ".join(sen) for sen in reuters.sents(doc_id)])
        for doc_id in data
    ]
    all_labels = np.zeros((len(all_texts), len(label_to_idx)))
    all_label_indices = [[
        label_to_idx[lab] for lab in reuters.categories(doc_id)
    ] for doc_id in data]
    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1
    all_labels = all_labels.astype(int)
    # all_labels[all_label_indices] = 1
    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in reuters.categories()]
    cols.extend(label_cols)
    # create df and set values
    df = pd.DataFrame(columns=cols)
    df["text"] = all_texts
    df[label_cols] = all_labels
    return df
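# --- Hedged usage sketch for reuters_to_df() above (not from the original repo):
# build label_to_idx from the corpus categories and create the two splits. ---
from nltk.corpus import reuters

label_to_idx = {lab: i for i, lab in enumerate(reuters.categories())}
train_df = reuters_to_df("training", label_to_idx)
test_df = reuters_to_df("test", label_to_idx)
print(train_df.shape, test_df.shape)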
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus files into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), file=log)
def filesForEval(cfs):
    # the number of sentences in each summary to be generated
    length = 5
    # the number of summaries to be written into the file
    number_of_summaries = 10
    # create the file containing the summaries
    summFile = open('random_order_summaries_2.txt', 'w')
    # create the file which will be the key
    keyFile = open('key_2.txt', 'w')
    # generate the appropriate number of summaries
    for n in range(number_of_summaries):
        # get a random article from the corpus
        article = random.choice(reuters.fileids())
        # make sure the article is of the appropriate length
        # (at least four times the length of the summary in this case)
        while len(reuters.sents(article)) < length * 4:
            article = random.choice(reuters.fileids())
        # print info about the article into the summary-containing document
        summFile.write('Article #' + str(n) + '\n')
        summFile.write('\nfileid: ' + article + '\n\n')
        # print info about the article into the key document
        keyFile.write('Article #' + str(n) + '\n')
        keyFile.write('fileid: ' + article + '\n\n')
        # get a list of sentences that is the summary generated by our algorithm
        summSents = cfs.summarize(article, length)
        # insert a marker to make sure we remember it is the summary
        summSents.insert(0, 'summ')
        # get a list of sentences that were randomly ordered
        randSents = getRandom(article, length)
        # insert a marker to make sure we remember it is the random selection
        randSents.insert(0, 'rand')
        # mix up the ordering
        summs = [summSents, randSents]
        random.shuffle(summs)
        # write the summaries into the file and write the key, in a semi-nice format
        for summ in summs:
            for i, sentence in enumerate(summ):
                if i == 0:
                    keyFile.write(sentence)
                else:
                    for word in sentence:
                        summFile.write(word + ' ')
                    summFile.write('\n')
                summFile.write('\n')
                keyFile.write(' ')
            summFile.write('\n')
            keyFile.write('\n')
    summFile.close()
    keyFile.close()
def __init__(self):
    training_files = [
        fileid for fileid in reuters.fileids()
        if fileid.startswith('training')
    ]
    super(ReutersTrainingCorpus, self).__init__(reuters.sents(training_files))
def create_raw_data_for_classifier(start_pt, end_pt):
    # printing process id
    # print("ID of process running : {}".format(os.getpid()))
    # df_for_raw = pd.DataFrame(columns=['sentences','polarity'])
    pos = 0
    neg = 0
    polarity_list = []
    sentncs_list = []
    # for i in range(len(reuters.sents())):
    for i in range(start_pt, end_pt):
        sentncs = " ".join(reuters.sents()[i])
        # print("sentncs = ", sentncs)
        blob = TextBlob(sentncs)
        # keep texts and labels aligned: neutral sentences are skipped
        if blob.sentiment.polarity > 0:
            sentncs_list.append(sentncs)
            polarity_list.append('pos')
            pos = pos + 1
        elif blob.sentiment.polarity < 0:
            sentncs_list.append(sentncs)
            polarity_list.append('neg')
            neg = neg + 1
    raw_data = list(zip(sentncs_list, polarity_list))
    # print(reutersDf.tail(10))
    print("raw_data len = ", len(raw_data))
    print("Total pos = ", pos, " Total Neg =", neg)
    print(raw_data[0])
    return raw_data
def reuters_idf_dict(current_docs, file_name, order=2):
    """
    Compute (or load a cached) concept IDF dictionary over the Reuters corpus,
    then extend it with the concepts of `current_docs`.
    """
    idf_file = file_name + ".idf"
    dict_idf = {}
    if os.path.exists(idf_file):
        with open(idf_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split("\t")
                # print(values)
                dict_idf[tuple(values[0].split())] = float(values[1])
        dict_idf = make_concept_idf_dict(current_docs, dict_idf, len(reuters.fileids()))
        return dict_idf
    else:
        logger.info("Process reuters idf.")
        l_docs = []
        for fileid in reuters.fileids():
            l_docs.append(reuters.sents(fileids=[fileid]))
        dict_idf = make_concept_idf_dict(l_docs, order=order)
        with open(idf_file, 'w', encoding='utf-8') as f:
            for concept in dict_idf.keys():
                f.write(' '.join(concept) + '\t' + str(dict_idf[concept]) + '\n')
        dict_idf = make_concept_idf_dict(current_docs, dict_idf, len(reuters.fileids()))
        return dict_idf
def __init__(self, vocab_path: os.PathLike, spell_error_path: os.PathLike):
    # Build the vocabulary
    with open(vocab_path) as f:
        self.vocab = {line.strip() for line in f}
    self.vocab_size = len(self.vocab)

    # Load the corpus and build the language model
    categories = reuters.categories()
    corpus = reuters.sents(categories=categories)
    self.unigram_count, self.bigram_count = defaultdict(int), defaultdict(int)
    for doc in corpus:
        doc = ['<s>'] + doc
        for i in range(1, len(doc)):
            self.unigram_count[doc[i]] += 1
            self.bigram_count[(doc[i - 1], doc[i])] += 1

    # Estimate the channel (error) probabilities P(mistake | correct)
    self.channel_prob = defaultdict(dict)
    with open(spell_error_path) as f:
        for line in f:
            temp = line.split(':')
            correct = temp[0].strip()
            mistakes = [m.strip() for m in temp[1].strip().split(',')]
            for m in mistakes:
                self.channel_prob[correct][m] = 1. / len(mistakes)
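# --- Hedged illustration (not part of the original class): a noisy-channel score
# for a candidate correction can combine channel_prob with an add-one-smoothed
# bigram language model built from the counts above. `corrector` is a hypothetical
# instance of the class this __init__ belongs to. ---
import math

def noisy_channel_score(corrector, correct, mistake, prev_word, next_word):
    # log P(mistake | correct) + log P(correct | prev) + log P(next | correct)
    channel = corrector.channel_prob.get(correct, {}).get(mistake, 1e-10)
    V = len(corrector.unigram_count)
    lm_prev = (corrector.bigram_count[(prev_word, correct)] + 1) / (corrector.unigram_count[prev_word] + V)
    lm_next = (corrector.bigram_count[(correct, next_word)] + 1) / (corrector.unigram_count[correct] + V)
    return math.log(channel) + math.log(lm_prev) + math.log(lm_next)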
def initCount2():
    corpus_raw_text = reuters.sents(categories=reuters.categories())
    gram_count = {}
    count = [0, 0, 0]

    for sents in corpus_raw_text:
        sents = ['<s>'] + sents + ['</s>']

        # remove string.punctuation
        for words in sents[::]:  # use [::] to remove the continuous ';' ';'
            if (words in [
                    '\'\'', '``', ',', '--', ';', ':', '(', ')', '&', '\'',
                    '!', '?', '.'
            ]):
                sents.remove(words)

        # count the n-grams
        for n in range(1, 3):  # only compute 1- and 2-grams
            if (len(sents) <= n):  # this sentence is too short
                continue
            else:
                for i in range(n, len(sents) + 1):
                    gram = sents[i - n:i]  # e.g. ['richer', 'fuller', 'life']
                    key = ' '.join(gram)  # e.g. 'richer fuller life'
                    count[n] = count[n] + 1
                    if (key in gram_count):  # use dict's hash
                        gram_count[key] += 1
                    else:
                        gram_count[key] = 1

    with open("Count.pk", "wb") as fCount:
        pickle.dump([gram_count], fCount)
    return gram_count, count[0], count[1], count[2]
def getreuters():
    # don't tokenize
    from nltk.corpus import reuters
    reuterslist = []
    for article in reuters.fileids():
        reuterslist.append(reuters.sents(article))
    return reuterslist
def get_tokenized_sentences(dataset):
    if dataset == 'brown_corpus':
        return list(brown.sents())
    elif dataset == 'reuters_corpus':
        return list(reuters.sents())
    elif dataset == 'gatsby':
        with open('./data/gatsby.txt', 'r') as f:
            text = '\n'.join(f.readlines())
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
    elif dataset == 'RACE_corpus':
        df_1 = pd.read_csv('./data/middle_combined.csv')
        df_2 = pd.read_csv('./data/high_combined.csv')
        text = '\n'.join(list(df_1['text']) + list(df_2['text']))
        # This dataset appears to have an issue with period spacing
        text = text.replace(".", ". ")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
    elif dataset in ('news_small', 'news_large'):
        df = pd.read_csv(f'./data/all_the_{dataset}.csv')
        text = '\n'.join(list(df['content']))
        # collapse double spaces
        text = text.replace("  ", " ")
        text = text.replace("  ", " ")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
    elif dataset.startswith("books"):
        # Get all books
        if dataset == 'books':
            text = ""
            dif = ["middle", "high", "college"]
            for d in dif:
                files = glob.glob(f"./data/books/{d}/*.txt")
                for file in files:
                    with open(file, 'r') as f:
                        text += "\n".join(f.readlines()) + "\n"
        # Get all books of specified difficulty
        else:
            difficulty = dataset.split("_")[1]
            files = glob.glob(f"./data/books/{difficulty}/*.txt")
            text = ""
            for file in files:
                with open(file, 'r') as f:
                    text += "\n".join(f.readlines()) + "\n"
        # Underscores are used to indicate italics here and should be dropped.
        text = text.replace("_", "")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
    # Assume .txt document
    else:
        with open(dataset, 'r') as f:
            text = "\n".join(f.readlines())
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
def load_reuters():
    from nltk.corpus import reuters
    text = reuters.sents()
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[BOS_TOKEN, EOS_TOKEN, PAD_TOKEN])
    corpus = [vocab.convert_tokens_to_idx(sentence) for sentence in text]
    return corpus, vocab
def q4(query):
    print("\n" + "~" * 10 + " Q4 " + "~" * 10)
    # retrieve the sentence at index 7 and join it into a string,
    # removing punctuation since tokens are already separated out
    pick = " ".join(
        remove_punc(word) for word in reuters.sents()[7] if word not in punc)
    result = (pick, jaccard(query, pick))
    print(result)
def q1():
    global docs
    # retrieve sentences from three categories
    docs = list(reuters.sents(categories='bop'))
    docs = docs + reuters.sents(categories='cocoa')
    docs = docs + reuters.sents(categories='zinc')
    # print(docs)
    # convert list of lists into list of strings
    doc = []
    for sent in docs:
        # case fold each sentence
        doc.append(" ".join([
            word.lower() for word in sent
            if word not in set(string.punctuation)
        ]))
    docs = doc
def get_corpus():
    """To get nltk corpora."""
    from nltk.corpus import brown
    from nltk.corpus import reuters
    corpus = add_sent_marker(brown.sents())
    corpus = corpus + add_sent_marker(reuters.sents())
    corpus = add_sent_marker(corpus)
    return corpus
def train_skip(CBOW_skip=0, embed_size=100, window=5, min_count=5, epochs=5, workers=1):
    corpus_name = 'Brown'
    corpus = itertools.chain(reuters.sents(), brown.sents(), gutenberg.sents())
    corpus = list(corpus)
    model = Word2Vec(corpus, sg=CBOW_skip, size=embed_size, window=window,
                     min_count=min_count, workers=workers)
    logging.warning("[1]" + ",".join([corpus_name, str(CBOW_skip), str(embed_size), str(window)]))
    model.train(corpus, total_examples=len(corpus), epochs=epochs)
    model = gensim_to_embed(model)
    return wordsim_eval(model)
def get_reuters_token_list_by_sentence(num_doc=100):
    """ Get a test data set from the Reuters corpus.
    Stopwords are kept to see how HMM_LDA works with them.

    Parameters
    ----------
    num_doc: int
        number of documents to be returned

    Returns
    -------
    voca: ndarray
        vocabulary
    corpus: list
        nested list of word indices (document -> sentence -> word)
    """
    file_list = reuters.fileids()
    corpus = [reuters.sents(file_list[i]) for i in xrange(num_doc)]

    valid_voca = set(w.lower() for w in nltk.corpus.words.words())
    stop = stopwords.words('english')
    valid_voca = valid_voca.union(stop)

    tmp_corpus = list()
    voca_dic = dict()
    voca = list()
    for doc in corpus:
        tmp_doc = list()
        for sent in doc:
            tmp_sent = list()
            for word in sent:
                if word in valid_voca:
                    tmp_sent.append(word)
                    if word not in voca_dic:
                        voca_dic[word] = len(voca_dic)
                        voca.append(word)
            if len(tmp_sent) > 0:
                tmp_doc.append(tmp_sent)
        if len(tmp_doc) > 0:
            tmp_corpus.append(tmp_doc)

    # convert token list to word index list
    corpus = list()
    for doc in tmp_corpus:
        new_doc = list()
        for sent in doc:
            new_sent = list()
            for word in sent:
                new_sent.append(voca_dic[word])
            new_doc.append(new_sent)
        corpus.append(new_doc)

    return np.array(voca), corpus
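# --- Hedged usage sketch for get_reuters_token_list_by_sentence() above (not in
# the original source): voca maps word index -> word, so the first token of the
# first sentence of the first document can be recovered like this. ---
voca, corpus = get_reuters_token_list_by_sentence(num_doc=10)
first_word_idx = corpus[0][0][0]
print(voca[first_word_idx])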
def create_model_from_NLTK():
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        return create_model(filepath=filepath, save=False)
    else:
        from nltk.corpus import reuters, brown, gutenberg
        sents = reuters.sents() + brown.sents()
        for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
            sents += gsents
        return create_model(sentences=sents, savename=filepath)
def create_corpus():
    """Remove punctuation from the sentences of the corpus."""
    # Here, I am combining text from the reuters, webtext and brown corpora.
    rr_corpus = reuters.sents() + webtext.sents() + brown.sents()
    punctuations = [p for p in string.punctuation]
    cleaned_corpus = []
    for idx in range(len(rr_corpus)):
        cleaned_corpus.append(
            [w for w in rr_corpus[idx] if w not in punctuations])
    return cleaned_corpus
def _load_reuters_docs():
    test_docs = []
    train_docs = []
    i = 0
    for fileid in reuters.fileids():
        i += 1
        if 'test' in fileid:
            # test_docs.append((reuters.words(fileid), reuters.sents(fileid)))
            test_docs.append(
                TokenizedDoc(reuters.words(fileid), reuters.sents(fileid),
                             reuters.categories(fileid)))
        elif 'training' in fileid:
            # train_docs.append((reuters.words(fileid), reuters.words(fileid)))
            train_docs.append(
                TokenizedDoc(reuters.words(fileid), reuters.sents(fileid),
                             reuters.categories(fileid)))
        else:
            print(
                "Document not recognized as part of training-set or test-set while extracting the Reuters Corpus"
            )
    return train_docs, test_docs
def load_reuters_corpus() -> List[str]:
    nltk.download('reuters')
    sentences = list(
        filter(
            lambda sent: (len(sent) <= 30) and (len(sent) >= 3) and
            any(map(lambda word: word.isalpha(), sent)) and
            len(list(filter(lambda word2: word2.isupper(), sent))) < (len(sent) // 4),
            reuters.sents()))
    mdetok = TreebankWordDetokenizer()
    return list(
        map(
            lambda sent: mdetok.detokenize(
                (' '.join(sent).replace('``', '"').replace("''", '"')
                 .replace('`', "'")).split()),
            sentences))
def process_reuters():
    print 'reuters'
    from nltk.corpus import reuters
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_reuters_tmp.txt'
    for f in reuters.fileids():
        sents = reuters.sents(f)
        for i in range(len(sents)):
            sent = sents[i]
            if (word in sent):
                appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
    print count
def pre_process():
    raw_sentences = reuters.sents(reuters.fileids())

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(raw_sentences)

    count_thresh = 5
    low_count_words = [
        w for w, c in tokenizer.word_counts.items() if c < count_thresh
    ]
    for w in low_count_words:
        del tokenizer.word_index[w]
        del tokenizer.word_docs[w]
        del tokenizer.word_counts[w]

    word_index_dict = tokenizer.word_index
    index_word_dict = {word_index_dict[word]: word for word in word_index_dict}

    sentences_word_index = tokenizer.texts_to_sequences(raw_sentences)
    return sentences_word_index, word_index_dict, index_word_dict
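# --- Hedged usage sketch for pre_process() above (not in the original source):
# round-trip one sentence through the index dictionaries it returns. ---
sentences_word_index, word_index_dict, index_word_dict = pre_process()
first = sentences_word_index[0]
print(first)
print([index_word_dict[idx] for idx in first])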
def get_default_sentences() -> list:
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return (brown_sentences + gutenberg_sentences + reuters_sentences +
            webtext_sentences + inaugural_sentences)
def N_gramprediction(self, input_text):
    from nltk.corpus import reuters
    from nltk import bigrams, trigrams
    from collections import Counter, defaultdict
    import random

    # Create a placeholder for the model
    model = defaultdict(lambda: defaultdict(lambda: 0))

    # Count frequency of co-occurrence
    for sentence in reuters.sents():
        for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1

    # Transform the counts into probabilities
    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count

    # starting words
    text1 = str(input_text)
    text = list(text1.split(' '))
    sentence_finished = False
    print(text1)

    while not sentence_finished:
        # select a random probability threshold
        r = random.random()
        accumulator = .0
        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]
            # select words that are above the probability threshold
            if accumulator >= r:
                text.append(word)
                break
        if text[-2:] == [None, None]:
            sentence_finished = True

    return ' '.join([t for t in text if t])
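# --- Hedged standalone sketch of the same trigram idea (not the original method):
# build counts on a small slice of Reuters and inspect the normalized
# distribution for one two-word context. ---
from collections import defaultdict
from nltk import trigrams
from nltk.corpus import reuters

model = defaultdict(lambda: defaultdict(float))
for sentence in reuters.sents()[:5000]:      # a small slice, for speed
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
for context in model:
    total = sum(model[context].values())
    for w3 in model[context]:
        model[context][w3] /= total

# most likely continuations of a two-word context
print(sorted(model[('the', 'company')].items(), key=lambda kv: -kv[1])[:5])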
def getRandom(article, length):
    allSents = list(enumerate(list(reuters.sents(article))))
    print(len(allSents))
    sentences = []
    sentences.append(allSents[0])
    del allSents[0]
    for n in range(length):
        sent = random.choice(allSents)
        sentences.append(sent)
        allSents.remove(sent)
    sentences.sort()
    sentList = []
    for sent in sentences:
        sentence = sent[1]
        sentList.append(sentence)
    return sentList
def reuters_idf_dict(current_docs, file_name, order=2):
    """
    Compute (or load a cached) concept IDF dictionary over `current_docs`
    plus the Reuters corpus.
    """
    idf_file = file_name + ".idf"
    dict_idf = {}
    if os.path.exists(idf_file):
        with open(idf_file, 'r') as f:
            for line in f:
                values = line.split("\t")
                dict_idf[tuple(values[0].split())] = float(values[1])
        return dict_idf
    else:
        l_docs = []
        for doc in current_docs:
            l_docs.append(doc)
        for fileid in reuters.fileids():
            l_docs.append(reuters.sents(fileids=[fileid]))
        dict_idf = make_concept_idf_dict(l_docs, order)
        with open(idf_file, 'w') as f:
            for concept in dict_idf.keys():
                f.write(' '.join(concept) + '\t' + str(dict_idf[concept]) + '\n')
        return dict_idf
datafile = open(datapath, "r")
datalines = []
for i in range(1000):
    dataline = datafile.readline().split('\t')
    datalines.append(dataline)
datafile.close()
end = time.time()
print("time of loading data", end - start)

# begin correction
start = time.time()
n = 100
fres = open("result.txt", "w")
corpus_raw_text = reuters.sents(categories=reuters.categories())
corpus_text = []
for sents in corpus_raw_text:
    sents = ['<s>'] + sents + ['</s>']
    # remove string.punctuation
    for words in sents[::]:  # use [::] to remove the continuous ';' ';'
        if (words in [
                '\'\'', '``', ',', '--', ';', ':', '(', ')', '&', '\'', '!',
                '?', '.'
        ]):
            sents.remove(words)
    corpus_text.extend(sents)

vocab_corpus = {}.fromkeys(corpus_text).keys()
vocab_corpus = list(vocab_corpus)
vocabCount = len(vocab_corpus)
corpus_str = ' '.join(corpus_text)
def __init__(self, language):
    self.language = language

    # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et al., 2017)
    if language == 'english':
        self.avg_word_length = 5.3
        # from Beker, Henry; Piper, Fred. Cipher Systems: The Protection of Communications.
        self.char_frequency = {
            'a': 8.167, 'b': 1.492, 'c': 2.782, 'd': 4.253, 'e': 12.702,
            'f': 2.228, 'g': 2.015, 'h': 6.094, 'i': 6.966, 'j': 0.153,
            'k': 0.772, 'l': 4.025, 'm': 2.406, 'n': 6.749, 'o': 7.507,
            'p': 1.929, 'q': 0.095, 'r': 5.987, 's': 6.327, 't': 9.056,
            'u': 2.758, 'v': 0.978, 'w': 2.360, 'x': 0.150, 'y': 1.974,
            'z': 0.074
        }
        self.dic = pyphen.Pyphen(lang='en')
        self.reuters = reuters.words()
        self.unigram_counts = Counter(self.reuters)
        bigrams = []
        for sent in reuters.sents():
            bigrams.extend(nltk.bigrams(sent, pad_left=True, pad_right=True))
        self.bigram_counts = Counter(bigrams)
    else:  # spanish
        self.avg_word_length = 6.2
        # self.char_frequency = {'a': 12.525, 'b': 2.215, 'c': 4.139, 'd': 5.860, 'e': 13.681,
        #                        'f': 0.692, 'g': 1.768, 'h': 0.703, 'i': 6.247, 'j': 0.443,
        #                        'k': 0.011, 'l': 4.967, 'm': 3.157, 'n': 6.71, 'o': 8.683,
        #                        'p': 2.510, 'q': 0.877, 'r': 6.871, 's': 7.977, 't': 4.632,
        #                        'u': 3.927, 'v': 1.138, 'w': 0.017, 'x': 0.215, 'y': 1.008,
        #                        'z': 0.517, 'á': 0.502, 'é': 0.433, 'í': 0.725, 'ñ': 0.311,
        #                        'ó': 0.827, 'ú': 0.168, 'ü': 0.012}
        # self.dic = pyphen.Pyphen(lang='es')
        self.cess = cess.words()
        self.unigram_counts = Counter(self.cess)
        bigrams = []
        for sent in cess.sents():
            bigrams.extend(nltk.bigrams(sent, pad_left=True, pad_right=True))
        self.bigram_counts = Counter(bigrams)

    # self.clf = svm.SVC()
    # self.model = LogisticRegression()
    self.model = svm.SVC(gamma=5)
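# --- Hedged illustration (not from the original class): the unigram/bigram
# Counters built above can back simple frequency features for a target word in
# context. `cwi` is a hypothetical instance of the class this __init__ belongs to. ---
def bigram_frequency_feature(cwi, prev_word, word):
    # relative frequency of the (prev_word, word) bigram in the reference corpus
    total_bigrams = sum(cwi.bigram_counts.values())
    return cwi.bigram_counts[(prev_word, word)] / total_bigrams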
#!/usr/local/bin/python
import nltk, re, pprint
import triple
from nltk.corpus import reuters
from nltk.sem import relextract, extract_rels, rtuple

grammar = "Relation: {<DT>?<JJ>*<NN><V.*><NN>}"
cp = nltk.RegexpParser(grammar)
s = [nltk.pos_tag(sent) for sent in reuters.sents()[:30]]
# print sentence
# print cp.parse(sentence)
# nltk.ne_chunk
brown = nltk.corpus.brown

for sent in s:
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'Relation':
            print(tree)
            print(subtree.leaves())

'''
# IN = re.compile(r'.*\bof\b.*')
for i, sent in enumerate(s):
    sent = nltk.ne_chunk(sent)
    rels = extract_rels('PERSON', 'ORGANIZATION', doc=sent, corpus='ace', pattern=IN, window=7)
'''
self.logger.info("returning top terms") return [term for term, ig in term_ig[:k]] def top_common_words(self, k): self.logger.info("calculating top %d of %d word terms according to frequency", k, len(self._total_freq)) # terms = self.all_terms() # terms_freq = [(term, sum((term.frequency(doc) for doc in self._documents))) for term in terms] terms_freq = sorted(self._total_freq.items(), key=lambda x: x[1], reverse=True) self.logger.info("returning top %d word terms according to frequency", k) return [term for term, freq in terms_freq[:k]] if __name__ == '__main__': training_fileids = fileids = filter(lambda x: "training" in x, reuters.fileids()) documents = reuters.sents(training_fileids) # dict = set(reuters.words(training_fileids)) # print documents[0] # print " ".join(documents[0]) # print WordTerm("in").frequency(documents[0]) print 'Checking Vectorizer' documents = get_document_objects(documents) w = WordTermExtractor(documents, None) print documents[0].get_freq('BAHIA') print documents[0].get_freq('bahia') print ProjectParams.terms_matrix.total_freq
print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding abc sentence structures ({0})...".format(len(abc.sents())) for sentence in abc.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding reuters sentence structures ({0})...".format(len(reuters.sents())) for sentence in reuters.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding brown sentence structures ({0})...".format(len(brown.sents())) for sentence in brown.sents(): processed_count += 1 try:
""" There are three options to train the true caser: 1) Use the sentences in NLTK 2) Use the train.txt file. Each line must contain a single sentence. Use a large corpus, for example Wikipedia 3) Use Bigrams + Trigrams count from the website http://www.ngrams.info/download_coca.asp The more training data, the better the results """ # :: Option 1: Train it based on NLTK corpus :: print "Update from NLTK Corpus" NLTKCorpus = brown.sents()+reuters.sents()+nltk.corpus.semcor.sents()+nltk.corpus.conll2000.sents()+nltk.corpus.state_union.sents() updateDistributionsFromSentences(NLTKCorpus, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) # :: Option 2: Train it based the train.txt file :: """ #Uncomment, if you want to train from train.txt print "Update from train.txt file" sentences = [] for line in open('train.txt'): sentences.append(line.strip()) tokens = [nltk.word_tokenize(sentence) for sentence in sentences] updateDistributionsFromSentences(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist) """ # :: Option 3: Train it based ngrams tables from http://www.ngrams.info/download_coca.asp :: """ #Uncomment, if you want to train from train.txt
    # def terminals(self, term, document):
    #     return (
    #         self.bool(term, document), self.tf(term, document), self.tf_idf(term, document), self.tf_ig(term, document),
    #         self.tf_chi(term, document), self.tf_rf(term, document))

    def raw_terminals(self, term, document):
        return (self.bool(term, document), self.tf(term, document),
                self.max_prob_term_and_category(term, document),
                self.max_prob_term_not_category(term, document),
                self.avg_prob_term_category(term, document),
                self.avg_prob_term_not_category(term, document),
                self.first_occurrence_perc(term, document))


if __name__ == '__main__':
    training_fileids = fileids = filter(
        lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
        reuters.fileids())
    documents = [sum(reuters.sents(fid), []) for fid in training_fileids]
    doc = documents[0]
    term = terminals.WordTerm("in")
    docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]

    print docs_categories
    print doc

    fe = TWSCalculator(documents, docs_categories)

    print "tf =", fe.tf(term, doc), "idf =", fe.idf(term), "tf-idf =", fe.tf_idf(term, doc)

    term = terminals.WordTerm("in")
    print 'TF-CHI: ', fe.tf_chi(term, doc)
    print 'TF-CHI: ', fe.tf_chi(term, doc)
    print 'TF-IG: ', fe.tf_ig(term, doc)
    print 'TF-IG: ', fe.tf_ig(term, doc)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

from features import TWSCalculator
from readers import NewsgroupsReader
from terminals import get_document_objects, WordTermExtractor, WordTerm
from terms_lists.ng20_ig import ng_20_ig500
from terms_lists.r8_ig import r_eight_terms

__author__ = 'itay'

if __name__ == '__main__':
    cats_limiter = categories = ['earn', 'acq', 'crude', 'trade', 'money-fx',
                                 'interest', 'money-supply', 'ship']  # top 8
    training_fileids = fileids = filter(
        lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
        reuters.fileids(cats_limiter))

    training_documents = [" ".join(sum(reuters.sents(fid), [])) for fid in training_fileids]
    training_docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]
    training_documents = [doc.lower() for doc in training_documents]

    # training_documents_objects = get_document_objects(training_documents, training_docs_categories)
    # training_documents_objects = NewsgroupsReader(False).get_training()
    # training_documents = [d.doc for d in training_documents_objects]
    # training_docs_categories = [d.category for d in training_documents_objects]

    # top IG r8:
    # words = ng_20_ig500
    # tws_calculator = TWSCalculator(training_documents_objects, training_docs_categories)
    # word_term_extractor = WordTermExtractor(training_documents_objects, tws_calculator)
    #
    # top_terms = word_term_extractor.top_common_words(500)
def __init__(self):
    test_files = [fileid for fileid in reuters.fileids()
                  if fileid.startswith('test')]
    super(ReutersTestCorpus, self).__init__(reuters.sents(test_files))
#!/usr/bin/python
# coding:utf-8
# 2013/02/08
from nltk.corpus import reuters
import nltk

# Check which tags nltk.pos_tag() produces
sents = reuters.sents()
tags = set(tag for sent in sents[:5000] for word, tag in nltk.pos_tag(sent))
print tags
# set(['PRP$', 'VBG', 'VBD', '``', 'VBN', ',', "''", 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', 'NN', 'POS', '.', 'TO', 'PRP', 'RB', ':', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'LS', 'PDT', 'RBS', 'RBR', 'CD', '-NONE-', 'EX', 'IN', 'WP$', 'MD', 'NNPS', 'JJS', 'JJR'])

# VB, VBG, VBD, VBN, VBP, VBZ, JJ, JJS, JJR, NN, NNS, NNP, NNPS
# VB   : verb, base form
# VBD  : past tense
# VBG  : gerund / present participle
# VBN  : past participle
# VBP  : non-3rd person singular present
# VBZ  : 3rd person singular present
# JJ   : adjective
# JJR  : comparative adjective
# JJS  : superlative adjective
# NN   : singular noun
# NNS  : plural noun
# NNP  : singular proper noun
# NNPS : plural proper noun
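# --- Hedged follow-up sketch (not in the original script): use the tag set above
# to pull only content words (verbs, adjectives, nouns) out of tagged sentences. ---
content_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS'}
for sent in sents[:3]:
    tagged = nltk.pos_tag(sent)
    print([word for word, tag in tagged if tag in content_tags])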