def extractDataAndRunExperiment(self):
    print("Extracting data from NLTK corpus.")
    category_pairs = parameters.reuters_category_pairs
    for pair in category_pairs:  # do the experiment on the selected pairs
        # Extract data for the corresponding categories
        spam_category = pair[0]
        ham_category = pair[1]
        print("Experiment pair: " + str(spam_category) + ", " + str(ham_category))
        self.dataset_name = str(spam_category) + ", " + str(ham_category)
        spam_doc_ids = reuters.fileids(spam_category)  # acquire documents using all docids for spam
        ham_doc_ids = reuters.fileids(ham_category)  # acquire documents using all docids for ham
        spamlist = []  # dataset for spam words
        hamlist = []  # dataset for ham words
        for docid in spam_doc_ids:
            wordlist = reuters.words(docid)  # get the words for the document
            spamlist.append(wordlist)  # append the document's words to spamlist as a row
        for docid in ham_doc_ids:
            wordlist = reuters.words(docid)  # get the words for the document
            hamlist.append(wordlist)  # append the document's words to hamlist as a row
        # Preprocess the extracted data and merge it into self.dataset
        self.preprocessDocuments(spamlist, 1)  # tokenize (remove stopwords, punctuation, etc.), insert labels and dictionarize
        self.preprocessDocuments(hamlist, 0)  # tokenize (remove stopwords, punctuation, etc.), insert labels and dictionarize
        # Generate the training/test split and run the experiment
        self.runExperiment()
def find_suitable_text():
    """Find suitable text for the background by checking the length of Gutenberg texts,
    Brown categories and Reuters categories. Texts or categories over 50k words are
    marked green. The total length of each corpus is printed at the top as a header."""
    print('\033[95m')
    print("--------------- Gutenberg ---------------")
    print("Total Length: ", len(gutenberg.words()))
    print('\033[0m')
    for fid in gutenberg.fileids():
        words = gutenberg.words(fid)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text: ", fid)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # Brown texts are too short, therefore check categories
    print('\033[95m')
    print("--------------- Brown ---------------")
    print("Total Length: ", len(brown.words()))
    print('\033[0m')
    for cat in brown.categories():
        words = brown.words(categories=cat)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # Reuters texts are too short, therefore check categories.
    # Reuters actually has some funny categories; the categories are rather small,
    # but the total corpus is quite large.
    print('\033[95m')
    print("--------------- Reuters ---------------")
    print("Total Length: ", len(reuters.words()))
    print('\033[0m')
    for cat in reuters.categories():
        words = reuters.words(categories=[cat])
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", len(words))
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")
def get_train_test_docs():
    test_docs, train_docs = get_train_test_splits()
    # print(test_docs)
    test_doc = []
    train_doc = []
    for doc in test_docs:
        test_doc.extend(reuters.words(doc))
    for doc in train_docs:
        train_doc.extend(reuters.words(doc))
    return test_doc, train_doc
def __init__(self) -> None:
    global basewords
    self.nltkwords = dict()
    self.fdist = nltk.FreqDist([w.lower() for w in reuters.words()])
    total_samples = sum(self.fdist.values())
    for e in self.fdist.elements():
        self.nltkwords[e] = self.fdist.freq(e) * total_samples
    self.N = len(list(self.nltkwords.keys()))
    self.M = sum(self.nltkwords.values())
    self.media = self.M / self.N
    self.bigrm = list(nltk.bigrams(reuters.words()))
    basewords = self.nltkwords
def __init__(self) -> None:
    global basewords
    with open("reuters.json", "r", encoding="utf8") as file:
        basewords = json.load(file)
    self.nltkwords = dict()
    self.fdist = nltk.FreqDist([w.lower() for w in reuters.words()])
    total_samples = sum(self.fdist.values())
    for e in self.fdist.elements():
        self.nltkwords[e] = self.fdist.freq(e) * total_samples
    self.N = len(list(self.nltkwords.keys()))
    self.M = sum(self.nltkwords.values())
    self.media = self.M / self.N
    self.bigrm = list(nltk.bigrams(reuters.words()))
    basewords = self.nltkwords

def estimate(self, word):
    stemmer = PorterStemmer()
    ts1 = time.time()
    tmp = list()
    nword = len(word.split())
    for i, w in enumerate(word.split()):
        if w.strip() in self.nltkwords:
            if w.strip() in stops:
                # if i > 0 and (word.split()[i - 1], word.split()[i]) in self.bigrm:
                #     tmp.append(self.media * (1 + 0.5))
                # else:
                tmp.append(self.nltkwords[w] / self.media)
            else:
                # if i > 0 and (word.split()[i - 1], word.split()[i]) in self.bigrm:
                #     tmp.append(self.nltkwords[w] * (len(w) - 2) * (1 + 0.5))
                # else:
                stem = stemmer.stem(w)
                if stem != w and stem in self.nltkwords:
                    tmp.append((self.nltkwords[w] + self.nltkwords[stem]) * (len(w) - 2))
                else:
                    tmp.append(self.nltkwords[w] * (len(w) - 2))
        else:
            tmp.append(-self.media)
    ts2 = time.time()
    if len(tmp) > 0:
        atmp = np.asarray(tmp)
        # atmp = np.exp(atmp) / np.sum(np.exp(atmp))
        return np.nansum(atmp) / nword
    else:
        return 0
def compute_cosine(code_doc_1, code_doc_2):
    '''
    Computes the similarity between two documents from the angle between them.
    Input:
        code_doc_1: code of the first document, i.e. its id in the documents list
        code_doc_2: code of the second document
    Output:
        The cosine between the two documents
    '''
    text_1, _, _ = st.tokenize_sentences([pre_processing(list(reuters.words(code_doc_1)))])
    text_2, _, _ = st.tokenize_sentences([pre_processing(list(reuters.words(code_doc_2)))])
    pred_1 = model.predict(text_1)
    pred_2 = model.predict(text_2)
    return (cosine(pred_1, pred_2), text_1)
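# Usage sketch (an addition, not from the original module): compare two Reuters
# articles. Assumes the module-level objects used by compute_cosine above
# (`st`, `model`, `pre_processing`, `cosine`) are already loaded.
from nltk.corpus import reuters

doc_a, doc_b = reuters.fileids()[:2]  # any two valid Reuters fileids work
similarity, _ = compute_cosine(doc_a, doc_b)
print("cosine similarity:", similarity)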
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)
    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)
    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)
    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(len(documents), "documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " training documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print("Document words:\n", document_words)

    # Raw document
    print("Raw document:\n", reuters.raw(document_id))
def getWeightedQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)
    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)
    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)
    filteredparts = []
    for pair in tagged:
        if pair[1] in [
                'FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP',
                'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG', 'VBN',
                'VBP', 'VBZ', 'NUM', 'CD', 'OD'
        ]:
            filteredparts.append(pair)
    filtereddist = {}
    for pair in filteredparts:
        frequency = browndist[pair[0]] + reutersdist[pair[0]]
        if frequency < 600 or (pair[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
                               and frequency < 1500):  # more types here?
            filtereddist[pair[0]] = (frequency, pair[1])
    return filtereddist
    # ensure there are at least a certain number of keywords? add stemming or synonyms? add phrases?
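# Usage sketch (an addition, not part of the original code): the corpora and tagger
# must be downloaded first, e.g.
# nltk.download(['brown', 'reuters', 'punkt', 'averaged_perceptron_tagger']).
keywords = getWeightedQuestionKeywords("Who invented the telephone in 1876?")
for word, (frequency, tag) in keywords.items():
    print(word, frequency, tag)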
def __init__(self) -> None:
    with open("reuters.json", "r", encoding="utf8") as file:
        self.nltkwords = json.load(file)
    self.N = len(list(self.nltkwords.keys()))
    self.M = sum(self.nltkwords.values())
    self.media = self.M / self.N
    self.bigrm = list(nltk.bigrams(reuters.words()))
def simple_results(query, added_vocab=None):
    wnl = WordNetLemmatizer()
    invertedindex = pickle.load(open("invertedindex_test.pkl", "rb"))
    l = []
    flag = 0
    zero_result = 0
    words_used = []
    for i, j in pos_tag(word_tokenize(query.lower())):
        if j[0].lower() in ['a', 'n', 'v']:
            q = wnl.lemmatize(i, j[0].lower())
        else:
            q = wnl.lemmatize(i)
        if q not in invertedindex:
            words_used = []
            zero_result = 1
            break
        if q not in reuters.words("stopwords"):
            if flag == 0:
                l = list(invertedindex[q].keys())
                words_used.append(q)
            else:
                l1 = [
                    value for value in l
                    if value in list(invertedindex[q].keys())
                ]
                l = l1
                if len(l) == 0:
                    if added_vocab is not None:
                        words_used = []
                        zero_result = 1
                        break
                    else:
                        return l, words_used
            flag = 1
    if added_vocab is not None:
        if zero_result or len(l) < 5:
            for v in added_vocab:
                try:
                    if v[1].isalpha() and len(list(invertedindex[v[1]].keys())) > 0:
                        return list(invertedindex[v[1]].keys()) + l, words_used + [v[1]]
                except:
                    pass
        for v in added_vocab:
            if v[1].isalpha() == False or v[1] not in invertedindex.keys():
                continue
            if len(l) > 20:
                l1 = [
                    value for value in l
                    if value in list(invertedindex[v[1]].keys())
                ]
                if len(l1) < 20:
                    break
                l = l1
                words_used.append(v[1])
            else:
                return l, words_used
        return l, words_used
    return l, words_used
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    for fid in words.fileids():
        for word in words.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in gutenberg.fileids():
        for word in gutenberg.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in brown.fileids():
        for word in brown.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in reuters.fileids():
        for word in reuters.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in inaugural.fileids():
        for word in inaugural.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
def getKeywordsDraft(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)
    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)
    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)
    print(tagged)
    filteredparts = []
    for pair in tagged:
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS',
                       'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG',
                       'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])
    print(filteredparts)
    filtereddist = {}
    for word in filteredparts:
        frequency = browndist[word] + reutersdist[word]
        print(word)
        print(frequency)
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    print(sortedlist)
    return sortedlist
def main():
    # Data
    k = 100
    docs = [corpus.words(fileid) for fileid in corpus.fileids()[:k]]

    # Preprocessing
    pp_docs = preprocess_documents(docs)

    # Vectorization
    vectorizer = TfidfVectorizer(max_features=50, token_pattern=u'(?u)\\b\\w+\\b')
    tf_idf = vectorizer.fit_transform(pp_docs)

    # K-means
    num_clusters = 8
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300,
                random_state=0, precompute_distances=True)
    clusters = km.fit_predict(tf_idf)

    categories = [
        ','.join(corpus.categories(fileid)) for fileid in corpus.fileids()[:k]
    ]
    keys = []
    for key, _ in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]):
        keys.append(key)

    w_df = pd.DataFrame({'class': clusters, 'category': categories})
    k_df = pd.DataFrame({'key': keys})
    print(k_df)
    print(w_df)
    w_df.to_csv('result/kmeans_' + str(num_clusters) + '.csv')
    k_df.to_csv('result/tf_idf_key_' + str(num_clusters) + '.csv')
def reuters_dataframe(n=9160):
    def clean(words):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        words_lower = [w.lower() for w in words]
        return [w for w in words_lower if w not in stopwords and len(w) >= 3]

    def title(words):
        return words[:20]

    fileids = [i for i in reuters.fileids() if len(reuters.categories(i)) == 1][:n]
    df = pd.DataFrame({'text': [' '.join(clean(reuters.words(i))) for i in fileids],
                       'category': [reuters.categories(i)[0] for i in fileids],
                       'title': [' '.join(title(reuters.words(i))) for i in fileids],
                       'fileids': fileids,
                       'words': [reuters.words(i) for i in fileids]})
    return df
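# Usage sketch (an addition): build a small frame and inspect the label balance.
# Assumes pandas and nltk.corpus.reuters are imported as in reuters_dataframe above.
df = reuters_dataframe(n=500)
print(df['category'].value_counts().head())
print(df['title'].iloc[0])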
def load_english_frequencies():
    nltk.download(['brown', 'gutenberg', 'reuters'])
    global english_frequencies
    english_frequencies = FreqDist(w.lower() for w in brown.words())
    english_frequencies.update(w.lower() for w in gutenberg.words())
    english_frequencies.update(w.lower() for w in reuters.words())
def compute_word_frequencies(self):
    nltk.download('brown')
    nltk.download('reuters')
    counter = Counter()
    counter.update(self.normalize_words(brown.words()))
    counter.update(self.normalize_words(reuters.words()))
    return counter
def retrieve_reuters_documents(max_documents=-1, filter_words=True):
    # List of documents
    documents = []
    training_files = [
        file_id for file_id in reuters.fileids() if 'training/' in file_id
    ]
    test_files = [
        file_id for file_id in reuters.fileids() if 'test/' in file_id
    ]
    for file_id in interleave([training_files, test_files]):
        if max_documents > -1 and len(documents) >= max_documents:
            return documents
        words = list(reuters.words(fileids=file_id))
        words_filtered = do_filter_words(words)
        document = {
            'words': words,
            'title': reuters.raw(fileids=file_id).split("\n")[0],
            'categories': reuters.categories(fileids=file_id),
            'is_training_example': True if 'training/' in file_id else False,
            'is_test_example': True if 'test/' in file_id else False,
            'words_filtered': words_filtered if filter_words else words,
            'file_id': file_id
        }
        if len(words_filtered) < 30:
            continue
        documents.append(document)
    return documents
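# Usage sketch (an addition, not from the original module): fetch a small sample and
# split it back into train/test lists using the stored flags. Assumes interleave()
# and do_filter_words() from the same module are available.
docs = retrieve_reuters_documents(max_documents=200)
train = [d for d in docs if d['is_training_example']]
test = [d for d in docs if d['is_test_example']]
print(len(train), "training docs,", len(test), "test docs")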
def __init__(self, category, index, m, k, contigous=True, blob_length=100000):
    '''
    :param category: the name of the document's category
    :param index: the index of the document in the Reuters data set
    :param m: the number of top features
    :param k: the length of each feature
    :param contigous: Boolean, True if features are contiguous, False otherwise
    '''
    self.m = m
    self.k = k
    self.index = index
    self.category = category
    self.contigous = contigous
    self.features = set()
    self.words = reuters.words(index)
    self.clean_data = self.remove_stops()
    if not self.contigous:
        self.blob_length = min(blob_length, len(self.words))
        self.noncont_features = defaultdict(lambda: {'count': 0, 'weights': []}, {})
        self.words = self.words[:self.blob_length]
    self.set_features()
    self.m = min(self.m, len(self.features))
    self.sort_features()
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    labeled_words = []
    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))
    return high_information_words(labeled_words, score_fn=score_fn)
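# Usage sketch (an addition): assumes high_information_words() follows the common
# cookbook-style helper signature and returns a set of words.
high_info = reuters_high_info_words()
print(len(high_info), "high-information words found across Reuters categories")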
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))

    tokens = []
    for docid in train_docs:
        t = tokenize(reuters.raw(docid))
        tokens.extend(t)
    print(tokens[0])
    v = set(tokens)
    print("number of terms=", len(tokens))
    print("voc size=", len(v))
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model

    reuters, stopwords, english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum number of vocabulary size for the returned corpus
    remove_top_n: int
        remove top n frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of list of word id for each document
    doc_cnt: list
        list of list of word count for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in range(num_doc)]
    return get_ids_cnt(corpus, max_voca, remove_top_n)
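# Usage sketch (an addition, based only on the docstring above): inspect the sparse
# bag-of-words output. Assumes get_ids_cnt from the same module is importable and
# the corpora have been downloaded via nltk.download().
voca_list, doc_ids, doc_cnt = get_reuters_ids_cnt(num_doc=20, max_voca=2000)
first_word_id = doc_ids[0][0]
print(voca_list[first_word_id], doc_cnt[0][0])  # a word from document 0 and its count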
def generateKnownPatterns():
    from nltk.corpus import brown, reuters, words, wordnet
    from string import ascii_lowercase as ALPH
    patterns = {}
    wordlist = sorted(
        set([x.lower() for x in brown.words()] +
            [x.lower() for x in reuters.words()] +
            [x.lower() for x in words.words()] +
            [x.lower() for x in wordnet.all_lemma_names()]))
    for word in list(wordlist):
        if any(x not in ALPH for x in word):
            wordlist.remove(word)
    with open("static/txt/wordlist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(wordlist))
    for word in wordlist:
        p = pattern(word)
        if p in patterns:
            patterns[p].append(word)
        else:
            patterns[p] = [word]
    with open("static/txt/patterns.json", "w", encoding="utf-8") as f:
        json.dump(patterns, f)
def get_word_features(category):
    """Return a list of words that occur in the documents of the specified category."""
    # print("Getting word features..")
    all_words = nltk.FreqDist(w.lower() for w in reuters.words(categories=category))
    word_features = list(all_words)[:5000]
    return word_features
def build(self, vocab_size=5000):
    words = reuters.words()
    words = [self.trim(w) for w in words]
    words = [w for w in words if w]
    freq = Counter(words)
    freq = freq.most_common(vocab_size)
    self.vocab = [w_c[0] for w_c in freq]
    self.vocab = [self.PAD, self.UNK] + self.vocab
def main(categories, document_ids, verbose=False):
    print("categories: {}".format(categories))
    print("number of categories: {}".format(len(categories)))
    cat2catid = {}
    for catid, cat in enumerate(sorted(categories)):
        cat2catid[cat] = catid
    documents = document_ids
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    print("train documents: {}".format(len(train)))
    print("test documents: {}".format(len(test)))

    # Make it easy to map data to label; gather simple statistics
    id2cats = defaultdict(list)
    cat2count = {}
    for cat in categories:
        for fid in reuters.fileids(cat):
            id2cats[fid].append(cat)
            if cat not in cat2count:
                cat2count[cat] = {'train': 0, 'test': 0, 'words': []}
            if fid in train:
                cat2count[cat]['train'] += 1
            else:
                cat2count[cat]['test'] += 1
            cat2count[cat]['words'].append(len(reuters.words(fid)))

    print("How many labels do documents usually have?")
    labelcount2doccount = defaultdict(int)
    for _, cats in id2cats.items():
        labelcount2doccount[len(cats)] += 1
    s = sorted(labelcount2doccount.items(), reverse=True, key=lambda n: n[1])
    for labelcount, documentcount in s:
        print("\tlabelcount={:>3}, documentcount={:>3}".format(
            labelcount, documentcount))

    # Analyze data distribution to classes
    analyze_data_distribution(cat2count)

    # Build corpus
    corpus = []
    for document_id in train:
        corpus += list(reuters.words(document_id))
    analyze_vocabulary(corpus)
def docs_to_matrix(self, docs, seq_size):
    docs_i = []
    for d in docs:
        words = reuters.words(d)
        words = self.sentence_to_ids(words, seq_size)
        docs_i.append(words)
    docs_i = np.array(docs_i)
    return docs_i
def create_dictionary(data, corpus):
    # for p in data.split('\r\n'):
    #     untrimmed_docs.append(p)
    #     docs.append(clean_text(p))
    # untrimmed_docs.append(data)
    # docs.append(clean_text(data))
    tt = nltk.tokenize.texttiling.TextTilingTokenizer()
    for segment in tt.tokenize(data):
        docs.append(clean_text(segment))

    if corpus == "gutenberg":
        one_doc = []
        for i in gt.fileids():
            one_doc = one_doc + [
                x.lower() for x in gt.words(i) if not x.lower() in stopwords
            ]
            # docs.append(cleaned_doc)
        docs.append(one_doc)

    if corpus == "reuters":
        one_doc = list()
        for i in rt.fileids():
            # cleaned_doc = [x.lower() for x in rt.words(i) if not x.lower() in stopwords]
            # docs.append(cleaned_doc)
            one_doc = list(
                set(one_doc + [
                    x.lower() for x in rt.words(i) if not x.lower() in stopwords
                ]))
        docs.append(one_doc)

    if corpus == "brown":
        categories = [
            'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
            'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion',
            'reviews', 'romance', 'science_fiction'
        ]
        # categories = ['adventure', 'editorial', 'fiction', 'hobbies', 'learned',
        #               'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
        for c in categories:
            cleaned_doc = [
                x.lower() for x in brown.words(categories=c)
                if not x.lower() in stopwords
            ]
            docs.append(cleaned_doc)
        # cleaned_doc = [x.lower() for x in brown.words(categories=categories) if not x.lower() in stopwords]
        # docs.append(cleaned_doc)

    my_dictionary = corpora.Dictionary(docs)
    return my_dictionary
def reuter():
    '''
    Create the reuters corpus from nltk
    '''
    word = []
    for i in reuters.fileids():
        word.append(reuters.words(fileids=[i]))
    return word
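# Usage sketch (an addition): reuter() returns one word list per Reuters fileid;
# flattening it gives the full token stream.
docs = reuter()
print(len(docs), "documents,", sum(len(d) for d in docs), "tokens in total")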