def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {'x_train': xs['train'],
            'y_train': ys['train'],
            'x_test': xs['test'],
            'y_test': ys['test'],
            'labels': mlb.classes_}  # label names in binarized column order
    return data
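# Illustrative usage sketch for load_data() above (added for clarity, not part of the
# original snippet). It assumes the NLTK 'reuters' and 'stopwords' corpora have been
# downloaded (nltk.download('reuters'); nltk.download('stopwords')) and that reuters,
# stopwords, TfidfVectorizer and MultiLabelBinarizer are imported as in the other snippets.
if __name__ == '__main__':
    data = load_data()
    print(data['x_train'].shape, data['y_train'].shape)  # (n_train_docs, n_features), (n_train_docs, n_labels)
    print(list(data['labels'])[:5])                      # first few label names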
def load_data():
    """
    Load the Reuters dataset.

    Returns
    -------
    train_docs, train_labels, test_docs, test_labels.
    """
    documents = reuters.fileids()

    train = [d for d in documents if d.startswith('training/')]
    train_docs = [reuters.raw(doc_id) for doc_id in train]
    train_docs = [text_prepare(x) for x in train_docs]
    train_labels = [reuters.categories(doc_id) for doc_id in train]

    test = [d for d in documents if d.startswith('test/')]
    test_docs = [reuters.raw(doc_id) for doc_id in test]
    test_docs = [text_prepare(x) for x in test_docs]
    test_labels = [reuters.categories(doc_id) for doc_id in test]

    print("len(train_docs)={}, len(train_labels)={}".format(
        len(train_docs), len(train_labels)))
    print("len(test_docs)={}, len(test_labels)={}".format(
        len(test_docs), len(test_labels)))

    # `labels` is assumed to be a module-level list of Reuters category names.
    mlb = MultiLabelBinarizer(classes=sorted(labels))
    train_labels = mlb.fit_transform(train_labels)
    # reuse the fitted classes for the test set instead of refitting
    test_labels = mlb.transform(test_labels)
    print("y_train.shape={}, y_test.shape={}".format(train_labels.shape,
                                                     test_labels.shape))
    return (train_docs, train_labels, test_docs, test_labels, mlb.classes_)
def getDocIDs_top10():  # Top 10 Categories
    # keep only documents that belong to exactly one category
    documents = [
        f for f in reuters.fileids()
        if len(reuters.categories(fileids=f)) == 1
    ]
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))

    # keep only categories with more than 20 documents in both splits
    new_train_docs_id = []
    new_test_docs_id = []
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
        li_te = [
            f for f in reuters.fileids(categories=cat) if f in test_docs_id
        ]
        if len(li) > 20 and len(li_te) > 20:
            new_train_docs_id.extend(li)
            new_test_docs_id.extend(li_te)
    train_docs_id = new_train_docs_id
    test_docs_id = new_test_docs_id
    return (train_docs_id, test_docs_id)
def __init__(self, corpusData):
    # get all the document ids in corpusData, e.g. the reuters corpus
    docs = corpusData.fileids()
    # split into training and test doc ids
    self.train_docs_ids = list(
        filter(lambda doc: doc.startswith("train"), docs))
    self.test_docs_ids = list(
        filter(lambda doc: doc.startswith("test"), docs))
    # get the actual documents from those ids
    self.train_docs = [
        corpusData.raw(doc_id) for doc_id in self.train_docs_ids
    ]
    self.test_docs = [
        corpusData.raw(doc_id) for doc_id in self.test_docs_ids
    ]
    self.docs = self.train_docs + self.test_docs
    # binarize the multi-labels (use corpusData consistently rather than the
    # global reuters reference)
    mlb = MultiLabelBinarizer()
    self.train_labels = mlb.fit_transform(
        [corpusData.categories(doc_id) for doc_id in self.train_docs_ids])
    self.test_labels = mlb.transform(
        [corpusData.categories(doc_id) for doc_id in self.test_docs_ids])
    # vectorizers
    self.count_vectorizer = CountVectorizer(analyzer='word',
                                            stop_words='english')
    self.tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                            stop_words='english',
                                            token_pattern='[A-Za-z]{3,}')
def load_data(config={}):
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words, binary=True)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': reuters.categories()
    }
    print(data['x_train'])
    print(data['y_train'])
    return data, vectorizer.vocabulary_
def categorize_kNNSimScoring(testDocs, trainDocs, docId, neighbors, fr=0.75):
    """
    Uses a similarity-scoring approach for categorizing the test docs.
    With the nearest neighbors, each class is assigned a score based on which
    neighboring documents carry that class, weighted by cosine similarity.

    The input test doc is assumed to be non-empty; otherwise the `fr`-based
    truncation at the end will cause issues.  Complexity is roughly O(n^3).
    """
    categories = reuters.categories(neighbors)
    catgScores = []
    for c in categories:
        score = 0.0
        for n in neighbors:
            if c in reuters.categories(n):
                score += classifierTraining.dot(testDocs[docId],
                                                trainDocs[n].doc_vec)
        catgScores.append((score, c))
    # keep the top fraction `fr` of category scores
    catgScores = heapq.nlargest(int(round(len(categories) * fr)), catgScores)
    return [c for s, c in catgScores]
def get_raw_data():
    nltk.download("reuters")
    from nltk.corpus import reuters

    documents = reuters.fileids()
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
    X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
    y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]

    all_data = X_train + X_test
    all_labels = y_train + y_test
    return all_data, all_labels
def __init__(self):
    self.documents = []
    self.categories = reuters.categories()
    self.tfidf = tf_idf()
    # iterate over all Reuters documents
    for docid in reuters.fileids():
        # skip documents that have more than one category
        if len(reuters.categories(docid)) > 1:
            continue
        # determine the document's category
        cat = 0
        for i in range(len(self.categories)):
            if self.categories[i] in reuters.categories(docid):
                cat = i
        # determine whether the document belongs to the train or test split
        if docid.startswith("train"):
            train = 1
        elif docid.startswith("test"):
            train = 0
        else:
            raise ValueError("unexpected fileid: %s" % docid)
        text = reuters.raw(docid)
        doc = document(text, cat, train)
        # add the document to the TfIdf object - needed for later computations
        self.tfidf.add_document(doc)
        # add the document to the document list
        self.add_document(doc)
    self.initialize_vocabulary()
def reuters_dataset():
    nltk.download('reuters')
    nltk.download('stopwords')
    stop_words = stopwords.words("english")

    documents = reuters.fileids()
    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    print(len(train_docs), len(test_docs))

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # print([reuters.categories(doc_id) for doc_id in test_docs_id])
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])

    return (vectorised_train_documents.toarray(),
            vectorised_test_documents.toarray(), train_labels, test_labels)
def reuters_to_df(set_name, label_to_idx):
    data = [x for x in reuters.fileids() if set_name in x]
    # collect all data to create df from
    all_texts = [
        " ".join([" ".join(sen) for sen in reuters.sents(doc_id)])
        for doc_id in data
    ]
    all_labels = np.zeros((len(all_texts), len(label_to_idx)))
    all_label_indices = [[
        label_to_idx[lab] for lab in reuters.categories(doc_id)
    ] for doc_id in data]
    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1
    all_labels = all_labels.astype(int)
    # all_labels[all_label_indices] = 1

    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in reuters.categories()]
    cols.extend(label_cols)

    # create df and set values
    df = pd.DataFrame(columns=cols)
    df["text"] = all_texts
    df[label_cols] = all_labels
    return df
def get_data_splits():
    train_docs, train_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                     for i in reuters.fileids()
                                     if i.startswith('training/')])
    test_docs, test_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                   for i in reuters.fileids()
                                   if i.startswith('test/')])
    return train_docs, train_labels, test_docs, test_labels
def print_reuters():
    from nltk.corpus import reuters
    # print(reuters.fileids())
    # print(reuters.categories())
    print(reuters.categories('training/9865'))
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    print(reuters.fileids(['barley', 'corn']))
def __init__(self):
    # print reuters categories
    print("reuters categories")
    print(reuters.categories())

    # TODO this is probably bad
    print("getting nodes")
    self.nodes = database.get_all_nodes()

    print("training classifier")
    self.classifier = DocumentClassifier()
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    ratio = min(l1, l2) / float(l1 + l2)
                    if min_len < (l1 + l2) < max_len and ratio > percentage:
                        print(cat, cat2, l1 + l2, ratio)
def __init__(self, min_eic=5):
    self.test_classes = []
    self.test_docs = []
    self.train_classes = []
    self.train_docs = []
    self.table_of_classes = []
    self.num_of_instances = []
    # minimal encounter in classes
    self.min_eic = min_eic

    if Path("training_cache/train_docs").is_file() and Path("training_cache/train_classes").is_file() \
            and Path("training_cache/test_docs").is_file() and Path("training_cache/test_classes").is_file() \
            and Path("classify_cache/table_of_classes").is_file():
        self.train_docs = joblib.load("training_cache/train_docs")
        self.train_classes = joblib.load("training_cache/train_classes")
        self.test_docs = joblib.load("training_cache/test_docs")
        self.test_classes = joblib.load("training_cache/test_classes")
        self.table_of_classes = joblib.load("classify_cache/table_of_classes")
    else:
        raw_test_classes = []
        raw_train_classes = []
        for doc_id in reuters.fileids():
            if doc_id.startswith("train"):
                self.train_docs.append(
                    prepare_text_for_analysis(reuters.raw(doc_id)))
                raw_train_classes.append(reuters.categories(doc_id))
            else:
                self.test_docs.append(
                    prepare_text_for_analysis(reuters.raw(doc_id)))
                raw_test_classes.append(reuters.categories(doc_id))
        self.make_table_of_classes(raw_train_classes)
        self.train_classes = self.transform_classes(raw_train_classes, "train")
        self.test_classes = self.transform_classes(raw_test_classes, "test")
        joblib.dump(self.train_docs, "training_cache/train_docs", compress=9)
        joblib.dump(self.train_classes, "training_cache/train_classes",
                    compress=9)
        joblib.dump(self.test_docs, "training_cache/test_docs", compress=9)
        joblib.dump(self.test_classes, "training_cache/test_classes",
                    compress=9)
        joblib.dump(self.table_of_classes, "classify_cache/table_of_classes",
                    compress=9)
def main():
    collection_stats()
    print("Starting classifier ..")
    X_train = list()
    X_test = list()
    y_train = list()
    y_test = list()

    print("Reading training and testing data ..")
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)

    binarizer = MultiLabelBinarizer(classes=reuters.categories())
    classifier = Pipeline([
        ('vectorizer', TfidfVectorizer(tokenizer=tokenize,
                                       min_df=0,
                                       max_df=0.90,
                                       max_features=3000,
                                       use_idf=True,
                                       sublinear_tf=True)),
        # ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

    print("Training classifier ..")
    y_train_bin = binarizer.fit_transform(y_train)
    classifier.fit(X_train, y_train_bin)

    print("Testing classifier ..")
    res = classifier.predict(X_test)
    y_test_bin = binarizer.transform(y_test)
    hard_precision = classifier.score(X_test, y_test_bin)
    # sklearn metrics expect the true labels first and the predictions second
    precision = average_precision_score(y_test_bin, res, average=None)
    recall = recall_score(y_test_bin, res, average=None)
    f1score = f1_score(y_test_bin, res, average=None)
    print("Hard precision: " + str(hard_precision))
    log_results(reuters.categories(), precision, recall, f1score)
def get_test_set():
    single_categories = [(id, re.categories(id)[0]) for id in re.fileids()
                         if len(re.categories(id)) == 1]
    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0] for x in single_cat_list
                       if x[1] < 600 and x[1] > 200]
    return [pair for pair in single_categories if pair[1] in used_categories]
def get_target(self):
    # cat1 vs. cat2
    if len(self.categories) > 1:
        target = [
            [cat for cat in reuters.categories(fileid)
             if cat in self.categories][0]
            for fileid in self.fileids]
    # cat1 vs. not cat1
    else:
        target = [
            1 if self.categories[0] in reuters.categories(fileid) else 0
            for fileid in self.fileids]
    self.classes, target = np.unique(target, return_inverse=True)
    return target
def labels(filenames, cats=None):
    """Return topic labels (one-hot format) for given files

    :param filenames: selected files from Reuters dataset
    :param cats: categories to filter (optional)
    :return: topic labels (one-hot format)
    """
    if cats is None:
        cats = reuters.categories()
    data = [[c for c in reuters.categories(f) if c in cats]
            for f in filenames]
    mb = MultiLabelBinarizer(classes=cats)
    onehot = mb.fit_transform(data)
    df = pd.DataFrame(onehot, columns=cats)
    return df
def get_default_split():
    documents = reuters.fileids()
    train_docs_id = list(
        filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(
        filter(lambda doc: doc.startswith("test"), documents))
    X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
    X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
    Y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    Y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]
    return X_train, Y_train, X_test, Y_test
def get_labels():
    most_common_class = collections.Counter(
        [c for cs in [reuters.categories(fileid) for fileid in fileids]
         for c in cs]).most_common(1)[0][0]
    print('Most common class in sampled documents:', most_common_class)
    return (
        np.array(
            [1 if most_common_class in reuters.categories(fileid) else 0
             for fileid in fileids],
            dtype=np.int32),
        ('other', most_common_class)
    )
def get_topics(min_samples=None):
    """Return set of topics from Reuters corpus

    If *min_samples* is specified, only topics with at least that many
    examples are included.

    :param min_samples: minimum number of examples per topic
    :return: list of topics
    """
    cats = reuters.categories()
    if min_samples is not None:
        cats = [c for c in reuters.categories()
                if len(reuters.fileids(c)) >= min_samples]
    return cats
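# Illustrative usage sketch (added, not from the original sources) combining get_topics()
# above with the labels() helper defined earlier: pick topics with enough support, then
# one-hot encode them for the training files. Assumes the NLTK Reuters corpus is
# downloaded and pandas / scikit-learn are available.
topics = get_topics(min_samples=50)
train_files = [f for f in reuters.fileids() if f.startswith('training/')]
y_train_df = labels(train_files, cats=topics)
print(y_train_df.shape)  # (num_train_docs, num_selected_topics)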
def build_output(dataset='train'):
    documents = reuters.fileids()
    docs_id = list(filter(lambda doc: doc.startswith(dataset), documents))
    output = np.zeros((len(docs_id), len(reuters.categories())))
    reuters_categories = reuters.categories()
    i = 0
    for docs in docs_id:
        if i % 100 == 0:
            print(i)
        for category in reuters.categories(docs):
            output[i, reuters_categories.index(category)] = 1
        i += 1
    return output
def load_data(valid_percent=0.1):
    """
    Load the Reuters dataset.

    Returns:
        raw text and raw labels for train, valid, test set.
    """
    nltk.download('reuters')
    n_classes = 90
    labels = reuters.categories()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    ys = {'train': [], 'test': []}
    ys['train'] = [reuters.categories(doc_id) for doc_id in train]
    ys['test'] = [reuters.categories(doc_id) for doc_id in test]

    # Validation split
    n_valid = int(valid_percent * len(ys['train']))
    np.random.seed(5)
    idxs = np.random.choice(len(ys['train']), n_valid, replace=False)
    idx_set = set(idxs)

    docs['valid'] = []
    ys['valid'] = []
    train_docs = []
    train_y = []
    for idx, (x, y) in enumerate(zip(docs['train'], ys['train'])):
        if idx in idx_set:
            docs['valid'].append(x)
            ys['valid'].append(y)
        else:
            train_docs.append(x)
            train_y.append(y)

    data = {
        'x_train': train_docs,
        'y_train': train_y,
        'x_valid': docs['valid'],
        'y_valid': ys['valid'],
        'x_test': docs['test'],
        'y_test': ys['test'],
        'labels': labels
    }
    return data
def load_data():
    docs = reuters.fileids()
    train_ids = [doc for doc in docs if doc.startswith("train")]
    test_ids = [doc for doc in docs if doc.startswith("test")]
    train_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                               for id in train_ids],
                              columns=('text', 'labels'))
    test_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                              for id in test_ids],
                             columns=('text', 'labels'))
    return train_data, test_data
def loadReutersData(documents, labels):
    categories_list = ['acq', 'crude', 'earn', 'grain', 'interest',
                       'money-fx', 'ship', 'trade']
    docCount = 0
    for i in range(0, len(categories_list)):
        print(categories_list[i])
        for document_id in reuters.fileids(categories_list[i]):
            # keep only single-category documents
            if len(reuters.categories(document_id)) == 1:
                content = str(reuters.raw(document_id))
                soup = BeautifulSoup(content)
                content = soup.get_text()
                documents.append(content)
                docCount += 1
                labels.append(reuters.categories(document_id)[0])
def stats(self):
    """
    :return: Important statistics about the dataset - numbers of documents in
        different classes with corresponding percentages, as well as
        vocabulary sizes for every class.
    """
    lt = LemmaTokenizer()
    train_stats = {}
    test_stats = {}
    for c in reuters.categories():
        train_stats[c] = {
            'num_of_docs': 0,
            'percentage': 0.0,
            'words': set([])
        }
        test_stats[c] = {
            'num_of_docs': 0,
            'percentage': 0.0,
            'words': set([])
        }
    for d in self.train:
        c = reuters.categories(d)[0]
        train_stats[c]['num_of_docs'] += 1
        train_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))
    for d in self.test:
        c = reuters.categories(d)[0]
        test_stats[c]['num_of_docs'] += 1
        test_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))

    s_train = sum(train_stats[c]['num_of_docs'] for c in train_stats.keys())
    s_test = sum(test_stats[c]['num_of_docs'] for c in test_stats.keys())

    res = ({}, {})
    for c in train_stats.keys():
        if train_stats[c]['num_of_docs'] != 0:
            train_stats[c]['percentage'] = train_stats[c]['num_of_docs'] / s_train
            train_stats[c]['words'] = len(train_stats[c]['words'])
            res[0][c] = train_stats[c]
    for c in test_stats.keys():
        if test_stats[c]['num_of_docs'] != 0:
            test_stats[c]['percentage'] = test_stats[c]['num_of_docs'] / s_test
            test_stats[c]['words'] = len(test_stats[c]['words'])
            res[1][c] = test_stats[c]
    return res
def create_train_data(docs, dictionary):
    all_topics = reuters.categories()
    train_data = []
    for i in range(len(docs)):
        # Process one raw document into a word list without numbers, signs, etc.
        wordlist = process_doc(docs[i])
        len_wordlist = len(wordlist)
        # Remove stop words
        wordlist = remove_stop_words(wordlist)
        len_wordlist_no_sw = len(wordlist)
        # Number of stop words in the document
        sw_count = len_wordlist - len_wordlist_no_sw

        # Replace words from the doc not included in our dictionary by __OOV__
        fv = []
        fv_nn = []
        for word in dictionary:  # dictionary entries look like a list of dicts [{2: x}, {5: y}, ...]
            if word[-1] in wordlist:  # x, y
                # number of occurrences of the word in the document
                count = wordlist.count(word[-1])
                # fv.append(dict({count: word[-1]}))  # human-readable FV
                # fv_nn.append(count)                 # NN-readable FV
                # occurrence of the word in the document, in %
                fv_nn.append((count / len_wordlist) * 100)
            else:
                # fv.append('__OOV__')
                fv_nn.append(0.)

        # Total number of words in the document, appended to the end of the FV
        # fv.append(len_wordlist)
        fv_nn.append(len_wordlist)
        # Share of stop words in the document (in %), appended to the end of the FV
        # fv.append(float("{0:.2f}".format((sw_count/len_wordlist)*100)))
        # fv_nn.append(float("{0:.2f}".format((sw_count/len_wordlist)*100)))
        fv_nn.append((sw_count / len_wordlist) * 100)

        # Position of the document's topic in the list of all topics; the
        # position is UNIQUE and is appended to the end of the FV
        doc_topic = reuters.categories(docs[i])
        topic_id = all_topics.index(doc_topic[0])
        fv_nn.append(topic_id)

        # Each iteration processes one document and appends its feature vector
        # + topic id to the training data
        train_data.append(fv_nn)
    return train_data  # all feature vectors as a list + resulting topic
def create_new_dataset(train_docs_id, test_docs_id, top10_categories):
    # train data
    new_train_docs_id = []
    y__train = []
    instaces_vector = [0, 0]  # vector to count instances of 'earn' and 'acq'
    for train_doc in train_docs_id:
        max_instances = float('inf')
        new_category = ''
        for doc_category in reuters.categories(train_doc):
            for i in range(0, len(top10_categories)):
                if doc_category == top10_categories[i][1] \
                        and max_instances > top10_categories[i][0]:
                    new_category = doc_category
                    max_instances = top10_categories[i][0]
        if new_category != '':  # if the document has at least one class in the top 10
            if new_category == u'earn':
                if instaces_vector[0] < 500:
                    instaces_vector[0] += 1
                    new_train_docs_id.append(train_doc)
                    y__train.append(new_category)
            elif new_category == u'acq':
                if instaces_vector[1] < 500:
                    instaces_vector[1] += 1
                    new_train_docs_id.append(train_doc)
                    y__train.append(new_category)
            else:
                new_train_docs_id.append(train_doc)
                y__train.append(new_category)

    # test data
    new_test_docs_id = []
    y__test = []
    instaces_vector = [0, 0]
    for test_doc in test_docs_id:
        max_instances = float('inf')
        new_category = ''
        for doc_category in reuters.categories(test_doc):  # loop over the document's categories
            for i in range(0, len(top10_categories)):
                if doc_category == top10_categories[i][1] \
                        and max_instances > top10_categories[i][0]:
                    new_category = doc_category
                    max_instances = top10_categories[i][0]
        if new_category != '':  # if the document has at least one class in the top 10
            new_test_docs_id.append(test_doc)
            y__test.append(new_category)
    return new_train_docs_id, y__train, new_test_docs_id, y__test
def create_tfidf_data(docs, categories, n=None):
    """
    Builds a [(label, [words])] structure by parsing the documents.

    :param docs: list of Reuters document ids
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]

    cat_num = {}
    i = 1
    for c in categories:
        cat_num[c] = i
        i += 1

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())
    return y, corpus
def categorize_reuters():
    '''
    Parses dataset to only examine documents associated with a single category.
    '''
    categories = {}
    for file_id in reuters.fileids():
        if len(reuters.categories(file_id)) == 1:
            cat = reuters.categories(file_id)[0]
            if cat not in categories:
                categories[cat] = {}
            text = reuters.raw(file_id)
            categories[cat][file_id.replace('/', '_')] = text
    return categories
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids()
                   if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
def f2c(corpus, fileName):
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    labeled_words = []
    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))
    return high_information_words(labeled_words, score_fn=score_fn)
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    docs_per_category = list()
    # Documents in a category
    for category in categories:
        category_docs = reuters.fileids(category)
        print('for the following category: {} this is the amount of docs: {}'.
              format(category, len(category_docs)))
        docs_per_category.append(len(category_docs))

    print('mean of docs per category is: {}'.format(
        np.mean(np.array(docs_per_category))))
    print('standard deviation of docs per category is: {}'.format(
        np.std(np.array(docs_per_category))))
    print('this is the min docs per category: {}'.format(
        min(docs_per_category)))
    print('this is the max docs per category: {}'.format(
        max(docs_per_category)))
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
def get_documents():
    """
    Get documents from 20 News Groups, Movie Reviews and Reuters corpora.

    Returns:
        list of str: Small subset of documents from News Groups, Movie Reviews
        and Reuters corpora
    """
    dataset = fetch_20newsgroups(subset='all',
                                 shuffle=True,
                                 remove=('headers', 'footers', 'quotes'))
    corpus_20newsgroups = dataset.data[:5]

    tuples = [(movie_reviews.raw(fileid), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
    corpus_movies = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_movies)
    corpus_movies = corpus_movies[:5]

    tuples = [(reuters.raw(fileid), reuters.categories(fileid))
              for fileid in reuters.fileids()]
    corpus_reuters = [tuple_[0] for tuple_ in tuples]
    shuffle(corpus_reuters)
    corpus_reuters = corpus_reuters[:5]

    corpus = list()
    corpus.extend(corpus_20newsgroups)
    corpus.extend(corpus_movies)
    corpus.extend(corpus_reuters)
    return corpus
def __run2__(trainDocs, testDocs):
    """
    Takes the filtered & processed train docs and the processed test docs,
    retrieves the k nearest neighbors for each test doc, categorizes it with
    kNN similarity scoring, and returns the counts needed for F1 evaluation.
    """
    # testDocs is of the form {docId: doc_vec}
    fp = 0
    tp = 0
    fn = 0
    for docId, vec in testDocs.items():
        nearest_neighbors = hyperplaneClassifier.nearestNeighbors(
            trainDocs, docId, vec)
        cats = set(
            hyperplaneClassifier.categorize_kNNSimScoring(
                testDocs, trainDocs, docId, nearest_neighbors))
        print(cats)
        actual_cats = set(reuters.categories(docId))
        print(actual_cats)
        print("\n")
        fp += len(cats.difference(actual_cats))
        tp += len(cats.intersection(actual_cats))
        fn += len(actual_cats.difference(cats))
    return fp, tp, fn
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`.

    E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_reuters_files(ds, silent=True)
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total),
                      file=log)
def __init__(self):
    # Generate training set from sample of Reuters corpus
    train_docs = [(self.bag_of_words(reuters.words(fileid)), category)
                  for category in reuters.categories()
                  for fileid in reuters.fileids(category)
                  if fileid.startswith("train")]
    # Create a classifier from the training data
    self.classifier = NaiveBayesClassifier.train(train_docs)
def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        # use a list (not a lazy filter object) so the truth test and the
        # indexing below behave correctly under Python 3
        current_categories = [x for x in reuters.categories(d)
                              if x in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
def makeWordSet(args=None):
    '''Use the Brown and Reuters corpora to see how many words are used'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
def __init__(self, dataset=''):
    """
    Docs in the Reuters corpus are identified by ids like "training|test/xxxx".

    :param dataset: filter for ids
    """
    self.dataset = dataset  # filter docs
    self.categories = {c: n for n, c in enumerate(reuters.categories())}  # map class to int
    self.docs = {d: n for n, d in enumerate(reuters.fileids())}  # map docs to int
    self.category_mask = []  # mask nth doc with its ith class
def __iter__(self):
    """
    Generator of docs while collecting ordered structured info.
    """
    for n, reutersid in enumerate(reuters.fileids()):  # 'training|test/xxxx'
        dataset, _ = reutersid.split('/')  # extract dataset
        if self.dataset in dataset:  # yield only the filtered dataset
            if self.categories is not None:
                top_category = reuters.categories(reutersid)[0]  # grab first category only
                self.category_mask.append(self.categories[top_category])  # n-th doc -> classid
            yield reuters.raw(reutersid)  # return raw document
def reuters_train_test_feats(feature_detector=bag_of_words):
    train_feats = []
    test_feats = []
    for fileid in reuters.fileids():
        if fileid.startswith('training'):
            featlist = train_feats
        else:  # fileid.startswith('test')
            featlist = test_feats
        feats = feature_detector(reuters.words(fileid))
        labels = reuters.categories(fileid)
        featlist.append((feats, labels))
    return train_feats, test_feats
def create_tfidf_data(docs, n=None):
    """
    Builds a (labels, corpus) structure from the Reuters documents,
    lower-casing the raw text.

    :param docs: list of Reuters document ids
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus
def makeData(file, set):
    labels = []
    f = open(file, "w")
    for doc in set:
        title = []
        label = reuters.categories(doc)[0]
        labels.append(label)
        for i in reuters.words(doc):
            if not i.isupper():
                break
            else:
                title.append(i)
        f.write(' '.join(title) + "\n")
    f.close()
    f = open("labels" + file, "w")
    f.write("\n".join(labels))
    f.close()
def computeStats(self, categories):
    files = batchReadReuters('training', categories)
    for file_name in files:
        raw_txt = readFromFile('/home/dales3d/nltk_data/corpora/reuters/' + file_name)
        fileCategories = reuters.categories(file_name)
        # for cat in categories:
        #     if cat not in self.activeCategories:
        #         self.activeCategories.append(cat)
        self.activeCategories = categories

        words = extractWords(raw_txt)
        keywords = meter(words)
        for word in keywords:
            if word not in self.wordsStatDict:
                self.wordsStatDict[word] = WordStats()
            w_stat = self.wordsStatDict[word]
            w_stat.word = word
            w_stat.addText(file_name, keywords[word], fileCategories)
def explore_reuters():  # named so it does not shadow the reuters corpus reader it uses
    reuters.fileids()
    reuters.categories()
    reuters.categories('training/9865')
    reuters.categories(['training/9865', 'training/9880'])
    reuters.fileids('barley')
    reuters.fileids(['barley', 'corn'])
    reuters.words('training/9865')[:14]
    reuters.words(['training/9865', 'training/9880'])
    reuters.words(categories='barley')
    reuters.words(categories=['barley', 'corn'])
def exercise_reuters():
    print(reuters.fileids())
    print(reuters.categories())
    # topics of a single document
    print(reuters.categories("training/9865"))
    # topics of multiple documents
    print(reuters.categories(["training/9865", "training/9880"]))
    # documents for a topic
    print(reuters.fileids("barley"))
    # documents for multiple topics
    print(reuters.fileids(["barley", "corn"]))
    # words of a single document
    print(reuters.words("training/9865"))
    # words of multiple documents
    print(reuters.words(["training/9865", "training/9880"]))
    # words for a topic
    print(reuters.words(categories="barley"))
    # words for multiple topics
    print(reuters.words(categories=["barley", "corn"]))
import numpy
from random import shuffle

import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models.doc2vec import Doc2Vec

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Convert the categories to one-hot encoded categories
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                  for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                 for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data, test_data, train_labels, test_labels = (numpy.asarray(train_data),
                                                    numpy.asarray(test_data),
                                                    numpy.asarray(train_labels),
                                                    numpy.asarray(test_labels))

# Initialize the neural network
def load_reuters(setName):
    doc_ids = reuters.fileids()
    cat2all_ids = {}
    cat2train_ids = {}
    cat2test_ids = {}
    cat2all_num = {}
    cand_docNum = 0
    for doc_id in doc_ids:
        # only choose docs belonging in one category
        if len(reuters.categories(doc_id)) == 1:
            cat = reuters.categories(doc_id)[0]
            cand_docNum += 1
            if doc_id.startswith("train"):
                cat2set_ids = cat2train_ids
            else:
                cat2set_ids = cat2test_ids
            if cat in cat2set_ids:
                cat2set_ids[cat].append(doc_id)
            else:
                cat2set_ids[cat] = [doc_id]
            # both train and test doc_ids are put in cat2all_ids
            if cat in cat2all_ids:
                cat2all_ids[cat].append(doc_id)
            else:
                cat2all_ids[cat] = [doc_id]
            if cat in cat2all_num:
                cat2all_num[cat] += 1
            else:
                cat2all_num[cat] = 1

    print("Totally %d docs, %d single-category docs in %d categories" % (
        len(doc_ids), cand_docNum, len(cat2train_ids)))

    sorted_cats = sorted(cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
                         reverse=True)
    catNum = 10
    cats_docsWords = [[] for i in range(catNum)]
    cats_docNames = [[] for i in range(catNum)]
    topN_cats = sorted_cats[:catNum]

    print("Top 10 categories:")
    keptAllDocNum = 0
    keptTrainDocNum = 0
    keptTestDocNum = 0
    for cat in topN_cats:
        print("%s: %d/%d" % (cat, len(cat2train_ids[cat]), len(cat2test_ids[cat])))
        keptTrainDocNum += len(cat2train_ids[cat])
        keptTestDocNum += len(cat2test_ids[cat])
        keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])
    print("Totally %d docs kept, %d in train, %d in test" % (
        keptAllDocNum, keptTrainDocNum, keptTestDocNum))

    if setName == "train":
        cat2set_ids = cat2train_ids
        setDocNum = keptTrainDocNum
    elif setName == "test":
        cat2set_ids = cat2test_ids
        setDocNum = keptTestDocNum
    elif setName == "all":
        cat2set_ids = cat2all_ids
        setDocNum = keptAllDocNum
    else:
        raise Exception("Unknown set name %s" % setName)

    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    readDocNum = 0
    totalLineNum = 0
    emptyFileNum = 0
    for cat_id, cat in enumerate(topN_cats):
        for doc_id in cat2set_ids[cat]:
            if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
                print("\r%d %d\r" % (readDocNum + 1, totalLineNum), end="")
            # html.unescape (standard-library html module) decodes HTML entities in the raw text
            text = html.unescape(reuters.raw(doc_id))
            lines = text.split("\n")
            if len(text) == 0 or len(lines) == 0:
                emptyFileNum += 1
                continue
            readDocNum += 1
            totalLineNum += len(lines)
            text = " ".join(lines)
            wordsInSentences, wc = extractSentenceWords(text)
            filename = doc_id
            orig_docs_words.append(wordsInSentences)
            orig_docs_name.append(filename)
            orig_docs_cat.append(cat_id)
            cats_docsWords[cat_id].append(wordsInSentences)
            cats_docNames[cat_id].append(filename)
    print("Done. %d docs read, %d empty docs skipped. Totally %d lines" % (
        readDocNum, emptyFileNum, totalLineNum))

    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, topN_cats
    print(m + ':', fdist[m])

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# Reuters corpus
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# Inaugural Address corpus
from nltk.corpus import inaugural
inaugural.fileids()

# Universal Declaration of Human Rights in multiple languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
             'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
def first_occurrence_perc(self, term, document):
    return term.first_occurrence(document) / len(document.doc)

# def terminals(self, term, document):
#     return (
#         self.bool(term, document), self.tf(term, document), self.tf_idf(term, document), self.tf_ig(term, document),
#         self.tf_chi(term, document), self.tf_rf(term, document))

def raw_terminals(self, term, document):
    return (self.bool(term, document), self.tf(term, document),
            self.max_prob_term_and_category(term, document),
            self.max_prob_term_not_category(term, document),
            self.avg_prob_term_category(term, document),
            self.avg_prob_term_not_category(term, document),
            self.first_occurrence_perc(term, document))


if __name__ == '__main__':
    # materialize the filter so the file ids can be iterated more than once
    training_fileids = fileids = list(filter(
        lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
        reuters.fileids()))
    documents = [sum(reuters.sents(fid), []) for fid in training_fileids]
    doc = documents[0]
    term = terminals.WordTerm("in")
    docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]

    print(docs_categories)
    print(doc)

    fe = TWSCalculator(documents, docs_categories)

    print("tf =", fe.tf(term, doc), "idf =", fe.idf(term),
          "tf-idf =", fe.tf_idf(term, doc))

    term = terminals.WordTerm("in")
    print('TF-CHI: ', fe.tf_chi(term, doc))
    print('TF-CHI: ', fe.tf_chi(term, doc))
    for word in flat_feat_pruned:
        features['%s' % word] = doc.count(word)
    return features

# full_feat=[(doc_features(cfd[condition].keys()), condition=='gold') for condition in cfd.conditions()]

# -------------------------------------
# Use chosen features to create train/test sets
# either based on boolean or count of features in docs
# -------------------------------------
# reutDocs=[(list(reuters.words(fileid)),cat)
# better way: lower, stem, and only keep alpha numeric
reutDocs = [([porter.stem(a.lower()) for a in reuters.words(fileid)
              if re.findall(r"[^\W\d]", a)], cat)
            for cat in reuters.categories()
            for fileid in reuters.fileids(cat)]

# randomize for later test/train split
random.seed(1979)
random.shuffle(reutDocs)

# run feature extractor on reutDocs instead of cfd; pickled as 'featureset.pkl'
# Boolean based features e.g. 1 or 0 for each word in flat_feat_pruned per document
featureset = [(doc_features(d), c) for (d, c) in reutDocs]

# Count based features
count_featureset = [(count_features(d), c) for (d, c) in reutDocs]

# -------------------------------------
def encode_doc(doc):
    d = np.zeros((1, max_len - 1, n_chars), dtype=bool)
    # only max_len - 1 character positions are available in the array
    for p, j in enumerate(doc.lower()[:max_len - 1]):
        d[0, p, char_to_idx[j]] = 1
    return d


# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i + 1) for i, x in enumerate(set(reuters.categories())))
    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))
    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))
    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)

if os.path.exists(reuters_enc_path):
# Logistic Regression parameters
ITERATIONS = 800
ALPHA = 0.1
THRESHOLD = 0.5
# LAMBDA = 1

## Extract the fileids of the necessary subset of documents ##
documents_fileids = reuters.fileids(CATEGORIES)

## Split into training and testing ##
training_fileids = [w for w in documents_fileids if w.startswith('training')]
testing_fileids = [w for w in documents_fileids if w.startswith('test')]

## Extract features ##
most_frequent_words = extract_most_frequent_words(training_fileids, NUM_MOST_FREQUENT)
training_featureset = [(document_features(reuters.words(fileid), most_frequent_words),
                        reuters.categories(fileid))
                       for fileid in training_fileids]
testing_featureset = [(document_features(reuters.words(fileid), most_frequent_words),
                       reuters.categories(fileid))
                      for fileid in testing_fileids]

## Train a classifier ##
# Create a matrix with the values of the features to send to the classifier
training_matrix = np.array([list(x.values()) for x, y in training_featureset])
# Create a matrix with only the class labels of each document
classes_vector = [y for x, y in training_featureset]

n = training_matrix.shape[1]  # Number of features
m = len(training_matrix)      # Number of training documents

# Add the bias term to the training matrix
training_matrix = np.concatenate((np.ones((m, 1)), training_matrix), axis=1)

# Create ten classifiers
classifiers = dict((c, np.zeros((n + 1, 1))) for c in CATEGORIES)