import six
from scipy.sparse import coo_matrix

# Assumed imports: the fragment uses these helpers without showing where
# they are defined; tools.datasets is the module used by the sibling
# script below, and create_vectorizer/apply_threshold are taken to be
# project-local (see the sketch of apply_threshold further down).
from tools.datasets import dataset_mails, dataset_small


def get_document_names(path_prefix, file_paths):
    """Derive document names from file paths, dropping files whose names
    cannot be decoded. The signature is reconstructed from the call site
    below.
    """
    documents = []
    new_file_paths = []
    for file_path in file_paths:
        try:
            # str.rstrip/lstrip strip character *sets*, not substrings, so
            # the path prefix and extension are removed explicitly instead.
            document_name = six.text_type(file_path)
            if document_name.startswith(path_prefix):
                document_name = document_name[len(path_prefix):]
            if document_name.endswith('.eml'):
                document_name = document_name[:-len('.eml')]
            documents.append(document_name)
            # Collect the kept paths rather than deleting by index, which
            # would shift positions after the first removal.
            new_file_paths.append(file_path)
        except UnicodeDecodeError:
            # The name cannot be decoded, so we will not use that file.
            print('Skipping file:', file_path)
    return documents, new_file_paths


# Assumed script settings; their definitions are not part of this fragment.
# The path mirrors the one used in the sibling script below.
use_small_dataset = False
doc_path = '/Users/yanchith/workspace/won-corpora/processed'

print('Loading documents.')
if use_small_dataset:
    docs, input_type, tokenizer = dataset_small()
    paths = docs
else:
    unchecked_paths, input_type, tokenizer = dataset_mails(doc_path)
    docs, paths = get_document_names(doc_path, unchecked_paths)
print('Loaded', len(docs), 'files from path:', doc_path)

print('Extracting features.')
vectorizer = create_vectorizer(input_type, tokenizer=tokenizer,
                               ngram_range=(1, 1))
data = vectorizer.fit_transform(paths)
features = vectorizer.get_feature_names()
data = coo_matrix(data)
data = apply_threshold(data, 0.1)  # Filter out everything that is too weak.
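# apply_threshold() is used above but not defined in this fragment. A
# minimal sketch of one plausible implementation, assuming it only drops
# sparse entries whose tf-idf weight falls below the cutoff; in the
# project it presumably lives in a helper module imported above. This is
# a reconstruction, not the project's actual code.
def apply_threshold(matrix, threshold):
    """Return a copy of the COO matrix without entries below threshold."""
    mask = matrix.data >= threshold
    return coo_matrix(
        (matrix.data[mask], (matrix.row[mask], matrix.col[mask])),
        shape=matrix.shape)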
from gensim import matutils
from gensim.models import HdpModel, LdaModel
from sklearn.feature_extraction.text import TfidfVectorizer

from tools.datasets import dataset_mails  # TODO: move to scripts


def fit_lda(corpus, vocabulary, n_topics=10, passes=1):
    """Fit an LDA model with a fixed number of topics."""
    return LdaModel(corpus, num_topics=n_topics, passes=passes,
                    id2word={i: s for i, s in enumerate(vocabulary)})


def fit_hdp_lda(corpus, vocabulary):
    """Fit an HDP model, which infers the number of topics from the data."""
    return HdpModel(corpus, {i: s for i, s in enumerate(vocabulary)})


if __name__ == '__main__':
    content, input_type, tokenizer = dataset_mails(
        '/Users/yanchith/workspace/won-corpora/processed')
    # content, input_type, tokenizer = dataset_newsgroups()

    vectorizer = TfidfVectorizer(min_df=3, input=input_type,
                                 ngram_range=(1, 1), stop_words='english',
                                 tokenizer=tokenizer)
    X = vectorizer.fit_transform(content)
    features = vectorizer.get_feature_names()
    print('Number of features:', len(features))
    print('Bag of words shape:', X.shape)
    print(features)

    # Beware, gensim expects documents as columns, so tell Sparse2Corpus
    # that scikit-learn's matrix has them in rows instead.
    model = fit_hdp_lda(matutils.Sparse2Corpus(X, documents_columns=False),
                        features)
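    # A quick, illustrative way to inspect the fitted model (not in the
    # original script): show_topics() is part of gensim's topic-model API;
    # the counts chosen here are arbitrary.
    for topic in model.show_topics(num_topics=10, num_words=8):
        print(topic)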