import bag_of_words
import dataset_loading
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def __make_dataset_bags(self, in_files):
    result = []
    file_indices = []
    for index, filename in enumerate(in_files):
        sentences = self.sentences_extractor(filename)
        # drop stop words from every sentence before building the bag
        sentences_filtered = [
            [word for word in raw_sentence if word not in self.stop_list]
            for raw_sentence in sentences
        ]
        bag = bag_of_words.sentences_to_bag_of_words(sentences_filtered)
        # keep only documents with at least 20 distinct terms
        if len(bag) >= 20:
            result.append(bag)
            file_indices.append(index)
    return (result, file_indices)
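# A minimal sketch of what bag_of_words.sentences_to_bag_of_words is assumed to
# return in this module: a {term: count} mapping built from tokenized sentences.
# This is inferred from its use with DictVectorizer below (which consumes dicts),
# not the project's actual implementation; the function name here is hypothetical.
from collections import Counter


def _sentences_to_bag_of_words_sketch(sentences):
    # flatten the per-sentence token lists and count each distinct term
    return dict(Counter(word for sentence in sentences for word in sentence))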
def process_data(in_folder, in_sentences_extractor):
    bags = []
    files, categories = dataset_loading.get_files_list(in_folder)
    for filename in files:
        sentences = in_sentences_extractor(filename)
        bags.append(bag_of_words.sentences_to_bag_of_words(sentences))
    # map every category name to a numeric label
    categories_dict = dataset_loading.get_categories_dict(categories)
    categories_vector = [categories_dict[category] for category in categories]
    # builds a vocabulary out of all words in both sets
    vectorizer = DictVectorizer()
    term_document_matrix = vectorizer.fit_transform(bags)
    # in the resulting matrix rows are documents, columns are features (per-term tf-idf scores)
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(term_document_matrix)
    return (tfidf_matrix, categories_dict, categories_vector)
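# Usage sketch, under stated assumptions: the extractor and the corpus path below
# are placeholders for illustration, not part of this project; dataset_loading
# defines the real folder layout. Rows of tfidf_matrix align one-to-one with
# categories_vector, so the pair can be fed straight into a scikit-learn classifier.
if __name__ == '__main__':
    from sklearn.naive_bayes import MultinomialNB

    # a hypothetical extractor: reads a file and tokenizes each line on whitespace
    def whitespace_extractor(filename):
        with open(filename) as text_file:
            return [line.split() for line in text_file]

    tfidf_matrix, categories_dict, categories_vector = process_data(
        'data/corpus', whitespace_extractor)
    classifier = MultinomialNB().fit(tfidf_matrix, categories_vector)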