def _load_default_texts():
    """
    Loads the default general texts

    Returns
    -------
    result : default 20NewsGroup texts
    """
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    return dataset.get_corpus()
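# Hedged usage sketch (illustrative, not from the original module): the helper
# simply returns the fetched 20NewsGroup corpus, so any caller needing a
# reference corpus of preprocessed texts can reuse it directly. The function
# name below is hypothetical.
def _example_default_texts():
    texts = _load_default_texts()
    print("default reference corpus size:", len(texts))
    return texts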
def test_model_output_lda_tomotopy(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = LDATOMOTO(num_topics=num_topics, alpha=0.1)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
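# Hedged sketch (not part of the test suite): the same train_model contract
# exercised above, used directly; the dataset folder, topic count, and helper
# name are illustrative assumptions.
def _example_lda_tomotopy_training():
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder("preprocessed_datasets/M10")
    model = LDATOMOTO(num_topics=10, alpha=0.1)
    output = model.train_model(dataset)
    top_words = output['topics'][0]                     # top words of the first topic
    theta_train = output['topic-document-matrix']       # shape: (num_topics, n_training_docs)
    theta_test = output['test-topic-document-matrix']   # shape: (num_topics, n_test_docs)
    return top_words, theta_train, theta_test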
def test_model_output_ctm_combined(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = CTM(num_topics=num_topics, num_epochs=5, inference_type='combined')
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
def test_model_output_nmf(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = NMF(num_topics=num_topics, w_max_iter=10, h_max_iter=10, use_partitions=True)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
def test_model_output_prodlda_not_partitioned(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = ProdLDA(num_topics=num_topics, num_epochs=5, use_partitions=False)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' not in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format (covers the whole corpus when partitions are disabled)
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (num_topics, len(dataset.get_corpus()))
def test_partitions_custom(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + "/M10")
    partitions = dataset.get_partitioned_corpus()
    assert len(partitions[0]) == 5847
    assert len(partitions[1]) == 1254
def test_partitions_fetch():
    dataset = Dataset()
    dataset.fetch_dataset("M10")
    partitions = dataset.get_partitioned_corpus()
    assert len(partitions[0]) == 5847
    assert len(partitions[1]) == 1254
def test_load_M10():
    dataset = Dataset()
    dataset.fetch_dataset("M10")
    assert len(set(dataset.get_labels())) == 10
def test_load_20ng():
    data_home = get_data_home(data_home=None)
    cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz")
    if os.path.exists(cache_path):
        os.remove(cache_path)

    # first fetch: downloads the dataset and writes the cache file
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
    assert len(dataset.get_labels()) == 16309
    assert os.path.exists(cache_path)

    # second fetch: should load from the cache written above
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
def preprocess_dataset(self, documents_path, labels_path=None):
    """
    preprocess the input dataset

    :param documents_path: path to the documents file. Each row of the file represents a document
    :type documents_path: str
    :param labels_path: path to the labels file. Each row of the file represents a label; its index
        corresponds to the index of the documents file (default: None)
    :type labels_path: str

    :return: octis.dataset.dataset.Dataset
    """
    docs = [line.strip() for line in open(documents_path, 'r').readlines()]
    if self.num_processes is not None:
        # with Pool(self.num_processes) as p:
        #     docs = p.map(self.simple_preprocessing_steps, docs)
        docs = process_map(self.simple_preprocessing_steps, docs,
                           max_workers=self.num_processes, chunksize=1)
    else:
        docs = self.simple_preprocessing_steps(docs)
    if self.lowercase:
        self.preprocessing_steps.append("lowercase")
    if self.remove_punctuation:
        self.preprocessing_steps.append('remove_punctuation')
    if self.lemmatize:
        self.preprocessing_steps.append('lemmatize')
    vocabulary = self.filter_words(docs)
    print("created vocab")
    # with Pool(self.num_processes) as p:
    #     final_docs, final_labels = p.starmap(self._foo, product(docs, vocabulary, labels_path, repeat=2))
    print(len(vocabulary))

    # keep only documents that still contain more than min_doc_words in-vocabulary words
    vocab = set(vocabulary)
    final_docs, final_labels, document_indexes = [], [], []
    if labels_path is not None:
        labels = [line.strip() for line in open(labels_path, 'r').readlines()]
        for i, (doc, label) in enumerate(zip(docs, labels)):
            new_doc = [w for w in doc.split() if w in vocab]
            if len(new_doc) > self.min_doc_words:
                final_docs.append(new_doc)
                final_labels.append(label)
                document_indexes.append(i)
    else:
        for i, doc in enumerate(docs):
            new_doc = [w for w in doc.split() if w in vocab]
            if len(new_doc) > self.min_doc_words:
                final_docs.append(new_doc)
                document_indexes.append(i)

    self.preprocessing_steps.append(
        'filter documents with less than ' + str(self.min_doc_words) + " words")
    if self.verbose:
        print("words filtering done")
    metadata = {"total_documents": len(docs), "vocabulary_length": len(vocabulary),
                "preprocessing-info": self.preprocessing_steps,
                "labels": list(set(final_labels)), "total_labels": len(set(final_labels))}

    if self.split:
        if len(final_labels) > 0:
            # stratified train/validation/test split (roughly 70/15/15)
            train, test, y_train, y_test = train_test_split(
                range(len(final_docs)), final_labels, test_size=0.15, random_state=1,
                stratify=final_labels)
            train, validation = train_test_split(train, test_size=3 / 17, random_state=1,
                                                 stratify=y_train)
            partitioned_labels = [final_labels[doc] for doc in train + validation + test]
            partitioned_corpus = [final_docs[doc] for doc in train + validation + test]
            document_indexes = [document_indexes[doc] for doc in train + validation + test]
            metadata["last-training-doc"] = len(train)
            metadata["last-validation-doc"] = len(validation) + len(train)
            if self.save_original_indexes:
                return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata,
                               labels=partitioned_labels, document_indexes=document_indexes)
            else:
                return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata,
                               labels=partitioned_labels)
        else:
            # unsupervised split (no labels available for stratification)
            train, test = train_test_split(range(len(final_docs)), test_size=0.15, random_state=1)
            train, validation = train_test_split(train, test_size=3 / 17, random_state=1)
            metadata["last-training-doc"] = len(train)
            metadata["last-validation-doc"] = len(validation) + len(train)
            partitioned_corpus = [final_docs[doc] for doc in train + validation + test]
            document_indexes = [document_indexes[doc] for doc in train + validation + test]
            if self.save_original_indexes:
                return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata,
                               labels=final_labels, document_indexes=document_indexes)
            else:
                return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata,
                               labels=final_labels)
    else:
        if self.save_original_indexes:
            return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata,
                           labels=final_labels, document_indexes=document_indexes)
        else:
            return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata,
                           labels=final_labels)
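# Hedged usage sketch: how preprocess_dataset is typically driven end to end.
# The constructor keywords below mirror the attributes the method reads
# (lowercase, remove_punctuation, lemmatize, split, min_doc_words,
# num_processes) but are assumptions about the surrounding Preprocessing
# class, not part of this method's signature; file paths are illustrative.
def _example_preprocessing_run():
    preprocessor = Preprocessing(lowercase=True, remove_punctuation=True, lemmatize=False,
                                 split=True, min_doc_words=1, num_processes=None)
    dataset = preprocessor.preprocess_dataset(documents_path="corpus.txt",
                                              labels_path="labels.txt")
    train, validation, test = dataset.get_partitioned_corpus()
    return len(train), len(validation), len(test)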
def dataset(data_dir):
    # Load dataset
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    return dataset
def dataset(root_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(root_dir + "/../preprocessed_datasets/M10")
    return dataset
def _restore_parameters(self, name_path):
    """
    Restore the BO parameters from the json file

    :param name_path: name of the json file
    :type name_path: str
    :return: result of BO optimization (scikit-optimize object), surrogate model (scikit-learn object)
    :rtype: tuple
    """
    # Load the previous results
    with open(name_path, 'rb') as file:
        optimization_object = json.load(file)

    self.search_space = load_search_space(optimization_object["search_space"])
    self.acq_func = optimization_object["acq_func"]
    self.surrogate_model = optimization_object["surrogate_model"]
    self.kernel = eval(optimization_object["kernel"])
    self.optimization_type = optimization_object["optimization_type"]
    self.model_runs = optimization_object["model_runs"]
    self.save_models = optimization_object["save_models"]
    self.save_step = optimization_object["save_step"]
    self.save_name = optimization_object["save_name"]
    self.save_path = optimization_object["save_path"]
    self.early_stop = optimization_object["early_stop"]
    self.early_step = optimization_object["early_step"]
    self.plot_model = optimization_object["plot_model"]
    self.plot_best_seen = optimization_object["plot_best_seen"]
    self.plot_name = optimization_object["plot_name"]
    self.log_scale_plot = optimization_object["log_scale_plot"]
    self.random_state = optimization_object["random_state"]
    self.dict_model_runs = optimization_object['dict_model_runs']
    self.number_of_previous_calls = optimization_object['current_call'] + 1
    self.current_call = optimization_object['current_call'] + 1
    self.number_of_call = optimization_object['number_of_call']
    self.x0 = optimization_object['x0']
    self.y0 = optimization_object['y0']
    self.n_random_starts = optimization_object['n_random_starts']
    self.initial_point_generator = optimization_object['initial_point_generator']
    self.topk = optimization_object['topk']
    self.time_eval = optimization_object["time_eval"]

    res = None

    # Load the dataset
    dataset = Dataset()
    if not optimization_object["is_cached"]:
        dataset.load_custom_dataset_from_folder(optimization_object["dataset_path"])
    else:
        dp = optimization_object["dataset_path"][
            :-(len(optimization_object["dataset_name"]) + len("_py3.pkz"))]
        dataset.fetch_dataset(optimization_object["dataset_name"], data_home=dp)
    self.dataset = dataset

    # Load the metric
    self._load_metric(optimization_object, dataset)

    # Load the model
    self.model = load_model(optimization_object)

    # Creation of the hyperparameters
    self.hyperparameters = list(sorted(self.search_space.keys()))

    # Choice of the optimizer
    opt = choose_optimizer(self)

    # Replay the already-evaluated points so the restored optimizer is up to date
    for i in range(self.number_of_previous_calls):
        next_x = [optimization_object["x_iters"][key][i] for key in self.hyperparameters]
        f_val = -optimization_object["f_val"][i] if self.optimization_type == 'Maximize' \
            else optimization_object["f_val"][i]
        res = opt.tell(next_x, f_val)

    # Create the directory where the results are saved
    Path(self.save_path).mkdir(parents=True, exist_ok=True)
    self.model_path_models = self.save_path + "models/"

    return res, opt
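# Hedged sketch (illustrative): _restore_parameters is the internal step used
# when resuming a Bayesian Optimization run from its saved json file. The
# Optimizer class name, helper name, and result-file path below are
# assumptions, not taken from this method.
def _example_resume_from_json():
    optimizer = Optimizer()
    res, opt = optimizer._restore_parameters("results/result.json")
    # res holds the replayed scikit-optimize state, opt the restored optimizer object
    return res, opt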