Example No. 1
def test_load_20ng():
    data_home = get_data_home(data_home=None)
    cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz")
    if os.path.exists(cache_path):
        os.remove(cache_path)

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
    assert len(dataset.get_labels()) == 16309
    assert os.path.exists(cache_path)

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    assert len(dataset.get_corpus()) == 16309
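Note: a minimal fetch-and-inspect sketch built only from the Dataset calls already shown in these examples (the helper name inspect_20newsgroup is illustrative, not part of OCTIS):

from octis.dataset.dataset import Dataset

def inspect_20newsgroup():
    # Download the built-in 20NewsGroup corpus, or reuse the cached copy
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    print(len(dataset.get_corpus()), "documents")
    print(len(set(dataset.get_labels())), "distinct labels")
    print(len(dataset.get_vocabulary()), "vocabulary terms")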
Example No. 2
def test_model_output_lda_tomotopy(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = LDATOMOTO(num_topics=num_topics, alpha=0.1)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics,
                                                 len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
Example No. 3
def test_model_output_ctm_combined(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = CTM(num_topics=num_topics, num_epochs=5, inference_type='combined')
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics,
                                                 len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
Example No. 4
def test_model_output_nmf(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = NMF(num_topics=num_topics,
                w_max_iter=10,
                h_max_iter=10,
                use_partitions=True)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics,
                                                 len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))

    # check test-topic-document-matrix format
    assert type(output['test-topic-document-matrix']) == np.ndarray
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))
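Note: Examples 2 to 4 repeat the same output checks; a helper like the sketch below could factor them out (the helper name check_model_output is illustrative, not part of OCTIS):

import numpy as np

def check_model_output(output, dataset, num_topics):
    # Shared shape checks for a model trained with partitions enabled
    assert isinstance(output['topics'], list)
    assert len(output['topics']) == num_topics
    assert isinstance(output['topic-word-matrix'], np.ndarray)
    assert output['topic-word-matrix'].shape == (
        num_topics, len(dataset.get_vocabulary()))
    assert isinstance(output['topic-document-matrix'], np.ndarray)
    assert output['topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[0]))
    assert isinstance(output['test-topic-document-matrix'], np.ndarray)
    assert output['test-topic-document-matrix'].shape == (
        num_topics, len(dataset.get_partitioned_corpus()[2]))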
Example No. 5
def _load_default_texts():
    """
    Loads default general texts

    Returns
    -------
    result : default 20newsgroup texts
    """
    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")
    return dataset.get_corpus()
Example No. 6
def test_model_output_prodlda_not_partitioned(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')
    num_topics = 3
    model = ProdLDA(num_topics=num_topics, num_epochs=5, use_partitions=False)
    output = model.train_model(dataset)
    assert 'topics' in output.keys()
    assert 'topic-word-matrix' in output.keys()
    assert 'test-topic-document-matrix' not in output.keys()

    # check topics format
    assert type(output['topics']) == list
    assert len(output['topics']) == num_topics

    # check topic-word-matrix format
    assert type(output['topic-word-matrix']) == np.ndarray
    assert output['topic-word-matrix'].shape == (num_topics, len(dataset.get_vocabulary()))

    # check topic-document-matrix format
    assert type(output['topic-document-matrix']) == np.ndarray
    assert output['topic-document-matrix'].shape == (num_topics, len(dataset.get_corpus()))
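Note: for many of these models the columns of the topic-document matrix are per-document topic distributions; whether a given model normalizes them this way is an assumption, so a stricter check would be optional, as in this sketch:

import numpy as np

def columns_are_distributions(matrix, atol=1e-3):
    # True when every document column is non-negative and sums to ~1
    column_sums = matrix.sum(axis=0)
    return bool(np.all(matrix >= 0) and np.allclose(column_sums, 1.0, atol=atol))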
Example No. 7
def test_partitions_custom(data_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + "M10")
    partitions = dataset.get_partitioned_corpus()
    assert len(partitions[0]) == 5847
    assert len(partitions[1]) == 1254
Example No. 8
def test_partitions_fetch():
    dataset = Dataset()
    dataset.fetch_dataset("M10")
    partitions = dataset.get_partitioned_corpus()
    assert len(partitions[0]) == 5847
    assert len(partitions[1]) == 1254
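Note: assuming M10 was partitioned with the split used in the preprocess_dataset example further below (15% test, then 3/17 of the remainder as validation), the expected fractions work out as in this small arithmetic sketch:

test_fraction = 0.15
validation_fraction = 0.85 * (3 / 17)                      # = 0.15 of the corpus
train_fraction = 1 - test_fraction - validation_fraction   # = 0.70 of the corpus
print(train_fraction, validation_fraction, test_fraction)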
Example No. 9
def test_load_M10():
    dataset = Dataset()
    dataset.fetch_dataset("M10")
    assert len(set(dataset.get_labels())) == 10
Example No. 10
    def preprocess_dataset(self, documents_path, labels_path=None):
        """
        preprocess the input dataset

        :param documents_path: path to the documents file. Each row of the file represents a document
        :type documents_path: str
        :param labels_path: path to the documents file. Each row of the file represents a label. Its index corresponds
        to the index of the documents file (default: None)
        :type labels_path: str

        :return octis.dataset.dataset.Dataset
        """
        with open(documents_path, 'r') as documents_file:
            docs = [line.strip() for line in documents_file]
        if self.num_processes is not None:
            # with Pool(self.num_processes) as p:
            #    docs = p.map(self.simple_preprocessing_steps, docs)
            docs = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=1)
        else:
            docs = self.simple_preprocessing_steps(docs)
        if self.lowercase:
            self.preprocessing_steps.append("lowercase")
        if self.remove_punctuation:
            self.preprocessing_steps.append('remove_punctuation')
        if self.lemmatize:
            self.preprocessing_steps.append('lemmatize')

        vocabulary = self.filter_words(docs)
        if self.verbose:
            print("created vocabulary with", len(vocabulary), "words")
        final_docs, final_labels, document_indexes = [], [], []
        vocab = set(vocabulary)  # build the lookup set once instead of per document
        if labels_path is not None:
            with open(labels_path, 'r') as labels_file:
                labels = [line.strip() for line in labels_file]
            for i, (doc, label) in enumerate(zip(docs, labels)):
                new_doc = [w for w in doc.split() if w in vocab]
                if len(new_doc) > self.min_doc_words:
                    final_docs.append(new_doc)
                    final_labels.append(label)
                    document_indexes.append(i)
        else:
            for i, doc in enumerate(docs):
                new_doc = [w for w in doc.split() if w in vocab]
                if len(new_doc) > self.min_doc_words:
                    final_docs.append(new_doc)
                    document_indexes.append(i)

        self.preprocessing_steps.append('filter documents with ' + str(self.min_doc_words) + ' words or fewer')
        if self.verbose:
            print("words filtering done")
        metadata = {"total_documents": len(docs), "vocabulary_length": len(vocabulary),
                    "preprocessing-info": self.preprocessing_steps, "labels": list(set(final_labels)),
                    "total_labels": len(set(final_labels))}
        if self.split:
            if len(final_labels) > 0:
                train, test, y_train, y_test = train_test_split(
                    range(len(final_docs)), final_labels, test_size=0.15, random_state=1, stratify=final_labels)

                train, validation = train_test_split(train, test_size=3 / 17, random_state=1, stratify=y_train)
                partitioned_labels = [final_labels[doc] for doc in train + validation + test]
                partitioned_corpus = [final_docs[doc] for doc in train + validation + test]
                document_indexes = [document_indexes[doc] for doc in train + validation + test]
                metadata["last-training-doc"] = len(train)
                metadata["last-validation-doc"] = len(validation) + len(train)
                if self.save_original_indexes:
                    return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=partitioned_labels,
                                   document_indexes=document_indexes)
                else:
                    return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=partitioned_labels)
            else:
                train, test = train_test_split(range(len(final_docs)), test_size=0.15, random_state=1)
                train, validation = train_test_split(train, test_size=3 / 17, random_state=1)

                metadata["last-training-doc"] = len(train)
                metadata["last-validation-doc"] = len(validation) + len(train)
                partitioned_corpus = [final_docs[doc] for doc in train + validation + test]
                document_indexes = [document_indexes[doc] for doc in train + validation + test]
                if self.save_original_indexes:
                    return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels,
                                   document_indexes=document_indexes)
                else:
                    return Dataset(partitioned_corpus, vocabulary=vocabulary, metadata=metadata, labels=final_labels)
        else:
            if self.save_original_indexes:
                return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels,
                               document_indexes=document_indexes)
            else:
                return Dataset(final_docs, vocabulary=vocabulary, metadata=metadata, labels=final_labels)
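Note: a minimal usage sketch for this method; the import path and constructor arguments are assumptions about the Preprocessing class, and the file paths are placeholders:

from octis.preprocessing.preprocessing import Preprocessing

# Assumed constructor options; check the Preprocessing signature of your
# installed OCTIS version before relying on them.
preprocessor = Preprocessing(lowercase=True,
                             remove_punctuation=True,
                             lemmatize=False,
                             num_processes=None)
dataset = preprocessor.preprocess_dataset(documents_path="corpus.txt",
                                          labels_path="labels.txt")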
Example No. 11
def dataset(data_dir):
    # Load dataset
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(data_dir + '/M10')

    return dataset
Example No. 12
def dataset(root_dir):
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder(root_dir + "/../preprocessed_datasets/M10")
    return dataset
Example No. 13
    def _restore_parameters(self, name_path):
        """
        Restore the BO parameters from the JSON file

        :param name_path: name of the json file
        :type name_path: str
        :return: result of BO optimization (scikit-optimize object), surrogate model (scikit-learn object)
        :rtype: tuple
        """

        # Load the previous results
        with open(name_path, 'rb') as file:
            optimization_object = json.load(file)

        self.search_space = load_search_space(optimization_object["search_space"])
        self.acq_func = optimization_object["acq_func"]
        self.surrogate_model = optimization_object["surrogate_model"]
        self.kernel = eval(optimization_object["kernel"])
        self.optimization_type = optimization_object["optimization_type"]
        self.model_runs = optimization_object["model_runs"]
        self.save_models = optimization_object["save_models"]
        self.save_step = optimization_object["save_step"]
        self.save_name = optimization_object["save_name"]
        self.save_path = optimization_object["save_path"]
        self.early_stop = optimization_object["early_stop"]
        self.early_step = optimization_object["early_step"]
        self.plot_model = optimization_object["plot_model"]
        self.plot_best_seen = optimization_object["plot_best_seen"]
        self.plot_name = optimization_object["plot_name"]
        self.log_scale_plot = optimization_object["log_scale_plot"]
        self.random_state = optimization_object["random_state"]
        self.dict_model_runs = optimization_object['dict_model_runs']
        self.number_of_previous_calls = optimization_object['current_call'] + 1
        self.current_call = optimization_object['current_call'] + 1
        self.number_of_call = optimization_object['number_of_call']
        self.x0 = optimization_object['x0']
        self.y0 = optimization_object['y0']
        self.n_random_starts = optimization_object['n_random_starts']
        self.initial_point_generator = optimization_object['initial_point_generator']
        self.topk = optimization_object['topk']
        self.time_eval = optimization_object["time_eval"]
        res = None

        # Load the dataset
        dataset = Dataset()
        if not optimization_object["is_cached"]:
            dataset.load_custom_dataset_from_folder(optimization_object["dataset_path"])
        else:
            dp = optimization_object["dataset_path"][:-(len(optimization_object["dataset_name"]) + len("_py3.pkz"))]
            dataset.fetch_dataset(optimization_object["dataset_name"], data_home=dp)

        self.dataset = dataset

        # Load the metric
        self._load_metric(optimization_object, dataset)

        # Load the model
        self.model = load_model(optimization_object)
        # Creation of the hyperparameters
        self.hyperparameters = list(sorted(self.search_space.keys()))
        # Choice of the optimizer
        opt = choose_optimizer(self)

        # Update number_of_call for restarting
        for i in range(self.number_of_previous_calls):
            next_x = [optimization_object["x_iters"][key][i] for key in self.hyperparameters]
            f_val = -optimization_object["f_val"][i] if self.optimization_type == 'Maximize' else \
                optimization_object["f_val"][i]
            res = opt.tell(next_x, f_val)

        # Create the directory where the results are saved
        Path(self.save_path).mkdir(parents=True, exist_ok=True)

        self.model_path_models = self.save_path + "models/"

        return res, opt
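Note: the loop near the end replays previously evaluated points into the optimizer with opt.tell so the surrogate model recovers its pre-interruption state; a stand-alone sketch of that tell-replay pattern with scikit-optimize (values are illustrative):

from skopt import Optimizer

previous_x = [[0.1], [0.5], [0.9]]   # previously evaluated configurations
previous_f = [0.42, 0.31, 0.55]      # their objective values (minimized)

# Replaying the (x, f) pairs rebuilds the surrogate model's training data
opt = Optimizer(dimensions=[(0.0, 1.0)], base_estimator="GP", random_state=1)
for x, f in zip(previous_x, previous_f):
    opt.tell(x, f)

next_x = opt.ask()                   # next configuration to evaluate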