Пример #1
0
def test_specials_indexes():
    """Check that the vocab specials end up in the vectorizer's special indexes.

    A vocab with ``UNK`` and ``PAD`` specials is filled from ``DATA``; after
    ``_init_special_indexes`` the vectorizer must hold exactly two special
    indexes, one for each of those tokens.
    """
    special_tokens = (UNK(), PAD())
    vocab = Vocab(specials=special_tokens)
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    vectorizer = TfIdfVectorizer(vocab=vocab)
    vectorizer._init_special_indexes()
    special_indexes = vectorizer._special_indexes

    assert len(special_indexes) == 2
    for token in special_tokens:
        assert vocab.stoi[token] in special_indexes
Пример #2
0
def test_build_count_matrix_costum_specials_vocab_with_specials():
    """Build a count matrix with custom specials on a vocab that has specials.

    The vocab is created with ``UNK`` and ``PAD`` and five words; the
    vectorizer is given its own ``specials`` list. The resulting count matrix
    for ``DATA`` is compared element-wise with the expected 4-column matrix.
    """
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab += ["this", "is", "the", "first", "document"]
    vocab.finalize()

    custom_specials = [PAD(), "this", "first"]
    vectorizer = TfIdfVectorizer(vocab=vocab, specials=custom_specials)
    vectorizer._init_special_indexes()

    numericalized = get_numericalized_data(data=DATA, vocab=vocab)
    counts = vectorizer._build_count_matrix(
        data=numericalized,
        unpack_data=vectorizer._get_tensor_values,
    )
    expected_counts = np.array([
        [0, 1, 1, 1],
        [1, 1, 1, 2],
        [3, 1, 1, 0],
        [0, 1, 1, 1],
    ])
    assert np.all(counts == expected_counts)
Пример #3
0
def test_build_count_matrix_costum_specials_vocab_without_specials():
    """Build a count matrix with custom specials on a vocab without specials.

    The vocab is filled from ``DATA`` with an empty ``specials`` tuple; the
    vectorizer receives six words as ``specials``. The resulting count matrix
    is compared element-wise with the expected 3-column matrix.
    """
    vocab = Vocab(specials=())
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    custom_specials = ["the", "first", "second", "one", "third", "and"]
    vectorizer = TfIdfVectorizer(vocab=vocab, specials=custom_specials)
    vectorizer._init_special_indexes()

    numericalized = get_numericalized_data(data=DATA, vocab=vocab)
    counts = vectorizer._build_count_matrix(
        data=numericalized,
        unpack_data=vectorizer._get_tensor_values,
    )
    expected_counts = np.array([
        [1, 1, 1],
        [1, 1, 2],
        [1, 1, 0],
        [1, 1, 1],
    ])
    assert np.all(counts == expected_counts)
Пример #4
0
def test_build_count_matrix_from_tensor_with_specials():
    """Build a count matrix from numericalized data on a vocab with specials.

    The vocab (with ``UNK`` and ``PAD``) is filled from ``DATA`` and the
    vectorizer uses no custom specials. The resulting count matrix is compared
    element-wise with the expected 9-column matrix.
    """
    vocab = Vocab(specials=(UNK(), PAD()))
    for sentence in DATA:
        vocab += sentence.split(" ")
    vocab.finalize()

    vectorizer = TfIdfVectorizer(vocab=vocab)
    vectorizer._init_special_indexes()

    numericalized = get_numericalized_data(data=DATA, vocab=vocab)
    counts = vectorizer._build_count_matrix(
        data=numericalized,
        unpack_data=vectorizer._get_tensor_values,
    )
    expected_rows = [
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 2, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
    ]
    expected_counts = np.array(expected_rows)
    assert np.all(counts == expected_counts)
Пример #5
0
def test_tfidf_equality_with_scikit(tabular_dataset):
    """Compare the vectorizer's output with scikit-learn's ``TfidfVectorizer``.

    Both vectorizers are fit on the same text and share the vocabulary of the
    dataset's ``"text"`` field; the dense outputs must agree within an
    absolute tolerance of ``1e-6``.
    """
    field = tabular_dataset.field_dict["text"]
    vocab = field.vocab

    podium_vectorizer = TfIdfVectorizer(vocab=vocab)
    podium_vectorizer.fit(dataset=tabular_dataset, field=field)
    numericalized = get_numericalized_data(data=TABULAR_TEXT, vocab=vocab)
    podium_vectors = podium_vectorizer.transform(numericalized).todense()

    sklearn_vectorizer = text.TfidfVectorizer(
        vocabulary=vocab.stoi,
        token_pattern=r"(?u)\b\w+\b",
    )
    sklearn_vectorizer.fit(TABULAR_TEXT)
    sklearn_vectors = sklearn_vectorizer.transform(TABULAR_TEXT).todense()
    # Drop the weights of the special symbols (columns 0 and 1): in scikit
    # they are 0, while podium skips them entirely.
    sklearn_vectors = np.delete(sklearn_vectors, [0, 1], axis=1)

    assert np.allclose(
        a=podium_vectors,
        b=sklearn_vectors,
        rtol=0,
        atol=1.0e-6,
    )
Пример #6
0
def test_transform_example_none(tabular_dataset):
    """A fitted vectorizer must raise ``ValueError`` for ``examples=None``."""
    field = tabular_dataset.field_dict["text"]

    vectorizer = TfIdfVectorizer(vocab=field.vocab)
    vectorizer.fit(dataset=tabular_dataset, field=field)

    with pytest.raises(ValueError):
        vectorizer.transform(examples=None)
Пример #7
0
def train_multilabel_svm(
    dataset_path,
    param_grid,
    cutoff,
    n_outer_splits=5,
    n_inner_splits=3,
    n_jobs=1,
    is_verbose=True,
    include_classes_with_no_train_examples=False,
    include_classes_with_no_test_examples=False,
):
    """
    Trains the multilabel SVM model on a given instance of dataset.

    The dataset is loaded from a dill file, its ``"text"`` field is vectorized
    with ``TfIdfVectorizer`` and a fresh ``MultilabelSVM`` is trained and
    evaluated on every fold of the outer cross validation. Micro and macro
    precision, recall and F1 are collected per fold and their averages are
    printed at the end.

    Parameters
    ----------
    dataset_path : str
        Path to the instance of EuroVoc dataset stored as a dill file.
        The file is deserialized with ``dill.load``, so it must come from a
        trusted source.
    param_grid : dict or list(dict)
            Dictionary with parameters names (string) as keys and lists of parameter
            settings to try as values, or a list of such dictionaries, in which case the
            grids spanned by each dictionary in the list are explored. This enables
            searching over any sequence of parameter settings. For more information,
            refer to
            https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    cutoff : int
        If the number of positive training examples for a class is less than the
        cut-off, no model is trained for such class and the index of the label is
        added in the missing model indexes.
    n_outer_splits : int
        Number of splits in an outer loop of a nested cross validation.
    n_inner_splits : int
        Number of splits in an inner loop of a nested cross validation.
        NOTE(review): this value is currently not forwarded to the classifier;
        see the note next to the ``clf.fit`` call.
    n_jobs : int
        Number of threads to be used.
    is_verbose : boolean
        If set to true, scores on test set are printed for each fold of the
        outer loop in the nested cross validation.
    include_classes_with_no_train_examples : boolean
        If True, scores of the classes with an insufficient number of training examples
        (less than the specified cut-off) are included when calculating general scores.
        Note that this makes sense if cut-off=1 because that means classes with no train
        examples will be taken into consideration.
    include_classes_with_no_test_examples : boolean
        If True, scores for classes with no positive instances in the test set are
        included in the general score.

    Returns
    -------
    None
        The scores are only printed to standard output.
    """
    # SECURITY: dill, like pickle, can execute arbitrary code while
    # deserializing. Only pass paths to files from a trusted source.
    with open(dataset_path, "rb") as input_file:
        dataset = dill.load(input_file)

    # NOTE(review): the vectorizer is fit on the whole dataset before it is
    # split into folds, so whatever statistics fit() collects also cover the
    # test examples of every fold -- confirm this is intended.
    vectorizer = TfIdfVectorizer()
    vectorizer.fit(dataset, dataset.field_dict["text"])

    outer_cv = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

    micro_P = []
    micro_R = []
    micro_F1 = []
    macro_P = []
    macro_R = []
    macro_F1 = []

    for train, test in outer_cv.split(dataset):
        # The batch size equals the size of the split, so each iterator is
        # expected to yield the whole split as a single batch.
        train_iter = Iterator(dataset=train, batch_size=len(train))
        clf = MultilabelSVM()
        for X, Y in train_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)

            # NOTE(review): n_inner_splits is accepted by this function but is
            # not passed on here, so the inner loop uses the classifier's own
            # default -- verify against MultilabelSVM.fit and forward it.
            clf.fit(X, Y, parameter_grid=param_grid, cutoff=cutoff, n_jobs=n_jobs)

        test_iter = Iterator(dataset=test, batch_size=len(test))
        for X, Y in test_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)
            prediction_dict = clf.predict(X)
            Y_pred = prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]

            # Drop the label columns for which no model was trained; the same
            # indexes are removed from predictions and gold labels so that the
            # columns stay aligned.
            if not include_classes_with_no_train_examples:
                missing_model_indexes = list(clf.get_indexes_of_missing_models())
                Y_pred = np.delete(Y_pred, missing_model_indexes, axis=1)
                Y = np.delete(Y, missing_model_indexes, axis=1)

            # Deletes all zero columns (all labels which don't have any positive
            # examples in the current test set).
            if not include_classes_with_no_test_examples:
                cols = ~(Y == 0).all(axis=0)
                Y = Y[:, cols]
                Y_pred = Y_pred[:, cols]

            micro_P.append(precision_score(Y, Y_pred, average="micro"))
            micro_R.append(recall_score(Y, Y_pred, average="micro"))
            micro_F1.append(f1_score(Y, Y_pred, average="micro"))

            macro_P.append(precision_score(Y, Y_pred, average="macro"))
            macro_R.append(recall_score(Y, Y_pred, average="macro"))
            macro_F1.append(f1_score(Y, Y_pred, average="macro"))

            if is_verbose:
                print("Scores on test set:")
                print("micro P", micro_P[-1])
                print("micro R", micro_R[-1])
                print("micro F1", micro_F1[-1])
                print("macro P", macro_P[-1])
                print("macro R", macro_R[-1])
                print("macro F1", macro_F1[-1])

    print("Average scores on test sets:")

    print("average micro P", np.average(micro_P))
    print("average micro R", np.average(micro_R))
    print("average micro F1", np.average(micro_F1))

    print("average macro P", np.average(macro_P))
    print("average macro R", np.average(macro_R))
    print("average macro F1", np.average(macro_F1))
Пример #8
0
def test_vocab_none(tabular_dataset):
    """Fitting on a field built with ``numericalizer=None`` raises ``ValueError``."""
    vectorizer = TfIdfVectorizer()
    with pytest.raises(ValueError):
        vectorizer.fit(
            dataset=tabular_dataset,
            field=Field("text", numericalizer=None),
        )
Пример #9
0
def test_transform_before_fit_error():
    """Calling ``transform`` on an unfitted vectorizer raises ``RuntimeError``."""
    examples = [[1, 1, 1, 1, 1, 0, 0, 0, 0]]
    vectorizer = TfIdfVectorizer()
    with pytest.raises(RuntimeError):
        vectorizer.transform(examples)
Пример #10
0
def test_fit_invalid_field_error(tabular_dataset):
    """Fitting with a field named ``"non_present_field"`` raises ``ValueError``."""
    vectorizer = TfIdfVectorizer()
    with pytest.raises(ValueError):
        vectorizer.fit(
            dataset=tabular_dataset,
            field=Field("non_present_field"),
        )
Пример #11
0
def test_fit_field_none_error(tabular_dataset):
    """Fitting with ``field=None`` raises ``ValueError``."""
    vectorizer = TfIdfVectorizer()
    fit_kwargs = {"dataset": tabular_dataset, "field": None}
    with pytest.raises(ValueError):
        vectorizer.fit(**fit_kwargs)
Пример #12
0
def test_fit_dataset_none_error():
    """Fitting with ``dataset=None`` raises ``ValueError``."""
    vectorizer = TfIdfVectorizer()
    with pytest.raises(ValueError):
        vectorizer.fit(
            dataset=None,
            field=Field("text"),
        )