def test_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    assert len(tfidf._special_indexes) == 2
    for i in specials:
        assert vocab.stoi[i] in tfidf._special_indexes

def test_build_count_matrix_custom_specials_vocab_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab_words = ["this", "is", "the", "first", "document"]
    vocab += vocab_words
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[0, 1, 1, 1], [1, 1, 1, 2], [3, 1, 1, 0], [0, 1, 1, 1]])
    assert np.all(count_matrix == expected)

def test_build_count_matrix_custom_specials_vocab_without_specials():
    vocab = Vocab(specials=())
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(
        vocab=vocab, specials=["the", "first", "second", "one", "third", "and"]
    )
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[1, 1, 1], [1, 1, 2], [1, 1, 0], [1, 1, 1]])
    assert np.all(count_matrix == expected)

def test_build_count_matrix_from_tensor_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    for i in DATA:
        vocab += i.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array(
        [
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
            [1, 1, 1, 2, 0, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
        ]
    )
    assert np.all(count_matrix == expected)

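# For reference, a sketch of the module-level fixtures the count-matrix tests
# above rely on (both are defined elsewhere in this module; the corpus below is
# inferred from the expected matrices and may differ in the real file, and the
# helper body is an assumption about how numericalization is done):
#
#     DATA = [
#         "this is the first document",
#         "this document is the second document",
#         "and this is the third one",
#         "is this the first document",
#     ]
#
#     def get_numericalized_data(data, vocab):
#         return [vocab.numericalize(doc.split(" ")) for doc in data]
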
def test_tfidf_equality_with_scikit(tabular_dataset):
    text_field = tabular_dataset.field_dict["text"]
    vocab = text_field.vocab

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf.fit(dataset=tabular_dataset, field=text_field)

    numericalized_data = get_numericalized_data(data=TABULAR_TEXT, vocab=vocab)
    vectorized_text = tfidf.transform(numericalized_data).todense()

    scikit_vectorizer = text.TfidfVectorizer(
        vocabulary=vocab.stoi, token_pattern=r"(?u)\b\w+\b"
    )
    scikit_vectorizer.fit(TABULAR_TEXT)
    scikit_vectors = scikit_vectorizer.transform(TABULAR_TEXT).todense()

    # Delete the weights for special symbols: in scikit they are 0,
    # while in podium we skip them entirely.
    scikit_vectors = np.delete(scikit_vectors, [0, 1], axis=1)

    assert np.allclose(a=vectorized_text, b=scikit_vectors, rtol=0, atol=1.0e-6)

def test_transform_example_none(tabular_dataset):
    text_field = tabular_dataset.field_dict["text"]
    vocab = text_field.vocab

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf.fit(dataset=tabular_dataset, field=text_field)

    with pytest.raises(ValueError):
        tfidf.transform(examples=None)

def train_multilabel_svm(
    dataset_path,
    param_grid,
    cutoff,
    n_outer_splits=5,
    n_inner_splits=3,
    n_jobs=1,
    is_verbose=True,
    include_classes_with_no_train_examples=False,
    include_classes_with_no_test_examples=False,
):
    """
    Trains the multilabel SVM model on a given dataset instance.

    Parameters
    ----------
    dataset_path : str
        Path to the instance of the EuroVoc dataset stored as a dill file.
    param_grid : dict or list(dict)
        Dictionary with parameter names (string) as keys and lists of parameter
        settings to try as values, or a list of such dictionaries, in which case
        the grids spanned by each dictionary in the list are explored. This
        enables searching over any sequence of parameter settings. For more
        information, refer to
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    cutoff : int
        If the number of positive training examples for a class is less than
        the cutoff, no model is trained for that class and the index of the
        label is added to the missing model indexes.
    n_outer_splits : int
        Number of splits in the outer loop of the nested cross-validation.
    n_inner_splits : int
        Number of splits in the inner loop of the nested cross-validation.
    n_jobs : int
        Number of threads to be used.
    is_verbose : boolean
        If set to True, scores on the test set are printed for each fold of
        the outer loop of the nested cross-validation.
    include_classes_with_no_train_examples : boolean
        If True, scores of the classes with an insufficient number of training
        examples (less than the specified cutoff) are included when calculating
        the general scores. Note that this makes sense only if cutoff=1,
        because that means classes with no training examples are taken into
        consideration.
    include_classes_with_no_test_examples : boolean
        If True, scores for classes with no positive instances in the test set
        are included in the general score.
""" dataset = None with open(dataset_path, "rb") as input_file: dataset = dill.load(input_file) vectorizer = TfIdfVectorizer() vectorizer.fit(dataset, dataset.field_dict["text"]) outer_cv = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0) micro_P = [] micro_R = [] micro_F1 = [] macro_P = [] macro_R = [] macro_F1 = [] for train, test in outer_cv.split(dataset): train_iter = Iterator(dataset=train, batch_size=len(train)) clf = MultilabelSVM() for X, Y in train_iter: X = vectorizer.transform(X.text) Y = get_label_matrix(Y) clf.fit(X, Y, parameter_grid=param_grid, cutoff=cutoff, n_jobs=n_jobs) test_iter = Iterator(dataset=test, batch_size=len(test)) for X, Y in test_iter: X = vectorizer.transform(X.text) Y = get_label_matrix(Y) prediction_dict = clf.predict(X) Y_pred = prediction_dict[AbstractSupervisedModel.PREDICTION_KEY] if not include_classes_with_no_train_examples: Y_pred = np.delete( Y_pred, list(clf.get_indexes_of_missing_models()), axis=1 ) Y = np.delete(Y, list(clf.get_indexes_of_missing_models()), axis=1) # deletes all zero columns (all labels which don't have any positive exaples # in the current test set) if not include_classes_with_no_test_examples: cols = ~(Y == 0).all(axis=0) Y = Y[:, cols] Y_pred = Y_pred[:, cols] micro_P.append(precision_score(Y, Y_pred, average="micro")) micro_R.append(recall_score(Y, Y_pred, average="micro")) micro_F1.append(f1_score(Y, Y_pred, average="micro")) macro_P.append(precision_score(Y, Y_pred, average="macro")) macro_R.append(recall_score(Y, Y_pred, average="macro")) macro_F1.append(f1_score(Y, Y_pred, average="macro")) if is_verbose: print("Scores on test set:") print("micro P", micro_P[-1]) print("micro R", micro_R[-1]) print("micro F1", micro_F1[-1]) print("macro P", macro_P[-1]) print("macro R", macro_R[-1]) print("macro F1", macro_F1[-1]) print("Average scores on test sets:") print("average micro P", np.average(micro_P)) print("average micro R", np.average(micro_R)) print("average micro F1", np.average(micro_F1)) print("average macro P", np.average(macro_P)) print("average macro R", np.average(macro_R)) print("average macro F1", np.average(macro_F1))
def test_vocab_none(tabular_dataset):
    tfidf = TfIdfVectorizer()
    with pytest.raises(ValueError):
        tfidf.fit(dataset=tabular_dataset, field=Field("text", numericalizer=None))


def test_transform_before_fit_error():
    tfidf = TfIdfVectorizer()
    with pytest.raises(RuntimeError):
        tfidf.transform([[1, 1, 1, 1, 1, 0, 0, 0, 0]])


def test_fit_invalid_field_error(tabular_dataset):
    tfidf = TfIdfVectorizer()
    with pytest.raises(ValueError):
        tfidf.fit(dataset=tabular_dataset, field=Field("non_present_field"))


def test_fit_field_none_error(tabular_dataset):
    tfidf = TfIdfVectorizer()
    with pytest.raises(ValueError):
        tfidf.fit(dataset=tabular_dataset, field=None)


def test_fit_dataset_none_error():
    tfidf = TfIdfVectorizer()
    with pytest.raises(ValueError):
        tfidf.fit(dataset=None, field=Field("text"))