Exemplo n.º 1
0
def acc(net: NeuralNet, ds: Dataset, y: torch.Tensor) -> float:
    predict_values = net.predict(ds)
    return acc_as_metric(predict_values, y)
Exemplo n.º 2
0
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec: utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk: train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------

    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------

    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------

    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())

    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
#%% Fit the model

io = net.fit(WD)

# Save model
net.save_params(f_params='models/baselineNN.pkl')

#%% Or load it from disk

net.initialize()
net.load_params(f_params="models/baselineNN.pkl")

#%% Predict on train

# Out
yhat = net.predict(WD)
# Classes
yhatc = yhat.argmax(axis=1)
# True labels
ytrue = WD.y
(ytrue == yhatc).sum() / yhatc.size

# Classification report
from sklearn import metrics
print(
    metrics.classification_report(ytrue,
                                  yhatc,
                                  target_names=list(category_map.values())))
metrics.confusion_matrix(ytrue, yhatc)

#%% Predict on test