Example #1
def get_supervised_result(model,
                          train_iterator,
                          val_iterator,
                          test_iterator,
                          EPOCHS=5,
                          cls_thresh=None,
                          n_classes=cfg['data']['num_classes']):
    """ Train and Predict on full supervised mode.

    Returns:

    """

    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iterator, val_iterator, N_EPOCHS=EPOCHS)

    # logger.debug(losses)

    # evaluate the model
    test_loss, test_preds_trues = predict_with_label(model_best, test_iterator)

    if cls_thresh is None:
        cls_thresh = [0.5] * n_classes

    predicted_labels = logit2label(DataFrame(
        test_preds_trues['preds'].numpy()),
                                   cls_thresh,
                                   drop_irrelevant=False)

    result = calculate_performance_pl(test_preds_trues['trues'],
                                      predicted_labels)

    logger.info("Supervised result: {}".format(dumps(result, indent=4)))
    return result, model_best
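For reference, a minimal sketch of the per-class thresholding that logit2label is assumed to perform on the probability DataFrame above (logit2label itself is project code and not shown; threshold_probs is an illustrative name):

import numpy as np
import pandas as pd

def threshold_probs(probs_df: pd.DataFrame, cls_thresh):
    """Binarize per-class probabilities: label j is 1 where prob >= cls_thresh[j]."""
    return (probs_df.values >= np.asarray(cls_thresh)).astype(int)

# threshold_probs(pd.DataFrame([[0.7, 0.2], [0.4, 0.9]]), [0.5, 0.5])
# -> array([[1, 0],
#           [0, 1]])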
Example #2
def test_graph_classifier(model: GAT_Graph_Classifier, loss_func,
                          data_loader: torch.utils.data.dataloader.DataLoader):
    model.eval()
    preds = []
    trues = []
    losses = []
    for iter, (graph_batch, label) in enumerate(data_loader):
        ## Store emb in a separate variable, as add_self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        prediction = model(graph_batch, emb)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())
    losses = torch.mean(torch.stack(losses))
    preds = torch.cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = torch.sigmoid(preds)

    ## Converting probabilities to class labels:
    preds = logit2label(preds.detach(), cls_thresh=0.5)
    trues = torch.cat(trues)
    result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
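The evaluation pattern above (accumulate detached batch outputs, then stack the losses and cat the predictions) can be exercised standalone. A minimal self-contained sketch with a toy linear model standing in for the GAT classifier; all names here are illustrative:

import torch
import torch.nn as nn

# Toy stand-in: 4-dim features, 3 labels, multi-label targets.
model = nn.Linear(4, 3)
loss_func = nn.BCEWithLogitsLoss()
batches = [(torch.randn(8, 4), torch.randint(0, 2, (8, 3)).float())
           for _ in range(5)]

model.eval()
preds, trues, losses = [], [], []
with torch.no_grad():
    for x, y in batches:
        out = model(x)
        losses.append(loss_func(out, y))
        preds.append(out)
        trues.append(y)

mean_loss = torch.mean(torch.stack(losses))                # scalar over batches
labels = (torch.sigmoid(torch.cat(preds)) >= 0.5).long()   # sigmoid + 0.5 threshold
print(mean_loss.item(), labels.shape)                      # -> loss value, torch.Size([40, 3])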
Example #3
def eval_graph_classifier(model: GAT_GCN_Classifier,
                          G,
                          X,
                          loss_func,
                          data_loader: utils.data.dataloader.DataLoader,
                          n_classes=cfg['data']['num_classes'],
                          save_gcn_embs=False):
    model.eval()
    preds = []
    trues = []
    losses = []
    for iter, (graph_batch, local_ids, label, global_ids,
               node_counts) in enumerate(data_loader):
        ## Store emb in a separate variable, as add_self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            graph_batch = graph_batch.to(device)
            emb = emb.to(device)
            # local_ids = local_ids.to(device)
            # node_counts = node_counts.to(device)
            # global_ids = global_ids.to(device)
            G = G.to(device)
            X = X.to(device)
        if save_gcn_embs:
            ## NOTE: X is the same for every batch, so this overwrites one file:
            save(X, 'X_glove.pt')
        start_time = timeit.default_timer()
        prediction = model(graph_batch, emb, local_ids, node_counts,
                           global_ids, G, X, save_gcn_embs)
        test_time = timeit.default_timer() - start_time
        test_count = label.shape[0]
        logger.info(f"Test time per example: [{test_time / test_count} sec]")
        if prediction.dim() == 1:
            prediction = prediction.unsqueeze(1)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            prediction = prediction.to(device)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())
    losses = mean(stack(losses))
    preds = cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = sigmoid(preds)

    ## Converting probabilities to class labels:
    preds = logit2label(preds.detach(), cls_thresh=0.5)
    trues = cat(trues)
    if n_classes == 1:
        result_dict = calculate_performance_bin_sk(trues, preds)
    else:
        result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
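The cfg/cuda guard above follows the standard conditional device-placement pattern; a self-contained sketch of the same idea, with the device chosen directly instead of via cfg:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.randn(2, 3).to(device)            # .to returns a tensor on `device`
model = torch.nn.Linear(3, 1).to(device)    # modules are moved in place
out = model(x)                              # inputs and parameters must share a device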
Example #4
def train_graph_classifier(
        model: GAT_Graph_Classifier,
        data_loader: torch.utils.data.dataloader.DataLoader,
        loss_func: torch.nn.modules.loss.BCEWithLogitsLoss,
        optimizer,
        epochs: int = 5,
        eval_data_loader: torch.utils.data.dataloader.DataLoader = None):
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        for iter, (graph_batch, label) in enumerate(data_loader):
            ## Store emb in a separate variable, as add_self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            prediction = model(graph_batch, emb)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
        epoch_loss /= (iter + 1)
        losses, test_output = test_graph_classifier(
            model, loss_func=loss_func, data_loader=eval_data_loader)
        logger.info(
            f"Epoch {epoch}, Train loss {epoch_loss}, Eval loss {losses},"
            f" Macro F1 {test_output['result']['f1']['macro'].item()}")
        # logger.info(dumps(test_output['result'], indent=4))
        train_epoch_losses.append(epoch_loss)
        preds = torch.cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = torch.sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = torch.cat(trues)
        result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds': preds,
            'trues': trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
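A self-contained sketch of driving this train/zero_grad/backward/step loop end to end, with a toy model in place of GAT_Graph_Classifier; all names are illustrative:

import torch
import torch.nn as nn

model = nn.Linear(4, 3)                     # stand-in for GAT_Graph_Classifier
loss_func = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
data = [(torch.randn(8, 4), torch.randint(0, 2, (8, 3)).float())
        for _ in range(5)]

for epoch in range(3):
    model.train()
    epoch_loss = 0.0
    for i, (x, y) in enumerate(data):
        loss = loss_func(model(x), y)
        optimizer.zero_grad()               # clear stale gradients
        loss.backward()                     # accumulate new gradients
        optimizer.step()                    # apply the update
        epoch_loss += loss.item()
    print(epoch, epoch_loss / (i + 1))      # mean loss per batch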
Example #5
def train_graph_classifier(
        model,
        G,
        X,
        data_loader: utils.data.dataloader.DataLoader,
        loss_func: nn.modules.loss.BCEWithLogitsLoss,
        optimizer,
        epochs: int = 5,
        eval_data_loader: utils.data.dataloader.DataLoader = None,
        test_data_loader: utils.data.dataloader.DataLoader = None,
        n_classes=cfg['data']['num_classes']):
    logger.info("Started training...")
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        for iter, (graph_batch, local_ids, label, global_ids,
                   node_counts) in enumerate(data_loader):
            ## Store emb in a separate variable, as add_self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
                graph_batch = graph_batch.to(device)
                emb = emb.to(device)
                # local_ids = local_ids.to(device)
                # node_counts = node_counts.to(device)
                # global_ids = global_ids.to(device)
                G = G.to(device)
                X = X.to(device)
            start_time = timeit.default_timer()
            prediction = model(graph_batch, emb, local_ids, node_counts,
                               global_ids, G, X)
            # if epoch == 30:
            #     from evaluations import get_freq_disjoint_token_vecs, plot
            #     glove_vecs = get_freq_disjoint_token_vecs(S_vocab, T_vocab, X)
            #     plot(glove_vecs)
            if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
                prediction = prediction.to(device)
            if prediction.dim() == 1:
                prediction = prediction.unsqueeze(1)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_time = timeit.default_timer() - start_time
            train_count = label.shape[0]
            logger.info(
                f"Training time per example: [{train_time / train_count} sec]")
            logger.info(f"Iteration {iter}, loss: {loss.detach().item()}")
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
        epoch_loss /= (iter + 1)
        val_losses, val_output = eval_graph_classifier(
            model, G, X, loss_func=loss_func, data_loader=eval_data_loader)
        logger.info(f'val_output: \n{dumps(val_output["result"], indent=4)}')
        test_losses, test_output = eval_graph_classifier(
            model, G, X, loss_func=loss_func, data_loader=test_data_loader)
        logger.info(f'test_output: \n{dumps(test_output["result"], indent=4)}')
        logger.info(
            f"Epoch {epoch}, Train loss {epoch_loss}, val loss "
            f"{val_losses}, test loss {test_losses}, Val Macro F1 "
            f"{val_output['result']['f1']['macro'].item()}, Test Macro F1 "
            f"{test_output['result']['f1']['macro'].item()}")
        # logger.info(f"Epoch {epoch}, Train loss {epoch_loss}, val loss "
        #             f"{val_losses}, Val Macro F1 {val_output['result']['f1']['macro'].item()}")
        train_epoch_losses.append(epoch_loss)
        preds = cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = cat(trues)
        if n_classes == 1:
            result_dict = calculate_performance_bin_sk(trues, preds)
        else:
            result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds': preds,
            'trues': trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
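The prediction.unsqueeze(1) guard above exists because BCEWithLogitsLoss requires prediction and target shapes to match exactly; a minimal illustration:

import torch

loss_func = torch.nn.BCEWithLogitsLoss()
pred = torch.randn(8)                         # 1-D output from a single-logit head
target = torch.randint(0, 2, (8, 1)).float()  # targets stored as (batch, 1)
loss = loss_func(pred.unsqueeze(1), target)   # both (8, 1); pred as-is would raise a shape error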
Example #6
def classify(
    train_df=None,
    test_df=None,
    stoi=None,
    vectors=None,
    n_classes=cfg['data']['num_classes'],
    dim=cfg['embeddings']['emb_dim'],
    data_dir=dataset_dir,
    train_filename=cfg['data']['train'],
    test_filename=cfg['data']['test'],
    cls_thresh=None,
    epoch=cfg['training']['num_epoch'],
    num_layers=cfg['lstm_params']['num_layers'],
    num_hidden_nodes=cfg['lstm_params']['hid_size'],
    dropout=cfg['model']['dropout'],
    default_thresh=0.5,
    lr=cfg['model']['optimizer']['lr'],
    train_batch_size=cfg['training']['train_batch_size'],
    test_batch_size=cfg['training']['eval_batch_size'],
):
    """

    :param n_classes:
    :param test_batch_size:
    :param train_df:
    :param test_df:
    :param stoi:
    :param vectors:
    :param dim:
    :param data_dir:
    :param train_filename:
    :param test_filename:
    :param cls_thresh:
    :param epoch:
    :param num_layers:
    :param num_hidden_nodes:
    :param dropout:
    :param default_thresh:
    :param lr:
    :param train_batch_size:
    :return:
    """
    ## Prepare labelled source data:
    # logger.info('Prepare labelled source data')
    # if train_df is None:
    #     train_df = read_labelled_json(data_dir, train_filename)
    #     train_df = labels_mapper(train_df)
    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True)
    else:
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True,
            embedding_file=None,
            embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir,
        csv_file=test_dataname,  # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab,
                              num_hidden_nodes,
                              num_output_nodes,
                              dim,
                              num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trainable parameters
    logger.info('No. of trainable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'],
                       losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    predicted_labels = logit2label(DataFrame(
        val_preds_trues_best['preds'].cpu().numpy()),
                                   cls_thresh,
                                   drop_irrelevant=False)

    logger.info('Calculate performance')
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      predicted_labels)

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
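The pretrained-embedding initialization used above (weight.data.copy_) can be reproduced standalone; a self-contained sketch with illustrative sizes:

import torch
import torch.nn as nn

vocab_size, dim = 100, 50
pretrained = torch.randn(vocab_size, dim)   # stands in for train_vocab.vocab.vectors
emb = nn.Embedding(vocab_size, dim)
emb.weight.data.copy_(pretrained)           # in-place copy; the parameter stays trainable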
Example #7
def predict_with_label(model, iterator, criterion=None, metric=True):
    """ Predicts and calculates performance. Labels mandatory

    Args:
        model:
        iterator:
        criterion:

    Returns:

    """
    # initialize every epoch
    epoch_loss = 0

    if criterion is None:
        criterion = nn.BCEWithLogitsLoss()

    preds_trues = {
        'preds': [],
        'trues': [],
        'ids': [],
        'losses': [],
        'results': []
    }

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with no_grad():
        for i, batch in enumerate(iterator):
            # retrieve text and no. of words
            text, text_lengths = batch.text

            # convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()

            # compute loss and accuracy
            batch_labels = torchtext_batch2multilabel(batch)
            preds_trues['preds'].append(predictions)
            preds_trues['trues'].append(batch_labels)
            preds_trues['ids'].append(batch.ids)
            loss = criterion(predictions, batch_labels)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            preds_trues['losses'].append(epoch_loss)
            # epoch_acc += acc.item()
            # epoch_acc += acc["accuracy"]["unnormalize"]
        if metric:
            ## Converting raw scores to probabilities using Sigmoid
            ## (over all accumulated batches, not just the last one):
            preds = sigmoid(cat(preds_trues['preds']))

            ## Converting probabilities to class labels:
            preds = logit2label(preds.detach(), cls_thresh=0.5)
            trues = cat(preds_trues['trues'])
            result_dict = calculate_performance(trues, preds)
            preds_trues['results'].append(result_dict)

        preds_trues['preds'] = cat(preds_trues['preds'])
        preds_trues['trues'] = cat(preds_trues['trues'])
        preds_trues['ids'] = cat(preds_trues['ids'])
        ## losses are Python floats, so build a tensor rather than cat
        ## (assumes `tensor` is imported from torch alongside cat/sigmoid):
        preds_trues['losses'] = tensor(preds_trues['losses'])

    return epoch_loss / len(iterator), preds_trues