Example no. 1
def load_json(filename: str, filepath: str = dataset_dir, ext: str = ".json",
              show_path: bool = True) -> OrderedDict:
    """ Reads a JSON file as a Python OrderedDict.

    :param filename: Name of the file (without extension).
    :param filepath: Directory containing the file.
    :param ext: File extension to append to the filename.
    :param show_path: If True, log the full file path before reading.
    :return: File contents as an OrderedDict.
    """
    file_loc = join(filepath, filename + ext)
    if show_path:
        logger.debug("Reading JSON file: [{}]".format(file_loc))
    if exists(file_loc):
        try:
            with open(file_loc, encoding="utf-8") as file:
                json_dict = load(file)
                json_dict = OrderedDict(json_dict)
            return json_dict
        except Exception as e:
            logger.debug(
                "Could not open file as JSON: [{}]. \n Reason:[{}]".format(
                    file_loc, e))

            logger.warning("Reading JSON as STR: [{}]".format(file_loc))
            with open(file_loc, encoding="utf-8") as file:
                json_str = file.read()
                json_dict = loads(json_str)
            return json_dict
    else:
        raise FileNotFoundError("File not found at: [{}]".format(file_loc))
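A hypothetical usage sketch for load_json; the file name and directory are placeholders, not taken from the repository:

config = load_json('train_config', filepath='/path/to/dataset')
logger.debug(list(config.keys()))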
def create_tabular_dataset(
        csv_file: str,
        data_dir: str,
        fields=None,
        skip_header: bool = True) -> data.dataset.TabularDataset:
    """ Reads a csv file and returns it in TorchText TabularDataset format.

    Args:
        csv_file: Name of the csv file to read.
        data_dir: Directory containing the csv file.
        fields: Field definitions as (name, Field) pairs; defaults to
         prepare_fields().
        skip_header: Whether to skip the csv header row.

    Returns:
        A torchtext TabularDataset built from the csv file.
    """
    if fields is None:
        _, fields, unlabelled_fields = prepare_fields()

    dataset = data.TabularDataset(path=join(data_dir, csv_file),
                                  format='csv',
                                  fields=fields,
                                  skip_header=skip_header)

    logger.debug(vars(dataset.examples[0]))
    return dataset
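A minimal sketch of the fields argument that create_tabular_dataset accepts when the project's prepare_fields() helper is bypassed; the column names ('text', 'label') and the legacy torchtext import path are assumptions, not taken from the repository:

import torch
from torchtext.legacy import data  # older torchtext versions expose this as torchtext.data

TEXT = data.Field(batch_first=True, lower=True)
LABEL = data.LabelField(dtype=torch.float)
fields = [('text', TEXT), ('label', LABEL)]  # one (column_name, Field) pair per csv column

dataset = create_tabular_dataset('train.csv', data_dir='/path/to/data', fields=fields)
TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)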
def propagate_labels(
    features,
    labels,
):
    """ Spreads labels from labelled to unlabelled samples (marked with -1)
     over a graph constructed from the features. """
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    # logger.debug(label_prop_model.classes_)

    return preds
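A self-contained sketch of the scikit-learn LabelSpreading call made above, with the built-in 'knn' kernel standing in for the project's construct_graph kernel; the toy features and labels are made up. Unlabelled samples are marked with -1:

import numpy as np
from sklearn.semi_supervised import LabelSpreading

features = np.random.rand(10, 5)
labels = np.array([0, 1, -1, -1, 0, -1, 1, -1, -1, -1])  # -1 = unlabelled

model = LabelSpreading(kernel='knn', n_neighbors=3, n_jobs=-1)
model.fit(features, labels)
preds = model.predict_proba(features)  # shape: (10, n_classes)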
def create_dataset(examples, fields=None):
    """ Creates a TorchText Dataset from examples and field definitions.

    Args:
        examples: List of torchtext Example objects.
        fields: Field definitions as (name, Field) pairs; defaults to
         prepare_fields().

    Returns:
        A torchtext Dataset built from the examples.
    """
    if fields is None:
        _, fields, unlabelled_fields = prepare_fields()

    dataset = data.Dataset(examples=examples, fields=fields)

    logger.debug(vars(dataset.examples[0]))
    return dataset
Example no. 5
def logit2label(predictions_df: pd.core.frame.DataFrame,
                cls_thresh: [list, float],
                drop_irrelevant=False,
                return_df=False):
    """ Converts logits to multi-hot labels based on a threshold per class.

    :param predictions_df: Predictions as pd.DataFrame, np.ndarray or torch.Tensor
    :param cls_thresh: List of per-class threshold floats, or a single float
     applied to all classes
    :param drop_irrelevant: Remove samples for which no class crossed its
     threshold, i.e. [0., 0., 0., 0.]
    :param return_df: If True, return a pd.DataFrame; otherwise the raw array
    """
    if isinstance(predictions_df, pd.core.frame.DataFrame):
        logger.debug(
            (predictions_df.values.min(), predictions_df.values.max()))
        df_np = predictions_df.to_numpy()
    elif isinstance(predictions_df, (np.ndarray, torch.Tensor)):
        df_np = predictions_df
    else:
        raise NotImplementedError(
            f'Only supports pd.DataFrame or np.ndarray or '
            f'torch.Tensor but received [{type(predictions_df)}]')

    ## Create threshold list for all classes if only one threshold float is provided:
    if isinstance(cls_thresh, float):
        cls_thresh = [cls_thresh for i in range(df_np.shape[1])]

    for col in range(df_np.shape[1]):
        df_np[:, col][df_np[:, col] > cls_thresh[col]] = 1.
        df_np[:, col][df_np[:, col] <= cls_thresh[col]] = 0.

    if return_df:
        predictions_df = pd.DataFrame(df_np, index=predictions_df.index)

        if drop_irrelevant:
            # delete all rows where sum == 0
            irrelevant_rows = []
            for i, row in predictions_df.iterrows():
                if sum(row) < 1:
                    irrelevant_rows.append(i)

            predictions_df = predictions_df.drop(irrelevant_rows)
        return predictions_df
    else:
        return df_np
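A quick, made-up example of the conversion performed by logit2label; the probability values are illustrative only:

import numpy as np
import pandas as pd

probs = pd.DataFrame(np.array([[0.9, 0.2, 0.7, 0.1],
                               [0.3, 0.4, 0.2, 0.1]]))
multi_hot = logit2label(probs, cls_thresh=0.5, drop_irrelevant=True, return_df=True)
# Row 0 becomes [1., 0., 1., 0.]; row 1 is dropped because no class exceeds 0.5.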
Example no. 6
def classify(
    train_df=None,
    test_df=None,
    stoi=None,
    vectors=None,
    n_classes=cfg['data']['num_classes'],
    dim=cfg['embeddings']['emb_dim'],
    data_dir=dataset_dir,
    train_filename=cfg['data']['train'],
    test_filename=cfg['data']['test'],
    cls_thresh=None,
    epoch=cfg['training']['num_epoch'],
    num_layers=cfg['lstm_params']['num_layers'],
    num_hidden_nodes=cfg['lstm_params']['hid_size'],
    dropout=cfg['model']['dropout'],
    default_thresh=0.5,
    lr=cfg['model']['optimizer']['lr'],
    train_batch_size=cfg['training']['train_batch_size'],
    test_batch_size=cfg['training']['eval_batch_size'],
):
    """

    :param n_classes:
    :param test_batch_size:
    :param train_df:
    :param test_df:
    :param stoi:
    :param vectors:
    :param dim:
    :param data_dir:
    :param train_filename:
    :param test_filename:
    :param cls_thresh:
    :param epoch:
    :param num_layers:
    :param num_hidden_nodes:
    :param dropout:
    :param default_thresh:
    :param lr:
    :param train_batch_size:
    :return:
    """
    ## Prepare labelled source data:
    # logger.info('Prepare labelled source data')
    # if train_df is None:
    #     train_df = read_labelled_json(data_dir, train_filename)
    #     train_df = labels_mapper(train_df)
    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True)
    else:
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir,
            csv_file=train_dataname,
            min_freq=1,
            labelled_data=True,
            embedding_file=None,
            embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir,
        csv_file=test_dataname,  # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab,
                              num_hidden_nodes,
                              num_output_nodes,
                              dim,
                              num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trainable parameters
    logger.info('No. of trainable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'],
                       losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    predicted_labels = logit2label(DataFrame(
        val_preds_trues_best['preds'].cpu().numpy()),
                                   cls_thresh,
                                   drop_irrelevant=False)

    logger.info('Calculate performance')
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      val_preds_trues_best['preds'])

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
Example no. 7
def main(model_type='GNN',
         data_dir: str = dataset_dir,
         lr=cfg["model"]["optimizer"]["lr"],
         mittens_iter: int = 300,
         gcn_hops: int = 5,
         glove_embs=None,
         labelled_source_name: str = cfg['data']['train'],
         labelled_val_name: str = cfg['data']['val'],
         unlabelled_source_name: str = cfg["data"]["source"]['unlabelled'],
         labelled_target_name: str = cfg['data']['test'],
         unlabelled_target_name: str = cfg["data"]["target"]['unlabelled'],
         train_batch_size=cfg['training']['train_batch_size'],
         test_batch_size=cfg['training']['eval_batch_size'],
         use_lpa=False):
    logger.critical(f'Current Learning Rate: [{lr}]')
    labelled_source_path = join(data_dir, labelled_source_name)
    S_dataname = unlabelled_source_name + "_data.csv"
    T_dataname = unlabelled_target_name + "_data.csv"

    if exists(labelled_source_path + 'S_vocab.json')\
            and exists(labelled_source_path + 'T_vocab.json')\
            and exists(labelled_source_path + 'labelled_token2vec_map.json'):
        # ## Read labelled source data
        # s_lab_df = read_labelled_json(data_dir, labelled_source_name)
        # ## Match label space between two datasets:
        # if str(labelled_source_name).startswith('fire16'):
        #     s_lab_df = labels_mapper(s_lab_df)

        C_vocab = read_json(labelled_source_path + 'C_vocab')
        S_vocab = read_json(labelled_source_path + 'S_vocab')
        T_vocab = read_json(labelled_source_path + 'T_vocab')
        labelled_token2vec_map = read_json(labelled_source_path +
                                           'labelled_token2vec_map')

        if not exists(labelled_source_path + 'high_oov_freqs.json'):
            S_dataset, (S_fields,
                        LABEL) = get_dataset_fields(csv_dir=data_dir,
                                                    csv_file=S_dataname)
            T_dataset, (T_fields,
                        LABEL) = get_dataset_fields(csv_dir=data_dir,
                                                    csv_file=T_dataname)
    else:
        C_vocab, C_dataset, S_vocab, S_dataset, S_fields, T_vocab,\
        T_dataset, T_fields, labelled_token2vec_map, s_lab_df =\
            create_vocab(s_lab_df=None, data_dir=data_dir,
                         labelled_source_name=labelled_source_name,
                         unlabelled_source_name=unlabelled_source_name,
                         unlabelled_target_name=unlabelled_target_name)
        ## Save vocabs:
        save_json(C_vocab, labelled_source_path + 'C_vocab')
        save_json(S_vocab, labelled_source_path + 'S_vocab')
        save_json(T_vocab, labelled_source_path + 'T_vocab')
        save_json(labelled_token2vec_map,
                  labelled_source_path + 'labelled_token2vec_map')

    if glove_embs is None:
        glove_embs = glove2dict()
    if exists(labelled_source_path + 'high_oov_freqs.json')\
            and exists(labelled_source_path + 'corpus.json')\
            and exists(labelled_source_path + 'corpus_toks.json'):
        high_oov_freqs = read_json(labelled_source_path + 'high_oov_freqs')
        # low_glove_freqs = read_json(labelled_source_name+'low_glove_freqs')
        corpus = read_json(labelled_source_path + 'corpus',
                           convert_ordereddict=False)
        corpus_toks = read_json(labelled_source_path + 'corpus_toks',
                                convert_ordereddict=False)
    else:
        ## Get all OOV tokens which do not have a GloVe embedding:
        high_oov_freqs, low_glove_freqs, corpus, corpus_toks =\
            preprocess_and_find_oov(
                (S_dataset, T_dataset), C_vocab, glove_embs=glove_embs,
                labelled_vocab_set=set(labelled_token2vec_map.keys()))

        ## Save token sets: high_oov_freqs, low_glove_freqs, corpus, corpus_toks
        save_json(high_oov_freqs, labelled_source_path + 'high_oov_freqs')
        # save_json(low_glove_freqs, labelled_source_name+'low_glove_freqs', overwrite=True)
        save_json(corpus, labelled_source_path + 'corpus')
        save_json(corpus_toks, labelled_source_path + 'corpus_toks')
        save_json(C_vocab, labelled_source_path + 'C_vocab', overwrite=True)

    ## Read labelled datasets and prepare:
    logger.info('Read labelled datasets and prepare')
    train_dataset, val_dataset, test_dataset, train_vocab, val_vocab, test_vocab\
        = prepare_splitted_datasets()

    logger.info('Creating instance graphs')
    train_instance_graphs = Instance_Dataset_DGL(
        train_dataset,
        train_vocab,
        labelled_source_name,
        class_names=cfg['data']['class_names'])
    logger.debug(train_instance_graphs.num_labels)
    # logger.debug(train_instance_graphs.graphs, train_instance_graphs.labels)

    train_dataloader = DataLoader(
        train_instance_graphs,
        batch_size=train_batch_size,
        shuffle=True,
        collate_fn=train_instance_graphs.batch_graphs)

    logger.info(
        f"Number of training instance graphs: {len(train_instance_graphs)}")

    val_instance_graphs = Instance_Dataset_DGL(
        val_dataset,
        train_vocab,
        labelled_val_name,
        class_names=cfg['data']['class_names'])

    val_dataloader = DataLoader(val_instance_graphs,
                                batch_size=test_batch_size,
                                shuffle=True,
                                collate_fn=val_instance_graphs.batch_graphs)

    logger.info(
        f"Number of validating instance graphs: {len(val_instance_graphs)}")

    test_instance_graphs = Instance_Dataset_DGL(
        test_dataset,
        train_vocab,
        labelled_target_name,
        class_names=cfg['data']['class_names'])

    test_dataloader = DataLoader(test_instance_graphs,
                                 batch_size=test_batch_size,
                                 shuffle=True,
                                 collate_fn=test_instance_graphs.batch_graphs)

    logger.info(
        f"Number of testing instance graphs: {len(test_instance_graphs)}")

    # model_type = 'GAT'
    logger.info(f'Classifying graphs using {model_type} model.')
    if model_type == 'GAT':
        logger.info('Using GAT model')
        train_epochs_output_dict, test_output = GAT_multilabel_classification(
            train_dataloader,
            val_dataloader,
            test_dataloader,
            in_dim=cfg['embeddings']['emb_dim'],
            hid_dim=cfg['gnn_params']['hid_dim'],
            num_heads=cfg['gnn_params']['num_heads'],
            epochs=cfg['training']['num_epoch'],
            lr=lr)

    else:
        ## Create token graph:
        logger.info('Using GNN model and creating token graph:')
        g_ob = Token_Dataset_nx(corpus_toks,
                                C_vocab,
                                S_vocab,
                                T_vocab,
                                dataset_name=labelled_source_name)
        g_ob.add_edge_weights()
        G = g_ob.G
        num_tokens = g_ob.num_tokens

        node_list = list(G.nodes)
        logger.info(
            f"Number of nodes {len(node_list)} and edges {len(G.edges)} in token graph"
        )

        ## Create new embeddings for OOV tokens:
        oov_emb_filename = labelled_source_name + '_OOV_vectors_dict'
        if exists(join(data_dir, oov_emb_filename + '.pkl')):
            logger.info('Read OOV embeddings:')
            oov_embs = load_pickle(filepath=data_dir,
                                   filename=oov_emb_filename)
        else:
            logger.info('Create OOV embeddings using Mittens:')
            high_oov_tokens_list = list(high_oov_freqs.keys())
            c_corpus = corpus[0] + corpus[1]
            oov_mat_coo = calculate_cooccurrence_mat(high_oov_tokens_list,
                                                     c_corpus)
            oov_embs = train_mittens(oov_mat_coo,
                                     high_oov_tokens_list,
                                     glove_embs,
                                     max_iter=mittens_iter)
            save_pickle(oov_embs,
                        filepath=data_dir,
                        filename=oov_emb_filename,
                        overwrite=True)

        ## Get adjacency matrix and node embeddings in same order:
        logger.info('Accessing token adjacency matrix')
        ## Note: Saved sparse tensors often get corrupted.
        # adj_filename = join(data_dir, labelled_source_name + "_adj.pt")
        # if exists(adj_filename):
        #     adj = load(adj_filename)
        #     # adj = sp_coo2torch_coo(adj)
        # else:
        #     adj = adjacency_matrix(G, nodelist=node_list, weight='weight')
        #     adj = sp_coo2torch_coo(adj)
        #     save(adj, adj_filename)
        adj = adjacency_matrix(G, nodelist=node_list, weight='weight')
        adj = sp_coo2torch_coo(adj)

        logger.info('Accessing token graph node embeddings:')
        emb_filename = join(data_dir, labelled_source_name + "_emb.pt")
        if exists(emb_filename):
            X = load(emb_filename)
        else:
            logger.info('Get node embeddings from token graph:')
            X = g_ob.get_node_embeddings(oov_embs, glove_embs,
                                         C_vocab['idx2str_map'])
            # X = sp_coo2torch_coo(X)
            save(X, emb_filename)

        # logger.info('Applying GCN Forward old')
        # X_hat = GCN_forward_old(adj, X, forward=gcn_hops)
        # logger.info('Applying GCN Forward')
        # X_hat = GCN_forward(adj, X, forward=gcn_hops)

        ## Apply Label Propagation to get label vectors for unlabelled nodes:
        if use_lpa:
            logger.info('Getting propagated label vectors:')
            label_proba_filename = join(data_dir,
                                        labelled_source_name + "_lpa_vecs.pt")
            if exists(label_proba_filename):
                lpa_vecs = torch.load(label_proba_filename)
            else:
                all_node_labels, labelled_masks = fetch_all_nodes(
                    node_list,
                    labelled_token2vec_map,
                    C_vocab['idx2str_map'],
                    # default_fill=[0.])
                    default_fill=[0., 0., 0., 0.])
                lpa_vecs = label_propagation(adj, all_node_labels,
                                             labelled_masks)
                torch.save(lpa_vecs, label_proba_filename)

            logger.info('Recalculate edge weights using LPA vectors:')
            g_ob.normalize_edge_weights(lpa_vecs)

            adj = adjacency_matrix(g_ob.G, nodelist=node_list, weight='weight')
            adj = sp_coo2torch_coo(adj)

        ## Normalize Adjacency matrix:
        logger.info('Normalize token graph:')
        adj = g_ob.normalize_adj(adj)

        # ## Create label to propagated vector map:
        # logger.info('Create label to propagated vector map')
        # node_txt2label_vec = {}
        # for node_id in node_list:
        #     node_txt2label_vec[C_vocab['idx2str_map'][node_id]] =\
        #         lpa_vecs[node_id].tolist()
        # DataFrame.from_dict(node_txt2label_vec, orient='index').to_csv(labelled_source_name + 'node_txt2label_vec.csv')

        logger.info('Using GNN model')
        train_epochs_output_dict, test_output = GAT_GCN_trainer(
            adj,
            X,
            train_dataloader,
            val_dataloader,
            test_dataloader,
            num_tokens=num_tokens,
            in_feats=cfg['embeddings']['emb_dim'],
            hid_feats=cfg['gnn_params']['hid_dim'],
            num_heads=cfg['gnn_params']['num_heads'],
            epochs=cfg['training']['num_epoch'],
            lr=lr)

    # ## Propagating label vectors using GCN forward instead of LPA:
    # X_labels_hat = GCN_forward(adj, all_node_labels, forward=gcn_hops)
    # torch.save(X_labels_hat, 'X_labels_hat_05.pt')

    return C_vocab['str2idx_map']  # , X_hat