Example #1
def predict(self, trained_model_path, output_file_path, k=10, emb_saved=0, can_saved=0):
    # method of a model-solver class: relies on self.test_data, self.test(),
    # copy.deepcopy, and a load_pickle helper (a sketch of the latter follows below)
    print('generate X-embedding')
    if emb_saved:
        self.train_x_emb = load_pickle(trained_model_path + 'train_x_emb.pkl')
        self.test_x_emb = load_pickle(trained_model_path + 'test_x_emb.pkl')
    else:
        self.generate_x_embedding(trained_model_path)
    print('get candidate label from X-embedding')
    if can_saved:
        self.test_unique_candidate_label = load_pickle(trained_model_path + 'test_candidate_label.pkl')
    else:
        self.get_candidate_label_from_x_emb(trained_model_path, k)
    test_loader = copy.deepcopy(self.test_data)
    test_loader.candidate_label_data = self.test_unique_candidate_label
    test_loader.reset_data()
    print('begin predict')
    self.test(trained_model_path, output_file_path, test_loader)
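load_pickle is used throughout these examples but never defined in them; a minimal sketch of such a helper, assuming the .pkl files were written with pickle.dump:

import pickle

def load_pickle(path):
    # hypothetical helper assumed by the examples: read one pickled object from `path`
    with open(path, 'rb') as f:
        return pickle.load(f)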
Example #2
def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu', '--gpu', type=str, default='0', help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument('-folder', '--folder_path', type=str,
                       default='datasets/Amazon-670K/data/deeplearning_data/adjacent_labels/',
                       help='path to train/test data')
    # ---------- vocab and word embeddings --------
    #parse.add_argument('-vocab', '--vocab_path', type=str, default='vocab.6B.300d.pkl', help='path to the vocab')
    parse.add_argument('-word_embeddings', '--word_embedding_path', type=str,
                       default='word_emb.6B.300d.npy',
                       help='path to the word embeddings')
    # ---------- model ----------
    parse.add_argument('-max_seq_len', '--max_seq_len', type=int, default=500,
                       help='maximum sequence length')
    parse.add_argument('-model', '--model', type=str, default='CNN', help='model: LSTM, biLSTM, CNN')
    parse.add_argument('-pretrained_model', '--pretrained_model_path', type=str,
                       default=None, help='path to the pretrained model')
    parse.add_argument('-cal_metrics', '--cal_metrics', type=int, default=1,
                       help='whether to calculate wts_p and wts_ndcg for baseline results')
    parse.add_argument('-alpha', '--alpha', type=float, default=0.2,
                       help='trade-off parameter between the baseline score and the refinement score')
    # ---------- params for CNN ------------
    parse.add_argument('-num_filters', '--num_filters', type=int,
                       default=32, help='number of filters in CNN')
    parse.add_argument('-pooling_units', '--pooling_units', type=int,
                       default=64, help='number of pooling units')
    parse.add_argument('-dropout_keep_prob', '--dropout_keep_prob', type=float,
                       default=0.5, help='keep probability in dropout layer')
    filter_sizes = [2, 4, 8]
    # ---------- training parameters --------
    #parse.add_argument('-if_use_all_true', '--if_use_all_true', type=int, default=0, help='if use all true labels for training')
    parse.add_argument('-if_output_all_labels', '--if_output_all_labels', type=int, default=0, help='if output all labels')
    parse.add_argument('-n_epochs', '--n_epochs', type=int, default=10, help='number of epochs')
    parse.add_argument('-batch_size', '--batch_size', type=int, default=256, help='batch size for training')
    parse.add_argument('-batch_pid_size', '--batch_pid_size', type=int, default=5, help='batch pid size for testing')
    #parse.add_argument('-num_candidate', '--num_candidate', type=int, default=20, help='number of candidate labels')
    #parse.add_argument('-topk', '--topk', type=int, default=6, help='k in competitive layer')
    parse.add_argument('-show_batches', '--show_batches', type=int,
                       default=1000, help='report progress every this many batches')
    parse.add_argument('-lr', '--learning_rate', type=float, default=0.0001, help='learning rate')
    parse.add_argument('-update_rule', '--update_rule', type=str, default='adam', help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train', '--train', type=int, default=1, help='if training')
    parse.add_argument('-test', '--test', type=int, default=0, help='if testing')
    parse.add_argument('-predict', '--predict', type=int, default=0, help='if predicting')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('-------------- load vocab and word embeddings ---------------')
    word_embeddings = np.load('datasets/material/' + args.word_embedding_path)
    # word_embedding_dim
    word_embedding_dim = word_embeddings.shape[-1]
    print('word_embeddings shape: ' + str(word_embeddings.shape))
    # add '<PAD>' embedding
    word_embeddings = np.concatenate((np.zeros((1, word_embedding_dim)), word_embeddings), axis=0)
    print('after adding PAD embedding, word_embeddings shape: ' + str(word_embeddings.shape))
    print('-------------- load label embeddings ------------------------')
    all_labels, label_embeddings = generate_label_embedding_from_file_2(args.folder_path + 'label.embeddings')
    label_embeddings = np.array(label_embeddings)
    label_dict = dict(zip(all_labels, range(len(all_labels))))
    print('number of labels: ' + str(len(all_labels)))
    # label_embedding_dim
    label_embedding_dim = label_embeddings.shape[-1]  # width of one embedding; avoids indexing by a raw label id
    print('-------------- load label propensity ------------------------')
    label_prop = load_pickle(args.folder_path + 'inv_prop_dict.pkl')
    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')
    print('-------------- load candidate labels ------------------------')
    if 'sleec' in args.model:
        candidate_type = 'sleec'
    elif 'pfastrexml' in args.model:
        candidate_type = 'pfastrexml'
    elif 'pfastxml' in args.model:
        candidate_type = 'pfastxml'
    elif 'fastxml' in args.model:
        candidate_type = 'fastxml'
    else:
        # none of the candidate sources matched (true for the default model 'CNN');
        # fail early instead of hitting a NameError on the next line
        raise ValueError('model name must contain one of: sleec, pfastrexml, pfastxml, fastxml')
    print('candidate from: ' + candidate_type)
    candidate_folder_path = args.folder_path + candidate_type + '_candidate/'
    train_candidate_label = load_pickle(candidate_folder_path + 'train_candidate_label.pkl')
    test_candidate_label = load_pickle(candidate_folder_path + 'test_candidate_label.pkl')
    print('============== create train/test data loader ...')
    if 'XML' not in args.model:
        train_loader = TrainDataLoader2(train_doc, train_label, train_candidate_label, label_dict, label_prop,
                                        10, 10, max_seq_len=args.max_seq_len)
        max_seq_len = train_loader.max_seq_len
        #max_seq_len = args.max_seq_len
        #train_loader = {}
        print('max_seq_len: ' + str(max_seq_len))
        test_loader = TestDataLoader2(test_doc, test_label, test_candidate_label, label_dict, label_prop,
                                      max_seq_len=max_seq_len, if_cal_metrics=args.cal_metrics)
        # test_loader = DataLoader3(test_doc, test_label, test_candidate_label, label_dict, args.batch_size,
        #                           given_seq_len=True, max_seq_len=max_seq_len)
    # ----------------------- train ------------------------
    print('============== build model ...')
    if 'biLSTM' in args.model:
        print('build biLSTM model ...')
        # biLSTM: max_seq_len, word_embedding_dim, hidden_dim, label_embedding_dim, num_classify_hidden, args.batch_size
        model = biLSTM(max_seq_len, word_embedding_dim, 64, label_embedding_dim, 32, args)
        args.if_use_seq_len = 1
    elif 'LSTM' in args.model:
        print('build LSTM model ...')
        # LSTM: max_seq_len, word_embedding_dim, hidden_dim, label_embedding_dim, num_classify_hidden, args.batch_size
        model = LSTM(max_seq_len, word_embedding_dim, 64, label_embedding_dim, 32, args)
        args.if_use_seq_len = 1
    elif 'CNN' in args.model:
        print('build CNN model ...')
        # CNN: sequence_length, word_embeddings, filter_sizes, label_embeddings, num_classify_hidden, args
        # args.num_filters, args.pooling_units, args.batch_size, args.dropout_keep_prob
        model = CNN(max_seq_len, word_embeddings, filter_sizes, label_embeddings, 32, args)
        args.if_use_seq_len = 0
    elif 'XML' in args.model:
        print('build XML-CNN model ...')
        train_loader = DataLoader5(train_doc, train_label, all_labels,
                                   args.batch_size,
                                   given_seq_len=False, max_seq_len=args.max_seq_len)
        max_seq_len = train_loader.max_seq_len
        test_loader = DataLoader5(test_doc, test_label, all_labels,
                                  args.batch_size,
                                  given_seq_len=True, max_seq_len=max_seq_len)
        # max_seq_len, word_embedding, filter_sizes, label_output_dim, hidden_dim, args
        # args.num_filters, args.pooling_units, args.batch_size, args.dropout_keep_prob
        model = XML_CNN(max_seq_len, word_embeddings, filter_sizes, len(all_labels), 128, args)
        args.if_output_all_labels = 1
        args.if_use_seq_len = 0

    print('================= model solver ...')
    # solver: __init__(self, model, train_data, test_data, **kwargs):
    solver = ModelSolver(model, train_loader, test_loader,
                          if_use_seq_len=args.if_use_seq_len,
                          if_output_all_labels=args.if_output_all_labels,
                          show_batches=args.show_batches,
                          n_epochs=args.n_epochs,
                          batch_size=args.batch_size,
                          update_rule=args.update_rule,
                          learning_rate=args.learning_rate,
                          pretrained_model=args.pretrained_model_path,
                          model_path=args.folder_path + args.model + '/',
                          test_path=args.folder_path + args.model + '/')
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')

    # test
    if args.test:
        print('================= begin testing...')
        solver.test(args.folder_path + args.model + '/' + args.pretrained_model_path, args.folder_path + args.model + '/test_outcome.txt')

    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path+'model_save/'+args.model+'/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path+'predict_outcome.txt',
                       k=10, emb_saved=1, can_saved=1)
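Note that the -model flag does double duty in this script: the 'CNN'/'LSTM'/'biLSTM'/'XML' checks pick the network, while the 'sleec'/'pfastrexml'/'pfastxml'/'fastxml' checks pick the candidate-label source, so a model name has to encode both. A minimal standalone sketch of that convention (the name 'fastxml_CNN' is a hypothetical illustration):

import argparse

parse = argparse.ArgumentParser()
parse.add_argument('-model', '--model', type=str, default='CNN')
# an explicit argv list exercises the parser without touching sys.argv
args = parse.parse_args(['-model', 'fastxml_CNN'])
print('fastxml' in args.model, 'CNN' in args.model)  # True True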
Example #3
def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu', '--gpu', type=str, default='0', help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument('-valid_labels', '--valid_labels', type=int,
                       default=0, help='whether to remove invalid labels')
    parse.add_argument('-data', '--data', type=str, default='eurlex',
                       help='dataset')
    # ---------- model ----------
    parse.add_argument('-word_embedding_dim', '--word_embedding_dim', type=int, default=100, help='dim of word embedding')
    parse.add_argument('-aggr_type', '--aggr_type', type=str, default='sum', help='aggregation type in embedding layer')
    parse.add_argument('-vocab_size', '--vocab_size', type=int, default=5000, help='vocabulary size')
    parse.add_argument('-max_seq_len', '--max_seq_len', type=int, default=500, help='maximum sequence length')
    parse.add_argument('-model', '--model', type=str, default='NN', help='model: NN, LSTM, biLSTM, CNN')
    parse.add_argument('-pretrained_model', '--pretrained_model_path', type=str, default=None, help='path to the pretrained model')
    parse.add_argument('-dropout_keep_prob', '--dropout_keep_prob', type=float,
                       default=0.5, help='keep probability in dropout layer')
    parse.add_argument('-saved_dis', '--saved_dis', type=int, default=0,
                       help='whether to use saved dis matrix')
    parse.add_argument('-use_sne', '--use_sne', type=int, default=1,
                       help='whether to use sne regularization')
    parse.add_argument('-ac_lbl_ratio', '--ac_lbl_ratio', type=float, default=0.5,
                       help='ratio of active labels in sne regularization')
    parse.add_argument('-use_propensity', '--use_propensity', type=int, default=1,
                       help='whether to use propensity loss')
    parse.add_argument('-use_bi_inter', '--use_bi_inter', type=int, default=0,
                       help='whether to add bi-interactive layer')
    parse.add_argument('-use_comp', '--use_comp', type=int, default=0,
                       help='whether to add competitive layer')
    parse.add_argument('-topk', '--topk', type=int, default=10,
                       help='top k neurons in competitive layer')
    parse.add_argument('-factor', '--factor', type=float, default=0.01,
                       help='factor in competitive layer')
    parse.add_argument('-lamb', '--lamb', type=float, default=0.002,
                       help='lambda for weight regularization')
    # ---------- training parameters --------
    parse.add_argument('-n_epochs', '--n_epochs', type=int, default=10, help='number of epochs')
    parse.add_argument('-batch_size', '--batch_size', type=int, default=32, help='batch size for training')
    parse.add_argument('-lr', '--learning_rate', type=float, default=0.002, help='learning rate')
    parse.add_argument('-update_rule', '--update_rule', type=str, default='adam', help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train', '--train', type=int, default=1, help='if training')
    parse.add_argument('-test', '--test', type=int, default=0, help='if testing')
    parse.add_argument('-predict', '--predict', type=int, default=0, help='if predicting')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('-------------- load labels ------------------------')
    # default='datasets/eurlex/trn_tst_data/'
    if args.valid_labels:
        args.folder_path = 'datasets/' + args.data + '/trn_tst_data/valid_label_data/'
    else:
        args.folder_path = 'datasets/' + args.data + '/trn_tst_data/all_label_data/'
    #label_prop_dict = load_pickle(args.folder_path + 'inv_prop_dict.pkl')
    label_prop = load_pickle(args.folder_path + 'inv_prop.pkl')
    label_dict = load_pickle(args.folder_path + 'label_dict.pkl')
    all_labels = load_pickle(args.folder_path + 'index_label.pkl')
    num_labels = len(all_labels)
    print('number of labels: ' + str(num_labels))
    print('maximum label: ' + str(np.max(all_labels)))
    print('minimum label: ' + str(np.min(all_labels)))
    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')
    print('============== create train/test data loader ...')
    train_loader = DataLoader_all(train_doc, train_label, label_dict, label_prop,
                                  batch_size=args.batch_size, max_seq_len=args.max_seq_len, ac_lbl_ratio=args.ac_lbl_ratio, folder_path=args.folder_path)
    test_loader = DataLoader_all(test_doc, test_label, label_dict, label_prop,
                                 batch_size=args.batch_size, max_seq_len=args.max_seq_len, folder_path=args.folder_path)
    print('============== build model ...')
    print('build NN model ...')
    model = NN(args.max_seq_len, args.vocab_size, args.word_embedding_dim, num_labels, label_prop, 32, args)
    args.if_use_seq_len = 1

    print('================= model solver ...')
    solver = ModelSolver(model, train_loader, test_loader,
                         n_epochs=args.n_epochs,
                         batch_size=args.batch_size,
                         update_rule=args.update_rule,
                         learning_rate=args.learning_rate,
                         pretrained_model=args.pretrained_model_path,
                         model_path=args.folder_path + args.model + '/',
                         log_path=args.folder_path + args.model + '/',
                         test_path=args.folder_path + args.model + '/',
                         use_sne=args.use_sne,
                         saved_dis=args.saved_dis
                         )
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')

    # test
    if args.test:
        print('================= begin testing...')
        solver.test(args.folder_path + args.model + '/' + args.pretrained_model_path, args.folder_path + args.model + '/test_outcome.txt')

    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path+'model_save/'+args.model+'/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path+'predict_outcome.txt',
                       k=10, emb_saved=1, can_saved=1)
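A footgun shared by these scripts: running with -test 1 while leaving -pretrained_model at its default None makes the path concatenation passed to solver.test raise a TypeError. A defensive guard for the test branch (a sketch, not part of the original code):

    if args.test:
        if args.pretrained_model_path is None:
            # fail with a clear message instead of a TypeError on None + str
            raise ValueError('-pretrained_model is required when -test 1')
        solver.test(args.folder_path + args.model + '/' + args.pretrained_model_path,
                    args.folder_path + args.model + '/test_outcome.txt')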
Example #4
def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu',
                       '--gpu',
                       type=str,
                       default='0',
                       help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument(
        '-folder',
        '--folder_path',
        type=str,
        default='datasets/EUR-Lex/data/deeplearning_data/xml_data/',
        help='path to train/test data')
    # ---------- vocab and word embeddings --------
    parse.add_argument('-word_embeddings',
                       '--word_embedding_path',
                       type=str,
                       default='word_emb.6B.300d.npy',
                       help='path to the word embeddings')
    parse.add_argument('-word_embedding_dim',
                       '--word_embedding_dim',
                       type=int,
                       default=100,
                       help='dim of word embedding')
    # ---------- model ----------
    parse.add_argument('-max_seq_len',
                       '--max_seq_len',
                       type=int,
                       default=500,
                       help='maximum sequence length')
    parse.add_argument('-model',
                       '--model',
                       type=str,
                       default='NN',
                       help='model: NN, LSTM, biLSTM, CNN')
    parse.add_argument('-use_attention',
                       '--use_attention',
                       type=int,
                       default=1,
                       help='whether to use attention')
    parse.add_argument('-pretrained_model',
                       '--pretrained_model_path',
                       type=str,
                       default=None,
                       help='path to the pretrained model')
    parse.add_argument(
        '-cal_metrics',
        '--cal_metrics',
        type=int,
        default=1,
        help='whether to calculate wts_p and wts_ndcg for baseline results')
    parse.add_argument(
        '-alpha',
        '--alpha',
        type=float,
        default=1,
        help='trade-off parameter between the baseline score and the refinement score')
    #
    parse.add_argument('-active_feature_num',
                       '--active_feature_num',
                       type=int,
                       default=100,
                       help='number of active features for each label')
    # --- graph ---
    parse.add_argument('-use_graph',
                       '--use_graph',
                       type=int,
                       default=1,
                       help='whether to use the graph for joint training')
    parse.add_argument(
        '-neg_samp',
        '--neg_samp',
        type=int,
        default=1,
        help='whether to use negative sampling in the graph embedding network')
    parse.add_argument(
        '-random_label_embedding',
        '--random_label_embedding',
        type=int,
        default=0,
        help='initialize label embeddings using deepwalk or random initializer'
    )
    parse.add_argument(
        '-fix_label_embedding',
        '--fix_label_embedding',
        type=int,
        default=1,
        help=
        'fix label embedding or train label embedding based on initial value')
    # ---------- params for CNN ------------
    parse.add_argument('-num_filters',
                       '--num_filters',
                       type=int,
                       default=32,
                       help='number of filters in CNN')
    parse.add_argument('-pooling_units',
                       '--pooling_units',
                       type=int,
                       default=64,
                       help='number of pooling units')
    parse.add_argument('-dropout_keep_prob',
                       '--dropout_keep_prob',
                       type=float,
                       default=0.5,
                       help='keep probability in dropout layer')
    filter_sizes = [2, 4, 8]
    # ---------- training parameters --------
    parse.add_argument('-if_output_all_labels',
                       '--if_output_all_labels',
                       type=int,
                       default=0,
                       help='if output all labels')
    parse.add_argument('-n_epochs',
                       '--n_epochs',
                       type=int,
                       default=10,
                       help='number of epochs')
    parse.add_argument('-batch_size',
                       '--batch_size',
                       type=int,
                       default=32,
                       help='batch size for training')
    parse.add_argument('-batch_pid_size',
                       '--batch_pid_size',
                       type=int,
                       default=4,
                       help='batch pid size for testing')
    parse.add_argument('-show_batches',
                       '--show_batches',
                       type=int,
                       default=500,
                       help='report progress every this many batches')
    parse.add_argument('-lr',
                       '--learning_rate',
                       type=float,
                       default=0.0001,
                       help='learning rate')
    parse.add_argument('-update_rule',
                       '--update_rule',
                       type=str,
                       default='adam',
                       help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train',
                       '--train',
                       type=int,
                       default=1,
                       help='if training')
    parse.add_argument('-test',
                       '--test',
                       type=int,
                       default=0,
                       help='if testing')
    parse.add_argument('-predict',
                       '--predict',
                       type=int,
                       default=0,
                       help='if predicting')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    # print '-------------- load vocab and word embeddings ---------------'
    # word_embeddings = np.load('datasets/material/' + args.word_embedding_path)
    # # word_embedding_dim
    # word_embedding_dim = word_embeddings.shape[-1]
    # print 'word_embeddings shape: ' + str(word_embeddings.shape)
    # # add '<PAD>' embedding
    # word_embeddings = np.concatenate((np.zeros((1, word_embedding_dim)), word_embeddings), axis=0)
    # print 'after add PAD embedding, word_embeddings shape:' + str(word_embeddings.shape)
    print('-------------- load label embeddings ------------------------')
    all_labels, label_embeddings = generate_label_embedding_from_file_2(
        args.folder_path + 'label.embeddings')
    label_embeddings = np.array(label_embeddings)
    label_dict = dict(zip(all_labels, range(len(all_labels))))
    print('number of labels: ' + str(len(all_labels)))
    # label_embedding_dim
    label_embedding_dim = label_embeddings.shape[-1]  # width of one embedding; avoids indexing by a raw label id
    print('-------------- load label propensity ------------------------')
    label_prop = load_pickle(args.folder_path + 'inv_prop_dict.pkl')
    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')
    print('-------------- load candidate labels ------------------------')
    if 'sleec' in args.model:
        candidate_type = 'sleec'
    elif 'pfastrexml' in args.model:
        candidate_type = 'pfastrexml'
    elif 'pfastxml' in args.model:
        candidate_type = 'pfastxml'
    elif 'fastxml' in args.model:
        candidate_type = 'fastxml'
    else:
        # fail early (true for the default model 'NN') instead of a NameError below
        raise ValueError('model name must contain one of: sleec, pfastrexml, pfastxml, fastxml')
    print('candidate from: ' + candidate_type)
    candidate_folder_path = args.folder_path + candidate_type + '_candidate/'
    train_candidate_label = load_pickle(candidate_folder_path +
                                        'train_candidate_label.pkl')
    test_candidate_label = load_pickle(candidate_folder_path +
                                       'test_candidate_label.pkl')
    print('============== create train/test data loader ...')
    if 'XML' not in args.model:
        feature_processor = FeatureProcessor(5000, 100)
        train_loader = TrainDataLoader_final(train_doc,
                                             train_label,
                                             feature_processor,
                                             train_candidate_label,
                                             label_dict,
                                             label_prop,
                                             16,
                                             16,
                                             max_seq_len=args.max_seq_len)
        max_seq_len = train_loader.max_seq_len
        print('max_seq_len: ' + str(max_seq_len))
        test_loader = TestDataLoader_final(test_doc,
                                           test_label,
                                           feature_processor,
                                           test_candidate_label,
                                           label_dict,
                                           label_prop,
                                           max_seq_len=max_seq_len)
        if args.use_graph:
            graph = read_label_pairs(args.folder_path + 'labels.edgelist')
            graph_loader = DataLoader_graph(graph, label_dict)
        else:
            graph_loader = None
    # ----------------------- train ------------------------
    print('============== build model ...')
    if 'NN' in args.model:
        print('build NN model ...')
        model = NN_graph2(max_seq_len, 5000, args.word_embedding_dim,
                          label_embeddings, 32, args)
        args.if_use_seq_len = 1

    print('================= model solver ...')
    # solver: __init__(self, model, train_data, test_data, **kwargs):
    solver = ModelSolver2(model,
                          train_loader,
                          test_loader,
                          feature_processor,
                          graph_loader,
                          if_use_seq_len=args.if_use_seq_len,
                          if_output_all_labels=args.if_output_all_labels,
                          show_batches=args.show_batches,
                          n_epochs=args.n_epochs,
                          batch_size=args.batch_size,
                          update_rule=args.update_rule,
                          learning_rate=args.learning_rate,
                          pretrained_model=args.pretrained_model_path,
                          model_path=args.folder_path + args.model + '/',
                          log_path=args.folder_path + args.model + '/',
                          test_path=args.folder_path + args.model + '/',
                          use_graph=args.use_graph)
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')

    # test
    if args.test:
        print('================= begin testing...')
        solver.test(
            args.folder_path + args.model + '/' + args.pretrained_model_path,
            args.folder_path + args.model + '/test_outcome.txt')

    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path + 'model_save/' + args.model + '/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path + 'predict_outcome.txt',
                       k=10,
                       emb_saved=1,
                       can_saved=1)
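read_label_pairs is another helper these examples assume; a minimal sketch, assuming labels.edgelist stores one edge per line as two whitespace-separated integer label ids (the exact file format is an assumption):

def read_label_pairs(path):
    # hypothetical reader for the 'labels.edgelist' file used above
    pairs = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                pairs.append((int(parts[0]), int(parts[1])))
    return pairs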
Example #5
def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu',
                       '--gpu',
                       type=str,
                       default='0',
                       help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument('-folder',
                       '--folder_path',
                       type=str,
                       default='datasets/EUR-Lex/trn_tst_data/',
                       help='path to train/test data')
    # ---------- vocab and word embeddings --------
    parse.add_argument('-word_embedding_dim',
                       '--word_embedding_dim',
                       type=int,
                       default=100,
                       help='dim of word embedding')
    # ---------- model ----------
    parse.add_argument('-max_seq_len',
                       '--max_seq_len',
                       type=int,
                       default=500,
                       help='maximum sequence length')
    parse.add_argument('-model',
                       '--model',
                       type=str,
                       default='NN',
                       help='model: NN, LSTM, biLSTM, CNN')
    parse.add_argument('-use_attention',
                       '--use_attention',
                       type=int,
                       default=1,
                       help='whether to use attention')
    parse.add_argument('-pretrained_model',
                       '--pretrained_model_path',
                       type=str,
                       default=None,
                       help='path to the pretrained model')
    parse.add_argument('-dropout_keep_prob',
                       '--dropout_keep_prob',
                       type=float,
                       default=0.5,
                       help='keep probability in dropout layer')
    # ---------- training parameters --------
    parse.add_argument('-n_epochs',
                       '--n_epochs',
                       type=int,
                       default=10,
                       help='number of epochs')
    parse.add_argument('-batch_size',
                       '--batch_size',
                       type=int,
                       default=32,
                       help='batch size for training')
    parse.add_argument('-lr',
                       '--learning_rate',
                       type=float,
                       default=0.0001,
                       help='learning rate')
    parse.add_argument('-update_rule',
                       '--update_rule',
                       type=str,
                       default='adam',
                       help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train',
                       '--train',
                       type=int,
                       default=1,
                       help='if training')
    parse.add_argument('-test',
                       '--test',
                       type=int,
                       default=0,
                       help='if testing')
    parse.add_argument('-predict',
                       '--predict',
                       type=int,
                       default=0,
                       help='if predicting')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('-------------- load labels ------------------------')
    label_prop_dict = load_pickle(args.folder_path + 'inv_prop_dict.pkl')
    all_labels = list(label_prop_dict.keys())  # list() so np.max/np.min work on Python 3
    num_labels = np.max(all_labels) + 1
    label_prop = np.zeros(num_labels)
    for i in range(num_labels):  # range, not the Python-2-only xrange
        try:
            label_prop[i] = label_prop_dict[i]
        except KeyError:
            # labels absent from the dict fall back to a propensity of 1.0
            label_prop[i] = 1.0
    print('real number of labels: ' + str(len(all_labels)))
    print('maximum label: ' + str(np.max(all_labels)))
    print('minimum label: ' + str(np.min(all_labels)))
    print('number of labels: ' + str(num_labels))
    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')
    print('============== create train/test data loader ...')
    train_loader = DataLoader_all(train_doc,
                                  train_label,
                                  num_labels,
                                  label_prop_dict,
                                  batch_size=args.batch_size,
                                  max_seq_len=args.max_seq_len)
    test_loader = DataLoader_all(test_doc,
                                 test_label,
                                 num_labels,
                                 label_prop_dict,
                                 batch_size=args.batch_size,
                                 max_seq_len=args.max_seq_len)
    # ----------------------- train ------------------------
    print('============== build model ...')
    if 'NN' in args.model:
        print('build NN model ...')
        model = NN(args.max_seq_len, 5000, args.word_embedding_dim, num_labels,
                   label_prop, 32, args)
        args.if_use_seq_len = 1

    print('================= model solver ...')
    # solver: __init__(self, model, train_data, test_data, **kwargs):
    solver = ModelSolver3(model,
                          train_loader,
                          test_loader,
                          n_epochs=args.n_epochs,
                          batch_size=args.batch_size,
                          update_rule=args.update_rule,
                          learning_rate=args.learning_rate,
                          pretrained_model=args.pretrained_model_path,
                          model_path=args.folder_path + args.model + '/',
                          log_path=args.folder_path + args.model + '/',
                          test_path=args.folder_path + args.model + '/')
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')

    # test
    if args.test:
        print('================= begin testing...')
        solver.test(
            args.folder_path + args.model + '/' + args.pretrained_model_path,
            args.folder_path + args.model + '/test_outcome.txt')

    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path + 'model_save/' + args.model + '/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path + 'predict_outcome.txt',
                       k=10,
                       emb_saved=1,
                       can_saved=1)
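The try/except loop in this example densifies the propensity dict with a 1.0 fallback for missing labels; dict.get expresses the same default more compactly (an equivalent sketch):

import numpy as np

def densify_propensities(label_prop_dict, num_labels, default=1.0):
    # same behavior as the loop above: labels absent from the dict get `default`
    return np.array([label_prop_dict.get(i, default) for i in range(num_labels)])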