Example #1
 def create_model(self, sess, config):
     text_cnn = TextCNN(config)
     saver = tf.train.Saver()
     if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
         print("Restoring Variables from Checkpoint.")
         saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
         if FLAGS.decay_lr_flag:
             for i in range(2):  # decay learning rate if necessary.
                 print(i, "Going to decay learning rate by half.")
                 sess.run(text_cnn.learning_rate_decay_half_op)
     else:
         print('Initializing Variables')
         sess.run(tf.global_variables_initializer())
         if not os.path.exists(FLAGS.ckpt_dir):
             os.makedirs(FLAGS.ckpt_dir)
         if FLAGS.use_pretrained_embedding:  # load the pretrained word vectors
             print("===>>>going to use pretrained word embeddings...")
             old_emb_matrix = sess.run(text_cnn.Embedding.read_value())
             new_emb_matrix = load_word_embedding(old_emb_matrix,
                                                  FLAGS.word2vec_model_path,
                                                  FLAGS.embed_size,
                                                  self.index_to_word)
             word_embedding = tf.constant(new_emb_matrix,
                                          dtype=tf.float32)  # convert to a tensor
             t_assign_embedding = tf.assign(
                 text_cnn.Embedding,
                 word_embedding)  # assign word_embedding to text_cnn.Embedding
             sess.run(t_assign_embedding)
             print("using pre-trained word emebedding.ended...")
     return text_cnn, saver
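In this example, load_word_embedding takes the randomly initialized embedding matrix, a word2vec model path, the embedding size, and an id-to-word map, and returns a new matrix that is assigned back to text_cnn.Embedding. For orientation only, here is a minimal sketch of a loader with that behaviour; it assumes a binary word2vec file readable by gensim and that index_to_word maps row ids to words, and it is not the project's actual utils code.

import numpy as np
from gensim.models import KeyedVectors

def load_word_embedding(old_emb_matrix, word2vec_model_path, embed_size, index_to_word):
    # Hypothetical sketch: overwrite rows of the randomly initialized matrix with
    # pretrained vectors; words missing from the word2vec model keep their random init.
    w2v = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
    new_emb_matrix = np.array(old_emb_matrix, dtype=np.float32)
    for idx, word in index_to_word.items():
        if word in w2v:
            new_emb_matrix[idx] = w2v[word][:embed_size]
    return new_emb_matrix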
Example #2
    test_file = '../data/SICK/SICK_test_annotated.txt'

    train = utils.load_SICK(train_file)
    dev = utils.load_SICK(dev_file)
    test = utils.load_SICK(test_file)

# get emb data
data = train + dev + test
sentences = []
for x in data:
    sentences.append(x[0])
    sentences.append(x[1])

idf_weight = utils.idf_calculator(sentences)
w2i = {w: i for i, w in enumerate(idf_weight.keys())}
emb = utils.load_word_embedding(w2i, utils.my_emb_rep['paragram300'], 300)

# params
params = utils.Params()
params.memsize = 50
params.minval = 0
params.maxval = 5
params.nout = params.maxval - params.minval + 1
params.LW = 1e-03
params.LC = 1e-05
params.learner = lasagne.updates.adam
params.batchsize = 50
params.dim = 300
params.eta = 0.01
params.clip = None
# params.hid_size = 300
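For orientation, utils.idf_calculator above presumably maps each token to an inverse-document-frequency weight computed over all train/dev/test sentences, and its keys then define the vocabulary w2i. A hedged sketch of that computation (whitespace tokenisation and the plain log formula are assumptions, not the project's code):

import math
from collections import Counter

def idf_calculator(sentences):
    # document frequency: the number of sentences each token appears in
    df = Counter()
    for sent in sentences:
        df.update(set(sent.split()))
    n_docs = len(sentences)
    return {word: math.log(n_docs / freq) for word, freq in df.items()}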
Example #3
    comp_filename = 'dataset/all.bin'
    train_filename = 'dataset/train.bin'
    test_filename = 'dataset/test.bin'
    dev_filename = 'dataset/dev.bin'
    embedding_filename = 'dataset/word_embedding.txt'
    sem_embed_filename = 'dataset/sememe_vector.txt'
    logdir_name = 'phrase_sim/SCMSA'

    # load hownet and split hownet.comp into test_set and train_set
    hownet = utils.Hownet(hownet_file=hownet_filename, comp_file=comp_filename)
    hownet.build_hownet()
    hownet.token2id()
    hownet.load_split_dataset(train_filename=train_filename,
                              test_filename=test_filename,
                              dev_filename=dev_filename)
    word_embedding_np, hownet = utils.load_word_embedding(
        embedding_filename, hownet, scale=False)  # load word embedding
    sememe_embedding_np = utils.load_sememe_embedding(
        sem_embed_filename, hownet, scale=True)  # load sememe embedding
    hownet, wordsim_words = utils.fliter_wordsim_all(
        hownet)  # remove MWEs in testset
    train_num = len(hownet.comp_train)
    pos_dict, word_remove = utils.load_hownet_pos()
    hownet, cls_dict = utils.divide_data_with_pos(pos_dict, hownet)
    print("number of dataset in training set:{}".format(len(
        hownet.comp_train)))
    print("number of dataset in test set:{}".format(len(hownet.comp_test)))
    print("number of dataset in dev set:{}".format(len(hownet.comp_dev)))

    if not os.path.exists(logdir_name):
        os.makedirs(logdir_name)
        os.makedirs(os.path.join(logdir_name, 'print_files'))
Example #4
    def __init__(self,
                 embedding_dim=100,
                 batch_size=64,
                 n_hidden=100,
                 learning_rate=0.01,
                 n_class=3,
                 max_sentence_len=50,
                 l2_reg=0.,
                 display_step=4,
                 n_iter=100,
                 type_=''):
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_class = n_class
        self.max_sentence_len = max_sentence_len
        self.l2_reg = l2_reg
        self.display_step = display_step
        self.n_iter = n_iter
        self.type_ = type_
        self.word_id_mapping, self.w2v = load_word_embedding(
            FLAGS.word_id_file_path, FLAGS.embedding_file_path,
            self.embedding_dim)
        # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
        self.word_embedding = tf.Variable(self.w2v,
                                          dtype=tf.float32,
                                          name='word_embedding')
        # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
        # self.word_embedding = tf.Variable(
        #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
        self.aspect_id_mapping, self.aspect_embed = load_aspect2id(
            FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v,
            self.embedding_dim)
        self.aspect_embedding = tf.Variable(self.aspect_embed,
                                            dtype=tf.float32,
                                            name='aspect_embedding')

        self.keep_prob1 = tf.placeholder(tf.float32)
        self.keep_prob2 = tf.placeholder(tf.float32)
        with tf.name_scope('inputs'):
            self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len],
                                    name='x')
            self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')
            self.sen_len = tf.placeholder(tf.int32, None, name='sen_len')
            self.aspect_id = tf.placeholder(tf.int32, None, name='aspect_id')

        with tf.name_scope('weights'):
            self.weights = {
                'softmax':
                tf.get_variable(
                    name='softmax_w',
                    shape=[self.n_hidden, self.n_class],
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
            }

        with tf.name_scope('biases'):
            self.biases = {
                'softmax':
                tf.get_variable(
                    name='softmax_b',
                    shape=[self.n_class],
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
            }

        self.W = tf.get_variable(
            name='W',
            shape=[
                self.n_hidden + self.embedding_dim,
                self.n_hidden + self.embedding_dim
            ],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.w = tf.get_variable(
            name='w',
            shape=[self.n_hidden + self.embedding_dim, 1],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.Wp = tf.get_variable(
            name='Wp',
            shape=[self.n_hidden, self.n_hidden],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.Wx = tf.get_variable(
            name='Wx',
            shape=[self.n_hidden, self.n_hidden],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
Example #5
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]
    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    ##
#    target_map = {c:i for i, c in enumerate(['null', 'true'])}
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    print('Preprocessing done! Vocab size: {}'.format(len(feature_map)))
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = SeqTrainer(args, model, criterion)
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
    
    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()
    
    
    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
    
        # update lr
        trainer.lr_step()
           
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss))
    
        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
Example #6
    def __init__(self,
                 embedding_dim=100,
                 batch_size=64,
                 n_hidden=100,
                 learning_rate=0.01,
                 n_class=3,
                 max_sentence_len=50,
                 l2_reg=0.,
                 display_step=4,
                 n_iter=100,
                 type_=''):
        self.embedding_dim = embedding_dim  #300
        self.batch_size = batch_size  #25
        self.n_hidden = n_hidden  #300
        self.learning_rate = learning_rate  #0.01
        self.n_class = n_class  #3
        self.max_sentence_len = max_sentence_len  #80
        self.l2_reg = l2_reg  #0.001
        self.display_step = display_step  #4
        self.n_iter = n_iter  #20
        self.type_ = type_  #AT
        self.word_id_mapping, self.w2v = load_word_embedding(
            FLAGS.word_id_file_path, FLAGS.embedding_file_path,
            self.embedding_dim)
        # dict(3909)  3910 * 300  (word->id file path, word-embedding file path, embedding dim: 300)
        # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
        self.word_embedding = tf.Variable(
            self.w2v, dtype=tf.float32,
            name='word_embedding')  # define the word_embedding variable
        # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
        # self.word_embedding = tf.Variable(
        #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
        self.aspect_id_mapping, self.aspect_embed = load_aspect2id(
            FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v,
            self.embedding_dim)
        # dict(1219)            1220 * 300
        self.aspect_embedding = tf.Variable(
            self.aspect_embed, dtype=tf.float32,
            name='aspect_embedding')  # define the aspect_embedding variable

        # define the dropout placeholders
        self.keep_prob1 = tf.placeholder(tf.float32, name="dropout_keep_prob1")
        self.keep_prob2 = tf.placeholder(tf.float32, name="dropout_keep_prob2")
        with tf.name_scope('inputs'):
            self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len],
                                    name='x')  #25 * 80
            #print (self.max_sentence_len)   #80
            #print ('sxl================')
            self.y = tf.placeholder(tf.int32, [None, self.n_class],
                                    name='y')  #25 * 3
            self.sen_len = tf.placeholder(tf.int32, None,
                                          name='sen_len')  #list(25)
            self.aspect_id = tf.placeholder(tf.int32, None,
                                            name='aspect_id')  #list(25)

        with tf.name_scope('weights'):
            self.weights = {
                'softmax':
                tf.get_variable(
                    name='softmax_w',
                    shape=[self.n_hidden, self.n_class],  #300 * 3
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
            }

        with tf.name_scope('biases'):
            self.biases = {
                'softmax':
                tf.get_variable(
                    name='softmax_b',
                    shape=[self.n_class],  # 3
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
            }

        self.W = tf.get_variable(
            name='W',
            shape=[
                self.n_hidden + self.embedding_dim,
                self.n_hidden + self.embedding_dim
            ],  #600 * 600
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.w = tf.get_variable(
            name='w',
            shape=[self.n_hidden + self.embedding_dim, 1],  #600 * 1
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.Wp = tf.get_variable(
            name='Wp',
            shape=[self.n_hidden, self.n_hidden],  #300 * 300
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        self.Wx = tf.get_variable(
            name='Wx',
            shape=[self.n_hidden, self.n_hidden],  #300 * 300
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
Example #7
def main(mode, args):
    # build vocab
    word2index, index2word = build_all_vocab(init_vocab={
        UNK_WORD: 0,
        BOS_WORD: 1
    })
    args.vocab, args.vocab_size, args.index2word = word2index, len(
        word2index), index2word
    # get data_from_train from only_label = True, for same as train baseline
    args.only_label = True
    train_dataset = BindingDataset('train', args=args)
    data_from_train = (train_dataset.tokenize_max_len,
                       train_dataset.columns_token_max_len,
                       train_dataset.columns_split_marker_max_len,
                       train_dataset.cells_token_max_len,
                       train_dataset.cells_split_marker_max_len,
                       train_dataset.pos_tag_vocab,
                       train_dataset.bert_tokenize_max_len,
                       train_dataset.bert_tokenize_marker_max_len,
                       train_dataset.bert_columns_split_max_len,
                       train_dataset.bert_columns_split_marker_max_len,
                       train_dataset.bert_cells_split_max_len,
                       train_dataset.bert_cells_split_marker_max_len)
    args.tokenize_max_len, args.columns_token_max_len, args.columns_split_marker_max_len, \
    args.cells_token_max_len, args.cells_split_marker_max_len, args.pos_tag_vocab,\
    args.bert_tokenize_max_len, args.bert_tokenize_marker_max_len, args.bert_columns_split_max_len, args.bert_columns_split_marker_max_len,\
    args.bert_cells_split_max_len, args.bert_cells_split_marker_max_len = data_from_train
    logger.info('data_from_train'), logger.info(data_from_train)
    # set only_label
    if mode == 'train baseline':
        args.only_label = True
    elif mode == 'policy gradient':
        args.only_label = False
    elif mode == 'test model':
        args.only_label = True
    elif mode == 'add feature':
        args.only_label = False
    elif mode == 'write cases':
        args.only_label = True
    elif mode == 'anonymous':
        args.only_label = False
    # build train_dataloader
    train_dataset = BindingDataset('train',
                                   args=args,
                                   data_from_train=data_from_train)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=args.shuffle)
    # build dev_dataloader
    args.shuffle = False
    dev_dataset = BindingDataset('dev',
                                 args=args,
                                 data_from_train=data_from_train)
    dev_dataloader = DataLoader(dataset=dev_dataset,
                                batch_size=args.batch_size,
                                shuffle=args.shuffle)
    # build test_dataloader
    # test_dataset = BindingDataset('test', args=args, data_from_train=data_from_train)
    # test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=args.shuffle)
    # load word embedding
    if args.load_w2v:
        args.embed_matrix = load_word_embedding(args.word_dim, word2index)
    # train
    if mode == 'train baseline':
        if args.model == 'baseline':
            model = Baseline(args=args)
        elif args.model == 'gate':
            if args.bert_model is None:
                model = Gate(args=args)
            else:
                model = BertGate(args=args)
        else:
            raise NotImplementedError
        train(train_dataloader, dev_dataloader, args=args, model=model)
    elif mode == 'policy gradient':
        model = torch.load('./res/' + args.model +
                           '/2816_False_True_True_726425',
                           map_location=lambda storage, loc: storage.cuda(0))
        train_rl(train_dataloader, dev_dataloader, args=args, model=model)
    elif mode == 'test model':
        # also need the correct 'model' for dataloader
        model = torch.load(
            './res/policy_gradient/0.819928_True_True_True_412532',
            map_location=lambda storage, loc: storage.cuda(0))
        eval(dev_dataloader, args, model, epoch=0)
        eval_rl(dev_dataloader, args, model, epoch=0)
    elif mode == 'add feature':
        model = torch.load('./res/policy_gradient/0.804922_22-16-28',
                           map_location=lambda storage, loc: storage.cuda(0))
        res = test(dev_dataloader, args, model)
        add_abstraction('dev', res=res, args=args)
    elif mode == 'write cases':
        model = torch.load(
            './res/policy_gradient/0.819928_True_True_True_412532',
            map_location=lambda storage, loc: storage.cuda(0))
        res_pg = test(dev_dataloader, args, model, sep=' ')
        model = torch.load('./res/gate/epoch100',
                           map_location=lambda storage, loc: storage.cuda(0))
        res_gate = test(dev_dataloader, args, model, sep=' ')
        with open('cases.txt', 'w', encoding='utf-8') as f:
            for key in res_pg.keys():
                # diff between gate and policy
                if res_gate[key]['pred'] != res_pg[key]['pred']:
                    if res_gate[key]['pred'] == res_gate[key]['label']:
                        f.write(key + '\n')
                        f.write('Pred_Gate:\t\t\t\t' +
                                json.dumps(res_gate[key]['pred']) + '\n')
                        f.write('Pred_Policy_Gradient:\t' +
                                json.dumps(res_pg[key]['pred']) + '\n')
                        f.write('Label:\t\t\t\t\t' +
                                json.dumps(res_pg[key]['label']) + '\n')
                        f.write('SQL_Labels:\t\t\t\t' +
                                json.dumps(res_pg[key]['sql_labels']) + '\n' +
                                '\n')
    elif mode == 'anonymous':
        model = torch.load(
            './res/policy_gradient/0.819928_True_True_True_412532',
            map_location=lambda storage, loc: storage.cuda(0))
        res = test(train_dataloader, args, model, sep='')
        anonymous('train', res, args)
Example #8
import tensorflow as tf
import numpy as np
from utils import load_w2v, batch_index, load_word_embedding, load_aspect2id, load_inputs_twitter_at

x_raw = ["$T$ is always fresh and hot - ready to eat !", "food"]
y_test = [1]

word_id_mapping, w2v = load_word_embedding(
    'data/restaurant/word_id_new.txt',
    'data/restaurant/rest_2014_word_embedding_300_new.txt', 300)
# dict(3909)          3910 * 300
aspect_id_mapping, aspect_embed = load_aspect2id(
    'data/restaurant/aspect_id_new.txt', word_id_mapping, w2v, 300)
# dict(1219)            1220 * 300
# print (aspect_id_mapping['food'])
# print ('sxlllllllllll')


def change_y_to_onehot(y):

    class_set = set([1, -1, 0])
    n_class = 3
    y_onehot_mapping = {0: 0, 1: 1, -1: 2}
    #print (y_onehot_mapping)
    onehot = []
    for label in y:
        tmp = [0] * n_class
        tmp[y_onehot_mapping[label]] = 1
        onehot.append(tmp)
    return np.asarray(onehot, dtype=np.int32)
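The shape annotations above (a dict of 3909 words, a 3910 x 300 matrix) suggest that load_word_embedding parses a word-to-id file and a text embedding file and keeps one extra all-zero row (e.g. for padding or unknown words). A rough sketch under those assumptions; the file formats are guesses, not the real utils implementation:

import numpy as np

def load_word_embedding(word_id_file, w2v_file, embedding_dim):
    # assumed format: one "word id" pair per line
    word_id_mapping = {}
    with open(word_id_file, encoding='utf-8') as f:
        for line in f:
            if line.strip():
                word, idx = line.split()
                word_id_mapping[word] = int(idx)

    # assumed format: one "word v1 ... v_dim" line per word; unmatched rows stay zero
    w2v = np.zeros((len(word_id_mapping) + 1, embedding_dim), dtype=np.float32)
    with open(w2v_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if parts and parts[0] in word_id_mapping and len(parts) == embedding_dim + 1:
                w2v[word_id_mapping[parts[0]]] = np.asarray(parts[1:], dtype=np.float32)
    return word_id_mapping, w2v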
Example #9
    def load_data(self):
        train_data, val_data, test_data = load_csv(self.args.data_path,
                                                   self.args.data)
        self.train_y = to_categorical(train_data.valid)
        self.val_y = to_categorical(val_data.valid)
        self.test_y = to_categorical(test_data.valid)

        print('Load word embedding')
        self.word2idx, idx2word, self.word_vectors = load_word_embedding(
            self.args.embedding_path, self.args.word_embedding, self.args.data)

        print('Load graph embedding')
        cpc_embed_dict, ipc_embed_dict, uspc_embed_dict = load_code_embeddings(
            self.args.embedding_path, self.args.code_embedding, self.args.data)

        self.cpc2idx, idx2cpc, self.cpc_vectors = create_code_vocab(
            cpc_embed_dict)
        self.ipc2idx, idx2ipc, self.ipc_vectors = create_code_vocab(
            ipc_embed_dict)
        self.uspc2idx, idx2uspc, self.uspc_vectors = create_code_vocab(
            uspc_embed_dict)

        self.max_cpc_len, self.max_ipc_len, self.max_uspc_len = get_code_length(
            train_data)

        print('Preparing train data')
        train_cpcs, train_ipcs, train_uspcs = convert_code_to_idx(
            train_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        train_abs_sequence = get_text_sequence(train_data.abstract_text,
                                               self.word2idx,
                                               self.args.max_length)

        self.train.append(train_cpcs)
        self.train.append(train_ipcs)
        self.train.append(train_uspcs)
        self.train.append(train_abs_sequence)

        print('Preparing validation data')
        val_cpcs, val_ipcs, val_uspcs = convert_code_to_idx(
            val_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        val_abs_sequence = get_text_sequence(val_data.abstract_text,
                                             self.word2idx,
                                             self.args.max_length)

        self.val.append(val_cpcs)
        self.val.append(val_ipcs)
        self.val.append(val_uspcs)
        self.val.append(val_abs_sequence)

        print('Preparing test data')
        test_cpcs, test_ipcs, test_uspcs = convert_code_to_idx(
            test_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
            self.cpc2idx, self.ipc2idx, self.uspc2idx)
        test_abs_sequence = get_text_sequence(test_data.abstract_text,
                                              self.word2idx,
                                              self.args.max_length)

        self.test.append(test_cpcs)
        self.test.append(test_ipcs)
        self.test.append(test_uspcs)
        self.test.append(test_abs_sequence)
Example #10
def main(args):
    conf = vars(args)
    device = torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        torch.backends.cudnn.benchmark = True
        device = torch.device("cuda", args.gpu)

    if args.output_dir:
        shutil.rmtree(args.output_dir, ignore_errors=True)
        os.makedirs(args.output_dir, exist_ok=True)
    if args.log_dir:
        shutil.rmtree(args.log_dir, ignore_errors=True)
        os.makedirs(args.log_dir, exist_ok=True)

    train_file = os.path.join(args.data_dir, args.train_file)
    valid_file = os.path.join(args.data_dir, args.valid_file)
    test_file = os.path.join(args.data_dir, args.test_file)
    label_file = os.path.join(args.data_dir, args.label_file)

    logging.info("Loading Data")
    if args.do_train:
        cache_path = utils.get_cache_path(train_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            train_data = torch.load(cache_path)
        else:
            train_data = TextDataset(train_file, label_file, args.max_seq_len,
                                     args.min_seq_len, args.doc_stride)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(train_data, cache_path)
        train_loader = DataLoader(
            train_data,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True,
            drop_last=True,
        )
        cache_path = utils.get_cache_path(valid_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            valid_data = torch.load(cache_path)
        else:
            valid_data = TextDataset(valid_file, label_file, args.max_seq_len,
                                     args.min_seq_len, args.doc_stride)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(valid_data, cache_path)
        num_classes = len(valid_data.classes)
        valid_loader = DataLoader(valid_data,
                                  batch_size=args.batch_size,
                                  num_workers=args.workers,
                                  pin_memory=True)
    if args.do_test:
        cache_path = utils.get_cache_path(test_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            test_data = torch.load(cache_path)
        else:
            test_data = TextDataset(test_file, label_file, args.max_seq_len,
                                    args.min_seq_len)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(test_data, cache_path)
        num_classes = len(test_data.classes)
        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 num_workers=args.workers,
                                 pin_memory=True)

    logging.info("Creating Model")
    W = utils.load_word_embedding(args.embedding, add_oov=True)
    model = DTT(
        seq_len=args.max_seq_len,
        num_label=num_classes,
        k_sizes=args.filter_sizes,
        num_k=args.num_filters,
        embedding=W,
        drop_prob=args.drop_prob,
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    criterion = nn.CrossEntropyLoss()
    metric = utils.MetricLogger(args.log_dir)
    model.to(device)

    if args.do_train:
        logging.info("Start Training")
        for epoch in range(args.epochs):
            train(model, criterion, optimizer, train_loader, epoch,
                  args.log_freq, device, metric)
            epoch_acc = evaluate(model, criterion, valid_loader, device,
                                 metric)
            if args.output_dir:
                checkpoint = {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "epoch": epoch,
                    "args": args,
                }
                if epoch_acc > conf.get("best_acc", 0):
                    conf["best_acc"] = epoch_acc
                    conf["best_epoch"] = epoch
                json.dump(conf,
                          open(os.path.join(args.output_dir, "conf.json"),
                               "w"),
                          indent=2)
                torch.save(
                    checkpoint,
                    os.path.join(args.output_dir,
                                 "model_{}.pth".format(epoch)))
        logging.info("Finished Training")
        metric.close()

    if args.do_test:
        evaluate(model, criterion, test_loader, device, metric)
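Here utils.load_word_embedding(args.embedding, add_oov=True) returns a matrix W that is passed to the model as its embedding table. A plausible, hedged reading of add_oov=True is that one extra row is appended for out-of-vocabulary tokens; the sketch below assumes a plain text embedding file ("word v1 ... vN" per line) and is not the project's implementation:

import numpy as np

def load_word_embedding(path, add_oov=False):
    vectors = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) > 2:  # skip a possible "vocab_size dim" header line
                vectors.append([float(v) for v in parts[1:]])
    W = np.asarray(vectors, dtype=np.float32)
    if add_oov:
        W = np.vstack([W, np.zeros((1, W.shape[1]), dtype=np.float32)])  # extra OOV row
    return W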
Example #11
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)
    
    if args.cuda:
        torch.backends.cudnn.benchmark = True
    
    # increase recursion depth
    sys.setrecursionlimit(10000)
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map
    
    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
        
    train_loader, val_loader, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)            
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = TreeTrainer(args, model, criterion)
    
    best_f1 = float('-inf')
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
        
    track_list = []
    
    patience_count = 0
    start_time = time.time()
    q = mp.Queue()
    
    # set start methods
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
#        processes = []
#        for rank in range(args.num_processes):
#            p = mp.Process(target=train, args=(train_loader, trainer, epoch, q))
#            p.start()
#            processes.append(p)
#        for p in processes:
#            p.join()
#        
#        epoch_loss = q.get()

                
        # update lr
        trainer.lr_step(epoch_loss)
        
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
Example #12
def main():
    # Training settings
    parser = ArgumentParser()
    parser.add_argument('-d',
                        '--device',
                        default=None,
                        type=str,
                        help='indices of GPUs to enable (default: None)')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=1024,
                        help='number of batch size for training')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--save-path',
                        type=str,
                        default='result/model.pth',
                        help='path to trained model to save')
    parser.add_argument('--model',
                        choices=['MLP', 'BiLSTM', 'BiLSTMAttn', 'CNN'],
                        default='MLP',
                        help='model name')
    parser.add_argument('--env',
                        choices=['local', 'server'],
                        default='server',
                        help='development environment')
    parser.add_argument('--word-dim',
                        type=int,
                        default=128,
                        help='the dimension of embedding')
    parser.add_argument(
        '--word-lim',
        type=int,
        default=None,
        help='If specified, input sequence length is limited from tail.')
    parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)

    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device('cuda:0' if torch.cuda.is_available()
                          and args.device is not None else 'cpu')

    model_w2v = KeyedVectors.load_word2vec_format(W2V_MODEL_FILE[args.env],
                                                  binary=True)
    word_to_id = word2id(model_w2v)
    initial_embedding = load_word_embedding(model_w2v)

    # setup data_loader instances
    train_data_loader = PosNegDataLoader(TRAIN_FILE[args.env],
                                         word_to_id,
                                         args.word_lim,
                                         args.batch_size,
                                         shuffle=True,
                                         num_workers=2)
    valid_data_loader = PosNegDataLoader(VALID_FILE[args.env],
                                         word_to_id,
                                         args.word_lim,
                                         args.batch_size,
                                         shuffle=False,
                                         num_workers=2)

    # build model architecture
    if args.model == 'MLP':
        model = MLP(word_dim=args.word_dim,
                    hidden_size=100,
                    vocab_size=len(word_to_id))
    elif args.model == 'BiLSTM':
        model = BiLSTM(word_dim=args.word_dim,
                       hidden_size=100,
                       vocab_size=len(word_to_id))
    elif args.model == 'BiLSTMAttn':
        model = BiLSTMAttn(word_dim=args.word_dim,
                           hidden_size=100,
                           vocab_size=len(word_to_id))
    elif args.model == 'CNN':
        model = CNN(word_dim=args.word_dim,
                    word_lim=args.word_lim,
                    vocab_size=len(word_to_id))
    else:
        raise ValueError(
            f'model name should be "MLP", "BiLSTM", "BiLSTMAttn", or "CNN", but given {args.model}'
        )

    model.set_initial_embedding(initial_embedding)
    model.to(device)

    # build optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_valid_acc = -1

    for epoch in range(1, args.epochs + 1):
        print(f'*** epoch {epoch} ***')
        # train
        model.train()
        total_loss = 0
        total_correct = 0
        for batch_idx, (source, mask, target) in enumerate(train_data_loader):
            source = source.to(device)  # (b, len)
            mask = mask.to(device)  # (b, len)
            target = target.to(device)  # (b)

            # Forward pass
            output = model(source, mask)  # (b, 2)
            loss = loss_fn(output, target)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_correct += metric_fn(output, target)
        print(f'train_loss={total_loss / train_data_loader.n_samples:.3f}',
              end=' ')
        print(
            f'train_accuracy={total_correct / train_data_loader.n_samples:.3f}'
        )

        # validation
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_correct = 0
            for batch_idx, (source, mask,
                            target) in enumerate(valid_data_loader):
                source = source.to(device)  # (b, len)
                mask = mask.to(device)  # (b, len)
                target = target.to(device)  # (b)

                output = model(source, mask)  # (b, 2)

                total_loss += loss_fn(output, target)
                total_correct += metric_fn(output, target)
        valid_acc = total_correct / valid_data_loader.n_samples
        print(f'valid_loss={total_loss / valid_data_loader.n_samples:.3f}',
              end=' ')
        print(f'valid_accuracy={valid_acc:.3f}\n')
        if valid_acc > best_valid_acc:
            torch.save(model.state_dict(), args.save_path)
            best_valid_acc = valid_acc
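In this example the embedding matrix is built directly from a gensim KeyedVectors object (both word2id and load_word_embedding take model_w2v). A minimal sketch of what these two helpers might look like, assuming gensim 3.x (where the vocabulary order is exposed as index2word) and that id 0 is reserved for padding/unknown words; these are not the project's actual helpers:

import numpy as np

def word2id(model_w2v):
    # id 0 is reserved for padding / unknown words
    return {word: i + 1 for i, word in enumerate(model_w2v.index2word)}

def load_word_embedding(model_w2v):
    pad = np.zeros((1, model_w2v.vector_size), dtype=np.float32)
    return np.concatenate([pad, model_w2v.vectors.astype(np.float32)], axis=0)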
Example #13
    def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100, learning_rate=0.01,
                 n_class=3, max_sentence_len=50, l2_reg=0., display_step=4, n_iter=100, type_=''):
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_class = n_class
        self.max_sentence_len = max_sentence_len
        self.l2_reg = l2_reg
        self.display_step = display_step
        self.n_iter = n_iter
        self.type_ = type_
        self.word_id_mapping, self.w2v = load_word_embedding(FLAGS.word_id_file_path, FLAGS.embedding_file_path, self.embedding_dim)
        # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
        self.word_embedding = tf.Variable(self.w2v, dtype=tf.float32, name='word_embedding')
        # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
        # self.word_embedding = tf.Variable(
        #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
        self.aspect_id_mapping, self.aspect_embed = load_aspect2id(FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v, self.embedding_dim)
        self.aspect_embedding = tf.Variable(self.aspect_embed, dtype=tf.float32, name='aspect_embedding')

        self.keep_prob1 = tf.placeholder(tf.float32)
        self.keep_prob2 = tf.placeholder(tf.float32)
        with tf.name_scope('inputs'):
            self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name='x')
            self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')
            self.sen_len = tf.placeholder(tf.int32, None, name='sen_len')
            self.aspect_id = tf.placeholder(tf.int32, None, name='aspect_id')

        with tf.name_scope('weights'):
            self.weights = {
                'softmax': tf.get_variable(
                    name='softmax_w',
                    shape=[self.n_hidden, self.n_class],
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                )
            }

        with tf.name_scope('biases'):
            self.biases = {
                'softmax': tf.get_variable(
                    name='softmax_b',
                    shape=[self.n_class],
                    initializer=tf.random_uniform_initializer(-0.01, 0.01),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                )
            }

        self.W = tf.get_variable(
            name='W',
            shape=[self.n_hidden + self.embedding_dim, self.n_hidden + self.embedding_dim],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
        )
        self.w = tf.get_variable(
            name='w',
            shape=[self.n_hidden + self.embedding_dim, 1],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
        )
        self.Wp = tf.get_variable(
            name='Wp',
            shape=[self.n_hidden, self.n_hidden],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
        )
        self.Wx = tf.get_variable(
            name='Wx',
            shape=[self.n_hidden, self.n_hidden],
            initializer=tf.random_uniform_initializer(-0.01, 0.01),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
        )
Example #14
    config_file = sys.argv[1]
    configure = json.load(open(config_file))
    config = configure["main_configuration"]
    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    w2v_file = config["pretrained_embedding"]  # w2v_file
    data_index = config["index"]  # Indri index
    mapped_w2v_file = config["output_embedding"]  # output shared w2v dict

    print('load word dict ...')
    word_dict = load_word_dict(data_index)
    print("Dictionary length: {}".format(len(word_dict)))

    print('load word vectors ...')
    embeddings = load_word_embedding(word_dict, w2v_file)

    print('save word vectors ...')
    with open(mapped_w2v_file, 'w') as fw:
        # assert word_dict
        for w, idx in tqdm(word_dict.items()):
            try:
                print(word_dict[w],
                      ' '.join(map(str, embeddings[idx])),
                      file=fw)
            except Exception as error:
                print('Error saving this word : {}\n'.format(word_dict[w]) +
                      repr(error))
                # print(embeddings[idx])

    print('Map word vectors finished ...')