Example #1
0
def load_file(filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Trying to load %s from %s' % (dataName, filePath))
    data = None
    ifLoad = False

    if os.path.isfile(filePath):
        _logger.add('Have found the file, loading...')

        if mode == 'pickle':
            with open(filePath, 'rb') as f:
                data = pickle.load(f)
                ifLoad = True
        elif mode == 'json':
            with open(filePath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                ifLoad = True
        elif mode == 'txt':
            pass
        else:
            raise ValueError(
                'Function load_file does not have mode %s' % mode)
    else:
        _logger.add('Have not found the file')
    _logger.add('Done')
    return (ifLoad, data)
Example #2
0
    def build_loss(self):
        # weight_decay
        with tf.name_scope('weight_decay'):
            for var in set(tf.get_collection('reg_vars',
                                             self.scope)):  # store reg vars
                weight_decay = tf.multiply(
                    tf.nn.l2_loss(var),
                    cfg.weight_decay,
                    name='{}-weight_decay'.format('-'.join(
                        str(var.op.name).split('/'))))
                tf.add_to_collection('losses', weight_decay)  # store losses
        reg_vars = tf.get_collection('losses', self.scope)
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.scope)
        _logger.add('regularization var num: %d' % len(reg_vars))
        _logger.add('trainable var num: %d' % len(trainable_vars))
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.stop_gradient(self.gold_label), logits=self.logits)
        tf.add_to_collection('losses',
                             tf.reduce_mean(losses, name='xentropy_loss_mean'))
        loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
        tf.summary.scalar(loss.op.name, loss)
        tf.add_to_collection('ema/scalar', loss)

        return loss
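# A minimal standalone sketch of the same collection-based loss assembly,
# assuming a TensorFlow 1.x runtime; the variable names and sizes below are
# made up for the demo and are not part of the original model code.
import numpy as np
import tensorflow as tf

tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, 4], name='x')
y = tf.placeholder(tf.int32, [None], name='y')

w = tf.get_variable('w', [4, 3])
b = tf.get_variable('b', [3], initializer=tf.zeros_initializer())
demo_logits = tf.matmul(x, w) + b

# weight decay term goes into the shared 'losses' collection
weight_decay = tf.multiply(tf.nn.l2_loss(w), 1e-4, name='w-weight_decay')
tf.add_to_collection('losses', weight_decay)

# mean cross-entropy goes into the same collection
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                      logits=demo_logits)
tf.add_to_collection('losses', tf.reduce_mean(xent, name='xentropy_loss_mean'))

# total loss is the sum over the collection, as in build_loss above
total_loss = tf.add_n(tf.get_collection('losses'), name='loss')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(total_loss,
                   {x: np.random.randn(8, 4).astype('float32'),
                    y: np.random.randint(0, 3, size=8).astype('int32')}))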
Example #3
0
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure ...' % cfg.network_type)

        vocab_len = self.vocab_len
        max_sent_len = self.max_sent_len
        word_embedding_len = self.word_embedding_len
        hn = self.hn

        batch_size = self.batch_size
        sent1_len = self.sent1_len  # includes padding
        sent2_len = self.sent2_len

        # --------------------- embedding mat ------------------------------------
        with tf.variable_scope('emb'):
            # token_emb_mat, 0-->empty, 1-->unknown token
            token_emb_mat = tf.get_variable(
                'token_emb_mat',
                initializer=tf.constant(self.emb_mat, dtype=tf.float32),
                dtype=tf.float32, trainable=True)

            sent1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token) # batch_size,sent1_len,word_embedding_len
            sent2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)

            self.tensor_dict['sent1_emb'] = sent1_emb
            self.tensor_dict['sent2_emb'] = sent2_emb

        # -------------------- sentence encoding ---------------------------------
        with tf.variable_scope('sent_encoding'):
            sent1_rep = disan(sent1_emb, self.sent1_token_mask, 'DiSAN', cfg.dropout, self.is_train,
                              cfg.weight_decay, 'elu', self.tensor_dict, 'sent1')
            self.tensor_dict['sent1_rep'] = sent1_rep  # batch_size,2*word_embedding_len

            tf.get_variable_scope().reuse_variables()

            sent2_rep = disan(sent2_emb, self.sent2_token_mask, 'DiSAN', cfg.dropout, self.is_train,
                              cfg.weight_decay, 'elu', self.tensor_dict, 'sent2')
            self.tensor_dict['sent2_rep'] = sent2_rep

        # -------------------- output ---------------------------------------------
        with tf.variable_scope('output'):
            # batch_size, 4*2*word_embedding_len
            out_rep = tf.concat([sent1_rep, sent2_rep, sent1_rep-sent2_rep, sent1_rep*sent2_rep], axis=-1)
            pre_output = tf.nn.elu(linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                                          weight_decay=cfg.weight_decay, input_keep_prob=cfg.dropout, is_train=self.is_train))
            logits = linear([pre_output], self.output_class, True, 0., scope='logits', squeeze=False,
                            weight_decay=cfg.weight_decay, input_keep_prob=cfg.dropout, is_train=self.is_train)
            self.tensor_dict['logits'] = logits

        return logits
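# Hedged NumPy sketch of the pair-matching feature built in the 'output'
# scope above: [rep1, rep2, rep1 - rep2, rep1 * rep2]. Batch size and
# representation width are made up for illustration only.
import numpy as np

rep1 = np.random.randn(2, 6)   # (batch_size, sentence_rep_dim)
rep2 = np.random.randn(2, 6)
out_rep = np.concatenate([rep1, rep2, rep1 - rep2, rep1 * rep2], axis=-1)
print(out_rep.shape)           # (2, 24), i.e. 4 * sentence_rep_dim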
Example #4
0
    def update_tensor_add_ema_and_opt(self):
        self.logits = self.build_network()
        self.loss = self.build_loss()
        self.accuracy, self.accuracy_0, self.accuracy_1 = self.build_accuracy()

        # -------------------------- ema --------------------------TODO
        if True:
            self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
            self.build_var_ema()

        if cfg.mode == 'train':
            self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
            self.build_ema()
        self.summary = tf.summary.merge_all()

        # -------------------------- optimization ----------------
        if cfg.optimizer.lower() == 'adadelta':
            self.opt = tf.train.AdadeltaOptimizer(self.learning_rate)
        elif cfg.optimizer.lower() == 'adam':
            self.opt = tf.train.AdamOptimizer(self.learning_rate)
        elif cfg.optimizer.lower() == 'rmsprop':
            self.opt = tf.train.RMSPropOptimizer(self.learning_rate)
        else:
            raise AttributeError('no optimizer named as \'%s\' ' %
                                 cfg.optimizer)

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.scope)
        # count trainable parameters (embedding matrix excluded)
        all_params_num = 0
        for elem in trainable_vars:
            # elem.name
            var_name = elem.name.split(':')[0]
            if var_name.endswith('emb_mat'):
                continue  # emb_mat params excluded here
            params_num = 1
            for l in elem.get_shape().as_list():
                params_num *= l
            all_params_num += params_num
        _logger.add('Trainable parameters number: %d' % all_params_num)

        self.train_op = self.opt.minimize(self.loss,
                                          self.global_step,
                                          var_list=tf.get_collection(
                                              tf.GraphKeys.TRAINABLE_VARIABLES,
                                              self.scope))
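# Hedged sketch of the parameter-counting loop above, with the variable
# shapes stubbed out as plain lists; the names are invented for the demo
# and no TensorFlow graph is needed.
demo_shapes = {
    'emb/token_emb_mat': [20000, 300],      # excluded, like *emb_mat above
    'output/pre_output/W': [1200, 300],
    'output/logits/W': [300, 2],
}
total = 0
for name, shape in demo_shapes.items():
    if name.split(':')[0].endswith('emb_mat'):
        continue                            # embedding matrix not counted
    n = 1
    for dim in shape:
        n *= dim
    total += n
print('Trainable parameters number: %d' % total)   # 360600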
Example #5
0
    def __init__(self,
                 data_file_path,
                 data_type,
                 dicts=None,
                 language_type='es',
                 unlabeled_file_path=None,
                 emb_file_path=None):
        self.data_type = data_type
        _logger.add('building data set object for %s' % data_type)
        assert data_type in ['train', 'dev', 'test', 'infer']

        # check
        if data_type in ['dev', 'test', 'infer']:
            assert dicts is not None

        # build vocab --> deprecated: dicts are built during preprocessing
        # if data_type == 'train':
        #     assert unlabeled_file_path is not None
        #     self.dicts = {}
        #     vocab_es_count, vocab_es_token2id, vocab_es_id2token, vocab_en_count, vocab_en_token2id, vocab_en_id2token=build_vocab(unlabeled_file_path)
        #     self.dicts['es'] = vocab_es_token2id
        #     self.dicts['en'] = vocab_en_token2id
        # else:
        #     self.dicts = dicts
        self.dicts = dicts

        # data
        if data_type == 'train':
            self.nn_data = load_en_train_data(data_file_path, self.dicts['es'],
                                              self.dicts['en'])
        elif data_type == 'dev':
            self.nn_data = load_en_train_data(data_file_path, self.dicts['es'],
                                              self.dicts['en'])
        elif data_type == 'test':
            self.nn_data = load_es_train_data(data_file_path, self.dicts['es'],
                                              self.dicts['en'])
        elif data_type == 'infer':
            self.nn_data = load_es_test_data(data_file_path, self.dicts['es'])
        self.sample_num = len(self.nn_data)
        # generate embedding
        if data_type == 'train':
            if language_type == 'en':
                self.emb_mat_token = load_emb_mat(emb_file_path,
                                                  self.dicts['en'])
            elif language_type == 'es':
                self.emb_mat_token = load_emb_mat(emb_file_path,
                                                  self.dicts['es'])
Example #6
0
    def get_inference(self, sess, dataset_obj):
        _logger.add()
        _logger.add('getting inference result for %s' % dataset_obj.data_type)

        logits_list = []
        prob_list = []
        for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
            feed_dict = self.model.get_feed_dict(sample_batch, 'infer')
            logits = sess.run(self.model.logits, feed_dict=feed_dict)
            logits_list.append(np.argmax(logits, -1))
            prob_list.append(
                np.exp(logits[:, 1]) /
                (np.exp(logits[:, 0]) + np.exp(logits[:, 1])))

        logits_array = np.concatenate(logits_list, 0)
        prob_array = np.concatenate(prob_list, 0)

        return logits_array, prob_array
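# Quick standalone check of the class-1 probability computed above:
# exp(l1) / (exp(l0) + exp(l1)) is the softmax over the two logits.
# Subtracting the row max first is a numerically stable variant (my
# addition for the sketch, not something the original code does).
import numpy as np

demo_logits = np.array([[2.0, 0.5], [-1.0, 3.0]])
shifted = demo_logits - demo_logits.max(axis=-1, keepdims=True)
probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
print(probs[:, 1])   # matches exp(l1) / (exp(l0) + exp(l1)) row-wise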
Example #7
0
def save_file(data, filePath, dataName='data', mode='pickle'):
    _logger.add()
    _logger.add('Saving %s to %s' % (dataName, filePath))

    if mode == 'pickle':
        with open(filePath, 'wb') as f:
            pickle.dump(obj=data, file=f)
    elif mode == 'json':
        with open(filePath, 'w', encoding='utf-8') as f:
            json.dump(obj=data, fp=f)
    elif mode == 'txt':
        pass
    else:
        raise ValueError('Function save_file does not have mode %s' % mode)
    _logger.add('Done')
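# A minimal standalone round-trip sketch mirroring the save_file / load_file
# pair, with the project-specific _logger stripped out; the helper names and
# the temp path are invented for the demo.
import json
import os
import pickle
import tempfile

def demo_save(data, path, mode='pickle'):
    if mode == 'pickle':
        with open(path, 'wb') as f:
            pickle.dump(data, f)
    elif mode == 'json':
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f)
    else:
        raise ValueError('demo_save does not have mode %s' % mode)

def demo_load(path, mode='pickle'):
    if not os.path.isfile(path):
        return False, None
    if mode == 'pickle':
        with open(path, 'rb') as f:
            return True, pickle.load(f)
    if mode == 'json':
        with open(path, 'r', encoding='utf-8') as f:
            return True, json.load(f)
    raise ValueError('demo_load does not have mode %s' % mode)

demo_path = os.path.join(tempfile.gettempdir(), 'demo_data.pkl')
demo_save({'a': 1}, demo_path)
print(demo_load(demo_path))   # (True, {'a': 1})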
Example #8
0
def train():
    # output_model_params()

    # needs fixing: reuse of preprocessed data
    loadFile = True
    ifLoad, data = False, None
    loaddict, dicts = load_file(cfg.dict_path, 'dict', 'pickle')
    if not loaddict:
        raise ValueError('dict load failed')
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path,
                                 'train',
                                 dicts=dicts,
                                 language_type='es',
                                 unlabeled_file_path=cfg.unlabeled_data_path,
                                 emb_file_path=cfg.emb_es_path)
        dev_data_obj = Dataset(cfg.dev_data_path, 'dev', dicts=dicts)
        test_data_obj = Dataset(cfg.test_data_path, 'test', dicts=dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)

        # train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    emb_mat_token = train_data_obj.emb_mat_token

    with tf.variable_scope(network_type) as scope:
        if network_type in model_type_set:
            model = Model(emb_mat_token, len(train_data_obj.dicts['es']), 100,
                          scope.name)

    # TODO
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecorder = PerformRecorder(5)

    if cfg.gpu_mem < 1:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem, allow_growth=True)
    else:
        gpu_options = tf.GPUOptions()
    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True)
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # load model
    if cfg.load_model and cfg.load_step:
        saver = tf.train.Saver()
        step = cfg.load_step
        model_path = os.path.join(cfg.ckpt_dir,
                                  'top_result_saver_step_%d.ckpt' % step)
        saver.restore(sess, model_path)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0

    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0 or global_step == 1:
            _logger.add('data round: %d: %d/%d, global_step:%d -- loss:%.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # Occasional evaluation
        #if global_step > int(cfg.num_steps - 100000) and (global_step % (cfg.eval_period or steps_per_epoch) == 0):
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:  # debug
            # ---- dev ----
            dev_loss, dev_accu, dev_accu_0, dev_accu_1 = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add(
                '==> for dev, loss: %.4f, accuracy: %.4f, accuracy_0: %.4f, accuracy_1: %.4f'
                % (dev_loss, dev_accu, dev_accu_0, dev_accu_1))

            # ---- test ----
            test_loss, test_accu, test_accu_0, test_accu_1 = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add(
                '~~> for test, loss: %.4f, accuracy: %.4f, accuracy_0: %.4f, accuracy_1: %.4f'
                % (test_loss, test_accu, test_accu_0, test_accu_1))

            if global_step > cfg.update_lr_step:
                model.update_learning_rate(dev_loss, cfg.lr_decay)
            if global_step > cfg.record_model_step:
                is_in_top, deleted_step = performRecorder.update_top_list(
                    global_step, test_accu, sess)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
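# Quick check of the step bookkeeping used above: steps_per_epoch is a
# ceiling division, and num_steps falls back to max_epoch full passes when
# cfg.num_steps is unset. The numbers are made up for illustration.
import math

sample_num, train_batch_size, max_epoch = 10042, 64, 30
steps_per_epoch = int(math.ceil(1.0 * sample_num / train_batch_size))
num_steps = None or steps_per_epoch * max_epoch   # cfg.num_steps unset here
print(steps_per_epoch, num_steps)                 # 157 4710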
Example #9
0
def test():

    assert cfg.load_path is not None

    #TODO
    loadFile = False
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path,
                                 'train',
                                 language_type='es',
                                 unlabeled_file_path=cfg.unlabeled_data_path,
                                 emb_file_path=cfg.emb_es_path)
        dev_data_obj = Dataset(cfg.dev_data_path,
                               'dev',
                               dicts=train_data_obj.dicts)
        test_data_obj = Dataset(cfg.test_data_path,
                                'test',
                                dicts=train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)

        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    emb_mat_token = train_data_obj.emb_mat_token

    with tf.variable_scope(network_type) as scope:
        if network_type in model_type_set:
            model = Model(emb_mat_token, len(train_data_obj.dicts['es']), 100,
                          scope.name)

    # TODO
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)

    graph_config = tf.ConfigProto()
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # ---- dev ----
    dev_loss, dev_accu, dev_accu_0, dev_accu_1 = evaluator.get_evaluation(
        sess, dev_data_obj, None)
    _logger.add(
        '==> for dev, loss: %.4f, accuracy: %.4f, accuracy_0: %.4f, accuracy_1: %.4f'
        % (dev_loss, dev_accu, dev_accu_0, dev_accu_1))

    # ---- test ----
    test_loss, test_accu, test_accu_0, test_accu_1 = evaluator.get_evaluation(
        sess, test_data_obj, None)
    _logger.add(
        '~~> for test, loss: %.4f, accuracy: %.4f, accuracy_0: %.4f, accuracy_1: %.4f'
        % (test_loss, test_accu, test_accu_0, test_accu_1))

    # ---- train ----
    train_loss, train_accu, train_accu_0, train_accu_1 = evaluator.get_evaluation(
        sess, train_data_obj, None)
    _logger.add(
        '--> for train, loss: %.4f, accuracy: %.4f, accuracy_0: %.4f, accuracy_1: %.4f'
        % (train_loss, train_accu, train_accu_0, train_accu_1))
Example #10
0
    def restore(self, sess):
        _logger.add()
        # print(cfg.ckpt_dir)

        if cfg.load_step is None:
            if cfg.load_path is None:
                _logger.add('trying to restore from dir %s' % cfg.ckpt_dir)
                latest_checkpoint_path = tf.train.latest_checkpoint(
                    cfg.ckpt_dir)
            else:
                latest_checkpoint_path = cfg.load_path
        else:
            latest_checkpoint_path = cfg.ckpt_path + '-' + str(cfg.load_step)

        if latest_checkpoint_path is not None:
            _logger.add('trying to restore from ckpt file %s' %
                        latest_checkpoint_path)
            try:
                self.saver.restore(sess, latest_checkpoint_path)
                _logger.add('success to restore')
            except tf.errors.NotFoundError:
                _logger.add('failure to restore')
                if cfg.mode != 'train':
                    raise FileNotFoundError('cannot find model file')
        else:
            _logger.add('No check point file in dir %s' % cfg.ckpt_dir)
            if cfg.mode != 'train':
                raise FileNotFoundError('cannot find model file')

        _logger.done()
Example #11
0
    def save(self, sess, global_step=None):
        _logger.add()
        _logger.add('saving model to %s' % cfg.ckpt_path)
        self.saver.save(sess, cfg.ckpt_path, global_step)
        _logger.done()
Example #12
0
    def add_summary(self, summary, global_step):
        _logger.add()
        _logger.add('saving summary...')
        self.writer.add_summary(summary, global_step)
        _logger.done()
Example #13
0
    def get_evaluation(self, sess, dataset_obj, global_step=None):
        _logger.add()
        _logger.add('getting evaluation result for %s' % dataset_obj.data_type)

        logits_list, loss_list, accu_list, accu_0_list, accu_1_list, gold_label_list = [], [], [], [], [], []

        for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
            feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
            logits, loss, accu, accu_0, accu_1, gold_label = sess.run(
                [self.model.logits, self.model.loss, self.model.accuracy,
                 self.model.accuracy_0, self.model.accuracy_1,
                 self.model.gold_label],
                feed_dict=feed_dict)
            logits_list.append(np.argmax(logits, -1))
            loss_list.append(loss)
            accu_list.append(accu)
            accu_0_list.append(accu_0)
            accu_1_list.append(accu_1)
            gold_label_list.append(gold_label)

        logits_array = np.concatenate(logits_list, 0)
        loss_value = np.mean(loss_list)
        accu_array = np.concatenate(accu_list, 0)
        accu_value = np.mean(accu_array)
        accu_0_array = np.concatenate(accu_0_list, 0)
        accu_0_value = np.mean(accu_0_array)
        accu_1_array = np.concatenate(accu_1_list, 0)
        accu_1_value = np.mean(accu_1_array)
        gold_label_array = np.concatenate(gold_label_list, 0)
        count_gold_label = gold_label_array.size
        count_gold_label_1 = np.count_nonzero(gold_label_array)
        count_gold_label_0 = count_gold_label - count_gold_label_1
        accu_0_value = accu_0_value * count_gold_label / count_gold_label_0
        accu_1_value = accu_1_value * count_gold_label / count_gold_label_1

        if global_step is not None:
            if dataset_obj.data_type == 'train':
                summary_feed_dict = {
                    self.train_loss: loss_value,
                    self.train_accuracy: accu_value,
                    self.train_accuracy_0: accu_0_value,
                    self.train_accuracy_1: accu_1_value
                }
                summary = sess.run(self.train_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            elif dataset_obj.data_type == 'dev':
                summary_feed_dict = {
                    self.dev_loss: loss_value,
                    self.dev_accuracy: accu_value,
                    self.dev_accuracy_0: accu_0_value,
                    self.dev_accuracy_1: accu_1_value
                }
                summary = sess.run(self.dev_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            else:
                summary_feed_dict = {
                    self.test_loss: loss_value,
                    self.test_accuracy: accu_value,
                    self.test_accuracy_0: accu_0_value,
                    self.test_accuracy_1: accu_1_value
                }
                summary = sess.run(self.test_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)

        return loss_value, accu_value, accu_0_value, accu_1_value
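# Hedged NumPy sketch of the per-class accuracy re-normalisation above,
# assuming model.accuracy_0 / accuracy_1 are batch means of
# "prediction correct AND gold label == c" indicators (my reading of the
# surrounding code, not verified against the model definition).
import numpy as np

gold = np.array([0, 0, 1, 1, 1, 0])
pred = np.array([0, 1, 1, 1, 0, 0])
correct = (gold == pred)

accu_0 = np.mean(correct & (gold == 0))   # averaged over all samples
accu_1 = np.mean(correct & (gold == 1))

n = gold.size
n1 = np.count_nonzero(gold)
n0 = n - n1
print(accu_0 * n / n0)   # accuracy restricted to class-0 samples: 2/3
print(accu_1 * n / n1)   # accuracy restricted to class-1 samples: 2/3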