Example #1
    def filter_data(self, only_sent=False, fine_grained=False):
        _logger.add()
        _logger.add('filtering data for %s, only sentence: %s' %
                    (self.data_type, only_sent))
        if only_sent:
            counter = 0
            new_nn_data = []
            for trees in self.nn_data:
                new_trees = []
                for tree in trees:
                    if tree['is_sent']:
                        new_trees.append(tree)
                        counter += 1
                new_nn_data.append(new_trees)
            self.nn_data = new_nn_data
            self.sample_num = counter

        if not fine_grained:
            # delete the neutral sample
            counter = 0
            new_nn_data = []
            for trees in self.nn_data:
                new_trees = []
                for tree in trees:
                    sent_label = tree['root_node']['sentiment_label']
                    if sent_label <= 0.4 or sent_label > 0.6:
                        counter += 1
                        new_trees.append(tree)
                new_nn_data.append(new_trees)
            self.nn_data = new_nn_data
            self.sample_num = counter
Example #2
 def gene_sub_trees_and_shift_reduce_info(self):
     _logger.add()
     _logger.add('generating sub-trees and shift reduce info: %s...' %
                 self.data_type)
     counter = 0
     new_data_list = []
     for tree in self.digitized_data_list:
         sub_trees = []
         idx_to_node_dict = dict(
             (tree_node['node_index'], tree_node) for tree_node in tree)
         for tree_node in tree:
             # collect all nodes belonging to the sub-tree rooted at this node
             if tree_node['is_leaf']:
                 new_sub_tree = [tree_node]
             else:
                 new_sub_tree = []
                 new_sub_tree_leaves = [
                     idx_to_node_dict[node_index]
                     for node_index in tree_node['leaf_node_index_seq']
                 ]
                 new_sub_tree += new_sub_tree_leaves
                 for leaf_node in new_sub_tree_leaves:
                     pre_node = leaf_node
                     while pre_node['parent_index'] > 0 and pre_node != tree_node:  # fixme
                         cur_node = idx_to_node_dict[pre_node['parent_index']]
                         if cur_node not in new_sub_tree:
                             new_sub_tree.append(cur_node)
                         pre_node = cur_node
             # get shift reduce info
             child_node_indices = [
                 new_tree_node['node_index']
                 for new_tree_node in new_sub_tree
             ]
             parent_node_indices = [
                 new_tree_node['parent_index'] if
                 new_tree_node['parent_index'] in child_node_indices else 0
                 for new_tree_node in new_sub_tree
             ]
             sr_result = shift_reduce_constituency_forest(
                 list(zip(child_node_indices, parent_node_indices)))
             operation_list, node_list_in_stack, reduce_mat = zip(
                 *sr_result)
             shift_reduce_info = {
                 'op_list': operation_list,
                 'reduce_mat': reduce_mat,
                 'node_list_in_stack': node_list_in_stack
             }
             sub_tree = {
                 'tree_nodes': new_sub_tree,
                 'shift_reduce_info': shift_reduce_info,
                 'root_node': tree_node,
                 'is_sent': tree_node['parent_index'] == 0
             }
             sub_trees.append(sub_tree)
             counter += 1
         new_data_list.append(sub_trees)
     return new_data_list, counter
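A hedged sketch (with made-up node and parent indices) of the remapping Example #2 applies before calling shift_reduce_constituency_forest: any parent that lies outside the current sub-tree is replaced by 0, so the sub-tree root is treated as a forest root.

# Hedged sketch with made-up indices; mirrors the child/parent remapping above.
new_sub_tree = [
    {'node_index': 1, 'parent_index': 4},
    {'node_index': 2, 'parent_index': 4},
    {'node_index': 4, 'parent_index': 5},   # parent 5 lies outside this sub-tree
]
child_node_indices = [n['node_index'] for n in new_sub_tree]
parent_node_indices = [
    n['parent_index'] if n['parent_index'] in child_node_indices else 0
    for n in new_sub_tree
]
print(list(zip(child_node_indices, parent_node_indices)))
# -> [(1, 4), (2, 4), (4, 0)]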
Example #3
    def __init__(self, data_list, data_type, dicts=None):
        self.data_type = data_type
        _logger.add('building data set object for %s' % data_type)
        assert data_type in ['train', 'dev', 'test']
        # check
        if data_type in ['dev', 'test']:
            assert dicts is not None

        processed_data_list = self.process_raw_data(data_list, data_type)

        if data_type == 'train':
            self.dicts, self.max_lens = self.count_data_and_build_dict(
                processed_data_list)
        else:
            _, self.max_lens = self.count_data_and_build_dict(
                processed_data_list, False)
            self.dicts = dicts

        self.digitized_data_list = self.digitize_dataset(
            processed_data_list, self.dicts, data_type)
        self.nn_data, self.sample_num = self.gene_sub_trees_and_shift_reduce_info()

        self.emb_mat_token, self.emb_mat_glove = None, None
        if data_type == 'train':
            self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()

    def __init__(self,
                 train_file_path,
                 dev_file_path=None,
                 test_file_path=None):
        _logger.add('building data set object')

        train_data_list = self.load_data(train_file_path, 'train')
        dev_data_list = self.load_data(dev_file_path, 'dev')
        if test_file_path is not None:
            test_data_list = self.load_data(test_file_path, 'test')

        data_list = []
        data_list.extend(train_data_list)
        data_list.extend(dev_data_list)
        if test_file_path is not None:
            data_list.extend(test_data_list)

        self.dicts, self.max_lens = self.count_data_and_build_dict(data_list)

        self.digitized_train_data_list = self.digitize_dataset(
            train_data_list, self.dicts)
        self.digitized_dev_data_list = self.digitize_dataset(
            dev_data_list, self.dicts)
        if test_file_path is not None:
            self.digitized_test_data_list = self.digitize_dataset(
                test_data_list, self.dicts)

        self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
Example #5
    def digitize_dataset(dataset, dicts, data_type):
        token2index = {
            token: idx
            for idx, token in enumerate(dicts['token'] + dicts['glove'])
        }

        def digitize_token(token):
            token = token if not cfg.lower_word else token.lower()
            try:
                return token2index[token]
            except KeyError:
                return 1

        _logger.add()
        _logger.add('digitizing data: %s...' % data_type)

        for topic in tqdm(dataset):
            for paragraph in topic['paragraphs']:
                paragraph['context_token_digital'] = [[
                    digitize_token(token) for token in sent
                ] for sent in paragraph['context_token']]
                for qa in paragraph['qas']:
                    qa['question_token_digital'] = [
                        digitize_token(token) for token in qa['question_token']
                    ]
        _logger.done()
        return dataset
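A minimal, self-contained sketch of the vocabulary convention this digitize helper relies on (the dictionaries below are made up): index 0 is the '@@@empty' padding entry and index 1 is '@@@unk', so any out-of-vocabulary token falls back to 1; the hard-coded lower-casing stands in for the cfg.lower_word option.

# Hypothetical dicts; index 0 = '@@@empty' (padding), index 1 = '@@@unk'.
dicts = {'token': ['@@@empty', '@@@unk', 'the', 'movie'],
         'glove': ['good', 'bad']}
token2index = {token: idx
               for idx, token in enumerate(dicts['token'] + dicts['glove'])}

def digitize_token(token):
    return token2index.get(token.lower(), 1)   # 1 == '@@@unk'

print([digitize_token(t) for t in ['The', 'movie', 'is', 'good']])
# -> [2, 3, 1, 4]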
Example #6
    def update_tensor_add_ema_and_opt(self):
        self.logits, (self.s1_act, self.s1_logpa), (self.s2_act, self.s2_logpa), \
            (self.s1_percentage, self.s2_percentage) = self.build_network()
        self.loss_sl, self.loss_rl = self.build_loss()
        self.accuracy = self.build_accuracy()

        # ------------ema-------------
        if True:
            self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
            self.build_var_ema()

        if cfg.mode == 'train':
            self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
            self.build_ema()
        self.summary = tf.summary.merge_all()

        # ---------- optimization ---------
        if cfg.optimizer.lower() == 'adadelta':
            assert 0.1 < cfg.learning_rate < 1.
            self.opt_sl = tf.train.AdadeltaOptimizer(cfg.learning_rate)
            self.opt_rl = tf.train.AdadeltaOptimizer(cfg.learning_rate)
        elif cfg.optimizer.lower() == 'adam':
            assert cfg.learning_rate < 0.1
            self.opt_sl = tf.train.AdamOptimizer(cfg.learning_rate)
            self.opt_rl = tf.train.AdamOptimizer(cfg.learning_rate)
        elif cfg.optimizer.lower() == 'rmsprop':
            assert cfg.learning_rate < 0.1
            self.opt_sl = tf.train.RMSPropOptimizer(cfg.learning_rate)
            self.opt_rl = tf.train.RMSPropOptimizer(cfg.learning_rate)
        else:
            raise AttributeError('no optimizer named as \'%s\'' % cfg.optimizer)

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
        # count trainable parameters (embedding matrices excluded)
        all_params_num = 0
        for elem in trainable_vars:
            var_name = elem.name.split(':')[0]
            if var_name.endswith('emb_mat'):
                continue
            params_num = 1
            for dim in elem.get_shape().as_list():
                params_num *= dim
            all_params_num += params_num
        _logger.add('Trainable Parameters Number: %d' % all_params_num)

        sl_vars = [var for var in trainable_vars
                   if not var.op.name.startswith(self.scope + '/hard_network')]
        self.train_op_sl = self.opt_sl.minimize(
            self.loss_sl, self.global_step,
            var_list=sl_vars)

        rl_vars = [var for var in trainable_vars
                   if var.op.name.startswith(self.scope + '/hard_network')]
        if len(rl_vars) > 0:
            self.train_op_rl = self.opt_rl.minimize(
                self.loss_rl,
                var_list=rl_vars)
        else:
            self.train_op_rl = None
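The parameter count printed above is just the product of each variable's static shape, summed over all non-embedding variables; a framework-free sketch with hypothetical variable names and shapes:

import numpy as np

# Hypothetical (name, shape) pairs standing in for the trainable variables;
# embedding matrices are skipped, as in the loop above.
trainable = [('model/emb/emb_mat', [40000, 300]),
             ('model/output/pre_logits_linear/W', [600, 300]),
             ('model/output/pre_logits_linear/b', [300])]

all_params_num = sum(int(np.prod(shape))
                     for name, shape in trainable
                     if not name.endswith('emb_mat'))
print('Trainable Parameters Number: %d' % all_params_num)  # 600*300 + 300 = 180300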
Example #7
    def digitize_data(self, data_list, dicts, dataset_type):
        token2index = {
            token: idx
            for idx, token in enumerate(dicts['token'] + dicts['glove'])
        }
        char2index = {char: idx for idx, char in enumerate(dicts['char'])}

        def digitize_token(token):
            token = token if not cfg.lower_word else token.lower()
            try:
                return token2index[token]
            except KeyError:
                return 1

        def digitize_char(char):
            try:
                return char2index[char]
            except KeyError:
                return 1

        _logger.add()
        _logger.add('digitizing data: %s...' % dataset_type)
        for sample in data_list:
            sample['token_digital'] = [
                digitize_token(token) for token in sample['token']
            ]
            sample['char_digital'] = [[
                digitize_char(char) for char in list(token)
            ] for token in sample['token']]
        _logger.done()
        return data_list
Example #8
    def process_raw_dataset(raw_data, data_type):
        _logger.add()
        _logger.add('processing raw data: %s...' % data_type)
        for topic in tqdm(raw_data):
            for paragraph in topic['paragraphs']:
                # context
                paragraph['context'] = paragraph['context'].replace(
                    "''", '" ').replace("``", '" ')
                paragraph['context_token'] = [[
                    token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(sent)
                ] for sent in nltk.sent_tokenize(paragraph['context'])]
                paragraph['context_token'] = [
                    Dataset.further_tokenize(sent)
                    for sent in paragraph['context_token']
                ]

                # qas
                for qa in paragraph['qas']:
                    qa['question'] = qa['question'].replace(
                        "''", '" ').replace("``", '" ')
                    qa['question_token'] = Dataset.further_tokenize([
                        token.replace("''", '"').replace("``", '"')
                        for token in nltk.word_tokenize(qa['question'])
                    ])
                    # tag generation
                    for answer in qa['answers']:
                        answer['sent_label'] = Dataset.sentence_label_generation(
                            paragraph['context'], paragraph['context_token'],
                            answer['text'], answer['answer_start'])
        _logger.done()
        return raw_data
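The "''" and "``" replacements above are needed because the Penn-Treebank-style tokenizer behind nltk.word_tokenize rewrites straight double quotes as `` and ''; a small sketch of that round trip (it assumes nltk and its 'punkt' tokenizer data are installed):

import nltk  # assumes the 'punkt' sentence tokenizer data has been downloaded

context = 'He said, "it works". Then he left.'
tokens = [[tok.replace("''", '"').replace("``", '"')
           for tok in nltk.word_tokenize(sent)]
          for sent in nltk.sent_tokenize(context)]
print(tokens)
# e.g. [['He', 'said', ',', '"', 'it', 'works', '"', '.'], ['Then', 'he', 'left', '.']]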
Example #9
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
            self.tensor_dict['emb'] = emb

        rep = disan(
            emb, self.token_mask, 'DiSAN', cfg.dropout,
            self.is_train, cfg.wd, 'relu', tensor_dict=self.tensor_dict, name='')

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(linear([rep], hn, True, scope='pre_logits_linear',
                                          wd=cfg.wd, input_keep_prob=cfg.dropout,
                                          is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits], self.output_class, False, scope='get_output',
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train) # bs, 5
        _logger.done()
        return logits
Example #10
    def generate_tree_shift_reduce_info(self, dataset, data_type):
        _logger.add()
        _logger.add('generating tree shift reduce for %s' % data_type)

        for sample in dataset:
            # sent1
            s1_child_parent_node_indices = [
                (new_tree_node.node_index, new_tree_node.parent_index)
                for new_tree_node in sample['sentence1_binary_parse_node_list']
            ]
            s1_sr = shift_reduce_constituency_forest(
                s1_child_parent_node_indices)
            s1_op_list, s1_node_list_in_stack, s1_reduce_mat = zip(*s1_sr)
            s1_sr_info = {
                'op_list': s1_op_list,
                'reduce_mat': s1_reduce_mat,
                'node_list_in_stack': s1_node_list_in_stack
            }
            sample['s1_sr_info'] = s1_sr_info
            # tree tag
            # s1_tree_tag = []
            # for node_idx in s1_node_list_in_stack:
            #     ### find tree node
            #     tree_node_found = None
            #     for tree_node in sample['sentence1_parse_node_list']:
            #         if tree_node.node_index == node_idx:
            #             tree_node_found = tree_node
            #             break
            #     assert tree_node_found is not None
            #     s1_tree_tag.append(tree_node_found.tag)
            # sample['s1_tree_tag'] = s1_tree_tag

            # s2
            s2_child_parent_node_indices = [
                (new_tree_node.node_index, new_tree_node.parent_index)
                for new_tree_node in sample['sentence2_binary_parse_node_list']
            ]
            s2_sr = shift_reduce_constituency_forest(
                s2_child_parent_node_indices)
            s2_op_list, s2_node_list_in_stack, s2_reduce_mat = zip(*s2_sr)
            s2_sr_info = {
                'op_list': s2_op_list,
                'reduce_mat': s2_reduce_mat,
                'node_list_in_stack': s2_node_list_in_stack
            }
            sample['s2_sr_info'] = s2_sr_info
            # # tree tag
            # s2_tree_tag = []
            # for node_idx in s2_node_list_in_stack:
            #     ### find tree node
            #     tree_node_found = None
            #     for tree_node in sample['sentence2_parse_node_list']:
            #         if tree_node.node_index == node_idx:
            #             tree_node_found = tree_node
            #             break
            #     assert tree_node_found is not None
            #     s2_tree_tag.append(tree_node_found.tag)
            # sample['s2_tree_tag'] = s2_tree_tag

        return dataset
Example #11
    def __init__(self, data_file_path, data_type, dicts=None):
        self.data_type = data_type
        _logger.add('building data set object for %s' % data_type)
        assert data_type in ['train', 'dev', 'test']
        # check
        if data_type in ['dev', 'test']:
            assert dicts is not None

        # temporary params
        self.only_bi_tree = True

        raw_data = self.load_snli_data(data_file_path, data_type)
        data_with_tree = self.transform_str_to_tree(raw_data, data_type)
        # data_with_tree = self.generate_tree_shift_reduce_info(data_with_tree, data_type)
        processed_data_list = self.process_raw_data(data_with_tree, data_type)

        if data_type == 'train':
            self.dicts, self.max_lens = self.count_data_and_build_dict(
                processed_data_list)
        else:
            _, self.max_lens = self.count_data_and_build_dict(
                processed_data_list, False)
            self.dicts = dicts
        digital_data = self.digitize_data(processed_data_list, self.dicts,
                                          data_type)
        self.nn_data = self.clip_filter_data(digital_data,
                                             cfg.data_clip_method, data_type)
        self.sample_num = len(self.nn_data)
        if data_type == 'train':
            self.emb_mat_token, self.emb_mat_glove = self.generate_index2vec_matrix()
Example #12
    def build_network(self):
        _logger.add()
        _logger.add('building %s neural network structure...' % cfg.network_type)
        tds, cds = self.tds, self.cds
        tl = self.tl
        tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
        hn = self.hn
        bs = self.bs

        with tf.variable_scope('emb'):
            token_emb_mat = generate_embedding_mat(tds, tel, init_mat=self.token_emb_mat,
                                                   extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
                                                   scope='gene_token_emb_mat')
            emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl1,tel

        with tf.variable_scope('sent_encoding'):
            rep = sentence_encoding_models(
                emb, self.token_mask, cfg.context_fusion_method, 'relu',
                'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
                block_len=cfg.block_len)

        with tf.variable_scope('output'):
            pre_logits = tf.nn.relu(linear([rep], hn, True, scope='pre_logits_linear',
                                           wd=cfg.wd, input_keep_prob=cfg.dropout,
                                           is_train=self.is_train))  # bs, hn
            logits = linear([pre_logits], self.output_class, False, scope='get_output',
                            wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train) # bs, 5
        _logger.done()
        return logits
Example #13
    def build_loss(self):
        # weight_decay
        with tf.name_scope("weight_decay"):
            for var in set(tf.get_collection('reg_vars', self.scope)):
                weight_decay = tf.multiply(tf.nn.l2_loss(var),
                                           cfg.wd,
                                           name="{}-wd".format('-'.join(
                                               str(var.op.name).split('/'))))
                tf.add_to_collection('losses', weight_decay)
        reg_vars = tf.get_collection('losses', self.scope)
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.scope)
        _logger.add('regularization var num: %d' % len(reg_vars))
        _logger.add('trainable var num: %d' % len(trainable_vars))

        target_dist = tf.clip_by_value(self.target_distribution, 1e-10, 1.)
        predicted_dist = tf.clip_by_value(tf.nn.softmax(self.logits), 1e-10,
                                          1.)

        kl_batch = tf.reduce_sum(
            target_dist * tf.log(target_dist / predicted_dist), -1)
        # kl_batch = tf.reduce_sum((target_dist - predicted_dist) ** 2, -1)

        tf.add_to_collection(
            'losses', tf.reduce_mean(kl_batch, name='kl_divergence_mean'))
        loss = tf.add_n(tf.get_collection('losses', self.scope), name='loss')
        tf.summary.scalar(loss.op.name, loss)
        tf.add_to_collection('ema/scalar', loss)
        return loss
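The clipped loss above is the standard KL divergence KL(target || predicted) = sum(t * log(t / p)), averaged over the batch; a NumPy sketch of the same computation on made-up distributions:

import numpy as np

# Made-up target distribution and logits for a batch of two samples.
target = np.array([[0.7, 0.3], [0.1, 0.9]])
logits = np.array([[2.0, 1.0], [0.5, 0.5]])

predicted = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)  # softmax
target_c = np.clip(target, 1e-10, 1.0)
predicted_c = np.clip(predicted, 1e-10, 1.0)

kl_batch = np.sum(target_c * np.log(target_c / predicted_c), -1)  # per-sample KL
print(kl_batch.mean())  # mirrors the kl_divergence_mean term before weight decay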
Example #14
    def process_raw_data(self, dataset, data_type):
        def further_tokenize(temp_tokens):
            tokens = []
            # split tokens on these delimiter characters and keep the delimiters
            delimiters = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'",
                          "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
            for token in temp_tokens:
                tokens.extend(re.split("([{}])".format("".join(delimiters)), token))
            return tokens

        # tokens
        _logger.add()
        _logger.add('processing raw data for %s' % data_type)

        for sample in tqdm(dataset):
            sample['sentence1_token'] = [node.token
                                         for node in sample['sentence1_binary_parse_node_list'] if node.is_leaf]
            sample['sentence1_tag'] = [node.tag
                                       for node in sample['sentence1_binary_parse_node_list'] if node.is_leaf]

            sample['sentence2_token'] = [node.token
                                         for node in sample['sentence2_binary_parse_node_list'] if node.is_leaf]
            sample['sentence2_tag'] = [node.tag
                                       for node in sample['sentence2_binary_parse_node_list'] if node.is_leaf]

            if cfg.data_clip_method == 'no_tree':
                sample['sentence1_token'] = further_tokenize(sample['sentence1_token'])
                sample['sentence2_token'] = further_tokenize(sample['sentence2_token'])
        _logger.done()
        return dataset
 def load_data_pickle(self, data_file_path, data_type):
     _logger.add()
     _logger.add('load file for %s' % data_type)
     dataset = None
     with open(data_file_path, 'rb') as file:  # pickle requires binary mode (no encoding argument)
         dataset = pickle.load(file)
     _logger.done()
     return dataset
 def load_data(self, data_file_path, data_type):
     _logger.add()
     _logger.add('load file for %s' % data_type)
     dataset = []
     with open(data_file_path, 'r', encoding='utf-8') as f:
         dataset = json.load(f)
     _logger.done()
     return dataset
Example #17
    def transform_str_to_tree(self, dataset, data_type):
        _logger.add()
        _logger.add('transforming str format tree into real tree for %s' %
                    data_type)
        for sample in tqdm(dataset):
            sample['sentence1_binary_parse_tree'] = recursive_build_binary(
                tokenize_str_format_tree(sample['sentence1_binary_parse']))
            sample['sentence2_binary_parse_tree'] = recursive_build_binary(
                tokenize_str_format_tree(sample['sentence2_binary_parse']))
            # sample['sentence1_parse_tree'] = recursive_build_penn_format(
            #     tokenize_str_format_tree(sample['sentence1_parse']))
            # sample['sentence2_parse_tree'] = recursive_build_penn_format(
            #     tokenize_str_format_tree(sample['sentence2_parse']))

            # to node_list
            sample['sentence1_binary_parse_tree'], sample['sentence1_binary_parse_node_list'] = \
                transform_tree_to_parent_index(sample['sentence1_binary_parse_tree'])
            sample['sentence2_binary_parse_tree'], sample['sentence2_binary_parse_node_list'] = \
                transform_tree_to_parent_index(sample['sentence2_binary_parse_tree'])
            # sample['sentence1_parse_tree'], sample['sentence1_parse_node_list'] = \
            #     transform_tree_to_parent_index(sample['sentence1_parse_tree'])
            # sample['sentence2_parse_tree'], sample['sentence2_parse_node_list'] = \
            #     transform_tree_to_parent_index(sample['sentence2_parse_tree'])

            # shift reduce info
            # # s1
            s1_child_parent_node_indices = [
                (new_tree_node.node_index, new_tree_node.parent_index)
                for new_tree_node in sample['sentence1_binary_parse_node_list']
            ]
            s1_sr = shift_reduce_constituency_forest(
                s1_child_parent_node_indices)
            s1_op_list, s1_node_list_in_stack, s1_reduce_mat = zip(*s1_sr)
            s1_sr_info = {
                'op_list': s1_op_list,
                'reduce_mat': s1_reduce_mat,
                'node_list_in_stack': s1_node_list_in_stack
            }
            sample['s1_sr_info'] = s1_sr_info

            # # s2
            s2_child_parent_node_indices = [
                (new_tree_node.node_index, new_tree_node.parent_index)
                for new_tree_node in sample['sentence2_binary_parse_node_list']
            ]
            s2_sr = shift_reduce_constituency_forest(
                s2_child_parent_node_indices)
            s2_op_list, s2_node_list_in_stack, s2_reduce_mat = zip(*s2_sr)
            s2_sr_info = {
                'op_list': s2_op_list,
                'reduce_mat': s2_reduce_mat,
                'node_list_in_stack': s2_node_list_in_stack
            }
            sample['s2_sr_info'] = s2_sr_info

        _logger.done()
        return dataset
Example #18
    def get_evaluation(self, sess, dataset_obj, global_step=None):
        _logger.add()
        _logger.add('getting evaluation result for %s' % dataset_obj.data_type)

        logits_list, loss_list, accu_list = [], [], []
        is_sent_list = []
        for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
            feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
            logits, loss, accu = sess.run(
                [self.model.logits, self.model.loss, self.model.accuracy],
                feed_dict)
            logits_list.append(np.argmax(logits, -1))
            loss_list.append(loss)
            accu_list.append(accu)
            is_sent_list += [sample['is_sent'] for sample in sample_batch]
        logits_array = np.concatenate(logits_list, 0)
        loss_value = np.mean(loss_list)
        accu_array = np.concatenate(accu_list, 0)
        accu_value = np.mean(accu_array)
        sent_accu_list = []
        for idx, is_sent in enumerate(is_sent_list):
            if is_sent:
                sent_accu_list.append(accu_array[idx])
        sent_accu_value = np.mean(sent_accu_list)

        # analysis
        # analysis_save_dir = cfg.mkdir(cfg.answer_dir,'gs_%s'%global_step or 'test')
        # OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array, analysis_save_dir,
        #                            cfg.fine_grained)

        # add summary
        if global_step is not None:
            if dataset_obj.data_type == 'train':
                summary_feed_dict = {
                    self.train_loss: loss_value,
                    self.train_accuracy: accu_value,
                    self.train_sent_accuracy: sent_accu_value,
                }
                summary = sess.run(self.train_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            elif dataset_obj.data_type == 'dev':
                summary_feed_dict = {
                    self.dev_loss: loss_value,
                    self.dev_accuracy: accu_value,
                    self.dev_sent_accuracy: sent_accu_value,
                }
                summary = sess.run(self.dev_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            else:
                summary_feed_dict = {
                    self.test_loss: loss_value,
                    self.test_accuracy: accu_value,
                    self.test_sent_accuracy: sent_accu_value,
                }
                summary = sess.run(self.test_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)

        return loss_value, accu_value, sent_accu_value
Example #19
    def get_evaluation(self,
                       sess,
                       dataset_obj,
                       global_step=None,
                       time_counter=None):
        _logger.add()
        _logger.add('getting evaluation result for %s' % dataset_obj.data_type)

        logits_list, loss_list, accu_list = [], [], []
        for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
            feed_dict = self.model.get_feed_dict(sample_batch, 'dev')
            if time_counter is not None:
                time_counter.add_start()
            logits, loss, accu = sess.run([
                self.model.logits, self.model.xentropy_loss,
                self.model.accuracy
            ], feed_dict)
            if time_counter is not None:
                time_counter.add_stop()
            logits_list.append(np.argmax(logits, -1))
            loss_list.append(loss)
            accu_list.append(accu)

        logits_array = np.concatenate(logits_list, 0)
        loss_value = np.mean(loss_list)
        accu_array = np.concatenate(accu_list, 0)
        accu_value = np.mean(accu_array)

        # todo: analysis
        # analysis_save_dir = cfg.mkdir(cfg.answer_dir, 'gs_%d' % global_step or 0)
        # OutputAnalysis.do_analysis(dataset_obj, logits_array, accu_array, analysis_save_dir,
        #                            cfg.fine_grained)

        if global_step is not None:
            if dataset_obj.data_type == 'train':
                summary_feed_dict = {
                    self.train_loss: loss_value,
                    self.train_accuracy: accu_value,
                }
                summary = sess.run(self.train_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            elif dataset_obj.data_type == 'dev':
                summary_feed_dict = {
                    self.dev_loss: loss_value,
                    self.dev_accuracy: accu_value,
                }
                summary = sess.run(self.dev_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            else:
                summary_feed_dict = {
                    self.test_loss: loss_value,
                    self.test_accuracy: accu_value,
                }
                summary = sess.run(self.test_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)

        return loss_value, accu_value
    def count_data_and_build_dict(self, data_list, gene_dicts=True):
        def add_ept_and_unk(a_list):
            a_list.insert(0, '@@@empty')
            a_list.insert(1, '@@@unk')
            return a_list

        _logger.add()
        _logger.add('counting and build dictionaries')

        token_collection = []
        char_collection = []

        sent_len_collection = []
        token_len_collection = []

        for sample in data_list:
            token_collection += sample['sentence_token']
            sent_len_collection += [len(sample['sentence_token'])]

            for token in sample['sentence_token']:
                char_collection += list(token)
                token_len_collection.append(len(token))

        max_sent_len = max(sent_len_collection)
        max_token_len = max(token_len_collection)
        token_set = list(set(token_collection))
        char_set = list(set(char_collection))

        if gene_dicts:
            if cfg.use_glove_unk_token:
                glove_data = load_glove(cfg.word_embedding_length)
                glove_token_set = list(glove_data.keys())
                if cfg.lower_word:
                    token_set = list(token.lower() for token in token_set)
                    glove_token_set = list(
                        set(token.lower() for token in glove_token_set))

                # delete token from glove_token_set which appears in token_set
                for token in token_set:
                    try:
                        glove_token_set.remove(token)
                    except ValueError:
                        pass
            else:
                if cfg.lower_word:
                    token_set = list(token.lower() for token in token_set)
                glove_token_set = []
            token_set = add_ept_and_unk(token_set)
            char_set = add_ept_and_unk(char_set)
            dicts = {
                'token': token_set,
                'char': char_set,
                'glove': glove_token_set
            }
        else:
            dicts = {}

        return dicts, {'sent': max_sent_len, 'token': max_token_len}
Example #21
 def load_snli_data(self, data_path, data_type):
     _logger.add()
     _logger.add('load file for %s' % data_type)
     dataset = []
     with open(data_path, 'r', encoding='utf-8') as file:
         for line in file:
             json_obj = json.loads(line)
             dataset.append(json_obj)
     _logger.done()
     return dataset
Example #22
 def process_raw_data(self, data_list, data_type):
     _logger.add()
     _logger.add('processing raw data: %s...' % data_type)
     for sample in data_list:
         for tree_node in sample:
             # node_index, parent_index, token_seq, leaf_node_index_seq, is_leaf, token, sentiment_label
             # char_seq
             tree_node['char_seq'] = [list(token) for token in tree_node['token_seq']]
     _logger.done()
     return data_list
Example #23
    def count_data_and_build_dict(self, data_list, gene_dicts=True):
        def add_ept_and_unk(a_list):
            a_list.insert(0, '@@@empty')
            a_list.insert(1, '@@@unk')
            return a_list

        _logger.add()
        _logger.add('counting and build dictionaries')

        token_collection = []
        char_collection = []

        sent_len_collection = []
        token_len_collection = []

        for sample in data_list:
            for tree_node in sample:
                token_collection += tree_node['token_seq']
                sent_len_collection.append(len(tree_node['token_seq']))
                for char_seq in tree_node['char_seq']:
                    char_collection += char_seq
                    token_len_collection.append(len(char_seq))

        max_sent_len = dynamic_length(sent_len_collection, 1, security=False)[0]
        max_token_len = dynamic_length(token_len_collection, 0.99, security=False)[0]

        if gene_dicts:
            # token & char
            tokenSet = dynamic_keep(token_collection, 1)
            charSet = dynamic_keep(char_collection, 1)
            if cfg.use_glove_unk_token:
                gloveData = load_glove(cfg.word_embedding_length)
                gloveTokenSet = list(gloveData.keys())
                if cfg.lower_word:
                    tokenSet = list(set([token.lower() for token in tokenSet]))  ##!!!
                    gloveTokenSet = list(set([token.lower() for token in gloveTokenSet]))  ##!!!

                # delete token from gloveTokenSet which appears in tokenSet
                for token in tokenSet:
                    try:
                        gloveTokenSet.remove(token)
                    except ValueError:
                        pass
            else:
                if cfg.lower_word:
                    tokenSet = list(set([token.lower() for token in tokenSet]))
                gloveTokenSet = []
            tokenSet = add_ept_and_unk(tokenSet)
            charSet = add_ept_and_unk(charSet)
            dicts = {'token': tokenSet, 'char': charSet, 'glove': gloveTokenSet}
        else:
            dicts = {}

        _logger.done()
        return dicts, {'sent': max_sent_len, 'token': max_token_len}
Example #24
    def get_evaluation(self,
                       sess,
                       dataset_obj,
                       global_step=None,
                       time_counter=None):
        _logger.add()
        _logger.add('getting evaluation result for %s' % dataset_obj.data_type)

        logits_list, loss_list, accu_list = [], [], []
        for sample_batch, _, _, _ in dataset_obj.generate_batch_sample_iter():
            feed_dict = self.model.get_feed_dict(sample_batch, 'dev')

            if time_counter is not None:
                time_counter.add_start()
            logits, loss, accu = sess.run(
                [self.model.logits, self.model.loss, self.model.accuracy],
                feed_dict)
            if time_counter is not None:
                time_counter.add_stop()
            logits_list.append(np.argmax(logits, -1))
            loss_list.append(loss)
            accu_list.append(accu)

        logits_array = np.concatenate(logits_list, 0)
        loss_value = np.mean(loss_list)
        # accu_array = np.concatenate(accu_list, 0)
        # accu_value = np.mean(accu_array)

        # calculate accuracy
        correct_list = []
        for qa, predicted_label in zip(dataset_obj.nn_data,
                                       list(logits_array)):
            correct = 0.
            for ans in qa['answers']:
                if int(predicted_label) == int(ans['sent_label']):
                    correct = 1.
            correct_list.append(correct)
        accu_value = np.mean(correct_list)

        if global_step is not None:
            if dataset_obj.data_type == 'train':
                summary_feed_dict = {
                    self.train_loss: loss_value,
                    self.train_accuracy: accu_value,
                }
                summary = sess.run(self.train_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
            elif dataset_obj.data_type == 'dev':
                summary_feed_dict = {
                    self.dev_loss: loss_value,
                    self.dev_accuracy: accu_value,
                }
                summary = sess.run(self.dev_summaries, summary_feed_dict)
                self.writer.add_summary(summary, global_step)
        return loss_value, accu_value
Example #25
    def update_tensor_add_ema_and_opt(self):
        self.logits = self.build_network()
        self.loss = self.build_loss()
        self.accuracy = self.build_accuracy()

        # ------------ema-------------
        if True:
            self.var_ema = tf.train.ExponentialMovingAverage(cfg.var_decay)
            self.build_var_ema()

        if cfg.mode == 'train':
            self.ema = tf.train.ExponentialMovingAverage(cfg.decay)
            self.build_ema()
        self.summary = tf.summary.merge_all()

        # ---------- optimization ---------
        if cfg.optimizer.lower() == 'adadelta':
            assert 0.1 < cfg.learning_rate <= 1.
            self.opt = tf.train.AdadeltaOptimizer(cfg.learning_rate)
        elif cfg.optimizer.lower() == 'adam':
            assert cfg.learning_rate < 0.1
            self.opt = tf.train.AdamOptimizer(cfg.learning_rate)
        elif cfg.optimizer.lower() == 'rmsprop':
            assert cfg.learning_rate < 0.1
            self.opt = tf.train.RMSPropOptimizer(cfg.learning_rate)

        elif cfg.optimizer.lower() == 'test':
            self.opt = tf.train.RMSPropOptimizer(0.001, 0.75)
            # self.opt = tf.contrib.keras.optimizers.Nadam()
        else:
            raise AttributeError('no optimizer named as \'%s\'' %
                                 cfg.optimizer)

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           self.scope)
        # count trainable parameters (embedding matrices excluded)
        all_params_num = 0
        for elem in trainable_vars:
            var_name = elem.name.split(':')[0]
            if var_name.endswith('emb_mat'):
                continue
            params_num = 1
            for l in elem.get_shape().as_list():
                params_num *= l
            all_params_num += params_num
        _logger.add('Trainable Parameters Number: %d' % all_params_num)

        self.train_op = self.opt.minimize(self.loss,
                                          self.global_step,
                                          var_list=tf.get_collection(
                                              tf.GraphKeys.TRAINABLE_VARIABLES,
                                              self.scope))
Example #26
    def build_loss(self):
        _logger.add('regularization var num: %d' % len(set(tf.get_collection('reg_vars', self.scope))))
        _logger.add('trainable var num: %d' % len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)))
        # weight_decay
        with tf.name_scope("weight_decay"):
            for var in set(tf.get_collection('reg_vars', self.scope)):
                tensor_name = var.op.name
                weight_decay = tf.multiply(tf.nn.l2_loss(var), cfg.wd,
                                           name="{}-wd".format('-'.join(str(var.op.name).split('/'))))
                if tensor_name.startswith(self.scope + '/hard_network'):
                    tf.add_to_collection('losses_rl', weight_decay)
                else:
                    tf.add_to_collection('losses_sl', weight_decay)

        a = tf.get_collection('losses_sl', self.scope)
        b = tf.get_collection('losses_rl', self.scope)

        cost_batch = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.gold_label,
            logits=self.logits
        )  # [bs]
        # @ 1. for normal
        cost_sl = tf.reduce_mean(cost_batch, name='cost_sl')
        tf.add_to_collection('losses_sl', cost_sl)
        loss_sl = tf.add_n(tf.get_collection('losses_sl', self.scope), name='loss_sl')
        tf.summary.scalar(loss_sl.op.name, loss_sl)
        tf.add_to_collection('ema/scalar', loss_sl)

        # @ 2. for rl
        self.choose_percentage = tf.reduce_mean(tf.stack([self.s1_percentage, self.s2_percentage]),
                                                name='choose_percentage')
        tf.summary.scalar(self.choose_percentage.op.name, self.choose_percentage)
        tf.add_to_collection('ema/scalar', self.choose_percentage)

        # # loss_rl
        s1_rewards_raw = - (cost_batch + cfg.rl_sparsity * self.s1_percentage)
        s2_rewards_raw = - (cost_batch + cfg.rl_sparsity * self.s2_percentage)

        self.reward_mean = tf.reduce_mean(tf.stack([s1_rewards_raw, s2_rewards_raw]), name='reward_mean')
        tf.summary.scalar(self.reward_mean.op.name, self.reward_mean)
        tf.add_to_collection('ema/scalar', self.reward_mean)

        cost_rl = - tf.reduce_mean(
            s1_rewards_raw * tf.reduce_sum(self.s1_logpa, 1) +
            s2_rewards_raw * tf.reduce_sum(self.s2_logpa, 1), name='cost_rl')
        tf.add_to_collection('losses_rl', cost_rl)
        loss_rl = tf.add_n(tf.get_collection('losses_rl', self.scope), name='loss_rl')
        tf.summary.scalar(loss_rl.op.name, loss_rl)
        tf.add_to_collection('ema/scalar', loss_rl)

        return loss_sl, loss_rl
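The RL branch above is a REINFORCE-style surrogate: each sentence gets a reward of -(cross-entropy + rl_sparsity * keep-percentage), and the loss is the negative reward-weighted sum of the selection log-probabilities. A sketch with made-up batch values, showing only the s1 term (the code above adds the s1 and s2 terms):

import numpy as np

rl_sparsity = 0.01                          # stand-in for cfg.rl_sparsity
cost_batch = np.array([0.8, 1.2])           # per-sample cross-entropy, shape [bs]
s1_percentage = np.array([0.6, 0.5])        # fraction of tokens kept per sample
s1_logpa = np.array([[-0.2, -0.4, -0.1],
                     [-0.3, -0.3, -0.2]])   # log-prob of each keep/drop action, [bs, sl]

s1_rewards_raw = -(cost_batch + rl_sparsity * s1_percentage)
cost_rl = -np.mean(s1_rewards_raw * s1_logpa.sum(1))  # REINFORCE surrogate
print(cost_rl)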
Example #27
 def load_question_classification_data(self, data_file_path, data_type):
     _logger.add()
     _logger.add('load file for %s' % data_type)
     dataset = []
     with open(data_file_path, 'r', encoding='latin-1') as file:
         for line in file:
             line_split = line.strip().split(' ')
             cls = line_split[0].split(':')[0]
             sub_cls = line_split[0]
             token = line_split[1:]
             sample = {'token': token, 'cls': cls, 'sub_cls': sub_cls}
             dataset.append(sample)
     _logger.done()
     return dataset
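The loader above expects TREC-style question-classification lines of the form 'COARSE:fine token token ...'; an illustrative line in that format and the sample dict it produces:

line = 'DESC:manner How did serfdom develop and then leave Russia ?'

line_split = line.strip().split(' ')
sample = {
    'token': line_split[1:],                # question tokens
    'cls': line_split[0].split(':')[0],     # coarse class, e.g. 'DESC'
    'sub_cls': line_split[0],               # fine class, e.g. 'DESC:manner'
}
print(sample['cls'], sample['sub_cls'], sample['token'][:3])
# -> DESC DESC:manner ['How', 'did', 'serfdom']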
Example #28
def output_model_params():
    _logger.add()
    _logger.add('==>model_title: ' + cfg.model_name[1:])
    _logger.add()
    for key, value in cfg.args.__dict__.items():
        if key not in ['test', 'shuffle']:
            _logger.add('%s: %s' % (key, value))
Example #29
    def clip_filter_data(self, data_list, data_clip_method, data_type):
        _logger.add()
        _logger.add('%s clipping data for %s...' %
                    (data_clip_method, data_type))

        for sample in data_list:
            if data_clip_method == 'no_tree':
                sample.pop('sentence1_parse')
                sample.pop('sentence2_parse')
                # sample.pop('sentence1_parse_tree')
                # sample.pop('sentence2_parse_tree')
                # sample.pop('sentence1_parse_node_list')
                # sample.pop('sentence2_parse_node_list')
                sample.pop('sentence1_binary_parse')
                sample.pop('sentence2_binary_parse')
                sample.pop('sentence1_binary_parse_tree')
                sample.pop('sentence2_binary_parse_tree')
                sample.pop('sentence1_binary_parse_node_list')
                sample.pop('sentence2_binary_parse_node_list')
                sample.pop('s1_sr_info')
                sample.pop('s2_sr_info')
                # sample.pop('s1_tree_tag')
                # sample.pop('s2_tree_tag')
            elif data_clip_method == 'no_redundancy':
                sample.pop('sentence1_parse')
                sample.pop('sentence2_parse')
                # sample.pop('sentence1_parse_tree')
                # sample.pop('sentence2_parse_tree')
                # sample.pop('sentence1_parse_node_list')
                # sample.pop('sentence2_parse_node_list')

                sample.pop('sentence1_binary_parse')
                sample.pop('sentence2_binary_parse')
                sample.pop('sentence1_binary_parse_tree')
                sample.pop('sentence2_binary_parse_tree')

                for node in sample['sentence1_binary_parse_node_list']:
                    node.children_nodes = None
                    node.leaf_node_index_seq = None

                for node in sample['sentence2_binary_parse_node_list']:
                    node.children_nodes = None
                    node.leaf_node_index_seq = None

            else:
                raise AttributeError('no data clip method named as %s' %
                                     data_clip_method)
        _logger.done()
        return data_list
Example #30
    def restore(self, sess):
        _logger.add()

        if cfg.load_path is not None:
            _logger.add('trying to restore from ckpt file %s' % cfg.load_path)
            try:
                self.saver.restore(sess, cfg.load_path)
                _logger.add('success to restore')
            except tf.errors.NotFoundError:
                _logger.add('failure to restore')
                if cfg.mode != 'train': raise FileNotFoundError('cannot find model file')
        else:
            _logger.add('No check point file')
            if cfg.mode != 'train': raise FileNotFoundError('cannot find model file')

        _logger.done()