Example No. 1
 def test_equals(self):
     dp = PaddedList([[6, 66]], shape=[3, 4])
     self.assertTrue(dp == self.bp[2])
     self.assertFalse(dp != self.bp[2])
     dp = PaddedList([[6, 666]], shape=[3, 4])
     self.assertFalse(dp == self.bp[2])
     self.assertTrue(dp != self.bp[2])
Example No. 2
 def test_index(self):
     res = self.bp.index(PaddedList([[6, 66]], shape=[3, 4]))
     self.assertEquals(res, 2)
     cp = PaddedList([1, 2, 3], shape=[4])
     res = cp.index(0)
     self.assertEquals(res, 3)
     res = cp.index(5)
     self.assertEquals(res, -1)
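Example No. 3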
 def parse_input_tensor(batch_data, do_sample=False):
     input_seq = to_cuda(
         torch.LongTensor(PaddedList(batch_data['input_seq'])))
     inp_seq_len = to_cuda(torch.LongTensor(batch_data['input_seq_len']))
     target_seq = to_cuda(
         torch.LongTensor(PaddedList(batch_data['target_seq'])))
     target_seq_len = to_cuda(torch.LongTensor(
         batch_data['target_seq_len']))
     return input_seq, inp_seq_len, None, target_seq, target_seq_len, None, batch_data[
         'masked_positions']
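Example No. 4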
    def _preprocess(self, batch_data):
        from common.util import PaddedList
        s1 = batch_data["s1"]
        s2 = batch_data['s2']

        return to_cuda(torch.LongTensor(PaddedList(s1, fill_value=self._pad_idx))), \
               to_cuda(torch.LongTensor(PaddedList(batch_data['s1_char'], fill_value=self._character_pad_idx))), \
               to_cuda(torch.LongTensor(PaddedList(s2, fill_value=self._pad_idx))), \
               to_cuda(torch.LongTensor(PaddedList(batch_data['s2_char'],
                                  fill_value=self._character_pad_idx)))
Example No. 5
 def parse_target_batch_data(batch_data, ):
     forward_target_seq = to_cuda(
         torch.LongTensor(
             PaddedList(batch_data['forward_target'],
                        fill_value=ignore_id)))
     backward_target_seq = to_cuda(
         torch.LongTensor(
             PaddedList(batch_data['backward_target'],
                        fill_value=ignore_id)))
     return forward_target_seq, backward_target_seq
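Example No. 6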
    def _preprocess(self, batch_data):
        from common.util import PaddedList
        s1 = batch_data["s1"]
        s2 = batch_data['s2']

        batch_size = len(s1)
        size = max(len(t1)+len(t2)+1 for t1, t2 in zip(s1, s2))
        if self._summary_node:
            size += 2
        # print("size:{}".format(size))

        if not self._summary_node:
            sentences = to_cuda(torch.LongTensor(
                PaddedList([t1 + [self._pad_idx] + t2 for t1, t2 in zip(s1, s2)], fill_value=self._pad_idx,)))

            sentences_char = to_cuda(torch.LongTensor(
                PaddedList([t1 + [[self._character_pad_idx]] + t2 for t1, t2 in zip(batch_data['s1_char'], batch_data['s2_char'])],
                           fill_value=self._character_pad_idx)))
        else:
            sentences = to_cuda(torch.LongTensor(
                PaddedList([t1 + [self._pad_idx] + t2 + [self._pad_idx, self._pad_idx] for t1, t2 in zip(s1, s2)],
                           fill_value=self._pad_idx, )))

            sentences_char = to_cuda(torch.LongTensor(
                PaddedList(
                    [t1 + [[self._character_pad_idx]] + t2 + [[self._character_pad_idx], [self._character_pad_idx]] for
                     t1, t2 in
                     zip(batch_data['s1_char'], batch_data['s2_char'])],
                    fill_value=self._character_pad_idx)))

        distance_matrix = np.ones((batch_size, size, size)) * float('-inf')
        for i, (t1, t2) in enumerate(zip(s1, s2)):
            s1_matrix = util.create_distance_node_matrix(len(t1))
            s2_matrix = util.create_distance_node_matrix(len(t2))
            distance_matrix[i, :len(t1), :len(t1)] = s1_matrix
            distance_matrix[i, len(t1)+1:len(t1)+len(t2)+1, len(t1)+1:len(t1)+len(t2)+1] = s2_matrix
            if self._summary_node:
                distance_matrix[i, :len(t1), -2] = 0
                distance_matrix[i, len(t1)+1:len(t1)+len(t2)+1, -1] = 0

        distance_matrix = to_cuda(torch.FloatTensor(np.stack(distance_matrix, axis=0)))

        # sentence_same_token_link_matrix = []
        # for t1, t2 in zip(s1, s2):
        #     idx, idy, data = util.create_sentence_pair_same_node_matrix(t1, 0, t2, len(t1)+1)
        #     sentence_same_token_link_matrix.append(
        #         sparse.coo_matrix(
        #             (data, (idx, idy)),
        #             shape=(size, size), dtype=np.float
        #         ).toarray()
        #     )
        # sentence_same_token_link_matrix = to_cuda(torch.FloatTensor(np.stack(sentence_same_token_link_matrix, axis=0)))

        return sentences, sentences_char, distance_matrix,
Example No. 7
 def parse_target(batch_data):
     if 'error_line' not in batch_data.keys() or no_target:
         return None
     target_error_position = to_cuda(
         torch.LongTensor(PaddedList(batch_data['error_line'])))
     target_seq = to_cuda(
         torch.LongTensor(
             PaddedList(batch_data['target_line_ids'],
                        fill_value=ignore_id)))
     target_seq = target_seq[:, 1:]
     return target_error_position, target_seq
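Example No. 8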
 def parse_input(batch_data, do_sample=False):
     inputs = to_cuda(torch.LongTensor(PaddedList(batch_data['input_seq'])))
     input_length = to_cuda(
         torch.LongTensor(PaddedList(batch_data['input_length'])))
     if not do_sample:
         targets = to_cuda(
             torch.LongTensor(PaddedList(batch_data['target_seq'])))
         targets_length = to_cuda(
             torch.LongTensor(PaddedList(batch_data['target_length'])))
     else:
         targets = None
         targets_length = None
     return inputs, input_length, targets, targets_length
Example No. 9
 def _forward_pre_process(self, batch_data):
     input_seq = to_cuda(
         torch.LongTensor(PaddedList(batch_data['input_seq'])))
     input_length = to_cuda(torch.LongTensor(batch_data['input_length']))
     decoder_input = to_cuda(
         torch.LongTensor(PaddedList(batch_data['decoder_input'])))
     grammar_index = list(
         more_itertools.flatten(batch_data['grammar_index']))
     grammar_index_length = to_cuda(
         torch.LongTensor([len(t) for t in grammar_index]))
     grammar_index = to_cuda(torch.LongTensor(PaddedList(grammar_index)))
     target_index = batch_data['target_index']
     return input_seq, input_length, decoder_input, grammar_index, grammar_index_length, target_index
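Example No. 10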
    def all_output_and_target_evaluate(model_output, model_target, batch_data):
        p1, p2, is_copy, copy_ids, sample_output, sample_output_ids = model_output
        p1_t, p2_t, is_copy_t, copy_target_t, sample_target_t, sample_small_target_t = model_target

        output_mask = torch.ne(is_copy_t, ignore_token)

        result = torch.eq(p1_t, p1)
        result = result & torch.eq(p2_t, p2)

        # is_copy_ne_count = torch.sum(torch.ne(is_copy, is_copy_t) & output_mask, dim=-1)
        # result = result & torch.eq(is_copy_ne_count, 0)
        #
        # copy_ids_ne_count = torch.sum(torch.ne(copy_ids, copy_target_t) & output_mask, dim=-1)
        # result = result & torch.eq(copy_ids_ne_count, 0)
        #
        # sample_ids_ne_count = torch.sum(torch.ne(sample_output, sample_target_t) & output_mask, dim=-1)
        # result = result & torch.eq(sample_ids_ne_count, 0)

        target_output = torch.LongTensor(
            PaddedList(batch_data['target'],
                       fill_value=ignore_token)).to(p1_t.device)
        sample_ids_ne_count = torch.sum(
            torch.ne(sample_output_ids, target_output[:, 1:]) & output_mask,
            dim=-1)
        result = result & torch.eq(sample_ids_ne_count, 0)
        return result
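Example No. 11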
def parse_graph_input_from_mask_lm_output(input_seq,
                                          input_length,
                                          adj,
                                          use_ast=True):
    from common.problem_util import to_cuda
    from common.util import PaddedList

    def to_long(x):
        return to_cuda(torch.LongTensor(x))

    if not use_ast:
        adjacent_matrix = to_long(adj)
    else:
        adjacent_tuple = [[[i] + tt for tt in t] for i, t in enumerate(adj)]
        adjacent_tuple = [
            list(t) for t in unzip(more_itertools.flatten(adjacent_tuple))
        ]
        size = max(input_length)
        # print("max length in this batch:{}".format(size))
        adjacent_tuple = torch.LongTensor(adjacent_tuple)
        adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
        adjacent_size = torch.Size([len(input_length), size, size])
        # info('batch_data input_length: ' + str(batch_data['input_length']))
        # info('size: ' + str(size))
        # info('adjacent_tuple: ' + str(adjacent_tuple.shape))
        # info('adjacent_size: ' + str(adjacent_size))
        adjacent_matrix = to_cuda(
            torch.sparse.LongTensor(
                adjacent_tuple,
                adjacent_values,
                adjacent_size,
            ).float().to_dense())
    input_seq = to_long(PaddedList(input_seq))
    input_length = to_long(input_length)
    return adjacent_matrix, input_seq, input_length
Example No. 12
    def parse_input_batch_data(batch_data, do_sample=False):
        def to_long(x):
            return to_cuda(torch.LongTensor(x))

        input_seq = to_long(PaddedList(batch_data['input_seq']))
        input_length = to_long(batch_data['input_length'])
        return input_seq, input_length
Example No. 13
 def test_tensor(self):
     ten = torch.Tensor(self.ap)
     print(ten)
     ten = torch.Tensor(self.bp)
     print(ten)
     ep = PaddedList([[4, 44], [5, 55, 555, 5555], [6]], shape=[3, 4])
     ten = torch.Tensor(ep)
     print(ten)
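Taken together, the unit tests in this listing suggest the core behaviour of PaddedList: it wraps a (possibly nested) list, pads it out to the requested shape with fill_value (0 by default), and can be handed directly to torch.Tensor / torch.LongTensor. A minimal sketch of that inferred behaviour follows; it is not part of the original examples and only assumes that common.util.PaddedList is importable.

 # Illustrative sketch only: behaviour inferred from the tests in this listing,
 # not code taken from the original project.
 import torch
 from common.util import PaddedList

 cp = PaddedList([1, 2, 3], shape=[6])          # behaves like [1, 2, 3, 0, 0, 0]
 assert cp.count(0) == 3 and cp.index(0) == 3   # padding positions hold fill_value (default 0)

 bp = PaddedList([[4, 44], [5, 55, 555, 5555], [6]], shape=[3, 4])
 ten = torch.LongTensor(bp)                     # rectangular (3, 4) tensor; short rows padded with 0
 print(ten.shape)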
Example No. 14
 def parse_target_batch_data(batch_data):
     is_copy = to_cuda(
         torch.FloatTensor(
             PaddedList(batch_data['is_copy'], fill_value=ignore_token)))
     target = to_cuda(
         torch.LongTensor(list(more_itertools.flatten(
             batch_data['target']))))
     return is_copy, target
Example No. 15
 def test_reverse(self):
     res = []
     target1 = [1, 2, 3]
     for i in reversed(self.ap):
         res = [i] + res
     for r, t in zip(res, target1):
         self.assertEquals(r, t)
     res = []
     target2 = [
         PaddedList([[1, 11, 111], [2]], shape=[3, 4]),
         PaddedList([[4, 44], [5, 55, 555, 5555], [6]], shape=[3, 4]),
         PaddedList([[6, 66]], shape=[3, 4])
     ]
     for i in reversed(self.bp):
         res = [i] + res
     for r, t in zip(res, target2):
         self.assertEquals(r.shape, t.shape)
         self.assertEquals(r.l, t.l)
         self.assertEquals(r.to_list(), t.to_list())
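Example No. 16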
    def add_result(self,
                   output,
                   model_output,
                   model_target,
                   model_input,
                   ignore_token=None,
                   batch_data=None):
        model_output = [t.data for t in model_output]
        if ignore_token is None:
            ignore_token = self.ignore_token
        is_copy = (torch.sigmoid(model_output[2]) > 0.5).float()
        is_copy_target = model_target[2]
        is_copy_accuracy = self.is_copy_accuracy.add_result(
            is_copy, is_copy_target)
        p0 = torch.topk(F.softmax(model_output[0], dim=-1), dim=-1, k=1)[1]
        p1 = torch.topk(F.softmax(model_output[1], dim=-1), dim=-1, k=1)[1]
        position = torch.cat([p0, p1], dim=1)
        position_target = torch.stack([model_target[0], model_target[1]],
                                      dim=1)
        position_correct = self.position_correct.add_result(
            position, position_target)

        all_output, sample_output_ids = output
        target_output = to_cuda(
            torch.LongTensor(
                PaddedList(batch_data['target'], fill_value=ignore_token)))
        sample_output_ids, target_output = expand_tensor_sequence_to_same(
            sample_output_ids, target_output[:, 1:])
        output_accuracy = self.output_accuracy.add_result(
            sample_output_ids, target_output)

        full_output_target = to_cuda(
            torch.LongTensor(
                PaddedList(batch_data['full_output_target'],
                           fill_value=ignore_token)))
        all_output, full_output_target = expand_tensor_sequence_to_same(
            all_output, full_output_target, fill_value=ignore_token)
        all_correct = self.all_correct.add_result(all_output,
                                                  full_output_target)
        return "is_copy_accuracy evaluate:{}, position_correct evaluate:{}, output_accuracy evaluate:{}, " \
               "all_correct evaluate: {}".format(is_copy_accuracy, position_correct, output_accuracy, all_correct)
Example No. 17
    def parse_input(batch_data, do_sample=False):
        input_seq = to_cuda(
            torch.LongTensor(
                PaddedList(batch_data['error_token_ids'], fill_value=0)))
        input_line_length = to_cuda(
            torch.LongTensor(PaddedList(batch_data['error_line_length'])))
        input_line_token_length = to_cuda(
            torch.LongTensor(PaddedList(
                batch_data['error_line_token_length'])))

        input_length = to_cuda(
            torch.LongTensor(PaddedList(batch_data['error_token_length'])))
        if not use_ast:
            adj_matrix = to_cuda(torch.LongTensor(batch_data['adj']))
        else:
            adjacent_tuple = [[[i] + tt for tt in t]
                              for i, t in enumerate(batch_data['adj'])]
            adjacent_tuple = [
                list(t) for t in unzip(more_itertools.flatten(adjacent_tuple))
            ]
            size = max(batch_data['error_token_length'])
            # print("max length in this batch:{}".format(size))
            adjacent_tuple = torch.LongTensor(adjacent_tuple)
            adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
            adjacent_size = torch.Size(
                [len(batch_data['error_token_length']), size, size])
            info('batch_data input_length: ' +
                 str(batch_data['error_token_length']))
            info('size: ' + str(size))
            info('adjacent_tuple: ' + str(adjacent_tuple.shape))
            info('adjacent_size: ' + str(adjacent_size))
            adj_matrix = to_cuda(
                torch.sparse.LongTensor(
                    adjacent_tuple,
                    adjacent_values,
                    adjacent_size,
                ).float().to_dense())

        if not do_sample:
            target_error_position = to_cuda(
                torch.LongTensor(PaddedList(batch_data['error_line'])))
            target_seq = to_cuda(
                torch.LongTensor(
                    PaddedList(batch_data['target_line_ids'],
                               fill_value=ignore_id)))
            target_length = to_cuda(
                torch.LongTensor(PaddedList(batch_data['target_line_length'])))
        else:
            target_error_position = None
            target_seq = None
            target_length = None

        return input_seq, input_line_length, input_line_token_length, input_length, adj_matrix, target_error_position, target_seq, target_length
Example No. 18
def train(model, dataset, batch_size, loss_function, optimizer):
    print('in train')
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    steps = 0
    model.train()
    for batch_data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        # with torch.autograd.profiler.profile() as prof:
        error_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_tokens'])))
        error_length = trans_to_cuda(torch.LongTensor(batch_data['error_length']))
        ac_tokens_input = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_tokens_input'])))
        ac_tokens_length = trans_to_cuda(torch.LongTensor(batch_data['ac_length']))
        target_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['target_tokens'], fill_value=TARGET_PAD_TOKEN)))


        del batch_data["error_tokens"], batch_data["error_length"], batch_data["ac_tokens_input"], batch_data["ac_length"], batch_data["target_tokens"]

        model.zero_grad()
        log_probs = model.ibm_forward(error_tokens, error_length, ac_tokens_input, ac_tokens_length)
        print('finish one step train')
        loss = loss_function(log_probs.view(log_probs.shape[0]*log_probs.shape[1], -1), target_tokens.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        print('finish optimizer step train')

        cur_target_count = (torch.sum(ac_tokens_length.data.cpu()) - batch_size).float()
        total_loss += (loss.data.cpu() * cur_target_count)
        count += cur_target_count
        steps += 1
        print('step {} loss: {}'.format(steps, loss))
        # print(prof)
        sys.stdout.flush()
        sys.stderr.flush()

    return (total_loss/count).data[0]
Example No. 19
 def test_count(self):
     res = self.bp.count(PaddedList([[6, 66]], shape=[3, 4]))
     self.assertEquals(res, 1)
     res = self.bp.count(PaddedList([[6, 666]], shape=[3, 4]))
     self.assertEquals(res, 0)
     cp = PaddedList([1, 2, 3], shape=[6])
     res = cp.count(0)
     self.assertEquals(res, 3)
     res = cp.count(5)
     self.assertEquals(res, 0)
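Example No. 20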
    def _preprocess(self, batch_data):
        from common.util import PaddedList
        s1 = batch_data["s1"]
        s2 = batch_data['s2']
        batch_size = len(s1)
        s = [t1[:-1]+[self._delimeter_idx]+t2[1:-1]+[self._summary_node_idx] for t1, t2 in zip(s1, s2)]
        # summary_node_index = []
        # for t, i in zip(s, range(batch_size)):
        #     summary_node_index.append(len(t)+i * self._max_length - 1)
        length = [len(t) for t, i in zip(s, range(batch_size))]
        for t in length:
            assert t <= self._max_length

        def to(x):
            return to_cuda(torch.LongTensor(x))

        return [torch.cat([to(PaddedList(s, fill_value=self._pad_idx, shape=[batch_size, self._max_length], )).unsqueeze(-1),
                          to(np.repeat(self._position_range, batch_size, axis=0)).unsqueeze(-1)], dim=-1)]
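Example No. 21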
 def parse_data(df: pd.DataFrame):
     res = []
     for row in df.iterrows():
         row = row[1]
         res.append({
             "text":
             row['tokens'],
             "input":
             row['subtokens_id'],
             "label":
             PaddedList(row['names_id'],
                        fill_value=output_pad_id,
                        shape=[
                            decoder_max_length,
                        ]),
             "length":
             len(row['subtokens_id']),
             "max_decoder_length":
             decoder_max_length,
         })
     return res
Example No. 22
def sequence_transform_data_config2(is_debug, output_log=None):
    from model.encoder_decoder_graph import SequenceEncoderDecoderModelUseEncodePad
    import numpy as np
    from read_data.sequencec_transform_data.load_data import load_generated_random_target_data
    train, valid, test = load_generated_random_target_data(is_debug)
    valid.train = False
    test.train = False
    max_index = 10
    max_length = 20
    begin_index = 11
    end_index = 12
    delimiter_index = 13
    hole_index = 14
    pad_index = 15
    for t in [train, valid, test]:
        t.end = [end_index]
    from model.transformer_lm import dotdict
    from model.encoder_decoder_graph import SequencePreprocesserWithInputPad
    return {
        "model_fn": SequenceEncoderDecoderModelUseEncodePad,
        "model_dict": {
            "cfg": dotdict({
                'n_embd': 768,
                'n_head': 1,
                'n_layer': 1,
                'embd_pdrop': 0.1,
                'attn_pdrop': 0.1,
                'resid_pdrop': 0.1,
                'afn': 'gelu',
                'clf_pdrop': 0.1}),
            "vocab": 16 + max_length*2+4,
            "n_ctx": max_length*2+4,
            "encoder_length": max_length+2,
        },
        "pre_process_module_fn": SequencePreprocesserWithInputPad,
        "pre_process_module_dict": {
            "hole_idx": hole_index,
            "begin_idx":  begin_index,
            "delimeter_idx": delimiter_index,
            "pad_idx": pad_index,
            "max_length": max_length+2,
            "position_embedding_base": 16,
        },
        "data": [train, valid, test],
        "label_preprocess": lambda x: to_cuda(torch.LongTensor([PaddedList(t, fill_value=pad_index, shape=[max_length+1]) for t in x['y']])),
        "batch_size": 800,
        "train_loss": lambda: NCE_train_loss(ignore_index=pad_index),
        "clip_norm": 1,
        "name": "Transformer_seq_to_seq_model_use_random_target_use_encoder_pad",
        "optimizer": OpenAIAdam,
        "need_pad": True,
        "optimizer_dict": {
                           "schedule": 'warmup_linear',
                           "warmup": 0.002,
                           "t_total": (80000//800)*80,
                           "b1": 0.9,
                           "b2": 0.999,
                           "e": 1e-8,
                           "l2": 0.01,
                           "vector_l2": 'store_true',
                           "max_grad_norm": 1},
        "epcohes": 80,
        "lr": 6.25e-4,
        "evaluate_object_list": [SequenceExactMatch(gpu_index=get_gpu_index(), ignore_token=pad_index),
                                 SequenceOutputIDToWord(vocab=None, file_path=output_log, ignore_token=pad_index)],
        "epoch_ratio": 1,
        "scheduler_fn": None
    }
Example No. 23
def evaluate(model, dataset, loss_function, batch_size, start, end, unk, id_to_word_fn, file_path='test.c', use_force_train=False):
    print('in evaluate')
    global print_count
    steps = 0
    success = 0
    total = 0
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    total_correct = torch.Tensor([0])
    total_compare_correct = torch.Tensor([0])
    total_loss_in_train = torch.Tensor([0])
    count_in_train = torch.Tensor([0])
    model.eval()
    for batch_data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        error_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_tokens'])))
        error_length = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_length'])))
        ac_tokens_input = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_tokens_input'])))
        ac_tokens_length = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_length'])))
        target_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['target_tokens'], fill_value=TARGET_PAD_TOKEN)))
        target_tokens_padded = padded_tensor_one_dim_to_length(target_tokens.float(), dim=1,
                                                          padded_length=MAX_LENGTH,
                                                          is_cuda=available_cuda, gpu_index=GPU_INDEX, fill_value=TARGET_PAD_TOKEN).long()
        del batch_data["error_tokens"], batch_data["error_length"], batch_data["ac_tokens_input"], batch_data[
            "ac_length"], batch_data["target_tokens"]
        includes = batch_data['includes']

        loss_in_train = None
        # calculate loss like train
        if use_force_train:
            log_probs = model.ibm_forward(error_tokens, error_length, ac_tokens_input, ac_tokens_length)
            print('finish one step train')
            loss_in_train = loss_function(log_probs.view(log_probs.shape[0] * log_probs.shape[1], -1), target_tokens.view(-1))

            cur_target_count = (torch.sum(ac_tokens_length.data.cpu()) - batch_size).float()
            total_loss_in_train += (loss_in_train.data.cpu() * cur_target_count)
            count_in_train += cur_target_count
        else:
            log_probs = model._ibm_test_forward(error_tokens, error_length)

        # do evaluate
        cur_batch_len = len(batch_data['includes'])

        predict_log_probs = torch.transpose(log_probs, 0, 1)
        target_label = torch.transpose(target_tokens_padded, 0, 1)
        cur_loss = torch.Tensor([0])
        cur_step = torch.Tensor([0])
        cur_correct = torch.Tensor([0])
        is_compare_success = torch.Tensor([1] * batch_size)
        for i, step_output in enumerate(predict_log_probs):
            step_target = target_label[i, :].view(batch_size)
            batch_loss = loss_function(step_output.view(batch_size, -1), step_target)
            batch_predict_label = step_output.view(batch_size, -1).topk(1)[1].view(batch_size)

            in_step_count = step_target.ne(TARGET_PAD_TOKEN).sum().float()
            cur_loss += (batch_loss.data.cpu() * in_step_count.cpu())
            cur_step += in_step_count.data.cpu()
            batch_correct = (step_target.ne(TARGET_PAD_TOKEN) & step_target.eq(batch_predict_label)).sum().cpu().float()
            batch_error = step_target.ne(TARGET_PAD_TOKEN) & step_target.ne(batch_predict_label)
            is_compare_success[batch_error.cpu()] = 0
            if batch_correct > 16:
                print(batch_correct)
            # batch_correct = (step_target.ne(TARGET_PAD_TOKEN) & step_target.eq(step_target)).sum().cpu().float()
            cur_correct += batch_correct
        total_loss += cur_loss
        total_correct += cur_correct
        count += cur_step
        total_compare_correct += is_compare_success.sum().float()

        _, output_tokens = torch.max(log_probs, dim=2)

        cur_success = 0
        for token_ids, include, ac_token_ids in zip(output_tokens, includes, ac_tokens_input):
            if print_count % 100 == 0:
                code = convert_one_token_ids_to_code(token_ids.tolist(), id_to_word_fn, start, end, unk, include)
                ac_code = convert_one_token_ids_to_code(ac_token_ids.tolist(), id_to_word_fn, start, end, unk, include)
                print(code)
                print(ac_code)
            # res = compile_c_code_by_gcc(code, file_path)
            res = False
            if res:
                cur_success += 1
        success += cur_success

        steps += 1
        total += cur_batch_len
        print_count += 1
        print('step {} accuracy: {}, loss: {}, correct: {}, compare correct: {}, loss according train: {}'.format(steps, cur_success/cur_batch_len, (cur_loss/cur_step).data[0], (cur_correct/cur_step).data[0], (is_compare_success.sum()/cur_batch_len).data[0], loss_in_train))
        sys.stdout.flush()
        sys.stderr.flush()
    return (total_loss/count).data[0], float(success/total), (total_correct/count).data[0], (total_compare_correct/total).data[0], (total_loss_in_train/count_in_train).data[0]
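Example No. 24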
 def parse_output(batch_data):
     target_seq = [t[1:] for t in batch_data['target_seq']]
     targets = to_cuda(
         torch.LongTensor(PaddedList(target_seq, fill_value=ignore_id)))
     return [targets]
Example No. 25
def sequence_transform_data_config3(is_debug, output_log=None):
    from model.encoder_decoder_graph import SEDWithInitialStatePreproceser
    import numpy as np
    from read_data.sequencec_transform_data.load_data import load_generated_random_target_data
    train, valid, test = load_generated_random_target_data(is_debug)
    valid.train = False
    test.train = False
    max_index = 10
    def new_id():
        nonlocal max_index
        max_index += 1
        return max_index
    max_length = 20
    begin_index = new_id()
    end_index = new_id()
    delimiter_index = new_id()
    pad_index = new_id()
    decoder_init_idx = new_id()
    for t in [train, valid, test]:
        t.end = [end_index]
    train_size = len(train)
    itr_num = 80
    batch_size = 14
    from model.transformer_lm import dotdict
    from model.encoder_decoder_graph import SEDWithInitialState
    return {
        "model_fn": SEDWithInitialState,
        "model_dict": {
            "cfg": dotdict({
                'n_embd': 768,
                'n_head': 12,
                'n_layer': 12,
                'embd_pdrop': 0.1,
                'attn_pdrop': 0.1,
                'resid_pdrop': 0.1,
                'afn': 'gelu',
                'clf_pdrop': 0.1}),
            "vocab": max_index + 1 + max_length * 2 + 4,
            "n_source_ctx": max_length + 2,
            "n_ctx": max_length * 2 + 4,
            "decoder_init_idx": decoder_init_idx,
        },
        "pre_process_module_fn": SEDWithInitialStatePreproceser,
        "pre_process_module_dict": {
            "begin_idx":  begin_index,
            "delimeter_idx": delimiter_index,
            "summary_idx": decoder_init_idx,
            "pad_idx": pad_index,
            "source_ctx": max_length+2,
            "position_embedding_base": max_index+1,
        },
        "data": [train, valid, test],
        "label_preprocess": lambda x: to_cuda(torch.LongTensor([PaddedList(t, fill_value=pad_index, shape=[max_length+1]) for t in x['y']])),
        "batch_size": batch_size,
        "train_loss": lambda: NCE_train_loss(ignore_index=pad_index),
        "clip_norm": 1,
        "name": "SEDWithInitialState",
        "optimizer": OpenAIAdam,
        "need_pad": True,
        "optimizer_dict": {
                           "schedule": 'warmup_linear',
                           "warmup": 0.002,
                           "t_total": (train_size//batch_size)*itr_num,
                           "b1": 0.9,
                           "b2": 0.999,
                           "e": 1e-8,
                           "l2": 0.01,
                           "vector_l2": 'store_true',
                           "max_grad_norm": 1},
        "epcohes": itr_num,
        "lr": 6.25e-5,
        "evaluate_object_list": [SequenceExactMatch(gpu_index=get_gpu_index(), ignore_token=pad_index),
                                 SequenceOutputIDToWord(vocab=None, file_path=output_log, ignore_token=pad_index)],
        "epoch_ratio": 1,
        "scheduler_fn": None
    }
Example No. 26
def combine_train(p_model,
                  s_model,
                  seq_model,
                  dataset,
                  batch_size,
                  loss_fn,
                  p_optimizer,
                  s_optimizer,
                  delay_reward_fn,
                  baseline_fn,
                  delay_loss_fn,
                  vocab,
                  train_type=None,
                  predict_type='first',
                  include_error_reward=-10000,
                  pretrain=False,
                  random_action=None):
    if train_type == 'p_model':
        change_model_state([p_model], [s_model, seq_model])
        policy_train = True
    elif train_type == 's_model':
        change_model_state([s_model, seq_model], [p_model])
        policy_train = False
    else:
        change_model_state([], [p_model, s_model, seq_model])
        policy_train = False

    begin_tensor = s_model.begin_token
    end_tensor = s_model.end_token
    gap_tensor = s_model.gap_token

    begin_len = 1
    begin_token = vocab.word_to_id(vocab.begin_tokens[0])
    end_token = vocab.word_to_id(vocab.end_tokens[0])
    gap_token = vocab.word_to_id(vocab.addition_tokens[0])
    step = 0
    select_count = torch.LongTensor([0])
    seq_count = torch.LongTensor([0])
    decoder_input_count = torch.LongTensor([0])
    total_seq_loss = torch.Tensor([0])
    total_p_loss = torch.Tensor([0])
    total_s_accuracy_top_k = {}
    for data in data_loader(dataset,
                            batch_size=batch_size,
                            is_shuffle=True,
                            drop_last=True):
        p_model.zero_grad()
        s_model.zero_grad()
        seq_model.zero_grad()

        error_tokens = transform_to_cuda(
            torch.LongTensor(PaddedList(data['error_tokens'])))
        error_length = transform_to_cuda(torch.LongTensor(
            data['error_length']))
        error_action_masks = transform_to_cuda(
            torch.ByteTensor(PaddedList(data['error_mask'], fill_value=0)))

        max_len = torch.max(error_length)
        error_token_masks = create_sequence_length_mask(
            error_length, max_len=max_len.data.item(), gpu_index=gpu_index)

        # add full code context information to each position word using BiRNN.
        context_input, context_hidden = s_model.do_context_rnn(error_tokens)
        # sample the action by interaction between policy model(p_model) and structed model(s_model)
        if not pretrain:
            action_probs_records_list, action_records_list, output_records_list, hidden = create_policy_action_batch(
                p_model, s_model, context_input, policy_train=policy_train)
        else:
            action_probs_records_list, action_records_list, output_records_list, hidden = create_policy_action_batch(
                p_model,
                s_model,
                context_input,
                policy_train=True,
                random_action=[0.8, 0.2])
        action_probs_records = torch.stack(action_probs_records_list, dim=1)
        action_records = torch.stack(action_records_list, dim=1)
        output_records = torch.cat(output_records_list, dim=1)
        masked_action_records = action_records.data.masked_fill_(
            ~error_token_masks, 0)
        if pretrain:
            masked_action_records = error_action_masks.byte(
            ) | masked_action_records.byte()

        include_all_error = check_action_include_all_error(
            masked_action_records, error_action_masks)
        contain_all_error_count = torch.sum(include_all_error)

        tokens_tensor, token_length, part_ac_tokens_list, ac_token_length = combine_spilt_tokens_batch_with_tensor(
            output_records,
            data['ac_tokens'],
            masked_action_records,
            data['token_map'],
            gap_tensor,
            begin_tensor,
            end_tensor,
            gap_token,
            begin_token,
            end_token,
            gpu_index=gpu_index)

        if predict_type == 'start':
            decoder_input = [tokens[:-1] for tokens in part_ac_tokens_list]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [tokens[1:] for tokens in part_ac_tokens_list]
        elif predict_type == 'first':
            decoder_input = [
                tokens[begin_len:-1] for tokens in part_ac_tokens_list
            ]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [
                tokens[begin_len + 1:] for tokens in part_ac_tokens_list
            ]

        token_length_tensor = transform_to_cuda(torch.LongTensor(token_length))
        ac_token_tensor = transform_to_cuda(
            torch.LongTensor(PaddedList(decoder_input, fill_value=0)))
        ac_token_length_tensor = transform_to_cuda(
            torch.LongTensor(decoder_length))
        log_probs = seq_model.forward(tokens_tensor, token_length_tensor,
                                      ac_token_tensor, ac_token_length_tensor)

        target_output_tensor = transform_to_cuda(
            torch.LongTensor(
                PaddedList(target_output, fill_value=TARGET_PAD_TOKEN)))
        s_loss = loss_fn(log_probs.view(-1, vocab.vocabulary_size),
                         target_output_tensor.view(-1))

        remain_batch = torch.sum(masked_action_records, dim=1)
        add_batch = torch.eq(remain_batch, 0).long()
        remain_batch = remain_batch + add_batch
        total_batch = torch.sum(error_token_masks, dim=1)
        force_error_rewards = (
            ~include_all_error).float() * include_error_reward
        delay_reward = delay_reward_fn(log_probs, target_output_tensor,
                                       total_batch, remain_batch,
                                       force_error_rewards)
        delay_reward = torch.unsqueeze(delay_reward, dim=1).expand(-1, max_len)
        delay_reward = delay_reward * error_token_masks.float()

        if baseline_fn is not None:
            baseline_reward = baseline_fn(delay_reward, error_token_masks)
            total_reward = delay_reward - baseline_reward
        else:
            total_reward = delay_reward

        # force_error_rewards = torch.unsqueeze(~include_all_error, dim=1).float() * error_token_masks.float() * include_error_reward
        force_error_rewards = torch.unsqueeze(
            ~include_all_error, dim=1).float() * error_token_masks.float() * 0
        p_loss = delay_loss_fn(action_probs_records, total_reward,
                               error_token_masks, force_error_rewards)

        if math.isnan(p_loss):
            print('p_loss is nan')
            continue
        # iterate record variable
        step += 1
        one_decoder_input_count = torch.sum(ac_token_length_tensor)
        decoder_input_count += one_decoder_input_count.data.cpu()
        total_seq_loss += s_loss.cpu().data.item(
        ) * one_decoder_input_count.float().cpu()

        one_seq_count = torch.sum(error_length)
        seq_count += one_seq_count.cpu()
        total_p_loss += p_loss.cpu().data.item() * one_seq_count.float().cpu()

        s_accuracy_top_k = calculate_accuracy_of_code_completion(
            log_probs,
            target_output_tensor,
            ignore_token=TARGET_PAD_TOKEN,
            topk_range=(1, 5),
            gpu_index=gpu_index)
        for key, value in s_accuracy_top_k.items():
            total_s_accuracy_top_k[key] = s_accuracy_top_k.get(key, 0) + value

        select_count_each_batch = torch.sum(masked_action_records, dim=1)
        select_count = select_count + torch.sum(
            select_count_each_batch).data.cpu()

        print(
            'train_type: {} step {} sequence model loss: {}, policy model loss: {}, contain all error count: {}, select of each batch: {}, total of each batch: {}, total decoder_input_cout: {}, topk: {}, '
            .format(train_type, step, s_loss, p_loss, contain_all_error_count,
                    select_count_each_batch.data.tolist(),
                    error_length.data.tolist(),
                    one_decoder_input_count.data.item(), s_accuracy_top_k))
        sys.stdout.flush()
        sys.stderr.flush()

        if train_type != 'p_model':
            p_model.zero_grad()
        if train_type != 's_model':
            s_model.zero_grad()
            seq_model.zero_grad()

        if train_type == 'p_model':
            torch.nn.utils.clip_grad_norm_(p_model.parameters(), 0.5)
            p_loss.backward()
            p_optimizer.step()
        elif train_type == 's_model':
            torch.nn.utils.clip_grad_norm_(s_model.parameters(), 8)
            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), 8)
            s_loss.backward()
            s_optimizer.step()

    for key, value in total_s_accuracy_top_k.items():
        total_s_accuracy_top_k[key] = total_s_accuracy_top_k.get(
            key, 0) / decoder_input_count.data.item()

    return (total_seq_loss / decoder_input_count.float()).data.item(), (
        total_p_loss / seq_count.float()).data.item(), (
            select_count.float() /
            seq_count.float()).data.item(), total_s_accuracy_top_k
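Example No. 27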
 def parse_target_tensor(batch_data):
     masked_target_seq = to_cuda(
         torch.LongTensor(
             PaddedList(batch_data['target_seq'], fill_value=ignore_id)))
     return [masked_target_seq]
Example No. 28
 def setUp(self):
     self.a = [1, 2, 3]
     self.ap = PaddedList(self.a)
     self.b = [[[1, 11, 111], [2]], [[4, 44], [5, 55, 555, 5555], [6]],
               [[6, 66]]]
     self.bp = PaddedList(self.b)
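Example No. 29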
def evaluate(model,
             dataset,
             batch_size,
             loss_fn,
             id_to_word_fn,
             file_path,
             gap_token,
             begin_tokens,
             end_tokens,
             predict_type,
             use_force_train=False):
    print('in evaluate')
    model.train()
    total_loss_in_train = torch.Tensor([0])
    count = torch.Tensor([0])
    count_in_train = torch.Tensor([0])
    steps = 0

    begin_len = len(begin_tokens) if begin_tokens is not None else 0
    end_len = len(end_tokens) if end_tokens is not None else 0

    for data in data_loader(dataset,
                            batch_size=batch_size,
                            is_shuffle=True,
                            drop_last=True):
        error_tokens = transform_to_cuda(
            torch.LongTensor(PaddedList(data['error_tokens'])))
        error_length = transform_to_cuda(torch.LongTensor(
            data['error_length']))
        ac_tokens_input = transform_to_cuda(
            torch.LongTensor(PaddedList(data['ac_tokens'])))
        ac_tokens_length = transform_to_cuda(
            torch.LongTensor(data['ac_length']))
        token_maps = transform_to_cuda(
            torch.LongTensor(
                PaddedList(data['token_map'], fill_value=TARGET_PAD_TOKEN)))

        # get split of error list. replace it to rl model
        stay_label_list = choose_token_random_batch(data['error_length'],
                                                    data['error_mask'],
                                                    random_value=0.2)

        part_tokens, part_ac_tokens = combine_spilt_tokens_batch(
            data['error_tokens'], data['ac_tokens'], stay_label_list,
            data['token_map'], gap_token, begin_tokens, end_tokens)

        encoder_input = part_tokens
        encoder_length = [len(inp) for inp in encoder_input]

        if use_force_train:
            if predict_type == 'start':
                decoder_input = [tokens[:-1] for tokens in part_ac_tokens]
                decoder_length = [len(inp) for inp in decoder_input]
                target_output = [tokens[1:] for tokens in part_ac_tokens]
            elif predict_type == 'first':
                decoder_input = [
                    tokens[begin_len:-1] for tokens in part_ac_tokens
                ]
                decoder_length = [len(inp) for inp in decoder_input]
                target_output = [
                    tokens[begin_len + 1:] for tokens in part_ac_tokens
                ]

            encoder_input = transform_to_cuda(
                torch.LongTensor(PaddedList(encoder_input)))
            encoder_length = transform_to_cuda(
                torch.LongTensor(encoder_length))
            decoder_input = transform_to_cuda(
                torch.LongTensor(PaddedList(decoder_input)))
            decoder_length = transform_to_cuda(
                torch.LongTensor(decoder_length))
            target_output = PaddedList(target_output,
                                       fill_value=TARGET_PAD_TOKEN)

            log_probs = model.forward(encoder_input, encoder_length,
                                      decoder_input, decoder_length)
            loss = loss_fn(
                log_probs.view(-1, log_probs.shape[-1]),
                transform_to_cuda(torch.LongTensor(target_output)).view(-1))

            cur_target_count = torch.sum(decoder_length.data.cpu()).float()
            total_loss_in_train += (loss.data.cpu() * cur_target_count)

            count_in_train += cur_target_count
        steps += 1

    return (total_loss_in_train / count_in_train).data.item()
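Example No. 30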
def train(model,
          dataset,
          batch_size,
          loss_fn,
          optimizer,
          gap_token,
          begin_tokens,
          end_tokens,
          predict_type='start'):
    print('in train')
    model.train()
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    steps = 0

    begin_len = len(begin_tokens) if begin_tokens is not None else 0
    end_len = len(end_tokens) if end_tokens is not None else 0

    for data in data_loader(dataset,
                            batch_size=batch_size,
                            is_shuffle=True,
                            drop_last=True):
        error_tokens = transform_to_cuda(
            torch.LongTensor(PaddedList(data['error_tokens'])))
        error_length = transform_to_cuda(torch.LongTensor(
            data['error_length']))
        ac_tokens_input = transform_to_cuda(
            torch.LongTensor(PaddedList(data['ac_tokens'])))
        ac_tokens_length = transform_to_cuda(
            torch.LongTensor(data['ac_length']))
        token_maps = transform_to_cuda(
            torch.LongTensor(
                PaddedList(data['token_map'], fill_value=TARGET_PAD_TOKEN)))

        model.zero_grad()

        # get split of error list. replace it to rl model
        stay_label_list = choose_token_random_batch(data['error_length'],
                                                    data['error_mask'],
                                                    random_value=0.2)

        part_tokens, part_ac_tokens = combine_spilt_tokens_batch(
            data['error_tokens'], data['ac_tokens'], stay_label_list,
            data['token_map'], gap_token, begin_tokens, end_tokens)
        print('part_tokens: length: {}/{},{}/{}'.format(
            len(part_tokens[0]), len(data['error_tokens'][0]),
            len(part_tokens[1]), len(data['error_tokens'][1])))
        if predict_type == 'start':
            encoder_input = part_tokens
            encoder_length = [len(inp) for inp in encoder_input]
            decoder_input = [tokens[:-1] for tokens in part_ac_tokens]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [tokens[1:] for tokens in part_ac_tokens]

        elif predict_type == 'first':
            encoder_input = part_tokens
            encoder_length = [len(inp) for inp in encoder_input]
            decoder_input = [tokens[begin_len:-1] for tokens in part_ac_tokens]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [
                tokens[begin_len + 1:] for tokens in part_ac_tokens
            ]

        encoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(encoder_input)))
        encoder_length = transform_to_cuda(torch.LongTensor(encoder_length))
        decoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(decoder_input)))
        decoder_length = transform_to_cuda(torch.LongTensor(decoder_length))
        target_output = PaddedList(target_output, fill_value=TARGET_PAD_TOKEN)

        log_probs = model.forward(encoder_input, encoder_length, decoder_input,
                                  decoder_length)
        loss = loss_fn(
            log_probs.view(-1, log_probs.shape[-1]),
            transform_to_cuda(torch.LongTensor(target_output)).view(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        optimizer.step()

        cur_target_count = torch.sum(decoder_length.data.cpu()).float()
        total_loss += (loss.data.cpu() * cur_target_count)
        count += cur_target_count
        steps += 1

    return (total_loss / count).data.item()