def test_equals(self):
    """__eq__ / __ne__ reflect element content, not object identity."""
    same = PaddedList([[6, 66]], shape=[3, 4])
    self.assertTrue(same == self.bp[2])
    self.assertFalse(same != self.bp[2])
    different = PaddedList([[6, 666]], shape=[3, 4])
    self.assertFalse(different == self.bp[2])
    self.assertTrue(different != self.bp[2])
def test_index(self):
    """index() finds a matching element; a missing value yields -1.

    Uses assertEqual — the assertEquals alias is deprecated and removed
    in Python 3.12.
    """
    res = self.bp.index(PaddedList([[6, 66]], shape=[3, 4]))
    self.assertEqual(res, 2)
    cp = PaddedList([1, 2, 3], shape=[4])
    # the padded tail is filled with 0, so 0 is found at index 3
    res = cp.index(0)
    self.assertEqual(res, 3)
    # unlike list.index, a missing value returns -1 instead of raising
    res = cp.index(5)
    self.assertEqual(res, -1)
def parse_input_tensor(batch_data, do_sample=False):
    """Convert the batch's input/target sequences to cuda LongTensors.

    The two None slots keep the 7-tuple shape the caller expects.
    """
    def as_long(values):
        return to_cuda(torch.LongTensor(values))

    input_seq = as_long(PaddedList(batch_data['input_seq']))
    inp_seq_len = as_long(batch_data['input_seq_len'])
    target_seq = as_long(PaddedList(batch_data['target_seq']))
    target_seq_len = as_long(batch_data['target_seq_len'])
    return (input_seq, inp_seq_len, None, target_seq, target_seq_len, None,
            batch_data['masked_positions'])
def _preprocess(self, batch_data):
    """Pad both sentences and their character sequences, moving each
    resulting LongTensor onto the cuda device.

    Returns (s1, s1_char, s2, s2_char) tensors.
    """
    from common.util import PaddedList

    def pad_words(seq):
        return to_cuda(torch.LongTensor(
            PaddedList(seq, fill_value=self._pad_idx)))

    def pad_chars(seq):
        return to_cuda(torch.LongTensor(
            PaddedList(seq, fill_value=self._character_pad_idx)))

    return (pad_words(batch_data["s1"]),
            pad_chars(batch_data['s1_char']),
            pad_words(batch_data['s2']),
            pad_chars(batch_data['s2_char']))
def parse_target_batch_data(batch_data, ):
    """Build the padded forward/backward target tensors (padding = ignore_id)."""
    def target_tensor(key):
        return to_cuda(torch.LongTensor(
            PaddedList(batch_data[key], fill_value=ignore_id)))

    return target_tensor('forward_target'), target_tensor('backward_target')
def _preprocess(self, batch_data):
    """Join each sentence pair into one padded sequence and build a pairwise
    distance matrix for attention masking.

    Joined layout: s1 + pad-delimiter + s2, plus two trailing summary-node
    slots when ``self._summary_node`` is set.
    Returns (sentences, sentences_char, distance_matrix).
    """
    from common.util import PaddedList
    s1 = batch_data["s1"]
    s2 = batch_data['s2']
    batch_size = len(s1)
    # widest joined sequence in the batch: len(s1) + 1 delimiter + len(s2)
    size = max(len(t1)+len(t2)+1 for t1, t2 in zip(s1, s2))
    if self._summary_node:
        size += 2
    # print("size:{}".format(size))
    if not self._summary_node:
        sentences = to_cuda(torch.LongTensor(
            PaddedList([t1 + [self._pad_idx] + t2 for t1, t2 in zip(s1, s2)],
                       fill_value=self._pad_idx,)))
        sentences_char = to_cuda(torch.LongTensor(
            PaddedList([t1 + [[self._character_pad_idx]] + t2
                        for t1, t2 in zip(batch_data['s1_char'], batch_data['s2_char'])],
                       fill_value=self._character_pad_idx)))
    else:
        # two extra pad tokens at the end serve as per-sentence summary nodes
        sentences = to_cuda(torch.LongTensor(
            PaddedList([t1 + [self._pad_idx] + t2 + [self._pad_idx, self._pad_idx]
                        for t1, t2 in zip(s1, s2)],
                       fill_value=self._pad_idx, )))
        sentences_char = to_cuda(torch.LongTensor(
            PaddedList(
                [t1 + [[self._character_pad_idx]] + t2 + [[self._character_pad_idx], [self._character_pad_idx]]
                 for t1, t2 in zip(batch_data['s1_char'], batch_data['s2_char'])],
                fill_value=self._character_pad_idx)))
    # -inf everywhere except within-sentence entries — softmax-mask style
    distance_matrix = np.ones((batch_size, size, size)) * float('-inf')
    for i, (t1, t2) in enumerate(zip(s1, s2)):
        s1_matrix = util.create_distance_node_matrix(len(t1))
        s2_matrix = util.create_distance_node_matrix(len(t2))
        # s1 occupies [0, len(t1)); s2 occupies [len(t1)+1, len(t1)+len(t2)+1)
        distance_matrix[i, :len(t1), :len(t1)] = s1_matrix
        distance_matrix[i, len(t1)+1:len(t1)+len(t2)+1, len(t1)+1:len(t1)+len(t2)+1] = s2_matrix
        if self._summary_node:
            # let every token of s1/s2 reach its own summary slot (-2 / -1)
            distance_matrix[i, :len(t1), -2] = 0
            distance_matrix[i, len(t1)+1:len(t1)+len(t2)+1, -1] = 0
    distance_matrix = to_cuda(torch.FloatTensor(np.stack(distance_matrix, axis=0)))
    # sentence_same_token_link_matrix = []
    # for t1, t2 in zip(s1, s2):
    #     idx, idy, data = util.create_sentence_pair_same_node_matrix(t1, 0, t2, len(t1)+1)
    #     sentence_same_token_link_matrix.append(
    #         sparse.coo_matrix(
    #             (data, (idx, idy)),
    #             shape=(size, size), dtype=np.float
    #         ).toarray()
    #     )
    # sentence_same_token_link_matrix = to_cuda(torch.FloatTensor(np.stack(sentence_same_token_link_matrix,
    #                                                                      axis=0)))
    return sentences, sentences_char, distance_matrix,
def parse_target(batch_data):
    """Return (error-line positions, target token ids) or None when no
    target is available for this batch.
    """
    if 'error_line' not in batch_data.keys() or no_target:
        return None
    position_tensor = to_cuda(torch.LongTensor(
        PaddedList(batch_data['error_line'])))
    line_ids_tensor = to_cuda(torch.LongTensor(
        PaddedList(batch_data['target_line_ids'], fill_value=ignore_id)))
    # drop the leading token of every target sequence
    return position_tensor, line_ids_tensor[:, 1:]
def parse_input(batch_data, do_sample=False):
    """Pad input (and, unless sampling, target) sequences to cuda tensors."""
    def padded_long(key):
        return to_cuda(torch.LongTensor(PaddedList(batch_data[key])))

    inputs = padded_long('input_seq')
    input_length = padded_long('input_length')
    if do_sample:
        # sampling mode decodes freely, so no teacher-forcing targets
        targets, targets_length = None, None
    else:
        targets = padded_long('target_seq')
        targets_length = padded_long('target_length')
    return inputs, input_length, targets, targets_length
def _forward_pre_process(self, batch_data):
    """Convert one batch into the cuda tensors the forward pass consumes.

    Returns (input_seq, input_length, decoder_input, grammar_index,
    grammar_index_length, target_index).
    """
    input_seq = to_cuda(
        torch.LongTensor(PaddedList(batch_data['input_seq'])))
    input_length = to_cuda(torch.LongTensor(batch_data['input_length']))
    decoder_input = to_cuda(
        torch.LongTensor(PaddedList(batch_data['decoder_input'])))
    # flatten one nesting level; each remaining t is presumably still a
    # list of ids (len(t) below) — TODO confirm against the data pipeline
    grammar_index = list(
        more_itertools.flatten(batch_data['grammar_index']))
    grammar_index_length = to_cuda(
        torch.LongTensor([len(t) for t in grammar_index]))
    grammar_index = to_cuda(torch.LongTensor(PaddedList(grammar_index)))
    target_index = batch_data['target_index']
    return input_seq, input_length, decoder_input, grammar_index, grammar_index_length, target_index
def all_output_and_target_evaluate(model_output, model_target, batch_data):
    """Per-sample correctness: both positions match AND all sampled output
    ids match the (shifted) target at non-ignored positions.

    Returns a boolean tensor with one entry per batch element.
    """
    p1, p2, is_copy, copy_ids, sample_output, sample_output_ids = model_output
    p1_t, p2_t, is_copy_t, copy_target_t, sample_target_t, sample_small_target_t = model_target
    # only positions whose is_copy target is not ignore_token are compared
    output_mask = torch.ne(is_copy_t, ignore_token)
    result = torch.eq(p1_t, p1)
    result = result & torch.eq(p2_t, p2)
    # is_copy_ne_count = torch.sum(torch.ne(is_copy, is_copy_t) & output_mask, dim=-1)
    # result = result & torch.eq(is_copy_ne_count, 0)
    #
    # copy_ids_ne_count = torch.sum(torch.ne(copy_ids, copy_target_t) & output_mask, dim=-1)
    # result = result & torch.eq(copy_ids_ne_count, 0)
    #
    # sample_ids_ne_count = torch.sum(torch.ne(sample_output, sample_target_t) & output_mask, dim=-1)
    # result = result & torch.eq(sample_ids_ne_count, 0)
    target_output = torch.LongTensor(
        PaddedList(batch_data['target'], fill_value=ignore_token)).to(p1_t.device)
    # compare against target shifted left by one (drops the leading token)
    sample_ids_ne_count = torch.sum(
        torch.ne(sample_output_ids, target_output[:, 1:]) & output_mask, dim=-1)
    result = result & torch.eq(sample_ids_ne_count, 0)
    return result
def parse_graph_input_from_mask_lm_output(input_seq, input_length, adj, use_ast=True):
    """Build (adjacency matrix, padded input ids, lengths) as cuda tensors.

    With ``use_ast``, ``adj`` is a per-sample list of edge tuples that is
    densified into a (batch, size, size) float matrix via a sparse tensor.
    """
    from common.problem_util import to_cuda
    from common.util import PaddedList

    def to_long(x):
        return to_cuda(torch.LongTensor(x))

    if not use_ast:
        adjacent_matrix = to_long(adj)
    else:
        # prefix each edge tuple with its batch index -> (batch, row, col)
        adjacent_tuple = [[[i] + tt for tt in t] for i, t in enumerate(adj)]
        # transpose the edge list into coordinate-per-dimension form
        adjacent_tuple = [
            list(t) for t in unzip(more_itertools.flatten(adjacent_tuple))
        ]
        size = max(input_length)
        # print("max length in this batch:{}".format(size))
        adjacent_tuple = torch.LongTensor(adjacent_tuple)
        adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
        adjacent_size = torch.Size([len(input_length), size, size])
        # info('batch_data input_length: ' + str(batch_data['input_length']))
        # info('size: ' + str(size))
        # info('adjacent_tuple: ' + str(adjacent_tuple.shape))
        # info('adjacent_size: ' + str(adjacent_size))
        adjacent_matrix = to_cuda(
            torch.sparse.LongTensor(
                adjacent_tuple,
                adjacent_values,
                adjacent_size,
            ).float().to_dense())
    input_seq = to_long(PaddedList(input_seq))
    input_length = to_long(input_length)
    return adjacent_matrix, input_seq, input_length
def parse_input_batch_data(batch_data, do_sample=False):
    """Pad the input sequence and move both fields onto the cuda device."""
    padded_seq = PaddedList(batch_data['input_seq'])
    input_seq = to_cuda(torch.LongTensor(padded_seq))
    input_length = to_cuda(torch.LongTensor(batch_data['input_length']))
    return input_seq, input_length
def test_tensor(self):
    """PaddedList instances can be consumed directly by torch.Tensor."""
    extra = PaddedList([[4, 44], [5, 55, 555, 5555], [6]], shape=[3, 4])
    for source in (self.ap, self.bp, extra):
        print(torch.Tensor(source))
def parse_target_batch_data(batch_data):
    """is_copy padded with ignore_token; targets flattened to one long vector."""
    is_copy = to_cuda(torch.FloatTensor(
        PaddedList(batch_data['is_copy'], fill_value=ignore_token)))
    flat_targets = list(more_itertools.flatten(batch_data['target']))
    target = to_cuda(torch.LongTensor(flat_targets))
    return is_copy, target
def test_reverse(self):
    """reversed() yields elements back-to-front for flat and nested lists.

    Uses assertEqual — the assertEquals alias is deprecated and removed
    in Python 3.12.
    """
    res = []
    target1 = [1, 2, 3]
    # prepending while iterating reversed() restores the original order
    for i in reversed(self.ap):
        res = [i] + res
    for r, t in zip(res, target1):
        self.assertEqual(r, t)
    res = []
    target2 = [
        PaddedList([[1, 11, 111], [2]], shape=[3, 4]),
        PaddedList([[4, 44], [5, 55, 555, 5555], [6]], shape=[3, 4]),
        PaddedList([[6, 66]], shape=[3, 4])
    ]
    for i in reversed(self.bp):
        res = [i] + res
    for r, t in zip(res, target2):
        self.assertEqual(r.shape, t.shape)
        self.assertEqual(r.l, t.l)
        self.assertEqual(r.to_list(), t.to_list())
def add_result(self, output, model_output, model_target, model_input, ignore_token=None, batch_data=None):
    """Accumulate batch metrics: is_copy accuracy, position correctness,
    sampled-output accuracy, and full-output correctness.

    Returns a human-readable summary string for logging.
    """
    model_output = [t.data for t in model_output]
    if ignore_token is None:
        ignore_token = self.ignore_token
    # model_output[2] holds is_copy logits; threshold at 0.5 after sigmoid
    is_copy = (torch.sigmoid(model_output[2]) > 0.5).float()
    is_copy_target = model_target[2]
    is_copy_accuracy = self.is_copy_accuracy.add_result(
        is_copy, is_copy_target)
    # greedy (top-1) picks from the two position heads
    p0 = torch.topk(F.softmax(model_output[0], dim=-1), dim=-1, k=1)[1]
    p1 = torch.topk(F.softmax(model_output[1], dim=-1), dim=-1, k=1)[1]
    position = torch.cat([p0, p1], dim=1)
    position_target = torch.stack([model_target[0], model_target[1]], dim=1)
    position_correct = self.position_correct.add_result(
        position, position_target)
    all_output, sample_output_ids = output
    target_output = to_cuda(
        torch.LongTensor(
            PaddedList(batch_data['target'], fill_value=ignore_token)))
    # drop the leading target token and pad both sides to equal length
    sample_output_ids, target_output = expand_tensor_sequence_to_same(
        sample_output_ids, target_output[:, 1:])
    output_accuracy = self.output_accuracy.add_result(
        sample_output_ids, target_output)
    full_output_target = to_cuda(
        torch.LongTensor(
            PaddedList(batch_data['full_output_target'],
                       fill_value=ignore_token)))
    all_output, full_output_target = expand_tensor_sequence_to_same(
        all_output, full_output_target, fill_value=ignore_token)
    all_correct = self.all_correct.add_result(all_output, full_output_target)
    return "is_copy_accuracy evaluate:{}, position_correct evaluate:{}, output_accuracy evaluate:{}, " \
           "all_correct evaluate: {}".format(is_copy_accuracy, position_correct, output_accuracy, all_correct)
def parse_input(batch_data, do_sample=False):
    """Build cuda input tensors (plus a dense AST adjacency matrix when
    ``use_ast``) and, unless sampling, the padded target tensors.
    """
    input_seq = to_cuda(
        torch.LongTensor(
            PaddedList(batch_data['error_token_ids'], fill_value=0)))
    input_line_length = to_cuda(
        torch.LongTensor(PaddedList(batch_data['error_line_length'])))
    input_line_token_length = to_cuda(
        torch.LongTensor(PaddedList(
            batch_data['error_line_token_length'])))
    input_length = to_cuda(
        torch.LongTensor(PaddedList(batch_data['error_token_length'])))
    if not use_ast:
        adj_matrix = to_cuda(torch.LongTensor(batch_data['adj']))
    else:
        # prefix each edge tuple with its batch index -> (batch, row, col)
        adjacent_tuple = [[[i] + tt for tt in t]
                          for i, t in enumerate(batch_data['adj'])]
        # transpose the edge list into coordinate-per-dimension form
        adjacent_tuple = [
            list(t) for t in unzip(more_itertools.flatten(adjacent_tuple))
        ]
        size = max(batch_data['error_token_length'])
        # print("max length in this batch:{}".format(size))
        adjacent_tuple = torch.LongTensor(adjacent_tuple)
        adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
        adjacent_size = torch.Size(
            [len(batch_data['error_token_length']), size, size])
        info('batch_data input_length: ' +
             str(batch_data['error_token_length']))
        info('size: ' + str(size))
        info('adjacent_tuple: ' + str(adjacent_tuple.shape))
        info('adjacent_size: ' + str(adjacent_size))
        # densify the sparse edge list into a float adjacency matrix
        adj_matrix = to_cuda(
            torch.sparse.LongTensor(
                adjacent_tuple,
                adjacent_values,
                adjacent_size,
            ).float().to_dense())
    if not do_sample:
        target_error_position = to_cuda(
            torch.LongTensor(PaddedList(batch_data['error_line'])))
        target_seq = to_cuda(
            torch.LongTensor(
                PaddedList(batch_data['target_line_ids'],
                           fill_value=ignore_id)))
        target_length = to_cuda(
            torch.LongTensor(PaddedList(batch_data['target_line_length'])))
    else:
        # sampling mode decodes freely — no teacher-forcing targets
        target_error_position = None
        target_seq = None
        target_length = None
    return input_seq, input_line_length, input_line_token_length, input_length, adj_matrix, target_error_position, target_seq, target_length
def train(model, dataset, batch_size, loss_function, optimizer):
    """One epoch of teacher-forced training; returns the mean per-token loss."""
    print('in train')
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    steps = 0
    model.train()
    for batch_data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        # with torch.autograd.profiler.profile() as prof:
        error_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_tokens'])))
        error_length = trans_to_cuda(torch.LongTensor(batch_data['error_length']))
        ac_tokens_input = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_tokens_input'])))
        ac_tokens_length = trans_to_cuda(torch.LongTensor(batch_data['ac_length']))
        target_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['target_tokens'], fill_value=TARGET_PAD_TOKEN)))
        # free the raw lists early; only the tensors are used from here on
        del batch_data["error_tokens"], batch_data["error_length"], batch_data["ac_tokens_input"], batch_data["ac_length"], batch_data["target_tokens"]
        model.zero_grad()
        log_probs = model.ibm_forward(error_tokens, error_length, ac_tokens_input, ac_tokens_length)
        print('finish one step train')
        # flatten (batch, seq, vocab) logits to (batch*seq, vocab) for the loss
        loss = loss_function(log_probs.view(log_probs.shape[0]*log_probs.shape[1], -1), target_tokens.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        print('finish optimizer step train')
        # subtracting batch_size presumably discounts one special token per
        # sequence from the token count — TODO confirm against the data format
        cur_target_count = (torch.sum(ac_tokens_length.data.cpu()) - batch_size).float()
        total_loss += (loss.data.cpu() * cur_target_count)
        count += cur_target_count
        steps += 1
        print('step {} loss: {}'.format(steps, loss))
        # print(prof)
        sys.stdout.flush()
        sys.stderr.flush()
    # NOTE(review): `.data[0]` is the pre-0.4 PyTorch scalar idiom; newer
    # versions would use `.item()` — confirm the pinned torch version
    return (total_loss/count).data[0]
def test_count(self):
    """count() tallies matching elements, including padded fill values.

    Uses assertEqual — the assertEquals alias is deprecated and removed
    in Python 3.12.
    """
    res = self.bp.count(PaddedList([[6, 66]], shape=[3, 4]))
    self.assertEqual(res, 1)
    res = self.bp.count(PaddedList([[6, 666]], shape=[3, 4]))
    self.assertEqual(res, 0)
    cp = PaddedList([1, 2, 3], shape=[6])
    # three padded tail slots are filled with 0
    res = cp.count(0)
    self.assertEqual(res, 3)
    res = cp.count(5)
    self.assertEqual(res, 0)
def _preprocess(self, batch_data):
    """Concatenate each pair as s1 + delimiter + s2 + summary node, pad to
    ``self._max_length``, and pair every token id with a position id.

    Returns a single-element list holding a (batch, max_length, 2) tensor
    of (token id, position id) pairs.
    """
    from common.util import PaddedList
    s1 = batch_data["s1"]
    s2 = batch_data['s2']
    batch_size = len(s1)
    # slicing strips what are presumably begin/end marker tokens from each
    # sequence before joining — TODO confirm against the data pipeline
    s = [t1[:-1]+[self._delimeter_idx]+t2[1:-1]+[self._summary_node_idx] for t1, t2 in zip(s1, s2)]
    # summary_node_index = []
    # for t, i in zip(s, range(batch_size)):
    #     summary_node_index.append(len(t)+i * self._max_length - 1)
    length = [len(t) for t, i in zip(s, range(batch_size))]
    # every joined sequence must fit the fixed context window
    for t in length:
        assert t <= self._max_length

    def to(x):
        return to_cuda(torch.LongTensor(x))

    return [torch.cat([to(PaddedList(s, fill_value=self._pad_idx,
                                     shape=[batch_size, self._max_length], )).unsqueeze(-1),
                       to(np.repeat(self._position_range, batch_size, axis=0)).unsqueeze(-1)], dim=-1)]
def parse_data(df: pd.DataFrame):
    """Turn each dataframe row into a training-example dict."""
    examples = []
    for _, row in df.iterrows():
        example = {
            "text": row['tokens'],
            "input": row['subtokens_id'],
            "label": PaddedList(row['names_id'], fill_value=output_pad_id,
                                shape=[decoder_max_length, ]),
            "length": len(row['subtokens_id']),
            "max_decoder_length": decoder_max_length,
        }
        examples.append(example)
    return examples
def sequence_transform_data_config2(is_debug, output_log=None):
    """Experiment config: Transformer seq2seq with encoder-side input padding,
    trained on randomly generated transform targets.

    Returns the config dict consumed by the training driver.
    """
    from model.encoder_decoder_graph import SequenceEncoderDecoderModelUseEncodePad
    import numpy as np
    from read_data.sequencec_transform_data.load_data import load_generated_random_target_data
    train, valid, test = load_generated_random_target_data(is_debug)
    valid.train = False
    test.train = False
    # token id layout: 0..10 data tokens, then the special tokens below
    max_index = 10
    max_length = 20
    begin_index = 11
    end_index = 12
    delimiter_index = 13
    hole_index = 14
    pad_index = 15
    for t in [train, valid, test]:
        t.end = [end_index]
    from model.transformer_lm import dotdict
    from model.encoder_decoder_graph import SequencePreprocesserWithInputPad
    return {
        "model_fn": SequenceEncoderDecoderModelUseEncodePad,
        "model_dict": {
            "cfg": dotdict({
                'n_embd': 768,
                'n_head': 1,
                'n_layer': 1,
                'embd_pdrop': 0.1,
                'attn_pdrop': 0.1,
                'resid_pdrop': 0.1,
                'afn': 'gelu',
                'clf_pdrop': 0.1}),
            # 16 token ids plus one position id per context slot
            "vocab": 16 + max_length*2+4,
            "n_ctx": max_length*2+4,
            "encoder_length": max_length+2,
        },
        "pre_process_module_fn": SequencePreprocesserWithInputPad,
        "pre_process_module_dict": {
            "hole_idx": hole_index,
            "begin_idx": begin_index,
            "delimeter_idx": delimiter_index,
            "pad_idx": pad_index,
            "max_length": max_length+2,
            "position_embedding_base": 16,
        },
        "data": [train, valid, test],
        "label_preprocess": lambda x: to_cuda(torch.LongTensor(
            [PaddedList(t, fill_value=pad_index, shape=[max_length+1]) for t in x['y']])),
        "batch_size": 800,
        "train_loss": lambda: NCE_train_loss(ignore_index=pad_index),
        "clip_norm": 1,
        "name": "Transformer_seq_to_seq_model_use_random_target_use_encoder_pad",
        "optimizer": OpenAIAdam,
        "need_pad": True,
        "optimizer_dict": {
            "schedule": 'warmup_linear',
            "warmup": 0.002,
            # total optimizer steps = (dataset size // batch size) * epochs
            "t_total": (80000//800)*80,
            "b1": 0.9,
            "b2": 0.999,
            "e": 1e-8,
            "l2": 0.01,
            "vector_l2": 'store_true',
            "max_grad_norm": 1},
        # (sic) the misspelled key is presumably what the trainer reads —
        # do not "fix" it without updating the consumer
        "epcohes": 80,
        "lr": 6.25e-4,
        "evaluate_object_list": [
            SequenceExactMatch(gpu_index=get_gpu_index(), ignore_token=pad_index),
            SequenceOutputIDToWord(vocab=None, file_path=output_log, ignore_token=pad_index)],
        "epoch_ratio": 1,
        "scheduler_fn": None
    }
def evaluate(model, dataset, loss_function, batch_size, start, end, unk, id_to_word_fn, file_path='test.c', use_force_train=False):
    """Evaluate the seq2seq repair model on ``dataset``.

    Returns (mean loss, compile-success ratio, token accuracy,
    whole-sequence accuracy, teacher-forced loss).
    """
    print('in evaluate')
    global print_count
    steps = 0
    success = 0
    total = 0
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    total_correct = torch.Tensor([0])
    total_compare_correct = torch.Tensor([0])
    total_loss_in_train = torch.Tensor([0])
    count_in_train = torch.Tensor([0])
    model.eval()
    for batch_data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        error_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_tokens'])))
        error_length = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['error_length'])))
        ac_tokens_input = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_tokens_input'])))
        ac_tokens_length = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['ac_length'])))
        target_tokens = trans_to_cuda(torch.LongTensor(PaddedList(batch_data['target_tokens'], fill_value=TARGET_PAD_TOKEN)))
        # fixed-length copy of the targets for the stepwise comparison below
        target_tokens_padded = padded_tensor_one_dim_to_length(target_tokens.float(), dim=1,
                                                               padded_length=MAX_LENGTH,
                                                               is_cuda=available_cuda,
                                                               gpu_index=GPU_INDEX,
                                                               fill_value=TARGET_PAD_TOKEN).long()
        # free the raw lists early; only the tensors are used from here on
        del batch_data["error_tokens"], batch_data["error_length"], batch_data["ac_tokens_input"], batch_data[
            "ac_length"], batch_data["target_tokens"]
        includes = batch_data['includes']
        loss_in_train = None
        # calculate loss like train
        if use_force_train:
            log_probs = model.ibm_forward(error_tokens, error_length, ac_tokens_input, ac_tokens_length)
            print('finish one step train')
            loss_in_train = loss_function(log_probs.view(log_probs.shape[0] * log_probs.shape[1], -1),
                                          target_tokens.view(-1))
            cur_target_count = (torch.sum(ac_tokens_length.data.cpu()) - batch_size).float()
            total_loss_in_train += (loss_in_train.data.cpu() * cur_target_count)
            count_in_train += cur_target_count
        else:
            log_probs = model._ibm_test_forward(error_tokens, error_length)
        # do evaluate
        cur_batch_len = len(batch_data['includes'])
        # iterate time-major: one decoding step across the whole batch at a time
        predict_log_probs = torch.transpose(log_probs, 0, 1)
        target_label = torch.transpose(target_tokens_padded, 0, 1)
        cur_loss = torch.Tensor([0])
        cur_step = torch.Tensor([0])
        cur_correct = torch.Tensor([0])
        is_compare_success = torch.Tensor([1] * batch_size)
        for i, step_output in enumerate(predict_log_probs):
            step_target = target_label[i, :].view(batch_size)
            batch_loss = loss_function(step_output.view(batch_size, -1), step_target)
            batch_predict_label = step_output.view(batch_size, -1).topk(1)[1].view(batch_size)
            # only non-padded target positions contribute to loss/accuracy
            in_step_count = step_target.ne(TARGET_PAD_TOKEN).sum().float()
            cur_loss += (batch_loss.data.cpu() * in_step_count.cpu())
            cur_step += in_step_count.data.cpu()
            batch_correct = (step_target.ne(TARGET_PAD_TOKEN) & step_target.eq(batch_predict_label)).sum().cpu().float()
            # a sample loses "compare success" at its first wrong token
            batch_error = step_target.ne(TARGET_PAD_TOKEN) & step_target.ne(batch_predict_label)
            is_compare_success[batch_error.cpu()] = 0
            if batch_correct > 16:
                print(batch_correct)
            # batch_correct = (step_target.ne(TARGET_PAD_TOKEN) & step_target.eq(step_target)).sum().cpu().float()
            cur_correct += batch_correct
        total_loss += cur_loss
        total_correct += cur_correct
        count += cur_step
        total_compare_correct += is_compare_success.sum().float()
        _, output_tokens = torch.max(log_probs, dim=2)
        cur_success = 0
        for token_ids, include, ac_token_ids in zip(output_tokens, includes, ac_tokens_input):
            if print_count % 100 == 0:
                code = convert_one_token_ids_to_code(token_ids.tolist(), id_to_word_fn, start, end, unk, include)
                ac_code = convert_one_token_ids_to_code(ac_token_ids.tolist(), id_to_word_fn, start, end, unk, include)
                print(code)
                print(ac_code)
            # res = compile_c_code_by_gcc(code, file_path)
            # compilation check disabled, so the success counter stays at zero
            res = False
            if res:
                cur_success += 1
        success += cur_success
        steps += 1
        total += cur_batch_len
        print_count += 1
        # NOTE(review): `.data[0]` is the pre-0.4 PyTorch scalar idiom —
        # confirm the pinned torch version
        print('step {} accuracy: {}, loss: {}, correct: {}, compare correct: {}, loss according train: {}'.format(
            steps, cur_success/cur_batch_len, (cur_loss/cur_step).data[0], (cur_correct/cur_step).data[0],
            (is_compare_success.sum()/cur_batch_len).data[0], loss_in_train))
        sys.stdout.flush()
        sys.stderr.flush()
    return (total_loss/count).data[0], float(success/total), (total_correct/count).data[0], (total_compare_correct/total).data[0], (total_loss_in_train/count_in_train).data[0]
def parse_output(batch_data):
    """Shift targets left by one (drop the leading token), pad with ignore_id."""
    shifted = [seq[1:] for seq in batch_data['target_seq']]
    padded = PaddedList(shifted, fill_value=ignore_id)
    return [to_cuda(torch.LongTensor(padded))]
def sequence_transform_data_config3(is_debug, output_log=None):
    """Experiment config: SEDWithInitialState seq2seq model on randomly
    generated transform targets.

    Returns the config dict consumed by the training driver.
    """
    from model.encoder_decoder_graph import SEDWithInitialStatePreproceser
    import numpy as np
    from read_data.sequencec_transform_data.load_data import load_generated_random_target_data
    train, valid, test = load_generated_random_target_data(is_debug)
    valid.train = False
    test.train = False
    max_index = 10

    def new_id():
        # hand out consecutive special-token ids above the data-token range
        nonlocal max_index
        max_index += 1
        return max_index

    max_length = 20
    begin_index = new_id()
    end_index = new_id()
    delimiter_index = new_id()
    pad_index = new_id()
    decoder_init_idx = new_id()
    for t in [train, valid, test]:
        t.end = [end_index]
    train_size = len(train)
    itr_num = 80
    batch_size = 14
    from model.transformer_lm import dotdict
    from model.encoder_decoder_graph import SEDWithInitialState
    return {
        "model_fn": SEDWithInitialState,
        "model_dict": {
            "cfg": dotdict({
                'n_embd': 768,
                'n_head': 12,
                'n_layer': 12,
                'embd_pdrop': 0.1,
                'attn_pdrop': 0.1,
                'resid_pdrop': 0.1,
                'afn': 'gelu',
                'clf_pdrop': 0.1}),
            # token ids plus one position id per context slot
            "vocab": max_index + 1 + max_length * 2 + 4,
            "n_source_ctx": max_length + 2,
            "n_ctx": max_length * 2 + 4,
            "decoder_init_idx": decoder_init_idx,
        },
        "pre_process_module_fn": SEDWithInitialStatePreproceser,
        "pre_process_module_dict": {
            "begin_idx": begin_index,
            "delimeter_idx": delimiter_index,
            "summary_idx": decoder_init_idx,
            "pad_idx": pad_index,
            "source_ctx": max_length+2,
            "position_embedding_base": max_index+1,
        },
        "data": [train, valid, test],
        "label_preprocess": lambda x: to_cuda(torch.LongTensor(
            [PaddedList(t, fill_value=pad_index, shape=[max_length+1]) for t in x['y']])),
        "batch_size": batch_size,
        "train_loss": lambda: NCE_train_loss(ignore_index=pad_index),
        "clip_norm": 1,
        "name": "SEDWithInitialState",
        "optimizer": OpenAIAdam,
        "need_pad": True,
        "optimizer_dict": {
            "schedule": 'warmup_linear',
            "warmup": 0.002,
            # total optimizer steps = (dataset size // batch size) * epochs
            "t_total": (train_size//batch_size)*itr_num,
            "b1": 0.9,
            "b2": 0.999,
            "e": 1e-8,
            "l2": 0.01,
            "vector_l2": 'store_true',
            "max_grad_norm": 1},
        # (sic) the misspelled key is presumably what the trainer reads —
        # do not "fix" it without updating the consumer
        "epcohes": itr_num,
        "lr": 6.25e-5,
        "evaluate_object_list": [
            SequenceExactMatch(gpu_index=get_gpu_index(), ignore_token=pad_index),
            SequenceOutputIDToWord(vocab=None, file_path=output_log, ignore_token=pad_index)],
        "epoch_ratio": 1,
        "scheduler_fn": None
    }
def combine_train(p_model, s_model, seq_model, dataset, batch_size, loss_fn,
                  p_optimizer, s_optimizer, delay_reward_fn, baseline_fn,
                  delay_loss_fn, vocab, train_type=None, predict_type='first',
                  include_error_reward=-10000, pretrain=False, random_action=None):
    """Jointly train the policy model (p_model) and the structured seq2seq
    models (s_model + seq_model) with delayed rewards.

    Fixes vs. the previous revision:
    - top-k accuracy now accumulates across batches (it previously read
      ``s_accuracy_top_k.get(key, 0) + value``, storing 2x the current
      batch's value and discarding the running total);
    - ``clip_grad_norm_`` is now called AFTER ``backward()`` so it clips the
      freshly computed gradients instead of stale/absent ones.

    Returns (mean seq loss, mean policy loss, mean selection ratio,
    top-k accuracy dict).
    """
    # freeze/unfreeze model groups depending on which side is being trained
    if train_type == 'p_model':
        change_model_state([p_model], [s_model, seq_model])
        policy_train = True
    elif train_type == 's_model':
        change_model_state([s_model, seq_model], [p_model])
        policy_train = False
    else:
        change_model_state([], [p_model, s_model, seq_model])
        policy_train = False
    begin_tensor = s_model.begin_token
    end_tensor = s_model.end_token
    gap_tensor = s_model.gap_token
    begin_len = 1
    begin_token = vocab.word_to_id(vocab.begin_tokens[0])
    end_token = vocab.word_to_id(vocab.end_tokens[0])
    gap_token = vocab.word_to_id(vocab.addition_tokens[0])
    step = 0
    select_count = torch.LongTensor([0])
    seq_count = torch.LongTensor([0])
    decoder_input_count = torch.LongTensor([0])
    total_seq_loss = torch.Tensor([0])
    total_p_loss = torch.Tensor([0])
    total_s_accuracy_top_k = {}
    for data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        p_model.zero_grad()
        s_model.zero_grad()
        seq_model.zero_grad()
        error_tokens = transform_to_cuda(
            torch.LongTensor(PaddedList(data['error_tokens'])))
        error_length = transform_to_cuda(torch.LongTensor(
            data['error_length']))
        error_action_masks = transform_to_cuda(
            torch.ByteTensor(PaddedList(data['error_mask'], fill_value=0)))
        max_len = torch.max(error_length)
        error_token_masks = create_sequence_length_mask(
            error_length, max_len=max_len.data.item(), gpu_index=gpu_index)
        # add full code context information to each position word using BiRNN.
        context_input, context_hidden = s_model.do_context_rnn(error_tokens)
        # sample the action by interaction between policy model(p_model) and structed model(s_model)
        if not pretrain:
            action_probs_records_list, action_records_list, output_records_list, hidden = create_policy_action_batch(
                p_model, s_model, context_input, policy_train=policy_train)
        else:
            action_probs_records_list, action_records_list, output_records_list, hidden = create_policy_action_batch(
                p_model, s_model, context_input, policy_train=True,
                random_action=[0.8, 0.2])
        action_probs_records = torch.stack(action_probs_records_list, dim=1)
        action_records = torch.stack(action_records_list, dim=1)
        output_records = torch.cat(output_records_list, dim=1)
        # zero out sampled actions at padded positions
        masked_action_records = action_records.data.masked_fill_(
            ~error_token_masks, 0)
        if pretrain:
            # during pretraining, force-select every known error position
            masked_action_records = error_action_masks.byte(
            ) | masked_action_records.byte()
        include_all_error = check_action_include_all_error(
            masked_action_records, error_action_masks)
        contain_all_error_count = torch.sum(include_all_error)
        tokens_tensor, token_length, part_ac_tokens_list, ac_token_length = combine_spilt_tokens_batch_with_tensor(
            output_records, data['ac_tokens'], masked_action_records,
            data['token_map'], gap_tensor, begin_tensor, end_tensor,
            gap_token, begin_token, end_token, gpu_index=gpu_index)
        if predict_type == 'start':
            # decoder sees everything but the last token; predicts the next
            decoder_input = [tokens[:-1] for tokens in part_ac_tokens_list]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [tokens[1:] for tokens in part_ac_tokens_list]
        elif predict_type == 'first':
            # skip the begin-token prefix before building decoder IO
            decoder_input = [
                tokens[begin_len:-1] for tokens in part_ac_tokens_list
            ]
            decoder_length = [len(inp) for inp in decoder_input]
            target_output = [
                tokens[begin_len + 1:] for tokens in part_ac_tokens_list
            ]
        token_length_tensor = transform_to_cuda(torch.LongTensor(token_length))
        ac_token_tensor = transform_to_cuda(
            torch.LongTensor(PaddedList(decoder_input, fill_value=0)))
        ac_token_length_tensor = transform_to_cuda(
            torch.LongTensor(decoder_length))
        log_probs = seq_model.forward(tokens_tensor, token_length_tensor,
                                      ac_token_tensor, ac_token_length_tensor)
        target_output_tensor = transform_to_cuda(
            torch.LongTensor(
                PaddedList(target_output, fill_value=TARGET_PAD_TOKEN)))
        s_loss = loss_fn(log_probs.view(-1, vocab.vocabulary_size),
                         target_output_tensor.view(-1))
        # guard against division by zero when no position was selected
        remain_batch = torch.sum(masked_action_records, dim=1)
        add_batch = torch.eq(remain_batch, 0).long()
        remain_batch = remain_batch + add_batch
        total_batch = torch.sum(error_token_masks, dim=1)
        # heavily punish action sets that miss a known error position
        force_error_rewards = (
            ~include_all_error).float() * include_error_reward
        delay_reward = delay_reward_fn(log_probs, target_output_tensor,
                                       total_batch, remain_batch,
                                       force_error_rewards)
        delay_reward = torch.unsqueeze(delay_reward, dim=1).expand(-1, max_len)
        delay_reward = delay_reward * error_token_masks.float()
        if baseline_fn is not None:
            baseline_reward = baseline_fn(delay_reward, error_token_masks)
            total_reward = delay_reward - baseline_reward
        else:
            total_reward = delay_reward
        # force_error_rewards = torch.unsqueeze(~include_all_error, dim=1).float() * error_token_masks.float() * include_error_reward
        force_error_rewards = torch.unsqueeze(
            ~include_all_error, dim=1).float() * error_token_masks.float() * 0
        p_loss = delay_loss_fn(action_probs_records, total_reward,
                               error_token_masks, force_error_rewards)
        if math.isnan(p_loss):
            print('p_loss is nan')
            continue
        # iterate record variable
        step += 1
        one_decoder_input_count = torch.sum(ac_token_length_tensor)
        decoder_input_count += one_decoder_input_count.data.cpu()
        total_seq_loss += s_loss.cpu().data.item(
        ) * one_decoder_input_count.float().cpu()
        one_seq_count = torch.sum(error_length)
        seq_count += one_seq_count.cpu()
        total_p_loss += p_loss.cpu().data.item() * one_seq_count.float().cpu()
        s_accuracy_top_k = calculate_accuracy_of_code_completion(
            log_probs,
            target_output_tensor,
            ignore_token=TARGET_PAD_TOKEN,
            topk_range=(1, 5),
            gpu_index=gpu_index)
        for key, value in s_accuracy_top_k.items():
            # accumulate into the running total (bug fix: previously read
            # from s_accuracy_top_k, doubling the batch value instead)
            total_s_accuracy_top_k[key] = total_s_accuracy_top_k.get(key, 0) + value
        select_count_each_batch = torch.sum(masked_action_records, dim=1)
        select_count = select_count + torch.sum(
            select_count_each_batch).data.cpu()
        print(
            'train_type: {} step {} sequence model loss: {}, policy model loss: {}, contain all error count: {}, select of each batch: {}, total of each batch: {}, total decoder_input_cout: {}, topk: {}, '
            .format(train_type, step, s_loss, p_loss, contain_all_error_count,
                    select_count_each_batch.data.tolist(),
                    error_length.data.tolist(),
                    one_decoder_input_count.data.item(), s_accuracy_top_k))
        sys.stdout.flush()
        sys.stderr.flush()
        # clear gradients of the models that are NOT being trained
        if train_type != 'p_model':
            p_model.zero_grad()
        if train_type != 's_model':
            s_model.zero_grad()
            seq_model.zero_grad()
        if train_type == 'p_model':
            # bug fix: clip AFTER backward so the fresh gradients are clipped
            p_loss.backward()
            torch.nn.utils.clip_grad_norm_(p_model.parameters(), 0.5)
            p_optimizer.step()
        elif train_type == 's_model':
            s_loss.backward()
            torch.nn.utils.clip_grad_norm_(s_model.parameters(), 8)
            torch.nn.utils.clip_grad_norm_(seq_model.parameters(), 8)
            s_optimizer.step()
    # normalize accumulated metrics by total token/sequence counts
    for key, value in total_s_accuracy_top_k.items():
        total_s_accuracy_top_k[key] = total_s_accuracy_top_k.get(
            key, 0) / decoder_input_count.data.item()
    return (total_seq_loss / decoder_input_count.float()).data.item(), (
        total_p_loss / seq_count.float()).data.item(), (
            select_count.float() /
            seq_count.float()).data.item(), total_s_accuracy_top_k
def parse_target_tensor(batch_data):
    """Pad the masked target sequence with ignore_id and move it to cuda."""
    padded = PaddedList(batch_data['target_seq'], fill_value=ignore_id)
    return [to_cuda(torch.LongTensor(padded))]
def setUp(self):
    """Create one flat and one nested PaddedList fixture for the tests."""
    self.a = [1, 2, 3]
    self.ap = PaddedList(self.a)
    self.b = [[[1, 11, 111], [2]],
              [[4, 44], [5, 55, 555, 5555], [6]],
              [[6, 66]]]
    self.bp = PaddedList(self.b)
def evaluate(model, dataset, batch_size, loss_fn, id_to_word_fn, file_path, gap_token, begin_tokens, end_tokens, predict_type, use_force_train=False):
    # Compute the teacher-forced loss of `model` over `dataset` and return the
    # token-count-weighted mean loss as a float.
    # NOTE(review): `id_to_word_fn` and `file_path` are never used in this
    # body — presumably leftovers from a variant that decoded and wrote
    # predictions to disk; confirm before removing.
    print('in evaluate')
    # NOTE(review): this is an evaluation routine, yet the model is put into
    # *train* mode (dropout/batch-norm active) and there is no torch.no_grad()
    # guard, so autograd state is built for every batch. The names
    # `total_loss_in_train` / `count_in_train` suggest "loss measured in train
    # mode" may be intentional — confirm before changing to model.eval().
    model.train()
    total_loss_in_train = torch.Tensor([0])
    count = torch.Tensor([0])  # NOTE(review): never updated or read again.
    count_in_train = torch.Tensor([0])
    steps = 0
    # Lengths of the optional wrapper token lists (0 when the list is absent).
    begin_len = len(begin_tokens) if begin_tokens is not None else 0
    end_len = len(end_tokens) if end_tokens is not None else 0  # NOTE(review): unused.
    for data in data_loader(dataset, batch_size=batch_size, is_shuffle=True, drop_last=True):
        # NOTE(review): the five CUDA tensors below are built but never used
        # afterwards in this function — dead transfers, kept byte-for-byte.
        error_tokens = transform_to_cuda(
            torch.LongTensor(PaddedList(data['error_tokens'])))
        error_length = transform_to_cuda(torch.LongTensor(
            data['error_length']))
        ac_tokens_input = transform_to_cuda(
            torch.LongTensor(PaddedList(data['ac_tokens'])))
        ac_tokens_length = transform_to_cuda(
            torch.LongTensor(data['ac_length']))
        token_maps = transform_to_cuda(
            torch.LongTensor(
                PaddedList(data['token_map'], fill_value=TARGET_PAD_TOKEN)))
        # get split of error list. replace it to rl model
        stay_label_list = choose_token_random_batch(data['error_length'],
                                                    data['error_mask'],
                                                    random_value=0.2)
        part_tokens, part_ac_tokens = combine_spilt_tokens_batch(
            data['error_tokens'], data['ac_tokens'], stay_label_list,
            data['token_map'], gap_token, begin_tokens, end_tokens)
        encoder_input = part_tokens
        encoder_length = [len(inp) for inp in encoder_input]
        if use_force_train:
            if predict_type == 'start':
                # Standard teacher forcing: decoder input is the sequence
                # shifted left by one, target is the sequence shifted right.
                decoder_input = [tokens[:-1] for tokens in part_ac_tokens]
                decoder_length = [len(inp) for inp in decoder_input]
                target_output = [tokens[1:] for tokens in part_ac_tokens]
            elif predict_type == 'first':
                # Same shift, but the leading begin-token wrapper is stripped.
                decoder_input = [
                    tokens[begin_len:-1] for tokens in part_ac_tokens
                ]
                decoder_length = [len(inp) for inp in decoder_input]
                target_output = [
                    tokens[begin_len + 1:] for tokens in part_ac_tokens
                ]
        # NOTE(review): when use_force_train is False (the default!) the code
        # below still references decoder_input / decoder_length /
        # target_output, which were never assigned -> NameError. Either the
        # non-forced evaluation path was never implemented here, or callers
        # always pass use_force_train=True — confirm against call sites.
        encoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(encoder_input)))
        encoder_length = transform_to_cuda(
            torch.LongTensor(encoder_length))
        decoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(decoder_input)))
        decoder_length = transform_to_cuda(
            torch.LongTensor(decoder_length))
        target_output = PaddedList(target_output, fill_value=TARGET_PAD_TOKEN)
        log_probs = model.forward(encoder_input, encoder_length,
                                  decoder_input, decoder_length)
        loss = loss_fn(
            log_probs.view(-1, log_probs.shape[-1]),
            transform_to_cuda(torch.LongTensor(target_output)).view(-1))
        # Accumulate the batch loss weighted by its number of target tokens,
        # so the final mean is per-token rather than per-batch.
        cur_target_count = torch.sum(decoder_length.data.cpu()).float()
        total_loss_in_train += (loss.data.cpu() * cur_target_count)
        count_in_train += cur_target_count
        steps += 1
    # Token-weighted mean loss over all batches seen.
    return (total_loss_in_train / count_in_train).data.item()
def train(model, dataset, batch_size, loss_fn, optimizer, gap_token,
          begin_tokens, end_tokens, predict_type='start'):
    """Run one epoch of teacher-forced training and return the mean loss.

    Args:
        model: seq2seq model exposing
            ``forward(encoder_input, encoder_length, decoder_input, decoder_length)``.
        dataset: dataset accepted by ``data_loader``.
        batch_size: mini-batch size; the incomplete last batch is dropped.
        loss_fn: loss over flattened ``(tokens, vocab)`` log-probabilities.
        optimizer: optimizer stepping ``model.parameters()``.
        gap_token: separator token used by ``combine_spilt_tokens_batch``.
        begin_tokens: optional leading wrapper token list (may be None).
        end_tokens: optional trailing wrapper token list (may be None).
        predict_type: 'start' — decoder consumes the full shifted sequence;
            'first' — the leading ``len(begin_tokens)`` wrapper tokens are
            stripped from decoder input and target.

    Returns:
        float: target-token-count-weighted mean loss over the epoch.

    Raises:
        ValueError: for an unrecognized ``predict_type`` (the original code
            fell through and crashed later with a ``NameError``).
    """
    print('in train')
    model.train()
    total_loss = torch.Tensor([0])
    count = torch.Tensor([0])
    begin_len = len(begin_tokens) if begin_tokens is not None else 0
    for data in data_loader(dataset, batch_size=batch_size, is_shuffle=True,
                            drop_last=True):
        # NOTE: the original also moved data['error_tokens'],
        # data['ac_tokens'], their lengths and data['token_map'] to CUDA here,
        # but never used those tensors — the dead transfers are removed.
        model.zero_grad()
        # Randomly pick which error tokens to keep (stand-in for the RL model).
        stay_label_list = choose_token_random_batch(data['error_length'],
                                                    data['error_mask'],
                                                    random_value=0.2)
        part_tokens, part_ac_tokens = combine_spilt_tokens_batch(
            data['error_tokens'], data['ac_tokens'], stay_label_list,
            data['token_map'], gap_token, begin_tokens, end_tokens)
        print('part_tokens: length: {}/{},{}/{}'.format(
            len(part_tokens[0]), len(data['error_tokens'][0]),
            len(part_tokens[1]), len(data['error_tokens'][1])))
        # The encoder always consumes the combined partial tokens; only the
        # decoder's shift offset depends on predict_type (both branches in the
        # original duplicated the encoder assignments — hoisted here).
        encoder_input = part_tokens
        encoder_length = [len(inp) for inp in encoder_input]
        if predict_type == 'start':
            offset = 0
        elif predict_type == 'first':
            offset = begin_len
        else:
            raise ValueError(
                "predict_type must be 'start' or 'first', got {!r}".format(
                    predict_type))
        # Teacher forcing: decoder input drops the last token, target drops
        # the first; `offset` additionally strips the begin-token wrapper.
        decoder_input = [tokens[offset:-1] for tokens in part_ac_tokens]
        decoder_length = [len(inp) for inp in decoder_input]
        target_output = [tokens[offset + 1:] for tokens in part_ac_tokens]
        encoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(encoder_input)))
        encoder_length = transform_to_cuda(torch.LongTensor(encoder_length))
        decoder_input = transform_to_cuda(
            torch.LongTensor(PaddedList(decoder_input)))
        decoder_length = transform_to_cuda(torch.LongTensor(decoder_length))
        target_output = PaddedList(target_output, fill_value=TARGET_PAD_TOKEN)
        log_probs = model.forward(encoder_input, encoder_length,
                                  decoder_input, decoder_length)
        loss = loss_fn(
            log_probs.view(-1, log_probs.shape[-1]),
            transform_to_cuda(torch.LongTensor(target_output)).view(-1))
        loss.backward()
        # Clip gradients before the step to keep updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        optimizer.step()
        # Weight the running loss by the number of real target tokens so the
        # returned value is a per-token mean.
        cur_target_count = torch.sum(decoder_length.data.cpu()).float()
        total_loss += (loss.data.cpu() * cur_target_count)
        count += cur_target_count
    return (total_loss / count).data.item()