def evaluate_ppl(model: RenamingModel, dataset: Dataset, config: Dict,
                 predicate: Any = None):
    """Compute corpus perplexity of the gold renamings under ``model``.

    Args:
        model: trained renaming model; restored to its original
            train/eval mode before returning.
        dataset: evaluation dataset.
        config: experiment config; batching settings are read from
            ``config['train']``.
        predicate: optional filter applied to each example's ``test_meta``
            entry; only examples for which it returns True contribute to
            the perplexity. Defaults to accepting everything.

    Returns:
        ``exp(-mean log prob)`` over the accepted examples, or ``nan``
        when no example matches ``predicate``.
    """
    if predicate is None:
        # Accept every example when no filter is supplied.
        def predicate(_):
            return True

    eval_batch_size = config['train']['batch_size']
    num_readers = config['train']['num_readers']
    num_batchers = config['train']['num_batchers']
    data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                       train=False,
                                       progress=True,
                                       return_examples=False,
                                       return_prediction_target=True,
                                       config=model.config,
                                       num_readers=num_readers,
                                       num_batchers=num_batchers)

    was_training = model.training
    model.eval()
    cum_log_probs = 0.
    cum_num_examples = 0
    with torch.no_grad():
        for batch in data_iter:
            td = batch.tensor_dict
            nn_util.to(td, model.device)
            result = model(td, td['prediction_target'])
            # Per-example log probability of the gold renaming.
            log_probs = result['batch_log_prob'].cpu().tolist()
            for e_id, test_meta in enumerate(td['test_meta']):
                if predicate(test_meta):
                    cum_log_probs += log_probs[e_id]
                    cum_num_examples += 1

    # Robustness fix: the original divided unconditionally and raised
    # ZeroDivisionError when the predicate rejected every example.
    ppl = (np.exp(-cum_log_probs / cum_num_examples)
           if cum_num_examples else float('nan'))

    if was_training:
        model.train()

    return ppl
def decode(model: RenamingModel, dataset: Dataset, config: Dict,
           eval_batch_size=None):
    """Predict renamings for every example in ``dataset``.

    Returns a dict keyed by ``"{file_name}_{line_num}_{function}"`` whose
    values are ``(rename_result, averaged soft metrics)`` pairs.
    """
    train_cfg = config['train']
    if eval_batch_size is None:
        eval_batch_size = train_cfg.get('eval_batch_size',
                                        train_cfg['batch_size'])

    data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                       train=False,
                                       progress=True,
                                       return_examples=True,
                                       config=model.config,
                                       num_readers=train_cfg['num_readers'],
                                       num_batchers=train_cfg['num_batchers'])

    model.eval()
    all_examples = {}

    with torch.no_grad():
        for batch in data_iter:
            predictions = model.predict(batch.examples)
            for example, rename_result in zip(batch.examples, predictions):
                # Score only the top-ranked hypothesis against the gold map.
                best = rename_result[0]
                metrics = [
                    Evaluator.get_soft_metrics(best[old_name]['new_name'],
                                               gold_new_name)
                    for old_name, gold_new_name
                    in example.variable_name_map.items()
                ]
                key = (f"{example.binary_file['file_name']}_"
                       f"{example.binary_file['line_num']}_"
                       f"{example.ast.compilation_unit}")
                all_examples[key] = (rename_result,
                                     Evaluator.average(metrics))

    return all_examples
def decode_and_evaluate(model: RenamingModel,
                        dataset: Dataset,
                        config: Dict,
                        return_results=False,
                        eval_batch_size=None,
                        approx=False):
    """Decode renamings for ``dataset`` and compute soft accuracy metrics.

    Handles models wrapped in ``torch.nn.DataParallel`` by unwrapping via
    ``model.module`` for config access and prediction.

    Args:
        model: trained renaming model (possibly a ``DataParallel`` wrapper);
            restored to its original train/eval mode before returning.
        dataset: evaluation dataset.
        config: experiment config; batching settings come from
            ``config['train']``.
        return_results: when True, also return the per-example predictions.
        eval_batch_size: overrides the batch size from ``config`` if given.
        approx: forwarded as ``truncate=`` to the batch iterator
            (approximate evaluation on truncated sequences).

    Returns:
        A metrics dict, or ``(metrics dict, per-example results dict)`` when
        ``return_results`` is True.
    """
    if eval_batch_size is None:
        eval_batch_size = config['train'][
            'eval_batch_size'] if 'eval_batch_size' in config[
                'train'] else config['train']['batch_size']
    data_iter = dataset.batch_iterator(
        batch_size=eval_batch_size,
        train=False,
        progress=True,
        return_examples=True,
        max_seq_len=512,
        # DataParallel hides the underlying model behind `.module`.
        config=model.module.config
        if isinstance(model, torch.nn.DataParallel) else model.config,
        num_readers=config['train']['num_readers'],
        num_batchers=config['train']['num_batchers'],
        truncate=approx)
    was_training = model.training
    model.eval()
    # Accumulators: per-example metric lists, flat per-variable metrics, and
    # slices by whether the variable needed renaming / whether the function
    # name or body was seen during training.
    example_acc_list = []
    variable_acc_list = []
    need_rename_cases = []
    func_name_in_train_acc_list = []
    func_name_not_in_train_acc_list = []
    func_body_in_train_acc_list = []
    func_body_not_in_train_acc_list = []
    all_examples = dict()
    with torch.no_grad():
        for i, batch in enumerate(data_iter):
            examples = batch.examples
            if isinstance(model, torch.nn.DataParallel):
                rename_results = model.module.predict(examples)
            else:
                rename_results = model.predict(examples)
            for example, rename_result in zip(examples, rename_results):
                example_pred_accs = []
                # Only the top-ranked hypothesis is scored.
                top_rename_result = rename_result[0]
                for old_name, gold_new_name in example.variable_name_map.items(
                ):
                    pred = top_rename_result[old_name]
                    pred_new_name = pred['new_name']
                    var_metric = Evaluator.get_soft_metrics(
                        pred_new_name, gold_new_name)
                    # is_correct = pred_new_name == gold_new_name
                    example_pred_accs.append(var_metric)
                    # Variables whose gold name differs from the decompiler
                    # name are the ones that actually needed renaming.
                    if gold_new_name != old_name:
                        # and gold_new_name in model.vocab.target:
                        need_rename_cases.append(var_metric)
                        if example.test_meta['function_name_in_train']:
                            func_name_in_train_acc_list.append(var_metric)
                        else:
                            func_name_not_in_train_acc_list.append(
                                var_metric)
                        if example.test_meta['function_body_in_train']:
                            func_body_in_train_acc_list.append(var_metric)
                        else:
                            func_body_not_in_train_acc_list.append(
                                var_metric)
                variable_acc_list.extend(example_pred_accs)
                example_acc_list.append(example_pred_accs)
                if return_results:
                    all_examples[example.binary_file['file_name'] + '_' +
                                 str(example.binary_file['line_num'])] = (
                                     rename_result,
                                     Evaluator.average(example_pred_accs))
                    # all_examples.append((example, rename_result, example_pred_accs))
    valid_example_num = len(example_acc_list)
    num_variables = len(variable_acc_list)
    corpus_acc = Evaluator.average(variable_acc_list)
    if was_training:
        model.train()
    eval_results = dict(
        corpus_acc=corpus_acc,
        corpus_need_rename_acc=Evaluator.average(need_rename_cases),
        func_name_in_train_acc=Evaluator.average(
            func_name_in_train_acc_list),
        func_name_not_in_train_acc=Evaluator.average(
            func_name_not_in_train_acc_list),
        func_body_in_train_acc=Evaluator.average(
            func_body_in_train_acc_list),
        func_body_not_in_train_acc=Evaluator.average(
            func_body_not_in_train_acc_list),
        num_variables=num_variables,
        num_valid_examples=valid_example_num)
    if return_results:
        return eval_results, all_examples
    return eval_results
def decode_and_evaluate(model: RenamingModel, dataset: Dataset, config: Dict,
                        return_results=False, eval_batch_size=None):
    """Decode renamings for ``dataset`` and compute soft accuracy metrics.

    Returns a metrics dict, or ``(metrics dict, per-example results)`` when
    ``return_results`` is True. The model is restored to its original
    train/eval mode before returning.
    """
    train_cfg = config['train']
    if eval_batch_size is None:
        eval_batch_size = train_cfg.get('eval_batch_size',
                                        train_cfg['batch_size'])

    data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                       train=False,
                                       progress=True,
                                       return_examples=True,
                                       config=model.config,
                                       num_readers=train_cfg['num_readers'],
                                       num_batchers=train_cfg['num_batchers'])

    was_training = model.training
    model.eval()

    # Accumulators: per-example metric lists, flat per-variable metrics, and
    # slices over variables that actually needed renaming.
    example_acc_list = []
    variable_acc_list = []
    need_rename_cases = []
    func_name_in_train_acc = []
    func_name_not_in_train_acc = []
    func_body_in_train_acc = []
    func_body_not_in_train_acc = []
    all_examples = {}

    with torch.no_grad():
        for batch in data_iter:
            predictions = model.predict(batch.examples)
            for example, rename_result in zip(batch.examples, predictions):
                # Score only the top-ranked hypothesis.
                best = rename_result[0]
                per_var_metrics = []
                for old_name, gold_new_name \
                        in example.variable_name_map.items():
                    metric = Evaluator.get_soft_metrics(
                        best[old_name]['new_name'], gold_new_name)
                    per_var_metrics.append(metric)
                    # Only variables whose gold name differs from the
                    # decompiler-assigned name needed renaming.
                    if gold_new_name == old_name:
                        continue
                    need_rename_cases.append(metric)
                    meta = example.test_meta
                    target = (func_name_in_train_acc
                              if meta['function_name_in_train']
                              else func_name_not_in_train_acc)
                    target.append(metric)
                    target = (func_body_in_train_acc
                              if meta['function_body_in_train']
                              else func_body_not_in_train_acc)
                    target.append(metric)

                variable_acc_list.extend(per_var_metrics)
                example_acc_list.append(per_var_metrics)
                if return_results:
                    key = (f"{example.binary_file['file_name']}_"
                           f"{example.binary_file['line_num']}")
                    all_examples[key] = (rename_result,
                                         Evaluator.average(per_var_metrics))

    if was_training:
        model.train()

    eval_results = dict(
        corpus_acc=Evaluator.average(variable_acc_list),
        corpus_need_rename_acc=Evaluator.average(need_rename_cases),
        func_name_in_train_acc=Evaluator.average(func_name_in_train_acc),
        func_name_not_in_train_acc=Evaluator.average(
            func_name_not_in_train_acc),
        func_body_in_train_acc=Evaluator.average(func_body_in_train_acc),
        func_body_not_in_train_acc=Evaluator.average(
            func_body_not_in_train_acc),
        num_variables=len(variable_acc_list),
        num_valid_examples=len(example_acc_list))

    if return_results:
        return eval_results, all_examples
    return eval_results
def train(args):
    """Train a renaming model from a docopt-style ``args`` mapping.

    Reads the jsonnet config named by ``args['CONFIG_FILE']``, optionally
    merged with ``args['--extra-config']`` JSON, trains with Adam, and
    periodically evaluates on the dev set, keeping the best checkpoint in
    ``args['--work-dir']``. Exits the process on max patience or max epoch.
    """
    work_dir = args['--work-dir']
    config = json.loads(_jsonnet.evaluate_file(args['CONFIG_FILE']))
    config['work_dir'] = work_dir

    if not os.path.exists(work_dir):
        print(f'creating work dir [{work_dir}]', file=sys.stderr)
        os.makedirs(work_dir)

    # Extra config is a JSON string that overrides entries of the base config.
    if args['--extra-config']:
        extra_config = args['--extra-config']
        extra_config = json.loads(extra_config)
        config = util.update(config, extra_config)

    # Persist the effective config next to the checkpoints.
    json.dump(config,
              open(os.path.join(work_dir, 'config.json'), 'w'),
              indent=2)

    model = RenamingModel.build(config)
    config = model.config
    model.train()

    if args['--cuda']:
        model = model.cuda()

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=0.001)
    nn_util.glorot_init(params)

    # set the padding index for embedding layers to zeros
    # model.encoder.var_node_name_embedding.weight[0].fill_(0.)

    train_set = Dataset(config['data']['train_file'])
    dev_set = Dataset(config['data']['dev_file'])
    batch_size = config['train']['batch_size']

    print(f'Training set size {len(train_set)}, dev set size {len(dev_set)}',
          file=sys.stderr)

    # training loop
    train_iter = epoch = cum_examples = 0
    log_every = config['train']['log_every']
    evaluate_every_nepoch = config['train']['evaluate_every_nepoch']
    max_epoch = config['train']['max_epoch']
    max_patience = config['train']['patience']
    cum_loss = 0.
    patience = 0.
    t_log = time.time()

    history_accs = []
    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            num_readers=config['train']['num_readers'],
            num_batchers=config['train']['num_batchers'])
        epoch += 1

        for batch in train_set_iter:
            train_iter += 1
            optimizer.zero_grad()

            # t1 = time.time()
            nn_util.to(batch.tensor_dict, model.device)
            # print(f'[Learner] {time.time() - t1}s took for moving tensors to device', file=sys.stderr)

            # t1 = time.time()
            result = model(batch.tensor_dict,
                           batch.tensor_dict['prediction_target'])
            # print(f'[Learner] batch {train_iter}, {batch.size} examples took {time.time() - t1:4f}s', file=sys.stderr)

            # Negative mean log-likelihood of the gold renamings.
            loss = -result['batch_log_prob'].mean()

            cum_loss += loss.item() * batch.size
            cum_examples += batch.size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(params, 5.)

            optimizer.step()
            del loss

            if train_iter % log_every == 0:
                print(
                    f'[Learner] train_iter={train_iter} avg. loss={cum_loss / cum_examples}, '
                    f'{cum_examples} examples ({cum_examples / (time.time() - t_log)} examples/s)',
                    file=sys.stderr)
                cum_loss = cum_examples = 0.
                t_log = time.time()

        print(f'[Learner] Epoch {epoch} finished', file=sys.stderr)

        if epoch % evaluate_every_nepoch == 0:
            print(f'[Learner] Perform evaluation', file=sys.stderr)
            t1 = time.time()
            # ppl = Evaluator.evaluate_ppl(model, dev_set, config, predicate=lambda e: not e['function_body_in_train'])
            eval_results = Evaluator.decode_and_evaluate(
                model, dev_set, config)
            # print(f'[Learner] Evaluation result ppl={ppl} (took {time.time() - t1}s)', file=sys.stderr)
            print(
                f'[Learner] Evaluation result {eval_results} (took {time.time() - t1}s)',
                file=sys.stderr)
            # Model selection metric: accuracy on functions whose body was
            # NOT seen during training.
            dev_metric = eval_results['func_body_not_in_train_acc']['accuracy']
            # dev_metric = -ppl
            if len(history_accs) == 0 or dev_metric > max(history_accs):
                patience = 0
                model_save_path = os.path.join(work_dir, f'model.bin')
                model.save(model_save_path)
                print(
                    f'[Learner] Saved currently the best model to {model_save_path}',
                    file=sys.stderr)
            else:
                patience += 1
                if patience == max_patience:
                    print(
                        f'[Learner] Reached max patience {max_patience}, exiting...',
                        file=sys.stderr)
                    patience = 0
                    exit()

            history_accs.append(dev_metric)

        if epoch == max_epoch:
            print(f'[Learner] Reached max epoch', file=sys.stderr)
            exit()

        t1 = time.time()
def train(args):
    """Pre-train a BERT model (MLM objective) on decompiled-code tokens.

    Uses the encoder token stream by default, or the decoder target stream
    (source with true variable names) when ``args.decoder`` is set.
    Gradients are accumulated until ``args.batch_size`` sequences have been
    seen, then one optimizer/scheduler step is taken. Checkpoints go to
    ``args.save_dir``; ``args.restore`` resumes from ``args.res_name``.
    Exits the process when ``args.max_iters`` optimizer steps are reached.
    """
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }
    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')
    vocab = Vocab.load('data/vocab.bpe10000/vocab')
    # +1 reserves the last vocabulary id as the [MASK] token used below.
    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up
    batch_size = 4096  # size budget handed to the batch iterator
    effective_batch_size = args.batch_size  # sequences per optimizer step
    max_embeds = 1000 if args.decoder else 512
    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    # BUG FIX: initialize the counters BEFORE the restore branch. The
    # original set `batch_count = 0; epoch = 0` unconditionally after
    # restoring, silently discarding the checkpoint's step/epoch.
    batch_count = 0
    epoch = 0
    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']
    model.train()
    model.to(device)
    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        # Linear warm-up to `warm_up`, then linear decay to zero at max_iters.
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return (step / warm_up)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 eps=1e-6,
                                 weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_func,
                                                  last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])

    cum_loss = 0.0
    while True:
        # load training dataset, which is a collection of ASTs and maps of
        # gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))
        loss = 0
        num_seq = 0
        optimizer.zero_grad()
        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target'][
                    'src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']
            # Zero out attention over padding (token id 0).
            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0
            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0
            # NOTE(review): this compares shape[0] (presumably the batch
            # dimension) against the positional-embedding limit; shape[1]
            # (sequence length) looks like the intended check — confirm
            # against the batch layout before changing.
            if input_ids.shape[0] > max_embeds:
                print(
                    "Warning - length {} is greater than max length {}. Skipping."
                    .format(input_ids.shape[0], max_embeds))
                continue
            # Standard MLM corruption; the reserved last id is [MASK].
            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)
            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)
            # Per-token loss, normalized by the number of masked positions
            # in each sequence, summed into the accumulation buffer.
            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]
            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()
                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20,
                        scheduler.get_lr()[0]))
                    cum_loss = 0.0
                if batch_count % 10000 == 0:
                    prefix = 'decoder' if args.decoder else 'encoder'
                    fname1 = os.path.join(
                        args.save_dir,
                        'bert_{}_step_{}.pth'.format(prefix, batch_count))
                    # BUG FIX: the original passed a stray second argument
                    # to this single-placeholder format string.
                    fname2 = os.path.join(args.save_dir,
                                          'bert_{}.pth'.format(prefix))
                    # BUG FIX: `.module` only exists under DataParallel; the
                    # single-GPU/CPU path used to raise AttributeError here.
                    raw_model = (model.module
                                 if isinstance(model, nn.DataParallel)
                                 else model)
                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': raw_model.state_dict(),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }
                    torch.save(state, fname1)
                    torch.save(state, fname2)
                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                loss = 0
                num_seq = 0
                if batch_count == max_iters:
                    print(f'[Learner] Reached max iters', file=sys.stderr)
                    exit()
        # BUG FIX: the original printed undefined `max_len` here, raising
        # NameError as soon as the first epoch's iterator was exhausted.
        break
def decode_and_evaluate(model: RenamingModel,
                        dataset: Dataset,
                        config: Dict,
                        return_results=False,
                        eval_batch_size=None):
    """Decode renamings, compute soft accuracy metrics, and dump predictions.

    In addition to the metrics, writes a JSON file
    ``pred_dire_<ddHHMM>.json`` mapping
    ``binary -> function -> old_name -> ("", predicted_name)``.

    Args:
        model: trained renaming model; restored to its original train/eval
            mode before returning.
        dataset: evaluation dataset.
        config: experiment config; batching settings come from
            ``config['train']``.
        return_results: when True, also return per-example predictions.
        eval_batch_size: overrides the batch size from ``config`` if given.

    Returns:
        A metrics dict, or ``(metrics dict, per-example results dict)`` when
        ``return_results`` is True.
    """
    if eval_batch_size is None:
        eval_batch_size = config['train'][
            'eval_batch_size'] if 'eval_batch_size' in config[
                'train'] else config['train']['batch_size']
    data_iter = dataset.batch_iterator(
        batch_size=eval_batch_size,
        train=False,
        progress=True,
        return_examples=True,
        config=model.config,
        num_readers=config['train']['num_readers'],
        num_batchers=config['train']['num_batchers'])
    was_training = model.training
    model.eval()
    # Accumulators: per-example metric lists, flat per-variable metrics, and
    # slices by rename-needed / name-or-body-seen-in-training.
    example_acc_list = []
    variable_acc_list = []
    need_rename_cases = []
    func_name_in_train_acc_list = []
    func_name_not_in_train_acc_list = []
    func_body_in_train_acc_list = []
    func_body_not_in_train_acc_list = []
    all_examples = dict()
    # Nested prediction map dumped to JSON at the end:
    # binary -> function -> old variable name -> ("", predicted name).
    results = {}
    with torch.no_grad():
        for batch in data_iter:
            examples = batch.examples
            rename_results = model.predict(examples)
            for example, rename_result in zip(examples, rename_results):
                example_pred_accs = []
                # Binary identifier = file name up to the first underscore.
                binary = example.binary_file[
                    'file_name'][:example.binary_file['file_name'].
                                 index("_")]
                func_name = example.ast.compilation_unit
                # Only the top-ranked hypothesis is scored.
                top_rename_result = rename_result[0]
                for old_name, gold_new_name in example.variable_name_map.items(
                ):
                    pred = top_rename_result[old_name]
                    pred_new_name = pred['new_name']
                    results.setdefault(binary, {}).setdefault(
                        func_name, {})[old_name] = "", pred_new_name
                    var_metric = Evaluator.get_soft_metrics(
                        pred_new_name, gold_new_name)
                    # is_correct = pred_new_name == gold_new_name
                    example_pred_accs.append(var_metric)
                    # Variables whose gold name differs from the decompiler
                    # name are the ones that actually needed renaming.
                    if gold_new_name != old_name:
                        # and gold_new_name in model.vocab.target:
                        need_rename_cases.append(var_metric)
                        if example.test_meta['function_name_in_train']:
                            func_name_in_train_acc_list.append(var_metric)
                        else:
                            func_name_not_in_train_acc_list.append(
                                var_metric)
                        if example.test_meta['function_body_in_train']:
                            func_body_in_train_acc_list.append(var_metric)
                        else:
                            func_body_not_in_train_acc_list.append(
                                var_metric)
                variable_acc_list.extend(example_pred_accs)
                example_acc_list.append(example_pred_accs)
                if return_results:
                    all_examples[example.binary_file['file_name'] + '_' +
                                 str(example.binary_file['line_num'])] = (
                                     rename_result,
                                     Evaluator.average(example_pred_accs))
                    # all_examples.append((example, rename_result, example_pred_accs))
    json.dump(results,
              open(f"pred_dire_{time.strftime('%d%H%M')}.json", "w"))
    valid_example_num = len(example_acc_list)
    num_variables = len(variable_acc_list)
    corpus_acc = Evaluator.average(variable_acc_list)
    if was_training:
        model.train()
    eval_results = dict(
        corpus_acc=corpus_acc,
        corpus_need_rename_acc=Evaluator.average(need_rename_cases),
        func_name_in_train_acc=Evaluator.average(
            func_name_in_train_acc_list),
        func_name_not_in_train_acc=Evaluator.average(
            func_name_not_in_train_acc_list),
        func_body_in_train_acc=Evaluator.average(
            func_body_in_train_acc_list),
        func_body_not_in_train_acc=Evaluator.average(
            func_body_not_in_train_acc_list),
        num_variables=num_variables,
        num_valid_examples=valid_example_num)
    if return_results:
        return eval_results, all_examples
    return eval_results