Example #1
    def evaluate_ppl(model: RenamingModel,
                     dataset: Dataset,
                     config: Dict,
                     predicate: Any = None):
        if predicate is None:

            def predicate(_):
                return True

        eval_batch_size = config['train']['batch_size']
        num_readers = config['train']['num_readers']
        num_batchers = config['train']['num_batchers']
        data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                           train=False,
                                           progress=True,
                                           return_examples=False,
                                           return_prediction_target=True,
                                           config=model.config,
                                           num_readers=num_readers,
                                           num_batchers=num_batchers)

        was_training = model.training
        model.eval()
        cum_log_probs = 0.
        cum_num_examples = 0
        with torch.no_grad():
            for batch in data_iter:
                td = batch.tensor_dict
                nn_util.to(td, model.device)
                result = model(td, td['prediction_target'])
                log_probs = result['batch_log_prob'].cpu().tolist()
                for e_id, test_meta in enumerate(td['test_meta']):
                    if predicate(test_meta):
                        log_prob = log_probs[e_id]
                        cum_log_probs += log_prob
                        cum_num_examples += 1

        ppl = np.exp(-cum_log_probs / cum_num_examples)

        if was_training:
            model.train()

        return ppl
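
A minimal driver for evaluate_ppl is sketched below. The checkpoint path, config file name, and the RenamingModel.load call are assumptions made for illustration; the predicate mirrors the commented-out call in Example #5.

    # Hypothetical driver; assumes RenamingModel, Dataset and Evaluator are
    # importable from the surrounding project.
    import json
    import _jsonnet

    config = json.loads(_jsonnet.evaluate_file('config.jsonnet'))   # assumed file name
    model = RenamingModel.load('work_dir/model.bin')                # assumed API / checkpoint path
    dev_set = Dataset(config['data']['dev_file'])

    # perplexity restricted to functions whose body never appeared in training
    ppl = Evaluator.evaluate_ppl(
        model, dev_set, config,
        predicate=lambda meta: not meta['function_body_in_train'])
    print(f'dev ppl = {ppl:.3f}')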
Example #2
    def decode(model: RenamingModel,
               dataset: Dataset,
               config: Dict,
               eval_batch_size=None):
        if eval_batch_size is None:
            if 'eval_batch_size' in config['train']:
                eval_batch_size = config['train']['eval_batch_size']
            else:
                eval_batch_size = config['train']['batch_size']
        num_readers = config['train']['num_readers']
        num_batchers = config['train']['num_batchers']
        data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                           train=False,
                                           progress=True,
                                           return_examples=True,
                                           config=model.config,
                                           num_readers=num_readers,
                                           num_batchers=num_batchers)
        model.eval()
        all_examples = dict()

        with torch.no_grad():
            for batch in data_iter:
                examples = batch.examples
                rename_results = model.predict(examples)
                for example, rename_result in zip(examples, rename_results):
                    example_pred_accs = []
                    top_rename_result = rename_result[0]
                    for old_name, gold_new_name \
                            in example.variable_name_map.items():
                        pred = top_rename_result[old_name]
                        pred_new_name = pred['new_name']
                        var_metric = Evaluator.get_soft_metrics(
                            pred_new_name, gold_new_name)
                        example_pred_accs.append(var_metric)
                    file_name = example.binary_file['file_name']
                    line_num = example.binary_file['line_num']
                    fun_name = example.ast.compilation_unit
                    all_examples[f'{file_name}_{line_num}_{fun_name}'] = \
                        (rename_result, Evaluator.average(example_pred_accs))

        return all_examples
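
For reference, a short hypothetical consumer of decode is sketched below; it relies only on the key format and the (rename_result, averaged metrics) values built in the loop above. The model, dataset and config construction is assumed.

    # Hypothetical usage; model, test_set and config are assumed to be built
    # as in the training examples further down.
    results = Evaluator.decode(model, test_set, config)

    for key, (rename_result, avg_metrics) in results.items():
        # key has the form '<file_name>_<line_num>_<function_name>'
        top_prediction = rename_result[0]   # highest-ranked renaming map
        print(key, avg_metrics, list(top_prediction.items())[:3])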
Example #3
    def decode_and_evaluate(model: RenamingModel,
                            dataset: Dataset,
                            config: Dict,
                            return_results=False,
                            eval_batch_size=None,
                            approx=False):
        if eval_batch_size is None:
            eval_batch_size = config['train'].get(
                'eval_batch_size', config['train']['batch_size'])
        model_config = (model.module.config if isinstance(
            model, torch.nn.DataParallel) else model.config)
        data_iter = dataset.batch_iterator(
            batch_size=eval_batch_size,
            train=False,
            progress=True,
            return_examples=True,
            max_seq_len=512,
            config=model_config,
            num_readers=config['train']['num_readers'],
            num_batchers=config['train']['num_batchers'],
            truncate=approx)

        was_training = model.training
        model.eval()
        example_acc_list = []
        variable_acc_list = []
        need_rename_cases = []

        func_name_in_train_acc_list = []
        func_name_not_in_train_acc_list = []
        func_body_in_train_acc_list = []
        func_body_not_in_train_acc_list = []

        all_examples = dict()

        with torch.no_grad():
            for i, batch in enumerate(data_iter):
                examples = batch.examples
                if isinstance(model, torch.nn.DataParallel):
                    rename_results = model.module.predict(examples)
                else:
                    rename_results = model.predict(examples)
                for example, rename_result in zip(examples, rename_results):
                    example_pred_accs = []

                    top_rename_result = rename_result[0]
                    for old_name, gold_new_name \
                            in example.variable_name_map.items():
                        pred = top_rename_result[old_name]
                        pred_new_name = pred['new_name']
                        var_metric = Evaluator.get_soft_metrics(
                            pred_new_name, gold_new_name)
                        # is_correct = pred_new_name == gold_new_name
                        example_pred_accs.append(var_metric)

                        if gold_new_name != old_name:  # and gold_new_name in model.vocab.target:
                            need_rename_cases.append(var_metric)

                            if example.test_meta['function_name_in_train']:
                                func_name_in_train_acc_list.append(var_metric)
                            else:
                                func_name_not_in_train_acc_list.append(
                                    var_metric)

                            if example.test_meta['function_body_in_train']:
                                func_body_in_train_acc_list.append(var_metric)
                            else:
                                func_body_not_in_train_acc_list.append(
                                    var_metric)

                    variable_acc_list.extend(example_pred_accs)
                    example_acc_list.append(example_pred_accs)

                    if return_results:
                        all_examples[example.binary_file['file_name'] + '_' +
                                     str(example.binary_file['line_num'])] = (
                                         rename_result,
                                         Evaluator.average(example_pred_accs))
                        # all_examples.append((example, rename_result, example_pred_accs))

        valid_example_num = len(example_acc_list)
        num_variables = len(variable_acc_list)
        corpus_acc = Evaluator.average(variable_acc_list)

        if was_training:
            model.train()

        eval_results = dict(
            corpus_acc=corpus_acc,
            corpus_need_rename_acc=Evaluator.average(need_rename_cases),
            func_name_in_train_acc=Evaluator.average(
                func_name_in_train_acc_list),
            func_name_not_in_train_acc=Evaluator.average(
                func_name_not_in_train_acc_list),
            func_body_in_train_acc=Evaluator.average(
                func_body_in_train_acc_list),
            func_body_not_in_train_acc=Evaluator.average(
                func_body_not_in_train_acc_list),
            num_variables=num_variables,
            num_valid_examples=valid_example_num)

        if return_results:
            return eval_results, all_examples
        return eval_results
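
A sketch of how the returned metrics might be consumed is given below; the nested 'accuracy' key is inferred from the dev-metric selection in the training loop of Example #5, and the model/dataset construction is assumed.

    # Hypothetical usage of decode_and_evaluate(); the 'accuracy' field of each
    # averaged metric dict is inferred from Example #5, not defined here.
    eval_results, predictions = Evaluator.decode_and_evaluate(
        model, dev_set, config, return_results=True)

    print('corpus accuracy:', eval_results['corpus_acc'])
    print('accuracy on unseen function bodies:',
          eval_results['func_body_not_in_train_acc']['accuracy'])
    print('evaluated', eval_results['num_valid_examples'], 'examples /',
          eval_results['num_variables'], 'variables')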
Example #4
    def decode_and_evaluate(model: RenamingModel,
                            dataset: Dataset,
                            config: Dict,
                            return_results=False,
                            eval_batch_size=None):
        if eval_batch_size is None:
            if 'eval_batch_size' in config['train']:
                eval_batch_size = config['train']['eval_batch_size']
            else:
                eval_batch_size = config['train']['batch_size']
        num_readers = config['train']['num_readers']
        num_batchers = config['train']['num_batchers']
        data_iter = dataset.batch_iterator(batch_size=eval_batch_size,
                                           train=False,
                                           progress=True,
                                           return_examples=True,
                                           config=model.config,
                                           num_readers=num_readers,
                                           num_batchers=num_batchers)

        was_training = model.training
        model.eval()
        example_acc_list = []
        variable_acc_list = []
        need_rename_cases = []

        func_name_in_train_acc = []
        func_name_not_in_train_acc = []
        func_body_in_train_acc = []
        func_body_not_in_train_acc = []

        all_examples = dict()

        with torch.no_grad():
            for batch in data_iter:
                examples = batch.examples
                rename_results = model.predict(examples)
                for example, rename_result in zip(examples, rename_results):
                    example_pred_accs = []

                    top_rename_result = rename_result[0]
                    for old_name, gold_new_name \
                            in example.variable_name_map.items():
                        pred = top_rename_result[old_name]
                        pred_new_name = pred['new_name']
                        var_metric = Evaluator.get_soft_metrics(
                            pred_new_name, gold_new_name)
                        # is_correct = pred_new_name == gold_new_name
                        example_pred_accs.append(var_metric)

                        if gold_new_name != old_name:
                            need_rename_cases.append(var_metric)

                            if example.test_meta['function_name_in_train']:
                                func_name_in_train_acc.append(var_metric)
                            else:
                                func_name_not_in_train_acc.append(var_metric)

                            if example.test_meta['function_body_in_train']:
                                func_body_in_train_acc.append(var_metric)
                            else:
                                func_body_not_in_train_acc.append(var_metric)

                    variable_acc_list.extend(example_pred_accs)
                    example_acc_list.append(example_pred_accs)

                    if return_results:
                        example_key = \
                            f"{example.binary_file['file_name']}_" \
                            f"{example.binary_file['line_num']}"
                        all_examples[example_key] = \
                            (rename_result,
                             Evaluator.average(example_pred_accs))

        valid_example_num = len(example_acc_list)
        num_variables = len(variable_acc_list)
        corpus_acc = Evaluator.average(variable_acc_list)

        if was_training:
            model.train()

        need_rename_acc = Evaluator.average(need_rename_cases)
        name_in_train_acc = Evaluator.average(func_name_in_train_acc)
        name_not_in_train_acc = Evaluator.average(func_name_not_in_train_acc)
        body_in_train_acc = Evaluator.average(func_body_in_train_acc)
        body_not_in_train_acc = Evaluator.average(func_body_not_in_train_acc)
        eval_results = dict(corpus_acc=corpus_acc,
                            corpus_need_rename_acc=need_rename_acc,
                            func_name_in_train_acc=name_in_train_acc,
                            func_name_not_in_train_acc=name_not_in_train_acc,
                            func_body_in_train_acc=body_in_train_acc,
                            func_body_not_in_train_acc=body_not_in_train_acc,
                            num_variables=num_variables,
                            num_valid_examples=valid_example_num)

        if return_results:
            return eval_results, all_examples
        return eval_results
Example #5
def train(args):
    work_dir = args['--work-dir']
    config = json.loads(_jsonnet.evaluate_file(args['CONFIG_FILE']))
    config['work_dir'] = work_dir

    if not os.path.exists(work_dir):
        print(f'creating work dir [{work_dir}]', file=sys.stderr)
        os.makedirs(work_dir)

    if args['--extra-config']:
        extra_config = args['--extra-config']
        extra_config = json.loads(extra_config)
        config = util.update(config, extra_config)

    with open(os.path.join(work_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    model = RenamingModel.build(config)
    config = model.config
    model.train()

    if args['--cuda']:
        model = model.cuda()

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=0.001)
    nn_util.glorot_init(params)

    # set the padding index for embedding layers to zeros
    # model.encoder.var_node_name_embedding.weight[0].fill_(0.)

    train_set = Dataset(config['data']['train_file'])
    dev_set = Dataset(config['data']['dev_file'])
    batch_size = config['train']['batch_size']

    print(f'Training set size {len(train_set)}, dev set size {len(dev_set)}',
          file=sys.stderr)

    # training loop
    train_iter = epoch = cum_examples = 0
    log_every = config['train']['log_every']
    evaluate_every_nepoch = config['train']['evaluate_every_nepoch']
    max_epoch = config['train']['max_epoch']
    max_patience = config['train']['patience']
    cum_loss = 0.
    patience = 0
    t_log = time.time()

    history_accs = []
    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            num_readers=config['train']['num_readers'],
            num_batchers=config['train']['num_batchers'])
        epoch += 1

        for batch in train_set_iter:
            train_iter += 1
            optimizer.zero_grad()

            # t1 = time.time()
            nn_util.to(batch.tensor_dict, model.device)
            # print(f'[Learner] {time.time() - t1}s took for moving tensors to device', file=sys.stderr)

            # t1 = time.time()
            result = model(batch.tensor_dict,
                           batch.tensor_dict['prediction_target'])
            # print(f'[Learner] batch {train_iter}, {batch.size} examples took {time.time() - t1:4f}s', file=sys.stderr)

            loss = -result['batch_log_prob'].mean()

            cum_loss += loss.item() * batch.size
            cum_examples += batch.size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(params, 5.)

            optimizer.step()
            del loss

            if train_iter % log_every == 0:
                print(
                    f'[Learner] train_iter={train_iter} avg. loss={cum_loss / cum_examples}, '
                    f'{cum_examples} examples ({cum_examples / (time.time() - t_log)} examples/s)',
                    file=sys.stderr)

                cum_loss, cum_examples = 0., 0
                t_log = time.time()

        print(f'[Learner] Epoch {epoch} finished', file=sys.stderr)

        if epoch % evaluate_every_nepoch == 0:
            print('[Learner] Performing evaluation', file=sys.stderr)
            t1 = time.time()
            # ppl = Evaluator.evaluate_ppl(model, dev_set, config, predicate=lambda e: not e['function_body_in_train'])
            eval_results = Evaluator.decode_and_evaluate(
                model, dev_set, config)
            # print(f'[Learner] Evaluation result ppl={ppl} (took {time.time() - t1}s)', file=sys.stderr)
            print(
                f'[Learner] Evaluation result {eval_results} (took {time.time() - t1}s)',
                file=sys.stderr)
            dev_metric = eval_results['func_body_not_in_train_acc']['accuracy']
            # dev_metric = -ppl
            if len(history_accs) == 0 or dev_metric > max(history_accs):
                patience = 0
                model_save_path = os.path.join(work_dir, 'model.bin')
                model.save(model_save_path)
                print(
                    f'[Learner] Saved the current best model to {model_save_path}',
                    file=sys.stderr)
            else:
                patience += 1
                if patience == max_patience:
                    print(
                        f'[Learner] Reached max patience {max_patience}, exiting...',
                        file=sys.stderr)
                    patience = 0
                    exit()

            history_accs.append(dev_metric)

        if epoch == max_epoch:
            print('[Learner] Reached max epoch', file=sys.stderr)
            exit()

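Since train reads docopt-style keys from args, a hand-built argument dict is enough to drive it; the paths and the extra-config override below are placeholders, not values from the snippet.

    # Hypothetical invocation with a docopt-style argument dict.
    args = {
        '--work-dir': 'exp_runs/run0',
        'CONFIG_FILE': 'config.jsonnet',
        '--extra-config': '{"train": {"max_epoch": 30}}',
        '--cuda': True,
    }
    train(args)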
Example #6
def train(args):

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    if args.gpu != '-1' and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_rng_state(torch.cuda.get_rng_state())
        torch.backends.cudnn.deterministic = True
    else:
        device = torch.device('cpu')

    config = {
        'train': {
            'unchanged_variable_weight': 0.1,
            'buffer_size': 5000
        },
        'encoder': {
            'type': 'SequentialEncoder'
        },
        'data': {
            'vocab_file': 'data/vocab.bpe10000/vocab'
        }
    }

    train_set = Dataset('data/preprocessed_data/train-shard-*.tar')
    dev_set = Dataset('data/preprocessed_data/dev.tar')

    vocab = Vocab.load('data/vocab.bpe10000/vocab')

    if args.decoder:
        vocab_size = len(vocab.all_subtokens) + 1
    else:
        vocab_size = len(vocab.source_tokens) + 1

    max_iters = args.max_iters
    lr = args.lr
    warm_up = args.warm_up

    batch_size = 4096
    effective_batch_size = args.batch_size

    max_embeds = 1000 if args.decoder else 512

    bert_config = BertConfig(vocab_size=vocab_size,
                             max_position_embeddings=max_embeds,
                             num_hidden_layers=6,
                             hidden_size=256,
                             num_attention_heads=4)
    model = BertForPreTraining(bert_config)

    if args.restore:
        state_dict = torch.load(os.path.join(args.save_dir, args.res_name))
        model.load_state_dict(state_dict['model'])
        batch_count = state_dict['step']
        epoch = state_dict['epoch']

    model.train()
    model.to(device)

    if len(args.gpu) > 1 and device == torch.device('cuda'):
        model = nn.DataParallel(model)

    def lr_func(step):
        if step > warm_up:
            return (max_iters - step) / (max_iters - warm_up)
        else:
            return (step / warm_up)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 eps=1e-6,
                                 weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_func,
                                                  last_epoch=-1)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

    if args.restore:
        optimizer.load_state_dict(state_dict['optim'])
        scheduler.load_state_dict(state_dict['scheduler'])

    # keep the restored step/epoch counters instead of resetting them
    if not args.restore:
        batch_count = 0
        epoch = 0
    cum_loss = 0.0

    while True:
        # load training dataset, which is a collection of ASTs and maps of gold-standard renamings
        train_set_iter = train_set.batch_iterator(
            batch_size=batch_size,
            return_examples=False,
            config=config,
            progress=True,
            train=True,
            max_seq_len=512,
            num_readers=args.num_readers,
            num_batchers=args.num_batchers)
        epoch += 1
        print("Epoch {}".format(epoch))

        loss = 0
        num_seq = 0

        optimizer.zero_grad()

        for batch in train_set_iter:
            if args.decoder:
                input_ids = batch.tensor_dict['prediction_target'][
                    'src_with_true_var_names']
            else:
                input_ids = batch.tensor_dict['src_code_tokens']

            attention_mask = torch.ones_like(input_ids)
            attention_mask[input_ids == 0] = 0.0

            assert torch.max(input_ids) < vocab_size
            assert torch.min(input_ids) >= 0

            # input_ids is [batch, seq_len]; skip batches whose sequence length
            # exceeds the model's position-embedding limit
            if input_ids.shape[1] > max_embeds:
                print(
                    "Warning - length {} is greater than max length {}. Skipping."
                    .format(input_ids.shape[1], max_embeds))
                continue

            input_ids, labels = mask_tokens(inputs=input_ids,
                                            mask_token_id=vocab_size - 1,
                                            vocab_size=vocab_size,
                                            mlm_probability=0.15)

            input_ids[attention_mask == 0] = 0
            labels[attention_mask == 0] = -100

            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                labels = labels.cuda()
                attention_mask = attention_mask.cuda()

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels)

            unreduced_loss = loss_fn(
                outputs[0].view(-1, bert_config.vocab_size),
                labels.view(-1)).reshape(labels.shape) / (
                    torch.sum(labels != -100, axis=1).unsqueeze(1) + 1e-7)
            loss += unreduced_loss.sum()
            num_seq += input_ids.shape[0]

            if num_seq > effective_batch_size:
                batch_count += 1
                loss /= num_seq
                cum_loss += loss.item()

                if batch_count % 20 == 0:
                    print("{} batches, Loss : {:.4}, LR : {:.6}".format(
                        batch_count, cum_loss / 20,
                        scheduler.get_lr()[0]))
                    cum_loss = 0.0

                if batch_count % 10000 == 0:
                    fname1 = os.path.join(
                        args.save_dir, 'bert_{}_step_{}.pth'.format(
                            ('decoder' if args.decoder else 'encoder'),
                            batch_count))
                    fname2 = os.path.join(
                        args.save_dir, 'bert_{}.pth'.format(
                            'decoder' if args.decoder else 'encoder'))

                    state = {
                        'epoch': epoch,
                        'step': batch_count,
                        'model': (model.module.state_dict() if isinstance(
                            model, nn.DataParallel) else model.state_dict()),
                        'optim': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict()
                    }

                    torch.save(state, fname1)
                    torch.save(state, fname2)

                    print("Saved file to path {}".format(fname1))
                    print("Saved file to path {}".format(fname2))

                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                loss = 0
                num_seq = 0

            if batch_count == max_iters:
                print('[Learner] Reached max iters', file=sys.stderr)
                exit()

        print("Max_len = {}".format(max_len))
        break
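
The loop above relies on a mask_tokens helper that is not shown. A minimal sketch following the standard BERT 80/10/10 masking recipe is given below; the project's actual helper may differ in its details.

    # Minimal sketch of the mask_tokens() helper assumed above (standard BERT
    # masking: 15% of positions selected, of which 80% -> mask token, 10% ->
    # random token, 10% left unchanged). The real implementation may differ.
    import torch

    def mask_tokens(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
        labels = inputs.clone()

        # choose which positions participate in the MLM loss
        probability_matrix = torch.full(labels.shape, mlm_probability)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100      # ignored by the loss

        inputs = inputs.clone()

        # 80% of the selected positions become the mask token
        indices_replaced = (torch.bernoulli(torch.full(labels.shape, 0.8)).bool()
                            & masked_indices)
        inputs[indices_replaced] = mask_token_id

        # half of the remaining selected positions get a random token
        indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                          & masked_indices & ~indices_replaced)
        random_tokens = torch.randint(vocab_size, labels.shape, dtype=torch.long)
        inputs[indices_random] = random_tokens[indices_random]

        return inputs, labels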
Example #7
    def decode_and_evaluate(model: RenamingModel,
                            dataset: Dataset,
                            config: Dict,
                            return_results=False,
                            eval_batch_size=None):
        if eval_batch_size is None:
            eval_batch_size = config['train'].get(
                'eval_batch_size', config['train']['batch_size'])
        data_iter = dataset.batch_iterator(
            batch_size=eval_batch_size,
            train=False,
            progress=True,
            return_examples=True,
            config=model.config,
            num_readers=config['train']['num_readers'],
            num_batchers=config['train']['num_batchers'])

        was_training = model.training
        model.eval()
        example_acc_list = []
        variable_acc_list = []
        need_rename_cases = []

        func_name_in_train_acc_list = []
        func_name_not_in_train_acc_list = []
        func_body_in_train_acc_list = []
        func_body_not_in_train_acc_list = []

        all_examples = dict()

        results = {}
        with torch.no_grad():
            for batch in data_iter:
                examples = batch.examples
                rename_results = model.predict(examples)
                for example, rename_result in zip(examples, rename_results):
                    example_pred_accs = []
                    file_name = example.binary_file['file_name']
                    binary = file_name[:file_name.index("_")]
                    func_name = example.ast.compilation_unit

                    top_rename_result = rename_result[0]
                    for old_name, gold_new_name \
                            in example.variable_name_map.items():
                        pred = top_rename_result[old_name]
                        pred_new_name = pred['new_name']
                        # record ("", predicted new name) for this variable,
                        # keyed by binary -> function -> original variable name
                        results.setdefault(binary, {}).setdefault(
                            func_name, {})[old_name] = ("", pred_new_name)
                        var_metric = Evaluator.get_soft_metrics(
                            pred_new_name, gold_new_name)
                        # is_correct = pred_new_name == gold_new_name
                        example_pred_accs.append(var_metric)

                        if gold_new_name != old_name:  # and gold_new_name in model.vocab.target:
                            need_rename_cases.append(var_metric)

                            if example.test_meta['function_name_in_train']:
                                func_name_in_train_acc_list.append(var_metric)
                            else:
                                func_name_not_in_train_acc_list.append(
                                    var_metric)

                            if example.test_meta['function_body_in_train']:
                                func_body_in_train_acc_list.append(var_metric)
                            else:
                                func_body_not_in_train_acc_list.append(
                                    var_metric)

                    variable_acc_list.extend(example_pred_accs)
                    example_acc_list.append(example_pred_accs)

                    if return_results:
                        all_examples[example.binary_file['file_name'] + '_' +
                                     str(example.binary_file['line_num'])] = (
                                         rename_result,
                                         Evaluator.average(example_pred_accs))
                        # all_examples.append((example, rename_result, example_pred_accs))

        with open(f"pred_dire_{time.strftime('%d%H%M')}.json", "w") as f:
            json.dump(results, f)

        valid_example_num = len(example_acc_list)
        num_variables = len(variable_acc_list)
        corpus_acc = Evaluator.average(variable_acc_list)

        if was_training:
            model.train()

        eval_results = dict(
            corpus_acc=corpus_acc,
            corpus_need_rename_acc=Evaluator.average(need_rename_cases),
            func_name_in_train_acc=Evaluator.average(
                func_name_in_train_acc_list),
            func_name_not_in_train_acc=Evaluator.average(
                func_name_not_in_train_acc_list),
            func_body_in_train_acc=Evaluator.average(
                func_body_in_train_acc_list),
            func_body_not_in_train_acc=Evaluator.average(
                func_body_not_in_train_acc_list),
            num_variables=num_variables,
            num_valid_examples=valid_example_num)

        if return_results:
            return eval_results, all_examples
        return eval_results
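
The JSON file written above nests predictions as binary -> function -> variable. A small reader sketch follows; the concrete timestamp in the file name is a placeholder.

    # Reading back the dump written by the snippet above; JSON stores each
    # ("", pred_new_name) tuple as a two-element list.
    import json

    with open('pred_dire_011230.json') as f:
        predictions = json.load(f)

    for binary, functions in predictions.items():
        for func_name, renamings in functions.items():
            for old_name, (_, new_name) in renamings.items():
                print(binary, func_name, f'{old_name} -> {new_name}')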