예제 #1
0
    def train(self, num_epochs, start_epoch=0):
        """Run the full training loop.

        Trains one epoch at a time, optionally validating after each, prints
        progress and an ETA estimate unless quiet, and stores a snapshot of
        the current model every epoch. Any exception is logged before being
        re-raised.

        :param num_epochs: number of epochs to train
        :param start_epoch: the first epoch, defaults to 0;
            can be set higher for finetuning, etc.
        :return: the trainer state dict after the final epoch
        """
        try:
            eta = ETATimer(num_epochs - start_epoch)
            epoch = start_epoch
            while epoch < num_epochs:
                epoch += 1
                self.state['epoch'] = epoch
                if not self.state['quiet']:
                    print('Epoch', epoch)
                self.print_info()
                self.train_epoch(epoch)
                # Validation is optional: only when a validation iterator is set.
                if self.state['val_iter'] is not None:
                    self.validate_epoch()
                self.snapshot(epoch)
                if not self.state['quiet']:
                    print('ETA:', eta())
            return self.state
        except Exception as err:
            # Log the full traceback, then let the caller see the failure.
            logging.exception(err)
            raise
예제 #2
0
def train(args, data_loader, model, global_stats, logger):
    """Run through one epoch of model training with the provided data loader.

    :param args: parsed command-line arguments (reads ``use_tqdm``,
        ``optimizer``, ``warmup_epochs``, ``checkpoint``)
    :param data_loader: iterable yielding batched examples (dicts with
        at least a 'batch_size' key)
    :param model: model wrapper exposing ``update``, ``updates``,
        ``optimizer`` and ``checkpoint``
    :param global_stats: dict with 'epoch' and, during warmup, 'warmup_factor'
    :param logger: project logger exposing ``print``, ``add`` and ``path``
    """
    # Initialize meters + timers
    ml_loss = AverageMeter()
    loc_loss = AverageMeter()
    fix_loss = AverageMeter()
    epoch_time = Timer()

    current_epoch = global_stats['epoch']
    if args.use_tqdm:
        pbar = tqdm(data_loader)
        # Fixed: the placeholder now uses the same bracketed format as the
        # real per-batch updates below (the original had an unbalanced ']').
        pbar.set_description(
            'Epoch = %d [tot_loss = x.xx loc_loss = x.xx fix_loss = x.xx]'
            % current_epoch)
    else:
        pbar = data_loader

    # Run one epoch
    for idx, ex in enumerate(pbar):
        bsz = ex['batch_size']
        # Linear learning-rate warmup during the first warmup_epochs epochs.
        if args.optimizer in ['sgd', 'adam'
                              ] and current_epoch <= args.warmup_epochs:
            cur_lrate = global_stats['warmup_factor'] * (model.updates + 1)
            for param_group in model.optimizer.param_groups:
                param_group['lr'] = cur_lrate

        net_loss = model.update(ex)
        # Batch-size-weighted running averages of the three loss components.
        ml_loss.update(net_loss["loss"].detach().item(), bsz)
        loc_loss.update(net_loss["loc_loss"].detach().item(), bsz)
        fix_loss.update(net_loss["fix_loss"].detach().item(), bsz)
        log_info = 'Epoch = %d [tot_loss = %.2f loc_loss = %.2f fix_loss = %.2f]' % \
                   (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg)

        if args.use_tqdm:
            pbar.set_description(log_info)

        # Periodic console logging (note: also fires at idx == 0).
        if idx % 1000 == 0:
            logger.print(
                'train: Epoch %d | tot_loss = %.2f | loc_loss = %.2f | fix_loss = %.2f'
                % (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg))

    # Record epoch-level aggregates with the logger.
    kvs = [("ml_lo_tr", ml_loss.avg), ("loc_lo_tr", loc_loss.avg),
           ("fix_lo_tr", fix_loss.avg), ("epoch_time", epoch_time.time())]

    for k, v in kvs:
        logger.add(current_epoch, **{k: v})
    logger.print(
        'train: Epoch %d | tot_loss = %.2f | loc_loss = %.2f | fix_loss = %.2f | '
        'Time for epoch = %.2f (s)' %
        (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg,
         epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(logger.path + '/model.cpt.checkpoint',
                         current_epoch + 1)

    gc.collect()
예제 #3
0
    def print_info(self):
        """Print the current learning rates to the console, unless quiet.

        Learning rates are obtained from ``self._lrs()``. (This method takes
        no ``epoch`` parameter; an earlier docstring documented one in error.)
        """
        if not self.state['quiet']:
            s = 'learning rates ' + (', '.join(map(str, self._lrs())))
            print(s)
예제 #4
0
def init_from_scratch(args, logger):
    """New model, new data, new dictionary."""
    # Build a dictionary from the data questions + words (train/dev splits)
    logger.print('-' * 100)
    logger.print('Build word dictionary')
    if args.src_dict_filename is None:
        # No precomputed dictionary: build one from the training source file.
        src_dict = util.build_word_and_char_dict_from_file(
            filenames=[args.train_src_file],
            dict_size=args.src_vocab_size,
            special_tokens="pad_unk")
    else:
        logger.print("Loading dict. from " + args.src_dict_filename)
        src_dict = util.load_word_and_char_dict(
            args,
            args.src_dict_filename,
            dict_size=args.src_vocab_size,
            special_tokens="pad_unk")

    # The type dictionary is always rebuilt from the tag file, with no size cap.
    type_dict = util.build_word_and_char_dict_from_file(
        filenames=[args.train_src_tag_file],
        dict_size=None,
        special_tokens="pad_unk")

    logger.print('Num words in source = %d' % len(src_dict))

    # Initialize model
    model = VarmisuseModel(config.get_model_args(args), src_dict, type_dict)

    return model
예제 #5
0
def findBestMatchingRow(filename, data, tagNameToColumnIndex):
    """Return the spreadsheet row whose work title best fuzzy-matches *filename*.

    :param filename: file name used as the match query
    :param data: pandas DataFrame of metadata rows
    :param tagNameToColumnIndex: mapping from tag name to column index;
        'workTitle' selects the column compared against *filename*
    :return: the best ``(index, row)`` pair as produced by ``data.iterrows()``
    """
    title_col = tagNameToColumnIndex['workTitle']

    def match_ratio(row):
        # Similarity in [0, 1]; see https://stackoverflow.com/a/17903726
        # This might be a better approach: https://stackoverflow.com/a/36132391
        return difflib.SequenceMatcher(
            None, str(row[1][title_col]), filename).ratio()

    # max() is O(n) instead of sorting all rows O(n log n) just to take the
    # first; on ties it returns the first maximal row, matching the original
    # stable reverse-sort behavior.
    selection = max(data.iterrows(), key=match_ratio)
    print('Selected XLSX row {}: "{}"'.format(
        selection[0], selection[1][title_col]))
    return selection
예제 #6
0
def train(args, data_loader, model, global_stats, logger):
    """Run through one epoch of model training with the provided data loader."""
    # Meters track running averages; the timer measures epoch wall time.
    ml_loss = AverageMeter()
    perplexity = AverageMeter()
    epoch_time = Timer()

    current_epoch = global_stats['epoch']
    pbar = tqdm(data_loader)
    pbar.set_description("%s" %
                         'Epoch = %d [perplexity = x.xx, ml_loss = x.xx]' %
                         current_epoch)

    # Run one epoch
    for idx, ex in enumerate(pbar):
        bsz = ex['batch_size']
        # Linear learning-rate warmup during the first warmup_epochs epochs.
        warming_up = (args.optimizer in ['sgd', 'adam']
                      and current_epoch <= args.warmup_epochs)
        if warming_up:
            cur_lrate = global_stats['warmup_factor'] * (model.updates + 1)
            for param_group in model.optimizer.param_groups:
                param_group['lr'] = cur_lrate

        net_loss = model.update(ex)
        ml_loss.update(net_loss['ml_loss'], bsz)
        perplexity.update(net_loss['perplexity'], bsz)
        log_info = 'Epoch = %d [perplexity = %.2f, ml_loss = %.2f]' % \
                   (current_epoch, perplexity.avg, ml_loss.avg)
        pbar.set_description("%s" % log_info)

    # Record epoch-level aggregates with the logger.
    for key, value in (("perp_tr", perplexity.avg),
                       ("ml_lo_tr", ml_loss.avg),
                       ("epoch_time", epoch_time.time())):
        logger.add(current_epoch, **{key: value})
    logger.print(
        'train: Epoch %d | perplexity = %.2f | ml_loss = %.2f | '
        'Time for epoch = %.2f (s)' %
        (current_epoch, perplexity.avg, ml_loss.avg, epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(logger.path + '/best_model.cpt.checkpoint',
                         current_epoch + 1)
예제 #7
0
def init_from_scratch(args, train_exs, dev_exs, logger):
    """New model, new data, new dictionary."""
    # Build a dictionary from the data questions + words (train/dev splits)
    logger.print('-' * 100)
    logger.print('Build word dictionary')

    # Source dictionary; token granularity depends on sum_over_subtokens.
    src_attr = "subtokens" if args.sum_over_subtokens else "tokens"
    src_dict = util.build_word_and_char_dict(args,
                                             examples=train_exs,
                                             fields=['code'],
                                             dict_size=args.src_vocab_size,
                                             special_tokens="pad_unk",
                                             attrname=src_attr)

    # Target (summary) dictionary includes BOS/EOS for generation.
    tgt_dict = util.build_word_and_char_dict(
        args,
        examples=train_exs,
        fields=['summary'],
        dict_size=args.tgt_vocab_size,
        special_tokens="pad_unk_bos_eos")

    rel_dict = None
    if args.use_tree_relative_attn:
        # Unbounded vocabulary over tree-relative attention relations.
        rel_dict = util.build_word_and_char_dict(args,
                                                 examples=train_exs,
                                                 fields=["rel_matrix"],
                                                 dict_size=None,
                                                 special_tokens="unk")

    type_dict = None
    if args.use_code_type:
        # Unbounded vocabulary over code token types.
        type_dict = util.build_word_and_char_dict(args,
                                                  examples=train_exs,
                                                  fields=['code'],
                                                  dict_size=None,
                                                  special_tokens="pad_unk",
                                                  attrname="type")

    logger.print('Num words in source = %d and target = %d' %
                 (len(src_dict), len(tgt_dict)))
    if args.use_tree_relative_attn:
        logger.print("Num relations in relative matrix = %d" % (len(rel_dict)))

    # Initialize model
    model = Code2NaturalLanguage(config.get_model_args(args), src_dict,
                                 tgt_dict, rel_dict, type_dict)

    return model
예제 #8
0
def main(args, logger):
    """Top-level driver for the variable-misuse task: build or load the
    model, create data loaders, then run either a single test pass or the
    full train/validate loop.

    :param args: parsed command-line arguments (argparse Namespace)
    :param logger: project logger exposing ``print``/``save``/``add``/``path``
    """
    # --------------------------------------------------------------------------
    # MODEL
    logger.print('-' * 100)
    start_epoch = 1
    if args.only_test:
        if not os.path.isfile(args.model_file):
            raise IOError('No such file: %s' % args.model_file)
        model = VarmisuseModel.load(args.model_file)
    else:
        if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
            # Just resume training, no modifications.
            logger.print('Found a checkpoint...')
            checkpoint_file = args.model_file + '.checkpoint'
            model, start_epoch = VarmisuseModel.load_checkpoint(
                checkpoint_file, args.cuda)
        else:
            # Training starts fresh. But the model state is either pretrained or
            # newly (randomly) initialized.
            if args.pretrained:
                logger.print('Using pretrained model...')
                model = VarmisuseModel.load(args.pretrained, args)
            else:
                logger.print('Training model from scratch...')
                model = init_from_scratch(args, logger)

            # Set up optimizer
            model.init_optimizer()
            # log the parameter details
            logger.print(
                'Trainable #parameters [encoder-decoder] {} [total] {}'.format(
                    human_format(model.network.count_encoder_parameters() +
                                 model.network.count_decoder_parameters()),
                    human_format(model.network.count_parameters())))
            table = model.network.layer_wise_parameters()
            logger.print('Breakdown of the trainable paramters\n%s' % table)

    # Use the GPU?
    if args.cuda:
        model.cuda()

    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.print('-' * 100)
    logger.print('Make data loaders')

    dev_files = dict()
    dev_files['src'] = args.dev_src_file
    dev_files['src_tag'] = args.dev_src_tag_file
    dev_files['tgt'] = args.dev_tgt_file
    if not args.only_test:
        train_files = dict()
        train_files['src'] = args.train_src_file
        train_files['src_tag'] = args.train_src_tag_file
        train_files['tgt'] = args.train_tgt_file

        train_dataset = data_utils.VarmisuseDataset(model, args, train_files)
        if args.sort_by_len:
            train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                    args.batch_size,
                                                    shuffle=True)
        else:
            train_sampler = torch.utils.data.sampler.RandomSampler(
                train_dataset)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=train_sampler,
            num_workers=args.data_workers,
            collate_fn=data_utils.batchify_varmisuse,
            pin_memory=args.cuda,
            drop_last=args.parallel)

    dev_dataset = data_utils.VarmisuseDataset(model, args, dev_files)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)

    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=data_utils.batchify_varmisuse,
        pin_memory=args.cuda,
        drop_last=args.parallel)

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.print('-' * 100)
    logger.print('CONFIG:\n%s' %
                 json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # DO TEST

    if args.only_test:
        stats = {
            'timer': Timer(),
            'epoch': 100000,
            'best_valid': 0,
            'no_improvement': 0
        }
        validate_official(args, dev_loader, model, stats, logger, mode='test')
        logger.save(silent=True)

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    else:
        logger.print('-' * 100)
        logger.print('Starting training...')
        stats = {
            'timer': Timer(),
            'epoch': start_epoch,
            'best_valid': 0,
            'no_improvement': 0
        }

        if args.optimizer in ['sgd', 'adam'
                              ] and args.warmup_epochs >= start_epoch:
            logger.print(
                "Use warmup lrate for the %d epoch, from 0 up to %s." %
                (args.warmup_epochs, args.learning_rate))
            num_batches = len(train_loader.dataset) // args.batch_size
            warmup_factor = (args.learning_rate + 0.) / (num_batches *
                                                         args.warmup_epochs)
            stats['warmup_factor'] = warmup_factor

        for epoch in range(start_epoch, args.num_epochs + 1):
            stats['epoch'] = epoch
            # NOTE(review): decay kicks in at epoch > warmup_epochs + 1 here,
            # while the sibling training script uses epoch > warmup_epochs —
            # confirm which cutoff is intended.
            if args.optimizer in ['sgd', 'adam'
                                  ] and epoch > args.warmup_epochs + 1:
                model.optimizer.param_groups[0]['lr'] = \
                    model.optimizer.param_groups[0]['lr'] * args.lr_decay

            train(args, train_loader, model, stats, logger)
            if epoch % args.print_fq == 0:
                model.save(logger.path + '/best_model.cpt')
                result = validate_official(args, dev_loader, model, stats,
                                           logger)
            logger.save(silent=True)
            if epoch % args.save_fq == 0:
                model.save(logger.path + '/model_epoch%d.cpt' % epoch)

            # Save best valid
            # `result` is only bound on validation epochs; the print_fq check
            # below short-circuits first, avoiding a NameError.
            if ((epoch % args.print_fq == 0) and \
                              (result[args.valid_metric] > stats['best_valid'])):
                logger.print('Best valid: %s = %.2f (epoch %d, %d updates)' %
                             (args.valid_metric, result[args.valid_metric],
                              stats['epoch'], model.updates))
                stats['best_valid'] = result[args.valid_metric]
                stats['no_improvement'] = 0
            else:
                # NOTE(review): no_improvement also increments on epochs where
                # validation is skipped (epoch % print_fq != 0) — confirm early
                # stopping is meant to count every epoch.
                stats['no_improvement'] += 1
                if stats['no_improvement'] >= args.early_stop:
                    break
예제 #9
0
def validate_official(
    args,
    data_loader,
    model,
    global_stats,
    logger,
    mode='dev',
):
    """Run one full validation.

    Accumulates predictions over the whole loader, then computes bug
    localization and repair accuracies for the variable-misuse task and logs
    them under the current epoch.

    :param args: parsed command-line arguments (``use_tqdm`` is read here)
    :param data_loader: iterable of batched examples
    :param model: model exposing ``predict(ex)`` -> (logits_loc, logits_fix)
    :param global_stats: dict with the current 'epoch'
    :param logger: project logger exposing ``add`` and ``print``
    :param mode: label used in the final log line ('dev' or 'test')
    :return: dict of accuracy metrics plus evaluation time
    """
    eval_time = Timer()
    # Run through examples

    # Per-example accumulators, concatenated across batches
    # (initialized from the first batch).
    global_pred_loc, global_target_loc, is_buggy, global_target_probs, \
    global_correct_fix = None, None, None, None, None
    with torch.no_grad():
        if args.use_tqdm:
            pbar = tqdm(data_loader)
        else:
            pbar = data_loader
        for idx, ex in enumerate(pbar):
            batch_size = ex['batch_size']
            logits_loc, logits_fix = model.predict(ex)
            # NOTE(review): the -1 shift suggests index 0 of logits_loc encodes
            # a "no bug" slot, making predictions range over -1..seq_len-1 —
            # confirm against the model's output layout.
            pred_loc = np.argmax(logits_loc.cpu().numpy(), axis=1) - 1
            pred_fix = np.argmax(logits_fix.cpu().numpy(), axis=1)
            scope_mask = ex["scope_t"]  # batch x seq_len
            # Push out-of-scope positions to a huge negative logit so they get
            # ~zero probability after the softmax.
            logits_fix = logits_fix.masked_fill(~scope_mask, -1e18)
            pointer_probs = F.softmax(logits_fix, dim=1)  # batch x seq_len
            target_mask = ex["fixes_t"]  # batch x seq_len
            # Total probability mass assigned to the valid fix positions.
            target_probs = (target_mask * pointer_probs).sum(dim=-1)  # batch
            target_fix = ex["target_fix"].cpu().numpy()
            correct_fix = target_fix[np.arange(target_fix.shape[0]), pred_fix]
            if global_pred_loc is None:
                # First batch: initialize the accumulators.
                global_pred_loc = pred_loc
                global_target_loc = ex["target_pos"].cpu().numpy()
                global_correct_fix = correct_fix
                is_buggy = ex["mask_incorrect"].cpu().numpy()
                global_target_probs = target_probs.cpu().numpy()
            else:
                global_pred_loc = np.hstack((global_pred_loc, pred_loc))
                global_target_loc = np.hstack((global_target_loc,\
                                               ex["target_pos"].cpu().numpy()))
                global_correct_fix = np.hstack(
                    (global_correct_fix, correct_fix))
                is_buggy = np.hstack(
                    (is_buggy, ex["mask_incorrect"].cpu().numpy()))
                global_target_probs = np.hstack((global_target_probs, \
                                                target_probs.cpu().numpy()))
    # Store two metrics: the accuracy at predicting specifically the non-buggy samples correctly (to measure false alarm rate), and the accuracy at detecting the real bugs.
    loc_correct = (global_pred_loc == global_target_loc)
    # The 1e-9 terms guard against division by zero when a split contains no
    # buggy (or no clean) examples.
    no_bug_pred_acc = (
        (1 - is_buggy) * loc_correct).sum() / (1e-9 +
                                               (1 - is_buggy).sum()) * 100
    bug_loc_acc = (is_buggy * loc_correct).sum() / (1e-9 +
                                                    (is_buggy).sum()) * 100

    # Version by Hellendoorn et al:
    # To simplify the comparison, accuracy is computed as achieving >= 50% probability for the top guess
    # (as opposed to the slightly more accurate, but hard to compute quickly, greatest probability among distinct variable names).
    fix_correct = (global_target_probs >= 0.5)
    target_fix_acc = (is_buggy * fix_correct).sum() / (1e-9 +
                                                       (is_buggy).sum()) * 100

    # Joint metric: both the location and the fix must be right on buggy samples.
    joint_acc_bug = (is_buggy * loc_correct *
                     fix_correct).sum() / (1e-9 + (is_buggy).sum()) * 100
    result = dict()
    result['no_bug_pred_acc'] = no_bug_pred_acc
    result['bug_loc_acc'] = bug_loc_acc
    result['bug_fix_acc'] = target_fix_acc
    result['joint_acc_bug'] = joint_acc_bug
    result["ev_time"] = eval_time.time()
    logger.add(global_stats['epoch'], **result)

    logger.print("%s valid official: " % mode +
                 "no_bug_pred_acc = %.2f | bug_loc_acc = %.2f " %
                 (no_bug_pred_acc, bug_loc_acc) +
                 "target_fix_acc = %.2f | joint_acc_bug = %.2f " %
                 (target_fix_acc, joint_acc_bug) +
                 'test time = %.2f (s)' % eval_time.time())

    gc.collect()

    return result
예제 #10
0
    parser = argparse.ArgumentParser(
        'Code to Natural Language Generation',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_train_args(parser)
    config.add_model_args(parser)
    args, unknown = parser.parse_known_args()
    args.use_tgt_word = False
    args.tgt_pos_emb = False

    # Set cuda
    args.cuda = torch.cuda.is_available()
    args.parallel = torch.cuda.device_count() > 1

    # Set random state UNCOMMENT IF NEEDED
    # np.random.seed(args.random_seed)
    # torch.manual_seed(args.random_seed)
    # if args.cuda:
    #    torch.cuda.manual_seed(args.random_seed)

    # Set logging
    if args.only_test:
        path = args.model_file[:args.model_file.rfind("/") + 1] + "eval/"
        logger = logger.Logger("", fmt={}, base=args.dir, path=path)
    else:
        logger = logger.Logger(args.comment, fmt={}, base=args.dir)
    logger.print(" ".join(sys.argv))
    logger.print(args)

    set_defaults(args)

    main(args, logger)
예제 #11
0
def convert(args):
    """Batch-process .mscz files: inject spreadsheet metadata and text fields
    into each score and write the updated archives to the output directory.

    :param args: argparse Namespace with ``input_directory``,
        ``output_directory`` and ``parameters_directory`` paths
    """
    tmp_directory = os.path.abspath(os.path.join(args.output_directory,
                                                 'temp'))

    # Make sure all working directories exist before processing.
    for path in [
            args.output_directory, tmp_directory, args.parameters_directory
    ]:
        if not os.path.exists(path):
            os.makedirs(path)

    # logger_output_filepath = os.path.abspath(os.path.join(tmp_directory, 'logs.txt'))
    # logger.config(logger_output_filepath)

    # Sheet layout: row 0 holds tag names, row 1 holds text-field names;
    # build bidirectional column lookups from both header rows.
    xlsx_filepath = os.path.abspath(
        os.path.join(args.parameters_directory, 'metadata.xlsm'))
    xlsx = pd.read_excel(xlsx_filepath, header=None)
    columnIndexToTagName = {
        columnIndex: tagName
        for columnIndex, tagName in xlsx.iloc[0].items()
        if isinstance(tagName, str)
    }
    tagNameToColumnIndex = {
        value: key
        for key, value in columnIndexToTagName.items()
    }
    columnIndexToTextFieldName = {
        columnIndex: tagName
        for columnIndex, tagName in xlsx.iloc[1].items()
        if isinstance(tagName, str)
    }
    textFieldNameToColumnIndex = {
        value: key
        for key, value in columnIndexToTextFieldName.items()
    }
    # Data rows start at sheet row 4; rows 0-3 are headers/metadata.
    data = xlsx[4:]

    mscz_filenames = [
        filename for filename in os.listdir(args.input_directory)
        if os.path.splitext(filename)[-1].lower() == '.mscz'
    ]

    print('Processing files from directory "{}" into directory "{}".'.format(
        args.input_directory, args.output_directory))

    # TODO: load mapping from file
    mapping.load(args)
    # TODO: for each file, check if already mapped. Otherwise compute mapping
    for filename in mscz_filenames:
        mapping.checkFile(filename, data, tagNameToColumnIndex)
    # TODO: persist mapping
    mapping.persist(args)
    # TODO: process files
    for filename in mscz_filenames:
        print('')
        print('Processing file "{}" ...'.format(filename))
        inputMsczPath = os.path.join(args.input_directory, filename)
        outputMsczPath = os.path.join(args.output_directory,
                                      os.path.basename(filename))
        # rowId, rowData = findBestMatchingRow(filename, data, tagNameToColumnIndex)
        fileData = mapping.checkFile(filename, data, tagNameToColumnIndex)
        rowData = fileData[mapping.ROW_DATA]
        filename_extensionless = os.path.splitext(filename)[0]
        tmp_unzipped_path = os.path.join(tmp_directory, filename_extensionless)
        # An .mscz file is a zip archive; unpack it into the temp directory.
        # https://stackoverflow.com/questions/1807063/extract-files-with-invalid-characters-in-filename-with-python
        with zipfile.ZipFile(inputMsczPath, "r") as zip_ref:
            zip_ref.extractall(tmp_unzipped_path)
        mscx_filenames = [
            filename for filename in os.listdir(tmp_unzipped_path)
            if os.path.splitext(filename)[-1].lower() == '.mscx'
        ]
        original_mscx_filename = mscx_filenames[0]
        # Rename the inner score to a fixed ASCII-safe name.
        # ascii_mscx_filename = trans.trans('{}.mscx'.format(filename_extensionless))
        ascii_mscx_filename = 'file.mscx'
        original_tmp_mscx_path = os.path.join(tmp_unzipped_path,
                                              original_mscx_filename)
        safe_tmp_mscx_path = os.path.join(tmp_unzipped_path,
                                          ascii_mscx_filename)
        os.rename(original_tmp_mscx_path, safe_tmp_mscx_path)
        print('Working on temporary {} file...'.format(safe_tmp_mscx_path))

        # Inject spreadsheet metadata and text fields into the score XML.
        xmlTree = ET.parse(safe_tmp_mscx_path)
        injectMetadata(xmlTree, rowData, tagNameToColumnIndex)
        injectTextField(xmlTree, rowData, textFieldNameToColumnIndex)

        xmlTree.write(safe_tmp_mscx_path)

        # update container metadata to take the safe filename into account
        metadata_file_path = os.path.join(tmp_unzipped_path,
                                          'META-INF/container.xml')
        metadataTree = ET.parse(metadata_file_path)
        rootfileTag = safe_child(metadataTree.getroot(), 'rootfile')
        rootfileTag.set('full-path', ascii_mscx_filename)

        # create the new mscz archive
        shutil.make_archive(outputMsczPath, 'zip', tmp_unzipped_path)
        os.rename('{}.zip'.format(outputMsczPath), outputMsczPath)
        pass

    print('Done')
예제 #12
0
        '--do_not_auto_upgrade',
        type=bool,
        default=False,
        help='Will download all the requirements, upgrade pip, ...')
    args = parser.parse_args()
    args.input_directory = os.path.abspath(args.input_directory)
    args.output_directory = os.path.abspath(args.output_directory)
    if os.path.exists(args.output_directory):
        shutil.rmtree(args.output_directory)
    if not args.do_not_auto_upgrade:
        import upgrade
        try:
            upgrade.checkRequirements()
        except Exception as e:
            print('Did not install Python packages, '
                  'but maybe are they already available...\n{}\n{}'.format(
                      e, str(e)))

import zipfile
import shutil
import pandas as pd
from lxml import etree as ET

from toxml import findBestMatchingRow, injectMetadata, \
    injectTextField, safe_child
import mapping

# import trans


def convert(args):
예제 #13
0
def main(args, logger):
    """Top-level driver for the code-to-comment task: load data, build or
    load the model, create data loaders, then run either a single test pass
    or the full train/validate loop.

    :param args: parsed command-line arguments (argparse Namespace)
    :param logger: project logger exposing ``print``/``save``/``add``/``path``
    """
    # --------------------------------------------------------------------------
    # DATA
    logger.print('-' * 100)
    logger.print('Load and process data files')

    train_exs = []
    if not args.only_test:
        # Load each training dataset and record its per-language example count.
        args.dataset_weights = dict()
        for train_src, train_src_tag, train_tgt, train_rel_matrix, dataset_name in \
                zip(args.train_src_files, args.train_src_tag_files,
                    args.train_tgt_files, args.train_rel_matrix_files,\
                    args.dataset_name):
            train_files = dict()
            train_files['src'] = train_src
            train_files['src_tag'] = train_src_tag
            train_files['tgt'] = train_tgt
            train_files["rel_matrix"] = train_rel_matrix
            exs = util.load_data(args,
                                 train_files,
                                 max_examples=args.max_examples,
                                 dataset_name=dataset_name)
            lang_name = constants.DATA_LANG_MAP[dataset_name]
            args.dataset_weights[constants.LANG_ID_MAP[lang_name]] = len(exs)
            train_exs.extend(exs)

        logger.print('Num train examples = %d' % len(train_exs))
        args.num_train_examples = len(train_exs)
        # Normalize per-language counts into fractional weights (2 decimals).
        for lang_id in args.dataset_weights.keys():
            weight = (1.0 * args.dataset_weights[lang_id]) / len(train_exs)
            args.dataset_weights[lang_id] = round(weight, 2)
        logger.print('Dataset weights = %s' % str(args.dataset_weights))

    dev_exs = []
    for dev_src, dev_src_tag, dev_tgt, dev_rel_matrix, dataset_name in \
            zip(args.dev_src_files, args.dev_src_tag_files,
                args.dev_tgt_files, args.dev_rel_matrix_files, args.dataset_name):
        dev_files = dict()
        dev_files['src'] = dev_src
        dev_files['src_tag'] = dev_src_tag
        dev_files['tgt'] = dev_tgt
        dev_files["rel_matrix"] = dev_rel_matrix
        exs = util.load_data(args,
                             dev_files,
                             max_examples=args.max_examples,
                             dataset_name=dataset_name,
                             test_split=True)
        dev_exs.extend(exs)
    logger.print('Num dev examples = %d' % len(dev_exs))

    # --------------------------------------------------------------------------
    # MODEL
    logger.print('-' * 100)
    start_epoch = 1
    if args.only_test:
        #if args.pretrained:
        #    model = Code2NaturalLanguage.load(args.pretrained)
        #else:
        if not os.path.isfile(args.model_file):
            raise IOError('No such file: %s' % args.model_file)
        model = Code2NaturalLanguage.load(args.model_file)
    else:
        if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
            # Just resume training, no modifications.
            logger.print('Found a checkpoint...')
            checkpoint_file = args.model_file + '.checkpoint'
            model, start_epoch = Code2NaturalLanguage.load_checkpoint(
                checkpoint_file, args.cuda)
        else:
            # Training starts fresh. But the model state is either pretrained or
            # newly (randomly) initialized.
            if args.pretrained:
                logger.print('Using pretrained model...')
                model = Code2NaturalLanguage.load(args.pretrained, args)
            else:
                logger.print('Training model from scratch...')
                model = init_from_scratch(args, train_exs, dev_exs, logger)

            # Set up optimizer
            model.init_optimizer()
            # log the parameter details
            logger.print(
                'Trainable #parameters [encoder-decoder] {} [total] {}'.format(
                    human_format(model.network.count_encoder_parameters() +
                                 model.network.count_decoder_parameters()),
                    human_format(model.network.count_parameters())))
            table = model.network.layer_wise_parameters()
            logger.print('Breakdown of the trainable paramters\n%s' % table)

    # Use the GPU?
    if args.cuda:
        model.cuda()

    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.print('-' * 100)
    logger.print('Make data loaders')

    if not args.only_test:
        train_dataset = data.CommentDataset(train_exs, model)
        if args.sort_by_len:
            train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                    args.batch_size,
                                                    shuffle=True)
        else:
            train_sampler = torch.utils.data.sampler.RandomSampler(
                train_dataset)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=train_sampler,
            num_workers=args.data_workers,
            collate_fn=vector.batchify,
            pin_memory=args.cuda,
            drop_last=args.parallel)

    dev_dataset = data.CommentDataset(dev_exs, model)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)

    dev_loader = torch.utils.data.DataLoader(dev_dataset,
                                             batch_size=args.test_batch_size,
                                             sampler=dev_sampler,
                                             num_workers=args.data_workers,
                                             collate_fn=vector.batchify,
                                             pin_memory=args.cuda,
                                             drop_last=args.parallel)

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.print('-' * 100)
    logger.print('CONFIG:\n%s' %
                 json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # DO TEST

    if args.only_test:
        stats = {
            'timer': Timer(),
            'epoch': 100000,
            'best_valid': 0,
            'no_improvement': 0
        }
        validate_official(args, dev_loader, model, stats, logger, mode='test')

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    else:
        logger.print('-' * 100)
        logger.print('Starting training...')
        stats = {
            'timer': Timer(),
            'epoch': start_epoch,
            'best_valid': 0,
            'no_improvement': 0
        }

        if args.optimizer in ['sgd', 'adam'
                              ] and args.warmup_epochs >= start_epoch:
            logger.print(
                "Use warmup lrate for the %d epoch, from 0 up to %s." %
                (args.warmup_epochs, args.learning_rate))
            num_batches = len(train_loader.dataset) // args.batch_size
            warmup_factor = (args.learning_rate + 0.) / (num_batches *
                                                         args.warmup_epochs)
            stats['warmup_factor'] = warmup_factor

        for epoch in range(start_epoch, args.num_epochs + 1):
            stats['epoch'] = epoch
            # Exponential lr decay once warmup is over.
            # NOTE(review): cutoff is epoch > warmup_epochs here, while the
            # sibling varmisuse script uses epoch > warmup_epochs + 1 —
            # confirm which is intended.
            if args.optimizer in ['sgd', 'adam'
                                  ] and epoch > args.warmup_epochs:
                model.optimizer.param_groups[0]['lr'] = \
                    model.optimizer.param_groups[0]['lr'] * args.lr_decay

            train(args, train_loader, model, stats, logger)
            if epoch % args.print_fq == 0:
                result = validate_official(args, dev_loader, model, stats,
                                           logger)
            logger.save(silent=True)

            # Save best valid
            # `result` is only bound on validation epochs; the print_fq check
            # below short-circuits first, avoiding a NameError.
            if ((epoch % args.print_fq == 0) and \
                              (result[args.valid_metric] > stats['best_valid'])):
                logger.print('Best valid: %s = %.2f (epoch %d, %d updates)' %
                             (args.valid_metric, result[args.valid_metric],
                              stats['epoch'], model.updates))
                model.save(logger.path + '/best_model.cpt')
                stats['best_valid'] = result[args.valid_metric]
                stats['no_improvement'] = 0
            else:
                # NOTE(review): no_improvement also increments on epochs where
                # validation is skipped (epoch % print_fq != 0) — confirm early
                # stopping is meant to count every epoch.
                stats['no_improvement'] += 1
                if stats['no_improvement'] >= args.early_stop:
                    break
예제 #14
0
def validate_official(
    args,
    data_loader,
    model,
    global_stats,
    logger,
    mode='dev',
):
    """Run one full validation pass over *data_loader*.

    Collects the model's predictions and the reference summaries for every
    example, then scores them with BLEU / ROUGE-L / METEOR plus
    precision / recall / F1 via ``eval_accuracies``.

    Args:
        args: parsed command-line namespace (uses save_pred, print_copy_info).
        data_loader: iterable of batches; each batch dict provides
            'batch_size' and 'code_text'.
        model: model exposing ``predict(ex, replace_unk=...)``.
        global_stats: dict with the current 'epoch' (for progress display).
        logger: logger exposing ``path``, ``add`` and ``print``.
        mode: 'dev' or 'test' — only changes how results are logged.

    Returns:
        dict with keys bleu, rouge_l, meteor, precision, recall, f1,
        ev_time and examples.
    """
    eval_time = Timer()
    # Run through examples
    examples = 0
    sources, hypotheses, references, copy_dict = dict(), dict(), dict(), dict()
    with torch.no_grad():
        pbar = tqdm(data_loader)
        for idx, ex in enumerate(pbar):
            batch_size = ex['batch_size']
            # Global example ids for this batch, in dataset order.
            ex_ids = list(
                range(idx * batch_size, (idx * batch_size) + batch_size))
            predictions, targets, copy_info = model.predict(ex,
                                                            replace_unk=True)

            # Plain copy of the raw code strings for this batch.
            src_sequences = list(ex['code_text'])
            examples += batch_size
            for key, src, pred, tgt in zip(ex_ids, src_sequences, predictions,
                                           targets):
                hypotheses[key] = [pred]
                # eval_accuracies expects a list of accepted references.
                references[key] = tgt if isinstance(tgt, list) else [tgt]
                sources[key] = src

            if copy_info is not None:
                copy_info = copy_info.cpu().numpy().astype(int).tolist()
                for key, cp in zip(ex_ids, copy_info):
                    copy_dict[key] = cp

            pbar.set_description('Epoch = %d [validating ... ]' %
                                 global_stats['epoch'])

    copy_dict = None if len(copy_dict) == 0 else copy_dict
    bleu, rouge_l, meteor, precision, recall, f1 = eval_accuracies(hypotheses,
                                                                   references,
                                                                   copy_dict,
                                                                   sources=sources,
                                                                   filename=logger.path+"/preds.json"\
                                                                       if args.save_pred else None,
                                                                   print_copy_info=args.print_copy_info,
                                                                   mode=mode)
    result = dict()
    result['bleu'] = bleu
    result['rouge_l'] = rouge_l
    result['meteor'] = meteor
    result['precision'] = precision
    result['recall'] = recall
    result['f1'] = f1
    result["ev_time"] = eval_time.time()
    result["examples"] = examples
    logger.add(global_stats['epoch'], **result)

    if mode == 'test':
        logger.print('test valid official: '
                     'bleu = %.2f | rouge_l = %.2f | meteor = %.2f | ' %
                     (bleu, rouge_l, meteor) +
                     'Precision = %.2f | Recall = %.2f | F1 = %.2f | '
                     'examples = %d | ' % (precision, recall, f1, examples) +
                     'test time = %.2f (s)' % eval_time.time())

    else:
        logger.print(
            'dev valid official: Epoch = %d | ' % (global_stats['epoch']) +
            'bleu = %.2f | rouge_l = %.2f | meteor = %.2f | '
            'Precision = %.2f | Recall = %.2f | F1 = %.2f | examples = %d | ' %
            (bleu, rouge_l, meteor, precision, recall, f1, examples) +
            'valid time = %.2f (s)' % eval_time.time())

    return result
예제 #15
0
def _resolve_dataset_files(args, split):
    """Expand, validate, and register data file paths for one split.

    For the given *split* ('train' or 'dev') this:
      * broadcasts any length-1 path list across all datasets,
      * joins each relative path with its dataset directory,
      * raises IOError for any required file that does not exist,
      * stores the resolved lists on args as <split>_src_files,
        <split>_tgt_files, <split>_src_tag_files, <split>_rel_matrix_files.

    Tag / relative-matrix files are only resolved when the corresponding
    feature flag (use_code_type / use_tree_relative_attn) is set; otherwise
    None placeholders keep the lists index-aligned with the datasets.
    """
    num_dataset = len(args.dataset_name)
    if num_dataset > 1:
        # A single user-supplied entry is shared by every dataset.
        for attr in ('src', 'tgt', 'src_tag', 'rel_matrix'):
            name = '%s_%s' % (split, attr)
            values = getattr(args, name)
            if len(values) == 1:
                setattr(args, name, values * num_dataset)

    src_files, tgt_files = [], []
    src_tag_files, rel_matrix_files = [], []
    for i in range(num_dataset):
        data_dir = os.path.join(args.data_dir, args.dataset_name[i])
        src = os.path.join(data_dir, getattr(args, split + '_src')[i])
        tgt = os.path.join(data_dir, getattr(args, split + '_tgt')[i])
        if not os.path.isfile(src):
            raise IOError('No such file: %s' % src)
        if not os.path.isfile(tgt):
            raise IOError('No such file: %s' % tgt)
        if args.use_code_type:
            src_tag = os.path.join(data_dir,
                                   getattr(args, split + '_src_tag')[i])
            if not os.path.isfile(src_tag):
                raise IOError('No such file: %s' % src_tag)
        else:
            src_tag = None
        if args.use_tree_relative_attn:
            rel_matrix = os.path.join(data_dir,
                                      getattr(args, split + '_rel_matrix')[i])
            if not os.path.isfile(rel_matrix):
                raise IOError('No such file: %s' % rel_matrix)
        else:
            rel_matrix = None

        src_files.append(src)
        tgt_files.append(tgt)
        src_tag_files.append(src_tag)
        rel_matrix_files.append(rel_matrix)

    setattr(args, split + '_src_files', src_files)
    setattr(args, split + '_tgt_files', tgt_files)
    setattr(args, split + '_src_tag_files', src_tag_files)
    setattr(args, split + '_rel_matrix_files', rel_matrix_files)


def set_defaults(args):
    """Make sure the commandline arguments are initialized properly.

    Resolves and validates per-dataset file paths for the train split
    (skipped when only_test) and the dev split, then reconciles the
    fix_embeddings flag with the embedding configuration.

    Returns the (mutated) args namespace.
    """
    # Check critical files exist for each requested split.
    if not args.only_test:
        _resolve_dataset_files(args, 'train')
    _resolve_dataset_files(args, 'dev')

    if args.use_src_word or args.use_tgt_word:
        # Fixing randomly-initialized embeddings makes no sense: warn and
        # unset unless a pretrained model supplies them.
        if args.fix_embeddings and not args.pretrained:
            logger.print('WARN: fix_embeddings set to False '
                         'as embeddings are random.')
            args.fix_embeddings = False
    else:
        args.fix_embeddings = False

    return args