def train(self, num_epochs, start_epoch=0):
    """
    Starts the training, logs loss and metrics in logging file and prints
    progress in the console, including an ETA. Also stores snapshots of the
    current model each epoch.

    :param num_epochs: number of epochs to train
    :param start_epoch: the first epoch, default to 0. Can be set higher for
        finetuning, etc.
    :return: the trainer's ``state`` dict after the final epoch
    """
    try:
        # Estimates remaining wall-clock time from the number of epochs left.
        rem = ETATimer(num_epochs - start_epoch)
        for epoch in range(start_epoch + 1, num_epochs + 1):
            self.state['epoch'] = epoch
            if not self.state['quiet']:
                # NOTE(review): indentation reconstructed — print_info() is
                # grouped under the quiet check; confirm against original.
                print('Epoch', epoch)
                self.print_info()
            self.train_epoch(epoch)
            # Validation only runs when a validation iterator is configured.
            if self.state['val_iter'] is not None:
                self.validate_epoch()
            # Persist a snapshot of the model after every epoch.
            self.snapshot(epoch)
            if not self.state['quiet']:
                print('ETA:', rem())
        return self.state
    except Exception as e:
        # Log the full traceback, then propagate to the caller.
        logging.exception(e)
        raise
def train(args, data_loader, model, global_stats, logger):
    """Run through one epoch of model training with the provided data loader.

    Applies linear lr warmup during the first ``args.warmup_epochs`` epochs,
    tracks running averages of the total/localization/fix losses, logs
    progress, and optionally writes a training checkpoint at the end.

    :param args: experiment configuration namespace
    :param data_loader: iterable over training batches
    :param model: model wrapper exposing ``update``/``updates``/``optimizer``
    :param global_stats: dict with at least 'epoch' (and 'warmup_factor'
        during warmup epochs)
    :param logger: experiment logger with ``add``/``print``
    """
    # Initialize meters + timers
    ml_loss = AverageMeter()
    loc_loss = AverageMeter()
    fix_loss = AverageMeter()
    epoch_time = Timer()
    current_epoch = global_stats['epoch']
    if args.use_tqdm:
        pbar = tqdm(data_loader)
        # FIX: the placeholder description was missing the opening '[' that
        # the per-batch description (log_info below) includes; keep them
        # consistent.
        pbar.set_description(
            "%s" %
            'Epoch = %d [tot_loss = x.xx loc_loss = x.xx fix_loss = x.xx]' %
            current_epoch)
    else:
        pbar = data_loader

    # Run one epoch
    for idx, ex in enumerate(pbar):
        bsz = ex['batch_size']
        # Linear learning-rate warmup for the first `warmup_epochs` epochs.
        if args.optimizer in ['sgd', 'adam'
                              ] and current_epoch <= args.warmup_epochs:
            cur_lrate = global_stats['warmup_factor'] * (model.updates + 1)
            for param_group in model.optimizer.param_groups:
                param_group['lr'] = cur_lrate

        net_loss = model.update(ex)
        ml_loss.update(net_loss["loss"].detach().item(), bsz)
        loc_loss.update(net_loss["loc_loss"].detach().item(), bsz)
        fix_loss.update(net_loss["fix_loss"].detach().item(), bsz)
        log_info = 'Epoch = %d [tot_loss = %.2f loc_loss = %.2f fix_loss = %.2f]' % \
            (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg)
        if args.use_tqdm:
            pbar.set_description("%s" % log_info)
        # Periodically mirror progress into the persistent log.
        if idx % 1000 == 0:
            logger.print(
                'train: Epoch %d | tot_loss = %.2f | loc_loss = %.2f | fix_loss = %.2f'
                % (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg))

    # Record epoch-level metrics.
    kvs = [("ml_lo_tr", ml_loss.avg), ("loc_lo_tr", loc_loss.avg),
           ("fix_lo_tr", fix_loss.avg), ("epoch_time", epoch_time.time())]
    for k, v in kvs:
        logger.add(current_epoch, **{k: v})
    logger.print(
        'train: Epoch %d | tot_loss = %.2f | loc_loss = %.2f | fix_loss = %.2f | '
        'Time for epoch = %.2f (s)' %
        (current_epoch, ml_loss.avg, loc_loss.avg, fix_loss.avg,
         epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(logger.path + '/model.cpt.checkpoint',
                         current_epoch + 1)
    gc.collect()
def print_info(self):
    """Print the current learning rates unless the trainer is in quiet mode."""
    if self.state['quiet']:
        return
    rates = ', '.join(str(lr) for lr in self._lrs())
    print('learning rates ' + rates)
def init_from_scratch(args, logger):
    """Build fresh source/type dictionaries and return a new VarmisuseModel.

    :param args: experiment configuration namespace
    :param logger: experiment logger
    :return: a newly constructed ``VarmisuseModel``
    """
    # Build a dictionary from the data questions + words (train/dev splits)
    logger.print('-' * 100)
    logger.print('Build word dictionary')
    if args.src_dict_filename is None:
        # No pre-built dictionary: derive one from the training source file.
        src_dict = util.build_word_and_char_dict_from_file(
            filenames=[args.train_src_file],
            dict_size=args.src_vocab_size,
            special_tokens="pad_unk")
    else:
        logger.print("Loading dict. from " + args.src_dict_filename)
        src_dict = util.load_word_and_char_dict(
            args,
            args.src_dict_filename,
            dict_size=args.src_vocab_size,
            special_tokens="pad_unk")
    # Type tags always come from the tag file, with no size cap.
    type_dict = util.build_word_and_char_dict_from_file(
        filenames=[args.train_src_tag_file],
        special_tokens="pad_unk",
        dict_size=None)
    logger.print('Num words in source = %d' % len(src_dict))

    # Initialize model
    return VarmisuseModel(config.get_model_args(args), src_dict, type_dict)
def findBestMatchingRow(filename, data, tagNameToColumnIndex):
    """Return the spreadsheet row whose work title best matches *filename*.

    :param filename: file name used as the fuzzy-match query
    :param data: pandas DataFrame of metadata rows
    :param tagNameToColumnIndex: maps tag names (e.g. 'workTitle') to columns
    :return: the best ``(index, row_series)`` pair from ``data.iterrows()``
    """
    title_col = tagNameToColumnIndex['workTitle']

    def similarity(row):
        # Fuzzy similarity between the row's work title and the filename.
        # see https://stackoverflow.com/a/17903726
        # This might be a better approach: https://stackoverflow.com/a/36132391
        return difflib.SequenceMatcher(None, str(row[1][title_col]),
                                       filename).ratio()

    # FIX: use max() (O(n)) instead of sorting every row and taking the first
    # (O(n log n)). Ties resolve identically: sorted() is stable, so both
    # forms return the first row achieving the best ratio.
    selection = max(data.iterrows(), key=similarity)
    print('Selected XLSX row {}: "{}"'.format(
        selection[0], selection[1][title_col]))
    return selection
def train(args, data_loader, model, global_stats, logger):
    """Run through one epoch of model training with the provided data loader.

    Applies linear lr warmup during the first ``args.warmup_epochs`` epochs,
    tracks running perplexity / ML-loss averages, logs epoch metrics, and
    optionally writes a training checkpoint.

    :param args: experiment configuration namespace
    :param data_loader: iterable over training batches
    :param model: model wrapper exposing ``update``/``updates``/``optimizer``
    :param global_stats: dict with at least 'epoch' (and 'warmup_factor'
        during warmup epochs)
    :param logger: experiment logger with ``add``/``print``
    """
    # Initialize meters + timers
    ml_loss = AverageMeter()
    perplexity = AverageMeter()
    epoch_time = Timer()
    current_epoch = global_stats['epoch']
    pbar = tqdm(data_loader)
    pbar.set_description("%s" %
                         'Epoch = %d [perplexity = x.xx, ml_loss = x.xx]' %
                         current_epoch)

    # Run one epoch
    for idx, ex in enumerate(pbar):
        bsz = ex['batch_size']
        # Linear learning-rate warmup for the first `warmup_epochs` epochs.
        if args.optimizer in ['sgd', 'adam'
                              ] and current_epoch <= args.warmup_epochs:
            cur_lrate = global_stats['warmup_factor'] * (model.updates + 1)
            for param_group in model.optimizer.param_groups:
                param_group['lr'] = cur_lrate

        net_loss = model.update(ex)
        ml_loss.update(net_loss['ml_loss'], bsz)
        perplexity.update(net_loss['perplexity'], bsz)
        log_info = 'Epoch = %d [perplexity = %.2f, ml_loss = %.2f]' % \
            (current_epoch, perplexity.avg, ml_loss.avg)
        pbar.set_description("%s" % log_info)
        # (removed a leftover commented-out `break` debugging statement)

    # Record epoch-level metrics.
    kvs = [("perp_tr", perplexity.avg), ("ml_lo_tr", ml_loss.avg),
           ("epoch_time", epoch_time.time())]
    for k, v in kvs:
        logger.add(current_epoch, **{k: v})
    logger.print('train: Epoch %d | perplexity = %.2f | ml_loss = %.2f | '
                 'Time for epoch = %.2f (s)' %
                 (current_epoch, perplexity.avg, ml_loss.avg,
                  epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        # NOTE(review): checkpoint is written to 'best_model.cpt.checkpoint'
        # while resume logic elsewhere loads args.model_file + '.checkpoint' —
        # confirm the two paths line up.
        model.checkpoint(logger.path + '/best_model.cpt.checkpoint',
                         current_epoch + 1)
def init_from_scratch(args, train_exs, dev_exs, logger):
    """Create fresh vocabularies from the training examples and a new model.

    :param args: experiment configuration namespace
    :param train_exs: training examples used to build the dictionaries
    :param dev_exs: dev examples (currently unused for dictionary building)
    :param logger: experiment logger
    :return: a newly constructed ``Code2NaturalLanguage`` model
    """
    # Build a dictionary from the data questions + words (train/dev splits)
    logger.print('-' * 100)
    logger.print('Build word dictionary')
    code_attr = "subtokens" if args.sum_over_subtokens else "tokens"
    src_dict = util.build_word_and_char_dict(args,
                                             examples=train_exs,
                                             fields=['code'],
                                             dict_size=args.src_vocab_size,
                                             special_tokens="pad_unk",
                                             attrname=code_attr)
    tgt_dict = util.build_word_and_char_dict(args,
                                             examples=train_exs,
                                             fields=['summary'],
                                             dict_size=args.tgt_vocab_size,
                                             special_tokens="pad_unk_bos_eos")
    rel_dict = None
    if args.use_tree_relative_attn:
        # Vocabulary over tree-relative attention relation labels.
        rel_dict = util.build_word_and_char_dict(args,
                                                 examples=train_exs,
                                                 fields=["rel_matrix"],
                                                 dict_size=None,
                                                 special_tokens="unk")
    type_dict = None
    if args.use_code_type:
        # Vocabulary over token type tags.
        type_dict = util.build_word_and_char_dict(args,
                                                  examples=train_exs,
                                                  fields=['code'],
                                                  dict_size=None,
                                                  special_tokens="pad_unk",
                                                  attrname="type")
    logger.print('Num words in source = %d and target = %d' %
                 (len(src_dict), len(tgt_dict)))
    if args.use_tree_relative_attn:
        logger.print("Num relations in relative matrix = %d" % len(rel_dict))

    # Initialize model
    return Code2NaturalLanguage(config.get_model_args(args), src_dict,
                                tgt_dict, rel_dict, type_dict)
def main(args, logger):
    """Top-level driver: load or initialize the varmisuse model, build the
    data loaders, then either evaluate (``--only_test``) or run the
    train/validate loop with warmup, lr decay, and early stopping.

    :param args: fully-resolved experiment configuration namespace
    :param logger: experiment logger used for console + file output
    """
    # --------------------------------------------------------------------------
    # MODEL
    logger.print('-' * 100)
    start_epoch = 1
    if args.only_test:
        if not os.path.isfile(args.model_file):
            raise IOError('No such file: %s' % args.model_file)
        model = VarmisuseModel.load(args.model_file)
    else:
        if args.checkpoint and os.path.isfile(args.model_file +
                                              '.checkpoint'):
            # Just resume training, no modifications.
            logger.print('Found a checkpoint...')
            checkpoint_file = args.model_file + '.checkpoint'
            model, start_epoch = VarmisuseModel.load_checkpoint(
                checkpoint_file, args.cuda)
        else:
            # Training starts fresh. But the model state is either pretrained or
            # newly (randomly) initialized.
            if args.pretrained:
                logger.print('Using pretrained model...')
                model = VarmisuseModel.load(args.pretrained, args)
            else:
                logger.print('Training model from scratch...')
                model = init_from_scratch(args, logger)

        # Set up optimizer
        model.init_optimizer()
        # log the parameter details
        logger.print(
            'Trainable #parameters [encoder-decoder] {} [total] {}'.format(
                human_format(model.network.count_encoder_parameters() +
                             model.network.count_decoder_parameters()),
                human_format(model.network.count_parameters())))
        table = model.network.layer_wise_parameters()
        logger.print('Breakdown of the trainable paramters\n%s' % table)

    # Use the GPU?
    if args.cuda:
        model.cuda()

    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.print('-' * 100)
    logger.print('Make data loaders')

    dev_files = dict()
    dev_files['src'] = args.dev_src_file
    dev_files['src_tag'] = args.dev_src_tag_file
    dev_files['tgt'] = args.dev_tgt_file
    if not args.only_test:
        train_files = dict()
        train_files['src'] = args.train_src_file
        train_files['src_tag'] = args.train_src_tag_file
        train_files['tgt'] = args.train_tgt_file
        train_dataset = data_utils.VarmisuseDataset(model, args, train_files)
        if args.sort_by_len:
            # Batching similar-length sequences reduces padding work.
            train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                    args.batch_size,
                                                    shuffle=True)
        else:
            train_sampler = torch.utils.data.sampler.RandomSampler(
                train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=train_sampler,
            num_workers=args.data_workers,
            collate_fn=data_utils.batchify_varmisuse,
            pin_memory=args.cuda,
            drop_last=args.parallel)

    dev_dataset = data_utils.VarmisuseDataset(model, args, dev_files)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=data_utils.batchify_varmisuse,
        pin_memory=args.cuda,
        drop_last=args.parallel)

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.print('-' * 100)
    logger.print('CONFIG:\n%s' %
                 json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # DO TEST
    if args.only_test:
        stats = {
            'timer': Timer(),
            'epoch': 100000,
            'best_valid': 0,
            'no_improvement': 0
        }
        validate_official(args, dev_loader, model, stats, logger, mode='test')
        logger.save(silent=True)

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    else:
        logger.print('-' * 100)
        logger.print('Starting training...')
        stats = {
            'timer': Timer(),
            'epoch': start_epoch,
            'best_valid': 0,
            'no_improvement': 0
        }
        if args.optimizer in ['sgd', 'adam'
                              ] and args.warmup_epochs >= start_epoch:
            logger.print(
                "Use warmup lrate for the %d epoch, from 0 up to %s." %
                (args.warmup_epochs, args.learning_rate))
            num_batches = len(train_loader.dataset) // args.batch_size
            # Per-update lr increment so lr reaches args.learning_rate after
            # warmup_epochs full epochs.
            warmup_factor = (args.learning_rate + 0.) / (num_batches *
                                                         args.warmup_epochs)
            stats['warmup_factor'] = warmup_factor

        for epoch in range(start_epoch, args.num_epochs + 1):
            stats['epoch'] = epoch
            # NOTE(review): decay kicks in at warmup_epochs + 2 here, while
            # the companion summarization driver decays from warmup_epochs + 1
            # — confirm which offset is intended.
            if args.optimizer in ['sgd', 'adam'
                                  ] and epoch > args.warmup_epochs + 1:
                model.optimizer.param_groups[0]['lr'] = \
                    model.optimizer.param_groups[0]['lr'] * args.lr_decay

            train(args, train_loader, model, stats, logger)
            if epoch % args.print_fq == 0:
                # Model is saved before validation, regardless of quality.
                model.save(logger.path + '/best_model.cpt')
                result = validate_official(args, dev_loader, model, stats,
                                           logger)
                logger.save(silent=True)
            if epoch % args.save_fq == 0:
                model.save(logger.path + '/model_epoch%d.cpt' % epoch)

            # Save best valid
            # `result` only exists on print_fq epochs; the first condition
            # short-circuits so it is not read otherwise.
            if ((epoch % args.print_fq == 0) and \
                    (result[args.valid_metric] > stats['best_valid'])):
                logger.print('Best valid: %s = %.2f (epoch %d, %d updates)' %
                             (args.valid_metric, result[args.valid_metric],
                              stats['epoch'], model.updates))
                stats['best_valid'] = result[args.valid_metric]
                stats['no_improvement'] = 0
            else:
                stats['no_improvement'] += 1
                if stats['no_improvement'] >= args.early_stop:
                    break
def validate_official(
    args,
    data_loader,
    model,
    global_stats,
    logger,
    mode='dev',
):
    """Run one full validation.

    Aggregates per-batch bug-localization and fix predictions over the whole
    split, then reports accuracies separately for buggy and non-buggy samples.

    :param args: experiment configuration namespace
    :param data_loader: iterable over evaluation batches
    :param model: model wrapper exposing ``predict``
    :param global_stats: dict with at least 'epoch' (used for logging)
    :param logger: experiment logger
    :param mode: label used in log lines ('dev' or 'test')
    :return: dict of accuracy metrics plus evaluation time
    """
    eval_time = Timer()
    # Run through examples
    # Accumulators grow batch-by-batch via np.hstack below.
    global_pred_loc, global_target_loc, is_buggy, global_target_probs, \
        global_correct_fix = None, None, None, None, None
    with torch.no_grad():
        if args.use_tqdm:
            pbar = tqdm(data_loader)
        else:
            pbar = data_loader
        for idx, ex in enumerate(pbar):
            batch_size = ex['batch_size']
            logits_loc, logits_fix = model.predict(ex)
            # The -1 shift suggests index 0 is a reserved "no bug" position —
            # TODO confirm against the model's output layout.
            pred_loc = np.argmax(logits_loc.cpu().numpy(), axis=1) - 1
            # NOTE(review): pred_fix is taken from the raw logits, before the
            # scope mask below is applied — confirm out-of-scope positions
            # cannot win this argmax.
            pred_fix = np.argmax(logits_fix.cpu().numpy(), axis=1)

            scope_mask = ex["scope_t"]  # batch x seq_len
            # Exclude out-of-scope tokens before normalizing to probabilities.
            logits_fix = logits_fix.masked_fill(~scope_mask, -1e18)
            pointer_probs = F.softmax(logits_fix, dim=1)  # batch x seq_len
            target_mask = ex["fixes_t"]  # batch x seq_len
            # Total probability mass on any acceptable fix position.
            target_probs = (target_mask * pointer_probs).sum(dim=-1)  # batch

            target_fix = ex["target_fix"].cpu().numpy()
            correct_fix = target_fix[np.arange(target_fix.shape[0]), pred_fix]

            if global_pred_loc is None:
                # First batch initializes the accumulators.
                global_pred_loc = pred_loc
                global_target_loc = ex["target_pos"].cpu().numpy()
                global_correct_fix = correct_fix
                is_buggy = ex["mask_incorrect"].cpu().numpy()
                global_target_probs = target_probs.cpu().numpy()
            else:
                global_pred_loc = np.hstack((global_pred_loc, pred_loc))
                global_target_loc = np.hstack((global_target_loc,\
                    ex["target_pos"].cpu().numpy()))
                global_correct_fix = np.hstack(
                    (global_correct_fix, correct_fix))
                is_buggy = np.hstack(
                    (is_buggy, ex["mask_incorrect"].cpu().numpy()))
                global_target_probs = np.hstack((global_target_probs, \
                    target_probs.cpu().numpy()))

    # Store two metrics: the accuracy at predicting specifically the non-buggy
    # samples correctly (to measure false alarm rate), and the accuracy at
    # detecting the real bugs.
    loc_correct = (global_pred_loc == global_target_loc)
    # The 1e-9 terms guard against division by zero on empty subsets.
    no_bug_pred_acc = ((1 - is_buggy) * loc_correct).sum() / (
        1e-9 + (1 - is_buggy).sum()) * 100
    bug_loc_acc = (is_buggy * loc_correct).sum() / (1e-9 +
                                                    (is_buggy).sum()) * 100

    # Version by Hellendoorn et al:
    # To simplify the comparison, accuracy is computed as achieving >= 50%
    # probability for the top guess (as opposed to the slightly more accurate,
    # but hard to compute quickly, greatest probability among distinct
    # variable names).
    fix_correct = (global_target_probs >= 0.5)
    target_fix_acc = (is_buggy * fix_correct).sum() / (1e-9 +
                                                       (is_buggy).sum()) * 100
    joint_acc_bug = (is_buggy * loc_correct *
                     fix_correct).sum() / (1e-9 + (is_buggy).sum()) * 100

    result = dict()
    result['no_bug_pred_acc'] = no_bug_pred_acc
    result['bug_loc_acc'] = bug_loc_acc
    result['bug_fix_acc'] = target_fix_acc
    result['joint_acc_bug'] = joint_acc_bug
    result["ev_time"] = eval_time.time()
    logger.add(global_stats['epoch'], **result)
    logger.print("%s valid official: " % mode +
                 "no_bug_pred_acc = %.2f | bug_loc_acc = %.2f " %
                 (no_bug_pred_acc, bug_loc_acc) +
                 "target_fix_acc = %.2f | joint_acc_bug = %.2f " %
                 (target_fix_acc, joint_acc_bug) +
                 'test time = %.2f (s)' % eval_time.time())
    gc.collect()
    return result
# Command-line entry point: parse args, configure CUDA/logging, then run.
parser = argparse.ArgumentParser(
    'Code to Natural Language Generation',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
add_train_args(parser)
config.add_model_args(parser)
args, unknown = parser.parse_known_args()
# These two features are force-disabled regardless of the CLI flags.
args.use_tgt_word = False
args.tgt_pos_emb = False

# Set cuda
args.cuda = torch.cuda.is_available()
args.parallel = torch.cuda.device_count() > 1

# Set random state UNCOMMENT IF NEEDED
# np.random.seed(args.random_seed)
# torch.manual_seed(args.random_seed)
# if args.cuda:
#     torch.cuda.manual_seed(args.random_seed)

# Set logging
# NOTE(review): the `logger` module is shadowed here by the Logger instance;
# subsequent references to `logger` use the instance.
if args.only_test:
    # Evaluation runs log into an "eval/" folder next to the model file.
    path = args.model_file[:args.model_file.rfind("/") + 1] + "eval/"
    logger = logger.Logger("", fmt={}, base=args.dir, path=path)
else:
    logger = logger.Logger(args.comment, fmt={}, base=args.dir)
logger.print(" ".join(sys.argv))
logger.print(args)

set_defaults(args)
main(args, logger)
def convert(args):
    """Inject spreadsheet metadata into every .mscz file in the input folder.

    Reads ``metadata.xlsm`` from the parameters directory, matches each
    MuseScore file to a spreadsheet row via the ``mapping`` module, rewrites
    the embedded .mscx XML with the row's metadata/text fields, and re-zips
    the result into the output directory.
    """
    tmp_directory = os.path.abspath(os.path.join(args.output_directory,
                                                 'temp'))
    for path in [
            args.output_directory, tmp_directory, args.parameters_directory
    ]:
        if not os.path.exists(path):
            os.makedirs(path)

    # logger_output_filepath = os.path.abspath(os.path.join(tmp_directory, 'logs.txt'))
    # logger.config(logger_output_filepath)

    xlsx_filepath = os.path.abspath(
        os.path.join(args.parameters_directory, 'metadata.xlsm'))
    xlsx = pd.read_excel(xlsx_filepath, header=None)
    # Row 0 holds metadata tag names, row 1 holds text-field names; build a
    # two-way index map for each.
    columnIndexToTagName = {
        columnIndex: tagName
        for columnIndex, tagName in xlsx.iloc[0].items()
        if isinstance(tagName, str)
    }
    tagNameToColumnIndex = {
        value: key
        for key, value in columnIndexToTagName.items()
    }
    columnIndexToTextFieldName = {
        columnIndex: tagName
        for columnIndex, tagName in xlsx.iloc[1].items()
        if isinstance(tagName, str)
    }
    textFieldNameToColumnIndex = {
        value: key
        for key, value in columnIndexToTextFieldName.items()
    }
    # Data rows start at row 4 — presumably rows 2-3 are extra header rows;
    # TODO confirm against the spreadsheet layout.
    data = xlsx[4:]

    mscz_filenames = [
        filename for filename in os.listdir(args.input_directory)
        if os.path.splitext(filename)[-1].lower() == '.mscz'
    ]

    print('Processing files from directory "{}" into directory "{}".'.format(
        args.input_directory, args.output_directory))

    # TODO: load mapping from file
    mapping.load(args)
    # TODO: for each file, check if already mapped. Otherwise compute mapping
    for filename in mscz_filenames:
        mapping.checkFile(filename, data, tagNameToColumnIndex)
    # TODO: persist mapping
    mapping.persist(args)

    # TODO: process files
    for filename in mscz_filenames:
        print('')
        print('Processing file "{}" ...'.format(filename))
        inputMsczPath = os.path.join(args.input_directory, filename)
        outputMsczPath = os.path.join(args.output_directory,
                                      os.path.basename(filename))
        # rowId, rowData = findBestMatchingRow(filename, data, tagNameToColumnIndex)
        fileData = mapping.checkFile(filename, data, tagNameToColumnIndex)
        rowData = fileData[mapping.ROW_DATA]

        filename_extensionless = os.path.splitext(filename)[0]
        tmp_unzipped_path = os.path.join(tmp_directory, filename_extensionless)
        # https://stackoverflow.com/questions/1807063/extract-files-with-invalid-characters-in-filename-with-python
        with zipfile.ZipFile(inputMsczPath, "r") as zip_ref:
            zip_ref.extractall(tmp_unzipped_path)
        mscx_filenames = [
            filename for filename in os.listdir(tmp_unzipped_path)
            if os.path.splitext(filename)[-1].lower() == '.mscx'
        ]
        original_mscx_filename = mscx_filenames[0]
        # Rename the inner score to an ASCII-safe name before parsing.
        # ascii_mscx_filename = trans.trans('{}.mscx'.format(filename_extensionless))
        ascii_mscx_filename = 'file.mscx'
        original_tmp_mscx_path = os.path.join(tmp_unzipped_path,
                                              original_mscx_filename)
        safe_tmp_mscx_path = os.path.join(tmp_unzipped_path,
                                          ascii_mscx_filename)
        os.rename(original_tmp_mscx_path, safe_tmp_mscx_path)

        print('Working on temporary {} file...'.format(safe_tmp_mscx_path))
        xmlTree = ET.parse(safe_tmp_mscx_path)
        injectMetadata(xmlTree, rowData, tagNameToColumnIndex)
        injectTextField(xmlTree, rowData, textFieldNameToColumnIndex)
        xmlTree.write(safe_tmp_mscx_path)

        # update container metadata to take the safe filename into account
        metadata_file_path = os.path.join(tmp_unzipped_path,
                                          'META-INF/container.xml')
        metadataTree = ET.parse(metadata_file_path)
        rootfileTag = safe_child(metadataTree.getroot(), 'rootfile')
        rootfileTag.set('full-path', ascii_mscx_filename)
        # NOTE(review): metadataTree is modified but never written back to
        # disk here — confirm whether container.xml should be saved before
        # re-zipping.
        # create the new mscz archive
        shutil.make_archive(outputMsczPath, 'zip', tmp_unzipped_path)
        # make_archive appends '.zip'; rename to restore the .mscz name.
        os.rename('{}.zip'.format(outputMsczPath), outputMsczPath)
        pass
    print('Done')
# (fragment) Tail of a parser.add_argument(...) call whose opening line lies
# outside this chunk.
    '--do_not_auto_upgrade',
    # NOTE(review): argparse `type=bool` is a known pitfall — bool('False')
    # is True, so any non-empty value enables the flag; confirm intended use.
    type=bool,
    default=False,
    help='Will download all the requirements, upgrade pip, ...')
args = parser.parse_args()
args.input_directory = os.path.abspath(args.input_directory)
args.output_directory = os.path.abspath(args.output_directory)
# A previous run's output directory is discarded wholesale.
# NOTE(review): `shutil` is used here before the deferred import below —
# confirm it is imported earlier in the file.
if os.path.exists(args.output_directory):
    shutil.rmtree(args.output_directory)

if not args.do_not_auto_upgrade:
    import upgrade
    try:
        upgrade.checkRequirements()
    except Exception as e:
        # Best-effort install: continue and hope the packages already exist.
        print('Did not install Python packages, '
              'but maybe are they already available...\n{}\n{}'.format(
                  e, str(e)))

# Imports deferred until after the optional auto-upgrade so freshly installed
# packages are picked up.
import zipfile
import shutil
import pandas as pd
from lxml import etree as ET
from toxml import findBestMatchingRow, injectMetadata, \
    injectTextField, safe_child
import mapping
# import trans


def convert(args):
def main(args, logger):
    """Top-level driver for code summarization: load data, build or restore
    the model, then either evaluate (``--only_test``) or train with periodic
    validation, best-model tracking, and early stopping.

    :param args: fully-resolved experiment configuration namespace
    :param logger: experiment logger used for console + file output
    """
    # --------------------------------------------------------------------------
    # DATA
    logger.print('-' * 100)
    logger.print('Load and process data files')

    train_exs = []
    if not args.only_test:
        args.dataset_weights = dict()
        # One (src, src_tag, tgt, rel_matrix) file tuple per dataset.
        for train_src, train_src_tag, train_tgt, train_rel_matrix, dataset_name in \
                zip(args.train_src_files, args.train_src_tag_files,
                    args.train_tgt_files, args.train_rel_matrix_files,
                    args.dataset_name):
            train_files = dict()
            train_files['src'] = train_src
            train_files['src_tag'] = train_src_tag
            train_files['tgt'] = train_tgt
            train_files["rel_matrix"] = train_rel_matrix
            exs = util.load_data(args,
                                 train_files,
                                 max_examples=args.max_examples,
                                 dataset_name=dataset_name)
            lang_name = constants.DATA_LANG_MAP[dataset_name]
            args.dataset_weights[constants.LANG_ID_MAP[lang_name]] = len(exs)
            train_exs.extend(exs)

        logger.print('Num train examples = %d' % len(train_exs))
        args.num_train_examples = len(train_exs)
        # Convert per-language example counts into normalized weights.
        for lang_id in args.dataset_weights.keys():
            weight = (1.0 * args.dataset_weights[lang_id]) / len(train_exs)
            args.dataset_weights[lang_id] = round(weight, 2)
        logger.print('Dataset weights = %s' % str(args.dataset_weights))

    dev_exs = []
    for dev_src, dev_src_tag, dev_tgt, dev_rel_matrix, dataset_name in \
            zip(args.dev_src_files, args.dev_src_tag_files,
                args.dev_tgt_files, args.dev_rel_matrix_files,
                args.dataset_name):
        dev_files = dict()
        dev_files['src'] = dev_src
        dev_files['src_tag'] = dev_src_tag
        dev_files['tgt'] = dev_tgt
        dev_files["rel_matrix"] = dev_rel_matrix
        exs = util.load_data(args,
                             dev_files,
                             max_examples=args.max_examples,
                             dataset_name=dataset_name,
                             test_split=True)
        dev_exs.extend(exs)
    logger.print('Num dev examples = %d' % len(dev_exs))

    # --------------------------------------------------------------------------
    # MODEL
    logger.print('-' * 100)
    start_epoch = 1
    if args.only_test:
        #if args.pretrained:
        #    model = Code2NaturalLanguage.load(args.pretrained)
        #else:
        if not os.path.isfile(args.model_file):
            raise IOError('No such file: %s' % args.model_file)
        model = Code2NaturalLanguage.load(args.model_file)
    else:
        if args.checkpoint and os.path.isfile(args.model_file +
                                              '.checkpoint'):
            # Just resume training, no modifications.
            logger.print('Found a checkpoint...')
            checkpoint_file = args.model_file + '.checkpoint'
            model, start_epoch = Code2NaturalLanguage.load_checkpoint(
                checkpoint_file, args.cuda)
        else:
            # Training starts fresh. But the model state is either pretrained or
            # newly (randomly) initialized.
            if args.pretrained:
                logger.print('Using pretrained model...')
                model = Code2NaturalLanguage.load(args.pretrained, args)
            else:
                logger.print('Training model from scratch...')
                model = init_from_scratch(args, train_exs, dev_exs, logger)

        # Set up optimizer
        model.init_optimizer()
        # log the parameter details
        logger.print(
            'Trainable #parameters [encoder-decoder] {} [total] {}'.format(
                human_format(model.network.count_encoder_parameters() +
                             model.network.count_decoder_parameters()),
                human_format(model.network.count_parameters())))
        table = model.network.layer_wise_parameters()
        logger.print('Breakdown of the trainable paramters\n%s' % table)

    # Use the GPU?
    if args.cuda:
        model.cuda()

    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.print('-' * 100)
    logger.print('Make data loaders')

    if not args.only_test:
        train_dataset = data.CommentDataset(train_exs, model)
        if args.sort_by_len:
            # Batching similar-length sequences reduces padding work.
            train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                    args.batch_size,
                                                    shuffle=True)
        else:
            train_sampler = torch.utils.data.sampler.RandomSampler(
                train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=train_sampler,
            num_workers=args.data_workers,
            collate_fn=vector.batchify,
            pin_memory=args.cuda,
            drop_last=args.parallel)

    dev_dataset = data.CommentDataset(dev_exs, model)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
        drop_last=args.parallel)

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.print('-' * 100)
    logger.print('CONFIG:\n%s' %
                 json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # DO TEST
    if args.only_test:
        stats = {
            'timer': Timer(),
            'epoch': 100000,
            'best_valid': 0,
            'no_improvement': 0
        }
        validate_official(args, dev_loader, model, stats, logger, mode='test')

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    else:
        logger.print('-' * 100)
        logger.print('Starting training...')
        stats = {
            'timer': Timer(),
            'epoch': start_epoch,
            'best_valid': 0,
            'no_improvement': 0
        }
        if args.optimizer in ['sgd', 'adam'
                              ] and args.warmup_epochs >= start_epoch:
            logger.print(
                "Use warmup lrate for the %d epoch, from 0 up to %s." %
                (args.warmup_epochs, args.learning_rate))
            num_batches = len(train_loader.dataset) // args.batch_size
            # Per-update lr increment so lr reaches args.learning_rate after
            # warmup_epochs full epochs.
            warmup_factor = (args.learning_rate + 0.) / (num_batches *
                                                         args.warmup_epochs)
            stats['warmup_factor'] = warmup_factor

        for epoch in range(start_epoch, args.num_epochs + 1):
            stats['epoch'] = epoch
            # Exponential lr decay once past the warmup phase.
            if args.optimizer in ['sgd', 'adam'
                                  ] and epoch > args.warmup_epochs:
                model.optimizer.param_groups[0]['lr'] = \
                    model.optimizer.param_groups[0]['lr'] * args.lr_decay

            train(args, train_loader, model, stats, logger)
            if epoch % args.print_fq == 0:
                result = validate_official(args, dev_loader, model, stats,
                                           logger)
                logger.save(silent=True)

            # Save best valid
            # `result` only exists on print_fq epochs; the first condition
            # short-circuits so it is not read otherwise.
            if ((epoch % args.print_fq == 0) and \
                    (result[args.valid_metric] > stats['best_valid'])):
                logger.print('Best valid: %s = %.2f (epoch %d, %d updates)' %
                             (args.valid_metric, result[args.valid_metric],
                              stats['epoch'], model.updates))
                model.save(logger.path + '/best_model.cpt')
                stats['best_valid'] = result[args.valid_metric]
                stats['no_improvement'] = 0
            else:
                stats['no_improvement'] += 1
                if stats['no_improvement'] >= args.early_stop:
                    break
def validate_official(
    args,
    data_loader,
    model,
    global_stats,
    logger,
    mode='dev',
):
    """Run one full official validation.

    Decodes a summary for every example, then scores the hypotheses against
    the references with BLEU / ROUGE-L / METEOR / precision / recall / F1.

    :param args: experiment configuration namespace
    :param data_loader: iterable over evaluation batches
    :param model: model wrapper exposing ``predict``
    :param global_stats: dict with at least 'epoch' (used for logging)
    :param logger: experiment logger
    :param mode: 'dev' or 'test'; only changes how results are logged
    :return: dict of evaluation metrics
    """
    eval_time = Timer()
    # Run through examples
    examples = 0
    sources, hypotheses, references, copy_dict = dict(), dict(), dict(), dict()
    with torch.no_grad():
        pbar = tqdm(data_loader)
        for idx, ex in enumerate(pbar):
            batch_size = ex['batch_size']
            # Globally unique example ids, assuming every batch has the same
            # size — TODO confirm a smaller final batch cannot collide.
            ex_ids = list(
                range(idx * batch_size, (idx * batch_size) + batch_size))
            predictions, targets, copy_info = model.predict(ex,
                                                            replace_unk=True)

            src_sequences = [code for code in ex['code_text']]
            examples += batch_size
            for key, src, pred, tgt in zip(ex_ids, src_sequences, predictions,
                                           targets):
                hypotheses[key] = [pred]
                # References may be a single string or a list of strings.
                references[key] = tgt if isinstance(tgt, list) else [tgt]
                sources[key] = src

            if copy_info is not None:
                copy_info = copy_info.cpu().numpy().astype(int).tolist()
                for key, cp in zip(ex_ids, copy_info):
                    copy_dict[key] = cp

            pbar.set_description("%s" % 'Epoch = %d [validating ... ]' %
                                 global_stats['epoch'])
            #break

    # An empty copy_dict means the model produced no copy information.
    copy_dict = None if len(copy_dict) == 0 else copy_dict
    bleu, rouge_l, meteor, precision, recall, f1 = eval_accuracies(
        hypotheses,
        references,
        copy_dict,
        sources=sources,
        filename=logger.path + "/preds.json" if args.save_pred else None,
        print_copy_info=args.print_copy_info,
        mode=mode)
    result = dict()
    result['bleu'] = bleu
    result['rouge_l'] = rouge_l
    result['meteor'] = meteor
    result['precision'] = precision
    result['recall'] = recall
    result['f1'] = f1
    result["ev_time"] = eval_time.time()
    result["examples"] = examples
    logger.add(global_stats['epoch'], **result)

    if mode == 'test':
        logger.print('test valid official: '
                     'bleu = %.2f | rouge_l = %.2f | meteor = %.2f | ' %
                     (bleu, rouge_l, meteor) +
                     'Precision = %.2f | Recall = %.2f | F1 = %.2f | '
                     'examples = %d | ' % (precision, recall, f1, examples) +
                     'test time = %.2f (s)' % eval_time.time())
    else:
        logger.print(
            'dev valid official: Epoch = %d | ' % (global_stats['epoch']) +
            'bleu = %.2f | rouge_l = %.2f | meteor = %.2f | '
            'Precision = %.2f | Recall = %.2f | F1 = %.2f | examples = %d | '
            % (bleu, rouge_l, meteor, precision, recall, f1, examples) +
            'valid time = %.2f (s)' % eval_time.time())

    return result
def set_defaults(args):
    """Make sure the commandline arguments are initialized properly.

    Resolves the per-dataset train/dev file lists into full paths
    (broadcasting a single entry across datasets), verifies the files exist,
    and normalizes the embedding-fixing flags.

    :param args: parsed argument namespace, modified in place
    :return: the same ``args`` namespace
    """
    # Check critical files exist
    if not args.only_test:
        args.train_src_files = []
        args.train_tgt_files = []
        args.train_src_tag_files = []
        args.train_rel_matrix_files = []

        num_dataset = len(args.dataset_name)
        if num_dataset > 1:
            # A single filename entry is broadcast to every dataset.
            if len(args.train_src) == 1:
                args.train_src = args.train_src * num_dataset
            if len(args.train_tgt) == 1:
                args.train_tgt = args.train_tgt * num_dataset
            if len(args.train_src_tag) == 1:
                args.train_src_tag = args.train_src_tag * num_dataset
            if len(args.train_rel_matrix) == 1:
                args.train_rel_matrix = args.train_rel_matrix * num_dataset

        for i in range(num_dataset):
            dataset_name = args.dataset_name[i]
            data_dir = os.path.join(args.data_dir, dataset_name)
            train_src = os.path.join(data_dir, args.train_src[i])
            train_tgt = os.path.join(data_dir, args.train_tgt[i])
            if not os.path.isfile(train_src):
                raise IOError('No such file: %s' % train_src)
            if not os.path.isfile(train_tgt):
                raise IOError('No such file: %s' % train_tgt)
            if args.use_code_type:
                train_src_tag = os.path.join(data_dir, args.train_src_tag[i])
                if not os.path.isfile(train_src_tag):
                    raise IOError('No such file: %s' % train_src_tag)
            else:
                train_src_tag = None
            if args.use_tree_relative_attn:
                train_rel_matrix = os.path.join(data_dir,
                                                args.train_rel_matrix[i])
                if not os.path.isfile(train_rel_matrix):
                    raise IOError('No such file: %s' % train_rel_matrix)
            else:
                train_rel_matrix = None

            args.train_src_files.append(train_src)
            args.train_tgt_files.append(train_tgt)
            args.train_src_tag_files.append(train_src_tag)
            args.train_rel_matrix_files.append(train_rel_matrix)

    # Same broadcast/resolve/validate sequence for the dev split; unlike the
    # train block above, this always runs.
    args.dev_src_files = []
    args.dev_tgt_files = []
    args.dev_src_tag_files = []
    args.dev_rel_matrix_files = []
    num_dataset = len(args.dataset_name)
    if num_dataset > 1:
        if len(args.dev_src) == 1:
            args.dev_src = args.dev_src * num_dataset
        if len(args.dev_tgt) == 1:
            args.dev_tgt = args.dev_tgt * num_dataset
        if len(args.dev_src_tag) == 1:
            args.dev_src_tag = args.dev_src_tag * num_dataset
        if len(args.dev_rel_matrix) == 1:
            args.dev_rel_matrix = args.dev_rel_matrix * num_dataset

    for i in range(num_dataset):
        dataset_name = args.dataset_name[i]
        data_dir = os.path.join(args.data_dir, dataset_name)
        dev_src = os.path.join(data_dir, args.dev_src[i])
        dev_tgt = os.path.join(data_dir, args.dev_tgt[i])
        if not os.path.isfile(dev_src):
            raise IOError('No such file: %s' % dev_src)
        if not os.path.isfile(dev_tgt):
            raise IOError('No such file: %s' % dev_tgt)
        if args.use_code_type:
            dev_src_tag = os.path.join(data_dir, args.dev_src_tag[i])
            if not os.path.isfile(dev_src_tag):
                raise IOError('No such file: %s' % dev_src_tag)
        else:
            dev_src_tag = None
        if args.use_tree_relative_attn:
            dev_rel_matrix = os.path.join(data_dir, args.dev_rel_matrix[i])
            if not os.path.isfile(dev_rel_matrix):
                raise IOError('No such file: %s' % dev_rel_matrix)
        else:
            dev_rel_matrix = None

        args.dev_src_files.append(dev_src)
        args.dev_tgt_files.append(dev_tgt)
        args.dev_src_tag_files.append(dev_src_tag)
        args.dev_rel_matrix_files.append(dev_rel_matrix)

    # Set model directory
    #subprocess.call(['mkdir', '-p', args.model_dir])

    # Set model name
    #if not args.model_name:
    #    import uuid
    #    import time
    #    args.model_name = time.strftime("%Y%m%d-") + str(uuid.uuid4())[:8]

    # Set log + model file names
    #suffix = '_test' if args.only_test else ''
    #args.model_file = os.path.join(args.model_dir, args.model_name + '.mdl')
    #args.log_file = os.path.join(args.model_dir, args.model_name + suffix + '.txt')
    #if args.save_pred:
    #    args.pred_file = os.path.join(args.model_dir, args.model_name + suffix + '.json')
    #if args.pretrained:
    #    args.pretrained = os.path.join(args.model_dir, args.pretrained + '.mdl')

    if args.use_src_word or args.use_tgt_word:
        # Make sure fix_embeddings and pretrained are consistent
        if args.fix_embeddings and not args.pretrained:
            # NOTE(review): `logger` here refers to a module/global object —
            # confirm it is in scope when set_defaults() is called.
            logger.print('WARN: fix_embeddings set to False '
                         'as embeddings are random.')
            args.fix_embeddings = False
    else:
        args.fix_embeddings = False

    return args