def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
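# A minimal sketch of the device-agnostic checkpoint-loading idiom used in
# main() above: the identity map_location deserializes every tensor onto CPU,
# so a GPU-trained checkpoint can be resumed on any machine. The helper name
# below is illustrative, not part of this codebase.
import torch


def load_checkpoint_cpu(path):
    # 'storage' is returned unchanged, i.e. the tensor stays on CPU.
    return torch.load(path, map_location=lambda storage, loc: storage)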
def main(opt):
    opt = training_opt_postprocessing(opt)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        print('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
        # I don't like reassigning attributes of opt: it's not clear.
        opt.start_epoch = checkpoint['epoch'] + 1
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    _collect_report_features(fields)

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    _tally_parameters(model)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, model, fields, optim, data_type,
                            model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps, opt.save_checkpoint_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
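# The comment above flags in-place mutation of opt as unclear. A hedged
# alternative (a sketch, not this project's API) is to deep-copy the
# namespace before resuming, leaving the caller's opt untouched.
# All names below are illustrative.
import copy
from argparse import Namespace

opt_example = Namespace(start_epoch=0)
checkpoint_example = {'epoch': 4}
resumed_opt = copy.deepcopy(opt_example)
resumed_opt.start_epoch = checkpoint_example['epoch'] + 1
# opt_example.start_epoch is still 0 here.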
                      opt.stack_depth, opt.stack_size,
                      opt.stack_elem_size, opt.dropout)
    model = PhraseSim(encoder, opt.dropout).to(device)
    init_model(opt, model)
    # print(model.state_dict())

    if opt.load_idx != -1:
        basename = "{}-epoch-{}".format(opt.exp, opt.load_idx)
        model_fname = basename + ".model"
        location = {'cuda:' + str(opt.gpu): 'cuda:' + str(opt.gpu)} \
            if opt.gpu != -1 else 'cpu'
        model_dict = torch.load(model_fname, map_location=location)
        model.load_state_dict(model_dict)
        print("Loading model from '%s'" % model_fname)

    # model.generator = generator.to(device)
    optim = optimizers.build_optim(model, opt, None)

    weight = torch.Tensor([cweights['wneg'], cweights['wpos']]).to(device)
    criterion_ps = nn.CrossEntropyLoss(weight=weight)
    criterion_lm = nn.CrossEntropyLoss(
        ignore_index=TEXT.vocab.stoi[iters.PAD_WORD])
    criterion = {'ps': criterion_ps, 'lm': criterion_lm}

    epoch = {
        'start': opt.load_idx + 1 if opt.load_idx != -1 else 0,
        'end': opt.nepoch,
        'save_per': opt.save_per
    }

    train(train_iter, valid_iter, epoch, model, optim, criterion, opt)
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    logger.info('* batch_size: %d' % opt.batch_size)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    def data_iter_fct(data_stage):
        """data_stage: train / valid"""
        pt_file = opt.data + '.' + data_stage + '.pt'
        logger.info('Loading {} dataset'.format(data_stage))
        dataset = torch.load(pt_file)
        logger.info('Loaded {} dataset'.format(data_stage))
        dataset.fields = fields

        is_train = data_stage == "train"
        batch_size = opt.batch_size if is_train else opt.valid_batch_size
        repeat = data_stage == "train"
        if opt.gpuid != -1:
            device = "cuda"
        else:
            device = "cpu"

        def sort_key(ex):
            """Sort using the total number of tokens in the example."""
            return ex.total_tokens

        return torchtext.data.Iterator(dataset=dataset,
                                       batch_size=batch_size,
                                       device=device,
                                       train=is_train,
                                       sort=False,
                                       sort_key=sort_key,
                                       repeat=repeat)

    # Do training.
    trainer.train(data_iter_fct, opt.train_steps, opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opt values, then overwrite them with the opts from
        # the checkpoint. This is useful for re-training a model after
        # adding a new option (not set in the checkpoint).
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = load_fields(first_dataset, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    # opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        raise Exception('You need to load a model')

    logger.info('Loading data from %s' % opt.data)
    dataset = next(lazily_load_dataset("train", opt))
    data_type = dataset.data_type
    logger.info('Data type %s' % data_type)

    # Load fields generated from preprocess phase.
    fields = _load_fields(dataset, data_type, opt, checkpoint)

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    dataset_iter = build_dataset_iter(lazily_load_dataset("train", opt),
                                      fields, opt)

    out_file = codecs.open(opt.output, 'w+', 'utf-8')
    scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                             opt.coverage_penalty,
                                             opt.length_penalty)
    translation_builder = TranslationBuilder(dataset, fields,
                                             n_best=opt.n_best,
                                             replace_unk=opt.replace_unk,
                                             has_tgt=False)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    translator = Translator(trainer.model, fields, opt.beam_size,
                            global_scorer=scorer, out_file=out_file,
                            report_score=False,
                            copy_attn=model_opt.copy_attn, logger=logger)

    for i, batch in enumerate(dataset_iter):
        unprocessed_translations = translator.translate_batch(batch, dataset)
        translations = translation_builder.from_batch(
            unprocessed_translations)
        print("Translations: ", ' '.join(translations[0].pred_sents[0]))
        trainer.train_from_data(batch, train_steps=1)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)

    # Gather information related to the training script and commit version.
    script_path = os.path.abspath(__file__)
    script_dir = os.path.dirname(os.path.dirname(script_path))
    logger.info('Train script dir: %s' % script_dir)
    git_commit = str(subprocess.check_output(
        ['bash', script_dir + '/cluster_scripts/git_version.sh']))
    logger.info("Git Commit: %s" % git_commit[2:-3])

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        # TODO: load MTL model
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opt values, then overwrite them with the opts from
        # the checkpoint. This is useful for re-training a model after
        # adding a new option (not set in the checkpoint).
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
    else:
        checkpoint = None
        model_opt = opt

    num_tasks = len(opt.data.split(','))
    opt.num_tasks = num_tasks

    checkpoint_list = []
    if opt.warm_model:
        base_name = opt.warm_model
        for task_id in range(num_tasks):
            chkpt_path = base_name.replace("X", str(task_id))
            if not os.path.isfile(chkpt_path):
                chkpt_path = base_name.replace("X", str(0))
            logger.info('Loading a checkpoint from %s' % chkpt_path)
            checkpoint = torch.load(chkpt_path,
                                    map_location=lambda storage, loc: storage)
            checkpoint_list.append(checkpoint)
    else:
        for task_id in range(num_tasks):
            checkpoint_list.append(None)

    fields_list = []
    data_type = None
    for task_id in range(num_tasks):
        # Peek the first dataset to determine the data_type.
        # (All datasets have the same data_type).
        first_dataset = next(lazily_load_dataset("train", opt,
                                                 task_id=task_id))
        data_type = first_dataset.data_type

        # Load fields generated from preprocess phase.
        if opt.mtl_shared_vocab and task_id > 0:
            logger.info(' * vocabulary size. Same as the main task!')
            fields = fields_list[0]
        else:
            fields = load_fields(first_dataset, opt,
                                 checkpoint_list[task_id], task_id=task_id)

        # Report src/tgt features.
        src_features, tgt_features = _collect_report_features(fields)
        for j, feat in enumerate(src_features):
            logger.info(' * (Task %d) src feature %d size = %d'
                        % (task_id, j, len(fields[feat].vocab)))
        for j, feat in enumerate(tgt_features):
            logger.info(' * (Task %d) tgt feature %d size = %d'
                        % (task_id, j, len(fields[feat].vocab)))

        fields_list.append(fields)

    if opt.epochs > -1:
        total_num_batch = 0
        for task_id in range(num_tasks):
            train_iter = build_dataset_iter(
                lazily_load_dataset("train", opt, task_id=task_id),
                fields_list[task_id], opt)
            for i, batch in enumerate(train_iter):
                num_batch = i
            total_num_batch += num_batch
            if opt.mtl_schedule < 10:
                break
        num_batch = total_num_batch
        opt.train_steps = (num_batch * opt.epochs) + 1
        # Do the validation and save after each epoch.
        opt.valid_steps = num_batch
        opt.save_checkpoint_steps = 1

    # logger.info(opt_to_string(opt))
    logger.info(opt)

    # Build model(s).
    models_list = []
    for task_id in range(num_tasks):
        if opt.mtl_fully_share and task_id > 0:
            # Since we only have one model, copy the pointer to it for all.
            models_list.append(models_list[0])
        else:
            main_model = models_list[0] if task_id > 0 else None
            model = build_model(model_opt, opt, fields_list[task_id],
                                checkpoint_list[task_id],
                                main_model=main_model, task_id=task_id)
            n_params, enc, dec = _tally_parameters(model)
            logger.info('(Task %d) encoder: %d' % (task_id, enc))
            logger.info('(Task %d) decoder: %d' % (task_id, dec))
            logger.info('* number of parameters: %d' % n_params)
            _check_save_model_path(opt)
            models_list.append(model)

    # Combine parameters of different models, considering shared parameters
    # just once.
    def combine_named_parameters(named_params_list):
        observed_params = []
        for model_named_params in named_params_list:
            for name, p in model_named_params:
                is_observed = False
                # Check whether we observed this parameter before.
                for param in observed_params:
                    if p is param:
                        is_observed = True
                        break
                if not is_observed:
                    observed_params.append(p)
                    yield name, p

    # Build optimizer.
    optims_list = []
    all_models_params = []
    for task_id in range(num_tasks):
        if not opt.mtl_shared_optimizer:
            optim = build_optim(models_list[task_id], opt, checkpoint)
            optims_list.append(optim)
        else:
            all_models_params.append(models_list[task_id].named_parameters())

    # Extract the list of shared parameters among the models of all tasks.
    observed_params = []
    shared_params = []
    for task_id in range(num_tasks):
        for name, p in models_list[task_id].named_parameters():
            is_observed = False
            # Check whether we observed this parameter before.
            for param in observed_params:
                if p is param:
                    shared_params.append(name)
                    is_observed = True
                    break
            if not is_observed:
                observed_params.append(p)
    opt.shared_params = shared_params

    if opt.mtl_shared_optimizer:
        optim = build_optim_mtl_params(
            combine_named_parameters(all_models_params), opt, checkpoint)
        optims_list.append(optim)

    # Build model saver.
    model_saver = build_mtl_model_saver(model_opt, opt, models_list,
                                        fields_list, optims_list)

    trainer = build_trainer(opt, device_id, models_list, fields_list,
                            optims_list, data_type, model_saver=model_saver)

    def train_iter_fct(task_id):
        return build_dataset_iter(
            lazily_load_dataset("train", opt, task_id=task_id),
            fields_list[task_id], opt)

    def valid_iter_fct(task_id):
        return build_dataset_iter(
            lazily_load_dataset("valid", opt, task_id=task_id),
            fields_list[task_id], opt)

    def meta_valid_iter_fct(task_id, is_log=False):
        return build_dataset_iter(
            lazily_load_dataset("meta_valid", opt, task_id=task_id,
                                is_log=is_log),
            fields_list[task_id], opt)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps, meta_valid_iter_fct=meta_valid_iter_fct)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
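# Illustration of the identity-based deduplication that
# combine_named_parameters() performs above: two models share one embedding,
# and the shared weight must reach the optimizer exactly once. All names
# below are illustrative.
import torch.nn as nn

shared_emb = nn.Embedding(100, 16)
model_a = nn.Sequential(shared_emb, nn.Linear(16, 4))
model_b = nn.Sequential(shared_emb, nn.Linear(16, 2))

seen, unique_params = [], []
for m in (model_a, model_b):
    for name, p in m.named_parameters():
        if not any(p is q for q in seen):  # 'is', not '==': identity check
            seen.append(p)
            unique_params.append((name, p))
# unique_params now contains the shared embedding weight exactly once.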
def main(opt, device_id):
    # TODO: delete all these lines related to WALS features
    # begin
    SimulationLanguages = [opt.wals_src, opt.wals_tgt]

    print('Loading WALS features from databases...')
    cwd = os.getcwd()

    db = sqlite3.connect(cwd + '/onmt/WalsValues.db')
    cursor = db.cursor()
    cursor.execute('SELECT * FROM WalsValues')
    WalsValues = cursor.fetchall()

    db = sqlite3.connect(cwd + '/onmt/FeaturesList.db')
    cursor = db.cursor()
    cursor.execute('SELECT * FROM FeaturesList')
    FeaturesList = cursor.fetchall()

    db = sqlite3.connect(cwd + '/onmt/FTInfos.db')
    cursor = db.cursor()
    cursor.execute('SELECT * FROM FTInfos')
    FTInfos = cursor.fetchall()

    db = sqlite3.connect(cwd + '/onmt/FTList.db')
    cursor = db.cursor()
    cursor.execute('SELECT * FROM FTList')
    FTList = cursor.fetchall()

    ListLanguages = []
    for i in WalsValues:
        ListLanguages.append(i[0])

    FeatureTypes = []
    for i in FTList:
        FeatureTypes.append((i[0], i[1].split(',')))

    FeatureNames = []
    for i in FeatureTypes:
        FeatureNames += i[1]

    FeatureTypesNames = []
    for i in FeatureTypes:
        FeatureTypesNames.append(i[0])

    FeatureValues, FeatureTensors = get_feat_values(
        SimulationLanguages, WalsValues, FeaturesList, ListLanguages,
        FeatureTypes, FeatureNames)

    print('WALS databases loaded!')
    # end
    # TODO: load WALS features from the command line (wals.npz)

    # FeatureValues: defaultdict with feature values, per language.
    # FeatureTensors: tensor of possible outputs, per feature.

    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    # TODO: remove all parameters related to WALS features: FeatureValues,
    # FeatureTensors, FeatureTypes, FeaturesList, FeatureNames, FTInfos,
    # FeatureTypesNames, SimulationLanguages
    # TODO: include four parameters related to WALS features: the four
    # numpy arrays, separately
    model = build_model(model_opt, opt, fields, checkpoint, FeatureValues,
                        FeatureTensors, FeatureTypes, FeaturesList,
                        FeatureNames, FTInfos, FeatureTypesNames,
                        SimulationLanguages)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opt values, then overwrite them with the opts from
        # the checkpoint. This is useful for re-training a model after
        # adding a new option (not set in the checkpoint).
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)

        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        checkpoint = None
        model_opt = opt
        vocab = torch.load(opt.data + '.vocab.pt')

    # Load a shard dataset to determine the data_type.
    # (All datasets have the same data_type).
    # This should be refactored out of existence reasonably soon.
    first_dataset = torch.load(glob.glob(opt.data + '.train*.pt')[0])
    data_type = first_dataset.data_type

    # Check for code where the vocab is saved instead of fields
    # (in the future this will be done in a smarter way).
    if old_style_vocab(vocab):
        fields = load_fields_from_vocab(vocab, data_type)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features.
    for side in ['src', 'tgt']:
        for name, f in fields[side]:
            if f.use_vocab:
                logger.info(' * %s vocab size = %d' % (name, len(f.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    # This line is kind of a temporary kludge because different objects
    # expect fields to have a different structure.
    dataset_fields = dict(chain.from_iterable(fields.values()))

    train_iter = build_dataset_iter("train", dataset_fields, opt)
    valid_iter = build_dataset_iter("valid", dataset_fields, opt,
                                    is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    trainer.train(train_iter, valid_iter, opt.train_steps, opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
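# A tiny sketch of the "temporary kludge" noted above: fields maps each side
# to a list of (name, field) pairs, and chain.from_iterable flattens it into
# one {name: field} dict. The placeholder values are illustrative.
from itertools import chain

fields_example = {'src': [('src', 'SRC_FIELD')],
                  'tgt': [('tgt', 'TGT_FIELD')]}
flat = dict(chain.from_iterable(fields_example.values()))
# flat == {'src': 'SRC_FIELD', 'tgt': 'TGT_FIELD'}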
def main(opt):
    opt = training_opt_postprocessing(opt)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    if opt.load_pretrained_selector_from:
        logger.info('Loading selector checkpoint from %s'
                    % opt.load_pretrained_selector_from)
        sel_checkpoint = torch.load(opt.load_pretrained_selector_from,
                                    map_location=lambda storage, loc: storage)
    else:
        sel_checkpoint = None

    if opt.load_pretrained_s2s_generator_from:
        logger.info('Loading s2s generator checkpoint from %s'
                    % opt.load_pretrained_s2s_generator_from)
        s2s_gen_checkpoint = torch.load(
            opt.load_pretrained_s2s_generator_from,
            map_location=lambda storage, loc: storage)
    else:
        s2s_gen_checkpoint = None

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint,
                        sel_checkpoint, s2s_gen_checkpoint)

    # Fix the pretrained selector parameters if needed.
    if model_opt.fix_sel_all:
        assert opt.load_pretrained_selector_from
        assert opt.sel_lambda == 0.0
        assert not model_opt.fix_sel_classifier
        for name, param in model.named_parameters():
            if 'selector' in name:
                param.requires_grad = False

    # Only fix the classifier of the selector.
    if model_opt.fix_sel_classifier:
        assert opt.load_pretrained_selector_from
        assert not model_opt.fix_sel_all
        for name, param in model.named_parameters():
            if ('selector' in name and 'rnn' not in name
                    and 'embeddings' not in name):
                param.requires_grad = False

    n_params, sel, enc, dec, gen = _my_tally_parameters(model)
    logger.info('selector: %d' % sel)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('generator: %d' % gen)
    logger.info('* number of parameters: %d' % n_params)
    print_trainable_parameters(model)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, model, fields, optim, data_type,
                            model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt)

    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)

    out_file = None
    best_test_score, best_ckpt = -10000, None
    dummy_parser = argparse.ArgumentParser(description='all_dev.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    for i in range(0, opt.train_epochs, 10):
        ckpt_path = '{}_epoch_{}.pt'.format(opt.save_model, i + 1)
        logger.info('Loading checkpoint from %s' % ckpt_path)
        checkpoint = torch.load(ckpt_path,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
        fields = load_fields_from_vocab(checkpoint['vocab'],
                                        data_type="text")

        # Build model.
        model = build_model(model_opt, opt, fields, checkpoint)

        assert opt.train_from == ''  # do not load optimizer state
        optim = build_optim(model, opt, checkpoint)

        # Build model saver; no need to create a task dir for dev.
        if not os.path.exists('experiments/all_dev'):
            os.mkdir('experiments/all_dev')
            os.mkdir('experiments/all_dev/' + opt.meta_dev_task)
        elif not os.path.exists('experiments/all_dev/' + opt.meta_dev_task):
            os.mkdir('experiments/all_dev/' + opt.meta_dev_task)
        model_saver = build_model_saver(
            model_opt, 'experiments/all_dev/' + opt.meta_dev_task + '/model',
            opt, model, fields, optim)

        trainer = build_trainer(opt, device_id, model, fields, optim,
                                "text", model_saver=model_saver)

        train_iter = list(build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt))

        # Do training on the trainset of the meta-dev task.
        trainer.train(train_iter, opt.inner_iterations)

        # Do evaluation on the devset of the meta-dev task.
        best_dev_score, best_model_path = -10000, None
        for model_path in os.listdir('experiments/all_dev/'
                                     + opt.meta_dev_task):
            if model_path.find('.pt') == -1:
                continue
            if out_file is None:
                out_file = codecs.open(opt.output, 'w+', 'utf-8')
            fields, model, model_opt = onmt.model_builder.load_test_model(
                opt, dummy_opt.__dict__,
                model_path='experiments/all_dev/' + opt.meta_dev_task
                + '/' + model_path)
            scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                                     opt.coverage_penalty,
                                                     opt.length_penalty)
            kwargs = {k: getattr(opt, k)
                      for k in ["beam_size", "n_best", "max_length",
                                "min_length", "stepwise_penalty",
                                "block_ngram_repeat",
                                "ignore_when_blocking", "dump_beam",
                                "report_bleu", "replace_unk", "gpu",
                                "verbose", "fast", "mask_from"]}
            fields['graph'] = torchtext.data.Field(sequential=False)
            translator = Translator(model, fields, global_scorer=scorer,
                                    out_file=out_file, report_score=False,
                                    copy_attn=model_opt.copy_attn,
                                    logger=logger,
                                    log_probs_out_file=None, **kwargs)

            # Make translations and save the result.
            all_scores, all_predictions = translator.translate(
                src_path='processed_data/meta-dev/' + opt.meta_dev_task
                + '/src-dev.txt',
                tgt_path=None, src_dir=None,
                batch_size=opt.translate_batch_size, attn_debug=False)

            # Dump predictions.
            f = open('experiments/all_dev/' + opt.meta_dev_task
                     + '/dev_predictions.csv', 'w', encoding='utf-8')
            f.write('smiles,property\n')
            for n_best_mols in all_predictions:
                for mol in n_best_mols:
                    f.write(mol.replace(' ', '') + ',0\n')
            f.close()

            # Call chemprop to get scores.
            test_path = '"experiments/all_dev/' + opt.meta_dev_task \
                + '/dev_predictions.csv"'
            checkpoint_path = '"scorer_ckpts/' + opt.meta_dev_task \
                + '/model.pt"'
            preds_path = '"experiments/all_dev/' + opt.meta_dev_task \
                + '/dev_scores.csv"'

            # In case all mols are invalid (chemprop will produce no output
            # file), the predictions are copied into the score file.
            cmd = 'cp {} {}'.format(test_path, preds_path)
            result = os.popen(cmd)
            result.close()

            cmd = ('python chemprop/predict.py --test_path {} '
                   '--checkpoint_path {} --preds_path {} '
                   '--num_workers 0').format(test_path, checkpoint_path,
                                             preds_path)
            scorer_result = os.popen(cmd)
            scorer_result.close()

            # Read the score file and get the score.
            score = read_score_csv('experiments/all_dev/'
                                   + opt.meta_dev_task + '/dev_scores.csv')
            assert len(score) % opt.beam_size == 0
            # dev_scores = []
            # for i in range(0, len(score), opt.beam_size):
            #     dev_scores.append(sum([x[1] for x in
            #         score[i:i + opt.beam_size]]) / opt.beam_size)

            # Report the dev score.
            dev_metrics = calculate_metrics(opt.meta_dev_task, 'dev', 'dev',
                                            score)
            logger.info('dev metrics: ' + str(dev_metrics))
            dev_score = dev_metrics['success_rate']
            if dev_score > best_dev_score:
                logger.info('New best dev success rate: {:.4f} by {}'.format(
                    dev_score, model_path))
                best_model_path = model_path
                best_dev_score = dev_score
            else:
                logger.info('dev success rate: {:.4f} by {}'.format(
                    dev_score, model_path))

            del fields
            del model
            del model_opt
            del scorer
            del translator
            gc.collect()

        assert best_model_path is not None

        # Do testing on the testset of the meta-dev task.
        if out_file is None:
            out_file = codecs.open(opt.output, 'w+', 'utf-8')
        fields, model, model_opt = onmt.model_builder.load_test_model(
            opt, dummy_opt.__dict__,
            model_path='experiments/all_dev/' + opt.meta_dev_task + '/'
            + best_model_path)
        scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta,
                                                 opt.coverage_penalty,
                                                 opt.length_penalty)
        kwargs = {k: getattr(opt, k)
                  for k in ["beam_size", "n_best", "max_length",
                            "min_length", "stepwise_penalty",
                            "block_ngram_repeat", "ignore_when_blocking",
                            "dump_beam", "report_bleu", "replace_unk",
                            "gpu", "verbose", "fast", "mask_from"]}
        fields['graph'] = torchtext.data.Field(sequential=False)
        translator = Translator(model, fields, global_scorer=scorer,
                                out_file=out_file, report_score=False,
                                copy_attn=model_opt.copy_attn, logger=logger,
                                log_probs_out_file=None, **kwargs)

        # Make translations and save the result.
        all_scores, all_predictions = translator.translate(
            src_path='processed_data/meta-dev/' + opt.meta_dev_task
            + '/src-test.txt',
            tgt_path=None, src_dir=None,
            batch_size=opt.translate_batch_size, attn_debug=False)

        # Dump predictions.
        f = open('experiments/all_dev/' + opt.meta_dev_task
                 + '/test_predictions.csv', 'w', encoding='utf-8')
        f.write('smiles,property\n')
        for n_best_mols in all_predictions:
            for mol in n_best_mols:
                f.write(mol.replace(' ', '') + ',0\n')
        f.close()

        # Call chemprop to get scores.
        test_path = '"experiments/all_dev/' + opt.meta_dev_task \
            + '/test_predictions.csv"'
        checkpoint_path = '"scorer_ckpts/' + opt.meta_dev_task + '/model.pt"'
        preds_path = '"experiments/all_dev/' + opt.meta_dev_task \
            + '/test_scores.csv"'

        # In case all mols are invalid (chemprop will produce no output
        # file), the predictions are copied into the score file.
        cmd = 'cp {} {}'.format(test_path, preds_path)
        result = os.popen(cmd)
        result.close()

        cmd = ('python chemprop/predict.py --test_path {} '
               '--checkpoint_path {} --preds_path {} '
               '--num_workers 0').format(test_path, checkpoint_path,
                                         preds_path)
        scorer_result = os.popen(cmd)
        # logger.info('{}'.format('\n'.join(scorer_result.readlines())))
        scorer_result.close()

        # Read the score file and get the score.
        score = read_score_csv('experiments/all_dev/' + opt.meta_dev_task
                               + '/test_scores.csv')
        assert len(score) % opt.beam_size == 0
        # test_scores = []
        # for i in range(0, len(score), opt.beam_size):
        #     test_scores.append(sum([x[1] for x in
        #         score[i:i + opt.beam_size]]) / opt.beam_size)

        # Report if it is the best on test.
        test_metrics = calculate_metrics(opt.meta_dev_task, 'dev', 'test',
                                         score)
        logger.info('test metrics: ' + str(test_metrics))
        test_score = test_metrics['success_rate']
        if test_score > best_test_score:
            best_ckpt = ckpt_path
            logger.info('New best test success rate: {:.4f} by {}'.format(
                test_score, ckpt_path))
            best_test_score = test_score
        else:
            logger.info('test success rate: {:.4f} by {}'.format(
                test_score, ckpt_path))

        del model_opt
        del fields
        del checkpoint
        del model
        del optim
        del model_saver
        del trainer
        gc.collect()
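# The scoring calls above use os.popen, which silently ignores a non-zero
# exit status from chemprop. A hedged alternative sketch: subprocess.run with
# check=True raises on failure. The paths mirror the illustrative ones used
# above and are not fixed by this codebase.
import subprocess

subprocess.run(
    ['python', 'chemprop/predict.py',
     '--test_path', 'experiments/all_dev/task/test_predictions.csv',
     '--checkpoint_path', 'scorer_ckpts/task/model.pt',
     '--preds_path', 'experiments/all_dev/task/test_scores.csv',
     '--num_workers', '0'],
    check=True)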
    probe(args.name,
          model,
          args.host,
          args.port,
          when=lambda m, o: m._v.state == "dev",
          which=lambda m, o: True,  # o._v.operation_name in ["encoder", "decoder"],
          parameters=False,
          forward=True,
          backward=False,
          batch_axis=1)

    opt.report_every = 1

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    trainer = build_trainer(opt, model, fields, optim, "text")

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt)

    # print(args.epochs)
    # Do training.
    trainer.train(train_iter_fct, valid_iter_fct, args.epochs, 1)
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    if opt.train_from and opt.reset_optim != 'all':
        logger.info('* checkpoint training not considered by me yet')
    else:
        # warmup_steps and rnn_size are parameters for Noam decay
        # (transformer): https://arxiv.org/pdf/1706.03762.pdf (Section 3).
        decay_method = opt.decay_method if opt.decay_method else "standard"
        logger.info(
            '* Opt: %s (rate %.5f, maxgnorm %.1f, %s decay, '
            'decay_rate %.1f, start_decay_at %d, decay_every %d, '
            'ab1 %.5f, ab2 %.5f, adagradaccum %.1f, '
            'warmupsteps %d, hiddensize %d)'
            % (opt.optim, opt.learning_rate, opt.max_grad_norm,
               decay_method, opt.learning_rate_decay, opt.start_decay_steps,
               opt.decay_steps, opt.adam_beta1, opt.adam_beta2,
               opt.adagrad_accumulator_init, opt.warmup_steps,
               opt.rnn_size))
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)
    logger.info('* model_saver built, using it to build trainer')

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    # -------------------------------------------------------------------
    # 1. lazily_load_dataset = for pt in pts: yield torch.load(pt)
    # 2. build_dataset_iter  = returns a DatasetLazyIter (train_iter_fct)
    # 3. train_iter_fct()    = iterator over torchtext.data.batch.Batches
    # -------------------------------------------------------------------
    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
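# A minimal sketch of the lazy shard loading described in the numbered
# comments above: yield one torch-serialized shard at a time so only a single
# shard is in memory. The file-naming pattern below is an assumption, not
# necessarily this codebase's exact convention.
import glob
import torch


def lazily_load_shards(prefix, corpus_type):
    for pt in sorted(glob.glob('%s.%s*.pt' % (prefix, corpus_type))):
        yield torch.load(pt)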
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    first_dataset = pickle.load(
        open('processed_data/all-train/train.pt', 'rb'))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)

    optim = build_optim(model, opt, checkpoint)  # opt.train_from == ''

    # Build model saver.
    if not os.path.exists('experiments/all_train'):
        os.mkdir('experiments/all_train')
    model_saver = build_model_saver(model_opt, opt.save_model, opt, model,
                                    fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim, "text",
                            model_saver=model_saver)

    def _lazy_dataset_loader(pt_file):
        # dataset = torch.load(pt_file)
        def dataset_loader(pt_file):
            with open(pt_file, 'rb') as f:
                dataset = pickle.load(f)
            # logger.info('Loading task from <{}>, number of examples: {}'
            #             .format(pt_file, len(dataset)))
            return dataset
        yield dataset_loader(pt_file)

    train_iter = list(build_dataset_iter(
        _lazy_dataset_loader('processed_data/all-train/train.pt'),
        fields, opt))
    trainer.train(train_iter, opt.train_epochs)
def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opt values, then overwrite them with the opts from
        # the checkpoint. This is useful for re-training a model after
        # adding a new option (not set in the checkpoint).
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = load_fields(first_dataset, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            data_type, model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')

    if not opt.no_base:
        trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                      opt.valid_steps)
        if opt.tensorboard:
            trainer.report_manager.tensorboard_writer.close()

    if opt.comparable:
        logger.info('')
        logger.info('Beginning comparable data extraction and training.')

        # 1. Initialize Comparable object.
        comp = Comparable(model, trainer, fields, logger, opt)

        # 2. Infer similarity threshold from training data.
        for epoch in range(opt.comp_epochs):
            # 3. Update threshold if dynamic.
            if opt.threshold_dynamics != 'static' and epoch != 0:
                comp.update_threshold(opt.threshold_dynamics,
                                      opt.infer_threshold)

            # 4. Extract parallel data and train.
            # if opt.match_articles:
            #     comparable_data = comp.match_articles(opt.match_articles)
            #     train_stats = comp.extract_and_train(comparable_data)
            # else:
            train_stats = comp.extract_and_train(opt.comparable_data)

            # 5. Validate on validation set.
            if not opt.no_valid:
                valid_iter = build_dataset_iter(
                    lazily_load_dataset("valid", opt), fields, opt)
                valid_stats = comp.validate(valid_iter)

            # 6. Drop a checkpoint if needed.
            comp.trainer.model_saver._save(epoch)