def _build_fv_from_multifield(multifield, counters, build_fv_args,
                              size_multiple=1):
    """Build the vocabulary of every sub-field yielded by ``multifield``.

    Args:
        multifield: iterable yielding ``(name, field)`` pairs.
        counters: mapping indexed by sub-field name; ``counters[name]`` is
            passed to ``_build_field_vocab``.
        build_fv_args: mapping indexed by sub-field name holding the extra
            keyword arguments for ``_build_field_vocab``.
        size_multiple (int): forwarded unchanged to ``_build_field_vocab``.
    """
    for sub_name, sub_field in multifield:
        sub_counter = counters[sub_name]
        extra_kwargs = build_fv_args[sub_name]
        _build_field_vocab(
            sub_field,
            sub_counter,
            size_multiple=size_multiple,
            **extra_kwargs)
        vocab_len = len(sub_field.vocab)
        logger.info(" * %s vocab size: %d." % (sub_name, vocab_len))
def _load_vocab(vocab_path, name, counters):
    """Read a vocabulary file and register its tokens in ``counters``.

    ``counters`` is modified in place: every token read from the file is
    stored in ``counters[name]`` with a pseudo-count.

    Args:
        vocab_path: path handed to ``_read_vocab_file``.
        name: key of the counter to fill; also used in log messages.
        counters: mapping from name to a counter-like mapping.

    Returns:
        tuple: ``(tokens, number_of_tokens)``.
    """
    tokens = _read_vocab_file(vocab_path, name)
    n_tokens = len(tokens)
    logger.info('Loaded %s vocab has %d tokens.' % (name, n_tokens))

    # The first token gets the highest pseudo-count and the last one gets 1,
    # so ordering by count reproduces the order of the vocab file.
    pseudo_counts = range(n_tokens, 0, -1)
    for pseudo_count, tok in zip(pseudo_counts, tokens):
        counters[name][tok] = pseudo_count

    return tokens, n_tokens
def validate_train_opts(cls, opt):
    """Reject deprecated or incompatible training options.

    Args:
        opt: parsed options. The attributes ``epochs``,
            ``truncated_decoder``, ``accum_count``, ``gpuid`` and
            ``gpu_ranks`` are read.

    Raises:
        AssertionError: if ``opt.epochs`` or ``opt.gpuid`` is set, or if
            truncated decoding is combined with an accumulation count
            greater than one.
    """
    if opt.epochs:
        raise AssertionError(
            "-epochs is deprecated please use -train_steps.")

    if opt.truncated_decoder > 0 and max(opt.accum_count) > 1:
        raise AssertionError("BPTT is not compatible with -accum > 1")

    if opt.gpuid:
        # Adjacent literals are used instead of a backslash continuation
        # inside the string, which would embed the next line's indentation
        # in the message.
        raise AssertionError(
            "gpuid is deprecated "
            "see world_size and gpu_ranks")

    if torch.cuda.is_available() and not opt.gpu_ranks:
        logger.info("WARNING: You have a CUDA device, "
                    "should run with -gpu_ranks")
def _read_vocab_file(vocab_path, tag):
    """Load a vocabulary from a text file.

    Args:
        vocab_path (str): Path to a utf-8 text file holding one token per
            line. Blank lines are skipped; if a line contains whitespace,
            only the part before the first whitespace is kept.
        tag (str): Label used in the log and error messages.

    Returns:
        list[str]: the tokens, in file order.

    Raises:
        RuntimeError: if ``vocab_path`` does not exist.
    """
    logger.info("Loading {} vocabulary from {}".format(tag, vocab_path))

    if not os.path.exists(vocab_path):
        raise RuntimeError(
            "{} vocabulary not found at {}".format(tag, vocab_path))

    tokens = []
    with codecs.open(vocab_path, 'r', 'utf-8') as vocab_file:
        for raw_line in vocab_file:
            stripped = raw_line.strip()
            if stripped:
                tokens.append(stripped.split()[0])
    return tokens
def _iter_dataset(self, path):
    """Yield every batch of the dataset stored at ``path``.

    The dataset is loaded with ``torch.load``, given ``self.fields`` and
    wrapped in an ``OrderedIterator`` configured from this object's batch
    settings. Once the iterator is exhausted, the dataset's examples are
    dropped and garbage collection is triggered.

    Args:
        path: location of the serialized dataset.
    """
    shard = torch.load(path)
    n_examples = len(shard)
    logger.info('Loading dataset from %s, number of examples: %d' %
                (path, n_examples))
    shard.fields = self.fields

    shard_iter = OrderedIterator(
        dataset=shard,
        batch_size=self.batch_size,
        batch_size_multiple=self.batch_size_multiple,
        batch_size_fn=self.batch_size_fn,
        device=self.device,
        train=self.is_train,
        sort=False,
        sort_within_batch=True,
        repeat=False
    )
    for one_batch in shard_iter:
        yield one_batch

    # Drop the examples first, then the dataset itself, collecting after
    # each step.
    shard.examples = None
    gc.collect()
    del shard
    gc.collect()
def output(self, step, num_steps, learning_rate, start):
    """Log a one-line progress report, then flush stdout.

    Args:
        step (int): current step.
        num_steps (int): total number of steps; appended to the step label
            only when greater than zero.
        learning_rate (float): learning rate to report.
        start (int): start time; the reported duration is
            ``time.time() - start``.
    """
    elapsed = self.elapsed_time()

    step_fmt = "%2d" % step
    if num_steps > 0:
        step_fmt = "%s/%5d" % (step_fmt, num_steps)

    template = ("Step %s; acc: %6.2f; ppl: %5.2f; xent: %4.2f; " +
                "lr: %7.5f; %3.0f/%3.0f tok/s; %6.0f sec")
    values = (
        step_fmt,
        self.accuracy(),
        self.ppl(),
        self.xent(),
        learning_rate,
        # The small epsilon avoids a division by zero when no time elapsed.
        self.n_src_words / (elapsed + 1e-5),
        self.n_words / (elapsed + 1e-5),
        time.time() - start,
    )
    logger.info(template % values)
    sys.stdout.flush()
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader, opt):
    """Shard a corpus, build one dataset per shard and save each of them.

    Args:
        corpus_type (str): ``'train'`` or ``'valid'``; selects which source
            and target options are read from ``opt``.
        fields: passed to ``inputters.Dataset``.
        src_reader: reader for the source side.
        tgt_reader: reader for the target side; when falsy only the source
            side is handed to the dataset.
        opt: options; reads the corpus paths, ``shard_size``,
            ``filter_valid``, ``data_type``, the sequence length limits,
            ``src_dir`` and ``save_data``.

    Returns:
        list[str]: paths of the saved shard files, in shard order.
    """
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        src, tgt = opt.train_src, opt.train_tgt
    else:
        src, tgt = opt.valid_src, opt.valid_tgt

    logger.info("Reading source and target files: %s %s." % (src, tgt))

    src_shards = split_corpus(src, opt.shard_size)
    tgt_shards = split_corpus(tgt, opt.shard_size)
    shard_pairs = zip(src_shards, tgt_shards)
    dataset_paths = []

    # Examples are length-filtered for training data, or for validation
    # data when requested, and only if there is a target side.
    filter_pred = None
    if (corpus_type == "train" or opt.filter_valid) and tgt is not None:
        filter_pred = partial(
            inputters.filter_example,
            use_src_len=opt.data_type == "text",
            max_src_len=opt.src_seq_length,
            max_tgt_len=opt.tgt_seq_length)

    for shard_idx, (src_shard, tgt_shard) in enumerate(shard_pairs):
        assert len(src_shard) == len(tgt_shard)
        logger.info("Building shard %d." % shard_idx)

        if tgt_reader:
            readers = [src_reader, tgt_reader]
            data = [("src", src_shard), ("tgt", tgt_shard)]
            dirs = [opt.src_dir, None]
        else:
            readers = [src_reader]
            data = [("src", src_shard)]
            dirs = [opt.src_dir]

        dataset = inputters.Dataset(
            fields,
            readers=readers,
            data=data,
            dirs=dirs,
            sort_key=inputters.str2sortkey[opt.data_type],
            filter_pred=filter_pred
        )

        data_path = "{:s}.{:s}.{:d}.pt".format(
            opt.save_data, corpus_type, shard_idx)
        dataset_paths.append(data_path)

        logger.info(" * saving %sth %s data shard to %s."
                    % (shard_idx, corpus_type, data_path))
        dataset.save(data_path)

        # Release the shard before building the next one.
        del dataset.examples
        gc.collect()
        del dataset
        gc.collect()

    return dataset_paths
def main(opt, device_id):
    """Set up and run training for one process.

    Loads the checkpoint or the saved vocabulary, builds the model, the
    optimizer, the model saver, the trainer and the train/valid iterators,
    then calls ``trainer.train``.

    Args:
        opt: training options.
        device_id: passed to ``configure_process`` and ``build_trainer``.
    """
    # NOTE: It's important that ``opt`` has been validated and updated
    # at this point.
    configure_process(opt, device_id)
    init_logger(opt.log_file)
    assert len(opt.accum_count) == len(opt.accum_steps), \
        'Number of accum_count values must match number of accum_steps'

    # Load checkpoint if we resume from a previous training.
    checkpoint = None
    model_opt = opt
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = ArgumentParser.ckpt_model_opts(checkpoint["opt"])
        ArgumentParser.update_model_opts(model_opt)
        ArgumentParser.validate_model_opts(model_opt)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features
    for side in ['src', 'tgt']:
        side_field = fields[side]
        try:
            named_fields = iter(side_field)
        except TypeError:
            # Not iterable: treat it as a single field named after its side.
            named_fields = [(side, side_field)]
        for sub_name, sub_field in named_fields:
            if sub_field.use_vocab:
                logger.info(' * %s vocab size = %d'
                            % (sub_name, len(sub_field.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    train_iter = build_dataset_iter("train", fields, opt)
    valid_iter = build_dataset_iter(
        "valid", fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')

    train_steps = opt.train_steps
    if opt.single_pass and train_steps > 0:
        logger.warning("Option single_pass is enabled, ignoring train_steps.")
        train_steps = 0

    trainer.train(
        train_iter,
        train_steps,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def preprocess(opt):
    """Count features, build the fields and save the datasets.

    The feature counts come from the first training source/target file.
    When the source is text and several training files are given, every
    further file is asserted to have the same number of features.

    Args:
        opt: preprocessing options.
    """
    ArgumentParser.validate_preprocess_args(opt)
    torch.manual_seed(opt.seed)

    init_logger(opt.log_file)

    logger.info("Extracting features...")

    if opt.data_type == 'text':
        src_nfeats = count_features(opt.train_src[0])
    else:
        src_nfeats = 0
    tgt_nfeats = count_features(opt.train_tgt[0])  # tgt always text so far

    if len(opt.train_src) > 1 and opt.data_type == 'text':
        remaining_pairs = zip(opt.train_src[1:], opt.train_tgt[1:])
        for src, tgt in remaining_pairs:
            assert src_nfeats == count_features(src),\
                "%s seems to mismatch features of "\
                "the other source datasets" % src
            assert tgt_nfeats == count_features(tgt),\
                "%s seems to mismatch features of "\
                "the other target datasets" % tgt

    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(
        opt.data_type,
        src_nfeats,
        tgt_nfeats,
        dynamic_dict=opt.dynamic_dict,
        with_align=opt.train_align[0] is not None,
        src_truncate=opt.src_seq_length_trunc,
        tgt_truncate=opt.tgt_seq_length_trunc)

    reader_by_type = inputters.str2reader
    src_reader = reader_by_type[opt.data_type].from_opt(opt)
    tgt_reader = reader_by_type["text"].from_opt(opt)
    align_reader = reader_by_type["text"].from_opt(opt)

    logger.info("Building & saving training data...")
    build_save_dataset(
        'train', fields, src_reader, tgt_reader, align_reader, opt)

    if opt.valid_src and opt.valid_tgt:
        logger.info("Building & saving validation data...")
        build_save_dataset(
            'valid', fields, src_reader, tgt_reader, align_reader, opt)
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.

    Raises:
        ValueError: if ``model_opt.model_type`` is ``"keyphrase"`` and
            ``model_opt.memory`` compares equal to neither ``False`` nor
            ``True``.
    """
    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type in ("text", "vec", "keyphrase"):
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        # NOTE(review): ``src_field`` only exists for the model types
        # handled above; other model types would fail here.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)
    reconstruct_decoder = build_decoder(model_opt, src_emb)

    # Pick the device: CPU, a specific GPU, or the default GPU.
    if not gpu:
        device = torch.device("cpu")
    elif gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    else:
        device = torch.device("cuda")

    # Build NMTModel(= encoder + decoder).
    # Build memory @shizhe
    if model_opt.model_type == "keyphrase":
        memory_flag = model_opt.memory
        # Equality (not truthiness) is kept on purpose so that only values
        # equal to False/True select a branch, as before.
        if memory_flag == False:  # noqa: E712
            logger.info("Do not use memory")
            model = onmt.models.NMTModel(encoder, decoder)
        elif memory_flag == True:  # noqa: E712
            logger.info("Do use memory")
            all_docs = load_all_docs(model_opt, fields, device)
            logger.info("finish load all documents")
            src_vocab_size = len(fields["src"].base_field.vocab)
            model = onmt.models.NMTModel(
                encoder, decoder, reconstruct_decoder, src_vocab_size,
                all_docs, model_opt, src_emb)
        else:
            # Previously this branch only logged and left ``model`` unbound,
            # which crashed later with an unrelated error. Fail here with a
            # clear message instead.
            logger.info("NO MODEL!!!")
            raise ValueError(
                "model_opt.memory must be True or False for the keyphrase "
                "model type, got %r" % (memory_flag,))
    else:
        model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    # if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
    #     model.half()
    return model
def _gradient_accumulation(self, true_batches, normalization, total_stats,
                           report_stats):
    """Run forward/backward over ``true_batches`` and step the optimizer.

    With ``self.accum_count == 1`` gradients are zeroed and the optimizer
    is stepped for every target chunk; with ``self.accum_count > 1``
    gradients are zeroed once up front and the optimizer is stepped once
    after all batches.

    Args:
        true_batches: iterable of batches to accumulate over.
        normalization: forwarded to ``self.train_loss``.
        total_stats: statistics object updated with each batch's stats.
        report_stats: statistics object updated with each batch's stats and
            with the number of source words.
    """
    if self.accum_count > 1:
        self.optim.zero_grad()

    for k, batch in enumerate(true_batches):
        target_size = batch.tgt.size(0)
        # Truncated BPTT: reminder not compatible with accum > 1
        if self.trunc_size:
            trunc_size = self.trunc_size
        else:
            trunc_size = target_size

        # batch.src is either a (src, lengths) tuple or the source alone.
        src, src_lengths = batch.src if isinstance(batch.src, tuple) \
            else (batch.src, None)
        if src_lengths is not None:
            report_stats.n_src_words += src_lengths.sum().item()

        tgt_outer = batch.tgt

        # ``bptt`` is False for the first chunk of a batch and True for the
        # following ones; it is passed through to the model.
        bptt = False
        for j in range(0, target_size-1, trunc_size):
            # 1. Create truncated target.
            tgt = tgt_outer[j: j + trunc_size]

            # 2. F-prop all but generator.
            if self.accum_count == 1:
                self.optim.zero_grad()

            # Optional per-example decoder weights, looked up through the
            # batch indices and padded into one tensor on ``self._dev``.
            # NOTE(review): layout of ``self.decoder_rnn_weights`` is not
            # visible here - confirm against where it is populated.
            if self.decoder_rnn_weights:
                kwargs = {'dec_weights': pad_sequence([
                    torch.Tensor(self.decoder_rnn_weights[batch.indices[b].item()])
                    for b in range(batch.batch_size)
                ], batch_first=False).to(self._dev)}
            else:
                kwargs = dict()
            outputs, attns = self.model(src, tgt, src_lengths, bptt=bptt,
                                        with_align=self.with_align, **kwargs)
            bptt = True

            # 3. Compute loss.
            try:
                loss, batch_stats = self.train_loss(
                    batch,
                    outputs,
                    attns,
                    normalization=normalization,
                    shard_size=self.shard_size,
                    trunc_start=j,
                    trunc_size=trunc_size)

                if loss is not None:
                    self.optim.backward(loss)

                total_stats.update(batch_stats)
                report_stats.update(batch_stats)

            except Exception:
                # Best effort: any failure in loss/backward is printed and
                # logged, and training continues with the next chunk.
                traceback.print_exc()
                logger.info("At step %d, we removed a batch - accum %d",
                            self.optim.training_step, k)

            # 4. Update the parameters and statistics.
            if self.accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    onmt.utils.distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

            # If truncated, don't backprop fully.
            # TO CHECK
            # if dec_state is not None:
            #    dec_state.detach()
            if self.model.decoder.state is not None:
                self.model.decoder.detach_state()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad
                     and p.grad is not None]
            onmt.utils.distributed.all_reduce_and_rescale_tensors(
                grads, float(1))
        self.optim.step()
def train(self,
          train_iter,
          train_steps,
          save_checkpoint_steps=5000,
          valid_iter=None,
          valid_steps=10000):
    """
    The main training loop by iterating over `train_iter` and possibly
    running validation on `valid_iter`.

    Args:
        train_iter: A generator that returns the next training batch.
        train_steps: Run training for this many iterations. Values that
            are not greater than zero disable the step limit.
        save_checkpoint_steps: Save a checkpoint every this many
          iterations. Zero disables the periodic save.
        valid_iter: A generator that returns the next validation batch.
        valid_steps: Run evaluation every this many iterations.

    Returns:
        The gathered statistics.
    """
    if valid_iter is None:
        logger.info('Start training loop without validation...')
    else:
        logger.info('Start training loop and validate every %d steps...',
                    valid_steps)

    total_stats = onmt.utils.Statistics()
    report_stats = onmt.utils.Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    # Bind ``step`` before the loop so that the final checkpoint save still
    # has a value when ``train_iter`` yields nothing.
    step = self.optim.training_step

    for i, (batches, normalization) in enumerate(
            self._accum_batches(train_iter)):
        step = self.optim.training_step
        # UPDATE DROPOUT
        self._maybe_update_dropout(step)

        if self.gpu_verbose_level > 1:
            logger.info("GpuRank %d: index: %d", self.gpu_rank, i)
        if self.gpu_verbose_level > 0:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which leaked source indentation into the message.
            logger.info("GpuRank %d: reduce_counter: %d "
                        "n_minibatch %d"
                        % (self.gpu_rank, i + 1, len(batches)))

        if self.n_gpu > 1:
            normalization = sum(onmt.utils.distributed
                                .all_gather_list
                                (normalization))

        self._gradient_accumulation(
            batches, normalization, total_stats,
            report_stats)

        if self.average_decay > 0 and i % self.average_every == 0:
            self._update_average(step)

        report_stats = self._maybe_report_training(
            step, train_steps,
            self.optim.learning_rate(),
            report_stats)

        if valid_iter is not None and step % valid_steps == 0:
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: validate step %d'
                            % (self.gpu_rank, step))
            valid_stats = self.validate(
                valid_iter, moving_average=self.moving_average)
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: gather valid stat '
                            'step %d' % (self.gpu_rank, step))
            valid_stats = self._maybe_gather_stats(valid_stats)
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: report stat step %d'
                            % (self.gpu_rank, step))
            self._report_step(self.optim.learning_rate(),
                              step, valid_stats=valid_stats)
            # Run patience mechanism
            if self.earlystopper is not None:
                self.earlystopper(valid_stats, step)
                # If the patience has reached the limit, stop training
                if self.earlystopper.has_stopped():
                    break

        if (self.model_saver is not None
            and (save_checkpoint_steps != 0
                 and step % save_checkpoint_steps == 0)):
            self.model_saver.save(step, moving_average=self.moving_average)

        if train_steps > 0 and step >= train_steps:
            break

    # Always save once more when leaving the loop.
    if self.model_saver is not None:
        self.model_saver.save(step, moving_average=self.moving_average)
    return total_stats
def main(opt):
    """Count features, build the fields and save the datasets.

    The number of source/target features is taken from the first training
    file pair. Every further pair must have the same counts: summing the
    counts over the files (as was done before) overstates the number of
    features as soon as more than one training file carries features.

    Args:
        opt: preprocessing options.

    Raises:
        AssertionError: if a training file has a different number of
            features than the first one.
    """
    ArgumentParser.validate_preprocess_args(opt)
    torch.manual_seed(opt.seed)
    if not (opt.overwrite):
        check_existing_pt_files(opt)

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = 0
    tgt_nfeats = 0
    for idx, (src, tgt) in enumerate(zip(opt.train_src, opt.train_tgt)):
        cur_src_nfeats = count_features(src) if opt.data_type == 'text' \
            else 0
        cur_tgt_nfeats = count_features(tgt)  # tgt always text so far
        if idx == 0:
            src_nfeats, tgt_nfeats = cur_src_nfeats, cur_tgt_nfeats
            continue
        if cur_src_nfeats != src_nfeats:
            raise AssertionError(
                "%s seems to mismatch features of "
                "the other source datasets" % src)
        if cur_tgt_nfeats != tgt_nfeats:
            raise AssertionError(
                "%s seems to mismatch features of "
                "the other target datasets" % tgt)

    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(
        opt.data_type,
        src_nfeats,
        tgt_nfeats,
        dynamic_dict=opt.dynamic_dict,
        src_truncate=opt.src_seq_length_trunc,
        tgt_truncate=opt.tgt_seq_length_trunc)

    src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    tgt_reader = inputters.str2reader["text"].from_opt(opt)

    logger.info("Building & saving training data...")
    build_save_dataset(
        'train', fields, src_reader, tgt_reader, opt)

    if opt.valid_src and opt.valid_tgt:
        logger.info("Building & saving validation data...")
        build_save_dataset('valid', fields, src_reader, tgt_reader, opt)
def main(opt, device_id, batch_queue=None, semaphore=None):
    """Set up and run training for one process.

    Training batches come either from dataset iterators built here or,
    when ``batch_queue`` is given, from that queue.

    Args:
        opt: training options.
        device_id: passed to ``configure_process`` and ``build_trainer``.
        batch_queue: optional queue whose ``get()`` returns the next batch.
        semaphore: required together with ``batch_queue``; released once
            for every batch taken from the queue.
    """
    # NOTE: It's important that ``opt`` has been validated and updated
    # at this point.
    configure_process(opt, device_id)
    init_logger(opt.log_file)
    assert len(opt.accum_count) == len(opt.accum_steps), \
        'Number of accum_count values must match number of accum_steps'

    # Load checkpoint if we resume from a previous training.
    checkpoint = None
    model_opt = opt
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = ArgumentParser.ckpt_model_opts(checkpoint["opt"])
        ArgumentParser.update_model_opts(model_opt)
        ArgumentParser.validate_model_opts(model_opt)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features
    for side in ['src', 'tgt']:
        side_field = fields[side]
        try:
            named_fields = iter(side_field)
        except TypeError:
            # Not iterable: treat it as a single field named after its side.
            named_fields = [(side, side_field)]
        for sub_name, sub_field in named_fields:
            if sub_field.use_vocab:
                logger.info(' * %s vocab size = %d'
                            % (sub_name, len(sub_field.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    if batch_queue is None:
        if len(opt.data_ids) > 1:
            train_shards = ["train_" + train_id for train_id in opt.data_ids]
            train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
        else:
            only_id = opt.data_ids[0]
            shard_base = "train" if only_id is None else "train_" + only_id
            train_iter = build_dataset_iter(shard_base, fields, opt)
    else:
        assert semaphore is not None, \
            "Using batch_queue requires semaphore as well"

        def _train_iter():
            # Endless generator: take a batch from the queue, release the
            # semaphore, hand the batch to the trainer.
            while True:
                batch = batch_queue.get()
                semaphore.release()
                yield batch

        train_iter = _train_iter()

    valid_iter = build_dataset_iter(
        "valid", fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')

    train_steps = opt.train_steps
    if opt.single_pass and train_steps > 0:
        logger.warning("Option single_pass is enabled, ignoring train_steps.")
        train_steps = 0

    trainer.train(
        train_iter,
        train_steps,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps,
        unlearn=opt.unlearn)

    writer = trainer.report_manager.tensorboard_writer
    if writer is not None:
        writer.close()
def process_one_shard(corpus_params, params):
    """Build, count and save a single data shard.

    Args:
        corpus_params: 9-tuple ``(corpus_type, fields, src_reader,
            cue_reader, tgt_reader, opt, existing_fields, src_vocab,
            tgt_vocab)``.
        params: pair ``(i, (src_shard, cue_shard, tgt_shard, maybe_id,
            filter_pred))`` where ``i`` is the shard index.

    Returns:
        defaultdict(Counter): per sub-field name, the counts collected from
        this shard. It stays empty unless ``corpus_type`` is ``"train"``
        and ``existing_fields`` is None.
    """
    corpus_type, fields, src_reader, cue_reader, tgt_reader, opt, existing_fields,\
        src_vocab, tgt_vocab = corpus_params
    i, (src_shard, cue_shard, tgt_shard, maybe_id, filter_pred) = params
    # create one counter per shard
    sub_sub_counter = defaultdict(Counter)
    # All three sides of the shard must have the same length.
    assert len(src_shard) == len(tgt_shard) and len(src_shard) == len(
        cue_shard)
    logger.info("Building shard %d." % i)
    # The target side is only included when a target reader is given.
    dataset = inputters.Dataset(
        fields,
        readers=([src_reader, cue_reader, tgt_reader]
                 if tgt_reader else [src_reader, cue_reader]),
        data=([("src", src_shard), ("cue", cue_shard), ("tgt", tgt_shard)]
              if tgt_reader else [("src", src_shard), ("cue", cue_shard)]),
        dirs=([opt.src_dir, None, None]
              if tgt_reader else [opt.src_dir, None]),
        sort_key=inputters.str2sortkey[opt.data_type],
        filter_pred=filter_pred)
    # Counts are only collected for training data without existing fields.
    if corpus_type == "train" and existing_fields is None:
        for ex in dataset.examples:
            for name, field in fields.items():
                # The source side of audio data is skipped.
                if ((opt.data_type == "audio") and (name == "src")):
                    continue
                try:
                    f_iter = iter(field)
                except TypeError:
                    # Non-iterable field: wrap it and its data so the loop
                    # below handles both cases the same way.
                    f_iter = [(name, field)]
                    all_data = [getattr(ex, name, None)]
                else:
                    all_data = getattr(ex, name)
                for (sub_n, sub_f), fd in zip(f_iter, all_data):
                    # src/tgt are not counted when a vocab was supplied.
                    has_vocab = (sub_n == 'src' and
                                 src_vocab is not None) or \
                                (sub_n == 'tgt' and
                                 tgt_vocab is not None)
                    if (hasattr(sub_f, 'sequential')
                            and sub_f.sequential and not has_vocab):
                        val = fd
                        # 'cue' data is flattened by one level before
                        # counting.
                        # NOTE(review): presumably a list of token lists -
                        # confirm against the cue reader.
                        if sub_n == 'cue':
                            val = list(chain.from_iterable(val))
                        sub_sub_counter[sub_n].update(val)
    # Shard file name: <save_data>.<corpus_type>[_<maybe_id>].<i>.pt
    if maybe_id:
        shard_base = corpus_type + "_" + maybe_id
    else:
        shard_base = corpus_type
    data_path = "{:s}.{:s}.{:d}.pt".\
        format(opt.save_data, shard_base, i)

    logger.info(" * saving %sth %s data shard to %s."
                % (i, shard_base, data_path))

    dataset.save(data_path)

    # Drop the examples and the dataset, collecting after each step.
    del dataset.examples
    gc.collect()
    del dataset
    gc.collect()

    return sub_sub_counter
def main(opt, device_id):
    """Train on the pickled dataset under ``processed_data/all-train``.

    Args:
        opt: training options; post-processed by
            ``training_opt_postprocessing`` before use.
        device_id: passed to ``training_opt_postprocessing`` and
            ``build_trainer``.
    """
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)

    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']
    else:
        checkpoint = None
        model_opt = opt

    # NOTE: pickle must only be used on trusted, locally produced files.
    # The file is opened in a ``with`` block so the handle is closed again.
    with open('processed_data/all-train/train.pt', 'rb') as dataset_file:
        first_dataset = pickle.load(dataset_file)
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)

    optim = build_optim(model, opt, checkpoint)  # opt.train_from == ''

    # Build model saver
    # ``makedirs`` also creates the missing ``experiments`` parent, which
    # a plain ``mkdir`` cannot do.
    if not os.path.exists('experiments/all_train'):
        os.makedirs('experiments/all_train')
    model_saver = build_model_saver(
        model_opt, opt.save_model, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim, "text",
                            model_saver=model_saver)

    def _lazy_dataset_loader(pt_file):
        # Generator yielding the single dataset unpickled from ``pt_file``.
        def dataset_loader(pt_file):
            with open(pt_file, 'rb') as f:
                dataset = pickle.load(f)
            return dataset

        yield dataset_loader(pt_file)

    train_iter = list(
        build_dataset_iter(
            _lazy_dataset_loader('processed_data/all-train/train.pt'),
            fields, opt))

    trainer.train(train_iter, opt.train_epochs)
def train(self,
          train_iter,
          train_steps,
          save_checkpoint_steps=5000,
          valid_iter=None,
          valid_steps=10000):
    """
    The main training loop by iterating over `train_iter`.

    Args:
        train_iter: A generator that returns the next training batch.
        train_steps: Run training for this many iterations. Values that
            are not greater than zero disable the step limit.
        save_checkpoint_steps: Save a checkpoint every this many
          iterations. Zero disables the periodic save.
        valid_iter: A generator that returns the next validation batch.
            Only used to choose the start-up log message in this loop.
        valid_steps: Only used in the start-up log message.

    Returns:
        The gathered statistics.
    """
    if valid_iter is None:
        logger.info('Start training loop without validation...')
    else:
        logger.info('Start training loop and validate every %d steps...',
                    valid_steps)

    total_stats = RLStatistics()
    report_stats = RLStatistics()
    self._start_report_manager(start_time=total_stats.start_time)

    # Bind ``step`` before the loop so that the final checkpoint save still
    # has a value when ``train_iter`` yields nothing.
    step = self.optim.training_step

    for i, (batches, normalization) in enumerate(
            self._accum_batches(train_iter)):
        step = self.optim.training_step
        # UPDATE DROPOUT
        self._maybe_update_dropout(step)

        if self.gpu_verbose_level > 1:
            logger.info("GpuRank %d: index: %d", self.gpu_rank, i)
        if self.gpu_verbose_level > 0:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which leaked source indentation into the message.
            logger.info("GpuRank %d: reduce_counter: %d "
                        "n_minibatch %d"
                        % (self.gpu_rank, i + 1, len(batches)))

        self._gradient_accumulation(
            batches, normalization, total_stats,
            report_stats)

        report_stats = self._maybe_report_training(
            step, train_steps,
            self.optim.learning_rate(),
            report_stats)

        if (self.model_saver is not None
            and (save_checkpoint_steps != 0
                 and step % save_checkpoint_steps == 0)):
            self.model_saver.save(step, moving_average=self.moving_average)

        if train_steps > 0 and step >= train_steps:
            break

    # Always save once more when leaving the loop.
    if self.model_saver is not None:
        self.model_saver.save(step, moving_average=self.moving_average)
    return total_stats
def _gradient_accumulation(self, true_batches, normalization, total_stats,
                           report_stats):
    """Run one accumulation round mixing an RL loss and an ML loss.

    For every batch the encoder is run once, the decoder is run on the
    reference target, and ``self._forward_model`` is called twice (with
    ``sample="reinforce"`` and, without gradients, ``sample="topk"``).
    The two losses are blended with ``self.gamma_loss`` and
    back-propagated.

    Args:
        true_batches: iterable of batches to accumulate over.
        normalization: forwarded to ``self.ml_loss``.
        total_stats: statistics object updated with the RL and ML stats.
        report_stats: statistics object updated with the RL and ML stats,
            the number of source words and the number of batches.
    """
    if self.accum_count > 1:
        self.optim.zero_grad()

    for k, batch in enumerate(true_batches):
        if self.accum_count == 1:
            self.optim.zero_grad()

        target_size = batch.tgt.size(0)

        src, src_lengths = batch.src
        # NOTE(review): ``device`` is not used anywhere else in this block.
        device = src.device
        if src_lengths is not None:
            report_stats.n_src_words += src_lengths.sum().item()
        report_stats.n_batches += 1

        # Encoder forward.
        enc_states, memory_bank, src_lengths = self.model.encoder(src, src_lengths)

        # Teacher forcing
        self.model.decoder.init_state(src, memory_bank, enc_states)
        # The decoder input is the target without its last position.
        ml_outputs, ml_attns = self.model.decoder(batch.tgt[:-1], memory_bank, memory_lengths=src_lengths)

        # Sampling a path
        rl_forward = self._forward_model(batch, enc_states, memory_bank, sample="reinforce")
        # baseline computing doesn't need gradient
        with torch.no_grad():
            baseline_forward = self._forward_model(batch, enc_states, memory_bank, sample="topk")

        # 3. Compute loss.
        try:
            rl_loss, rl_stats = self.rl_loss(
                batch,
                rl_forward,
                baseline_forward)

            ml_loss, ml_stats = self.ml_loss(
                batch,
                ml_outputs,
                ml_attns,
                normalization=normalization,
                shard_size=0,
                trunc_start=0,
                trunc_size=target_size)

            # Weighted blend: gamma_loss for RL, the remainder for ML.
            loss = rl_loss * self.gamma_loss + ml_loss * (1 - self.gamma_loss)
            self.optim.backward(loss)

            total_stats.update(rl_stats, ml_stats)
            report_stats.update(rl_stats, ml_stats)

        except Exception:
            # Best effort: any failure is printed and logged, and training
            # continues with the next batch.
            traceback.print_exc()
            logger.info("At step %d, we removed a batch - accum %d",
                        self.optim.training_step, k)

        # 4. Update the parameters and statistics.
        if self.accum_count == 1:
            self.optim.step()

        # report the grad norms by modules
        #GradNorm.output_norms(self.model)

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.accum_count > 1:
        self.optim.step()
def log_msg(self, msg):
    """Log ``msg`` at INFO level through the module logger, then flush
    standard output.

    Args:
        msg: the message to log.
    """
    logger.info(msg)
    sys.stdout.flush()
def build_model(model_opt, opt, fields, checkpoint):
    """Build the model and log it.

    Args:
        model_opt: model options forwarded to ``build_base_model``.
        opt: options from which GPU usage is derived via ``use_gpu``.
        fields: forwarded to ``build_base_model``.
        checkpoint: forwarded to ``build_base_model``.

    Returns:
        the object returned by ``build_base_model``.
    """
    logger.info("Building model...")
    gpu_enabled = use_gpu(opt)
    built_model = build_base_model(model_opt, fields, gpu_enabled, checkpoint)
    logger.info(built_model)
    return built_model
def rouge_results_to_str(results_dict):
    """Format five ROUGE F-scores as a one-line summary.

    Args:
        results_dict (dict): must contain the keys ``rouge_1_f_score``,
            ``rouge_2_f_score``, ``rouge_3_f_score``, ``rouge_l_f_score``
            and ``rouge_su*_f_score``; each value is multiplied by 100.

    Returns:
        str: the summary, each score printed with two decimals.
    """
    return ">> ROUGE(1/2/3/L/SU4): {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}".format(
        results_dict["rouge_1_f_score"] * 100,
        results_dict["rouge_2_f_score"] * 100,
        results_dict["rouge_3_f_score"] * 100,
        results_dict["rouge_l_f_score"] * 100,
        results_dict["rouge_su*_f_score"] * 100)


if __name__ == "__main__":
    init_logger('test_rouge.log')
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', type=str, default="candidate.txt",
                        help='candidate file')
    parser.add_argument('-r', type=str, default="reference.txt",
                        help='reference file')
    args = parser.parse_args()

    # "STDIN" (any case) reads the candidates from standard input.
    read_stdin = args.c.upper() == "STDIN"
    if read_stdin:
        candidates = sys.stdin
    else:
        candidates = codecs.open(args.c, encoding="utf-8")
    try:
        references = codecs.open(args.r, encoding="utf-8")
        try:
            results_dict = test_rouge(candidates, references)
        finally:
            references.close()
    finally:
        # Only close what was opened here; stdin is left alone.
        if not read_stdin:
            candidates.close()
    logger.info(rouge_results_to_str(results_dict))
def main(opt):
    """Run the preprocessing pipeline.

    Counts the features, builds the fields, saves the training (and, when
    configured, validation) datasets and finally builds and saves the
    vocabulary from the training dataset files.

    Args:
        opt: preprocessing options.
    """
    ArgumentParser.validate_preprocess_args(opt)
    torch.manual_seed(opt.seed)
    check_existing_pt_files(opt)

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    if opt.data_type == 'text':
        src_nfeats = count_features(opt.train_src)
    else:
        src_nfeats = 0
    tgt_nfeats = count_features(opt.train_tgt)  # tgt always text so far
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(
        opt.data_type,
        src_nfeats,
        tgt_nfeats,
        dynamic_dict=opt.dynamic_dict,
        src_truncate=opt.src_seq_length_trunc,
        tgt_truncate=opt.tgt_seq_length_trunc)

    reader_by_type = inputters.str2reader
    src_reader = reader_by_type[opt.data_type].from_opt(opt)
    tgt_reader = reader_by_type["text"].from_opt(opt)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset(
        'train', fields, src_reader, tgt_reader, opt)

    has_validation = opt.valid_src and opt.valid_tgt
    if has_validation:
        logger.info("Building & saving validation data...")
        build_save_dataset('valid', fields, src_reader, tgt_reader, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
def _resolve_optimizer_class(name):
    """Return the ``torch.optim`` class whose name equals ``name``, ignoring case.

    ``str.title()`` only works for names such as "adam" -> ``Adam``; it turns
    "sgd" into ``Sgd``, which does not exist. Matching case-insensitively
    keeps the previously working names and also resolves the others.

    Raises:
        AttributeError: if no class in ``torch.optim`` matches (the same
            exception type the plain ``getattr`` lookup used to raise).
    """
    wanted = name.lower()
    for attr in dir(torch.optim):
        candidate = getattr(torch.optim, attr)
        # Skip same-named submodules (e.g. ``torch.optim.sgd``).
        if attr.lower() == wanted and isinstance(candidate, type):
            return candidate
    raise AttributeError(
        "module 'torch.optim' has no optimizer named %r" % name)


def main(opt):
    """Train a single-process model from preprocessed ``.pt`` files.

    Loads the model options and fields (from ``opt.train_from`` when
    resuming, otherwise from ``opt.data + '.vocab.pt'``), builds the model,
    a ``torch.optim`` optimizer and the trainer, then trains for
    ``opt.epochs`` over the train/valid datasets.

    Args:
        opt: parsed training options.

    Raises:
        AssertionError: if the deprecated ``gpuid`` option is set or
            ``opt.world_size`` is greater than 1.
    """
    if opt.gpuid:
        raise AssertionError(
            "gpuid is deprecated, see world_size and gpu_ranks")

    assert opt.world_size <= 1, "you don't need multi-gpu for morphology"

    # Device 0 when exactly one GPU rank is given, otherwise -1.
    device_id = 0 if len(opt.gpu_ranks) == 1 else -1

    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opts values then overwrite it with opts from
        # the checkpoint. It's useful in order to re-train a model
        # after adding a new option (not set in checkpoint)
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        fields = checkpoint['vocab']
    else:
        checkpoint = None
        model_opt = opt
        fields = torch.load(opt.data + '.vocab.pt')

    # Report the vocabulary size of every field that uses one.
    for values in fields.values():
        for name, f in values:
            if f.use_vocab:
                logger.info(' * %s vocab size = %d' % (name, len(f.vocab)))

    # Build model.
    logger.info('Building model...')
    model = build_model(model_opt, fields, use_gpu(opt), checkpoint)
    logger.info(model)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    params = model.parameters()
    optim_args = {"lr": opt.learning_rate}
    if opt.optim == "adam":
        # no need to mess with the default betas
        optim_args["eps"] = 1e-9
    elif opt.optim == "adagrad":
        optim_args["initial_accumulator_value"] = opt.adagrad_accumulator_init
    optim = _resolve_optimizer_class(opt.optim)(params, **optim_args)
    # Go through the logger like every other report in this function.
    logger.info(optim)

    trainer = build_trainer(opt, model_opt, device_id, model, fields, optim)

    # this line is kind of a temporary kludge because different objects expect
    # fields to have a different structure
    dataset_fields = dict(chain.from_iterable(fields.values()))

    device = "cuda" if opt.gpu_ranks else "cpu"

    train_dataset = torch.load(opt.data + '.train.pt')
    train_dataset.fields = dataset_fields
    train_iter = OrderedIterator(train_dataset, opt.batch_size,
                                 sort_within_batch=True,
                                 device=device, repeat=False,
                                 shuffle=not opt.no_shuffle)

    valid_dataset = torch.load(opt.data + '.valid.pt')
    valid_dataset.fields = dataset_fields
    valid_iter = OrderedIterator(valid_dataset, opt.valid_batch_size,
                                 train=False, sort_within_batch=True,
                                 device=device)

    logger.info('Starting training on {}'.format(device))
    trainer.train(train_iter, valid_iter, opt.epochs)
def _lazy_dataset_loader(pt_file, corpus_type):
    """Load the dataset stored at ``pt_file`` and log its example count.

    Args:
        pt_file: path handed to ``torch.load``.
        corpus_type: label used only in the log message.

    Returns:
        The object returned by ``torch.load``.
    """
    loaded = torch.load(pt_file)
    n_examples = len(loaded)
    logger.info('Loading %s dataset from %s, number of examples: %d' %
                (corpus_type, pt_file, n_examples))
    return loaded
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency,
                vocab_size_multiple=1):
    """Build the fields for all data sides.

    Args:
        train_dataset_files: a list of train dataset pt file.
        fields (dict[str, Field]): fields to build vocab for.
        data_type (str): A supported data type string.
        share_vocab (bool): share source and target vocabulary?
        src_vocab_path (str): Path to src vocabulary file.
        src_vocab_size (int): size of the source vocabulary.
        src_words_min_frequency (int): the minimum frequency needed to
            include a source word in the vocabulary.
        tgt_vocab_path (str): Path to tgt vocabulary file.
        tgt_vocab_size (int): size of the target vocabulary.
        tgt_words_min_frequency (int): the minimum frequency needed to
            include a target word in the vocabulary.
        vocab_size_multiple (int): ensure that the vocabulary size is a
            multiple of this value.

    Returns:
        Dict of Fields
    """
    # One Counter per (sub)field name, created on first access.
    counters = defaultdict(Counter)

    if src_vocab_path:
        try:
            # First try to read src_vocab_path as a torch-serialized object;
            # on success it is returned as-is and nothing below runs.
            logger.info("Using existing vocabulary...")
            vocab = torch.load(src_vocab_path)
            # return vocab to dump with standard name
            return vocab
        except torch.serialization.pickle.UnpicklingError:
            # Not a pickle: fall through and treat the path as a text file.
            logger.info("Building vocab from text file...")
            # empty train_dataset_files so that vocab is only loaded from
            # given paths in src_vocab_path, tgt_vocab_path
            train_dataset_files = []

    # Load vocabulary
    # _load_vocab fills `counters` in place and returns the token list
    # together with its length, which replaces the requested vocab size.
    if src_vocab_path:
        src_vocab, src_vocab_size = _load_vocab(
            src_vocab_path, "src", counters)
    else:
        src_vocab = None

    if tgt_vocab_path:
        tgt_vocab, tgt_vocab_size = _load_vocab(
            tgt_vocab_path, "tgt", counters)
    else:
        tgt_vocab = None

    for i, path in enumerate(train_dataset_files):
        dataset = torch.load(path)
        logger.info(" * reloading %s." % path)
        for ex in dataset.examples:
            for name, field in fields.items():
                # An iterable field yields (sub_name, sub_field) pairs; a
                # non-iterable one is wrapped so both cases share the loop.
                try:
                    f_iter = iter(field)
                except TypeError:
                    f_iter = [(name, field)]
                    all_data = [getattr(ex, name, None)]
                else:
                    all_data = getattr(ex, name)
                for (sub_n, sub_f), fd in zip(
                        f_iter, all_data):
                    # Sides with a vocabulary loaded from file are not
                    # re-counted from the data.
                    has_vocab = (sub_n == 'src' and src_vocab) or \
                                (sub_n == 'tgt' and tgt_vocab)
                    if sub_f.sequential and not has_vocab:
                        val = fd
                        counters[sub_n].update(val)

        # Drop the none-using from memory but keep the last
        if i < len(train_dataset_files) - 1:
            dataset.examples = None
            gc.collect()
            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    # Per-side arguments for the vocab builder; any other name gets an
    # empty dict from the defaultdict.
    build_fv_args = defaultdict(dict)
    build_fv_args["src"] = dict(
        max_size=src_vocab_size, min_freq=src_words_min_frequency)
    build_fv_args["tgt"] = dict(
        max_size=tgt_vocab_size, min_freq=tgt_words_min_frequency)
    tgt_multifield = fields["tgt"]
    # When vocabularies are shared, the size multiple is applied at merge
    # time instead, so 1 is used here.
    _build_fv_from_multifield(
        tgt_multifield,
        counters,
        build_fv_args,
        size_multiple=vocab_size_multiple if not share_vocab else 1)
    if data_type == 'text':
        src_multifield = fields["src"]
        _build_fv_from_multifield(
            src_multifield,
            counters,
            build_fv_args,
            size_multiple=vocab_size_multiple if not share_vocab else 1)
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")
            src_field = src_multifield.base_field
            tgt_field = tgt_multifield.base_field
            _merge_field_vocabs(
                src_field, tgt_field, vocab_size=src_vocab_size,
                min_freq=src_words_min_frequency,
                vocab_size_multiple=vocab_size_multiple)
            logger.info(" * merged vocab size: %d." % len(src_field.vocab))
    return fields  # is the return necessary?
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader, opt):
    """Build dataset shards for one corpus type and save them to disk.

    For every (src, tgt, id) corpus triple the inputs are split into shards
    of ``opt.shard_size``, each shard is turned into an
    ``inputters.Dataset`` and saved as
    ``<opt.save_data>.<shard_base>.<i>.pt``. For the training corpus, token
    counts are accumulated and the resulting fields are saved to
    ``<opt.save_data>.vocab.pt``.

    Args:
        corpus_type (str): ``'train'`` or ``'valid'``.
        fields: fields passed to every ``inputters.Dataset``.
        src_reader: reader for the source side.
        tgt_reader: reader for the target side; when falsy only the source
            side is passed to the dataset.
        opt: preprocessing options.

    Note:
        Nothing is returned; ``dataset_paths`` is collected but not used
        after the loop.
    """
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        # Counters are only needed (and only defined) for the train corpus.
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
    else:
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]

    for src, tgt, maybe_id in zip(srcs, tgts, ids):
        logger.info("Reading source and target files: %s %s." % (src, tgt))

        src_shards = split_corpus(src, opt.shard_size)
        tgt_shards = split_corpus(tgt, opt.shard_size)
        shard_pairs = zip(src_shards, tgt_shards)
        dataset_paths = []
        # Length filtering applies to training data, and to validation data
        # only when opt.filter_valid is set.
        if (corpus_type == "train" or opt.filter_valid) and tgt is not None:
            filter_pred = partial(
                inputters.filter_example, use_src_len=opt.data_type == "text",
                max_src_len=opt.src_seq_length, max_tgt_len=opt.tgt_seq_length)
        else:
            filter_pred = None

        # @shared
        if corpus_type == "train":
            existing_fields = None
            if opt.src_vocab != "":
                try:
                    # Try opt.src_vocab as torch-serialized fields first.
                    logger.info("Using existing vocabulary...")
                    existing_fields = torch.load(opt.src_vocab)
                except torch.serialization.pickle.UnpicklingError:
                    logger.info("Building vocab from text file...")
                    # NOTE(review): four arguments are passed here; confirm
                    # the `_load_vocab` in scope accepts a min-frequency
                    # parameter.
                    src_vocab, src_vocab_size = _load_vocab(
                        opt.src_vocab, "src", counters,
                        opt.src_words_min_frequency)
            else:
                src_vocab = None

            if opt.tgt_vocab != "":
                tgt_vocab, tgt_vocab_size = _load_vocab(
                    opt.tgt_vocab, "tgt", counters,
                    opt.tgt_words_min_frequency)
            else:
                tgt_vocab = None

        for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
            assert len(src_shard) == len(tgt_shard)
            logger.info("Building shard %d." % i)
            dataset = inputters.Dataset(
                fields,
                readers=([src_reader, tgt_reader]
                         if tgt_reader else [src_reader]),
                data=([("src", src_shard), ("tgt", tgt_shard)]
                      if tgt_reader else [("src", src_shard)]),
                dirs=([opt.src_dir, None]
                      if tgt_reader else [opt.src_dir]),
                sort_key=inputters.str2sortkey[opt.data_type],
                filter_pred=filter_pred)
            # Count tokens only when no ready-made fields were loaded.
            if corpus_type == "train" and existing_fields is None:
                for ex in dataset.examples:
                    for name, field in fields.items():
                        # Iterable fields yield (sub_name, sub_field) pairs;
                        # others are wrapped to share the loop below.
                        try:
                            f_iter = iter(field)
                        except TypeError:
                            f_iter = [(name, field)]
                            all_data = [getattr(ex, name, None)]
                        else:
                            all_data = getattr(ex, name)
                        for (sub_n, sub_f), fd in zip(f_iter, all_data):
                            # Skip sides whose vocab came from a file.
                            has_vocab = (sub_n == 'src' and
                                         src_vocab is not None) or \
                                        (sub_n == 'tgt' and
                                         tgt_vocab is not None)
                            if (hasattr(sub_f, 'sequential')
                                    and sub_f.sequential and not has_vocab):
                                val = fd
                                counters[sub_n].update(val)
            if maybe_id:
                shard_base = corpus_type + "_" + maybe_id
            else:
                shard_base = corpus_type
            data_path = "{:s}.{:s}.{:d}.pt".\
                format(opt.save_data, shard_base, i)
            dataset_paths.append(data_path)

            logger.info(" * saving %sth %s data shard to %s."
                        % (i, shard_base, data_path))

            dataset.save(data_path)

            # Release the shard before building the next one.
            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        # NOTE(review): `existing_fields` is assigned inside the corpus loop
        # above, so this relies on at least one training corpus being given.
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type,
                opt.share_vocab, opt.vocab_size_multiple,
                opt.src_vocab_size, opt.src_words_min_frequency,
                opt.tgt_vocab_size, opt.tgt_words_min_frequency)
        else:
            fields = existing_fields
        torch.save(fields, vocab_path)
def build_save_in_shards_using_shards_size(src_corpus, tgt_corpus, fields,
                                           corpus_type, opt):
    """Split a parallel corpus into text shards and save one dataset each.

    Divide src_corpus and tgt_corpus into smaller multiples
    src_copus and tgt corpus files, then build shards, each
    shard will have opt.shard_size samples except last shard.

    The reason we do this is to avoid taking up too much memory due
    to sucking in a huge corpus file.

    Args:
        src_corpus (str): path of the source text file.
        tgt_corpus (str): path of the target text file.
        fields: fields passed to ``inputters.build_dataset``.
        corpus_type (str): label used in the saved ``.pt`` file names.
        opt: preprocessing options (``shard_size``, ``save_data``, ...).

    Returns:
        list[str]: paths of the saved ``.pt`` files, in the order written.
    """
    with codecs.open(src_corpus, "r", encoding="utf-8") as fsrc:
        with codecs.open(tgt_corpus, "r", encoding="utf-8") as ftgt:
            src_data = fsrc.readlines()
            tgt_data = ftgt.readlines()

    # Shard file prefix: the path with its last "."-separated part removed.
    # NOTE(review): joining with "" also drops any other "." in the path;
    # kept as-is because the shard file names depend on it.
    src_corpus = "".join(src_corpus.split(".")[:-1])
    tgt_corpus = "".join(tgt_corpus.split(".")[:-1])

    shard_size = opt.shard_size
    num_shards = len(src_data) // shard_size

    # Line ranges of the full shards, then the remainder (if any) which
    # runs to the end of the data.
    bounds = [(x * shard_size, (x + 1) * shard_size)
              for x in range(num_shards)]
    if len(src_data) > num_shards * shard_size:
        bounds.append((num_shards * shard_size, None))

    for x, (start, stop) in enumerate(bounds):
        for prefix, lines in ((src_corpus, src_data),
                              (tgt_corpus, tgt_data)):
            # `with` guarantees the handle is closed even if writing fails.
            with codecs.open(prefix + ".{0}.txt".format(x), "w",
                             encoding="utf-8") as f:
                f.writelines(lines[start:stop])

    src_list = sorted(glob.glob(src_corpus + '.*.txt'))
    tgt_list = sorted(glob.glob(tgt_corpus + '.*.txt'))

    ret_list = []

    for index, src in enumerate(src_list):
        dataset = inputters.build_dataset(
            fields, opt.data_type,
            src_path=src,
            tgt_path=tgt_list[index],
            src_dir=opt.src_dir,
            src_seq_length=opt.src_seq_length,
            tgt_seq_length=opt.tgt_seq_length,
            src_seq_length_trunc=opt.src_seq_length_trunc,
            tgt_seq_length_trunc=opt.tgt_seq_length_trunc,
            dynamic_dict=opt.dynamic_dict,
            sample_rate=opt.sample_rate,
            window_size=opt.window_size,
            window_stride=opt.window_stride,
            window=opt.window,
            image_channel_size=opt.image_channel_size,
            reentrancies=opt.reentrancies)

        pt_file = "{:s}.{:s}.{:d}.pt".format(
            opt.save_data, corpus_type, index)

        # We save fields in vocab.pt seperately, so make it empty.
        dataset.fields = []

        logger.info(" * saving %sth %s data shard to %s."
                    % (index, corpus_type, pt_file))
        torch.save(dataset, pt_file)

        ret_list.append(pt_file)

        # Release the shard before building the next one.
        del dataset.examples
        gc.collect()
        del dataset
        gc.collect()

    return ret_list
def main(opt):
    """Validate options, build the training iterator and launch training.

    With ``opt.world_size > 1`` one process per GPU rank is spawned, plus a
    producer process that feeds them batches; otherwise ``single_main`` is
    called directly with device 0 (one GPU rank) or -1.

    Args:
        opt: parsed training options.
    """
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Several data ids -> one shard base name per id, iterated jointly.
    if len(opt.data_ids) > 1:
        shard_names = ["train_" + train_id for train_id in opt.data_ids]
        train_iter = build_dataset_iter_multiple(shard_names, fields, opt)
    else:
        only_id = opt.data_ids[0]
        shard_name = "train_" + only_id if only_id is not None else "train"
        train_iter = build_dataset_iter(shard_name, fields, opt)

    n_gpu = len(opt.gpu_ranks)

    if opt.world_size > 1:
        queues = []
        ctx = torch.multiprocessing.get_context('spawn')
        semaphore = ctx.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = ctx.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        workers = []
        for device_id in range(n_gpu):
            batch_queue = ctx.Queue(opt.queue_size)
            queues.append(batch_queue)
            worker = ctx.Process(
                target=run,
                args=(opt, device_id, error_queue, batch_queue, semaphore),
                daemon=True)
            workers.append(worker)
            worker.start()
            logger.info(" Starting process pid: %d " % worker.pid)
            error_handler.add_child(worker.pid)

        producer = ctx.Process(
            target=batch_producer,
            args=(train_iter, queues, semaphore, opt,),
            daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        # Wait for every trainer, then stop the batch producer.
        for worker in workers:
            worker.join()
        producer.terminate()
    elif n_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:  # case only CPU
        single_main(opt, -1)
def main():
    """Preprocess the corpora: build fields, save datasets, save the vocab.

    Options come from ``parse_args()``. Training and validation datasets
    are built and saved, then the vocabulary is built from both lists of
    saved files.

    Raises:
        AssertionError: if the deprecated ``-max_shard_size`` option is set.
    """
    opt = parse_args()

    if opt.max_shard_size > 0:
        # Adjacent literals avoid embedding a line continuation (and the
        # indentation after it) in the message text.
        raise AssertionError(
            "-max_shard_size is deprecated, please use "
            "-shard_size (number of examples) instead.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    src_nfeats = inputters.get_num_features(
        opt.data_type, opt.train_src, 'src')
    tgt_nfeats = inputters.get_num_features(
        opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    valid_dataset_files = build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # The vocabulary is built over the train and valid file lists together.
    build_save_vocab(train_dataset_files + valid_dataset_files, fields, opt)
def _maybe_update_dropout(self, step):
    """Apply a scheduled dropout value when ``step`` is one past its trigger.

    For every entry of ``self.dropout_steps`` equal to ``step - 1`` (and
    only when ``step > 1``), the matching entry of ``self.dropout`` is passed
    to ``self.model.update_dropout`` and the change is logged.
    """
    for idx, trigger_step in enumerate(self.dropout_steps):
        if not (step > 1 and step == trigger_step + 1):
            continue
        new_dropout = self.dropout[idx]
        self.model.update_dropout(new_dropout)
        logger.info("Updated dropout to %f from step %d"
                    % (new_dropout, step))
def log(self, *args, **kwargs):
    """Forward all positional and keyword arguments to ``logger.info``."""
    logger.info(*args, **kwargs)
def _gradient_accumulation(self, true_batches, normalization, total_stats,
                           report_stats):
    """Run forward/backward over ``true_batches`` and step the optimizer.

    With ``self.accum_count == 1`` gradients are zeroed and the optimizer
    is stepped for every truncated segment; with ``self.accum_count > 1``
    gradients are zeroed once up front and a single step is taken after
    all batches.

    Args:
        true_batches: iterable of batches to accumulate over.
        normalization: value forwarded to ``self.train_loss``.
        total_stats: statistics object updated with every batch's stats.
        report_stats: statistics object updated with every batch's stats
            and with the number of source words.
    """
    if self.accum_count > 1:
        self.optim.zero_grad()

    for k, batch in enumerate(true_batches):
        target_size = batch.tgt.size(0)
        # Truncated BPTT: reminder not compatible with accum > 1
        if self.trunc_size:
            trunc_size = self.trunc_size
        else:
            trunc_size = target_size

        # batch.src may be a (tensor, lengths) tuple or a bare tensor.
        src, src_lengths = batch.src if isinstance(batch.src, tuple) \
            else (batch.src, None)
        if src_lengths is not None:
            report_stats.n_src_words += src_lengths.sum().item()

        tgt_outer = batch.tgt
        # TODO(yida) train
        # Tag inputs are only read when the model has a "tag" generator.
        if "tag" in self.model.generators.keys():
            pos_src, _ = batch.tag_src \
                if isinstance(batch.tag_src, tuple) else (batch.tag_src, None)
            pos_outer = batch.tag_tgt
        else:
            pos_src = None
            pos_outer = None
        # pos_outer = batch.tag_tgt if hasattr(batch, "tag_tgt") else None

        # False for the first segment of a batch, True afterwards.
        bptt = False
        for j in range(0, target_size-1, trunc_size):
            # 1. Create truncated target.
            tgt = tgt_outer[j: j + trunc_size]
            # TODO(yida)
            pos_tgt = pos_outer[j: j + trunc_size] \
                if "tag" in self.model.generators.keys() else None

            # 2. F-prop all but generator.
            if self.accum_count == 1:
                self.optim.zero_grad()
            # TODO(yida) trainer
            # TODO(yida) temp rl
            # with torch.no_grad():
            outputs, attns, rnn_outs = self.model(src, tgt, pos_src, pos_tgt,
                                                  src_lengths, bptt=bptt)
            bptt = True

            # 3. Compute loss.
            try:
                # TODO(yida) temp rl
                # with torch.no_grad():
                loss, batch_stats = self.train_loss(
                    batch,
                    outputs,
                    attns,
                    # TODO(yida) trainer
                    rnn_outs,
                    normalization=normalization,
                    shard_size=self.shard_size,
                    trunc_start=j,
                    trunc_size=trunc_size)

                if loss is not None:
                    self.optim.backward(loss)

                total_stats.update(batch_stats)
                report_stats.update(batch_stats)

            except Exception:
                # Any failure in the loss/backward step is printed and
                # logged, and training continues with the next segment.
                traceback.print_exc()
                logger.info("At step %d, we removed a batch - accum %d",
                            self.optim.training_step, k)

            # 4. Update the parameters and statistics.
            if self.accum_count == 1:
                # Multi GPU gradient gather
                if self.n_gpu > 1:
                    grads = [p.grad.data for p in self.model.parameters()
                             if p.requires_grad
                             and p.grad is not None]
                    onmt.utils.distributed.all_reduce_and_rescale_tensors(
                        grads, float(1))
                self.optim.step()

            # If truncated, don't backprop fully.
            # TO CHECK
            # if dec_state is not None:
            #    dec_state.detach()
            if self.model.decoder.state is not None:
                self.model.decoder.detach_state()

    # in case of multi step gradient accumulation,
    # update only after accum batches
    if self.accum_count > 1:
        if self.n_gpu > 1:
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad
                     and p.grad is not None]
            onmt.utils.distributed.all_reduce_and_rescale_tensors(
                grads, float(1))
        self.optim.step()
def build_model(model_opt, opt, fields, checkpoint):
    """Build the base model, log it and return it.

    Args:
        model_opt: model options forwarded to ``build_base_model``.
        opt: options from which the GPU flag is derived via ``use_gpu``.
        fields: fields forwarded to ``build_base_model``.
        checkpoint: checkpoint forwarded to ``build_base_model``.

    Returns:
        The object returned by ``build_base_model``.
    """
    logger.info('Building model...')
    on_gpu = use_gpu(opt)
    built_model = build_base_model(model_opt, fields, on_gpu, checkpoint)
    logger.info(built_model)
    return built_model
def process_one_shard(corpus_params, params):
    """Build, count and save a single dataset shard.

    Args:
        corpus_params (tuple): ``(corpus_type, fields, src_reader,
            tgt_reader, align_reader, opt, existing_fields, src_vocab,
            tgt_vocab)``.
        params (tuple): ``(i, (src_shard, tgt_shard, align_shard,
            maybe_id, filter_pred))`` where ``i`` is the shard index.

    Returns:
        defaultdict[str, Counter]: counts collected from this shard; left
        empty unless ``corpus_type`` is ``"train"`` and ``existing_fields``
        is None.
    """
    corpus_type, fields, src_reader, tgt_reader, align_reader, opt,\
        existing_fields, src_vocab, tgt_vocab = corpus_params
    i, (src_shard, tgt_shard, align_shard, maybe_id, filter_pred) = params
    # create one counter per shard
    sub_sub_counter = defaultdict(Counter)
    assert len(src_shard) == len(tgt_shard)
    logger.info("Building shard %d." % i)

    # Reader, data and directory for each side; only src has a directory.
    src_data = {"reader": src_reader, "data": src_shard, "dir": opt.src_dir}
    tgt_data = {"reader": tgt_reader, "data": tgt_shard, "dir": None}
    align_data = {"reader": align_reader, "data": align_shard, "dir": None}
    _readers, _data, _dir = inputters.Dataset.config(
        [('src', src_data), ('tgt', tgt_data), ('align', align_data)])

    dataset = inputters.Dataset(
        fields, readers=_readers, data=_data, dirs=_dir,
        sort_key=inputters.str2sortkey[opt.data_type],
        filter_pred=filter_pred,
        corpus_id=maybe_id)
    # Count tokens only for training data without ready-made fields.
    if corpus_type == "train" and existing_fields is None:
        for ex in dataset.examples:
            # One count per example under the corpus id ("train" if None).
            sub_sub_counter['corpus_id'].update(
                ["train" if maybe_id is None else maybe_id])
            for name, field in fields.items():
                # The source side is not counted for audio/vec data.
                if (opt.data_type in ["audio", "vec"]) and name == "src":
                    continue
                # Iterable fields yield (sub_name, sub_field) pairs; others
                # are wrapped so both cases share the loop below.
                try:
                    f_iter = iter(field)
                except TypeError:
                    f_iter = [(name, field)]
                    all_data = [getattr(ex, name, None)]
                else:
                    all_data = getattr(ex, name)
                for (sub_n, sub_f), fd in zip(f_iter, all_data):
                    # Skip sides for which a vocab object was supplied.
                    has_vocab = (sub_n == 'src' and
                                 src_vocab is not None) or \
                                (sub_n == 'tgt' and
                                 tgt_vocab is not None)
                    if (hasattr(sub_f, 'sequential')
                            and sub_f.sequential and not has_vocab):
                        val = fd
                        sub_sub_counter[sub_n].update(val)
    if maybe_id:
        shard_base = corpus_type + "_" + maybe_id
    else:
        shard_base = corpus_type
    data_path = "{:s}.{:s}.{:d}.pt".\
        format(opt.save_data, shard_base, i)

    logger.info(" * saving %sth %s data shard to %s."
                % (i, shard_base, data_path))

    dataset.save(data_path)

    # Release the shard's memory before returning.
    del dataset.examples
    gc.collect()
    del dataset
    gc.collect()

    return sub_sub_counter
def main(opt, device_id):
    """Train one model on the given device.

    Restores options and fields from ``opt.train_from`` when set, otherwise
    loads the fields from ``opt.data + '.vocab.pt'``; then builds the model,
    optimizer, saver and trainer and runs training.

    Args:
        opt: parsed training options.
        device_id: device index forwarded to the option post-processing and
            the trainer builder.
    """
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)

    def _keep_storage(storage, loc):
        # map_location hook: return every storage unchanged.
        return storage

    # Load checkpoint if we resume from a previous training.
    if not opt.train_from:
        checkpoint = None
        model_opt = opt
        vocab = torch.load(opt.data + '.vocab.pt')
    else:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from, map_location=_keep_storage)

        # Start from the parser defaults and overlay the checkpoint's
        # options, so options added after the checkpoint was written still
        # have a value.
        defaults_parser = configargparse.ArgumentParser()
        opts.model_opts(defaults_parser)
        model_opt = defaults_parser.parse_known_args([])[0]
        model_opt.__dict__.update(checkpoint['opt'].__dict__)

        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features
    for side in ('src', 'tgt'):
        for name, field in fields[side]:
            try:
                sub_fields = iter(field)
            except TypeError:
                sub_fields = [(name, field)]
            for sub_name, sub_field in sub_fields:
                if not sub_field.use_vocab:
                    continue
                logger.info(' * %s vocab size = %d'
                            % (sub_name, len(sub_field.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    # this line is kind of a temporary kludge because different objects expect
    # fields to have a different structure
    flat_fields = dict(chain.from_iterable(fields.values()))

    train_iter = build_dataset_iter("train", flat_fields, opt)
    valid_iter = build_dataset_iter(
        "valid", flat_fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')

    trainer.train(
        train_iter,
        opt.train_steps,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def main(opt, device_id):
    """Train a model, optionally followed by comparable-data training.

    Builds the model, optimizer, saver and trainer, runs the regular
    training loop unless ``opt.no_base`` is set, and, when
    ``opt.comparable`` is set, runs ``opt.comp_epochs`` rounds of
    extraction and training through a ``Comparable`` object.

    Args:
        opt: parsed training options.
        device_id: device index forwarded to the option post-processing and
            the trainer builder.
    """
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)
    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opts values then overwrite it with opts from
        # the checkpoint. It's useful in order to re-train a model
        # after adding a new option (not set in checkpoint)
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = load_fields(first_dataset, opt, checkpoint)

    # Report src/tgt features.
    src_features, tgt_features = _collect_report_features(fields)
    for j, feat in enumerate(src_features):
        logger.info(' * src feature %d size = %d'
                    % (j, len(fields[feat].vocab)))
    for j, feat in enumerate(tgt_features):
        logger.info(' * tgt feature %d size = %d'
                    % (j, len(fields[feat].vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields,
                            optim, data_type, model_saver=model_saver)

    # The trainer receives factories, so each call builds a fresh iterator.
    def train_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(
            lazily_load_dataset("valid", opt), fields, opt, is_train=False)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    # NOTE(review): `== False` is kept deliberately; it differs from `not`
    # for values such as None.
    if opt.no_base == False:
        trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                      opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()

    if opt.comparable:
        logger.info('')
        logger.info('Beginning comparable data extraction and training.')

        # 1. Initialize Comparable object
        comp = Comparable(model, trainer, fields, logger, opt)

        # 2. Infer similarity threshold from training data

        for epoch in range(opt.comp_epochs):
            # 3. Update threshold if dynamic
            if opt.threshold_dynamics != 'static' and epoch != 0:
                comp.update_threshold(opt.threshold_dynamics,
                                      opt.infer_threshold)

            # 4. Extract parallel data and train
            #if opt.match_articles:
            #    comparable_data = comp.match_articles(opt.match_articles)
            #    train_stats = comp.extract_and_train(comparable_data)
            #else:
            # train_stats is assigned but not read afterwards.
            train_stats = comp.extract_and_train(opt.comparable_data)

            # 5. Validate on validation set
            if opt.no_valid == False:
                # NOTE(review): unlike valid_iter_fct above, this iterator
                # is built without is_train=False - confirm it is intended.
                valid_iter = build_dataset_iter(
                    lazily_load_dataset("valid", opt), fields, opt)
                # valid_stats is assigned but not read afterwards.
                valid_stats = comp.validate(valid_iter)

            # 6. Drop a checkpoint if needed
            comp.trainer.model_saver._save(epoch)
def train(self, train_iter, train_steps, save_checkpoint_steps=5000,
          valid_iter=None, valid_steps=10000):
    """
    The main training loop by iterating over `train_iter` and possibly
    running validation on `valid_iter`.

    Args:
        train_iter: A generator that returns the next training batch.
        train_steps: Run training for this many iterations.
        save_checkpoint_steps: Save a checkpoint every this many
          iterations.
        valid_iter: A generator that returns the next validation batch.
        valid_steps: Run evaluation every this many iterations.

    Returns:
        The gathered statistics.
    """
    if valid_iter is None:
        logger.info('Start training loop without validation...')
    else:
        logger.info('Start training loop and validate every %d steps...',
                    valid_steps)

    total_stats = onmt.utils.Statistics()
    report_stats = onmt.utils.Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    # The corpus size is only read from the iterator when
    # gpt2_params_std is positive; otherwise 0 is passed along.
    if self.gpt2_params_std > 0:
        total_size = train_iter.total_size
    else:
        total_size = 0

    # With several GPUs each rank takes every n_gpu-th item, starting at
    # its own rank.
    if self.n_gpu > 1:
        train_iter = itertools.islice(
            train_iter, self.gpu_rank, None, self.n_gpu)

    # Bind `step` before the loop so the final checkpoint save below does
    # not fail with an unbound local when the iterator yields nothing.
    step = self.optim.training_step

    for i, (batches, normalization) in enumerate(
            self._accum_batches(train_iter)):
        step = self.optim.training_step

        if self.gpu_verbose_level > 1:
            logger.info("GpuRank %d: index: %d", self.gpu_rank, i)
        if self.gpu_verbose_level > 0:
            logger.info("GpuRank %d: reduce_counter: %d "
                        "n_minibatch %d"
                        % (self.gpu_rank, i + 1, len(batches)))

        # Sum the normalization term over all ranks.
        if self.n_gpu > 1:
            normalization = sum(
                onmt.utils.distributed.all_gather_list(normalization))

        self._gradient_accumulation(
            batches, normalization, total_stats,
            report_stats, total_size)

        if self.average_decay > 0 and i % self.average_every == 0:
            self._update_average(step)

        report_stats = self._maybe_report_training(
            step, train_steps,
            self.optim.learning_rate(),
            report_stats)

        if valid_iter is not None and step % valid_steps == 0:
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: validate step %d'
                            % (self.gpu_rank, step))
            valid_stats = self.validate(
                valid_iter, moving_average=self.moving_average)
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: gather valid stat '
                            'step %d' % (self.gpu_rank, step))
            valid_stats = self._maybe_gather_stats(valid_stats)
            if self.gpu_verbose_level > 0:
                logger.info('GpuRank %d: report stat step %d'
                            % (self.gpu_rank, step))
            self._report_step(self.optim.learning_rate(),
                              step, valid_stats=valid_stats)

        if (self.model_saver is not None
                and (save_checkpoint_steps != 0
                     and step % save_checkpoint_steps == 0)):
            self.model_saver.save(step, moving_average=self.moving_average)

        # train_steps <= 0 means "run until the iterator is exhausted".
        if train_steps > 0 and step >= train_steps:
            break

    # Always save once more when the loop ends.
    if self.model_saver is not None:
        self.model_saver.save(step, moving_average=self.moving_average)
    return total_stats
def main(opt, device_id):
    """Train one model on the given device.

    Restores model options and fields from ``opt.train_from`` when set
    (optionally overriding a few options when fine-tuning), otherwise loads
    the fields from ``opt.data + '.vocab.pt'``; then builds the model,
    optimizer, saver and trainer and runs training.

    Args:
        opt: parsed, validated and updated training options.
        device_id: device index forwarded to the process configuration and
            the trainer builder.
    """
    # NOTE: It's important that ``opt`` has been validated and updated
    # at this point.
    configure_process(opt, device_id)
    init_logger(opt.log_file)

    def _keep_storage(storage, loc):
        # map_location hook: return every storage unchanged.
        return storage

    # Load checkpoint if we resume from a previous training.
    if not opt.train_from:
        checkpoint = None
        model_opt = opt
        vocab = torch.load(opt.data + '.vocab.pt')
    else:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from, map_location=_keep_storage)

        model_opt = ArgumentParser.ckpt_model_opts(checkpoint["opt"])
        ArgumentParser.update_model_opts(model_opt)
        ArgumentParser.validate_model_opts(model_opt)
        if opt.fine_tune:
            # Fine-tuning takes these values from the command line instead
            # of the checkpoint.
            for option_name in ('learning_rate', 'warmup_steps', 'n_clusters'):
                setattr(model_opt, option_name, getattr(opt, option_name))
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # Report src and tgt vocab sizes, including for features
    for side in ('src', 'tgt'):
        side_field = fields[side]
        try:
            sub_fields = iter(side_field)
        except TypeError:
            sub_fields = [(side, side_field)]
        for sub_name, sub_field in sub_fields:
            if not sub_field.use_vocab:
                continue
            logger.info(' * %s vocab size = %d'
                        % (sub_name, len(sub_field.vocab)))

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = Optimizer.from_opt(model, opt, checkpoint=checkpoint)

    # Build model saver
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(
        opt, device_id, model, fields, optim, model_saver=model_saver)

    train_iter = build_dataset_iter("train", fields, opt)
    valid_iter = build_dataset_iter(
        "valid", fields, opt, is_train=False)

    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')

    # A single pass over the data overrides any positive step budget.
    step_budget = opt.train_steps
    if opt.single_pass and step_budget > 0:
        logger.warning("Option single_pass is enabled, ignoring train_steps.")
        step_budget = 0

    trainer.train(
        train_iter,
        step_budget,
        save_checkpoint_steps=opt.save_checkpoint_steps,
        valid_iter=valid_iter,
        valid_steps=opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """Build the vocabulary of every field from the training dataset files.

    Args:
        train_dataset_files: a list of train dataset pt file.
        fields (dict): fields to build vocab for.
        data_type: "text", "img" or "audio"?
        share_vocab(bool): share source and target vocabulary?
        src_vocab_path(string): Path to src vocabulary file.
        src_vocab_size(int): size of the source vocabulary.
        src_words_min_frequency(int): the minimum frequency needed to
                include a source word in the vocabulary.
        tgt_vocab_path(string): Path to tgt vocabulary file.
        tgt_vocab_size(int): size of the target vocabulary.
        tgt_words_min_frequency(int): the minimum frequency needed to
                include a target word in the vocabulary.

    Returns:
        Dict of Fields
    """
    # Pop src from fields to lower memory use when training with
    # image or audio data.
    if data_type == 'img' or data_type == 'audio':
        fields.pop("src")

    counters = {k: Counter() for k in fields}

    # Load vocabulary
    if src_vocab_path:
        src_vocab = load_vocabulary(src_vocab_path, "src")
        src_vocab_size = len(src_vocab)
        logger.info('Loaded source vocab has %d tokens.' % src_vocab_size)
        for i, token in enumerate(src_vocab):
            # keep the order of tokens specified in the vocab file by
            # adding them to the counter with decreasing counting values
            counters['src'][token] = src_vocab_size - i
    else:
        src_vocab = None

    if tgt_vocab_path:
        tgt_vocab = load_vocabulary(tgt_vocab_path, "tgt")
        tgt_vocab_size = len(tgt_vocab)
        logger.info('Loaded target vocab has %d tokens.' % tgt_vocab_size)
        for i, token in enumerate(tgt_vocab):
            # same decreasing-count trick as for the source vocab
            counters['tgt'][token] = tgt_vocab_size - i
    else:
        tgt_vocab = None

    # Only sequential fields without a preloaded vocab are counted from
    # the data. This does not depend on the example, so decide it once.
    counted_keys = [
        k for k in fields
        if fields[k].sequential
        and not ((k == 'src' and src_vocab) or (k == 'tgt' and tgt_vocab))
    ]

    for i, path in enumerate(train_dataset_files):
        dataset = torch.load(path)
        logger.info(" * reloading %s." % path)
        for ex in dataset.examples:
            for k in counted_keys:
                # Counter.update(None) is a no-op, so a missing
                # attribute simply contributes nothing.
                counters[k].update(getattr(ex, k, None))

        # Drop the none-using from memory but keep the last
        if i < len(train_dataset_files) - 1:
            dataset.examples = None
            gc.collect()
            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    _build_field_vocab(fields["tgt"], counters["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    logger.info(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    # All datasets have same num of n_tgt_features,
    # getting the last one is OK.
    n_tgt_feats = sum('tgt_feat_' in k for k in fields)
    for j in range(n_tgt_feats):
        key = "tgt_feat_" + str(j)
        _build_field_vocab(fields[key], counters[key])
        logger.info(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

    if data_type == 'text':
        _build_field_vocab(fields["src"], counters["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        logger.info(" * src vocab size: %d." % len(fields["src"].vocab))

        # All datasets have same num of n_src_features,
        # getting the last one is OK.
        n_src_feats = sum('src_feat_' in k for k in fields)
        for j in range(n_src_feats):
            key = "src_feat_" + str(j)
            _build_field_vocab(fields[key], counters[key])
            logger.info(" * %s vocab size: %d." %
                        (key, len(fields[key].vocab)))

        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")
            _merge_field_vocabs(
                fields["src"], fields["tgt"],
                vocab_size=src_vocab_size,
                min_freq=src_words_min_frequency)
            logger.info(" * merged vocab size: %d." % len(fields["src"].vocab))

    return fields
def main(opt):
    """Post-process ``opt``, build model/optimizer/trainer and run training.

    When ``opt.train_from`` is set, the checkpoint at that path is loaded
    and its ``'opt'`` entry is used as the model options; otherwise ``opt``
    itself serves as the model options.

    Args:
        opt: parsed training options; passed through
            ``training_opt_postprocessing`` before use.
    """
    opt = training_opt_postprocessing(opt)
    init_logger(opt.log_file)

    if not opt.train_from:
        # Fresh run: nothing to resume from.
        checkpoint = None
        model_opt = opt
    else:
        # Resume from a previous training run.
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(
            opt.train_from,
            map_location=lambda storage, loc: storage)
        model_opt = checkpoint['opt']

    # Peek at the first dataset to learn the data_type
    # (all datasets share the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Fields produced by the preprocess phase.
    fields = _load_fields(first_dataset, data_type, opt, checkpoint)

    # Report the vocab size of each src/tgt feature.
    src_features, tgt_features = _collect_report_features(fields)
    feature_reports = (
        (' * src feature %d size = %d', src_features),
        (' * tgt feature %d size = %d', tgt_features),
    )
    for template, features in feature_reports:
        for idx, feat_name in enumerate(features):
            logger.info(template % (idx, len(fields[feat_name].vocab)))

    # Model and parameter report.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    for template, count in (('encoder: %d', enc),
                            ('decoder: %d', dec),
                            ('* number of parameters: %d', n_params)):
        logger.info(template % count)
    _check_save_model_path(opt)

    optim = build_optim(model, opt, checkpoint)
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    # NOTE: this build_trainer call passes opt.wpe_pair_size in the slot
    # where the stock call passed data_type.
    trainer = build_trainer(
        opt, model, fields, optim, opt.wpe_pair_size,
        model_saver=model_saver)

    def _iter_factory(corpus_type):
        # The trainer receives zero-argument callables that build a new
        # dataset iterator each time they are invoked.
        return lambda: build_dataset_iter(
            lazily_load_dataset(corpus_type, opt), fields, opt)

    # Do training.
    trainer.train(
        _iter_factory("train"),
        _iter_factory("valid"),
        opt.train_steps,
        opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()
pass if os.path.isdir(tmp_dir): shutil.rmtree(tmp_dir) def rouge_results_to_str(results_dict): return ">> ROUGE(1/2/3/L/SU4): {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}".format( results_dict["rouge_1_f_score"] * 100, results_dict["rouge_2_f_score"] * 100, results_dict["rouge_3_f_score"] * 100, results_dict["rouge_l_f_score"] * 100, results_dict["rouge_su*_f_score"] * 100) if __name__ == "__main__": init_logger('test_rouge.log') parser = argparse.ArgumentParser() parser.add_argument('-c', type=str, default="candidate.txt", help='candidate file') parser.add_argument('-r', type=str, default="reference.txt", help='reference file') args = parser.parse_args() if args.c.upper() == "STDIN": candidates = sys.stdin else: candidates = codecs.open(args.c, encoding="utf-8") references = codecs.open(args.r, encoding="utf-8") results_dict = test_rouge(candidates, references) logger.info(rouge_results_to_str(results_dict))