def write_parallel(data, src_file, tgt_file):
    count = 0
    with IO.writer(src_file) as src_f, IO.writer(tgt_file) as tgt_f:
        for src_seq, tgt_seq in data:
            src_seq = ' '.join(map(str, src_seq))
            tgt_seq = ' '.join(map(str, tgt_seq))
            src_f.write(f'{src_seq}\n')
            tgt_f.write(f'{tgt_seq}\n')
            count += 1
    log.info(f"Wrote {count} records to {src_file} and {tgt_file}")

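# A minimal usage sketch for write_parallel (not from the original source): it assumes
# `data` yields (source_ids, target_ids) pairs; the file names are illustrative, and
# IO.writer is assumed to handle the given paths transparently.
example_data = [([2, 15, 7, 3], [2, 9, 4, 3]),
                ([2, 21, 3], [2, 30, 11, 3])]
write_parallel(example_data, 'train.src.ids', 'train.tgt.ids')
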
def train(cls, model_type: str, vocab_size: int, model_path: Union[Path, str], files: List[str],
          tok_coverage=0.9999, **kwargs):
    # Note: char_coverage is abused as subword_coverage
    hub_api = cls.load_hub_model(model_type)
    bpe = hub_api.bpe
    dicto = hub_api.task.dictionary

    freqs = coll.Counter()
    lines = IO.get_liness(*files)
    for line in tqdm(lines, mininterval=2, dynamic_ncols=True, unit='line'):
        freqs.update(bpe.encode(line).split())
    total_toks = sum(freqs.values())
    log.info(f"Found {len(freqs)} bpe types and {total_toks} toks")

    freqs = list(sorted(freqs.items(), reverse=True, key=lambda x: x[1]))
    vocabulary, oovs = [], []
    cumulative = 0
    for t, f in freqs:
        if cumulative / total_toks <= tok_coverage:
            vocabulary.append((t, f))
            cumulative += f
        else:
            oovs.append((t, f))
    oovs_str = ' '.join(f'{t}:{f}' for t, f in oovs)
    log.info(f'Excluded {len(oovs)} types as OOVs.\n:{oovs_str}')
    log.info(f'Included {len(vocabulary)} types as in vocabulary; '
             f'Coverage = {cumulative / total_toks:g}')

    # TODO: mapping should be list[int] with one-to-one map
    types, indices = [], {}
    for typ, new_idx in cls.reserved():
        assert len(types) == new_idx
        types.append(typ)
        old_idx = dicto.indices.get(typ, -1)
        indices[typ] = [new_idx, old_idx]
    for typ, freq in vocabulary:
        # [new index, old index]
        indices[typ] = [len(types), dicto.indices.get(typ, -1)]
        types.append(typ)
    data = {'model_id': model_type, 'mapping': indices}
    with IO.writer(model_path) as wrtr:
        yaml.dump(data, wrtr)
    return cls(model_path)

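# A hedged usage sketch (not from the original source): `HubCodec` is a placeholder for
# whichever class defines this classmethod; it must supply load_hub_model() and reserved().
# The hub model id and file names are illustrative; vocab_size is accepted by the signature,
# but type selection in this method is driven by tok_coverage.
codec = HubCodec.train(model_type='transformer.wmt19.en-de', vocab_size=-1,
                       model_path='hub_vocab.yml', files=['train.en', 'train.de'])
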
def store_model(self, epoch: int, model, train_score: float, val_score: float, keep: int,
                prefix='model', keeper_sort='step'):
    """
    saves model to a given path
    :param epoch: epoch number of model
    :param model: model object itself
    :param train_score: score of model on training split
    :param val_score: score of model on validation split
    :param keep: number of good models to keep, bad models will be deleted
    :param prefix: prefix to store model. default is "model"
    :param keeper_sort: criteria for choosing the old or bad models for deletion.
        Choices: {'total_score', 'step'}
    :return:
    """
    # TODO: improve this by skipping the model save if the model is not good enough to be saved
    if self.read_only:
        log.warning("Ignoring the store request; experiment is read-only")
        return
    name = f'{prefix}_{epoch:03d}_{train_score:.6f}_{val_score:.6f}.pkl'
    path = self.model_dir / name
    log.info(f"Saving epoch {epoch} to {path}")
    torch.save(model, str(path))

    del_models = []
    if keeper_sort == 'total_score':
        del_models = self.list_models(sort='total_score', desc=False)[keep:]
    elif keeper_sort == 'step':
        del_models = self.list_models(sort='step', desc=True)[keep:]
    else:
        # original code constructed the Exception without raising it; raise it so bad input fails loudly
        raise Exception(f'Sort criteria {keeper_sort} not understood')
    for d_model in del_models:
        log.info(f"Deleting model {d_model}. Keep={keep}, sort={keeper_sort}")
        os.remove(str(d_model))

    with IO.writer(os.path.join(self.model_dir, 'scores.tsv'), append=True) as f:
        cols = [str(epoch), datetime.now().isoformat(), name,
                f'{train_score:g}', f'{val_score:g}']
        f.write('\t'.join(cols) + '\n')

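# A hedged usage sketch (not from the original source): `exp` stands for whichever
# experiment object defines store_model; the scores and keep count are illustrative.
exp.store_model(epoch=3, model=model, train_score=1.234567, val_score=2.345678,
                keep=5, keeper_sort='step')
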
def train(cls, model_type: str, vocab_size: int, model_path: str, files: List[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0,
          dedup=True, spark=None):
    """
    :param model_type: word, char, bpe
    :param vocab_size: vocabulary size
    :param model_path: where to store vocabulary model
    :param files: text for creating vocabulary
    :param no_split_toks:
    :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
    :return:
    """
    assert not no_split_toks, 'not supported in nlcodec yet'
    from nlcodec import learn_vocab, term_freq
    kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
    if not spark:
        inp = IO.get_liness(*files)
    else:
        # extract and store frequencies to this file
        stats_file = model_path + '.termfreqs'
        if not Path(stats_file).exists():
            log.info("Extracting term frequencies... ")
            paths = [f if isinstance(f, Path) else Path(f) for f in files]
            wfs, chfs, n_lines = term_freq.word_counts(paths=paths, dedup=dedup, spark=spark)
            log.info(f"Lines = {n_lines:,}, Word Types: {len(wfs):,} Char Types: {len(chfs):,}")
            stats = chfs if model_type == 'char' else wfs
            log.info(f"Writing frequencies to {stats_file}")
            with IO.writer(stats_file) as out:
                term_freq.write_stats(stats=stats, out=out, line_count=n_lines)
        kwargs['term_freqs'] = True
        inp = IO.get_lines(stats_file, delim='\n')
    learn_vocab(inp=inp, level=model_type, model=model_path, vocab_size=vocab_size, **kwargs)
    return cls(model_path)

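# A hedged usage sketch (not from the original source): `NLCodecFactory` is a placeholder
# for the class that defines this classmethod; paths and the vocabulary size are
# illustrative. Passing a SparkSession via `spark=` switches to the term-frequency path above.
codec = NLCodecFactory.train(model_type='bpe', vocab_size=8000, model_path='bpe.8k.model',
                             files=['train.src', 'train.tgt'])
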
def shell_pipe(cls, cmd_line, inp, out):
    """
    :param cmd_line: shell command line
    :param inp: input file, to read records
    :param out: output file to store records
    :return:
    """
    # the original log call was missing the f-string prefix, so the command was not interpolated
    log.info(f"Shell cmd:: {cmd_line}")
    with IO.reader(inp) as rdr, IO.writer(out) as wtr:
        proc = subprocess.Popen(cmd_line, stdin=rdr, stdout=wtr, shell=True)
        proc.wait()
    log.info("Shell cmd:: Done")

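# A hedged usage sketch (not from the original source): `Pipeline` is a placeholder for the
# class that defines shell_pipe; the command and file names are illustrative.
Pipeline.shell_pipe("sed 's/foo/bar/g'", 'input.txt', 'output.txt')
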
def save_embeddings(self, step, train_loss, val_loss, txt=True):
    matrix = self.model.emb.weight
    vocab = self.exp.shared_vocab
    words = [vocab.id_to_piece(i) for i in range(len(vocab))]
    self.tbd.add_embedding(matrix, metadata=words, global_step=step)

    ext = 'txt.gz' if txt else 'pkl'
    path = self.exp.model_dir / f'embeddings_{step}_{train_loss:.6f}_{val_loss:.6f}.{ext}'
    log.info(f"writing embedding after step {step} to {path}")
    if txt:
        with IO.writer(path) as w:
            w.write(f'{matrix.shape[0]} {matrix.shape[1]}\n')
            for i in range(matrix.shape[0]):
                word = words[i]
                vect = ' '.join(f'{x:g}' for x in matrix[i])
                w.write(f'{word} {vect}\n')
    else:
        with path.open('wb') as f:
            # detach the grad-tracking parameter before converting; the original stored the
            # unbound .numpy method instead of calling it
            data = {'words': words, 'vectors': matrix.detach().cpu().numpy()}
            pickle.dump(data, f)

def train(self, steps: int, batch_size: int):
    log.info(f"Going to train for {steps} steps")
    batches = self.exp.get_combo_data(batch_size=batch_size, steps=steps)
    with tqdm(batches, total=steps, unit='step', dynamic_ncols=True) as data_bar:
        for i, batch in enumerate(data_bar):
            batch = batch.to(device)
            y_probs = self.combo(batch)  # B x T x V
            loss = self.loss_func(y_probs, y_seqs=batch.y_seqs, norm=batch.y_toks)
            wt_str = ','.join(f'{wt:g}' for wt in self.combo.weight)
            progress_msg = f'loss={loss:g}, weights={wt_str}'
            data_bar.set_postfix_str(progress_msg, refresh=False)

    weights = dict(zip([str(x) for x in self.combo.model_paths],
                       self.combo.model_weights.tolist()))
    log.info(f"Training finished. {weights}")
    with IO.writer(self.w_file) as wtr:
        yaml.dump(dict(weights=weights), wtr, default_flow_style=False)

def decode_eval_file(self, decoder, src: Union[Path, List[str]], out_file: Path,
                     ref: Optional[Union[Path, List[str]]], lowercase: bool = True,
                     **dec_args) -> float:
    if (out_file.exists() and out_file.stat().st_size > 0
            and line_count(out_file) == (len(src) if isinstance(src, list) else line_count(src))):
        log.warning(f"{out_file} exists and has desired number of lines. Skipped...")
    else:
        if isinstance(src, Path):
            log.info(f"decoding {src.name}")
            src = list(IO.get_lines(src))
        if isinstance(ref, Path):
            ref = list(IO.get_lines(ref))
        with IO.writer(out_file) as out:
            decoder.decode_file(src, out, **dec_args)
    detok_hyp = self.detokenize(out_file)
    if ref:
        return self.evaluate_file(detok_hyp, ref, lowercase=lowercase)

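# A hedged usage sketch (not from the original source): `exp` and `decoder` stand for the
# objects that provide these methods; paths are illustrative, and any extra keyword
# arguments are forwarded to decoder.decode_file via **dec_args.
score = exp.decode_eval_file(decoder, src=Path('test.src'), out_file=Path('test.out.tsv'),
                             ref=Path('test.ref'), lowercase=True)
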
def write_lines(lines, path):
    log.info(f"Storing data at {path}")
    with IO.writer(path) as f:
        for line in lines:
            f.write(line)
            f.write('\n')

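# A minimal usage sketch for write_lines (not from the original source); the path is illustrative.
write_lines(['hello world', 'how are you'], 'sample.txt')
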
def _write_dict(dict, path: Path):
    with IO.writer(path) as out:
        for key, val in dict.items():
            out.write(f"{key}\t{val}\n")