예제 #1
0
파일: dummy.py 프로젝트: MGheini/rtg
def write_parallel(data, src_file, tgt_file):
    """Write paired sequences to two line-aligned text files.

    Every item of ``data`` is unpacked into a (source, target) pair. The
    elements of each side are converted with ``str`` and joined by single
    spaces, producing one line per record in each output.

    :param data: iterable of ``(src_seq, tgt_seq)`` pairs
    :param src_file: destination of the source side, handed to ``IO.writer``
    :param tgt_file: destination of the target side, handed to ``IO.writer``
    """
    n_written = 0
    with IO.writer(src_file) as src_out, IO.writer(tgt_file) as tgt_out:
        for n_written, (src_items, tgt_items) in enumerate(data, start=1):
            # Render both sides before writing either, so a bad record
            # never leaves the two files out of step.
            src_line = ' '.join(str(item) for item in src_items)
            tgt_line = ' '.join(str(item) for item in tgt_items)
            src_out.write(src_line + '\n')
            tgt_out.write(tgt_line + '\n')
    log.info(f"Wrote {n_written} records to {src_file} and {tgt_file}")
예제 #2
0
파일: codec.py 프로젝트: isi-nlp/rtg
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: Union[Path, str],
              files: List[str],
              tok_coverage=0.9999,
              **kwargs):
        """
        Derives a reduced vocabulary on top of a hub model's BPE and stores the
        old-to-new index mapping as YAML.

        :param model_type: identifier given to ``cls.load_hub_model``; also stored as ``model_id``
        :param vocab_size: accepted but not used by this method
        :param model_path: where the YAML mapping is written
        :param files: text files whose lines are BPE-encoded and counted
        :param tok_coverage: types are admitted, most frequent first, for as long as the
            fraction of tokens covered so far is <= this value; the rest become OOVs
        :param kwargs: accepted but not used by this method
        :return: ``cls(model_path)``
        """
        # Note: char_coverage is abused as subword_coverage
        hub = cls.load_hub_model(model_type)
        encoder = hub.bpe
        hub_dict = hub.task.dictionary

        # Count how often each BPE type occurs across all input lines.
        type_counts = coll.Counter()
        progress = tqdm(IO.get_liness(*files), mininterval=2,
                        dynamic_ncols=True, unit='line')
        for text in progress:
            type_counts.update(encoder.encode(text).split())
        total_toks = sum(type_counts.values())
        log.info(f"Found {len(type_counts)} bpe types and {total_toks} toks")

        # Walk the types from most to least frequent; the coverage test looks at
        # what has been admitted *before* the current type.
        vocabulary, oovs = [], []
        covered = 0
        for entry in type_counts.most_common():
            if covered / total_toks <= tok_coverage:
                vocabulary.append(entry)
                covered += entry[1]
            else:
                oovs.append(entry)

        oovs_str = ' '.join(f'{typ}:{freq}' for typ, freq in oovs)
        log.info(f'Excluded {len(oovs)} types as OOVs.\n:{oovs_str}')
        log.info(f'Included {len(vocabulary)} types as in vocabulary; '
                 f'Coverage = {covered / total_toks:g}')
        # TODO: mapping should be list[int] with one on one map
        types, indices = [], {}
        # Reserved types come first and must land on their declared indices.
        for typ, new_idx in cls.reserved():
            assert len(types) == new_idx
            types.append(typ)
            indices[typ] = [new_idx, hub_dict.indices.get(typ, -1)]

        # Remaining types get consecutive new indices; value is [new index, old index],
        # with -1 when the hub dictionary does not know the type.
        for typ, _ in vocabulary:
            new_idx = len(types)
            types.append(typ)
            indices[typ] = [new_idx, hub_dict.indices.get(typ, -1)]

        with IO.writer(model_path) as wrtr:
            yaml.dump({'model_id': model_type, 'mapping': indices}, wrtr)
        return cls(model_path)
예제 #3
0
파일: exp.py 프로젝트: MGheini/rtg
    def store_model(self,
                    epoch: int,
                    model,
                    train_score: float,
                    val_score: float,
                    keep: int,
                    prefix='model',
                    keeper_sort='step'):
        """
        saves model to a given path
        :param epoch: epoch number of model
        :param model: model object itself
        :param train_score: score of model on training split
        :param val_score: score of model on validation split
        :param keep: number of good models to keep, bad models will be deleted
        :param prefix: prefix to store model. default is "model"
        :param keeper_sort: criteria for choosing the old or bad models for deletion.
            Choices: {'total_score', 'step'}
        :return: None
        :raises ValueError: if ``keeper_sort`` is not one of the supported choices
        """
        # TODO: improve this by skipping the model save if the model is not good enough to be saved
        if self.read_only:
            log.warning("Ignoring the store request; experiment is readonly")
            return
        # Reject an unknown criterion up front, before anything is written to disk.
        # (Previously the exception object was created but never raised.)
        if keeper_sort not in ('total_score', 'step'):
            raise ValueError(f'Sort criteria {keeper_sort} not understood')

        name = f'{prefix}_{epoch:03d}_{train_score:.6f}_{val_score:.6f}.pkl'
        path = self.model_dir / name
        log.info(f"Saving epoch {epoch} to {path}")
        torch.save(model, str(path))

        # Everything past the first `keep` entries of the sorted listing is deleted.
        if keeper_sort == 'total_score':
            del_models = self.list_models(sort='total_score',
                                          desc=False)[keep:]
        else:  # keeper_sort == 'step'
            del_models = self.list_models(sort='step', desc=True)[keep:]
        for d_model in del_models:
            log.info(
                f"Deleting model {d_model} . Keep={keep}, sort={keeper_sort}")
            os.remove(str(d_model))

        # Append one row per stored model: epoch, timestamp, file name, scores.
        with IO.writer(os.path.join(self.model_dir, 'scores.tsv'),
                       append=True) as f:
            cols = [
                str(epoch),
                datetime.now().isoformat(), name, f'{train_score:g}',
                f'{val_score:g}'
            ]
            f.write('\t'.join(cols) + '\n')
예제 #4
0
파일: codec.py 프로젝트: isi-nlp/rtg
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: str,
              files: List[str],
              no_split_toks: Optional[List[str]] = None,
              char_coverage: float = 0,
              dedup=True,
              spark=None):
        """
        Learns an nlcodec vocabulary and returns a codec loaded from it.

        :param model_type: word, char, bpe
        :param vocab_size: vocabulary size
        :param model_path: where to store vocabulary model
        :param files: text for creating vocabulary
        :param no_split_toks: must be empty/None; not supported in nlcodec yet
        :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
        :param dedup: forwarded to ``term_freq.word_counts`` (only used when ``spark`` is given)
        :param spark: when given, term frequencies are extracted with it and cached in
            ``<model_path>.termfreqs``; the vocabulary is then learned from that file
        :return: ``cls(model_path)``
        """
        assert not no_split_toks, 'not supported in nlcodec yet'
        from nlcodec import learn_vocab, term_freq
        kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
        if not spark:
            inp = IO.get_liness(*files)
        else:
            # extract and store frequencies to this file
            # (f-string so that a Path model_path works as well as a str)
            stats_file = f'{model_path}.termfreqs'
            if not Path(stats_file).exists():
                log.info("Extracting term frequencies... ")
                paths = [f if isinstance(f, Path) else Path(f) for f in files]
                wfs, chfs, n_lines = term_freq.word_counts(paths=paths,
                                                           dedup=dedup,
                                                           spark=spark)
                log.info(
                    f"Lines = {n_lines:,}, Word Types: {len(wfs):,} Char Types:{len(chfs):,}"
                )
                stats = chfs if model_type == 'char' else wfs
                log.info(f"Writing frequencies to {stats_file}")
                with IO.writer(stats_file) as out:
                    term_freq.write_stats(stats=stats,
                                          out=out,
                                          line_count=n_lines)
            # The input below is always the term-frequency file on this path, whether it
            # was just written or reused from an earlier run, so the flag must be set in
            # both cases (it used to be set only when the file was freshly created).
            kwargs['term_freqs'] = True
            inp = IO.get_lines(stats_file, delim='\n')

        learn_vocab(inp=inp,
                    level=model_type,
                    model=model_path,
                    vocab_size=vocab_size,
                    **kwargs)
        return cls(model_path)
예제 #5
0
    def shell_pipe(cls, cmd_line, inp, out):
        """
        Runs a shell command with stdin fed from one file and stdout stored in another,
        and waits for it to finish.

        :param cmd_line: shell commandlines
        :param inp: input file, to read records
        :param out:  output file to store records
        :return: None
        """
        # NOTE(review): the command is run with shell=True, so cmd_line must come from a
        # trusted source; it is interpreted by the shell verbatim.
        log.info(f"Shell cmd:: {cmd_line}")  # was missing the f-prefix, logging the placeholder
        with IO.reader(inp) as rdr, IO.writer(out) as wtr:
            proc = subprocess.Popen(cmd_line,
                                    stdin=rdr,
                                    stdout=wtr,
                                    shell=True)
            exit_code = proc.wait()
        # Surface failures in the log without changing the best-effort contract.
        if exit_code != 0:
            log.warning(f"Shell cmd:: exited with non-zero status {exit_code}")
        log.info("Shell cmd:: Done")
예제 #6
0
파일: word2vec.py 프로젝트: MGheini/rtg
 def save_embeddings(self, step, train_loss, val_loss, txt=True):
     """
     Logs the embedding matrix to ``self.tbd`` and writes it to the model directory.

     :param step: training step; used as the global step and in the file name
     :param train_loss: training loss, embedded in the file name
     :param val_loss: validation loss, embedded in the file name
     :param txt: if True, write text (header line ``<rows> <cols>`` followed by one
         ``<word> <values...>`` line per row) to a ``.txt.gz`` path via ``IO.writer``;
         otherwise pickle ``{'words': ..., 'vectors': ...}`` to a ``.pkl`` path
     :return: None
     """
     matrix = self.model.emb.weight
     vocab = self.exp.shared_vocab
     words = [vocab.id_to_piece(i) for i in range(len(vocab))]
     self.tbd.add_embedding(matrix, metadata=words, global_step=step)
     ext = 'txt.gz' if txt else 'pkl'
     path = self.exp.model_dir / f'embeddings_{step}_{train_loss:.6f}_{val_loss:.6f}.{ext}'
     log.info(f"writing  embedding after step {step} to {path}")
     if txt:
         with IO.writer(path) as w:
             w.write(f'{matrix.shape[0]} {matrix.shape[1]}\n')
             for i in range(matrix.shape[0]):
                 word = words[i]
                 vect = ' '.join(f'{x:g}' for x in matrix[i])
                 w.write(f'{word} {vect}\n')
     else:
         with path.open('wb') as f:
             # Store the actual array: the original pickled the bound method
             # `matrix.numpy` (no call). detach()/cpu() are needed because a
             # weight that requires grad or lives on a GPU cannot be converted directly.
             vectors = matrix.detach().cpu().numpy()
             data = {'words': words, 'vectors': vectors}
             pickle.dump(data, f)
예제 #7
0
    def train(self, steps: int, batch_size: int):
        """
        Iterates over combo batches, evaluating the loss on each, then stores the
        final per-model weights as YAML in ``self.w_file``.

        NOTE(review): no explicit backward/optimizer step is visible in this method;
        presumably ``self.loss_func`` takes care of it - confirm.

        :param steps: number of steps requested from ``get_combo_data``; also the
            progress bar total
        :param batch_size: batch size requested from ``get_combo_data``
        :return: None
        """
        log.info(f"Going to train for {steps}")
        batches = self.exp.get_combo_data(batch_size=batch_size, steps=steps)
        with tqdm(batches, total=steps, unit='step',
                  dynamic_ncols=True) as data_bar:
            for batch in data_bar:
                batch = batch.to(device)
                y_probs = self.combo(batch)  # B x T x V
                loss = self.loss_func(y_probs,
                                      y_seqs=batch.y_seqs,
                                      norm=batch.y_toks)
                # Show the current loss and raw combo weights on the progress bar.
                wt_str = ','.join(f'{wt:g}' for wt in self.combo.weight)
                data_bar.set_postfix_str(f'loss={loss:g}, weights={wt_str}',
                                         refresh=False)

        # Pair every model path (as a string) with its final weight.
        path_names = [str(model_path) for model_path in self.combo.model_paths]
        final_weights = self.combo.model_weights.tolist()
        weights = dict(zip(path_names, final_weights))
        log.info(f" Training finished. {weights}")
        with IO.writer(self.w_file) as wtr:
            yaml.dump({'weights': weights}, wtr, default_flow_style=False)
예제 #8
0
 def decode_eval_file(self,
                      decoder,
                      src: Union[Path, List[str]],
                      out_file: Path,
                      ref: Optional[Union[Path, List[str]]],
                      lowercase: bool = True,
                      **dec_args) -> Optional[float]:
     """
     Decodes ``src`` into ``out_file`` (unless a complete output already exists),
     detokenizes the output and, when a reference is given, scores it.

     :param decoder: object whose ``decode_file(src, out, **dec_args)`` produces the output
     :param src: source lines, or a path to read them from
     :param out_file: where hypotheses are written; decoding is skipped when this file
         exists, is non-empty and has as many lines as ``src``
     :param ref: reference lines or a path to them; falsy value disables scoring
     :param lowercase: forwarded to ``self.evaluate_file``
     :param dec_args: extra arguments forwarded to ``decoder.decode_file``
     :return: the result of ``self.evaluate_file`` when ``ref`` is truthy, else None
     """
     if out_file.exists() and out_file.stat().st_size > 0 and line_count(
             out_file) == (len(src)
                           if isinstance(src, list) else line_count(src)):
         # NOTE(review): on this path a Path `ref` is not read into a list and is
         # forwarded as-is to evaluate_file - confirm it accepts a Path.
         log.warning(
             f"{out_file} exists and has desired number of lines. Skipped..."
         )
     else:
         if isinstance(src, Path):
             log.info(f"decoding {src.name}")
             src = list(IO.get_lines(src))
         if isinstance(ref, Path):
             ref = list(IO.get_lines(ref))
         with IO.writer(out_file) as out:
             decoder.decode_file(src, out, **dec_args)
     detok_hyp = self.detokenize(out_file)
     if ref:
         return self.evaluate_file(detok_hyp, ref, lowercase=lowercase)
     # No reference: nothing to score. Made explicit to match the annotation.
     return None
예제 #9
0
 def write_lines(lines, path):
     """
     Stores each item of ``lines`` followed by a newline at ``path``.

     :param lines: iterable of strings, written in order
     :param path: destination, handed to ``IO.writer``
     """
     log.info(f"Storing data at {path}")
     with IO.writer(path) as out:
         emit = out.write
         for text in lines:
             # Text and terminator are written separately, as two calls.
             emit(text)
             emit('\n')
예제 #10
0
파일: exp.py 프로젝트: MGheini/rtg
 def _write_dict(dict, path: Path):
     """
     Writes a mapping as tab-separated ``key<TAB>value`` lines, in iteration order.

     :param dict: mapping to store (the name shadows the builtin but is kept for callers)
     :param path: destination, handed to ``IO.writer``
     """
     with IO.writer(path) as sink:
         for name, value in dict.items():
             row = f"{name}\t{value}\n"
             sink.write(row)