Example #1
 def validate(self):
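     # one validation round: report/reset training stats, evaluate on dev,
     # update the checkpoint record, and save the current/best models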
     conf = self.conf
     # report & reset training stat
     if self.tp.uidx > 0:
         train_result = self._run_train_report()  # first report training stat
         self.train_recorder.reset()  # reset training stat
     else:  # for validate_first
         train_result = None
     # dev
     ss, cur_cidx = self.current_name(), self.tp.cidx
     zlog("", func="plain")  # empty line
     with Timer(info=f"Valid {ss}", print_date=True), self.model.ema_wrap_dev():
         # no validation if specified
         if (self.tp.eidx < conf.valid_start_eidx) or (
                 self.tp.uidx < conf.valid_start_uidx):
             zlog("No validation since not the time yet!\n", func="plain")
             return
         # validate
         # simply use train result if there is no dev set
         if len(self.dev_runners) == 0:
             zlog(
                 "Use training results for dev since there are no dev set provided!",
                 func="warn")
             dev_result = train_result
         else:
             dev_result = self._run_validate(self.dev_runners)
         # record
         cur_no_bad = (self.tp.eidx < conf.bad_start_eidx) or (
             self.tp.uidx < conf.bad_start_uidx)
         cur_record_best = (self.tp.cidx >= conf.record_best_cidx)
         if_overall_best, if_best, if_anneal = self.tp.update_checkpoint(
             train_result, dev_result, cur_no_bad, cur_record_best,
             conf.anneal_patience)
         # save curr & best
         self.save(conf.model_prefix + conf.model_suffix_curr)
         if if_overall_best:
             zlog("Curr is overall best " +
                  str(self.tp.info_overall_best()),
                  func="result")
         else:
             zlog("Curr not overall best, the overall best is " +
                  str(self.tp.info_overall_best()),
                  func="result")
         if if_best:
             self.save(conf.model_prefix + conf.model_suffix_best)
             zlog("Curr is best: " + str(self.tp.info_best()),
                  func="result")
         else:
             zlog("Curr not best, the best is " + str(self.tp.info_best()),
                  func="result")
         if cur_cidx >= conf.save_start_cidx and cur_cidx % conf.save_cfreq == 0:
             self.save(conf.model_prefix + ss)  # special save
         if if_anneal and conf.anneal_restore:
             zlog("Restore from previous best model!!", func="plain")
             self.load(conf.model_prefix + conf.model_suffix_best, False)
     zlog("", func="plain")  # empty line
Example #2
 def run(self):
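     # run the whole test stream batch by batch, timing and recording each step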
     rec = self.test_recorder
     with Timer(info="Run-test", print_date=True):
         for insts in self.test_stream:
             # results are stored in insts themselves
             with rec.go():
                 res = self._run_batch(insts)
             rec.record(res)
         res = self._run_end()
     return res
Example #3
 def do_dev(self):
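     # dev-time routine: report training stats, evaluate the dev datasets (or fall back
     # to the training results), update the checkpoint record, and save current/best models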
     conf = self.conf
     # report & reset training stat
     if self.tp.uidx > 0:
         train_result = self.run_train_report()  # first report training stat
         self.train_recorder.reset()  # reset training stat
     else:  # for validate_first
         train_result = ResultRecord.get_nil()
     # dev
     ss, cur_cidx = self.current_name(), self.tp.cidx
     zlog("", func="plain")  # empty line
     with Timer(info=f"Valid {ss}", print_date=True):
         # no validation if specified
         if self.tp.uidx < conf.valid_start_uidx:
             zlog("No validation since not the time yet!\n", func="plain")
             return
         # validate
         # use training result if there is no dev set
         if len(self.d_center.get_datasets(wset="dev")) == 0:
             zlog(
                 "Use training results for dev since there are no dev set provided!",
                 func="warn")
             dev_result = train_result
         else:
             dev_result = self.do_test("dev")
         # record
         cur_record_best = (self.tp.cidx >= conf.record_best_start_cidx)
         if_overall_best, if_best, if_anneal = self.tp.update_checkpoint(
             train_result, dev_result, record_best=cur_record_best)
         # save curr & best
         self.save(conf.model_save_prefix + conf.model_save_suffix_curr)
         if if_overall_best:
             zlog("Curr is overall best " +
                  str(self.tp.info_overall_best()),
                  func="result")
         else:
             zlog("Curr not overall best, the overall best is " +
                  str(self.tp.info_overall_best()),
                  func="result")
         if if_best:
             self.save(conf.model_save_prefix + conf.model_save_suffix_best)
             zlog("Curr is best: " + str(self.tp.info_best()),
                  func="result")
         else:
             zlog("Curr not best, the best is " + str(self.tp.info_best()),
                  func="result")
         if cur_cidx >= conf.save_special_start_cidx and cur_cidx % conf.save_special_cfreq == 0:
             self.save(conf.model_save_prefix + ss)  # special save
     # --
     zlog("", func="plain")  # empty line
Example #4
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # --
    if conf.tconf.test_do_oracle_batching:
        # note: special mode!!
        zlog("First decode to get oracle!")
        all_insts = []  # simply decode and get them
        grouped_insts = defaultdict(list)
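        # group the decoded event frames by their oracle exit-layer index ("exit_lidx")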
        for insts in test_iter:
            model.predict_on_batch(insts)
            all_insts.extend(insts)
            for inst in insts:
                for frame in inst.events:
                    _key = frame.info["exit_lidx"]
                    grouped_insts[_key].append(frame)
        group_info = {
            k: len(grouped_insts[k])
            for k in sorted(grouped_insts.keys())
        }
        zlog(f"group: {group_info}")
        # then feed them within groups
        rr = ZmtlTestingRunner(model,
                               None,
                               conf,
                               dconf.output,
                               dconf.test,
                               do_score=dconf.test_do_score)
        rec = rr.test_recorder
        with Timer(info="Run-test", print_date=True):
            tconf = conf.tconf
            tconf.test_count_mode = "tok"  # note: here we already have the frames!!
            for frames in grouped_insts.values():
                stream, _ = batch_stream(IterStreamer(frames), tconf, False)
                for binsts in stream:
                    with rec.go():
                        res0 = rr._run_batch(binsts)
                    rec.record(res0)
            rr.all_insts = all_insts  # replace by sents!!
            res = rr._run_end()
    else:
        # go
        rr = ZmtlTestingRunner(model,
                               test_iter,
                               conf,
                               dconf.output,
                               dconf.test,
                               do_score=dconf.test_do_score)
        res = rr.run()
    zlog(f"zzzfinal: {res}")
    zlog("The end of testing.")
Example #5
 def do_test(self, wset="test"):
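     # evaluate every dataset of the requested split and aggregate the (weighted) results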
     model, t_center, d_center = self.model, self.t_center, self.d_center
     conf = self.conf
     # --
     to_test_datasets = d_center.get_datasets(wset=wset)
     t_center.prepare_datasets(to_test_datasets)  # re-prepare!!
     aggr = ResultAggregator()
     for one_ii, one_dataset in enumerate(to_test_datasets):
         with Timer(info=f"Test({one_ii+1}/{len(to_test_datasets)}): {one_dataset}",
                    print_date=True):
             one_res = self.run_test_dataset(one_dataset)
             aggr.add(one_dataset.name, one_res,
                      one_dataset.conf.group_eval_weight)
     ret = aggr.get_res()
     return ret
Example #6
# --
def main(args):
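    # build the vocab package from the train/dev/test streams and save it to dict_dir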
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # data
    from .train import prepare_train_data
    train_streamers, dev_streamers, test_streamer, _ = prepare_train_data(
        dconf)
    extra_streamers = (dev_streamers if test_streamer is None
                       else dev_streamers + [test_streamer])
    # vocab
    vpack = ZmtlVocabPackage.build_from_stream(
        dconf, MultiCatStreamer(train_streamers),
        MultiCatStreamer(extra_streamers))
    vpack.save(dconf.dict_dir)
    zlog("The end of Building.")


if __name__ == '__main__':
    import sys
    with Timer(info=f"Building", print_date=True) as et:
        main(sys.argv[1:])

# example: for building vocab and filtering embeds
"""
# filter for pb
PYTHONPATH=../src/ python3 -m msp2.tasks.zmtl.main.build train:../pb/conll05/train.conll.ud.json dev:../pb/conll05/dev.conll.ud.json,../pb/conll05/test.wsj.conll.ud.json,../pb/conll05/test.brown.conll.ud.json dict_dir:./ pretrain_hits_outf:hits_conll05.vec pretrain_wv_file:wiki-news-300d-1M-subword.vec |& tee _log.voc_conll05
"""
Example #7
        ms_budgets = [ScheduledValue(f"ms_budget{i}", c) for i,c in enumerate(dconf.get_ms_train_budgets())]
        joined_train_streamer = MultiJoinStreamer(prepared_train_streamers, dconf.ms_stop_idx, ratios=ms_budgets)
    else:
        ms_budgets = []
        joined_train_streamer = prepared_train_streamers[0]
    train_iter, train_batch_f = batch_stream(joined_train_streamer, tconf, True)
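    # build one batched (indexed) dev iterator per dev stream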
    dev_iters = [batch_stream(index_stream(
        z, vpack, tconf.dev_use_cache, 0, test_inst_preparer), tconf, False)[0] for z in dev_streamers]
    # training runner
    tr = ZmtlTrainingRunner.create(model, train_iter, train_batch_f, conf, dev_iters,
                                   [dconf.output+f".dev{i}" for i in range(len(dev_golds))], dev_golds)
    for mv in ms_budgets:  # add them for scheduling!
        tr.add_scheduled_value(mv)
    # load?
    if tconf.load_model:
        # tr.load(dconf.model_load_name, tconf.load_process, load_strict=dconf.model_load_strict)
        tr.load(dconf.model_load_name, tconf.load_process)
    # go
    tr.run()
    zlog("The end of Training.")

if __name__ == '__main__':
    import sys
    with Timer(info=f"Training", print_date=True) as et:
        main(sys.argv[1:])

# --
"""
CUDA_VISIBLE_DEVICES= PYTHONPATH=../src:../../src/:../../../src python3 -m msp2.tasks.zmtl.main.train _conf
"""
Example #8
            new_vocab = aug_words_and_embs(_embedder,
                                           vpack.get_voc("word"),
                                           extra_vocab,
                                           extra_embedding,
                                           aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
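    # build the batched test iterator over the indexed test stream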
    test_iter, _ = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        tconf, False)
    return conf, model, vpack, test_iter


# -----
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # go
    rr = ZsfpTestingRunner(model, test_iter, conf, dconf.output, dconf.test)
    res = rr.run()
    zlog(f"zzzfinal: {res}")
    zlog("The end of testing.")


if __name__ == '__main__':
    import sys
    with Timer(info=f"Testing", print_date=True) as et:
        main(sys.argv[1:])
Example #9
                                1000)  # 1000 sents at a time
    test_dataset = ZDataset(d_center.conf.testM,
                            'testM',
                            'decode',
                            _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, BATCH_LINE):
        insts = [Sent.create(one.split())
                 for one in lines]  # note: simply split each line into tokens as one sentence
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            one_res = model.predict_on_batch(ibatch)
        # --
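        # write the decoded instances to stdout as json lines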
        for inst in insts:
            sys.stdout.write(
                json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")


# --
# MDIR=??
# PYTHONPATH=../src/ CUDA_VISIBLE_DEVICES=0 python3 -m msp2.tasks.zmtl2.main.decode ${MDIR}/_conf model_load_name:${MDIR}/zmodel.best.m vocab_load_dir:${MDIR}/ log_stderr:1 testM.group_tasks:??
if __name__ == '__main__':
    with Timer(info=f"Decoding", print_date=True) as et:
        main(sys.argv[1:])
Example #10
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    enc = t_center.tasks['enc']
    # data
    d_center = DataCenter(conf.dconf)
    for dataset in d_center.get_datasets():
        enc.prepare_dataset(dataset)
        vv = SimpleVocab.build_by_static([])
        vv2 = SimpleVocab.build_by_static([])
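        # vv collects per-item subword lengths, vv2 collects word lengths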
        for item in dataset.items:
            vv.feed_one(item._batch_len)
            vv2.feed_one(sum(len(z) for z in item.sents) + 1)
        vv.build_sort(lambda w, i, c: w)
        vv2.build_sort(lambda w, i, c: w)
        zlog(
            f"#== For {dataset} (subword):\n{vv.get_info_table().to_string()}")
        zlog(f"#== For {dataset} (word):\n{vv2.get_info_table().to_string()}")
    # --
    zlog("The end of Building.")


if __name__ == '__main__':
    import sys
    with Timer(info=f"CheckLength", print_date=True) as et:
        main(sys.argv[1:])

# --
# python3 -m msp2.tasks.zmtl2.main.check_length train0.input_dir:ud train0.input_format:conllu train0.group_files:_ud14/en2 train0.approx_prev_next:1 train0.left_extend_nsent:1 train0.right_extend_nsent:1