Example #1
def main(args):
    conf: MainConf = init_everything(MainConf(), args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.input_table_file:
        with zopen(conf.input_table_file) as fd:
            s = fd.read()
            table = eval(s)
            mm = table[conf.name]  # read the table from file
    else:
        mm = globals()[conf.name]
    # --
    workers = Worker.get_gpu_workers([int(z) for z in conf.gpus],
                                     ncore=int(conf.ncore))
    # --
    x = TuneDriver(conf.tune_conf, workers, conf.name, extra_info=mm)
    task_iter = [
        MyTask(gid, conf.name, sel_idxes,
               [a[i] for a, i in zip(mm, sel_idxes)], conf.task_conf)
        for gid, sel_idxes in enumerate(
            iter_arg_choices(mm,
                             repeat=conf.repeat,
                             shuffle=conf.shuffle,
                             max_num=conf.max_count))
    ]
    x.main(task_iter)
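The grid search above is driven by iter_arg_choices, where mm is a list of option lists and each yielded sel_idxes picks one index per option list (see the zip(mm, sel_idxes) lookup). A minimal sketch of such an iterator, inferred only from this usage and not the actual msp2 implementation:

import itertools
import random

def iter_arg_choices_sketch(mm, repeat=True, shuffle=True, max_num=-1):
    # the full grid: one index into each option list of mm
    grid = list(itertools.product(*[range(len(a)) for a in mm]))
    if repeat:  # sample with replacement for max_num draws
        n = max_num if max_num > 0 else len(grid)
        for _ in range(n):
            yield list(random.choice(grid))
    else:  # enumerate each combination once, optionally shuffled/truncated
        if shuffle:
            random.shuffle(grid)
        if max_num > 0:
            grid = grid[:max_num]
        for sel_idxes in grid:
            yield list(sel_idxes)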
Example #2
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=["test"])
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    res = r_center.do_test()
    zlog(f"zzzztestfinal: {res}")
    # --
    zlog("The end of Testing.")
Example #3
def main(*args):
    conf: TrainConf = init_everything(TrainConf(), args)
    # --
    reader = conf.train.get_reader()
    inputs = yield_sents(reader)
    extractor = RuleTargetExtractor.train(inputs, conf.econf)
    # save
    zlog(f"Save extractor to {conf.save_name}")
    default_json_serializer.to_file(extractor.to_json(), conf.save_name)
Example #4
def main(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # data
    from .train import prepare_train_data
    train_streamers, dev_streamers, test_streamer, _ = prepare_train_data(dconf)
    extra_streamers = dev_streamers if test_streamer is None else dev_streamers + [test_streamer]
    # vocab
    vpack = ZsfpVocabPackage.build_from_stream(dconf, MultiCatStreamer(train_streamers), MultiCatStreamer(extra_streamers))
    vpack.save(dconf.dict_dir)
    zlog("The end of Building.")
Example #5
def main(analyzer: str, *args):
    conf = MainConf()
    a_res = Analyzer.try_load_and_lookup(analyzer)
    one_conf, one_type = a_res.conf, a_res.T
    conf.aconf = one_conf()
    # --
    conf = init_everything(conf, args)
    zlog(f"Ready to analyze with {analyzer}.")
    # --
    ana: Analyzer = one_type(conf.aconf)
    ana.main()
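Analyzer.try_load_and_lookup resolves a name to a registered class together with its conf type (the returned record exposes .conf and .T). A tiny registry sketch that matches only this usage, not the real msp2 machinery:

from collections import namedtuple

RegEntry = namedtuple('RegEntry', ['conf', 'T'])
_REG = {}  # name -> RegEntry(conf_class, impl_class)

def register(name, conf_class, impl_class):
    _REG[name] = RegEntry(conf_class, impl_class)

def try_load_and_lookup_sketch(name):
    return _REG[name]  # .conf builds the conf, .T builds the analyzer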
Example #6
def main(*args):
    conf: MainConf2 = init_everything(MainConf2(), args)
    # --
    # first read aux ones
    aux_insts = list(conf.aux.get_reader())
    aux_index = MyIndexer2(conf)
    num_aux_sent = 0
    for sent in yield_sents(aux_insts):
        num_aux_sent += 1
        fix_words(sent)
        aux_index.put(sent)
    zlog(
        f"Read from {conf.aux.input_path}: insts={len(aux_insts)}, sents={num_aux_sent}, len(index)={len(aux_index)}"
    )
    # --
    # then read input
    input_insts = list(conf.input.get_reader())
    output_sents = []
    num_input_sent = 0
    num_reset_sent = 0  # note: never incremented in this variant; the log below always reports 0
    num_hit_sent = 0
    cc_status = Counter()
    for sent in yield_sents(input_insts):
        num_input_sent += 1
        fix_words(sent)
        # --
        trg_sent, trg_status = aux_index.query(sent)
        cc_status[trg_status] += 1
        if trg_sent is not None:
            num_hit_sent += 1
            # note: currently we replace upos & tree_dep
            upos_vals, head_vals, deplab_vals = \
                trg_sent.seq_upos.vals, trg_sent.tree_dep.seq_head.vals, trg_sent.tree_dep.seq_label.vals
            sent.build_uposes(upos_vals)
            sent.build_dep_tree(head_vals, deplab_vals)
            output_sents.append(sent)
        else:
            zlog(f"Miss sent: {sent.seq_word}")
            if not conf.output_sent_and_discard_nonhit:
                output_sents.append(sent)
    # --
    zlog(
        f"Read from {conf.input.input_path}: insts={len(input_insts)}, sents={num_input_sent}, (out-sent-{len(output_sents)}) "
        f"reset={num_reset_sent}({num_reset_sent/num_input_sent:.4f}) hit={num_hit_sent}({num_hit_sent/num_input_sent:.4f})"
    )
    zlog(f"Query status: {cc_status}")
    # write
    with conf.output.get_writer() as writer:
        if conf.output_sent_and_discard_nonhit:
            writer.write_insts(output_sents)
        else:  # write the original insts
            writer.write_insts(input_insts)
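MyIndexer2 is used only through put/query/len, with query returning a (matched_sent, status) pair. A dict-based stand-in keyed by the exact word sequence (hypothetical; the real indexer presumably matches more loosely, which is why a status label exists):

class SimpleSentIndexer:
    def __init__(self):
        self._m = {}

    def __len__(self):
        return len(self._m)

    def put(self, sent):
        self._m[tuple(sent.seq_word.vals)] = sent

    def query(self, sent):
        hit = self._m.get(tuple(sent.seq_word.vals))
        return hit, ('hit' if hit is not None else 'miss')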
Example #7
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf)
    # build vocab: no try loading here!!
    t_center.build_vocabs(d_center)
    # save vocab
    t_center.save_vocabs(t_center.conf.vocab_save_dir)
    # --
    zlog("The end of Building.")
Example #8
def main(annotators: str, *args):
    # find annotators
    conf = MainConf()
    all_confs, all_ann_types = [], []
    for ann_ss in annotators.split(","):
        if len(ann_ss) == 0:
            continue  # ignore empty
        t_res, ann_ss_real = Annotator.try_load_and_lookup(ann_ss,
                                                           ret_name=True)
        one_conf_type, one_ann_type = t_res.conf, t_res.T
        one_conf = one_conf_type()
        all_confs.append(one_conf)
        all_ann_types.append(one_ann_type)
        assert not hasattr(conf, ann_ss_real)
        setattr(conf, ann_ss_real, one_conf)  # add conf
    # --
    conf = init_everything(conf, args)
    zlog(f"Ready to annotate with {annotators}: {conf.R} {conf.W}")
    # init all annotators
    all_anns: List[Annotator] = [
        at(cc) for cc, at in zip(all_confs, all_ann_types)
    ]
    # --
    reader, writer = conf.R.get_reader(), conf.W.get_writer()

    # =====
    def _process(_batch: List):
        for ann in all_anns:
            ann.annotate(_batch)
        writer.write_insts(_batch)

    # =====
    with BK.no_grad_env():  # note: decoding mode!!
        c, c2 = 0, 0
        cur_insts = []
        for one in reader:
            # input one
            c += 1
            cur_insts.append(one)
            # process?
            if len(cur_insts) >= conf.ann_batch_size:
                _process(cur_insts)
                cur_insts.clear()
                c2 += 1
                if c2 % conf.report_batch_interval == 0:
                    zlog(f"Annotate roughly: inst={c},batch={c2}")
        # remaining ones
        if len(cur_insts) > 0:
            _process(cur_insts)
    zlog(f"Annotate Finish: processed {c} insts.")
Example #9
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf,
                          specified_wset=[])  # nothing to load here!
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    cc = Counter()
    BATCH_LINE = int(os.environ.get('ZMSP_BATCH_LINE', 1000))  # batch this many sents at a time
    test_dataset = ZDataset(d_center.conf.testM,
                            'testM',
                            'decode',
                            _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, BATCH_LINE):
        insts = [Sent.create(one.split())
                 for one in lines]  # note: simply split as sentence!!
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            one_res = model.predict_on_batch(ibatch)
        # --
        for inst in insts:
            sys.stdout.write(
                json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")
Example #10
def prepare_test(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # vocab
    vpack = ZmtlVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = dconf.R.get_reader(input_path=dconf.test)
    # model
    model = build_model(conf, vpack=vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name, strict=dconf.model_load_strict)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_wv_files
    model_emb = model.get_emb()
    if model_emb is not None:
        _embedder = model_emb.eg.get_embedder("word")
        if len(extra_embed_files) > 0 and _embedder is not None:  # has extra_emb and need_emb
            # get embeddings
            extra_embedding = WordVectors.load(extra_embed_files[0])
            extra_embedding.merge_others([
                WordVectors.load(one_file)
                for one_file in extra_embed_files[1:]
            ])
            # get extra dictionary (only those words hit in extra-embed)
            extra_vocab = SimpleVocab.build_by_static(
                get_extra_hit_words(test_streamer, extra_embedding, vpack.get_voc("word")),
                pre_list=None, post_list=None)
            # give them to the model
            new_vocab = aug_words_and_embs(_embedder,
                                           vpack.get_voc("word"),
                                           extra_vocab,
                                           extra_embedding,
                                           aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter, _ = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        tconf, False)
    return conf, model, vpack, test_iter
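Judging from its arguments, get_extra_hit_words plausibly collects words that occur in the test stream and are covered by the extra embeddings but missing from the current vocab. A sketch under exactly those assumptions; the membership tests on the embedding and vocab objects are stand-ins, not the real msp2 API:

def get_extra_hit_words_sketch(stream, extra_wv, voc):
    hits, seen = [], set()
    for sent in yield_sents(stream):
        for w in sent.seq_word.vals:
            if w not in seen:
                seen.add(w)
                if (w in extra_wv) and (w not in voc):  # hit in extra emb, absent from vocab
                    hits.append(w)
    return hits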
Example #11
def main(*args):
    conf: MainConf = init_everything(MainConf(), args)
    # --
    # first read them all
    src_sents, trg_sents = list(yield_sents(conf.src_input.get_reader())), \
                           list(yield_sents(conf.trg_input.get_reader()))
    assert len(src_sents) == len(trg_sents)
    cc = Counter()
    conv = Converter(conf)
    # --
    outputs = []
    for src_sent, trg_sent in zip(src_sents, trg_sents):
        res = conv.convert(src_sent, trg_sent, cc)
        outputs.append(res)
    zlog("Stat:")
    OtherHelper.printd(cc)
    # --
    with conf.output.get_writer() as writer:
        writer.write_insts(outputs)
Example #12
def main(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # data
    train_streamers, dev_streamers, test_streamer, dev_golds = prepare_train_data(dconf)
    # vocab
    if tconf.no_build_dict:  # read
        vpack = ZmtlVocabPackage.build_by_reading(dconf)
    else:
        # include dev/test only for convenience of including words hit in pre-trained embeddings
        extra_streamers = dev_streamers if test_streamer is None else dev_streamers + [test_streamer]
        vpack = ZmtlVocabPackage.build_from_stream(dconf, MultiCatStreamer(train_streamers), MultiCatStreamer(extra_streamers))
        vpack.save(dconf.dict_dir)
    # model
    model = build_model(conf, vpack=vpack)
    train_inst_preparer = model.get_inst_preper(True)
    test_inst_preparer = model.get_inst_preper(False)
    # actual streams
    prepared_train_streamers = [
        train_prep_stream(index_stream(z, vpack, tconf.train_use_cache, tconf.cache_shuffle_times, train_inst_preparer), tconf)
        for z in train_streamers]
    if len(prepared_train_streamers) > 1:  # ms_train
        ms_budgets = [ScheduledValue(f"ms_budget{i}", c) for i,c in enumerate(dconf.get_ms_train_budgets())]
        joined_train_streamer = MultiJoinStreamer(prepared_train_streamers, dconf.ms_stop_idx, ratios=ms_budgets)
    else:
        ms_budgets = []
        joined_train_streamer = prepared_train_streamers[0]
    train_iter, train_batch_f = batch_stream(joined_train_streamer, tconf, True)
    dev_iters = [batch_stream(index_stream(
        z, vpack, tconf.dev_use_cache, 0, test_inst_preparer), tconf, False)[0] for z in dev_streamers]
    # training runner
    tr = ZmtlTrainingRunner.create(model, train_iter, train_batch_f, conf, dev_iters,
                                   [dconf.output+f".dev{i}" for i in range(len(dev_golds))], dev_golds)
    for mv in ms_budgets:  # add them for scheduling!
        tr.add_scheduled_value(mv)
    # load?
    if tconf.load_model:
        # tr.load(dconf.model_load_name, tconf.load_process, load_strict=dconf.model_load_strict)
        tr.load(dconf.model_load_name, tconf.load_process)
    # go
    tr.run()
    zlog("The end of Training.")
Example #13
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    enc = t_center.tasks['enc']
    # data
    d_center = DataCenter(conf.dconf)
    for dataset in d_center.get_datasets():
        enc.prepare_dataset(dataset)
        vv = SimpleVocab.build_by_static([])
        vv2 = SimpleVocab.build_by_static([])
        for item in dataset.items:
            vv.feed_one(item._batch_len)
            vv2.feed_one(sum(len(z) for z in item.sents) + 1)
        vv.build_sort(lambda w, i, c: w)
        vv2.build_sort(lambda w, i, c: w)
        zlog(
            f"#== For {dataset} (subword):\n{vv.get_info_table().to_string()}")
        zlog(f"#== For {dataset} (word):\n{vv2.get_info_table().to_string()}")
    # --
    zlog("The end of Building.")
Example #14
def main(evaluator: str, *args):
    # find evaluator
    conf = MainConf()
    e_res = Evaluator.try_load_and_lookup(evaluator)
    one_conf, one_type = e_res.conf, e_res.T
    conf.econf = one_conf()
    # --
    conf = init_everything(conf, args)
    zlog(f"Ready to evaluate with {evaluator}: {conf.gold} {conf.pred}")
    # --
    gold_insts = list(conf.gold.get_reader())
    pred_insts = list(conf.pred.get_reader())
    evaler: Evaluator = one_type(conf.econf)
    res = evaler.eval(gold_insts, pred_insts)
    if conf.result_file:
        with zopen(conf.result_file, 'a') as fd:  # note: here we use append mode
            fd.write(
                f"# Eval with {args}:\n{res.get_brief_str()}\n{res.get_detailed_str()}\n"
            )
    zlog(f"Eval on {conf.gold} vs. {conf.pred}; RESULT = {res}")
    if conf.print_details:
        zlog(f"#-- details:\n{res.get_detailed_str()}")
Example #15
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf)
    # build/load vocab: try loading here, and save new built ones!
    _tcf = t_center.conf
    t_center.build_vocabs(d_center, try_load_vdir=(None if _tcf.vocab_force_rebuild else _tcf.vocab_load_dir),
                          save_vdir=_tcf.vocab_save_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.train_preload_model:
        r_center.load(conf.rconf.train_preload_model)
    model.finish_sr()  # note: build sr after possible loading in training!!
    r_center.do_train()
    # --
    zlog("The end of Training.")
Example #16
def main(*args):
    conf: MainConf = init_everything(MainConf(), args)
    # --
    word_map_f = {
        "PTB": lambda x: re.sub("-LRB-", "(", re.sub("-RRB-", ")", x)),
    }.get(conf.word_map, lambda x: x)
    convert_f = None if conf.convert_f == "" else globals()[conf.convert_f]
    # --
    # first read aux ones
    aux_insts = list(conf.aux.get_reader())
    aux_index = MyIndexer(conf)
    num_aux_sent = 0
    for sent in yield_sents(aux_insts):
        num_aux_sent += 1
        aux_index.put(sent)
    zlog(
        f"Read from {conf.aux.input_path}: insts={len(aux_insts)}, sents={num_aux_sent}, len(index)={len(aux_index)}"
    )
    # then read input
    input_insts = list(conf.input.get_reader())
    output_sents = []
    num_input_sent = 0
    num_rebuild_sent = 0
    num_reset_sent = 0
    num_hit_sent = 0
    for sent in yield_sents(input_insts):
        num_input_sent += 1
        # --
        new_word_vals = [word_map_f(w) for w in sent.seq_word.vals]
        if new_word_vals != sent.seq_word.vals:
            num_rebuild_sent += 1
            sent.build_words(new_word_vals)
        # --
        trg_sent = aux_index.query(sent)
        # -- debug
        # if trg_sent is None:
        #     breakpoint()
        #     trg_sent = aux_index.query(sent)
        # --
        if trg_sent is not None:
            num_hit_sent += 1
            # --
            # note: currently we replace upos & tree_dep
            upos_vals, head_vals, deplab_vals = \
                trg_sent.seq_upos.vals, trg_sent.tree_dep.seq_head.vals, trg_sent.tree_dep.seq_label.vals
            if convert_f is not None:
                upos_vals, head_vals, deplab_vals = convert_f(
                    upos_vals, head_vals, deplab_vals)
            # --
            if conf.change_words and sent.seq_word.vals != trg_sent.seq_word.vals:
                num_reset_sent += 1
                sent.seq_word.set_vals(trg_sent.seq_word.vals)  # reset it!
            # --
            sent.build_uposes(upos_vals)
            sent.build_dep_tree(head_vals, deplab_vals)
            # --
            output_sents.append(sent)
        else:
            zlog(f"Miss sent: {sent.seq_word}")
            if not conf.output_sent_and_discard_nonhit:
                output_sents.append(sent)
    zlog(
        f"Read from {conf.input.input_path}: insts={len(input_insts)}, sents={num_input_sent}, (out-sent-{len(output_sents)}) "
        f"rebuild={num_rebuild_sent}({num_rebuild_sent/num_input_sent:.4f}), "
        f"reset={num_reset_sent}({num_reset_sent/num_input_sent:.4f}) hit={num_hit_sent}({num_hit_sent/num_input_sent:.4f})"
    )
    # write
    with conf.output.get_writer() as writer:
        if conf.output_sent_and_discard_nonhit:
            writer.write_insts(output_sents)
        else:  # write the original insts
            writer.write_insts(input_insts)
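When conf.convert_f is set, the named function is looked up in globals() and must map (upos_vals, head_vals, deplab_vals) to a same-shaped triple. A purely illustrative example that strips UD dependency-label subtypes:

def strip_deplab_subtypes(upos_vals, head_vals, deplab_vals):
    # 'nsubj:pass' -> 'nsubj'; upos and heads pass through unchanged
    return upos_vals, head_vals, [lab.split(':')[0] for lab in deplab_vals]

It would then be selected on the command line with convert_f:strip_deplab_subtypes.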
Example #17
def main(*args):
    conf: MainConf = init_everything(MainConf(), args)
    # --
    cons_lex = LexConstrainer(conf.lex_conf)
    cons_fe = FEConstrainer(conf.fe_conf)
    # --
    # some confs
    lex_use_fn_style = conf.lex_conf.use_fn_style
    # --
    # first try to read frame file
    if conf.frame_file:
        assert lex_use_fn_style, "Otherwise do not provide 'frame_file'!!"
        external_frames = default_json_serializer.from_file(conf.frame_file)
        for fname, fv in external_frames.items():
            # LU
            for lu in fv["lexUnit"]:
                lu_name = lu["name"]
                cons_lex.add(cons_lex.lu2feat(lu_name), fname,
                             c=0)  # no count now, only add entry
                lu_name2 = LexConstrainer.norm_lu(lu_name)
                if lu_name2 != lu_name:  # also add normed name!
                    cons_lex.add(cons_lex.lu2feat(lu_name2), fname, c=0)
            # FE
            for fe in fv["FE"]:
                fe_name = fe["name"]
                cons_fe.add(fname, fe_name, c=0)  # again no count here!
        zlog(
            f"Read from {conf.frame_file}: LU={cons_lex.summary()}, FE={cons_fe.summary()}"
        )
    # --
    # then read data!
    if conf.train.input_path:
        reader = conf.train.get_reader()
        for sent in yield_sents(reader):
            for frame in sent.get_frames(conf.lex_conf.cons_ftag):
                frame_name = frame.type
                # LU
                feats = []
                if lex_use_fn_style:  # then directly use the stored one!!
                    lu_name = frame.info.get("luName")
                    feats.append(cons_lex.lu2feat(lu_name))
                    lu_name2 = LexConstrainer.norm_lu(lu_name)
                    if lu_name2 != lu_name:
                        feats.append(cons_lex.lu2feat(lu_name2))
                # also add the plain one!!
                widx, wlen = frame.mention.get_span()
                feat = cons_lex.span2feat(frame.sent, widx, wlen)
                feats.append(feat)
                # --
                for feat in feats:
                    cons_lex.add(feat, frame_name, c=1)
                # FE
                for alink in frame.args:
                    cons_fe.add(frame_name, alink.role, c=1)
        zlog(
            f"Read from {conf.train.input_path}: LU={cons_lex.summary()}, FE={cons_fe.summary()}"
        )
    # --
    # summary and save
    cons_lex.save(conf.lex_save_name)
    cons_fe.save(conf.fe_save_name)
Example #18
def main(tconf: MyTaskConf, args):
    conf = MainConf()
    conf.task_conf = tconf
    conf: MainConf = init_everything(conf, args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.tune_table_file:
        with zopen(conf.tune_table_file) as fd:
            s = fd.read()
            table = eval(s)
            mm = table.get(conf.tune_name)  # read the table from file
    else:
        mm = globals().get(conf.tune_name)
    # --
    if mm is not None:  # if we can read it!
        # note: if tuning, we want it to be quiet
        conf.task_conf.quite = True
        # --
        workers = te.Worker.get_gpu_workers([int(z) for z in conf.gpus],
                                            ncore=int(conf.ncore))
        x = te.TuneDriver(conf.tune_conf,
                          workers,
                          conf.tune_name,
                          extra_info=mm)
        task_iter = []
        # --
        all_runs = enumerate(
            te.iter_arg_choices(mm,
                                repeat=conf.repeat,
                                shuffle=conf.shuffle,
                                max_num=conf.max_count))
        if not conf.repeat:
            all_runs = list(all_runs)
            orig_all_runs = list(all_runs)
        else:
            orig_all_runs = None
        if len(conf.task_sels) > 0:
            assert not conf.repeat and not conf.shuffle
            _sels = [int(z) for z in conf.task_sels]
            all_runs = list(all_runs)
            zlog(f"Select {len(_sels)}/{len(all_runs)}: {_sels}")
            all_runs = [all_runs[z] for z in _sels]
        # --
        if not conf.repeat:
            _max_gid = 0 if len(orig_all_runs) == 0 else max(
                z[0] for z in orig_all_runs)
            _padn = len(str(_max_gid))
            _pads = f"%0{_padn}d"
        else:
            _pads = "%d"
        # --
        for gid, sel_idxes in all_runs:
            # note: override run_dir!!
            s_gid = _pads % gid
            one = conf.task_conf.make_task(
                run_dir=f"run_{conf.tune_name}_{s_gid}",
                _id_str=f"{s_gid}:{sel_idxes}",
                _train_extras=" ".join([a[i] for a, i in zip(mm, sel_idxes)]))
            task_iter.append(one)
        x.main(task_iter)
    else:  # otherwise single run
        assert not conf.tune_name
        task = conf.task_conf.make_task()  # no setting here!!
        task.execute(
            f"CUDA_VISIBLE_DEVICES={','.join(conf.gpus)} OMP_NUM_THREADS={conf.ncore} MKL_NUM_THREADS={conf.ncore}"
        )
Example #19
def main(*args):
    conf: RunConf = init_everything(RunConf(),
                                    args,
                                    add_utils=False,
                                    add_nn=False)
    # =====
    # get paths
    RUN_DIR = conf.run_dir
    if RUN_DIR:
        mkdir_p(RUN_DIR, raise_error=True)
        os.chdir(RUN_DIR)  # change to it!!
    SRC_DIR = zglob1(conf.src_dir, check_prefix="..", check_iter=10)
    VOC_DIR = zglob1(conf.voc_dir, check_prefix="..", check_iter=10)
    DATA_DIR = zglob1(conf.dataset.data_dir, check_prefix="..", check_iter=10)
    zlog(
        f"RUN with RUN={RUN_DIR}, SRC={SRC_DIR}, VOC={VOC_DIR}, DATA={DATA_DIR}"
    )
    # =====
    # modes
    dataset_choice = conf.dataset._choice
    is_pb, is_fn = [dataset_choice.startswith(z) for z in ["pb", "fn"]]
    assert is_pb or is_fn
    # =====
    # options
    # --
    # base ones
    base_opt = "conf_output:_conf"
    # eval
    if is_pb:
        base_opt += " eval_conf:pb"
    elif is_fn:
        base_opt += f" dict_frame_file:{DATA_DIR}/{conf.dataset.frame_file}"
        base_opt += f" eval_conf:fn eval_conf.frame_file:{DATA_DIR}/{conf.dataset.frame_file}"  # eval
    # --
    # =====
    # modeling
    if conf.use_word_input:
        base_opt += " ec_word.dim:300 ec_word.drop_rate:0.2 ec_word.init_from_pretrain:1 ec_word.rare_unk_thr:2"  # word
    # base_opt += " ec_posi.dim:512"  # posi?
    # base_opt += " ec_char.dim:50 ec_char.init_scale:5."  # char?
    if conf.use_bert_input:
        base_opt += " ec_bert.dim:768 bert_model:bert-base-cased bert_output_layers:7,8,9"  # bert features?
    base_opt += " eproj_dim:512"  # --
    if conf.use_rel_posi:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:0 enc_conf.clip_dist:16"  # enc1
    else:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:1 enc_conf.clip_dist:0"  # enc1
    # base_opt += " enc_conf.enc_tatt.n_layers:2 enc_conf.enc_tatt.use_posi:1"  # enc1
    # base_opt += " enc_conf.enc_rnn.n_layers:1 enc_conf.enc_hidden:1024"  # enc1
    # --
    # frame
    base_opt += " loss_evt:0.5 pred_evt:1"  # with evts
    base_opt += " evt_conf.cand_label_smoothing:0.05 evt_conf.label_smoothing:0.1"  # label smooth
    base_opt += " evt_conf.lookup_conf.use_emb:0"  # no adding frame embeddings?
    base_opt += " evt_conf.span_conf.sconf.hid_nlayer:1"  # pred scorer?
    if conf.assume_frame:  # no need for the evt module!!
        base_opt += " loss_evt:0 pred_evt:0 eval_conf.weight_frame:0."
    elif conf.assume_trg:  # no need for cand, but still need to identify frame types
        base_opt += " evt_conf.loss_cand:0. evt_conf.loss_use_posi:1 evt_conf.pred_use_posi:1"  # use-posi for evt
        base_opt += " evt_conf.pred_addition_non_score:-100000."  # NEGINF-non
        if is_fn:  # further use cons for fn
            base_opt += f" evt_cons_lex_file:{VOC_DIR}/cons_lex_{dataset_choice}.json evt_conf.pred_use_cons:1 evt_conf.pred_use_lu:1 evt_conf.loss_use_cons:0 evt_conf.loss_use_lu:0"  # cons & use-lu for evt
    else:
        # evt_conf -> direct
        base_opt += " evt_conf.loss_cand:1.0 evt_conf.loss_lab:1.0"  # loss_cand
        base_opt += " evt_conf.span_train_sample_rate:1.0 evt_conf.span_topk_rate:1.0 evt_conf.span_train_sample:1"  # some rates
        # --
        if is_pb:  # lab is aux for pb
            base_opt += " evt_conf.loss_lab:0.5 evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
        elif is_fn:  # lab is essential for fn
            base_opt += " loss_evt:1 evt_conf.loss_cand:0.5 evt_conf.span_train_sample_rate:0.33 evt_conf.span_topk_rate:0.4 evt_conf.span_train_sample:1"
        # --
        if conf.no_frame_label:
            base_opt += " evt_conf.loss_lab:0. evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
    # --
    # arg
    base_opt += " arg_use_finput:0"
    base_opt += f" fenc_conf.enc_att.n_layers:8 fenc_conf.clip_dist:{16 if conf.use_rel_posi else 0}"  # fenc
    # base_opt += " fenc_conf.enc_tatt.n_layers:6"  # fenc
    # base_opt += " fenc_conf.enc_rnn.n_layers:3 fenc_conf.enc_hidden:1024"  # enc1
    base_opt += " loss_arg:1. pred_arg:1"  # with args
    base_opt += " arg_conf.label_smoothing:0.1"  # label smooth
    if conf.arg_mode in ["span", "head"]:
        # arg_conf -> direct
        base_opt += " arg_conf.loss_cand:0.5"  # loss_cand
        # base_opt+=" arg_conf.span_train_sample_rate:0.33 arg_conf.span_topk_rate:0.4"  # some rates
        base_opt += " arg_conf.span_topk_rate:1. arg_conf.span_topk_count:10 arg_conf.span_train_sample:0"  # some rates
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        base_opt += " arg_conf.pp_check_more:1"  # check non-overlapping
        if conf.arg_mode == "span":
            base_opt += " arg_conf.max_width:30 arg_conf.softhead_topk:5 arg_conf.pred_non_overlapping:1"  # span
        elif conf.arg_mode == "head":
            base_opt += " arg_conf.core_span_mode:shead arg_conf.max_width:1"  # head
            # extender
            base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
            base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        else:
            raise NotImplementedError()
    elif conf.arg_mode == "soft":
        base_opt += " arg_conf:soft"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["anchor", "anchor2"]:
        base_opt += " arg_conf:anchor"
        if conf.arg_mode == "anchor2":  # yet another head mode!
            base_opt += " arg_conf.core_span_mode:shead"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["seq", "seq0"]:
        # arg_conf -> seq
        base_opt += " arg_conf:seq arg_conf.seq_scheme:BIO"  # use seq mode!
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        # --
        if conf.arg_mode == "seq":
            base_opt += " arg_conf.beam_k:150 arg_conf.use_bigram:0 arg_conf.pred_use_seq_cons:1"  # viterbi with constraints
            if conf.arg_seq_mod == "crf":  # crf-mode
                base_opt += " arg_conf.loss_mode:crf arg_conf.use_bigram:1 arg_conf.local_normalize:0"
        elif conf.arg_mode == "seq0":  # greedy mode: no crf and no viterbi
            base_opt += " arg_conf.pred_use_seq_cons:0 arg_conf.loss_mode:mle arg_conf.use_bigram:0 arg_conf.local_normalize:1"
        else:
            raise NotImplementedError()
    else:
        raise NotImplementedError()
    # --
    # =====
    # training
    base_opt += " ema_decay:0. ema_update_step:1"  # ema
    if 1:  # transformer-style schedule; flip to 0 for the rnn-style settings below
        UPE = 1000  # 1000 update as one epoch
        base_opt += " lrate.val:0.0002 anneal_times:10 anneal_patience:10 lrate.m:0.75"
        base_opt += f" valid_ufreq:{UPE} valid_epoch:0 max_uidx:{UPE*150} lrate_warmup_uidx:{8*UPE} lrate_decrease_alpha:0."
        # note: same batching settings regardless of use_rel_posi
        base_opt += " train_count_mode:ftok train_batch_size:4096 accu_batch:1"  # actually bs=bs*accu
        base_opt += " test_count_mode:ftok test_batch_size:2048"
        base_opt += " df_hdrop:0.2"  # general dropout
    else:  # possibly for rnn
        base_opt += " lrate.val:0.002 anneal_times:10 anneal_patience:10"
        base_opt += " train_count_mode:frame max_eidx:100 train_batch_size:32"
        base_opt += " df_hdrop:0.33"  # general dropout
    if is_pb:
        base_opt += " train_skip_noevt_rate:0.0"
    elif is_fn:
        base_opt += " train_skip_noevt_rate:1.0"  # skip sents where no targets!
    # data
    base_opt += " " + conf.dataset.get_data_str(DATA_DIR, conf.do_ms_train)
    base_opt += f" pretrain_wv_file:{VOC_DIR}/hits_{dataset_choice}.vec pretrain_scale:10."  # filtered pretrain file
    # nn
    base_opt += f" nn.device:0 nn.random_seed:9347{conf.cur_run} nn.random_cuda_seed:9349{conf.cur_run}"
    # =====
    # note: base_opt is only for training!!
    _L_PRE = conf.log_prefix
    DEBUG_OPTION = "-m pdb" if conf.debug else ""
    TRAIN_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.train {base_opt} log_file:{_L_PRE}_train {conf.train_extras}"
    # --
    TEST_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} log_file:{_L_PRE}_test {conf.test_extras}"
    # --
    if conf.do_train:
        system(TRAIN_CMD, pp=True)
    # --
    if conf.do_test:
        system(TEST_CMD, pp=True)
    # --
    if conf.do_test_all:
        for tfile in conf.dataset.all_dt_files:
            _TMP_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} test:{DATA_DIR}/{tfile} output:{conf.out_prefix}.{tfile} log_file:{_L_PRE}.{tfile} test_extra_pretrain_wv_files:{VOC_DIR}/hits_{dataset_choice}.vec {conf.test_extras}"
            system(_TMP_CMD, pp=True)
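A hypothetical launch of this runner, using only conf fields that appear above (values made up):

main('run_dir:run_pb0', 'dataset:pb', 'rgpu:0', 'cur_run:1',
     'do_train:1', 'do_test:1')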