Example #1
 def __init__(self):
     self.input = ReaderGetterConf()
     self.aux = ReaderGetterConf()
     self.output = WriterGetterConf()
     self.output_sent_and_discard_nonhit = False
     # --
     # indexer options
     self.key_dels = "-_"  # delete these chars for key
     self.search_topk = 1  # search for topK?
Example #2
 def __init__(self):
     super().__init__()
     # files
     self.main = ReaderGetterConf()  # as gold files
     self.gold = ""
     # eval with preds
     self.extra = ReaderGetterConf()
     self.preds = []  # list of preds
     self.econf = DparEvalConf().direct_update(deplab_l1=True)  # by default only check l1
Example #3
def main(input_file: str, output_file: str, checking_file: str,
         keep_rate: float):
    keep_rate = float(keep_rate)
    _gen = Random.get_np_generator(12345)
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(
        yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    if checking_file:
        checking_sents = list(
            yield_sents(
                ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        # collect keys
        hit_keys = set()
        for one_check_sent in checking_sents:
            tok_key = ''.join(one_check_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            hit_keys.add(tok_key)
        # filter
        filtered_sents = []
        for one_input_sent in input_sents:
            tok_key = ''.join(one_input_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # split and join again
            if tok_key not in hit_keys:
                filtered_sents.append(one_input_sent)
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample
    if keep_rate < 1.:
        # keep_rate < 1: treat it as a per-sentence sampling rate
        sample_sents = [
            s for r, s in zip(rstream, filtered_sents) if r < keep_rate
        ]
    elif keep_rate > 10:
        # keep_rate > 10: treat it as an absolute number of sentences to keep
        sample_sents = [z for z in filtered_sents]
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:
        # otherwise keep everything
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    if os.path.exists(output_file):
        assert False, f"File exists: {output_file}, delete it first!"
    if output_file:
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(
        f"Read {input_file}, check {checking_file}, output {output_file}, stat:"
    )
    OtherHelper.printd(stat)
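Note on the keep_rate convention in Example #3: a value below 1 acts as a per-sentence sampling rate, a value above 10 as an absolute number of sentences to keep, and anything in between keeps everything. A minimal self-contained sketch of the same branching with plain numpy (the helper name and toy data are illustrative, not part of the original script):
import numpy as np

def sample_items(items, keep_rate: float, seed: int = 12345):
    # mirror the branching above: <1 -> rate, >10 -> absolute count, otherwise keep all
    gen = np.random.RandomState(seed)
    items = list(items)
    if keep_rate < 1.:
        return [x for x in items if gen.random_sample() < keep_rate]
    elif keep_rate > 10:
        gen.shuffle(items)
        return items[:int(keep_rate)]
    else:
        return items

print(len(sample_items(range(1000), 0.25)))  # roughly 250
print(len(sample_items(range(1000), 100.)))  # exactly 100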
Example #4
def main(input_file: str, output_file: str):
    reader_conf = ReaderGetterConf().direct_update(input_format='conllufipb')
    reader_conf.validate()
    # --
    cc = Counter()
    arg_cc = Counter()
    all_insts = list(reader_conf.get_reader(input_path=input_file))
    for sent in all_insts:
        fields_args, fields_preds = sent.info[8], sent.info[9]
        assert len(fields_args) == len(fields_preds) and len(fields_args) == len(sent)
        # first collect preds
        all_preds = {}  # widx -> event
        for widx, vv in enumerate(fields_preds):
            pred_name = None
            for vv2 in vv.split("|"):
                if vv2.startswith("PBSENSE="):
                    assert pred_name is None
                    pred_name = vv2.split("=")[-1]
            if pred_name is not None:
                evt = sent.make_event(widx, 1, type=pred_name)
                assert widx not in all_preds
                all_preds[widx] = evt
        # then collect args
        for widx, vv in enumerate(fields_args):
            for vv2 in vv.split("|"):
                if ":" not in vv2:
                    continue
                tidx, aname = vv2.split(":", 1)
                tidx = int(tidx)
                role = None
                if aname.startswith("PBArg_"):
                    nn = aname[len("PBArg_"):]
                    role = f"ARG{nn}"
                elif aname.startswith("PBArgM_"):
                    _, nn = aname.split("_")
                    role = f"ARGM-{str.upper(nn)}"
                if role is not None:
                    evt = all_preds[tidx-1]
                    ef = sent.make_entity_filler(widx, 1, type="UNK")
                    evt.add_arg(ef, role)
                    arg_cc[role] += 1
        # --
        cc["sent"] += 1
        cc["frames"] += len(sent.events)
        cc["args"] += sum(len(z.args) for z in sent.events)
        # --
    # --
    with WriterGetterConf().get_writer(output_path=output_file) as writer:
        writer.write_insts(all_insts)
    # --
    zlog(f"Read fipb from {input_file} to {output_file}: {cc}")
    zlog(f"Role counts = {arg_cc}")
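The per-token field parsing in Example #4 (PBSENSE= for predicates, tidx:PBArg_N / tidx:PBArgM_x for arguments) can be exercised in isolation; a small sketch on a made-up field string (the helper name and sample values are assumptions, not from the original):
def parse_arg_field(vv: str):
    # parse one args field like "3:PBArg_0|5:PBArgM_tmp|_" into (target_idx, role) pairs
    out = []
    for vv2 in vv.split("|"):
        if ":" not in vv2:
            continue
        tidx, aname = vv2.split(":", 1)
        role = None
        if aname.startswith("PBArg_"):
            role = f"ARG{aname[len('PBArg_'):]}"
        elif aname.startswith("PBArgM_"):
            _, nn = aname.split("_")
            role = f"ARGM-{nn.upper()}"
        if role is not None:
            out.append((int(tidx), role))
    return out

print(parse_arg_field("3:PBArg_0|5:PBArgM_tmp|_"))  # [(3, 'ARG0'), (5, 'ARGM-TMP')]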
Example #5
def main(input_format, *input_files: str):
    reader_conf = ReaderGetterConf().direct_update(input_format=input_format)
    reader_conf.validate()
    # --
    all_insts = []
    for ff in input_files:
        one_insts = list(reader_conf.get_reader(input_path=ff))
        zlog(f"Read from {ff}: {len(one_insts)} instances.")
        all_insts.extend(one_insts)
    # --
    if input_format == "conllu":
        do_stat(all_insts)
    do_stat_srl(all_insts)
Example #6
 def __init__(self):
     self.frame_file = ""
     self.train = ReaderGetterConf()
     self.lex_conf = LexConstrainerConf()
     self.fe_conf = FEConstrainerConf()
     self.lex_save_name = "cons_lex.json"
     self.fe_save_name = "cons_fe.json"
Example #7
def main(input_format, *input_files: str):
    reader_conf = ReaderGetterConf().direct_update(input_format=input_format)
    reader_conf.validate()
    # --
    all_insts = []
    for ff in input_files:
        one_insts = list(reader_conf.get_reader(input_path=ff))
        cc = Counter()
        for sent in yield_sents(one_insts):
            cc['sent'] += 1
            for evt in sent.events:
                cc['evt'] += 1
                cc['arg'] += len(evt.args)
        zlog(
            f"Read from {ff}: {cc['sent']/1000:.1f}k&{cc['evt']/1000:.1f}k&{cc['arg']/1000:.1f}k"
        )
Example #8
 def __init__(self):
     self.R = ReaderGetterConf()
     self.W = WriterGetterConf()
     # -----
     # ann: to be added
     self.ann_batch_size = 1  # read how many instances and then fire?
     self.report_batch_interval = 1000  # how many batches to report
Example #9
def main(output_prefix, *input_files):
    # input
    all_sents = []
    for f in input_files:
        one_reader = ReaderGetterConf().get_reader(input_path=f)
        one_insts = list(one_reader)
        all_sents.append([z for z in yield_sents(one_insts)])
        zlog(f"Read from {f}: {len(all_sents[-1])} sents")
    # align
    sent_map = OrderedDict()
    for fidx, sents in enumerate(all_sents):
        for sent in sents:
            doc_id = sent.info.get("doc_id", "UNK")
            if doc_id.split("/", 1)[0] == "ontonotes":
                doc_id = doc_id.split("/", 1)[1]
            key = doc_id + "|".join(sent.seq_word.vals)  # key = doc_id + joined tokens
            if key not in sent_map:
                sent_map[key] = [sent]
            else:
                sent_map[key].append(sent)
    # --
    num_files = len(input_files)
    matched_sents = [vs for vs in sent_map.values() if len(vs) == num_files]
    unmatched_sents = [vs for vs in sent_map.values() if len(vs) != num_files]
    zlog(f"Aligned sents: {len(matched_sents)} (unmatched keys: {len(unmatched_sents)})")
    # output
    for outi in range(num_files):
        out_sents = [z[outi] for z in matched_sents]
        writer = WriterGetterConf().get_writer(
            output_path=f"{output_prefix}{outi}")
        writer.write_insts(out_sents)
        writer.close()
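The alignment in Example #9 keys each sentence by its doc_id (with any leading "ontonotes/" stripped) plus the joined token sequence; a minimal illustration with toy data (names and values here are assumptions):
from collections import OrderedDict

def sent_key(doc_id: str, words):
    # strip an "ontonotes/" prefix, then concatenate doc_id with the joined tokens
    if doc_id.split("/", 1)[0] == "ontonotes":
        doc_id = doc_id.split("/", 1)[1]
    return doc_id + "|".join(words)

sent_map = OrderedDict()
for fidx, (doc_id, words) in enumerate([("ontonotes/bn/abc_01", ["The", "end", "."]),
                                        ("bn/abc_01", ["The", "end", "."])]):
    sent_map.setdefault(sent_key(doc_id, words), []).append(fidx)

print(sent_map)  # both toy sentences collapse onto the same key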
Example #10
def main(*aligned_files):
    RANDOM_DROP_EVT_RATE = 0.15
    # input
    aligned_insts = []
    for f in aligned_files:
        one_reader = ReaderGetterConf().get_reader(input_path=f)
        one_insts = list(one_reader)
        aligned_insts.append([z for z in yield_sents(one_insts)])
    # filter
    good_idxes = []
    for idx in range(len(aligned_insts[0])):
        sent_good = True
        for sent in yield_sents([z[idx] for z in aligned_insts]):
            if RANDOM_DROP_EVT_RATE > 0:
                for evt in list(sent.events):
                    if np.random.random_sample() < RANDOM_DROP_EVT_RATE:
                        sent.delete_frame(evt, "evt")
            for evt in sent.events:
                hits = set()
                for arg in evt.args:
                    widx, wlen = arg.arg.mention.get_span()
                    for ii in range(widx, widx + wlen):
                        if ii in hits:
                            sent_good = False
                        hits.add(ii)
        if sent_good:
            good_idxes.append(idx)
    # output
    output_prefix = "_tmp.json"
    for outi, insts in enumerate(aligned_insts):
        filtered_insts = [insts[ii] for ii in good_idxes]
        writer = WriterGetterConf().get_writer(
            output_path=f"{output_prefix}{outi}")
        writer.write_insts(filtered_insts)
        writer.close()
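Example #10 rejects a sentence whenever two arguments of the same event cover overlapping tokens; the same check on bare (widx, wlen) spans, as a small standalone sketch (the helper name is hypothetical):
def spans_overlap(spans):
    # True if any token index is covered by more than one (widx, wlen) span
    hits = set()
    for widx, wlen in spans:
        for ii in range(widx, widx + wlen):
            if ii in hits:
                return True
            hits.add(ii)
    return False

print(spans_overlap([(0, 2), (3, 1)]))  # False
print(spans_overlap([(0, 2), (1, 3)]))  # True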
Example #11
def main(file_in="", file_out=""):
    insts = list(ReaderGetterConf().get_reader(input_path=file_in))  # read from stdin
    with WriterGetterConf().get_writer(output_path=file_out) as writer:
        for inst in insts:
            for sent in yield_sents([inst]):
                sent.delete_frames("evt")
                sent.delete_frames("ef")
            writer.write_inst(inst)
Example #12
 def __init__(self):
     self.br = ReaderGetterConf()  # basic reader
     # extra functions
     self.wl_use_lc = False  # use lower case words and lemmas?
     self.deplab_use_label0 = True  # using only first-level ud label
     self.sent_loss_weight_non = 1.0  # sent level loss_weight_non, default is 1.
     self.assume_frame_lu = False  # special mode, assume that we have the input frame's LUs
     self.set_ee_heads = True  # assign heads for evt and args (by default simply assign them!!)
Example #13
 def annotate(self, insts: List):
     conf: AnnotatorP2DConf = self.conf
     # --
     # get all sentences and run in batch
     all_sents = list(yield_sents(insts))
     tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
     with zopen(tmp_input, 'w') as fd:
         for sent in all_sents:
             fd.write(sent2tree(sent) + "\n")
     # run
     tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
     log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
     system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
     # read output and add back
     conll_reader_conf = ReaderGetterConf()
     conll_reader_conf.input_conf.use_multiline = True
     conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
     conll_reader_conf.input_format = "conllu"
     conll_reader_conf.input_path = tmp_output
     conll_reader = get_reader(conll_reader_conf)
     new_sents = list(conll_reader)
     # --
     assert len(all_sents) == len(new_sents)
     for s0, s1 in zip(all_sents, new_sents):
         assert len(s0) == len(s1)
         mismatched_tokens = [
             (v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals)
             if v1 != v2
         ]
         if len(mismatched_tokens) > 0:
             zwarn(
                 f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}"
             )
             if conf.p2d_change_words:
                 s0.build_words(s1.seq_word.vals)  # use the other one!!
             # breakpoint()
         # note: build again!
         s0.build_dep_tree(s1.tree_dep.seq_head.vals, [
             self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals
         ])
         if conf.p2d_use_xpos:
             trg_pos_list = s1.info.get("xpos")
         else:
             trg_pos_list = s1.seq_upos.vals
         s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
Example #14
 def __init__(self):
     self.input = ReaderGetterConf()
     self.aux = ReaderGetterConf()
     self.output = WriterGetterConf()
     # --
     self.word_map = "PTB"
     self.output_sent_and_discard_nonhit = False
     self.convert_f = ""  # further do conversion, like "convert_zh"
     self.change_words = False  # whether change to words from trg sent?
     # --
     # indexer conf
     self.delete_char_scheme = ""  # ar
     self.change_char_scheme = ""  # zh/en
     self.fuzzy_no_repeat_query = False  # no repeat for fuzzy match!
     self.fuzzy_word_cnum = 0  # allow how many chars in one word to be diff?
     self.fuzzy_seq_wnum = Constants.INT_PRAC_MAX  # allow how many words in seq to be diff?
     self.fuzzy_seq_wrate = 0.  # allow how much ratio of words in seq to be diff?
     self.no_exact_match = False  # only do fuzzy match (just debugging)
Example #15
 def __init__(self):
     super().__init__()
     # files
     self.take_ratio = 1.0
     self.main = ReaderGetterConf()  # as gold files
     self.gold = ""
     self.align_gold_sent = False
     self.fake_gold_frames = False  # put syntax frames to gold
     # eval with preds
     self.extra = ReaderGetterConf()
     self.preds = []  # list of preds
     self.econf: FrameEvalConf = ConfEntryChoices(
         {
             'frame': FrameEvalConf(),
             'fn': MyFNEvalConf(),
             'pb': MyPBEvalConf()
         }, 'frame')
     # self.econf = FrameEvalConf()
     # others
     self.gold_set_ee_heads = False  # try to auto assign heads
     self.pred_set_ee_heads = False  # try to auto assign heads
Example #16
def main(input_file: str, max_budget=1, output_file=''):
    cc = Counter()
    max_budget = int(max_budget)
    all_insts = list(ReaderGetterConf().get_reader(input_path=input_file))
    from msp2.data.vocab.frames import RoleBudgetHelper
    budgets = RoleBudgetHelper.build_role_budgets_from_data(all_insts, max_budget=max_budget)
    if output_file in ["", "-"]:
        # zlog(budgets)
        # better printing
        for k in sorted(budgets.keys()):
            zlog(f"'{k}': {budgets[k]},")
    else:
        default_json_serializer.to_file(budgets, output_file)
Example #17
def main(input_file: str, output_file: str):
    cc = Counter()
    all_insts = list(ReaderGetterConf().get_reader(input_path=input_file))
    hf = HeadFinder("NOUN")
    for sent in yield_sents(all_insts):
        cc["sent"] += 1
        for evt in sent.events:
            cc["frame"] += 1
            for arg in list(evt.args):
                cc["arg"] += 1
                m = arg.arg.mention
                widx, wlen = m.get_span()
                hidx = hf.find_shead(m.sent, widx, wlen)
                m.set_span(hidx, 1)
    # --
    with WriterGetterConf().get_writer(output_path=output_file) as writer:
        writer.write_insts(all_insts)
    # --
    zlog(f"Convert from {input_file} to {output_file}: {cc}")
Example #18
def main(vocab_file: str, input_path: str, output_file='lt.pkl'):
    # first get vocab
    vocabs = default_pickle_serializer.from_file(vocab_file)
    arg_voc = vocabs[0]['arg']
    zlog(f"Read {arg_voc} from {vocab_file}")
    # make it to BIO-vocab
    bio_voc = SeqVocab(arg_voc)
    zlog(f"Build bio-voc of {bio_voc}")
    # read insts
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))
    all_sents = list(yield_sents(insts))
    # --
    mat = np.ones([len(bio_voc), len(bio_voc)],
                  dtype=np.float32)  # add-1 smoothing!
    cc = Counter()
    for sent in all_sents:
        for evt in sent.events:
            labels = ['O'] * len(sent)
            for arg in evt.args:
                widx, wlen = arg.mention.get_span()
                # mark the argument span with B-/I- tags (slice end is widx + wlen)
                labels[widx:widx + wlen] = ["B-" + arg.role] + ["I-" + arg.role] * (wlen - 1)
            for a, b in zip(labels, labels[1:]):
                cc[f"{a}->{b}"] += 1
                mat[bio_voc[a], bio_voc[b]] += 1
        # --
    # --
    v = SimpleVocab()
    for name, count in cc.items():
        v.feed_one(name, count)
    v.build_sort()
    print(v.get_info_table()[:50].to_string())
    # OtherHelper.printd(cc)
    # --
    # normalize & log according to row and save
    mat = mat / mat.sum(-1, keepdims=True)
    mat = np.log(mat)
    default_pickle_serializer.to_file(mat, output_file)
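The matrix saved in Example #18 holds row-normalized log transition scores over the BIO label set. As one possible use, a hypothetical Viterbi-style decode that combines it with per-token emission scores (pure numpy; not part of the original script):
import numpy as np

def viterbi_decode(emissions: np.ndarray, log_trans: np.ndarray):
    # emissions: [T, L] log scores per token; log_trans: [L, L] log transition scores
    T, L = emissions.shape
    score = emissions[0].copy()
    back = np.zeros([T, L], dtype=np.int64)
    for t in range(1, T):
        cand = score[:, None] + log_trans + emissions[t][None, :]  # [prev, cur]
        back[t] = cand.argmax(axis=0)
        score = cand.max(axis=0)
    path = [int(score.argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1]  # best label index per token

rng = np.random.RandomState(0)
print(viterbi_decode(rng.randn(4, 3), np.log(np.full([3, 3], 1. / 3))))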
Example #19
def main(input_path):
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))
    all_sents = list(yield_sents(insts))
    set_ee_heads(insts)
    # --
    cc = Counter()
    for sent in all_sents:
        cc["sent"] += 1
        arg_maps = [[] for _ in range(len(sent))]
        for evt in sent.events:
            cc["evt"] += 1
            for arg in evt.args:
                # --
                # no VERB
                if arg.role in ["V", "C-V"]:
                    cc["argV"] += 1
                    continue
                # --
                cc["arg"] += 1
                ef = arg.arg
                shidx = ef.mention.shead_widx
                span = ef.mention.get_span()
                arg_maps[shidx].append(ZObject(evt=evt, ef=ef, span=span))
        # check for all tokens
        cc["tok"] += len(arg_maps)
        for one_objs in arg_maps:
            cc[f"tok_N{len(one_objs)}"] += 1
            all_spans = set(z.span for z in one_objs)
            cc[f"tok_N{len(one_objs)}S{len(all_spans)}"] += 1
            # --
            if len(one_objs) > 0:
                cc[f"tok_diff={len(all_spans)>1}"] += 1
            if len(all_spans) > 1:
                breakpoint()  # stop here to inspect tokens whose args have differing spans
        # --
    # --
    OtherHelper.printd(cc)
Example #20
 def __init__(self):
     # ==
     # top/group-level info (used at outside, put here for convenience)
     self.group_name = ""
     self.group_files = []  # List: # -> "input_file" or Dict: sub_name -> "input_file"
     self.group_tasks = []  # tasks to perform! note: allow sub-name!
     self.group_info = {}  # extra info?
     self.group_joint = False  # join all these into one dataset?
     # train (train)
     self.group_sample_rate = SVConf().direct_update(
         val=1., which_idx="cidx", mode="none")  # outside_sample by rate
     self.group_sample_alpha = 0.  # inside_sample by len(inst)**alpha
     # eval (test/dev)
     self.group_eval_weight = 1.  # weight for final eval
     # ==
     # (static) io
     self.R = ReaderGetterConf()
     self.W = WriterGetterConf()
     # - paths (further we have default ones for "*_gold", "*_output" if not provided in extras)
     self.input_dir = "./"  # if needed
     self.input_file = ""
     self.gold_file = ""  # by default the same as input_file
     self.output_dir = "./"  # if needed
     self.output_file = ""
     self.output_prefix = "_zout"  # default output prefix, full will be "{this}.{wset}.json"
     # - special
     self.preprocessors = []  # need to slightly modify the data?
     # self.approx_prev_next = False  # approx. setting of prev & next when loading, note: deprecated
     self.presample = 1.0  # (>1=N,<1=Rate) random sample how much at the very beginning, as pre-processing for convenience!
     self.presample_shuffle = False  # whether shuffle in presample?
     self.presample_reverse = False  # from back to start (for convenience)
     # ==
     # runtime
     self.convert_conf = ZIConverterConf()
     self.batch_conf = ZIBatcherConf()
Example #21
 def __init__(self):
     self.R = ReaderGetterConf()
     self.result_key = ""  # by default, use input file name!
     self.result_center = "res.json"  # store key
     # for query
     self.key_re_pattern = ".*"
Example #22
 def __init__(self):
     self.src_input = ReaderGetterConf()  # src (dep + srl)
     self.trg_input = ReaderGetterConf()  # trg (dep)
     self.output = WriterGetterConf()
     # --
     self.method = "path"  # span/path
Example #23
 def __init__(self):
     self.gold = ReaderGetterConf()
     self.pred = ReaderGetterConf()
     self.result_file = ""  # file to output details
     self.econf: EvalConf = None
     self.print_details = True  # whether print get_detailed_str()
Example #24
 def __init__(self):
     self.train = ReaderGetterConf()
     self.econf = RuleTargetExtractorConf()
     self.save_name = "rule.model.json"
Example #25
def main():
    insts = list(ReaderGetterConf().get_reader())  # read from stdin
    for sent in yield_sents(insts):
        sorted_evts = sorted(sent.events, key=lambda x: x.mention.get_span())
        for evt in sorted_evts:
            print(" ".join(evt.info["slab"]))
Example #26
 def __init__(self):
     self.gold = ReaderGetterConf()
     self.pred = ReaderGetterConf()
     # --
     self.output = WriterGetterConf()
Example #27
 def __init__(self):
     self.R = ReaderGetterConf()
     self.W = WriterGetterConf()
     self.direction = "short"  # short:ARG->A, long:A->ARG
Example #28
 def __init__(self):
     self.gold = ReaderGetterConf()
     self.pred = ReaderGetterConf()
Example #29
 def __init__(self):
     self.input = ReaderGetterConf()