def __init__(self):
    self.input = ReaderGetterConf()
    self.aux = ReaderGetterConf()
    self.output = WriterGetterConf()
    self.output_sent_and_discard_nonhit = False
    # --
    # indexer options
    self.key_dels = "-_"  # delete these chars when building the key
    self.search_topk = 1  # how many top-K candidates to search for

def __init__(self):
    super().__init__()
    # files
    self.main = ReaderGetterConf()  # as gold files
    self.gold = ""
    # eval with preds
    self.extra = ReaderGetterConf()
    self.preds = []  # list of preds
    self.econf = DparEvalConf().direct_update(deplab_l1=True)  # by default, only check L1 labels

def main(input_file: str, output_file: str, checking_file: str, keep_rate: float):
    keep_rate = float(keep_rate)
    _gen = Random.get_np_generator(12345)
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    if checking_file:
        checking_sents = list(yield_sents(ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        # collect normalized keys: lowercase, whitespace-free concatenation of the tokens
        hit_keys = set()
        for one_check_sent in checking_sents:
            tok_key = ''.join(one_check_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())  # strip any remaining whitespace
            hit_keys.add(tok_key)
        # filter out input sentences whose key also appears in the checking file
        filtered_sents = []
        for one_input_sent in input_sents:
            tok_key = ''.join(one_input_sent.seq_word.vals).lower()
            tok_key = ''.join(tok_key.split())
            if tok_key not in hit_keys:
                filtered_sents.append(one_input_sent)
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample: keep_rate<1 means a sampling rate, keep_rate>10 means an absolute count
    if keep_rate < 1.:
        sample_sents = [s for r, s in zip(rstream, filtered_sents) if r < keep_rate]
    elif keep_rate > 10:
        sample_sents = list(filtered_sents)
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    if output_file:
        assert not os.path.exists(output_file), f"File exists: {output_file}, delete it first!"
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(f"Read {input_file}, check {checking_file}, output {output_file}, stat:")
    OtherHelper.printd(stat)

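# A minimal standalone sketch of the overloaded keep_rate dispatch above: values
# below 1.0 act as a per-item keep probability, values above 10 as an absolute
# budget taken after shuffling, and anything in between keeps everything. It uses
# the stdlib random module instead of the msp2 Random helpers, and sample_items
# is a hypothetical name for illustration only.
import random

def sample_items(items, keep_rate: float, seed=12345):
    rng = random.Random(seed)
    if keep_rate < 1.:  # interpret as a keep probability per item
        return [x for x in items if rng.random() < keep_rate]
    elif keep_rate > 10:  # interpret as an absolute count, taken after shuffling
        shuffled = list(items)
        rng.shuffle(shuffled)
        return shuffled[:int(keep_rate)]
    return list(items)  # 1 <= keep_rate <= 10: keep everything

# e.g. sample_items(range(100), 0.5) keeps roughly 50; sample_items(range(100), 20) keeps exactly 20
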
def main(input_file: str, output_file: str):
    reader_conf = ReaderGetterConf().direct_update(input_format='conllufipb')
    reader_conf.validate()
    # --
    cc = Counter()
    arg_cc = Counter()
    all_insts = list(reader_conf.get_reader(input_path=input_file))
    for sent in all_insts:
        fields_args, fields_preds = sent.info[8], sent.info[9]
        assert len(fields_args) == len(fields_preds) and len(fields_args) == len(sent)
        # first collect the predicates
        all_preds = {}  # widx -> event
        for widx, vv in enumerate(fields_preds):
            pred_name = None
            for vv2 in vv.split("|"):
                if vv2.startswith("PBSENSE="):
                    assert pred_name is None
                    pred_name = vv2.split("=")[-1]
            if pred_name is not None:
                evt = sent.make_event(widx, 1, type=pred_name)
                assert widx not in all_preds
                all_preds[widx] = evt
        # then collect the args
        for widx, vv in enumerate(fields_args):
            for vv2 in vv.split("|"):
                if ":" not in vv2:
                    continue
                tidx, aname = vv2.split(":", 1)
                tidx = int(tidx)
                role = None
                if aname.startswith("PBArg_"):
                    nn = aname[len("PBArg_"):]
                    role = f"ARG{nn}"
                elif aname.startswith("PBArgM_"):
                    _, nn = aname.split("_")
                    role = f"ARGM-{str.upper(nn)}"
                if role is not None:
                    evt = all_preds[tidx - 1]  # tidx is 1-based
                    ef = sent.make_entity_filler(widx, 1, type="UNK")
                    evt.add_arg(ef, role)
                    arg_cc[role] += 1
        # --
        cc["sent"] += 1
        cc["frames"] += len(sent.events)
        cc["args"] += sum(len(z.args) for z in sent.events)
    # --
    with WriterGetterConf().get_writer(output_path=output_file) as writer:
        writer.write_insts(all_insts)
    # --
    zlog(f"Read fipb from {input_file} to {output_file}: {cc}")
    zlog(f"Role counts = {arg_cc}")

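# The PropBank label conversion above can be restated as a small pure function
# ("PBArg_0" -> "ARG0", "PBArgM_tmp" -> "ARGM-TMP"); pb2role is a hypothetical
# name used here for illustration only.
def pb2role(aname: str):
    if aname.startswith("PBArg_"):
        return f"ARG{aname[len('PBArg_'):]}"
    elif aname.startswith("PBArgM_"):
        _, nn = aname.split("_")
        return f"ARGM-{nn.upper()}"
    return None  # not a PropBank argument label

assert pb2role("PBArg_1") == "ARG1" and pb2role("PBArgM_tmp") == "ARGM-TMP"
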
def main(input_format, *input_files: str):
    reader_conf = ReaderGetterConf().direct_update(input_format=input_format)
    reader_conf.validate()
    # --
    all_insts = []
    for ff in input_files:
        one_insts = list(reader_conf.get_reader(input_path=ff))
        zlog(f"Read from {ff}: {len(one_insts)} instances.")
        all_insts.extend(one_insts)
    # --
    if input_format == "conllu":
        do_stat(all_insts)
    do_stat_srl(all_insts)

def __init__(self):
    self.frame_file = ""
    self.train = ReaderGetterConf()
    self.lex_conf = LexConstrainerConf()
    self.fe_conf = FEConstrainerConf()
    self.lex_save_name = "cons_lex.json"
    self.fe_save_name = "cons_fe.json"

def main(input_format, *input_files: str):
    reader_conf = ReaderGetterConf().direct_update(input_format=input_format)
    reader_conf.validate()
    # --
    for ff in input_files:
        one_insts = list(reader_conf.get_reader(input_path=ff))
        cc = Counter()
        for sent in yield_sents(one_insts):
            cc['sent'] += 1
            for evt in sent.events:
                cc['evt'] += 1
                cc['arg'] += len(evt.args)
        zlog(f"Read from {ff}: {cc['sent']/1000:.1f}k&{cc['evt']/1000:.1f}k&{cc['arg']/1000:.1f}k")

def __init__(self):
    self.R = ReaderGetterConf()
    self.W = WriterGetterConf()
    # -----
    # ann: to be added
    self.ann_batch_size = 1  # read how many instances before firing an annotation batch?
    self.report_batch_interval = 1000  # report after how many batches

def main(output_prefix, *input_files):
    # input
    all_sents = []
    for f in input_files:
        one_reader = ReaderGetterConf().get_reader(input_path=f)
        one_insts = list(one_reader)
        all_sents.append([z for z in yield_sents(one_insts)])
        zlog(f"Read from {f}: {len(all_sents[-1])} sents")
    # align by doc_id + word sequence
    sent_map = OrderedDict()
    for fidx, sents in enumerate(all_sents):
        for sent in sents:
            doc_id = sent.info.get("doc_id", "UNK")
            if doc_id.split("/", 1)[0] == "ontonotes":  # strip the "ontonotes/" prefix
                doc_id = doc_id.split("/", 1)[1]
            key = doc_id + "|".join(sent.seq_word.vals)  # map by doc_id + word key
            if key not in sent_map:
                sent_map[key] = [sent]
            else:
                sent_map[key].append(sent)
    # --
    num_files = len(input_files)
    matched_sents = [vs for vs in sent_map.values() if len(vs) == num_files]
    unmatched_sents = [vs for vs in sent_map.values() if len(vs) != num_files]
    zlog(f"Aligned sents: {len(matched_sents)} (unmatched: {len(unmatched_sents)})")
    # output
    for outi in range(num_files):
        out_sents = [z[outi] for z in matched_sents]
        writer = WriterGetterConf().get_writer(output_path=f"{output_prefix}{outi}")
        writer.write_insts(out_sents)
        writer.close()

def main(*aligned_files):
    RANDOM_DROP_EVT_RATE = 0.15
    # input
    aligned_insts = []
    for f in aligned_files:
        one_reader = ReaderGetterConf().get_reader(input_path=f)
        one_insts = list(one_reader)
        aligned_insts.append([z for z in yield_sents(one_insts)])
    # filter: keep only indices where no event has overlapping argument spans
    good_idxes = []
    for idx in range(len(aligned_insts[0])):
        sent_good = True
        for sent in yield_sents([z[idx] for z in aligned_insts]):
            if RANDOM_DROP_EVT_RATE > 0:  # randomly drop some events
                for evt in list(sent.events):
                    if np.random.random_sample() < RANDOM_DROP_EVT_RATE:
                        sent.delete_frame(evt, "evt")
            for evt in sent.events:
                hits = set()
                for arg in evt.args:
                    widx, wlen = arg.arg.mention.get_span()
                    for ii in range(widx, widx + wlen):
                        if ii in hits:  # two args of one event share a token
                            sent_good = False
                        hits.add(ii)
        if sent_good:
            good_idxes.append(idx)
    # output
    output_prefix = "_tmp.json"
    for outi, insts in enumerate(aligned_insts):
        filtered_insts = [insts[ii] for ii in good_idxes]
        writer = WriterGetterConf().get_writer(output_path=f"{output_prefix}{outi}")
        writer.write_insts(filtered_insts)
        writer.close()

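# The per-event filter above boils down to this overlap test: an event is
# rejected when any two of its argument spans share a token index. A standalone
# restatement (has_overlap is a hypothetical helper; spans are (widx, wlen)
# pairs as returned by mention.get_span()):
def has_overlap(spans) -> bool:
    hits = set()
    for widx, wlen in spans:
        for ii in range(widx, widx + wlen):
            if ii in hits:
                return True
            hits.add(ii)
    return False

assert has_overlap([(0, 2), (1, 3)]) and not has_overlap([(0, 2), (2, 1)])
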
def main(file_in="", file_out=""): insts = list(ReaderGetterConf().get_reader(input_path=file_in)) # read from stdin with WriterGetterConf().get_writer(output_path=file_out) as writer: for inst in insts: for sent in yield_sents([inst]): sent.delete_frames("evt") sent.delete_frames("ef") writer.write_inst(inst)
def __init__(self):
    self.br = ReaderGetterConf()  # basic reader
    # extra functions
    self.wl_use_lc = False  # use lower-cased words and lemmas?
    self.deplab_use_label0 = True  # use only the first-level UD label
    self.sent_loss_weight_non = 1.0  # sent-level loss_weight_non, default is 1.
    self.assume_frame_lu = False  # special mode: assume that we have the input frame's LUs
    self.set_ee_heads = True  # assign heads for evt and args (by default, simply assign them!)

def annotate(self, insts: List):
    conf: AnnotatorP2DConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:
            fd.write(sent2tree(sent) + "\n")
    # run the external converter
    tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
    log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
    system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
    # read the output and add it back
    conll_reader_conf = ReaderGetterConf()
    conll_reader_conf.input_conf.use_multiline = True
    conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
    conll_reader_conf.input_format = "conllu"
    conll_reader_conf.input_path = tmp_output
    conll_reader = get_reader(conll_reader_conf)
    new_sents = list(conll_reader)
    # --
    assert len(all_sents) == len(new_sents)
    for s0, s1 in zip(all_sents, new_sents):
        assert len(s0) == len(s1)
        mismatched_tokens = [(v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals) if v1 != v2]
        if len(mismatched_tokens) > 0:
            zwarn(f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}")
            if conf.p2d_change_words:
                s0.build_words(s1.seq_word.vals)  # use the other one!!
        # note: build again!
        s0.build_dep_tree(s1.tree_dep.seq_head.vals,
                          [self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals])
        if conf.p2d_use_xpos:
            trg_pos_list = s1.info.get("xpos")
        else:
            trg_pos_list = s1.seq_upos.vals
        s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])

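# For reference, the same multiline CoNLL-U reading pattern as used above, as a
# stand-alone snippet (the input path here is a made-up placeholder):
conllu_conf = ReaderGetterConf()
conllu_conf.input_conf.use_multiline = True
conllu_conf.input_conf.mtl_ignore_f = "'ignore_#'"
conllu_conf.input_format = "conllu"
conllu_conf.input_path = "_some_parses.conllu"  # placeholder path
conllu_sents = list(get_reader(conllu_conf))
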
def __init__(self):
    self.input = ReaderGetterConf()
    self.aux = ReaderGetterConf()
    self.output = WriterGetterConf()
    # --
    self.word_map = "PTB"
    self.output_sent_and_discard_nonhit = False
    self.convert_f = ""  # further conversion, like "convert_zh"
    self.change_words = False  # whether to change to the words from the trg sent?
    # --
    # indexer conf
    self.delete_char_scheme = ""  # ar
    self.change_char_scheme = ""  # zh/en
    self.fuzzy_no_repeat_query = False  # no repeats for fuzzy match!
    self.fuzzy_word_cnum = 0  # how many chars in one word are allowed to differ?
    self.fuzzy_seq_wnum = Constants.INT_PRAC_MAX  # how many words in a seq are allowed to differ?
    self.fuzzy_seq_wrate = 0.  # what ratio of words in a seq is allowed to differ?
    self.no_exact_match = False  # only do fuzzy match (just for debugging)

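# A hypothetical reading of the fuzzy_word_cnum knob above: two words match
# fuzzily when at most cnum aligned characters differ. This is an assumption
# about the intended semantics, for illustration only, not the actual matcher.
def words_fuzzy_match(w1: str, w2: str, cnum: int) -> bool:
    if len(w1) != len(w2):  # assume only same-length words can fuzzily match
        return False
    return sum(c1 != c2 for c1, c2 in zip(w1, w2)) <= cnum

assert words_fuzzy_match("gray", "grey", 1) and not words_fuzzy_match("gray", "grew", 1)
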
def __init__(self):
    super().__init__()
    # files
    self.take_ratio = 1.0
    self.main = ReaderGetterConf()  # as gold files
    self.gold = ""
    self.align_gold_sent = False
    self.fake_gold_frames = False  # put syntax frames into gold
    # eval with preds
    self.extra = ReaderGetterConf()
    self.preds = []  # list of preds
    self.econf: FrameEvalConf = ConfEntryChoices(
        {'frame': FrameEvalConf(), 'fn': MyFNEvalConf(), 'pb': MyPBEvalConf()}, 'frame')
    # others
    self.gold_set_ee_heads = False  # try to auto-assign heads
    self.pred_set_ee_heads = False  # try to auto-assign heads

def main(input_file: str, max_budget=1, output_file=''):
    max_budget = int(max_budget)
    all_insts = list(ReaderGetterConf().get_reader(input_path=input_file))
    from msp2.data.vocab.frames import RoleBudgetHelper
    budgets = RoleBudgetHelper.build_role_budgets_from_data(all_insts, max_budget=max_budget)
    if output_file in ["", "-"]:
        # better printing than dumping the dict: one sorted entry per line
        for k in sorted(budgets.keys()):
            zlog(f"'{k}': {budgets[k]},")
    else:
        default_json_serializer.to_file(budgets, output_file)

def main(input_file: str, output_file: str):
    cc = Counter()
    all_insts = list(ReaderGetterConf().get_reader(input_path=input_file))
    hf = HeadFinder("NOUN")
    for sent in yield_sents(all_insts):
        cc["sent"] += 1
        for evt in sent.events:
            cc["frame"] += 1
            for arg in list(evt.args):
                cc["arg"] += 1
                m = arg.arg.mention
                widx, wlen = m.get_span()
                hidx = hf.find_shead(m.sent, widx, wlen)  # shrink the span to its head word
                m.set_span(hidx, 1)
    # --
    with WriterGetterConf().get_writer(output_path=output_file) as writer:
        writer.write_insts(all_insts)
    # --
    zlog(f"Convert from {input_file} to {output_file}: {cc}")

def main(vocab_file: str, input_path: str, output_file='lt.pkl'):
    # first get the vocab
    vocabs = default_pickle_serializer.from_file(vocab_file)
    arg_voc = vocabs[0]['arg']
    zlog(f"Read {arg_voc} from {vocab_file}")
    # turn it into a BIO-vocab
    bio_voc = SeqVocab(arg_voc)
    zlog(f"Build bio-voc of {bio_voc}")
    # read insts
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))  # read from stdin if empty
    all_sents = list(yield_sents(insts))
    # --
    mat = np.ones([len(bio_voc), len(bio_voc)], dtype=np.float32)  # add-1 smoothing!
    cc = Counter()
    for sent in all_sents:
        for evt in sent.events:
            labels = ['O'] * len(sent)
            for arg in evt.args:
                widx, wlen = arg.mention.get_span()
                labels[widx:widx + wlen] = ["B-" + arg.role] + ["I-" + arg.role] * (wlen - 1)
            for a, b in zip(labels, labels[1:]):
                cc[f"{a}->{b}"] += 1
                mat[bio_voc[a], bio_voc[b]] += 1
    # --
    v = SimpleVocab()
    for name, count in cc.items():
        v.feed_one(name, count)
    v.build_sort()
    print(v.get_info_table()[:50].to_string())
    # --
    # row-normalize to probabilities, take logs, and save
    mat = mat / mat.sum(-1, keepdims=True)
    mat = np.log(mat)
    default_pickle_serializer.to_file(mat, output_file)

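# A self-contained illustration of the transition statistics built above: count
# adjacent BIO label pairs with add-1 smoothing, then row-normalize and take
# logs to get log P(next_label | current_label). The labels and vocab here are
# made up for demonstration.
import numpy as np

demo_labels = ["O", "B-ARG0", "I-ARG0", "O"]
demo_voc = {"O": 0, "B-ARG0": 1, "I-ARG0": 2}
demo_mat = np.ones([len(demo_voc), len(demo_voc)], dtype=np.float32)  # add-1 smoothing
for a, b in zip(demo_labels, demo_labels[1:]):
    demo_mat[demo_voc[a], demo_voc[b]] += 1
demo_log_trans = np.log(demo_mat / demo_mat.sum(-1, keepdims=True))  # rows sum to 1 before log
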
def main(input_path):
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))  # read from stdin if empty
    all_sents = list(yield_sents(insts))
    set_ee_heads(insts)
    # --
    cc = Counter()
    for sent in all_sents:
        cc["sent"] += 1
        arg_maps = [[] for _ in range(len(sent))]  # shead-widx -> arg objects
        for evt in sent.events:
            cc["evt"] += 1
            for arg in evt.args:
                # skip the verb itself (V / C-V roles)
                if arg.role in ["V", "C-V"]:
                    cc["argV"] += 1
                    continue
                # --
                cc["arg"] += 1
                ef = arg.arg
                shidx = ef.mention.shead_widx
                span = ef.mention.get_span()
                arg_maps[shidx].append(ZObject(evt=evt, ef=ef, span=span))
        # check all tokens
        cc["tok"] += len(arg_maps)
        for one_objs in arg_maps:
            cc[f"tok_N{len(one_objs)}"] += 1
            all_spans = set(z.span for z in one_objs)
            cc[f"tok_N{len(one_objs)}S{len(all_spans)}"] += 1
            # --
            if len(one_objs) > 0:
                cc[f"tok_diff={len(all_spans)>1}"] += 1
                if len(all_spans) > 1:
                    breakpoint()  # pause here for manual inspection of conflicting spans
    # --
    OtherHelper.printd(cc)

def __init__(self):
    # ==
    # top/group-level info (used at the outside; put here for convenience)
    self.group_name = ""
    self.group_files = []  # List of "input_file", or Dict of sub_name -> "input_file"
    self.group_tasks = []  # tasks to perform! note: sub-names are allowed!
    self.group_info = {}  # extra info?
    self.group_joint = False  # join all of these into one dataset?
    # train
    self.group_sample_rate = SVConf().direct_update(val=1., which_idx="cidx", mode="none")  # outside-sample by rate
    self.group_sample_alpha = 0.  # inside-sample by len(inst)**alpha
    # eval (test/dev)
    self.group_eval_weight = 1.  # weight for final eval
    # ==
    # (static) io
    self.R = ReaderGetterConf()
    self.W = WriterGetterConf()
    # - paths (we further have default ones for "*_gold", "*_output" if not provided in extras)
    self.input_dir = "./"  # if needed
    self.input_file = ""
    self.gold_file = ""  # by default, the same as input_file
    self.output_dir = "./"  # if needed
    self.output_file = ""
    self.output_prefix = "_zout"  # default output prefix; the full one will be "{this}.{wset}.json"
    # - special
    self.preprocessors = []  # need to slightly modify the data?
    self.presample = 1.0  # (>1=N, <1=rate) randomly sample this much at the very beginning, as pre-processing for convenience!
    self.presample_shuffle = False  # whether to shuffle in presample?
    self.presample_reverse = False  # from back to front (for convenience)
    # ==
    # runtime
    self.convert_conf = ZIConverterConf()
    self.batch_conf = ZIBatcherConf()

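# The presample value above is overloaded the same way keep_rate is elsewhere in
# these scripts: above 1 it is an absolute instance count, below 1 a sampling
# rate. A minimal sketch of that interpretation (presample_n is a hypothetical
# helper name):
def presample_n(num_insts: int, presample: float) -> int:
    return int(presample) if presample > 1. else int(num_insts * presample)

assert presample_n(1000, 100) == 100 and presample_n(1000, 0.1) == 100
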
def __init__(self):
    self.R = ReaderGetterConf()
    self.result_key = ""  # by default, use the input file name!
    self.result_center = "res.json"  # where to store the key
    # for query
    self.key_re_pattern = ".*"

def __init__(self):
    self.src_input = ReaderGetterConf()  # src (dep + srl)
    self.trg_input = ReaderGetterConf()  # trg (dep)
    self.output = WriterGetterConf()
    # --
    self.method = "path"  # span/path

def __init__(self):
    self.gold = ReaderGetterConf()
    self.pred = ReaderGetterConf()
    self.result_file = ""  # file to output details to
    self.econf: EvalConf = None
    self.print_details = True  # whether to print get_detailed_str()

def __init__(self):
    self.train = ReaderGetterConf()
    self.econf = RuleTargetExtractorConf()
    self.save_name = "rule.model.json"

def main():
    insts = list(ReaderGetterConf().get_reader())  # read from stdin
    for sent in yield_sents(insts):
        sorted_evts = sorted(sent.events, key=lambda x: x.mention.get_span())
        for evt in sorted_evts:
            print(" ".join(evt.info["slab"]))

def __init__(self):
    self.gold = ReaderGetterConf()
    self.pred = ReaderGetterConf()
    # --
    self.output = WriterGetterConf()

def __init__(self):
    self.R = ReaderGetterConf()
    self.W = WriterGetterConf()
    self.direction = "short"  # short: ARG->A, long: A->ARG

def __init__(self):
    self.gold = ReaderGetterConf()
    self.pred = ReaderGetterConf()

def __init__(self):
    self.input = ReaderGetterConf()