def convert_cql2hfr(**kwargs):
    """Convert a CQL rules file into human-friendly rules (HFR) on disk.

    Reads the CQL file at ``kwargs['input']``, converts its content with
    ``cqlr2hfr()`` and writes the result as ``<stem>.tsv`` inside the
    dialect pack's ``hfr_rules`` directory.

    Expected kwargs:
        input: path to the CQL rules file (required).
        dp: dialect pack name; falls back to DEFAULT_DPACK when
            missing or falsy.
    """
    cql_path = Path(kwargs['input'])
    # .get() avoids a KeyError when 'dp' was not passed at all;
    # a falsy value ('' / None) still falls back to the default pack.
    dialect_pack_name = kwargs.get('dp') or DEFAULT_DPACK
    hfr_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules"
    # parents=True so a dialect pack without an existing rules tree
    # does not crash the conversion.
    hfr_dir.mkdir(parents=True, exist_ok=True)
    hfr_file_path = hfr_dir / (cql_path.stem + ".tsv")
    cql_rules = cql_path.read_text(encoding='utf-8')
    hfr = cqlr2hfr(cql_rules)
    hfr_file_path.write_text(hfr, encoding='utf-8')
def rdr(infile, outdir=None, keep="model", type="cql"):
    """Train an RDR model on a POS-tagged file and derive adjustment rules.

    :param infile: file to process. should be a POS tagged file
    :param outdir: optional. should be the output directory
    :param keep: all RDR files if "all", the .RDR and .DICT files if "model", none if None
    :param type: "cql" keeps corpus-query rules; any other value converts
        them to human-friendly rules via cqlr2hfr()
    :return: RDR's log, or None when training produced no log
    """
    infile = Path(infile).resolve()
    # run the RDR training
    log = r(str(infile), mode="train", verbose=True)
    # translate the generated .RDR file to adjustment tsv
    # (infile.parent is already a Path, no extra Path() wrap needed)
    rdr_rules = (infile.parent / (infile.name + ".RDR")).read_text(encoding="utf-8-sig")
    rules = rdr_2_replace_matcher(rdr_rules)
    # BUG FIX: was `type is not "cql"` — identity comparison against a
    # string literal relies on CPython interning and emits a SyntaxWarning;
    # value inequality is what extract_seg_rule() uses as well.
    if type != "cql":
        rules = cqlr2hfr(rules)
    # remove RDR files and copy them if needed
    rdr_postprocess(rules, infile, outdir=outdir, keep=keep)
    return log if log else None
def extract_seg_rule(corpus_file_path, dialect_pack_name=DEFAULT_DPACK, type='cql', no_epochs=3):
    """Extracts segmentation rules.

    Args:
        corpus_file_path (pathlib): input file's path
        dialect_pack_name (string, optional): name of dialect pack for which rules are. Defaults to DEFAULT_DPACK.
        type (str, optional): type of rules can be human friendly rule(hfr) or corpus query rule. Defaults to 'cql'.
        no_epochs (int, optional): Number of times word filters need to perform. Defaults to 3.

    Returns:
        str: segmentation rules
    """
    new_word_list = []
    new_remove_word_list = []
    # NOTE(review): drops the last two characters of the stem — presumably a
    # numeric/type suffix in the corpus naming scheme; confirm against callers.
    corpus_file_name = corpus_file_path.stem[:-2]
    number_of_segmentation = 1
    human_data = corpus_file_path.read_text(encoding='utf-8-sig')
    human_data = post_process_human_data(human_data)
    # Iteratively segment, detect errors, and push corrective word lists into
    # the dialect pack's adjustment files until no new corrections appear or
    # the epoch budget is spent.
    while True:
        bilou_tag_data = get_bilou_tag_data(human_data)
        print(
            f'[INFO]: SEGMENTATION PHASE {number_of_segmentation} COMPLETED..')
        new_word_list, new_remove_word_list = filter_seg_errors(
            bilou_tag_data, human_data)
        print('[INFO]: FILTER SEGMENTATION ERROR COMPLETED..')
        # Record newly found words / removals in the adjustment files so the
        # next tagging pass takes them into account.
        if new_word_list:
            new_word_list = add_word_2_adjustment(new_word_list,
                                                  corpus_file_name,
                                                  dialect_pack_name,
                                                  type='words')
        if new_remove_word_list:
            new_remove_word_list = add_word_2_adjustment(new_remove_word_list,
                                                         corpus_file_name,
                                                         dialect_pack_name,
                                                         type='remove')
        # Re-tag and re-filter with the updated adjustments, then keep only
        # the corrections that were NOT already applied in this epoch.
        bilou_tag_data = get_bilou_tag_data(human_data)
        word_list, remove_word_list = filter_seg_errors(
            bilou_tag_data, human_data)
        new_remove_word_list = [
            remove_word for remove_word in remove_word_list
            if remove_word not in new_remove_word_list
        ]
        new_word_list = [
            word for word in word_list if word not in new_word_list
        ]
        number_of_segmentation += 1
        # Stop when the pass converged (no fresh corrections) or the epoch
        # limit is reached.
        if (not new_word_list and not new_remove_word_list
                ) or number_of_segmentation > no_epochs:
            break
    # Persist the final tagged data and the BILOU rules derived from it.
    bilou_tag_data_path = (corpus_file_path.parent /
                           f'{corpus_file_name}_tr_data.txt')
    bilou_tag_data_path.write_text(bilou_tag_data, encoding='utf-8')
    bilou_rules = get_bilou_rules(bilou_tag_data_path)
    (corpus_file_path.parent /
     f'{corpus_file_name}_bilou_rules.txt').write_text("\n".join(bilou_rules),
                                                       encoding='utf-8')
    new_cql_rules = []
    # .INIT file is produced by the rule-extraction step alongside the
    # training data; needed to translate BILOU rules into CQL rules.
    bilou_tag_init = (corpus_file_path.parent /
                      f'{bilou_tag_data_path.name}.INIT').read_text(
                          encoding='utf-8-sig')
    new_cql_rules = convert_bilou_rules(bilou_rules, bilou_tag_init,
                                        human_data)
    new_cql_rules = "\n".join(new_cql_rules)
    # Clean up intermediate RDR artifacts.
    rdr_postprocess(bilou_tag_data_path)
    if type != 'cql':
        new_cql_rules = cqlr2hfr(new_cql_rules)
    return new_cql_rules
def test_cql2hfr(cqlr, hfr):
    """Assert that converting `cqlr` with cqlr2hfr() yields exactly `hfr`."""
    converted = cqlr2hfr(cqlr)
    print(converted)
    assert converted == hfr
    print("Test pass..")