Пример #1
0
def convert_cql2hfr(**kwargs):
    """Convert a CQL rule file into a human friendly rule (HFR) ``.tsv`` file.

    The converted rules are written to
    ``DIALECT_PACK_DIR / <dialect pack> / "hfr_rules" / <input stem>.tsv``.

    Args:
        **kwargs: keyword options.
            ``input`` (required): path of the CQL rule file to read.
            ``dp`` (optional): dialect pack name; when it is missing or
            falsy, ``DEFAULT_DPACK`` is used.

    Raises:
        KeyError: if ``input`` is not supplied.
    """
    cql_path = Path(kwargs["input"])
    # .get() so an omitted "dp" falls back to the default exactly like an
    # empty/None one does, instead of raising KeyError.
    dialect_pack_name = kwargs.get("dp") or DEFAULT_DPACK

    # Read and convert first: a bad input path then fails before any output
    # directory has been created.
    cql_rules = cql_path.read_text(encoding="utf-8")
    hfr = cqlr2hfr(cql_rules)

    hfr_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules"
    # parents=True: the dialect pack directory itself may not exist yet.
    hfr_dir.mkdir(parents=True, exist_ok=True)
    hfr_file_path = hfr_dir / (cql_path.stem + ".tsv")
    hfr_file_path.write_text(hfr, encoding="utf-8")
Пример #2
0
def rdr(infile, outdir=None, keep="model", type="cql"):
    """Run RDR training on a file and convert the resulting rules.

    :param infile: file to process. should be a POS tagged file
    :param outdir: optional. should be the output directory
    :param keep: all RDR files if "all", the .RDR and .DICT files if "model", none if None
    :param type: rule format. "cql" leaves the rules as returned by
        ``rdr_2_replace_matcher``; any other value passes them through ``cqlr2hfr``
    :return: RDR's log, or None when the log is empty/falsy
    """
    infile = Path(infile).resolve()

    # run the RDR training
    log = r(str(infile), mode="train", verbose=True)

    # translate to adjustment tsv; the rules are read from "<infile name>.RDR"
    # located next to the input file
    rdr_path = infile.parent / (infile.name + ".RDR")
    rdr_rules = rdr_path.read_text(encoding="utf-8-sig")
    rules = rdr_2_replace_matcher(rdr_rules)
    # Compare by value: `is not` tests object identity, which is not
    # guaranteed to hold for two equal strings.
    if type != "cql":
        rules = cqlr2hfr(rules)
    # remove RDR files and copy them if needed
    rdr_postprocess(rules, infile, outdir=outdir, keep=keep)

    return log or None
Пример #3
0
def extract_seg_rule(corpus_file_path,
                     dialect_pack_name=DEFAULT_DPACK,
                     type='cql',
                     no_epochs=3):
    """Derive segmentation rules from a human-segmented corpus file.

    The corpus text is BILOU-tagged and checked with ``filter_seg_errors`` in
    repeated passes; the word lists reported by each pass are handed to
    ``add_word_2_adjustment`` (as ``'words'`` / ``'remove'`` entries) before
    tagging and filtering once more. The last tagged data is written next to
    the corpus, rules are extracted from that file and converted with
    ``convert_bilou_rules``.

    Args:
        corpus_file_path (pathlib.Path): path of the input corpus. The last
            two characters of its stem are dropped to build the base name of
            the generated files.
        dialect_pack_name (str, optional): dialect pack name forwarded to
            ``add_word_2_adjustment``. Defaults to DEFAULT_DPACK.
        type (str, optional): 'cql' returns the converted rules as they are;
            any other value passes them through ``cqlr2hfr``.
            Defaults to 'cql'.
        no_epochs (int, optional): upper bound on the number of filtering
            passes; one pass always runs. Defaults to 3.

    Returns:
        str: the rules joined with newlines (converted by ``cqlr2hfr`` when
        ``type`` is not 'cql').
    """
    corpus_file_name = corpus_file_path.stem[:-2]
    human_data = post_process_human_data(
        corpus_file_path.read_text(encoding='utf-8-sig'))

    number_of_segmentation = 1
    while True:
        tagged = get_bilou_tag_data(human_data)
        print(
            f'[INFO]: SEGMENTATION PHASE {number_of_segmentation} COMPLETED..')
        handled_words, handled_removals = filter_seg_errors(
            tagged, human_data)
        print('[INFO]: FILTER SEGMENTATION ERROR COMPLETED..')

        # Only non-empty lists are forwarded; each call's return value
        # replaces the list it was given.
        if handled_words:
            handled_words = add_word_2_adjustment(handled_words,
                                                  corpus_file_name,
                                                  dialect_pack_name,
                                                  type='words')
        if handled_removals:
            handled_removals = add_word_2_adjustment(handled_removals,
                                                     corpus_file_name,
                                                     dialect_pack_name,
                                                     type='remove')

        # Tag and filter a second time after the adjustment calls, then keep
        # only the entries that were not already in the handled lists.
        tagged = get_bilou_tag_data(human_data)
        found_words, found_removals = filter_seg_errors(tagged, human_data)
        pending_removals = [
            entry for entry in found_removals
            if entry not in handled_removals
        ]
        pending_words = [
            entry for entry in found_words if entry not in handled_words
        ]

        number_of_segmentation += 1
        # Stop when nothing new turned up ...
        if not (pending_words or pending_removals):
            break
        # ... or when the allowed number of passes is used up.
        if number_of_segmentation > no_epochs:
            break

    corpus_dir = corpus_file_path.parent

    # Persist the tagged data of the final pass and extract rules from it.
    bilou_tag_data_path = corpus_dir / f'{corpus_file_name}_tr_data.txt'
    bilou_tag_data_path.write_text(tagged, encoding='utf-8')
    extracted_rules = get_bilou_rules(bilou_tag_data_path)

    rules_dump_path = corpus_dir / f'{corpus_file_name}_bilou_rules.txt'
    rules_dump_path.write_text("\n".join(extracted_rules), encoding='utf-8')

    # The ".INIT" file sits beside the tagged data file and is expected to
    # exist once get_bilou_rules has run.
    init_path = corpus_dir / f'{bilou_tag_data_path.name}.INIT'
    init_tags = init_path.read_text(encoding='utf-8-sig')

    rule_text = "\n".join(
        convert_bilou_rules(extracted_rules, init_tags, human_data))
    rdr_postprocess(bilou_tag_data_path)

    if type != 'cql':
        return cqlr2hfr(rule_text)
    return rule_text
Пример #4
0
def test_cql2hfr(cqlr, hfr):
    """Check that ``cqlr2hfr`` converts ``cqlr`` into exactly ``hfr``.

    Both arguments are supplied by the caller (presumably pytest fixtures;
    confirm against the test module).
    """
    converted = cqlr2hfr(cqlr)
    # Print the conversion result before comparing it to the expectation.
    print(converted)
    assert converted == hfr
    print("Test pass..")