Example #1
def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to where "parentheses.tsv" and 
            "refine_list.tsv" are.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_cleaned.txt"
    if not rule_path.endswith("/"):
        rule_path += "/"

    # Load rule files
    file_p = rule_path + "parentheses.tsv"
    file_r = rule_path + "refine_list.tsv"
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus (skip the header line)
    raw_data = readlines(corpus, skip=True)

    # Threading
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all result to file
    write_to_file(output, result)
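Every example on this page revolves around load_rules. As a point of reference for the corpus-cleaning examples, here is a minimal sketch of what a TSV-based loader could look like; it is a hypothetical reconstruction from the call sites above, not the project's actual utils implementation.

# Hypothetical sketch of a TSV rule loader; inferred from the call
# sites on this page, the real implementation may differ.
import csv

def load_rules(path):
    rules = []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="\t"):
            # Skip blank lines and comment rows
            if not row or row[0].startswith("#"):
                continue
            rules.append(tuple(row))
    return rules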
Example #2
def extract_vocabularies(corpus, rule, output=None, thread=None):
    """
    Extract vocabularies from the corpus, additional rules to achieve
    purer vocabularies can be defined in src/refine_rules/voc_cleanup.tsv

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    if output is None:
        output = corpus[:-4] + "_vocabulary_list.json"

    # Load rules
    rules = load_rules(rule)

    # Acquire the corpus
    raw_data = readlines(corpus, limit=None)

    # Threading (TO-BE-IMPLEMENTED)
    # param = (rules, "SPLIT_WORDS")
    # generic_threading(thread, raw_data, punctuation_cleanup, param)
    result = punctuation_cleanup(0, raw_data, rules, mode='SPLIT_WORDS')

    # Counting occurrences
    print("Counting occurrences...")
    voc_list = Counter(result)

    # Save vocabulary to file
    write_to_file(output, voc_list)
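The Counter used above is the standard-library collections.Counter, which maps each distinct token to its count. A quick self-contained illustration:

# Counting token occurrences with the standard library, as the
# function above does with its token list.
from collections import Counter

tokens = ["cat", "dog", "cat", "bird", "cat"]
voc_list = Counter(tokens)
print(voc_list.most_common(2))  # [('cat', 3), ('dog', 1)]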
Example #3
def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Preliminary cleanup the corpus to make it easier for further
    processing methods. This method can be used to correct the
    missing spaces after punctuations any other customized rules
    can be added to the rule file, see punctuation_cleanup in utils
    for the formatting of the rules.

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of thread to process.
        output(str): Path to the output file.
    """
    # output name
    if output is None:
        output = corpus[:-4] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)
    # Load data
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Threading
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)
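generic_threading appears in Examples #1 and #3 (and in the commented-out call of Example #2) but is not shown on this page. A plausible sketch, inferred only from those call sites (split the data into chunks, run the worker on each chunk with its index, flatten the per-chunk results), might look like the following; the project's real helper may differ in defaults and details.

# Hypothetical sketch of generic_threading, inferred from the call
# sites above; chunk sizing and the worker-count default are assumptions.
from math import ceil
from multiprocessing import Pool

def generic_threading(n_threads, data, func, param):
    n_threads = n_threads or 4  # assumption: fall back to 4 workers
    size = ceil(len(data) / n_threads)
    chunks = [data[i:i + size] for i in range(0, len(data), size)]
    with Pool(len(chunks)) as pool:
        parts = pool.starmap(
            func,
            [(idx, chunk) + tuple(param) for idx, chunk in enumerate(chunks)])
    # Flatten the per-chunk results into a single list
    return [item for part in parts for item in part]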
Example #4
def test_exclude_word(engine):
    # The Rust engine doesn't work here, because RE2 doesn't support backtracking operators
    parser = engine(load_rules("examples/excluding-word.rita"))

    t1 = "weather is awesome"
    t2 = "weather is cold"

    r1 = parser(t1)
    r2 = parser(t2)

    assert r1[0] == ("weather is awesome", "GOOD_WEATHER")
    assert len(r2) == 0
Example #5
def test_dash_case(engine):
    parser = engine(load_rules("examples/dress-match.rita"))
    text = """
    Fitted, knee-length dress in soft velour
    """

    entities = set(parser(text))
    print(entities)
    expected = set([
        ("Fitted, knee-length dress", "DRESS_TYPE"),
        ("soft velour", "DRESS_FABRIC"),
    ])

    assert entities.issuperset(expected)
Example #6
def test_color_car(engine):
    text = "John Silver was driving a red car. It was BMW X6 Mclass. John likes driving it very much."
    parser = engine(load_rules("examples/color-car.rita"))
    entities = set(parser(text))
    print(entities)

    expected = set([
        ("John Silver", "PERSON"),  # Normal NER
        ("red car", "CAR_COLOR"),  # Our first rule
        ("BMW X6 Mclass", "CAR_MODEL"),  # Our second rule
        ("John likes driving", "LIKED_ACTION")  # Our third rule
    ])

    assert entities.issuperset(expected)
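For context, RITA rule files pair token patterns with labels. The sketch below shows the general shape such rules take in the RITA DSL; it is illustrative only, and the examples/color-car.rita file actually shipped with the project may differ.

colors = {"red", "green", "blue", "white", "black"}

{IN_LIST(colors), WORD("car")} -> MARK("CAR_COLOR")
{ENTITY("PERSON"), LEMMA("like"), WORD("driving")} -> MARK("LIKED_ACTION")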
Example #7
def test_benchmark(benchmark, engine, bench_text):
    """
    These tests will only run if parameters:
    `--benchmark-enable` or
    `--benchmark-only`
    are added
    """
    parser = engine(load_rules("examples/cheap-phones.rita"))

    def parse_rows(parser, rows):
        for r in rows:
            parser(r)

    benchmark.pedantic(parse_rows,
                       args=(parser, bench_text),
                       iterations=3,
                       rounds=3)
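Since pytest-benchmark skips benchmarks unless explicitly enabled, the test above is driven from the command line with one of the flags named in its docstring:

pytest --benchmark-enable    # run benchmarks alongside the other tests
pytest --benchmark-only      # run only the benchmark tests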
Example #8
def test_escape_string(engine):
    # If it compiles, that's good enough
    engine(load_rules("examples/match-with-escaped-string.rita"))
Example #9
    def get_rules(self):
        return load_rules(self.rulesname)
Example #10
    def __init__(self, config, iface_int, iface_ext):
        self.iface_int = iface_int
        self.iface_ext = iface_ext

        self.geodb = load_geodb()
        self.rules = load_rules(config['rule'], self.geodb)
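A minimal usage sketch for this constructor follows; the Firewall class name, the interface names, and the rules.conf path are all assumptions inferred from the signature above, not values taken from the project.

# Hypothetical usage; every concrete value here is an assumption.
config = {'rule': 'rules.conf'}
fw = Firewall(config, iface_int='eth0', iface_ext='eth1')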