Example #1
def contract_graph(graph_file: str) -> None:
    '''Remove the additional information (currently rule frequencies)
       that was added for filtering.'''
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Contracting the graph...')
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            write_line(graph_tmp_fp, (w1, w2, rule))
    rename_file(graph_file + '.tmp', graph_file)
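
The I/O helpers used throughout these examples (open_to_write, read_tsv_file, write_line, rename_file) belong to the project's own utility module and are not shown here; the snippets also assume that import logging, import tqdm and the project's shared configuration object are available. A minimal stand-in for the helpers, inferred from how the snippets use them rather than taken from the original library, could look like this:

import os
from typing import Iterator, Tuple

def open_to_write(path: str):
    '''Open a text file for writing (stand-in for the project helper).'''
    return open(path, 'w', encoding='utf-8')

def read_tsv_file(path: str, show_progressbar: bool = False) -> Iterator[Tuple[str, ...]]:
    '''Yield each line of a tab-separated file as a tuple of column values
       (the progress bar is omitted in this sketch).'''
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            yield tuple(line.rstrip('\n').split('\t'))

def write_line(fp, columns) -> None:
    '''Write one tab-separated row.'''
    fp.write('\t'.join(str(col) for col in columns) + '\n')

def rename_file(src: str, dst: str) -> None:
    '''Replace dst with src (used to swap the temporary file in).'''
    os.replace(src, dst)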
Example #2
def filter_min_rule_freq(graph_file: str) -> None:
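    '''Drop every edge whose rule links fewer than min_rule_freq word pairs.'''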
    logging.getLogger('main').info('filter_min_rule_freq')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (rule,
             freq), wordpairs in read_tsv_file_by_key(graph_file, (3, 4),
                                                      show_progressbar=True):
            if len(wordpairs) >= min_rule_freq:
                for word_1, word_2 in wordpairs:
                    write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
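
read_tsv_file_by_key is also project-internal. From its use here and in the other examples it appears to group consecutive rows that share the given 1-based key column(s), yielding the key values together with the remaining columns of each row. A sketch under that assumption:

import itertools
from typing import Iterator, Tuple, Union

def read_tsv_file_by_key(path: str, key: Union[int, Tuple[int, ...]],
                         show_progressbar: bool = False) -> Iterator:
    '''Group consecutive rows of a TSV file by the given 1-based key column(s)
       and yield (key, rows_without_key_columns); assumes the file is already
       sorted on the key columns.'''
    key_cols = (key,) if isinstance(key, int) else tuple(key)
    idx = [k - 1 for k in key_cols]

    def key_of(row):
        return tuple(row[i] for i in idx)

    with open(path, encoding='utf-8') as fp:
        rows = (tuple(line.rstrip('\n').split('\t')) for line in fp)
        for key_values, group in itertools.groupby(rows, key=key_of):
            rest = [tuple(col for i, col in enumerate(row) if i not in idx)
                    for row in group]
            yield (key_values[0] if len(idx) == 1 else key_values), rest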
Example #3
def filter_max_edges_per_wordpair(graph_file: str) -> None:
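    '''Keep at most max_edges_per_wordpair edges for each (word_1, word_2) pair.'''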
    logging.getLogger('main').info('filter_max_edges_per_wordpair')
    sort_file(graph_file, stable=True, key=(1, 2))
    max_edges_per_wordpair = \
        shared.config['preprocess'].getint('max_edges_per_wordpair')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (word_1,
             word_2), edges in read_tsv_file_by_key(graph_file, (1, 2),
                                                    show_progressbar=True):
            for rule, freq in edges[:max_edges_per_wordpair]:
                write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
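    # re-sort: by rule first, then stably by descending frequency, so the
    # file ends up ordered by frequency with ties broken by rule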
    sort_file(graph_file, key=3)
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    update_file_size(graph_file)
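
sort_file and update_file_size are project helpers as well; update_file_size presumably just refreshes a cached size or line count for the rewritten file. Judging by the flags used here (stable, numeric, reverse, 1-based key columns), sort_file behaves like a thin wrapper around the external sort command. A hedged sketch under that assumption:

import subprocess

def sort_file(path: str, key, stable: bool = False,
              numeric: bool = False, reverse: bool = False) -> None:
    '''Sort a TSV file in place on the given 1-based key column(s) by
       delegating to GNU sort (a stand-in, not the original helper).'''
    cols = (key,) if isinstance(key, int) else tuple(key)
    flags = ('n' if numeric else '') + ('r' if reverse else '')
    cmd = ['sort', '-t', '\t']
    if stable:
        cmd.append('-s')
    for col in cols:
        cmd.append('-k{0},{0}{1}'.format(col, flags))
    cmd += ['-o', path, path]
    subprocess.run(cmd, check=True)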
Example #4
def expand_graph(graph_file: str) -> None:
    '''Annotate graph with additional information needed for filtering:
       currently rule frequencies.'''
    min_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Expanding the graph for filtering...')
        for rule, wordpairs in read_tsv_file_by_key(graph_file,
                                                    3,
                                                    show_progressbar=True):
            freq = len(wordpairs)
            if freq >= min_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_tmp_fp, (w1, w2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
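
To make the column layout concrete, here is what the expansion step does to a few hypothetical rows (the rule strings are placeholders, not the project's actual rule notation), assuming min_rule_freq = 2:

# Input rows (word_1, word_2, rule):
#   walk    walked    R1
#   talk    talked    R1
#   go      went      R2
#
# expand_graph counts the word pairs per rule, drops rules below the
# threshold, and appends the count as a fourth column:
#   walk    walked    R1    2
#   talk    talked    R1    2
#
# contract_graph (Example #1) later strips the frequency column again.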
Example #5
def filter_max_num_rules(graph_file: str) -> None:
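    '''Keep only the max_num_rules most frequent rules, additionally
       enforcing min_rule_freq.'''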
    logging.getLogger('main').info('filter_max_num_rules')
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    max_num_rules = shared.config['preprocess'].getint('max_num_rules')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    progressbar = tqdm.tqdm(total=max_num_rules)
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        num_rules = 0
        for key, wordpairs in read_tsv_file_by_key(graph_file, (3, 4)):
            rule, freq = key
            num_rules += 1
            progressbar.update()
            if int(freq) >= min_rule_freq:
                for wordpair in wordpairs:
                    w1, w2 = wordpair
                    write_line(graph_fil_fp, (w1, w2, rule, freq))
            if num_rules >= max_num_rules:
                break
    progressbar.close()
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
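
The five functions are clearly meant to be chained during preprocessing: expand the graph with rule frequencies, apply the filters, then contract it again. A hypothetical driver, with an ordering inferred from the column layouts the functions expect rather than taken from the original project:

def filter_graph(graph_file: str) -> None:
    '''Hypothetical driver routine; the original project may chain the
       filters differently or make individual steps configurable.'''
    expand_graph(graph_file)                    # add rule frequencies (column 4)
    filter_max_num_rules(graph_file)            # keep only the most frequent rules
    filter_max_edges_per_wordpair(graph_file)   # cap the edges per word pair
    contract_graph(graph_file)                  # drop the frequency column again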