def contract_graph(graph_file: str) -> None:
    '''Remove the additional information added for filtering
       (currently: the rule frequency column).'''
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Contracting the graph...')
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            write_line(graph_tmp_fp, (w1, w2, rule))
    rename_file(graph_file + '.tmp', graph_file)
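# Illustrative effect of contract_graph() on made-up rows: a line like
#     walk <TAB> walked <TAB> rule_17 <TAB> 412
# is rewritten as
#     walk <TAB> walked <TAB> rule_17
# i.e. the frequency column added by expand_graph() below is dropped and
# the file returns to the plain 3-column (word_1, word_2, rule) format.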
def filter_min_rule_freq(graph_file: str) -> None:
    '''Drop all edges whose rule occurs in fewer than min_rule_freq
       word pairs.'''
    logging.getLogger('main').info('filter_min_rule_freq')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (rule, freq), wordpairs in read_tsv_file_by_key(
                graph_file, (3, 4), show_progressbar=True):
            # re-check the actual number of word pairs for this rule: the
            # stored frequency may be stale if edges have been pruned
            # since expand_graph()
            if len(wordpairs) >= min_rule_freq:
                for word_1, word_2 in wordpairs:
                    write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
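# Illustrative effect of filter_min_rule_freq() with min_rule_freq = 2
# (made-up data): the first two rows survive because two word pairs
# share rule_17; the last row is dropped because its rule occurs in
# only one pair.
#
#     walk <TAB> walked <TAB> rule_17 <TAB> 2      kept
#     talk <TAB> talked <TAB> rule_17 <TAB> 2      kept
#     go   <TAB> went   <TAB> rule_3  <TAB> 1      dropped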
def filter_max_edges_per_wordpair(graph_file: str) -> None:
    '''For every word pair, keep at most max_edges_per_wordpair
       candidate rules.'''
    logging.getLogger('main').info('filter_max_edges_per_wordpair')
    # group the edges by word pair; the stable sort preserves the
    # frequency-descending order of rules within each group
    sort_file(graph_file, stable=True, key=(1, 2))
    max_edges_per_wordpair = \
        shared.config['preprocess'].getint('max_edges_per_wordpair')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (word_1, word_2), edges in read_tsv_file_by_key(
                graph_file, (1, 2), show_progressbar=True):
            for rule, freq in edges[:max_edges_per_wordpair]:
                write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    # restore the previous ordering: by rule, then by frequency (descending)
    sort_file(graph_file, key=3)
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    update_file_size(graph_file)
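# A dependency-free sketch of the same per-pair truncation, assuming the
# rows are already grouped by word pair and ordered by rule frequency
# (descending) within each group, as the sort above guarantees. The
# helper name and the use of itertools are illustrative, not part of
# the original module.

def _truncate_edges_sketch(rows, max_edges_per_wordpair):
    '''rows: iterable of (word_1, word_2, rule, freq) tuples.'''
    from itertools import groupby, islice
    for _, group in groupby(rows, key=lambda r: (r[0], r[1])):
        # keep only the first (i.e. most frequent) rules of each group
        yield from islice(group, max_edges_per_wordpair)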
def expand_graph(graph_file: str) -> None:
    '''Annotate graph with additional information needed for filtering:
       currently rule frequencies.'''
    min_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        logging.getLogger('main').info('Expanding the graph for filtering...')
        for rule, wordpairs in read_tsv_file_by_key(graph_file, 3,
                                                    show_progressbar=True):
            freq = len(wordpairs)
            if freq >= min_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_tmp_fp, (w1, w2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
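# For readers without the project's file utilities, the following is a
# dependency-free sketch of the same annotation step, assuming a plain
# 3-column TSV (word_1, word_2, rule) as input. The helper name and the
# use of csv/Counter are illustrative, not part of the original code.

def _expand_graph_sketch(in_path: str, out_path: str, min_freq: int) -> None:
    import csv
    from collections import Counter
    with open(in_path, newline='', encoding='utf-8') as fp:
        rows = list(csv.reader(fp, delimiter='\t'))
    freq = Counter(row[2] for row in rows)   # rule -> number of word pairs
    with open(out_path, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp, delimiter='\t')
        for w1, w2, rule in rows:
            if freq[rule] >= min_freq:
                writer.writerow((w1, w2, rule, freq[rule]))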
def filter_max_num_rules(graph_file: str) -> None:
    '''Keep only the max_num_rules most frequent rules
       (and their edges).'''
    logging.getLogger('main').info('filter_max_num_rules')
    # sort the edges by rule frequency, most frequent rules first
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    max_num_rules = shared.config['preprocess'].getint('max_num_rules')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    progressbar = tqdm.tqdm(total=max_num_rules)
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        num_rules = 0
        for (rule, freq), wordpairs in read_tsv_file_by_key(graph_file,
                                                            (3, 4)):
            num_rules += 1
            progressbar.update()
            if int(freq) >= min_rule_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_fil_fp, (w1, w2, rule, freq))
            if num_rules >= max_num_rules:
                break
    progressbar.close()
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
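# A minimal sketch of how these functions plausibly fit together; the
# actual call order lives elsewhere in the module, so the sequence
# below is an assumption, not the original pipeline definition.

def _run_filters_sketch(graph_file: str) -> None:
    '''Hypothetical driver, for illustration only.'''
    expand_graph(graph_file)                   # add the frequency column
    filter_max_num_rules(graph_file)           # keep the top rules
    filter_max_edges_per_wordpair(graph_file)  # prune rules per word pair
    filter_min_rule_freq(graph_file)           # re-check pair counts
    contract_graph(graph_file)                 # drop the frequency column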