def save(self, filename: str) -> None:
    with open_to_write(filename) as fp:
        for sym, prob in self.probs.items():
            # a key is either a single symbol or a pair of symbols
            line = (sym[0], sym[1], prob) if isinstance(sym, tuple) \
                   else (sym, prob)
            write_line(fp, line)
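# A sketch of the file this produces (hypothetical symbols and
# probabilities): plain keys yield two-column rows, pair keys yield
# three-column rows:
#
#   a           0.12
#   a   b       0.03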
def contract_graph(graph_file: str) -> None:
    '''Remove the additional information added for filtering
       (currently the rule-frequency column).'''
    logging.getLogger('main').info('Contracting the graph...')
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        for w1, w2, rule, freq in read_tsv_file(graph_file,
                                                show_progressbar=True):
            write_line(graph_tmp_fp, (w1, w2, rule))
    rename_file(graph_file + '.tmp', graph_file)
def save_wordpair_stats(self, filename):
    # collect the statistics computed over unordered word pairs,
    # sorted by name for a stable column order
    stats, stat_names = [], []
    for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
        if isinstance(stat, UnorderedWordPairStatistic):
            stat_names.append(stat_name)
            stats.append(stat)
    with open_to_write(filename) as fp:
        write_line(fp, ('word_1', 'word_2') + tuple(stat_names))
        for key in self.unordered_word_pair_index:
            write_line(fp, key + tuple(stat.value(key) for stat in stats))
def filter_min_rule_freq(graph_file: str) -> None:
    logging.getLogger('main').info('filter_min_rule_freq')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        # drop all edges belonging to rules that link fewer word pairs
        # than min_rule_freq
        for (rule, freq), wordpairs in read_tsv_file_by_key(
                graph_file, (3, 4), show_progressbar=True):
            if len(wordpairs) >= min_rule_freq:
                for word_1, word_2 in wordpairs:
                    write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
def write_edge_tr_mat(self, filename):
    with open_to_write(filename) as fp:
        for e_id, edge in enumerate(self.full_graph.edge_set):
            # list the nonzero entries of the edge's tag transition matrix
            tag_probs = []
            edge_tr_mat = self.edge_tr_mat[e_id]
            for (t1_id, t2_id), val in edge_tr_mat.todok().items():
                tag_1 = self.tagset[t1_id]
                tag_2 = self.tagset[t2_id]
                tag_probs.append((''.join(tag_1), ''.join(tag_2), str(val)))
            write_line(fp, (str(edge),
                            ' '.join(t1 + ':' + t2 + ':' + prob
                                     for t1, t2, prob in tag_probs)))
def build_graph_from_training_edges(lexicon, training_file, graph_file):
    with open_to_write(graph_file) as fp:
        for word_1, word_2 in read_tsv_file(training_file, (str, str)):
            if word_1:
                try:
                    n1, n2 = lexicon[word_1], lexicon[word_2]
                    for rule in extract_all_rules(n1, n2):
                        write_line(fp, (str(n1), str(n2), str(rule)))
                except KeyError:
                    # warn about whichever word of the pair is missing
                    if word_1 not in lexicon:
                        logging.getLogger('main').warning(
                            '%s not in lexicon' % word_1)
                    if word_2 not in lexicon:
                        logging.getLogger('main').warning(
                            '%s not in lexicon' % word_2)
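# Usage sketch (hypothetical file names; assumes a Lexicon loaded as in
# run() below and a two-column TSV of training word pairs):
#
#   lexicon = Lexicon.load(shared.filenames['wordlist'])
#   build_graph_from_training_edges(lexicon, 'training-pairs.txt',
#                                   'graph-train.txt')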
def save_edge_stats(self, filename):
    stats, stat_names = [], []
    for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
        if isinstance(stat, EdgeStatistic):
            stat_names.append(stat_name)
            stats.append(stat)
    with open_to_write(filename) as fp:
        write_line(fp, ('word_1', 'word_2', 'rule') + tuple(stat_names))
        for idx, edge in enumerate(self.edge_set):
            write_line(fp, (str(edge.source), str(edge.target),
                            str(edge.rule)) +
                           tuple(stat.val[idx] for stat in stats))
def save_iter_stats(self, filename: str) -> None:
    stats, stat_names = [], []
    for stat_name, stat in sorted(self.stats.items(), key=itemgetter(0)):
        if isinstance(stat, IterationStatistic):
            stat_names.append(stat_name)
            stats.append(stat)
    with open_to_write(filename) as fp:
        write_line(fp, ('iter_num',) + tuple(stat_names))
        # statistics are recorded every iter_stat_interval iterations
        for iter_num in range(self.iter_stat_interval,
                              self.sampling_iter + 1,
                              self.iter_stat_interval):
            write_line(fp, (str(iter_num),) +
                           tuple(stat.value(iter_num) for stat in stats))
def filter_max_edges_per_wordpair(graph_file: str) -> None:
    logging.getLogger('main').info('filter_max_edges_per_wordpair')
    sort_file(graph_file, stable=True, key=(1, 2))
    max_edges_per_wordpair = \
        shared.config['preprocess'].getint('max_edges_per_wordpair')
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        for (word_1, word_2), edges in read_tsv_file_by_key(
                graph_file, (1, 2), show_progressbar=True):
            # keep at most max_edges_per_wordpair edges per word pair
            for rule, freq in edges[:max_edges_per_wordpair]:
                write_line(graph_fil_fp, (word_1, word_2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    # re-sort by rule, then stably by descending rule frequency
    sort_file(graph_file, key=3)
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    update_file_size(graph_file)
def expand_graph(graph_file: str) -> None:
    '''Annotate the graph with additional information needed for filtering:
       currently rule frequencies.'''
    min_freq = shared.config['preprocess'].getint('min_rule_freq')
    logging.getLogger('main').info('Expanding the graph for filtering...')
    with open_to_write(graph_file + '.tmp') as graph_tmp_fp:
        for rule, wordpairs in read_tsv_file_by_key(graph_file, 3,
                                                    show_progressbar=True):
            # a rule's frequency is the number of word pairs it links
            freq = len(wordpairs)
            if freq >= min_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_tmp_fp, (w1, w2, rule, freq))
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
def filter_max_num_rules(graph_file: str) -> None:
    logging.getLogger('main').info('filter_max_num_rules')
    # most frequent rules first, so that the cut-off keeps the top ones
    sort_file(graph_file, stable=True, numeric=True, reverse=True, key=4)
    max_num_rules = shared.config['preprocess'].getint('max_num_rules')
    min_rule_freq = shared.config['preprocess'].getint('min_rule_freq')
    progressbar = tqdm.tqdm(total=max_num_rules)
    with open_to_write(graph_file + '.tmp') as graph_fil_fp:
        num_rules = 0
        for (rule, freq), wordpairs in read_tsv_file_by_key(graph_file,
                                                            (3, 4)):
            num_rules += 1
            progressbar.update()
            if int(freq) >= min_rule_freq:
                for w1, w2 in wordpairs:
                    write_line(graph_fil_fp, (w1, w2, rule, freq))
            if num_rules >= max_num_rules:
                break
    progressbar.close()
    rename_file(graph_file + '.tmp', graph_file)
    update_file_size(graph_file)
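# How the filtering steps fit together -- a plausible composition, not a
# pipeline confirmed by this file: expand_graph() adds the rule-frequency
# column that the filters key on, and contract_graph() strips it again:
#
#   expand_graph('graph.txt')
#   filter_min_rule_freq('graph.txt')
#   filter_max_num_rules('graph.txt')
#   filter_max_edges_per_wordpair('graph.txt')
#   contract_graph('graph.txt')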
def save(self, filename: str) -> None:
    with open_to_write(filename) as fp:
        for sym, prob in self.probs.items():
            write_line(fp, (sym, prob))
def write_leaf_prob(self, filename):
    with open_to_write(filename) as fp:
        for w_id, entry in enumerate(self.lexicon):
            tag_probs = [''.join(tag) + ':' + str(self.leaf_prob[w_id, t_id])
                         for t_id, tag in enumerate(self.tagset)]
            write_line(fp, (str(entry), ' '.join(tag_probs)))
def save_edge_costs(self, filename):
    with open_to_write(filename) as fp:
        for i, edge in enumerate(self.edge_set):
            write_line(fp, (edge, self.edge_cost_cache[i]))
def save_root_costs(self, filename):
    with open_to_write(filename) as fp:
        for i, entry in enumerate(self.lexicon):
            write_line(fp, (entry, self.root_cost_cache[i]))
def save(self, filename: str) -> None:
    with open_to_write(filename) as fp:
        for edge in self:
            write_line(fp, edge.to_tuple()[:3])
def save(self, filename: str) -> None:
    with open_to_write(filename) as fp:
        # the row with an empty tag field stores the smoothing probability
        write_line(fp, ('', self.smoothing_prob))
        for tag, prob in self.probs.items():
            write_line(fp, (''.join(tag), prob))
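# Sketch of the resulting file (hypothetical tags and probabilities);
# the first row, with an empty tag field, carries the smoothing
# probability:
#
#           0.001
#   NOUN    0.62
#   VERB    0.38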
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    # fall back to the unselected rule and graph files if model selection
    # has not been run
    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph, model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()
    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join(str(e) for e in path) +
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug(
        'Average path length: {}'.format(pathlen / len(lexicon)))

    # save rule frequency model fits to a file
    # (compare the configured model type, not the model object itself)
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule,
                                model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(int)
    queue = [(word, 0) for word in lexicon
             if sampler.branching.parent(word) is None]
    while queue:
        word, d = queue.pop()
        counts_per_depth[d] += 1
        queue.extend((child, d + 1)
                     for child in sampler.branching.successors(word))
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))
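# Example of a single line in paths.txt written above (hypothetical
# words): each lexicon entry is printed with its path back to the root,
# followed by the path length and the size of the root's subtree:
#
#   walked <- walk (2, 15)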
def save_rule_stats(self, filename):
    freq, contrib = self.compute_rule_stats()
    with open_to_write(filename) as fp:
        for r_id, rule in enumerate(self.model.rule_set):
            write_line(fp, (rule, freq[r_id], contrib[r_id]))