def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()

    # initialize the model and run soft-EM training
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    softem(full_graph, model)
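# The "use the model-selection output if present, otherwise fall back to the
# full file" pattern above recurs in several tasks. A hypothetical helper
# (not part of the original code) could factor it out:
def _choose_input_file(modsel_key: str, default_key: str) -> str:
    # prefer the model-selection result; fall back to the unfiltered file
    path = shared.filenames[modsel_key]
    return path if file_exists(path) else shared.filenames[default_key]

# e.g. rules_file = _choose_input_file('rules-modsel', 'rules')
#      edges_file = _choose_input_file('graph-modsel', 'graph')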
def get_analyzer(filename, lexicon, model):
    # load a cached analyzer if one exists, otherwise build and cache it
    if file_exists(filename):
        analyzer = Analyzer.load(filename, lexicon, model)
    else:
        analyzer = Analyzer(lexicon, model)
        analyzer.save(filename)
    return analyzer
def create_new_words_acceptor_if_not_exists(filename, analyzer, lexicon):
    if not file_exists(filename):
        # the new-words acceptor is the input projection of the analyzer
        # minus the words that are already in the lexicon
        new_words_acceptor = hfst.HfstTransducer(analyzer.fst)
        new_words_acceptor.convert(
            hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        new_words_acceptor.input_project()
        new_words_acceptor.minimize()
        new_words_acceptor.subtract(lexicon.to_fst())
        new_words_acceptor.minimize()
        FST.save_transducer(new_words_acceptor, filename)
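# A minimal usage sketch for the two helpers above (illustrative only; the
# literal file names are hypothetical, not taken from the original code):
#
# analyzer = get_analyzer('analyzer.fsm', lexicon, model)
# create_new_words_acceptor_if_not_exists(
#     'new-words-acceptor.fsm', analyzer, lexicon)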
def get_analyzer(filename, lexicon, model):
    kwargs = {}
    kwargs['predict_vec'] = \
        shared.config['analyze'].getboolean('predict_vec')
    kwargs['max_results'] = shared.config['analyze'].getint('max_results')
    kwargs['include_roots'] = True
    kwargs['enable_back_formation'] = \
        shared.config['analyze'].getboolean('enable_back_formation')
    if file_exists(filename):
        analyzer = Analyzer.load(filename, lexicon, model, **kwargs)
    else:
        analyzer = Analyzer(lexicon, model, **kwargs)
        analyzer.save(filename)
    return analyzer
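# The [analyze] options read above correspond to config entries like the
# following (option names are taken from the code; the values are purely
# illustrative):
#
# [analyze]
# predict_vec = no
# max_results = 10
# enable_back_formation = yes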
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite and save it
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    logging.getLogger('main').info('Saving the model...')
    model.save()
def is_loadable() -> bool:
    return file_exists(shared.filenames['root-model']) and \
        file_exists(shared.filenames['edge-model']) and \
        (shared.config['Models'].get('root_tag_model') == 'none' or
         file_exists(shared.filenames['root-tag-model'])) and \
        (shared.config['Models'].get('root_frequency_model') == 'none' or
         file_exists(shared.filenames['root-frequency-model'])) and \
        (shared.config['Models'].get('root_feature_model') == 'none' or
         file_exists(shared.filenames['root-feature-model'])) and \
        (shared.config['Models'].get('edge_frequency_model') == 'none' or
         file_exists(shared.filenames['edge-frequency-model'])) and \
        (shared.config['Models'].get('edge_feature_model') == 'none' or
         file_exists(shared.filenames['edge-feature-model']))
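# A minimal usage sketch (illustrative; it assumes is_loadable() and load()
# are static methods of ModelSuite, as the load() definition below suggests):
#
# if ModelSuite.is_loadable():
#     model = ModelSuite.load()
# else:
#     logging.getLogger('main').error(
#         'Model files are missing -- run the model fitting step first.')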
def run() -> None:
    # load the lexicon
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    # load the rules
    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    tagset = extract_tag_symbols_from_rules(rule_set)
    print(tagset)
    print(len(tagset))

    # compute the graph of possible edges and save it
    edge_set = compute_possible_edges(lexicon, rule_set)
    edge_set.save('possible-edges.txt')
def load() -> 'ModelSuite':
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    result = ModelSuite(rule_set)
    result.rule_model = RuleModelFactory.load(
        shared.config['Models'].get('rule_model'),
        shared.filenames['rule-model'])
    result.root_model = RootModelFactory.load(
        shared.config['Models'].get('root_model'),
        shared.filenames['root-model'])
    result.edge_model = EdgeModelFactory.load(
        shared.config['Models'].get('edge_model'),
        shared.filenames['edge-model'],
        rule_set)
    result.root_tag_model = \
        TagModelFactory.load(
            shared.config['Models'].get('root_tag_model'),
            shared.filenames['root-tag-model'])
    result.root_frequency_model = \
        RootFrequencyModelFactory.load(
            shared.config['Models'].get('root_frequency_model'),
            shared.filenames['root-frequency-model'])
    result.root_feature_model = \
        RootFeatureModelFactory.load(
            shared.config['Models'].get('root_feature_model'),
            shared.filenames['root-feature-model'],
            lexicon)
    result.edge_frequency_model = \
        EdgeFrequencyModelFactory.load(
            shared.config['Models'].get('edge_frequency_model'),
            shared.filenames['edge-frequency-model'],
            rule_set)
    result.edge_feature_model = \
        EdgeFeatureModelFactory.load(
            shared.config['Models'].get('edge_feature_model'),
            shared.filenames['edge-feature-model'],
            rule_set)
    return result
def load_rules() -> List[Tuple[Rule, float]]:
    # load rules together with their costs; for a weighted analyzer the cost
    # of a rule is -log of its productivity, and the identity rule ':/:___:'
    # is always included with cost 0
    rules_filename = None
    if shared.config['compile'].getboolean('weighted'):
        if shared.config['Models'].get('edge_model') == 'simple':
            rules_filename = shared.filenames['edge-model']
            max_cost = None \
                if shared.config['compile'].get('max_cost') == 'none' \
                else shared.config['compile'].getfloat('max_cost')
            rules = [(Rule.from_string(rule), -math.log(prod))
                     for rule, prod in
                         read_tsv_file(rules_filename, (str, float))
                     if max_cost is None or -math.log(prod) < max_cost] + \
                    [(Rule.from_string(':/:___:'), 0.0)]
            return rules
        else:
            raise Exception('Compiling a weighted analyzer is only possible'
                            ' for the Bernoulli edge model.')
    else:
        rules_filename = shared.filenames['rules-modsel']
        if not file_exists(rules_filename):
            rules_filename = shared.filenames['rules']
        return [(Rule.from_string(rule), 0.0)
                for (rule,) in read_tsv_file(rules_filename, (str,))] + \
               [(Rule.from_string(':/:___:'), 0.0)]
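# Weight conversion sketch (illustrative): with the 'simple' edge model the
# rule productivity `prod` read from the file is a probability, and
# -log(prod) is its cost in the tropical semiring, e.g.:
#
# import math
# cost = -math.log(0.5)   # ~0.693
# # with max_cost = 3.0, only rules with -math.log(prod) < 3.0 pass the
# # filter above, i.e. rules with prod <= exp(-3.0) ~ 0.0498 are dropped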
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # load the trained ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # set up the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph, model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()
    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) +
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug(
        'Average path length: {}'.format(pathlen / len(lexicon)))

    # save rule frequency model fits to a file
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule,
                                model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(lambda: 0)
    queue = [(word, 0) for word in lexicon
             if sampler.branching.parent(word) is None]
    while queue:
        (word, d) = queue.pop()
        counts_per_depth[d] += 1
        queue.extend([(word, d + 1)
                      for word in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))
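# The [sample] options read above map onto config entries such as the
# following (option names are taken from the code; the values are purely
# illustrative):
#
# [sample]
# warmup_iterations = 1000000
# sampling_iterations = 10000000
# iter_stat_interval = 10000
# stat_cost = yes
# stat_acc_rate = yes
# stat_iter_cost = no
# stat_edge_freq = yes
# stat_undirected_edge_freq = no
# stat_rule_freq = yes
# stat_rule_contrib = no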