Example #1
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()
    # dead code, kept for reference: explicit edge loading and
    # rule-frequency counting in the full graph
    # full_graph.load_edges_from_file(graph_file)
    # rule_freq = defaultdict(lambda: 0)
    # for edge in full_graph.iter_edges():
    #     rule_freq[edge.rule] += 1

    # initialize a ModelSuite (an older PointModel variant is kept
    # below for reference)
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    # model = PointModel()
    # model.fit_rootdist(lexicon.entries())
    # model.fit_ruledist(rule for (rule, domsize) in rules)
    # for rule, domsize in rules:
    #     model.add_rule(rule, domsize, freq=rule_freq[rule])

    softem(full_graph, model)
Example #2
def get_analyzer(filename, lexicon, model):
    if file_exists(filename):
        analyzer = Analyzer.load(filename, lexicon, model)
    else:
        analyzer = Analyzer(lexicon, model)
        analyzer.save(filename)
    return analyzer
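
All of these examples call a file_exists helper whose definition is not shown here. A minimal sketch of what it presumably does, assuming it is a thin wrapper around os.path.isfile (which module it actually lives in is not given in these excerpts):

import os.path

def file_exists(filename: str) -> bool:
    # Assumed behaviour: True iff the path exists and is a regular file.
    return os.path.isfile(filename)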
Example #3
def create_new_words_acceptor_if_not_exists(filename, analyzer, lexicon):
    if not file_exists(filename):
        new_words_acceptor = hfst.HfstTransducer(analyzer.fst)
        new_words_acceptor.convert(
            hfst.ImplementationType.TROPICAL_OPENFST_TYPE)
        new_words_acceptor.input_project()
        new_words_acceptor.minimize()
        new_words_acceptor.subtract(lexicon.to_fst())
        new_words_acceptor.minimize()
        FST.save_transducer(new_words_acceptor, filename)
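
A hypothetical call site for this helper, assuming analyzer and lexicon were obtained as in Example #2; the output filename below is illustrative, not taken from the project:

# Build the new-words acceptor once; later runs reuse the saved file.
create_new_words_acceptor_if_not_exists('new-words-acceptor.fsm',
                                        analyzer, lexicon)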
Example #4
def get_analyzer(filename, lexicon, model):
    kwargs = {}
    kwargs['predict_vec'] = \
        shared.config['analyze'].getboolean('predict_vec')
    kwargs['max_results'] = shared.config['analyze'].getint('max_results')
    kwargs['include_roots'] = True
    kwargs['enable_back_formation'] = \
        shared.config['analyze'].getboolean('enable_back_formation')
    if file_exists(filename):
        analyzer = Analyzer.load(filename, lexicon, model, **kwargs)
    else:
        analyzer = Analyzer(lexicon, model, **kwargs)
        analyzer.save(filename)
    return analyzer
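
The block of getboolean/getint calls follows the standard configparser section API. A behaviour-preserving sketch of the same function with the section proxy bound once (a stylistic variant, not code from the project):

def get_analyzer(filename, lexicon, model):
    cfg = shared.config['analyze']   # configparser section proxy
    kwargs = {
        'predict_vec': cfg.getboolean('predict_vec'),
        'max_results': cfg.getint('max_results'),
        'include_roots': True,
        'enable_back_formation': cfg.getboolean('enable_back_formation'),
    }
    if file_exists(filename):
        return Analyzer.load(filename, lexicon, model, **kwargs)
    analyzer = Analyzer(lexicon, model, **kwargs)
    analyzer.save(filename)
    return analyzer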
Example #5
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite and save it
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    logging.getLogger('main').info('Saving the model...')
    model.save()
Example #6
def is_loadable() -> bool:
    return file_exists(shared.filenames['root-model']) and \
           file_exists(shared.filenames['edge-model']) and \
           (shared.config['Models'].get('root_tag_model') == 'none' or
            file_exists(shared.filenames['root-tag-model'])) and \
           (shared.config['Models'].get('root_frequency_model') == 'none' or
            file_exists(shared.filenames['root-frequency-model'])) and \
           (shared.config['Models'].get('root_feature_model') == 'none' or
            file_exists(shared.filenames['root-feature-model'])) and \
           (shared.config['Models'].get('edge_frequency_model') == 'none' or
            file_exists(shared.filenames['edge-frequency-model'])) and \
           (shared.config['Models'].get('edge_feature_model') == 'none' or
            file_exists(shared.filenames['edge-feature-model']))
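
The same check can be written table-driven, which makes it easier to add a model type later. A sketch equivalent to the function above, assuming the same shared config sections and filename keys:

def is_loadable() -> bool:
    # Optional models: each must be disabled ('none') or have its file.
    optional = [('root_tag_model', 'root-tag-model'),
                ('root_frequency_model', 'root-frequency-model'),
                ('root_feature_model', 'root-feature-model'),
                ('edge_frequency_model', 'edge-frequency-model'),
                ('edge_feature_model', 'edge-feature-model')]
    return file_exists(shared.filenames['root-model']) and \
           file_exists(shared.filenames['edge-model']) and \
           all(shared.config['Models'].get(cfg_key) == 'none' or
               file_exists(shared.filenames[file_key])
               for cfg_key, file_key in optional)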
Example #7
def run() -> None:
    # load the lexicon
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    # load the rules
    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    tagset = extract_tag_symbols_from_rules(rule_set)
    print(tagset)
    print(len(tagset))
    # TODO compute the graph of possible edges
    # TODO save the graph
    edge_set = compute_possible_edges(lexicon, rule_set)
    edge_set.save('possible-edges.txt')
Example #8
def load() -> 'ModelSuite':
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    result = ModelSuite(rule_set)
    result.rule_model = RuleModelFactory.load(
                            shared.config['Models'].get('rule_model'),
                            shared.filenames['rule-model'])
    result.root_model = RootModelFactory.load(
                            shared.config['Models'].get('root_model'),
                            shared.filenames['root-model'])
    result.edge_model = EdgeModelFactory.load(
                            shared.config['Models'].get('edge_model'),
                            shared.filenames['edge-model'],
                            rule_set)
    result.root_tag_model = \
        TagModelFactory.load(
            shared.config['Models'].get('root_tag_model'),
            shared.filenames['root-tag-model'])
    result.root_frequency_model = \
        RootFrequencyModelFactory.load(
            shared.config['Models'].get('root_frequency_model'),
            shared.filenames['root-frequency-model'])
    result.root_feature_model = \
        RootFeatureModelFactory.load(
            shared.config['Models'].get('root_feature_model'),
            shared.filenames['root-feature-model'],
            lexicon)
    result.edge_frequency_model = \
        EdgeFrequencyModelFactory.load(
            shared.config['Models'].get('edge_frequency_model'),
            shared.filenames['edge-frequency-model'],
            rule_set)
    result.edge_feature_model = \
        EdgeFeatureModelFactory.load(
            shared.config['Models'].get('edge_feature_model'),
            shared.filenames['edge-feature-model'],
            rule_set)
    return result
Example #9
def load_rules() -> List[Tuple[Rule, float]]:
    rules_filename = None
    if shared.config['compile'].getboolean('weighted'):
        if shared.config['Models'].get('edge_model') == 'simple':
            rules_filename = shared.filenames['edge-model']
            max_cost = (None
                        if shared.config['compile'].get('max_cost') == 'none'
                        else shared.config['compile'].getfloat('max_cost'))
            rules = [(Rule.from_string(rule), -math.log(prod))
                     for rule, prod in
                         read_tsv_file(rules_filename, (str, float))
                     if max_cost is None or -math.log(prod) < max_cost] + \
                    [(Rule.from_string(':/:___:'), 0.0)]
            return rules
        else:
            raise Exception('Compiling a weighted analyzer is only possible'
                            ' for the Bernoulli edge model.')
    else:
        rules_filename = shared.filenames['rules-modsel']
        if not file_exists(rules_filename):
            rules_filename = shared.filenames['rules']
        return [(Rule.from_string(rule), 0.0)
                for (rule,) in read_tsv_file(rules_filename, (str,))] + \
               [(Rule.from_string(':/:___:'), 0.0)]
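
The weighted branch turns each rule's productivity prod into a cost via -math.log(prod), so near-certain rules are nearly free and rare ones are expensive; the identity rule ':/:___:' is appended with cost 0.0 so that leaving a word unchanged is free. A quick numeric check (the productivity values are made up for illustration):

import math

for prod in (0.9, 0.5, 0.01):
    # 0.9 -> ~0.105, 0.5 -> ~0.693, 0.01 -> ~4.605
    print('{:.2f} -> {:.3f}'.format(prod, -math.log(prod)))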
Example #10
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph,
        model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()

    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) + \
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug('Average path length: {}'\
                                    .format(pathlen / len(lexicon)))

    # save edge frequency model fits to a file
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule, model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(lambda: 0)
    queue = [(word, 0) for word in lexicon \
                       if sampler.branching.parent(word) is None]
    while queue:
        (word, d) = queue.pop()
        counts_per_depth[d] += 1
        queue.extend([(word, d+1) \
                      for word in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))