Example #1
 def _set_parameters(self, lexicon: Lexicon) -> None:
     self.dim = 10  # hidden layer size
     # TODO filter -- only non-tag symbols
     self.alphabet = lexicon.get_alphabet()
     self.alphabet_idx = {sym: i for i, sym in enumerate(self.alphabet, 1)}
     self.tagset = lexicon.get_tagset()
     self.tagset_idx = {tag: i for i, tag in enumerate(self.tagset)}
     self.maxlen = lexicon.get_max_word_length()
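A minimal usage sketch (not part of the original example) showing the Lexicon accessors read by _set_parameters; it assumes a word list loaded as in the later examples:

# Sketch only: inspect the quantities that _set_parameters reads from the lexicon.
lexicon = Lexicon.load(shared.filenames['wordlist'])
print(lexicon.get_alphabet())         # symbol inventory
print(lexicon.get_tagset())           # tag inventory
print(lexicon.get_max_word_length())  # used as maxlen above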
Example #2
 def remove_isolated_nodes(self) -> None:
     '''Remove nodes that are not part of any edge'''
     # FIXME a very dirty implementation!
     new_lexicon = Lexicon()
     new_lexicon.add(entry for entry in self.lexicon \
                     if self.ingoing_edges(entry) +\
                        self.outgoing_edges(entry))
     self.lexicon = new_lexicon
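A usage sketch mirroring the supervised branch of Example #4, where isolated nodes are dropped right after the graph is assembled (hypothetical placement; assumes this method belongs to the graph class):

# Sketch only: drop entries that take part in no edge.
full_graph = FullGraph(lexicon, edge_set)  # lexicon and edge_set as in Example #4
full_graph.remove_isolated_nodes()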
Example #3
 def _extract_candidate_edges(words: Iterable[str],
                              output_fun: Callable[...,
                                                   None], lexicon: Lexicon,
                              transducer_path: str) -> None:
     sw = similar_words(words, transducer_path)
     for word_1, simwords in sw:
         v1_list = lexicon.get_by_symstr(word_1)
         for v1 in v1_list:
             results_for_v1 = []
             for word_2 in simwords:
                 for v2 in lexicon.get_by_symstr(word_2):
                     if v1 != v2 and _is_possible_edge(v1, v2):
                         rules = extract_all_rules(v1, v2)
                         for rule in rules:
                             results_for_v1.append((v2.literal, str(rule)))
             output_fun((v1.literal, results_for_v1))
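A hypothetical call sketch: results.append stands in for output_fun, the word list is made up, and the lexicon transducer path from Example #5 is assumed to be a valid transducer_path:

# Sketch only: collect candidate edges in memory instead of streaming them out.
lexicon = Lexicon.load(shared.filenames['wordlist'])
results = []
_extract_candidate_edges(['machen', 'macht', 'mache'], results.append,
                         lexicon, shared.filenames['lexicon-tr'])
for word, candidates in results:
    print(word, candidates)  # candidates: (related word, rule string) pairs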
Example #4
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()
    # full_graph.load_edges_from_file(graph_file)

    # count rule frequencies in the full graph
    # rule_freq = defaultdict(lambda: 0)
    # for edge in full_graph.iter_edges():
    #     rule_freq[edge.rule] += 1

    # initialize the model
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    # model = PointModel()
    # model.fit_rootdist(lexicon.entries())
    # model.fit_ruledist(rule for (rule, domsize) in rules)
    # for rule, domsize in rules:
    #     model.add_rule(rule, domsize, freq=rule_freq[rule])

    softem(full_graph, model)
Example #5
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    logging.getLogger('main').info('Building the lexicon transducer...')
    lexicon_tr = lexicon.to_fst()
    FST.save_transducer(lexicon_tr, shared.filenames['lexicon-tr'])

    if shared.config['General'].getboolean('supervised'):
        logging.getLogger('main').info('Building graph...')
        build_graph_from_training_edges(lexicon, shared.filenames['wordlist'],
                                        shared.filenames['graph'])
    else:
        logging.getLogger('main').info('Building graph...')
        write_tsv_file(
            shared.filenames['graph'],
            build_graph_fstfastss(lexicon, shared.filenames['lexicon-tr']))

    sort_file(shared.filenames['graph'], key=3)
    update_file_size(shared.filenames['graph'])
    run_filters(shared.filenames['graph'])
    update_file_size(shared.filenames['graph'])

    # write rules file
    rules = []
    for rule_str, edges in read_tsv_file_by_key(shared.filenames['graph'],
                                                key=3,
                                                show_progressbar=False):
        rules.append(Rule.from_string(rule_str))
    lexicon_tr = lexicon.to_fst()
    FST.save_transducer(lexicon_tr, shared.filenames['lexicon-tr'])
    logging.getLogger('main').info('Computing rule domain sizes...')
    write_tsv_file(shared.filenames['rules'],
                   ((str(rule), domsize)\
                    for rule, domsize in \
                        compute_rule_domsizes(lexicon_tr, rules)))
Example #6
 def get_rnn_model_parameters(lexicon: Lexicon) -> Dict[str, Any]:
     result = {}
     if lexicon is not None:
         result['alphabet'] = lexicon.get_alphabet()
         logging.getLogger('main').debug(\
             'Detected alphabet: {}'.format(', '.join(result['alphabet'])))
         result['maxlen'] = lexicon.get_max_symstr_length()
         logging.getLogger('main').debug(\
             'Detected max. word length: {}'.format(result['maxlen']))
     else:
         # default settings (TODO move somewhere else?)
         result['alphabet'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                               'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
                               's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + \
                              shared.multichar_symbols
         result['maxlen'] = 20
     return result
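A brief sketch of both branches (hypothetical; assumes the function is callable at module level):

# Sketch only: without a lexicon, the hard-coded defaults are returned.
params = get_rnn_model_parameters(None)
print(params['maxlen'])  # 20

# With a loaded lexicon, alphabet and maxlen are derived from the data instead.
params = get_rnn_model_parameters(Lexicon.load(shared.filenames['wordlist']))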
Example #7
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    tr_path = full_path('tr.fsm')
    cmd = ['hfst-fst2strings', tr_path]
    p = subprocess.Popen(cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL,
                         universal_newlines=True,
                         bufsize=1)
    edge_set = EdgeSet(lexicon)
    while True:
        line = p.stdout.readline().strip()
        if line:
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(shared.compiled_patterns['tag'], '', w1)
            w2_without_tag = re.sub(shared.compiled_patterns['tag'], '', w2)
            if w1_without_tag != w2_without_tag:
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                rules = algorithms.align.extract_all_rules(n1, n2)
                for rule in rules:
                    if rule in rule_set:
                        n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                        n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                        edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
        else:
            break
    return edge_set
Example #8
 def load(model_type: str, filename: str, rule_set: RuleSet) -> EdgeModel:
     if model_type == 'simple':
         return SimpleEdgeModel.load(filename, rule_set)
     elif model_type == 'neural':
         lexicon = Lexicon.load(shared.filenames['wordlist'])
         edge_set = \
             EdgeSet.load(shared.filenames['graph'], lexicon, rule_set)
         negex_sampler = NegativeExampleSampler(rule_set)
         return NeuralEdgeModel.load(filename, rule_set, edge_set,
                                     negex_sampler)
     else:
         raise UnknownModelTypeException('edge', model_type)
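A call sketch mirroring how ModelSuite.load (Example #13) invokes this factory; the config key and file names are taken from that example:

# Sketch only: instantiate the edge model implementation named in the config.
rule_set = RuleSet.load(shared.filenames['rules'])
edge_model = EdgeModelFactory.load(shared.config['Models'].get('edge_model'),
                                   shared.filenames['edge-model'],
                                   rule_set)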
Example #9
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    logging.getLogger('main').info('Loading graph...')
    graph, weights = \
        load_graph(shared.filenames['weighted-graph'], lexicon,
                   threshold=shared.config['cluster'].getfloat('threshold'))
    logging.getLogger('main').info('Clustering...')
    clusters = chinese_whispers(\
        graph, weights,
        root_weights=shared.config['cluster'].getboolean('root_weights'),
        max_iterations=shared.config['cluster'].getint('max_iterations'))
    with open_to_write('clusters.txt') as fp:
        for cluster in clusters:
            fp.write(', '.join([str(node) for node in cluster]) + '\n')
Example #10
def run() -> None:
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    model = ModelSuite.load()
    analyzer = get_analyzer('analyzer.fsm', lexicon, model)
    predict_vec = shared.config['analyze'].getboolean('predict_vec')
    if shared.options['interactive']:
        for line in sys.stdin:
            try:
                lexitem = LexiconEntry(line.strip())
                for analysis in analyze_word(lexitem, analyzer,
                                             predict_vec=predict_vec):
                    print(*analysis, sep='\t')
            except Exception as e:
                logging.getLogger('main').warning(e)
    else:
        lexicon_to_analyze = \
            load_raw_vocabulary(shared.filenames['analyze.wordlist'])
        for lexitem in tqdm.tqdm(lexicon_to_analyze):
            for analysis in analyze_word(lexitem, analyzer,
                                         predict_vec=predict_vec):
                print(*analysis, sep='\t')
Example #11
def run() -> None:
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    model = ModelSuite.load()
    analyzer = get_analyzer('analyzer.fsm', lexicon, model)
    tr_file = 'wordgen.fst'
    create_new_words_acceptor_if_not_exists(tr_file, analyzer, lexicon)

    generator = tqdm.tqdm(
        generate_words(
            tr_file,
            analyzer,
            model,
            freq_model=shared.config['generate'].getboolean('freq_model'),
            sum_analyses=shared.config['generate'].getboolean('sum_analyses'),
            min_freq=shared.config['generate'].getfloat('min_freq'),
            max_freq=shared.config['generate'].getfloat('max_freq')))
    # TODO write to a file
    for word, cost in generator:
        print(word, cost, sep='\t')
Example #12
def run() -> None:
    # load the lexicon
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    # load the rules
    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    tagset = extract_tag_symbols_from_rules(rule_set)
    print(tagset)
    print(len(tagset))
    # TODO compute the graph of possible edges
    # TODO save the graph
    edge_set = compute_possible_edges(lexicon, rule_set)
    edge_set.save('possible-edges.txt')
Example #13
 def load() -> 'ModelSuite':
     rules_file = shared.filenames['rules-modsel']
     if not file_exists(rules_file):
         rules_file = shared.filenames['rules']
     rule_set = RuleSet.load(rules_file)
     lexicon = Lexicon.load(shared.filenames['wordlist'])
     result = ModelSuite(rule_set)
     result.rule_model = RuleModelFactory.load(
                             shared.config['Models'].get('rule_model'),
                             shared.filenames['rule-model'])
     result.root_model = RootModelFactory.load(
                             shared.config['Models'].get('root_model'),
                             shared.filenames['root-model'])
     result.edge_model = EdgeModelFactory.load(
                             shared.config['Models'].get('edge_model'),
                             shared.filenames['edge-model'],
                             rule_set)
     result.root_tag_model = \
         TagModelFactory.load(
             shared.config['Models'].get('root_tag_model'),
             shared.filenames['root-tag-model'])
     result.root_frequency_model = \
         RootFrequencyModelFactory.load(
             shared.config['Models'].get('root_frequency_model'),
             shared.filenames['root-frequency-model'])
     result.root_feature_model = \
         RootFeatureModelFactory.load(
             shared.config['Models'].get('root_feature_model'),
             shared.filenames['root-feature-model'],
             lexicon)
     result.edge_frequency_model = \
         EdgeFrequencyModelFactory.load(
             shared.config['Models'].get('edge_frequency_model'),
             shared.filenames['edge-frequency-model'],
             rule_set)
     result.edge_feature_model = \
         EdgeFeatureModelFactory.load(
             shared.config['Models'].get('edge_feature_model'),
             shared.filenames['edge-feature-model'],
             rule_set)
     return result
Example #14
def run() -> None:
    lexicon = Lexicon.load(shared.filenames['wordlist'])
    lexicon_tr = FST.load_transducer(shared.filenames['lexicon-tr'])
    rules_tr = FST.load_transducer(shared.filenames['rules-tr'])
    rules_tr.convert(hfst.ImplementationType.HFST_OLW_TYPE)
    alphabet = lexicon_tr.get_alphabet()
    model = ModelSuite.load()
    max_results = shared.config['inflect'].getint('max_results')

    if shared.options['interactive']:
        for line in sys.stdin:
            try:
                lemma_str, tag = line.rstrip().split()
                lemma = LexiconEntry(lemma_str)
                for analysis in inflect_word(lemma,
                                             tag,
                                             rules_tr,
                                             model,
                                             max_results=max_results):
                    print(*analysis, sep='\t')
            except Exception as e:
                logging.getLogger('main').warning(e)
    else:
        pairs = []
        # FIXME is there a better solution for creating lists of LexiconEntry
        # objects and skipping the ones for which exceptions are thrown?
        for lemma, tag in read_tsv_file(shared.filenames['analyze.wordlist']):
            try:
                pairs.append((LexiconEntry(lemma), tag))
            except Exception as e:
                logging.warning(e)
        for lemma, tag in tqdm.tqdm(pairs):
            for analysis in inflect_word(lemma,
                                         tag,
                                         rules_tr,
                                         model,
                                         max_results=max_results):
                print(*analysis, sep='\t')
Example #15
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite and save it
    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    logging.getLogger('main').info('Saving the model...')
    model.save()
Example #16
 def test_compute_domsize(self) -> None:
     lexicon = Lexicon()
     lexicon.add(LexiconEntry('anwinkeln<VVINF>'))
     lexicon.add(LexiconEntry('machen<VVINF>'))
     lexicon.add(LexiconEntry('Sachen<NN>'))
     lexicon.add(LexiconEntry('lachen<VVINF>'))
     lexicon.add(LexiconEntry('Dörfern<NN>'))
     lex_fst = lexicon.to_fst()
     self.assertEqual(self.rules[0].compute_domsize(lex_fst), 2)
     self.assertEqual(self.rules[1].compute_domsize(lex_fst), 18)
     self.assertEqual(self.rules[2].compute_domsize(lex_fst), 2)
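A smaller sketch of the same domain-size computation, combining calls shown in this test and in Example #18 (toy word list, illustrative only):

# Sketch only: domain size of a single rule against a tiny lexicon.
lexicon = Lexicon(LexiconEntry(word) for word in ['machen', 'macht', 'mache'])
lex_fst = lexicon.to_fst()
rule = Rule.from_string(':/en:t___:')
print(rule.compute_domsize(lex_fst))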
Example #17
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rule_set = RuleSet.load(shared.filenames['rules'])

    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(shared.filenames['graph'], lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    logging.getLogger('main').info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    deleted_rules = set()

    for iter_num in range(shared.config['modsel'].getint('iterations')):
        sampler = MCMCGraphSampler(
            full_graph, model,
            shared.config['modsel'].getint('warmup_iterations'),
            shared.config['modsel'].getint('sampling_iterations'))
        sampler.add_stat('acc_rate', AcceptanceRateStatistic(sampler))
        sampler.add_stat('edge_freq', EdgeFrequencyStatistic(sampler))
        sampler.add_stat('exp_cost', ExpectedCostStatistic(sampler))
        sampler.run_sampling()

        # fit the model
        edge_weights = sampler.stats['edge_freq'].value()
        root_weights = np.ones(len(full_graph.lexicon))
        for idx in range(edge_weights.shape[0]):
            root_id = \
                full_graph.lexicon.get_id(full_graph.edge_set[idx].target)
            root_weights[root_id] -= edge_weights[idx]
        model.fit(sampler.lexicon, sampler.edge_set, root_weights,
                  edge_weights)

        # compute the rule statistics
        freq, contrib = sampler.compute_rule_stats()

        # determine the rules to delete
        deleted_rules |= set(np.where(contrib < 0)[0])
        logging.getLogger('main').info(\
            '{} rules deleted.'.format(len(deleted_rules)))

        # delete the edges with selected rules from the graph
        edges_to_delete = []
        for edge in full_graph.edges_iter():
            if model.rule_set.get_id(edge.rule) in deleted_rules:
                edges_to_delete.append(edge)
        full_graph.remove_edges(edges_to_delete)

        # deleting the rules is not necessary -- instead, save the reduced
        # rule set at the end; fitting will be performed separately

    logging.getLogger('main').info('Saving the graph...')
    full_graph.edge_set.save(shared.filenames['graph-modsel'])

    # remove the deleted rules from the rule set and save it
    logging.getLogger('main').info('Saving the rule set...')
    new_rule_set = RuleSet()
    for i, rule in enumerate(rule_set):
        if i not in deleted_rules:
            new_rule_set.add(rule, rule_set.get_domsize(rule))
    new_rule_set.save(shared.filenames['rules-modsel'])
Example #18
    def test_complete_sample(self) -> None:
        'Test a sample consisting of all possible negative edges.'
        words = [
            'machen', 'macht', 'mache', 'Sachen', 'Sache', 'anwinkeln',
            'anzuwinkeln'
        ]
        rules = [\
            ':/en:t___:',
            ':/n:___:',
            ':/a:ä/:er___:',
            ':/:zu/:___:'
        ]
        positive_edges = [\
            ('machen', 'macht', ':/en:t___:'),
            ('machen', 'mache', ':/n:___:'),
            ('Sachen', 'Sache', ':/n:___:'),
            ('anwinkeln', 'anzuwinkeln', ':/:zu/:___:'),
        ]
        expected_negative_edges = [\
            ('Sachen', '{CAP}sacht', ':/en:t___:'),
            ('anwinkeln', 'anwinkel', ':/n:___:'),
            ('anzuwinkeln', 'anzuwinkel', ':/n:___:'),
            ('machen', 'mächener', ':/a:ä/:er___:'),
            ('macht', 'mächter', ':/a:ä/:er___:'),
            ('mache', 'mächeer', ':/a:ä/:er___:'),
            ('Sachen', '{CAP}sächener', ':/a:ä/:er___:'),
            ('Sache', '{CAP}sächeer', ':/a:ä/:er___:'),
            ('machen', 'mzuachen', ':/:zu/:___:'),
            ('machen', 'mazuchen', ':/:zu/:___:'),
            ('machen', 'maczuhen', ':/:zu/:___:'),
            ('machen', 'machzuen', ':/:zu/:___:'),
            ('machen', 'machezun', ':/:zu/:___:'),
            ('mache', 'mzuache', ':/:zu/:___:'),
            ('mache', 'mazuche', ':/:zu/:___:'),
            ('mache', 'maczuhe', ':/:zu/:___:'),
            ('mache', 'machzue', ':/:zu/:___:'),
            ('macht', 'mzuacht', ':/:zu/:___:'),
            ('macht', 'mazucht', ':/:zu/:___:'),
            ('macht', 'maczuht', ':/:zu/:___:'),
            ('macht', 'machzut', ':/:zu/:___:'),
            ('Sachen', '{CAP}zusachen', ':/:zu/:___:'),
            ('Sachen', '{CAP}szuachen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sazuchen', ':/:zu/:___:'),
            ('Sachen', '{CAP}saczuhen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sachzuen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sachezun', ':/:zu/:___:'),
            ('Sache', '{CAP}zusache', ':/:zu/:___:'),
            ('Sache', '{CAP}szuache', ':/:zu/:___:'),
            ('Sache', '{CAP}sazuche', ':/:zu/:___:'),
            ('Sache', '{CAP}saczuhe', ':/:zu/:___:'),
            ('Sache', '{CAP}sachzue', ':/:zu/:___:'),
            ('anwinkeln', 'azunwinkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwzuinkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwizunkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinzukeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkzueln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkezuln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkelzun', ':/:zu/:___:'),
            ('anzuwinkeln', 'azunzuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuzuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzzuuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwzuinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwizunkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinzukeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkzueln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkezuln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkelzun', ':/:zu/:___:')
        ]
        expected_weights = {\
            ':/en:t___:' : 1.0,
            ':/n:___:' : 1.0,
            ':/a:ä/:er___:' : 1.0,
            ':/:zu/:___:' : 41/40       # the word "anzuzuwinkeln" can be
                                        # derived in two different ways, so
                                        # it is counted double in domsize
                                        # computation, but sampled only once;
                                        # such cases are very rare, so they
                                        # shouldn't influence the weights much
        }

        lexicon = Lexicon(LexiconEntry(word) for word in words)
        lex_fst = lexicon.to_fst()
        rule_set = RuleSet()
        for rule_str in rules:
            rule = Rule.from_string(rule_str)
            rule_set.add(rule, rule.compute_domsize(lex_fst))
        edge_iter = (GraphEdge(lexicon[source], lexicon[target],
                               rule_set[rule]) \
                     for (source, target, rule) in positive_edges)
        edge_set = EdgeSet(lexicon, edge_iter)

        negex_sampler = NegativeExampleSampler(rule_set)
        sample_size = len(expected_negative_edges)
        sample = negex_sampler.sample(lexicon,
                                      sample_size,
                                      show_progressbar=False)
        sample_weights = negex_sampler.compute_sample_weights(sample, edge_set)

        self.assertEqual(rule_set.get_domsize(rule_set[0]), 2)
        self.assertEqual(rule_set.get_domsize(rule_set[1]), 4)
        self.assertEqual(rule_set.get_domsize(rule_set[2]), 5)
        self.assertEqual(rule_set.get_domsize(rule_set[3]), 42)
        self.longMessage = False
        for edge in edge_set:
            self.assertNotIn(edge,
                             sample,
                             msg='positive edge: {} in sample'.format(edge))
        for source, target, rule in expected_negative_edges:
            edge = GraphEdge(lexicon[source], LexiconEntry(target),
                             rule_set[rule])
            self.assertIn(edge, sample, msg='{} not in sample'.format(edge))
        self.longMessage = True
        for i, edge in enumerate(sample):
            self.assertAlmostEqual(sample_weights[i],
                                   expected_weights[str(edge.rule)],
                                   msg='for edge {}'.format(edge))
Example #19
def run() -> None:
    logging.getLogger('main').info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    logging.getLogger('main').info('Loading rules...')
    rules_file = shared.filenames['rules-modsel']
    if not file_exists(rules_file):
        rules_file = shared.filenames['rules']
    rule_set = RuleSet.load(rules_file)

    edges_file = shared.filenames['graph-modsel']
    if not file_exists(edges_file):
        edges_file = shared.filenames['graph']
    logging.getLogger('main').info('Loading the graph...')
    edge_set = EdgeSet.load(edges_file, lexicon, rule_set)
    full_graph = FullGraph(lexicon, edge_set)

    # initialize a ModelSuite
    logging.getLogger('main').info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    logging.getLogger('main').info('Setting up the sampler...')
    sampler = MCMCGraphSamplerFactory.new(
        full_graph,
        model,
        warmup_iter=shared.config['sample'].getint('warmup_iterations'),
        sampling_iter=shared.config['sample'].getint('sampling_iterations'),
        iter_stat_interval=shared.config['sample'].getint(
            'iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    if shared.config['sample'].getboolean('stat_cost'):
        sampler.add_stat('cost', stats.ExpectedCostStatistic(sampler))
    if shared.config['sample'].getboolean('stat_acc_rate'):
        sampler.add_stat('acc_rate', stats.AcceptanceRateStatistic(sampler))
    if shared.config['sample'].getboolean('stat_iter_cost'):
        sampler.add_stat('iter_cost', stats.CostAtIterationStatistic(sampler))
    if shared.config['sample'].getboolean('stat_edge_freq'):
        sampler.add_stat('edge_freq', stats.EdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_undirected_edge_freq'):
        sampler.add_stat('undirected_edge_freq',
                         stats.UndirectedEdgeFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_freq'):
        sampler.add_stat('freq', stats.RuleFrequencyStatistic(sampler))
    if shared.config['sample'].getboolean('stat_rule_contrib'):
        sampler.add_stat('contrib',
                         stats.RuleExpectedContributionStatistic(sampler))

    # run sampling and print results
    logging.getLogger('main').info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()

    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    pathlen = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            size = sampler.branching.subtree_size(root)
            fp.write(' <- '.join([str(e) for e in path]) + \
                     ' ({}, {})\n'.format(len(path), size))
            pathlen += len(path)
    logging.getLogger('main').debug('Average path length: {}'\
                                    .format(pathlen / len(lexicon)))

    # save rule frequency model fits to a file
    if shared.config['Models'].get('edge_frequency_model') == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                write_line(fp, (rule, model.edge_frequency_model.means[r_id],
                                model.edge_frequency_model.sdevs[r_id]))

    # count words at each depth in the graph
    counts_per_depth = defaultdict(lambda: 0)
    queue = [(word, 0) for word in lexicon \
                       if sampler.branching.parent(word) is None]
    while queue:
        (word, d) = queue.pop()
        counts_per_depth[d] += 1
        queue.extend([(word, d+1) \
                      for word in sampler.branching.successors(word)])
    logging.getLogger('main').debug('Number of nodes per depth:')
    for d, c in counts_per_depth.items():
        logging.getLogger('main').debug('{} {}'.format(d, c))