Пример #1
0
    def _compute_untagged_edges_and_transition_mat(self, full_graph, model):
        """Group tagged edges by their untagged form and build tag matrices.

        Every edge of ``full_graph.edge_set`` is mapped to an untagged edge:
        source and target are looked up in the lexicon by their joined word
        symbols and the rule is rebuilt from ``edge.rule.subst`` only.  For
        each distinct untagged edge a ``T x T`` matrix (``T`` being
        ``len(self.tagset)``) is filled with ``p / (1 - p)`` of the
        corresponding tagged edges, indexed by the ids of the two entries of
        ``edge.rule.tag_subst``.

        Returns:
            A pair ``(untagged_edge_set, edge_tr_mat)`` where ``edge_tr_mat``
            is a list of ``csr_matrix`` objects ordered by untagged edge id.

        Raises:
            Exception: if the untagged edge ids do not match the positions
                in the matrix list.
        """
        logging.getLogger('main').info('Computing transition matrices...')

        def _strip_tags(lexicon, tagged_edge):
            # first lexicon entry matching the tagless symbol string
            src = lexicon.get_by_symstr(''.join(tagged_edge.source.word))[0]
            tgt = lexicon.get_by_symstr(''.join(tagged_edge.target.word))[0]
            return GraphEdge(src, tgt, Rule(tagged_edge.rule.subst))

        probs = model.edges_prob(full_graph.edge_set)
        odds = probs / (1 - probs)
        untagged_edge_set = EdgeSet(full_graph.lexicon)
        num_tags = len(self.tagset)

        # tagged_ids[k] lists the tagged edge ids collapsing to untagged edge k
        tagged_ids = []
        for tagged_id, tagged_edge in enumerate(full_graph.edge_set):
            bare_edge = _strip_tags(full_graph.lexicon, tagged_edge)
            if bare_edge not in untagged_edge_set:
                untagged_edge_set.add(bare_edge)
                tagged_ids.append([])
            tagged_ids[untagged_edge_set.get_id(bare_edge)].append(tagged_id)

        edge_tr_mat = []
        progress = tqdm.tqdm(enumerate(tagged_ids), total=len(tagged_ids))
        for bare_id, members in progress:
            dense = np.zeros((num_tags, num_tags))
            for tagged_id in members:
                tag_subst = full_graph.edge_set[tagged_id].rule.tag_subst
                row = self.tag_idx[tag_subst[0]]
                col = self.tag_idx[tag_subst[1]]
                dense[row, col] = odds[tagged_id]
            if bare_id != len(edge_tr_mat):
                raise Exception('Inconsistent untagged edge IDs!')
            edge_tr_mat.append(csr_matrix(dense))
        return untagged_edge_set, edge_tr_mat
Пример #2
0
    def _compute_leaf_prob(self):
        """Fill ``self.leaf_prob`` from all edges generated by the rule set.

        ``self.leaf_prob`` starts as a matrix of ones with one row per
        lexicon entry and one column per tag.  The lexicon transducer
        (extended with the tag generator) is composed with the rule
        transducer, saved as 'tr.fsm' and enumerated with the external
        ``hfst-fst2strings`` tool.  Every enumerated pair that is licensed by
        a rule of ``self.model.rule_set`` becomes an edge, and for every edge
        the entry for (source word, source tag) is multiplied by
        ``1 - edge probability``.  Edges are processed in batches so that at
        most a little over 300000 of them are held at once.
        """
        logging.getLogger('main').info('Computing leaf probabilities...')
        self.leaf_prob = np.ones((len(self.lexicon), len(self.tagset)),
                                 dtype=np.float64)
        # Use the instance's lexicon and the model's rule set explicitly;
        # bare `lexicon` / `rule_set` are not defined in this scope.
        rule_set = self.model.rule_set
        edge_set = EdgeSet(self.lexicon)

        def _empty_edge_set(edge_set):
            # Fold the batch into self.leaf_prob and return a fresh,
            # empty edge set over the same lexicon.
            lexicon = edge_set.lexicon
            n = len(edge_set)
            probs = 1 - self.model.edges_prob(edge_set)
            for e_id, edge in enumerate(edge_set):
                word = lexicon.get_by_symstr(''.join(edge.source.word))[0]
                w_id = lexicon.get_id(word)
                t_id = self.tag_idx[edge.source.tag]
                self.leaf_prob[w_id, t_id] *= probs[e_id]
            edge_set = EdgeSet(lexicon)
            print(n)
            return edge_set

        lexicon_tr = self.lexicon.to_fst()
        lexicon_tr.concatenate(FST.generator(self.tagset))
        rules_tr = rule_set.to_fst()
        tr = hfst.HfstTransducer(lexicon_tr)
        tr.compose(rules_tr)
        tr.determinize()
        tr.minimize()
        FST.save_transducer(tr, 'tr.fsm')

        cmd = ['hfst-fst2strings', full_path('tr.fsm')]
        # The context manager closes the pipes and waits for the child, so
        # the process is reaped even if edge construction raises.
        with subprocess.Popen(cmd,
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL,
                              universal_newlines=True,
                              bufsize=1) as p:
            while True:
                line = p.stdout.readline().strip()
                if not line:
                    # end of output (or a blank line) terminates the scan
                    break
                w1, w2 = line.split(':')
                n1 = LexiconEntry(w1)
                n2 = LexiconEntry(w2)
                for rule in extract_all_rules(n1, n2):
                    if rule in rule_set:
                        edge_set.add(GraphEdge(n1, n2, rule))
                if len(edge_set) > 300000:
                    edge_set = _empty_edge_set(edge_set)
        # process whatever is left in the last, partially filled batch
        _empty_edge_set(edge_set)
Пример #3
0
def run() -> None:
    """Load lexicon, rules and graph, build a ModelSuite and call softem().

    The 'rules-modsel' and 'graph-modsel' files are used when they exist;
    otherwise the 'rules' and 'graph' files are loaded instead.  If the
    'supervised' option of the 'General' config section is set,
    ``remove_isolated_nodes()`` is called on the graph first.
    """
    log = logging.getLogger('main')

    log.info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    log.info('Loading rules...')
    modsel_rules = shared.filenames['rules-modsel']
    if file_exists(modsel_rules):
        rule_set = RuleSet.load(modsel_rules)
    else:
        rule_set = RuleSet.load(shared.filenames['rules'])

    modsel_graph = shared.filenames['graph-modsel']
    if file_exists(modsel_graph):
        graph_path = modsel_graph
    else:
        graph_path = shared.filenames['graph']
    log.info('Loading the graph...')
    full_graph = FullGraph(lexicon,
                           EdgeSet.load(graph_path, lexicon, rule_set))
    if shared.config['General'].getboolean('supervised'):
        full_graph.remove_isolated_nodes()

    log.info('Initializing the model...')
    softem(full_graph, ModelSuite(rule_set, lexicon=lexicon))
Пример #4
0
def compute_possible_edges(lexicon: Lexicon, rule_set: RuleSet) -> EdgeSet:
    """Enumerate all edges between lexicon entries licensed by ``rule_set``.

    A transducer ``lexicon . rules . lexicon^-1`` is built, written to
    'tr.fsm' and enumerated with the external ``hfst-fst2strings`` tool.
    Each output line ``w1:w2`` whose two sides differ after removing tag
    symbols is aligned with ``extract_all_rules``; every extracted rule that
    is a member of ``rule_set`` yields one edge between the corresponding
    lexicon entries.

    Args:
        lexicon: the lexicon providing both edge endpoints.
        rule_set: the rules that edges may use.

    Returns:
        An ``EdgeSet`` over ``lexicon`` with all edges found.
    """
    # build the transducer
    lexicon_tr = lexicon.to_fst()
    tag_seqs = extract_tag_symbols_from_rules(rule_set)
    if tag_seqs:
        lexicon_tr.concatenate(FST.generator(tag_seqs))
    rules_tr = rule_set.to_fst()
    tr = hfst.HfstTransducer(lexicon_tr)
    tr.compose(rules_tr)
    tr.determinize()
    tr.minimize()
    # restrict the output side to words of the lexicon as well
    lexicon_tr.invert()
    tr.compose(lexicon_tr)
    tr.determinize()
    tr.minimize()
    FST.save_transducer(tr, 'tr.fsm')

    cmd = ['hfst-fst2strings', full_path('tr.fsm')]
    tag_pattern = shared.compiled_patterns['tag']
    edge_set = EdgeSet(lexicon)
    # The context manager closes the pipes and waits for the child process,
    # so it is reaped even when edge construction raises.
    with subprocess.Popen(cmd,
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL,
                          universal_newlines=True,
                          bufsize=1) as p:
        while True:
            line = p.stdout.readline().strip()
            if not line:
                # end of output (or a blank line) terminates the scan
                break
            w1, w2 = line.split(':')
            w1_without_tag = re.sub(tag_pattern, '', w1)
            w2_without_tag = re.sub(tag_pattern, '', w2)
            if w1_without_tag == w2_without_tag:
                # same word on both sides (up to tags): not an edge
                continue
            n1 = LexiconEntry(w1)
            n2 = LexiconEntry(w2)
            for rule in algorithms.align.extract_all_rules(n1, n2):
                if rule in rule_set:
                    n1_wt = lexicon.get_by_symstr(w1_without_tag)[0]
                    n2_wt = lexicon.get_by_symstr(w2_without_tag)[0]
                    edge_set.add(GraphEdge(n1_wt, n2_wt, rule))
    return edge_set
Пример #5
0
def load_graph(filename, lexicon, threshold=0.0):
    """Read a weighted graph from a TSV file.

    Each row is expected to hold ``word_1, word_2, rule, frequency``.  Rows
    whose frequency is below ``threshold`` are skipped.  A row is also
    skipped, silently, when a ``ValueError`` is raised while it is being
    processed.

    Returns:
        A pair ``(FullGraph, weights)`` where ``weights`` is a numpy array
        of the frequencies of the rows that were kept, in file order.
    """
    graph_edges = EdgeSet(lexicon)
    kept_weights = []
    parsed_rules = {}       # rule string -> Rule, each rule is parsed once
    for word_1, word_2, rule_str, edge_freq_str in read_tsv_file(filename):
        try:
            freq = float(edge_freq_str)
            if freq < threshold:
                continue
            if rule_str not in parsed_rules:
                parsed_rules[rule_str] = Rule.from_string(rule_str)
            graph_edges.add(GraphEdge(lexicon[word_1], lexicon[word_2],
                                      parsed_rules[rule_str], weight=freq))
            kept_weights.append(freq)
        except ValueError:
            # rows that cannot be processed are dropped without notice
            continue
    return FullGraph(lexicon, graph_edges), np.array(kept_weights)
Пример #6
0
 def edges_cost(self, edge_set: EdgeSet) -> np.ndarray:
     """Return the cost of every edge as a negative normal log-density.

     For each rule, the difference ``source.logfreq - target.logfreq`` of
     its edges is scored under a normal distribution with that rule's entry
     of ``self.means`` and ``self.sdevs``.  The result is indexed by edge
     id; edges not listed by ``get_edge_ids_by_rule()`` keep cost 0.
     """
     costs_by_edge = np.zeros(len(edge_set))
     ids_by_rule = edge_set.get_edge_ids_by_rule()
     for rule, edge_ids in ids_by_rule.items():
         r_id = self.rule_set.get_id(rule)
         logfreq_diffs = np.array(
             [edge_set[e].source.logfreq - edge_set[e].target.logfreq
              for e in edge_ids])
         neg_loglik = -norm.logpdf(logfreq_diffs,
                                   self.means[r_id, ],
                                   self.sdevs[r_id, ])
         costs_by_edge[tuple(edge_ids), ] = neg_loglik
     return costs_by_edge
Пример #7
0
 def load(model_type: str, filename: str, rule_set: RuleSet) -> EdgeModel:
     """Load an edge model of the given type from ``filename``.

     Args:
         model_type: 'simple' or 'neural'.
         filename: file the model is loaded from.
         rule_set: rule set handed to the model loader.

     Raises:
         UnknownModelTypeException: for any other ``model_type``.
     """
     if model_type == 'simple':
         return SimpleEdgeModel.load(filename, rule_set)
     if model_type != 'neural':
         raise UnknownModelTypeException('edge', model_type)
     # the neural loader additionally takes the graph edges and a
     # negative example sampler
     lexicon = Lexicon.load(shared.filenames['wordlist'])
     edge_set = EdgeSet.load(shared.filenames['graph'], lexicon, rule_set)
     return NeuralEdgeModel.load(filename, rule_set, edge_set,
                                 NegativeExampleSampler(rule_set))
Пример #8
0
 def edges_cost(self, edge_set: EdgeSet) -> np.ndarray:
     """Return the cost of every edge as a negative Gaussian log-density.

     For each rule, the vector differences ``target.vec - source.vec`` of
     its edges are scored under a multivariate normal distribution whose
     mean is that rule's row of ``self.means`` and whose covariance is the
     diagonal matrix built from that rule's row of ``self.vars``.  The
     result is indexed by edge id; edges not listed by
     ``get_edge_ids_by_rule()`` keep cost 0.
     """
     costs_by_edge = np.zeros(len(edge_set))
     ids_by_rule = edge_set.get_edge_ids_by_rule()
     for rule, edge_ids in ids_by_rule.items():
         r_id = self.rule_set.get_id(rule)
         vec_diffs = np.vstack(
             [edge_set[e].target.vec - edge_set[e].source.vec
              for e in edge_ids])
         covariance = np.diag(self.vars[r_id, ])
         neg_loglik = -multivariate_normal.logpdf(vec_diffs,
                                                  self.means[r_id, ],
                                                  covariance)
         costs_by_edge[tuple(edge_ids), ] = neg_loglik
     return costs_by_edge
Пример #9
0
 def _empty_edge_set(edge_set):
     """Fold a batch of edges into ``self.leaf_prob`` and start a new batch.

     For every edge, the ``leaf_prob`` entry of (source word, source tag)
     is multiplied by ``1 - edge probability``.  The batch size is printed
     and a new, empty ``EdgeSet`` over the same lexicon is returned.
     ``self`` is taken from the enclosing scope.
     """
     lexicon = edge_set.lexicon
     batch_size = len(edge_set)
     complements = 1 - self.model.edges_prob(edge_set)
     for e_id, edge in enumerate(edge_set):
         # first lexicon entry matching the source's joined word symbols
         entry = lexicon.get_by_symstr(''.join(edge.source.word))[0]
         row = lexicon.get_id(entry)
         col = self.tag_idx[edge.source.tag]
         self.leaf_prob[row, col] *= complements[e_id]
     fresh = EdgeSet(lexicon)
     print(batch_size)
     return fresh
Пример #10
0
 def fit(self, edge_set: EdgeSet, weights: np.ndarray) -> None:
     """Fit the per-rule parameters from weighted edges.

     ``self.means`` and ``self.sdevs`` are allocated (uninitialised, one
     entry per rule) if they are still ``None``.  Then, for each rule,
     ``self.fit_rule`` is called with the rule id, the differences
     ``source.logfreq - target.logfreq`` of its edges and the matching
     entries of ``weights``.
     """
     num_rules = len(self.rule_set)
     if self.means is None:
         self.means = np.empty(num_rules)
     if self.sdevs is None:
         self.sdevs = np.empty(num_rules)
     ids_by_rule = edge_set.get_edge_ids_by_rule()
     for rule, ids in ids_by_rule.items():
         ids = tuple(ids)
         logfreq_diffs = np.array(
             [edge_set[e].source.logfreq - edge_set[e].target.logfreq
              for e in ids])
         rule_weights = weights[ids, ]
         self.fit_rule(self.rule_set.get_id(rule), logfreq_diffs,
                       rule_weights)
Пример #11
0
 def fit(self, edge_set: EdgeSet, weights: np.ndarray) \
        -> None:
     """Fit the per-rule parameters from weighted edges.

     ``self.means`` and ``self.vars`` are allocated (uninitialised, shape
     ``(number of rules, self.dim)``) if they are still ``None``.  Then,
     for each rule, ``self.fit_rule`` is called with the rule id, the
     matrix of differences ``target.vec - source.vec`` of its edges (one
     row per edge) and the matching entries of ``weights``.
     """
     param_shape = (len(self.rule_set), self.dim)
     if self.means is None:
         self.means = np.empty(param_shape)
     if self.vars is None:
         self.vars = np.empty(param_shape)
     ids_by_rule = edge_set.get_edge_ids_by_rule()
     for rule, ids in ids_by_rule.items():
         ids = tuple(ids)
         vec_diffs = np.array(
             [edge_set[e].target.vec - edge_set[e].source.vec
              for e in ids])
         rule_weights = weights[ids, ]
         self.fit_rule(self.rule_set.get_id(rule), vec_diffs,
                       rule_weights)
Пример #12
0
    def sample(self,
               lexicon: Lexicon,
               sample_size: int,
               show_progressbar: bool = True) -> EdgeSet:
        """Draw random edges whose targets are not in the lexicon.

        The rules of ``self.rule_set`` are handed to ``parallel_execute``
        using the number of processes configured as 'num_processes' in the
        'NegativeExampleSampler' section; each worker is asked for
        ``int(sample_size / num_processes)`` edges.

        Args:
            lexicon: source words are drawn uniformly from it.
            sample_size: requested total number of edges.
            show_progressbar: forwarded to ``parallel_execute``.

        Returns:
            An ``EdgeSet`` over ``lexicon`` built from the emitted edges.
        """
        def _sample_process(rules: List[Rule],
                            _output_fun: Callable[..., None],
                            lexicon: Lexicon, sample_size: int) -> None:
            # Repeatedly draw a (word, rule) pair, apply the rule and emit
            # one of its outputs if that output is not a lexicon word.
            transducers = [r.to_fst() for r in rules]
            for tr in transducers:
                tr.convert(hfst.ImplementationType.HFST_OL_TYPE)
            already_drawn = set()
            emitted = 0
            while emitted < sample_size:
                w_id = random.randrange(len(lexicon))
                r_id = random.randrange(len(rules))
                source = lexicon[w_id]
                rule = rules[r_id]
                outputs = sorted(
                    res[0].replace(hfst.EPSILON, '')
                    for res in transducers[r_id].lookup(source.symstr))
                if not outputs:
                    # the rule does not apply to this word
                    continue
                t_id = random.randrange(len(outputs))
                draw = (w_id, r_id, t_id)
                if draw in already_drawn:
                    continue
                already_drawn.add(draw)
                try:
                    target = LexiconEntry(outputs[t_id])
                    if target.symstr not in lexicon.items_by_symstr:
                        _output_fun(GraphEdge(source, target, rule))
                        emitted += 1
                except Exception as e:
                    logging.getLogger('main').debug(
                        'Exception during negative sampling: {}'.format(e))

        sampler_cfg = shared.config['NegativeExampleSampler']
        num_processes = sampler_cfg.getint('num_processes')
        sample_size_per_proc = int(sample_size / num_processes)
        total = sample_size_per_proc * num_processes
        edges_iter = parallel_execute(
            function=_sample_process,
            data=list(self.rule_set),
            num_processes=num_processes,
            additional_args=(lexicon, sample_size_per_proc),
            show_progressbar=show_progressbar,
            progressbar_total=total)
        return EdgeSet(lexicon, edges_iter)
Пример #13
0
def run() -> None:
    """Load lexicon, rules and graph, then initialize and save a ModelSuite.

    The 'rules-modsel' and 'graph-modsel' files are used when they exist;
    otherwise the 'rules' and 'graph' files are loaded instead.
    """
    log = logging.getLogger('main')

    log.info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    log.info('Loading rules...')
    modsel_rules = shared.filenames['rules-modsel']
    if file_exists(modsel_rules):
        rule_set = RuleSet.load(modsel_rules)
    else:
        rule_set = RuleSet.load(shared.filenames['rules'])

    modsel_graph = shared.filenames['graph-modsel']
    if file_exists(modsel_graph):
        graph_path = modsel_graph
    else:
        graph_path = shared.filenames['graph']
    log.info('Loading the graph...')
    full_graph = FullGraph(lexicon,
                           EdgeSet.load(graph_path, lexicon, rule_set))

    # initialize a ModelSuite and save it
    log.info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    log.info('Saving the model...')
    model.save()
Пример #14
0
def run() -> None:
    """Run iterative model selection and save the reduced graph and rules.

    In every iteration an ``MCMCGraphSampler`` is run on the current graph,
    the model is refitted from the sampled edge frequencies, and every rule
    whose value in the second result of ``compute_rule_stats()`` is negative
    is marked as deleted; edges using deleted rules are removed from the
    graph.  Afterwards the remaining edges are saved as 'graph-modsel' and
    the rules not marked as deleted as 'rules-modsel'.
    """
    log = logging.getLogger('main')

    log.info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    log.info('Loading rules...')
    rule_set = RuleSet.load(shared.filenames['rules'])

    log.info('Loading the graph...')
    full_graph = FullGraph(
        lexicon,
        EdgeSet.load(shared.filenames['graph'], lexicon, rule_set))

    log.info('Initializing the model...')
    model = ModelSuite(rule_set, lexicon=lexicon)
    model.initialize(full_graph)
    deleted_rules = set()

    modsel_cfg = shared.config['modsel']
    for _ in range(modsel_cfg.getint('iterations')):
        sampler = MCMCGraphSampler(full_graph, model,
                                   modsel_cfg.getint('warmup_iterations'),
                                   modsel_cfg.getint('sampling_iterations'))
        sampler.add_stat('acc_rate', AcceptanceRateStatistic(sampler))
        sampler.add_stat('edge_freq', EdgeFrequencyStatistic(sampler))
        sampler.add_stat('exp_cost', ExpectedCostStatistic(sampler))
        sampler.run_sampling()

        # fit the model: a node's root weight is one minus the sampled
        # frequencies of its incoming edges
        edge_weights = sampler.stats['edge_freq'].value()
        root_weights = np.ones(len(full_graph.lexicon))
        for e_id, weight in enumerate(edge_weights):
            target = full_graph.edge_set[e_id].target
            root_weights[full_graph.lexicon.get_id(target)] -= weight
        model.fit(sampler.lexicon, sampler.edge_set, root_weights,
                  edge_weights)

        # compute the rule statistics
        freq, contrib = sampler.compute_rule_stats()

        # determine the rules to delete
        deleted_rules.update(np.where(contrib < 0)[0])
        log.info('{} rules deleted.'.format(len(deleted_rules)))

        # delete the edges with selected rules from the graph
        full_graph.remove_edges(
            [edge for edge in full_graph.edges_iter()
             if model.rule_set.get_id(edge.rule) in deleted_rules])

        # deleting the rules is not necessary -- instead, save the reduced
        # rule set at the end; fitting will be performed separately

    log.info('Saving the graph...')
    full_graph.edge_set.save(shared.filenames['graph-modsel'])

    # remove the deleted rules from the rule set and save it
    log.info('Saving the rule set...')
    new_rule_set = RuleSet()
    for r_id, rule in enumerate(rule_set):
        if r_id in deleted_rules:
            continue
        new_rule_set.add(rule, rule_set.get_domsize(rule))
    new_rule_set.save(shared.filenames['rules-modsel'])
Пример #15
0
def run() -> None:
    """Run the MCMC graph sampler on a saved model and write its results.

    The 'rules-modsel' and 'graph-modsel' files are used when they exist;
    otherwise the 'rules' and 'graph' files are loaded instead.  The
    statistics to collect are chosen by the 'stat_*' flags of the 'sample'
    config section.  After sampling, the summary, root and edge costs, the
    path of every lexicon entry ('paths.txt') and the number of nodes per
    depth (debug log) are written out.
    """
    log = logging.getLogger('main')

    log.info('Loading lexicon...')
    lexicon = Lexicon.load(shared.filenames['wordlist'])

    log.info('Loading rules...')
    modsel_rules = shared.filenames['rules-modsel']
    if file_exists(modsel_rules):
        rule_set = RuleSet.load(modsel_rules)
    else:
        rule_set = RuleSet.load(shared.filenames['rules'])

    modsel_graph = shared.filenames['graph-modsel']
    if file_exists(modsel_graph):
        graph_path = modsel_graph
    else:
        graph_path = shared.filenames['graph']
    log.info('Loading the graph...')
    full_graph = FullGraph(lexicon,
                           EdgeSet.load(graph_path, lexicon, rule_set))

    # load the previously saved ModelSuite
    log.info('Loading the model...')
    model = ModelSuite.load()

    # setup the sampler
    log.info('Setting up the sampler...')
    sample_cfg = shared.config['sample']
    sampler = MCMCGraphSamplerFactory.new(
        full_graph,
        model,
        warmup_iter=sample_cfg.getint('warmup_iterations'),
        sampling_iter=sample_cfg.getint('sampling_iterations'),
        iter_stat_interval=sample_cfg.getint('iter_stat_interval'),
        depth_cost=shared.config['Models'].getfloat('depth_cost'))
    # (config flag, statistic name, class name in `stats`); the class is
    # only looked up when its flag is enabled
    optional_stats = (
        ('stat_cost', 'cost', 'ExpectedCostStatistic'),
        ('stat_acc_rate', 'acc_rate', 'AcceptanceRateStatistic'),
        ('stat_iter_cost', 'iter_cost', 'CostAtIterationStatistic'),
        ('stat_edge_freq', 'edge_freq', 'EdgeFrequencyStatistic'),
        ('stat_undirected_edge_freq', 'undirected_edge_freq',
         'UndirectedEdgeFrequencyStatistic'),
        ('stat_rule_freq', 'freq', 'RuleFrequencyStatistic'),
        ('stat_rule_contrib', 'contrib',
         'RuleExpectedContributionStatistic'),
    )
    for flag, stat_name, class_name in optional_stats:
        if sample_cfg.getboolean(flag):
            sampler.add_stat(stat_name, getattr(stats, class_name)(sampler))

    # run sampling and print results
    log.info('Running sampling...')
    sampler.run_sampling()
    sampler.summary()

    sampler.save_root_costs('sample-root-costs.txt')
    sampler.save_edge_costs('sample-edge-costs.txt')

    # save paths to a file
    total_path_len = 0
    with open_to_write('paths.txt') as fp:
        for entry in lexicon:
            root = sampler.branching.root(entry)
            path = sampler.branching.path(root, entry)
            path.reverse()
            subtree = sampler.branching.subtree_size(root)
            fp.write(' <- '.join(map(str, path)) +
                     ' ({}, {})\n'.format(len(path), subtree))
            total_path_len += len(path)
    log.debug('Average path length: {}'
              .format(total_path_len / len(lexicon)))

    # save rule frequency model fits to a file
    # NOTE(review): this compares the model component itself with a string
    # and then reads .means/.sdevs from it -- confirm this is intended.
    if model.edge_frequency_model == 'lognormal':
        with open_to_write('freqmodel.txt') as fp:
            for r_id, rule in enumerate(model.rule_set):
                freq_model = model.edge_frequency_model
                write_line(fp, (rule, freq_model.means[r_id],
                                freq_model.sdevs[r_id]))

    # count words at each depth in the graph, starting from parentless nodes
    counts_per_depth = defaultdict(int)
    pending = [(word, 0) for word in lexicon
               if sampler.branching.parent(word) is None]
    while pending:
        word, depth = pending.pop()
        counts_per_depth[depth] += 1
        pending.extend((child, depth + 1)
                       for child in sampler.branching.successors(word))
    log.debug('Number of nodes per depth:')
    for depth, count in counts_per_depth.items():
        log.debug('{} {}'.format(depth, count))
Пример #16
0
 def edges_cost(self, edge_set: EdgeSet) -> np.ndarray:
     """Return the cost of every edge, taken from its rule.

     All edges of a rule get that rule's entry of ``self._rule_appl_cost``.
     The result is indexed by edge id; edges not listed by
     ``get_edge_ids_by_rule()`` keep cost 0.
     """
     costs_by_edge = np.zeros(len(edge_set))
     ids_by_rule = edge_set.get_edge_ids_by_rule()
     for rule, ids in ids_by_rule.items():
         rule_idx = self.rule_set.get_id(rule)
         costs_by_edge[ids] = self._rule_appl_cost[rule_idx]
     return costs_by_edge
Пример #17
0
 def edges_prob(self, edges: EdgeSet) -> np.ndarray:
     """Return the probability of every edge, taken from its rule.

     All edges of a rule get that rule's entry of ``self.rule_prob``.  The
     result is indexed by edge id; edges not listed by
     ``get_edge_ids_by_rule()`` keep probability 0.
     """
     probs_by_edge = np.zeros(len(edges))
     ids_by_rule = edges.get_edge_ids_by_rule()
     for rule, ids in ids_by_rule.items():
         rule_idx = self.rule_set.get_id(rule)
         probs_by_edge[ids] = self.rule_prob[rule_idx]
     return probs_by_edge
Пример #18
0
 def analyze(self, target: LexiconEntry, compute_cost=True, **kwargs) \
            -> List[GraphEdge]:
     """Return candidate analyses (incoming edges) of ``target`` by cost.

     Candidate sources are the lexicon entries found for the words that
     ``self.fst`` returns for the target.  If back-formation is enabled and
     fewer than ``self.max_results`` edges were found (or no limit is set),
     sources produced by ``self.inv_rules_tr`` are tried as well.  With
     ``self.include_roots`` an edge with neither source nor rule is added,
     standing for the analysis of the target as a root.

     Each edge gets its cost in ``edge.attr['cost']``.  ``compute_cost``
     and ``**kwargs`` are not used by this method.

     Returns:
         The edges sorted by ascending cost, truncated to
         ``self.max_results`` if that is not ``None``.
     """
     main_log = logging.getLogger('main')
     rule_set = self.model.rule_set
     # TODO 1a. if predict_tag: get possible tags from the tag predictor
     # 1. get possible sources for the given target
     sources = {entry
                for word, cost in self.fst.lookup(target.symstr)
                for entry in self.lexicon.get_by_symstr(word)}
     # 2. get possible (source, rule) pairs (extract rules) and score them
     edge_set = EdgeSet(self.lexicon)
     for source in sources:
         for rule in extract_all_rules(source, target):
             if rule not in rule_set:
                 continue
             if self.predict_vec:
                 # the edge points to a copy of the target carrying the
                 # predicted feature vector
                 target_pr = target.copy()
                 edge = GraphEdge(source, target_pr, rule)
                 target_pr.vec = self.model.predict_target_feature_vec(
                     edge)
                 edge_set.add(edge)
             else:
                 edge_set.add(GraphEdge(source, target, rule))
     # back-formation
     below_limit = self.max_results is None or \
         len(edge_set) < self.max_results
     if self.enable_back_formation and below_limit:
         candidate_words = set()
         for w, c in self.inv_rules_tr.lookup(target.symstr):
             try:
                 candidate_words.add(
                     unnormalize_word(re.sub(hfst.EPSILON, '', w)))
             except Exception as e:
                 main_log.warning(str(e))
         new_sources = []
         for word in candidate_words:
             try:
                 new_sources.append(LexiconEntry(word))
             except Exception as e:
                 main_log.warning(str(e))
         for source in new_sources:
             for rule in extract_all_rules(source, target):
                 if rule in rule_set:
                     edge_set.add(GraphEdge(source, target, rule))
     # analysis as root
     if self.include_roots:
         edge_set.add(GraphEdge(None, target, None))
     # scoring
     # FIXME this is inefficient and may break on some model components
     #   that don't have the method .edge_cost()
     for edge in edge_set:
         edge.attr['cost'] = 0
         if edge.source is None:
             edge.attr['cost'] += self.model.root_cost(edge.target)
         else:
             edge.attr['cost'] += self.model.edge_cost(edge)
             if edge.source not in self.lexicon:
                 # a source outside the lexicon also pays its root cost
                 edge.attr['cost'] += self.model.root_cost(edge.source)
     # 4. sort the analyses according to the cost
     results = sorted(edge_set, key=lambda e: e.attr['cost'])
     if self.max_results is not None:
         results = results[:self.max_results]
     return results
Пример #19
0
    def test_complete_sample(self) -> None:
        """Test a sample consisting of all possible negative edges."""
        words = [
            'machen', 'macht', 'mache', 'Sachen', 'Sache', 'anwinkeln',
            'anzuwinkeln'
        ]
        rules = [
            ':/en:t___:',
            ':/n:___:',
            ':/a:ä/:er___:',
            ':/:zu/:___:'
        ]
        positive_edges = [
            ('machen', 'macht', ':/en:t___:'),
            ('machen', 'mache', ':/n:___:'),
            ('Sachen', 'Sache', ':/n:___:'),
            ('anwinkeln', 'anzuwinkeln', ':/:zu/:___:'),
        ]
        expected_negative_edges = [
            ('Sachen', '{CAP}sacht', ':/en:t___:'),
            ('anwinkeln', 'anwinkel', ':/n:___:'),
            ('anzuwinkeln', 'anzuwinkel', ':/n:___:'),
            ('machen', 'mächener', ':/a:ä/:er___:'),
            ('macht', 'mächter', ':/a:ä/:er___:'),
            ('mache', 'mächeer', ':/a:ä/:er___:'),
            ('Sachen', '{CAP}sächener', ':/a:ä/:er___:'),
            ('Sache', '{CAP}sächeer', ':/a:ä/:er___:'),
            ('machen', 'mzuachen', ':/:zu/:___:'),
            ('machen', 'mazuchen', ':/:zu/:___:'),
            ('machen', 'maczuhen', ':/:zu/:___:'),
            ('machen', 'machzuen', ':/:zu/:___:'),
            ('machen', 'machezun', ':/:zu/:___:'),
            ('mache', 'mzuache', ':/:zu/:___:'),
            ('mache', 'mazuche', ':/:zu/:___:'),
            ('mache', 'maczuhe', ':/:zu/:___:'),
            ('mache', 'machzue', ':/:zu/:___:'),
            ('macht', 'mzuacht', ':/:zu/:___:'),
            ('macht', 'mazucht', ':/:zu/:___:'),
            ('macht', 'maczuht', ':/:zu/:___:'),
            ('macht', 'machzut', ':/:zu/:___:'),
            ('Sachen', '{CAP}zusachen', ':/:zu/:___:'),
            ('Sachen', '{CAP}szuachen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sazuchen', ':/:zu/:___:'),
            ('Sachen', '{CAP}saczuhen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sachzuen', ':/:zu/:___:'),
            ('Sachen', '{CAP}sachezun', ':/:zu/:___:'),
            ('Sache', '{CAP}zusache', ':/:zu/:___:'),
            ('Sache', '{CAP}szuache', ':/:zu/:___:'),
            ('Sache', '{CAP}sazuche', ':/:zu/:___:'),
            ('Sache', '{CAP}saczuhe', ':/:zu/:___:'),
            ('Sache', '{CAP}sachzue', ':/:zu/:___:'),
            ('anwinkeln', 'azunwinkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwzuinkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwizunkeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinzukeln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkzueln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkezuln', ':/:zu/:___:'),
            ('anwinkeln', 'anwinkelzun', ':/:zu/:___:'),
            ('anzuwinkeln', 'azunzuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuzuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzzuuwinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwzuinkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwizunkeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinzukeln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkzueln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkezuln', ':/:zu/:___:'),
            ('anzuwinkeln', 'anzuwinkelzun', ':/:zu/:___:')
        ]
        expected_weights = {
            ':/en:t___:' : 1.0,
            ':/n:___:' : 1.0,
            ':/a:ä/:er___:' : 1.0,
            ':/:zu/:___:' : 41/40       # the word "anzuzuwinkeln" can be
                                        # derived in two different ways, so
                                        # it is counted double in domsize
                                        # computation, but sampled only once;
                                        # such cases are very rare, so they
                                        # shouldn't influence the weights much
        }

        # build the lexicon, the rule set (with domain sizes computed on
        # the lexicon transducer) and the set of positive edges
        lexicon = Lexicon(LexiconEntry(word) for word in words)
        lexicon_fst = lexicon.to_fst()
        rule_set = RuleSet()
        for rule_str in rules:
            parsed_rule = Rule.from_string(rule_str)
            rule_set.add(parsed_rule, parsed_rule.compute_domsize(lexicon_fst))
        edge_set = EdgeSet(
            lexicon,
            (GraphEdge(lexicon[src], lexicon[tgt], rule_set[rule_str])
             for (src, tgt, rule_str) in positive_edges))

        # request as many negative examples as there are possible ones
        negex_sampler = NegativeExampleSampler(rule_set)
        negatives = negex_sampler.sample(lexicon,
                                         len(expected_negative_edges),
                                         show_progressbar=False)
        negative_weights = \
            negex_sampler.compute_sample_weights(negatives, edge_set)

        expected_domsizes = (2, 4, 5, 42)
        for rule_idx, domsize in enumerate(expected_domsizes):
            self.assertEqual(rule_set.get_domsize(rule_set[rule_idx]),
                             domsize)
        self.longMessage = False
        for positive in edge_set:
            self.assertNotIn(
                positive,
                negatives,
                msg='positive edge: {} in sample'.format(positive))
        for src, tgt, rule_str in expected_negative_edges:
            expected = GraphEdge(lexicon[src], LexiconEntry(tgt),
                                 rule_set[rule_str])
            self.assertIn(expected, negatives,
                          msg='{} not in sample'.format(expected))
        self.longMessage = True
        for idx, sampled in enumerate(negatives):
            self.assertAlmostEqual(negative_weights[idx],
                                   expected_weights[str(sampled.rule)],
                                   msg='for edge {}'.format(sampled))