Example #1
def time_hypergraph_masked(dp, scores):
    chart = np.zeros(len(dp.hypergraph.vertices))
    mask = np.array(np.random.choice([1] * 2,
                                     len(dp.hypergraph.vertices)),
                    dtype=np.uint8)
    # print  np.sum(mask) / float(len(mask))

    for _ in range(1000):
        pydecode.best_path(dp.hypergraph, scores,
                           chart=chart, mask=mask)
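
In the benchmark above, the chart buffer is allocated once and reused across all 1000 decodes, and mask is a per-vertex uint8 vector. Note that np.random.choice([1] * 2, n) samples from a list containing only ones, so the mask comes out all ones; if a genuinely mixed 0/1 mask were wanted (my assumption about the intent, not part of the original benchmark), a minimal sketch would be:

import numpy as np

n_vertices = 1000  # stand-in for len(dp.hypergraph.vertices)
mask = np.random.choice([0, 1], n_vertices).astype(np.uint8)  # roughly half zeros, half ones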
Example #2
def check_best_path_matrix(graph):
    """
    Test viterbi path finding using matrix representation.
    """
    scores = numpy.random.random(len(graph.edges))
    path = pydecode.best_path(graph, scores)
    path.v
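
The pattern in this test recurs throughout these examples: score every edge, call pydecode.best_path, and read the result off the returned path. A minimal self-contained sketch, assuming the utils.random_hypergraph() test helper that appears in the later examples:

import numpy
import pydecode
import utils  # test helper module used elsewhere in these examples (an assumption)

graph, _ = utils.random_hypergraph()
scores = numpy.random.random(len(graph.edges))
path = pydecode.best_path(graph, scores)
best_score = scores.T * path.v  # path.v indicates which edges the Viterbi path uses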
Example #3
File: model.py Project: junaraki/PyDecode
    def inference(self, x, w, relaxed=False):
        graph, encoder = self.dynamic_program(x)
        if ("IND", x) in self.cache:
            feature_indices = self.cache["IND", x]
        else:
            graph_labels = graph.labeling[graph.labeling != -1]
            parts = encoder.transform_labels(graph_labels)
            parts_features = self.parts_features(x, parts)

            feature_indices = sparse_feature_indices(parts_features,
                                                     self.temp_shape,
                                                     self.offsets,
                                                     self.feature_hash)
            if self._use_part_feature_cache and self.part_cache(x):

                self.cache["IND", x] = feature_indices

        # Sum the feature weights for the features in each label row.
        label_weights = np.zeros(len(graph.labeling))
        label_weights[graph.labeling != -1] = \
            np.sum(np.take(w, feature_indices, mode="clip"), axis=1)

        # weights = pydecode.transform(graph, label_weights)
        path = pydecode.best_path(graph, label_weights)
        y = encoder.transform_path(path)
        return y
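
The weight-scatter step above is worth isolating: np.take(w, feature_indices, mode="clip") gathers the weight of every feature id (clipping out-of-range ids into the table), the row sums give one score per labeled part, and the boolean mask on graph.labeling writes those scores only onto labeled edges. A toy, self-contained version with made-up sizes:

import numpy as np

w = np.arange(10.0)                            # hashed feature weights
feature_indices = np.array([[0, 3], [2, 9]])   # one row of feature ids per labeled part
labeling = np.array([-1, 5, -1, 7])            # -1 marks unlabeled edges

label_weights = np.zeros(len(labeling))
label_weights[labeling != -1] = \
    np.sum(np.take(w, feature_indices, mode="clip"), axis=1)
# label_weights is now [0., 3., 0., 11.]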
Example #4
 def decode_fractional(self):
     vec = [
         pulp.value(self.edge_vars[edge.id])
         for edge in self.hypergraph.edges
     ]
     weights = pydecode.LogViterbiPotentials(
         self.hypergraph).from_vector(vec)
     return pydecode.best_path(self.hypergraph, weights)
Example #5
def test_pruning():
    for h in utils.hypergraphs():

        w = numpy.random.random(len(h.edges))

        original_path = pydecode.best_path(h, w)
        marginals = pydecode.marginals(h, w)
        best = w.T * original_path.v
        print marginals[1]
        a = np.array(marginals > 0.99 * best, dtype=np.uint8)
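
The last line builds a pruning mask: an edge survives only if its max-marginal comes within 1% of the Viterbi score. The same computation on made-up numbers:

import numpy as np

best = 10.0
marginals = np.array([10.0, 9.95, 7.0, 9.91])
a = np.array(marginals > 0.99 * best, dtype=np.uint8)
# a == [1, 1, 0, 1]; edges marked 0 fall short of the Viterbi score and
# therefore cannot lie on the best path.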
Example #6
def check_best_path(graph, max_potentials):
    """
    Test viterbi path finding.
    """
    path = pydecode.best_path(graph, max_potentials)
    nt.assert_not_equal(max_potentials.T * path.v, 0.0)
    utils.valid_path(graph, path)
    same = False
    for other_path in utils.all_paths(graph):
        assert max_potentials.T * path.v >= max_potentials.T * other_path.v
        if path == other_path:
            same = True
    assert same
Example #7
    def argmax(self, reparams):
        new_pot = self._reparam(reparams)
        path = pydecode.best_path(self.hypergraph, new_pot)

        labels = self.labels.dot(path)
        # for edge in path.edges:
        #     print edge, edge.head.label, edge.tail[0].label, self.labels[edge], "|",
        # # print
        # print labels
        argmax = [-1] * self.num_variables
        for i, l in labels:
            argmax[i] = l
        # print argmax
        assert (-1 not in argmax)
        # print argmax
        return argmax
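
Here labels.dot(path) yields sparse (variable index, value) pairs for the edges on the decoded path, which are then scattered into a dense assignment. The read-out in isolation:

labels = [(0, 2), (1, 0), (2, 1)]   # hypothetical (i, l) pairs from labels.dot(path)
num_variables = 3
argmax = [-1] * num_variables
for i, l in labels:
    argmax[i] = l
assert -1 not in argmax             # every variable was assigned by some edge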
Example #8
    def score(self, labels):
        s = set(enumerate(labels))
        score = 0.0

        binary = pydecode.BoolPotentials(self.hypergraph)\
            .from_vector([1 if all((l in s for l in self.labels[edge])) else 0
                          for edge in self.hypergraph.edges])

        path = pydecode.best_path(self.hypergraph, binary)
        # if all((l in s for l in self.labels[edge])):
        #     score += self.weights[edge]
        #     for l in self.labels[edge]:
        #         s.remove(l)

        if len(path.edges) == 0:
            return -1e9
        return self.weights.dot(path)
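
The BoolPotentials vector marks an edge as allowed only when every (index, label) pair it carries agrees with the candidate labeling, and the code treats an empty returned path as infeasible, scoring it -1e9. The edge filter in isolation:

s = set(enumerate([2, 0, 1]))        # candidate labeling as (i, l) pairs
edge_labels = [(0, 2), (1, 0)]       # hypothetical labels carried by one edge
allowed = 1 if all(l in s for l in edge_labels) else 0
assert allowed == 1                  # both pairs agree with the labeling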
Example #9
def check_max_marginals(graph, pot):
    """
    Test that max-marginals are correct.
    """

    path = pydecode.best_path(graph, pot)
    best = pot.T * path.v
    # print "BEST"
    # print "\n".join(["%20s : %s" % (edge.label, pot[edge.id])
    #                  for edge in path.edges])
    # print best
    nt.assert_not_equal(best, 0.0)
    max_marginals = pydecode.marginals(graph, pot)

    # Array-form.
    for edge in graph.edges:
        other = max_marginals[edge.id]
        nt.assert_less_equal(other, best + 1e-4)

    # Matrix-form.
    assert (max_marginals < best + 1e-4).all()
Example #10
def check_outside(graph, pot):
    """
    Test outside chart properties.
    """
    print graph
    path = pydecode.best_path(graph, pot)
    chart = pydecode.inside(graph, pot)
    print pot.shape, path.v.shape
    best = pot.T * path.v
    print path.v
    print best
    nt.assert_almost_equal(best, chart[graph.root.id])
    nt.assert_not_equal(best, 0.0)

    out_chart = pydecode.outside(graph, pot, chart)

    # Array-form
    for vertex in graph.vertices:
        other = chart[vertex.id] + out_chart[vertex.id]
        nt.assert_less_equal(other, best + 1e-4,
                             "%f %f %d %f %f"%(other, best, vertex.id,
                                         chart[vertex.id], out_chart[vertex.id]))

    # Matrix-form
    m = chart + out_chart
    assert (m < best + 1e-4).all()

    # for node in graph.nodes:
    #     other = chart[node] * out_chart[node]
    #     nt.assert_less_equal(other, best + 1e-4)
    print chart
    print out_chart

    for edge in path.edges:
        for node in edge.tail:
            if node.is_terminal:
                other = out_chart[node.id]
                nt.assert_almost_equal(other, best)
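
The quantities being checked: for Viterbi potentials, chart[v] + out_chart[v] is the score of the best derivation passing through vertex v, so it never exceeds the global best, and the test further asserts that the outside score of the terminal vertices on the Viterbi path equals the best. The bound on made-up numbers:

import numpy as np

best = 7.0
inside_chart = np.array([0.0, 3.0, 7.0])    # hypothetical per-vertex inside scores
outside_chart = np.array([7.0, 3.5, 0.0])   # matching outside scores
m = inside_chart + outside_chart
assert (m < best + 1e-4).all()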
Example #11
    def __call__(self, x, x_diff):
        """
        Parameters
        ----------

        Returns
        --------
        path, subgradient, dual_score
        """

        if x_diff is None:
            pydecode.pairwise_dot(self.constraint_potentials, x, self.dual_weights)
        else:
            pydecode.pairwise_dot(self.constraint_potentials, x_diff,
                            self.dual_weights)

        path = pydecode.best_path(self.current_graph, self.dual_weights)
        dual_score = self.dual_weights.dot(path)
        constraint_vector = self.constraint_potentials.dot(path)
        subgradient = np.zeros(len(x))
        for i, j in constraint_vector:
            subgradient[i] = j
        return path, subgradient, dual_score
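
This is one subgradient step of a dual decomposition loop: pairwise_dot refreshes the dual weights from the current duals x (or their change x_diff), best_path re-decodes the relaxed problem, and the constraint activations on the decoded path become the subgradient. The final scatter in isolation:

import numpy as np

constraint_vector = [(0, 1.0), (2, -1.0)]   # hypothetical (constraint, violation) pairs
subgradient = np.zeros(4)
for i, j in constraint_vector:
    subgradient[i] = j
# subgradient == [1., 0., -1., 0.]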
Example #12
    def special_decode(self, method, problem, hypergraph, scores, constraints,
                       scorer):
        if method == "CUBE":
            groups = [node.label.i for node in hypergraph.nodes]
            ins = ph.inside(hypergraph, scores)
            out = ph.outside(hypergraph, scores, ins)

            beam_chart = ph.beam_search_BinaryVector(
                hypergraph,
                scores,
                constraints.to_binary_potentials(),
                out,
                -10000,
                groups, [1000] * len(groups),
                cube_pruning=True)
            return beam_chart.path(0)

        elif method == "BEAM":
            groups = [node.label.i for node in hypergraph.nodes]
            ins = ph.inside(hypergraph, scores)
            out = ph.outside(hypergraph, scores, ins)

            beam_chart = ph.beam_search_BinaryVector(
                hypergraph, scores, constraints.to_binary_potentials(), out,
                -10000, groups, [1000] * len(groups))
            return beam_chart.path(0)
        elif method == "MULTIDFA":
            old = hypergraph
            old_hmap = None

            for j in range(problem.size):
                states = 2
                symbols = 2
                dfa = ph.DFA(states, symbols, [{0: 0, 1: 1}, {0: 1}], [1])
                vec = [(1 if (edge.head.label.j == j) else 0)
                       for edge in old.edges]
                counts = ph.CountingPotentials(old).from_vector(vec)
                hmap = ph.extend_hypergraph_by_dfa(old, counts, dfa)
                old = hmap.domain_hypergraph
                old.labeling = ph.Labeling(
                    old, [hmap[node].label for node in old.nodes], None)
                #new_scores = old_scores.up_project(old, hmap)
                if old_hmap is not None:
                    old_hmap = old_hmap.compose(hmap)
                else:
                    old_hmap = hmap
                # old_scores = new_scores
            new_scores = scores.up_project(old, old_hmap)
            #new_scores = self.potentials(old, scorer)
            return ph.best_path(old, new_scores)

        elif method == "BIGDFA":
            old = hypergraph
            states = 2**problem.size
            symbols = problem.size + 1
            final_state = 0
            for i in range(problem.size):
                final_state |= 2**i

            transitions = \
                [{j : i | 2**j for j in range(symbols) if i & 2**j == 0}
                 for i in range(states)]
            dfa = ph.DFA(states, symbols, transitions, [final_state])
            vec = [edge.head.label.j for edge in old.edges]
            counts = ph.CountingPotentials(old).from_vector(vec)
            hmap = ph.extend_hypergraph_by_dfa(old, counts, dfa)
            old = hmap.domain_hypergraph
            old.labeling = ph.Labeling(
                old, [hmap[node].label for node in old.nodes], None)
            new_scores = scores.up_project(old, hmap)
            return ph.best_path(old, new_scores)
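
In the MULTIDFA branch, each constraint j extends the hypergraph with a two-state DFA whose sole accepting state is reached once symbol 1 (an edge whose head has label j) has been seen exactly once, and the per-constraint vertex maps are composed so labels can be projected back. A toy simulation of that DFA (the simulator itself is my addition, not part of pydecode):

transitions = [{0: 0, 1: 1}, {0: 1}]   # the table passed to ph.DFA above
accepting = [1]

def accepts(symbols, state=0):
    for s in symbols:
        if s not in transitions[state]:
            return False               # a second 1 has no outgoing transition
        state = transitions[state][s]
    return state in accepting

assert accepts([0, 1, 0])              # exactly one 1: accepted
assert not accepts([0, 0])             # the constraint never fires
assert not accepts([1, 1])             # the constraint fires twice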
Example #13
def time_hypergraph(dp, scores):
    chart = np.zeros(len(dp.hypergraph.vertices))
    for _ in range(1000):
        pydecode.best_path(dp.hypergraph, scores,
                           chart=chart)
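
As in the masked variant of Example #1, the chart buffer is allocated once and handed back to best_path on every call, so the loop measures decoding rather than repeated allocation. A sketch of wrapping it with timeit (timeit is my addition; the original presumably runs under its own harness):

import timeit

import numpy as np
import pydecode

def time_best_path(dp, scores, n=1000):
    chart = np.zeros(len(dp.hypergraph.vertices))   # reused across all n decodes
    return timeit.timeit(
        lambda: pydecode.best_path(dp.hypergraph, scores, chart=chart),
        number=n)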
Example #14
def main():
    parser = argparse.ArgumentParser(description='Run parsing experiments.')
    parser.add_argument('--original_rules', type=str, help='Original rule file')
    parser.add_argument('--binarized_rules', type=str, help='Binarized rule file')
    parser.add_argument('--training_ps', type=str,
                        help='Lexicalized phrase structure file.')
    parser.add_argument('--training_dep', type=str,
                        help='Dependency parse file.')
    parser.add_argument('--store_hypergraph_dir', type=str,
                        help='Directory to store/load hypergraphs.')
    parser.add_argument('--save_hypergraph', type=bool,
                        help='Construct and save hypergraphs.')
    parser.add_argument('--limit', type=int, help='Number of sentences to use.')
    parser.add_argument('--test_file', type=str, help='Test file.')
    parser.add_argument('--gold_file', type=str, help='Gold file.')
    parser.add_argument('--model', type=str, help='Weight model.')
    parser.add_argument('--test_limit', type=int, help='Number of sentences to test on.')
    parser.add_argument('--run_eval', default=False, type=bool, help='')
    parser.add_argument('--test_load', default=False, type=bool, help='')

    parser.add_argument('--debugger', default=False, type=bool, help='')
    parser.add_argument('--oracle', default=False, type=bool, help='Run oracle experiments')



    parser.add_argument('config', type=str)
    parser.add_argument('label', type=str)

    print >>sys.stderr, open(sys.argv[1]).read()
    argparse_config.read_config_file(parser, sys.argv[1])

    args = parser.parse_args()
    print args

    if args.debugger:
        from IPython.core import ultratb
        sys.excepthook = ultratb.FormattedTB(color_scheme='Linux', call_pdb=1)

    output_dir = os.path.join("Data", args.label)
    data_out = os.path.join(output_dir, "mydata.txt")
    print >>sys.stderr, data_out

    # Set up logging.
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(open(data_out, 'w'))
    logger.addHandler(handler)

    # Load data.
    print args.training_dep
    print args.training_ps
    if args.training_dep:
        X, Y = train.read_data_set(args.training_dep,
                                   args.training_ps,
                                   args.limit)

        orules = tree.read_original_rules(open(args.original_rules))
        grammar = read_rule_set(open(args.binarized_rules))
        # for rule in grammar.unary_rules:
        #     print rule
        X, Y = zip(*[(x, y) for x, y in zip(X, Y)
                     if len(x.words) >= 5])
        binarized_Y = [tree.binarize(orules, make_bounds(x.deps), y)[0] for x, y in zip(X, Y)]


        model = train.ReconstructionModel(feature_hash=int(1e7),
                                          joint_feature_format="fast",
                                          joint_feature_cache=False,
                                          part_feature_cache=False)
        model.set_grammar(grammar)
        model.initialize(X, binarized_Y)

    if args.test_load:
        print "LOAD"
        graphs = []
        start = memory()

        for i in range(1000 -1):
            if len(X[i].words) < 5:
                continue
            x = X[i]
            path = "%s/graphs%s.graph"%(args.store_hypergraph_dir, i)

            encoder = LexicalizedCFGEncoder(x.words, x.tags, grammar)
            pre = memory()
            graph = pydecode.load(path)
            print i, memory() - pre, len(graph.edges), len(X[i].words), memory() - start
            pre = memory()
            encoder.load("%s/encoder%s.pickle"%(
                    args.store_hypergraph_dir, i), graph)
            print i, memory() - pre
            graphs.append((graph, encoder))

    elif args.save_hypergraph:
        print "SAVING"
        import time
        model.set_from_disk(None)
        for i in range(40000):
            if len(X[i].words) < 5:
                continue
            # if len(X[i].words) > 15: continue
            graph, encoder = model.dynamic_program(X[i])

            # Sanity Check

            # print binarized_Y[i]
            # print encoder.structure_path(graph, binarized_Y[i])
            if i % 100 == 0:
                print i
            pydecode.save("%s/graphs%s.graph"%(
                    args.store_hypergraph_dir, X[i].index),
                          graph)
            encoder.save("%s/encoder%s.pickle"%(
                    args.store_hypergraph_dir, X[i].index), graph)
            del graph
            del encoder

    elif args.oracle:
        print "ORACLE"
        trees_out = open(os.path.join(output_dir, "oracle.txt"), 'w')
        model = train.ReconstructionModel(feature_hash=int(1e7), part_feature_cache=False,
                                          joint_feature_cache=False,
                                          joint_feature_format="sparse")
        model.set_grammar(grammar)
        model.initialize(X, binarized_Y)
        model.set_from_disk(None)
        X_test, Y_test = train.read_data_set(
            args.test_file, args.gold_file, args.test_limit)

        w = np.load(args.model)

        # GOLD TREES
        binarized_Y_test = []

        for x, orig_y in zip(X_test, Y_test):
            y = tree.binarize(orules, orig_y)

            try:
                graph, encoder = model.dynamic_program(x)
                label_values = np.zeros(np.max(graph.labeling) + 1)
                label_values.fill(-1)

                possible = 0
                brackets = set()
                for part in encoder.transform_structure(y):
                    X = grammar.rule_nonterms(part[5])[0]
                    brackets.add((part[0], part[2], X))
                    #print part
                    if tuple(part) in encoder.encoder:
                        label = encoder.encoder[tuple(part)]
                        label_values[label] = 10.0
                        possible += 1
                print "transform"

                label_weights = np.zeros(len(graph.labeling))
                graph_labels = graph.labeling[graph.labeling != -1]
                parts = encoder.transform_labels(graph_labels)
                weights = []
                for part in parts:
                    X = grammar.rule_nonterms(part[5])[0]
                    if part[1] != part[2] and X[0] != "Z":
                        if (part[0], part[2], X) in brackets:
                            weights.append(2.0)
                        else:
                            weights.append(-2.0)
                    else:
                        weights.append(0.0)
                label_weights = np.zeros(len(graph.labeling))
                label_weights[graph.labeling != -1] = np.array(weights)

                # graph_labels = graph.labeling[graph.labeling != -1]
                # parts = encoder.transform_labels(graph_labels)
                # parts_features = model.parts_features(x, parts)
                # feature_indices = pydecode.model.sparse_feature_indices(parts_features,
                #                                          model.temp_shape,
                #                                          model.offsets,
                #                                          model.feature_hash)

                # # Sum the feature weights for the features in each label row.
                # label_weights = np.zeros(len(graph.labeling))
                # label_weights[graph.labeling != -1] = \
                #     np.sum(np.take(w, feature_indices, mode="clip"), axis=1)



                oracle_weights = pydecode.transform(graph, label_values)
                path = pydecode.best_path(graph, oracle_weights + label_weights)
                print "Match", oracle_weights.T * path.v, possible
                y_hat = encoder.transform_path(path)
                print >>trees_out, tree.remove_head(tree.unbinarize(y_hat)) \
                                       .pprint(100000)

            except:
                print >>trees_out, ""
                print "error"
                continue

    elif args.test_file:
        print "TESTING"
        trees_out = open(os.path.join(output_dir, "trees.txt"), 'w')
        model = train.ReconstructionModel(feature_hash=int(1e7), part_feature_cache=False,
                                          joint_feature_cache=False,
                                          joint_feature_format="sparse")
        model.set_grammar(grammar)
        model.initialize(X, binarized_Y)
        model.set_from_disk(None)
        X_test, Y_test = train.read_data_set(
            args.test_file, args.gold_file, args.test_limit)
        w = np.load(args.model)
        # binarized_Y_test = []
        # for i, y in enumerate(Y_test):
        #     print i
        #     binarized_Y_test.append(tree.binarize(orules, y))
        # for x, y in zip(X_test, binarized_Y_test):
        for x in X_test:
            try:
                graph, encoder = model.dynamic_program(x)

                y_hat = model.inference(x, w)
                for part in encoder.transform_structure(y_hat):
                    print part, grammar.rule_nonterms(part[-1]), model.score_part(x, w, part)
                a = w.T * model.joint_feature(x, y_hat)
                # b = w.T * model.joint_feature(x, y)
                # print a, b
                # if b > a: print "FAIL"

                print
                print tree.remove_head(y_hat)

                print
                print tree.remove_head(tree.unbinarize(y_hat))\
                          .pprint()

                # print tree.remove_head(tree.unbinarize(y))\
                #           .pprint()
                # print
                #)\tree.remove_head(
                print >>trees_out, tree.remove_head(tree.unbinarize(y_hat)) \
                                       .pprint(100000)
            except:
                print "error"
                print >>trees_out, ""

    elif args.run_eval:
        test_file = os.path.join(output_dir, "oracle.txt")
        gold_file = args.gold_file
        print "Evaling", test_file, gold_file
        os.system("../evalb/EVALB/evalb  -p ../evalb/EVALB/COLLINS.prm %s %s"%(gold_file, test_file))
    else:
        print "TRAINING"
        model.set_from_disk(args.store_hypergraph_dir)
        sp = StructuredPerceptron(model, verbose=1, max_iter=5,
                                  average=False)
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            sp.fit(X, binarized_Y)
        np.save(os.path.join(output_dir, "params"), sp.w)
        w = sp.w
Example #15
 def best_path(self, label_scores):
     self.compute_weights(label_scores)
     return pydecode.best_path(self.graph, self.weights)
Example #16
def test_diff_potentials_fail():
    h1, w1 = utils.random_hypergraph()
    h2, w2 = utils.random_hypergraph()
    pydecode.best_path(h1, w2)