def time_hypergraph_masked(dp, scores):
    """Benchmark masked best-path decoding on ``dp.hypergraph``.

    Allocates one chart and one vertex mask (both sized by the number of
    hypergraph vertices) and then calls ``pydecode.best_path`` 1000 times,
    passing the same chart and mask on every call.  Nothing is returned.
    """
    num_vertices = len(dp.hypergraph.vertices)
    chart = np.zeros(num_vertices)
    # NOTE(review): both candidate values are 1, so the sampled mask is
    # always all ones -- confirm whether [0, 1] was intended.
    sampled = np.random.choice([1, 1], num_vertices)
    mask = np.array(sampled, dtype=np.uint8)
    for _run in range(1000):
        pydecode.best_path(dp.hypergraph, scores, chart=chart, mask=mask)
def check_best_path_matrix(graph):
    """
    Test viterbi path finding using matrix representation.

    Draws one uniform random score per edge of ``graph``, finds the best
    path under those scores and reads the path's ``v`` attribute.
    """
    num_edges = len(graph.edges)
    edge_scores = numpy.random.random(num_edges)
    best = pydecode.best_path(graph, edge_scores)
    # Only the attribute access itself is exercised; the value is discarded.
    best.v
def inference(self, x, w, relaxed=False):
    """Return the best-scoring structure for ``x`` under weight vector ``w``.

    Builds the hypergraph and encoder for ``x`` via ``self.dynamic_program``,
    looks up (or computes) the feature indices of every labelled entry,
    sums the corresponding entries of ``w`` into one weight per label,
    finds the best path and converts it back with
    ``encoder.transform_path``.

    ``relaxed`` is accepted but not used by this implementation.
    """
    graph, encoder = self.dynamic_program(x)
    cache_key = ("IND", x)

    if cache_key not in self.cache:
        # Only entries whose label is not -1 correspond to parts.
        labelled = graph.labeling[graph.labeling != -1]
        parts = encoder.transform_labels(labelled)
        features = self.parts_features(x, parts)
        feature_indices = sparse_feature_indices(
            features, self.temp_shape, self.offsets, self.feature_hash)
        if self._use_part_feature_cache and self.part_cache(x):
            self.cache[cache_key] = feature_indices
    else:
        feature_indices = self.cache[cache_key]

    # Sum the feature weights for the features in each label row.
    # mode="clip" maps out-of-range indices onto the ends of w.
    per_feature = np.take(w, feature_indices, mode="clip")
    label_weights = np.zeros(len(graph.labeling))
    label_weights[graph.labeling != -1] = np.sum(per_feature, axis=1)

    best = pydecode.best_path(graph, label_weights)
    return encoder.transform_path(best)
def decode_fractional(self):
    """Return the best path when edges are weighted by their variable values.

    Reads ``pulp.value`` of the variable stored in ``self.edge_vars`` for
    every edge of ``self.hypergraph``, wraps those values in
    ``LogViterbiPotentials`` and runs ``pydecode.best_path``.
    """
    hypergraph = self.hypergraph
    edge_values = []
    for edge in hypergraph.edges:
        edge_values.append(pulp.value(self.edge_vars[edge.id]))
    potentials = pydecode.LogViterbiPotentials(hypergraph)
    weights = potentials.from_vector(edge_values)
    return pydecode.best_path(hypergraph, weights)
def test_pruning():
    """Run best-path and marginals on each utility hypergraph.

    For every hypergraph from ``utils.hypergraphs()`` this draws random
    edge weights, computes the best path and the marginals, prints
    ``marginals[1]`` and builds a uint8 array marking the marginals that
    exceed 99% of the best score.  No assertion is made on the array.
    """
    for hypergraph in utils.hypergraphs():
        weights = numpy.random.random(len(hypergraph.edges))
        best_path = pydecode.best_path(hypergraph, weights)
        marginals = pydecode.marginals(hypergraph, weights)
        best = weights.T * best_path.v
        print(marginals[1])
        threshold = 0.99 * best
        a = np.array(marginals > threshold, dtype=np.uint8)
def test_pruning():
    """Run best-path and marginals on each utility hypergraph.

    For every hypergraph from ``utils.hypergraphs()`` this draws random
    edge weights, computes the best path and the marginals, prints
    ``marginals[1]`` and builds a uint8 array marking the marginals that
    exceed 99% of the best score.  No assertion is made on the array.
    """
    for hypergraph in utils.hypergraphs():
        weights = numpy.random.random(len(hypergraph.edges))
        best_path = pydecode.best_path(hypergraph, weights)
        marginals = pydecode.marginals(hypergraph, weights)
        best = weights.T * best_path.v
        print(marginals[1])
        threshold = 0.99 * best
        a = np.array(marginals > threshold, dtype=np.uint8)
def check_best_path(graph, max_potentials):
    """
    Test viterbi path finding.

    Checks that the best path has a non-zero score, is a valid path of
    ``graph``, scores at least as high as every path enumerated by
    ``utils.all_paths`` and is itself one of those paths.
    """
    path = pydecode.best_path(graph, max_potentials)
    best_score = max_potentials.T * path.v
    nt.assert_not_equal(best_score, 0.0)
    utils.valid_path(graph, path)

    found_in_enumeration = False
    for candidate in utils.all_paths(graph):
        candidate_score = max_potentials.T * candidate.v
        assert best_score >= candidate_score
        if path == candidate:
            found_in_enumeration = True
    assert found_in_enumeration
def argmax(self, reparams):
    """Return the variable assignment read off the best path.

    The potentials produced by ``self._reparam(reparams)`` are decoded with
    ``pydecode.best_path``; ``self.labels.dot(path)`` then yields
    ``(index, value)`` pairs that are written into a list of length
    ``self.num_variables``.  Asserts that every position was assigned.
    """
    potentials = self._reparam(reparams)
    best = pydecode.best_path(self.hypergraph, potentials)
    pairs = self.labels.dot(best)

    # -1 marks a variable that no pair has assigned yet.
    assignment = [-1] * self.num_variables
    for variable, value in pairs:
        assignment[variable] = value
    assert -1 not in assignment
    return assignment
def argmax(self, reparams):
    """Return the variable assignment read off the best path.

    The potentials produced by ``self._reparam(reparams)`` are decoded with
    ``pydecode.best_path``; ``self.labels.dot(path)`` then yields
    ``(index, value)`` pairs that are written into a list of length
    ``self.num_variables``.  Asserts that every position was assigned.
    """
    potentials = self._reparam(reparams)
    best = pydecode.best_path(self.hypergraph, potentials)
    pairs = self.labels.dot(best)

    # -1 marks a variable that no pair has assigned yet.
    assignment = [-1] * self.num_variables
    for variable, value in pairs:
        assignment[variable] = value
    assert -1 not in assignment
    return assignment
def score(self, labels):
    """Score the assignment ``labels`` against ``self.weights``.

    An edge is allowed (potential 1) only when every ``(index, label)``
    pair in ``self.labels[edge]`` occurs in ``enumerate(labels)``.  The
    best path under these boolean potentials is scored with
    ``self.weights.dot``; ``-1e9`` is returned when that path has no edges.
    """
    chosen = set(enumerate(labels))
    potentials = pydecode.BoolPotentials(self.hypergraph)
    allowed = [
        1 if all(pair in chosen for pair in self.labels[edge]) else 0
        for edge in self.hypergraph.edges
    ]
    binary = potentials.from_vector(allowed)
    path = pydecode.best_path(self.hypergraph, binary)

    if len(path.edges) == 0:
        return -1e9
    return self.weights.dot(path)
def check_max_marginals(graph, pot):
    """
    Test that max-marginals are correct.

    The best-path score must be non-zero, and no edge marginal may exceed
    it by more than 1e-4.  The bound is checked once edge by edge and once
    on the whole marginal array.
    """
    path = pydecode.best_path(graph, pot)
    best = pot.T * path.v
    nt.assert_not_equal(best, 0.0)

    max_marginals = pydecode.marginals(graph, pot)
    upper_bound = best + 1e-4

    # Array-form.
    for edge in graph.edges:
        nt.assert_less_equal(max_marginals[edge.id], upper_bound)

    # Matrix-form.
    within_bound = max_marginals < upper_bound
    assert within_bound.all()
def check_outside(graph, pot):
    """
    Test outside chart properties.

    Checks that:
      * the best-path score equals the inside chart value at the root and
        is non-zero;
      * for every vertex, inside + outside does not exceed the best score
        by more than the tolerance (checked per vertex and on the summed
        charts as a whole);
      * the outside value of every terminal tail node on the best path is
        (almost) equal to the best score.

    Intermediate values are printed so they appear in the captured output
    of a failing run.
    """
    # Slack allowed when comparing chart sums against the best score.
    tolerance = 1e-4

    print(graph)
    path = pydecode.best_path(graph, pot)
    chart = pydecode.inside(graph, pot)
    print("%s %s" % (pot.shape, path.v.shape))
    best = pot.T * path.v
    print(path.v)
    print(best)
    nt.assert_almost_equal(best, chart[graph.root.id])
    nt.assert_not_equal(best, 0.0)

    out_chart = pydecode.outside(graph, pot, chart)

    # Array-form
    for vertex in graph.vertices:
        other = chart[vertex.id] + out_chart[vertex.id]
        nt.assert_less_equal(
            other, best + tolerance,
            "%f %f %d %f %f" % (other, best, vertex.id,
                                chart[vertex.id], out_chart[vertex.id]))

    # Matrix-form.  This used a slack of 1e4, which made the check pass for
    # practically any chart; it now uses the same bound as the array form.
    m = chart + out_chart
    assert (m <= best + tolerance).all()

    print(chart)
    print(out_chart)

    for edge in path.edges:
        for node in edge.tail:
            if node.is_terminal:
                other = out_chart[node.id]
                nt.assert_almost_equal(other, best)
def __call__(self, x, x_diff):
    """
    Decode under the current dual vector and report the subgradient.

    Parameters
    ----------
    x :
        Dual vector; its length fixes the length of the subgradient.
    x_diff :
        When not None, passed to ``pydecode.pairwise_dot`` in place of
        ``x``.

    Returns
    --------
    path, subgradient, dual_score
    """
    # presumably pairwise_dot updates self.dual_weights in place, since its
    # return value is ignored -- verify against the pydecode API.
    dual_input = x if x_diff is None else x_diff
    pydecode.pairwise_dot(self.constraint_potentials, dual_input,
                          self.dual_weights)

    path = pydecode.best_path(self.current_graph, self.dual_weights)
    dual_score = self.dual_weights.dot(path)

    # Scatter the (index, value) pairs of the path's constraint vector.
    violated = self.constraint_potentials.dot(path)
    subgradient = np.zeros(len(x))
    for index, value in violated:
        subgradient[index] = value
    return path, subgradient, dual_score
def special_decode(self, method, problem, hypergraph, scores,
                   constraints, scorer):
    """Decode ``hypergraph`` with the strategy named by ``method``.

    * ``"CUBE"`` / ``"BEAM"`` -- compute inside and outside charts and run
      ``ph.beam_search_BinaryVector`` with the binary form of
      ``constraints``; ``"CUBE"`` additionally passes ``cube_pruning=True``.
      Returns ``beam_chart.path(0)``.
    * ``"MULTIDFA"`` -- for each ``j`` in ``range(problem.size)`` extend the
      current hypergraph by a two-state DFA driven by an indicator of edges
      whose head label has ``.j == j``, composing the hypergraph maps as it
      goes; then project ``scores`` up and return the best path.
    * ``"BIGDFA"`` -- extend once by a single DFA with ``2**problem.size``
      states whose state is a bit set of the symbols seen, then project
      ``scores`` up and return the best path.

    Any other ``method`` returns None.  ``scorer`` is not used.
    """
    if method in ("CUBE", "BEAM"):
        groups = [node.label.i for node in hypergraph.nodes]
        ins = ph.inside(hypergraph, scores)
        out = ph.outside(hypergraph, scores, ins)
        # Only the cube variant passes the flag; plain beam search relies
        # on the callee's default.
        options = {"cube_pruning": True} if method == "CUBE" else {}
        beam_chart = ph.beam_search_BinaryVector(
            hypergraph, scores, constraints.to_binary_potentials(),
            out, -10000, groups, [1000] * len(groups), **options)
        return beam_chart.path(0)

    if method == "MULTIDFA":
        current = hypergraph
        composed = None
        for j in range(problem.size):
            dfa = ph.DFA(2, 2, [{0: 0, 1: 1}, {0: 1}], [1])
            indicator = [(1 if (edge.head.label.j == j) else 0)
                         for edge in current.edges]
            counts = ph.CountingPotentials(current).from_vector(indicator)
            hmap = ph.extend_hypergraph_by_dfa(current, counts, dfa)
            current = hmap.domain_hypergraph
            current.labeling = ph.Labeling(
                current, [hmap[node].label for node in current.nodes], None)
            # Keep one map from the newest hypergraph back to the input.
            if composed is None:
                composed = hmap
            else:
                composed = composed.compose(hmap)
        projected = scores.up_project(current, composed)
        return ph.best_path(current, projected)

    if method == "BIGDFA":
        num_states = 2 ** problem.size
        num_symbols = problem.size + 1
        # All of the low problem.size bits set.
        final_state = num_states - 1

        transitions = []
        for state in range(num_states):
            moves = {}
            for symbol in range(num_symbols):
                bit = 2 ** symbol
                if (state & bit) == 0:
                    moves[symbol] = state | bit
            transitions.append(moves)

        dfa = ph.DFA(num_states, num_symbols, transitions, [final_state])
        symbols_seen = [edge.head.label.j for edge in hypergraph.edges]
        counts = ph.CountingPotentials(hypergraph).from_vector(symbols_seen)
        hmap = ph.extend_hypergraph_by_dfa(hypergraph, counts, dfa)
        extended = hmap.domain_hypergraph
        extended.labeling = ph.Labeling(
            extended, [hmap[node].label for node in extended.nodes], None)
        projected = scores.up_project(extended, hmap)
        return ph.best_path(extended, projected)

    return None
def decode_fractional(self):
    """Return the best path when edges are weighted by their variable values.

    Reads ``pulp.value`` of the variable stored in ``self.edge_vars`` for
    every edge of ``self.hypergraph``, wraps those values in
    ``LogViterbiPotentials`` and runs ``pydecode.best_path``.
    """
    hypergraph = self.hypergraph
    edge_values = []
    for edge in hypergraph.edges:
        edge_values.append(pulp.value(self.edge_vars[edge.id]))
    potentials = pydecode.LogViterbiPotentials(hypergraph)
    weights = potentials.from_vector(edge_values)
    return pydecode.best_path(hypergraph, weights)
def special_decode(self, method, problem, hypergraph, scores,
                   constraints, scorer):
    """Decode ``hypergraph`` with the strategy named by ``method``.

    * ``"CUBE"`` / ``"BEAM"`` -- compute inside and outside charts and run
      ``ph.beam_search_BinaryVector`` with the binary form of
      ``constraints``; ``"CUBE"`` additionally passes ``cube_pruning=True``.
      Returns ``beam_chart.path(0)``.
    * ``"MULTIDFA"`` -- for each ``j`` in ``range(problem.size)`` extend the
      current hypergraph by a two-state DFA driven by an indicator of edges
      whose head label has ``.j == j``, composing the hypergraph maps as it
      goes; then project ``scores`` up and return the best path.
    * ``"BIGDFA"`` -- extend once by a single DFA with ``2**problem.size``
      states whose state is a bit set of the symbols seen, then project
      ``scores`` up and return the best path.

    Any other ``method`` returns None.  ``scorer`` is not used.
    """
    if method in ("CUBE", "BEAM"):
        groups = [node.label.i for node in hypergraph.nodes]
        ins = ph.inside(hypergraph, scores)
        out = ph.outside(hypergraph, scores, ins)
        # Only the cube variant passes the flag; plain beam search relies
        # on the callee's default.
        options = {"cube_pruning": True} if method == "CUBE" else {}
        beam_chart = ph.beam_search_BinaryVector(
            hypergraph, scores, constraints.to_binary_potentials(),
            out, -10000, groups, [1000] * len(groups), **options)
        return beam_chart.path(0)

    if method == "MULTIDFA":
        current = hypergraph
        composed = None
        for j in range(problem.size):
            dfa = ph.DFA(2, 2, [{0: 0, 1: 1}, {0: 1}], [1])
            indicator = [(1 if (edge.head.label.j == j) else 0)
                         for edge in current.edges]
            counts = ph.CountingPotentials(current).from_vector(indicator)
            hmap = ph.extend_hypergraph_by_dfa(current, counts, dfa)
            current = hmap.domain_hypergraph
            current.labeling = ph.Labeling(
                current, [hmap[node].label for node in current.nodes], None)
            # Keep one map from the newest hypergraph back to the input.
            if composed is None:
                composed = hmap
            else:
                composed = composed.compose(hmap)
        projected = scores.up_project(current, composed)
        return ph.best_path(current, projected)

    if method == "BIGDFA":
        num_states = 2 ** problem.size
        num_symbols = problem.size + 1
        # All of the low problem.size bits set.
        final_state = num_states - 1

        transitions = []
        for state in range(num_states):
            moves = {}
            for symbol in range(num_symbols):
                bit = 2 ** symbol
                if (state & bit) == 0:
                    moves[symbol] = state | bit
            transitions.append(moves)

        dfa = ph.DFA(num_states, num_symbols, transitions, [final_state])
        symbols_seen = [edge.head.label.j for edge in hypergraph.edges]
        counts = ph.CountingPotentials(hypergraph).from_vector(symbols_seen)
        hmap = ph.extend_hypergraph_by_dfa(hypergraph, counts, dfa)
        extended = hmap.domain_hypergraph
        extended.labeling = ph.Labeling(
            extended, [hmap[node].label for node in extended.nodes], None)
        projected = scores.up_project(extended, hmap)
        return ph.best_path(extended, projected)

    return None
def time_hypergraph(dp, scores):
    """Benchmark best-path decoding on ``dp.hypergraph``.

    Allocates one chart sized by the number of hypergraph vertices and
    calls ``pydecode.best_path`` 1000 times, passing that same chart on
    every call.  Nothing is returned.
    """
    num_vertices = len(dp.hypergraph.vertices)
    chart = np.zeros(num_vertices)
    for _run in range(1000):
        pydecode.best_path(dp.hypergraph, scores, chart=chart)
def main(): parser = argparse.ArgumentParser(description='Run parsing experiments.') parser.add_argument('--original_rules', type=str, help='Original rule file') parser.add_argument('--binarized_rules', type=str, help='Binarized rule file') parser.add_argument('--training_ps', type=str, help='Lexicalized phrase structure file.') parser.add_argument('--training_dep', type=str, help='Dependency parse file.') parser.add_argument('--store_hypergraph_dir', type=str, help='Directory to store/load hypergraphs.') parser.add_argument('--save_hypergraph', type=bool, help='Construct and save hypergraphs.') parser.add_argument('--limit', type=int, help='Number of sentences to use.') parser.add_argument('--test_file', type=str, help='Test file.') parser.add_argument('--gold_file', type=str, help='Gold file.') parser.add_argument('--model', type=str, help='Weight model.') parser.add_argument('--test_limit', type=int, help='Number of sentences to test on.') parser.add_argument('--run_eval', default=False, type=bool, help='') parser.add_argument('--test_load', default=False, type=bool, help='') parser.add_argument('--debugger', default=False, type=bool, help='') parser.add_argument('--oracle', default=False, type=bool, help='Run oracle experiments') parser.add_argument('config', type=str) parser.add_argument('label', type=str) print >>sys.stderr, open(sys.argv[1]).read() argparse_config.read_config_file(parser, sys.argv[1]) args = parser.parse_args() print args if args.debugger: from IPython.core import ultratb sys.excepthook = ultratb.FormattedTB(color_scheme='Linux', call_pdb=1) output_dir = os.path.join("Data", args.label) data_out = os.path.join(output_dir, "mydata.txt") print >>sys.stderr, data_out # Set up logging. logger.setLevel(logging.DEBUG) handler = logging.StreamHandler(open(data_out, 'w')) logger.addHandler(handler) # Load data. 
print args.training_dep print args.training_ps if args.training_dep: X, Y = train.read_data_set(args.training_dep, args.training_ps, args.limit) orules = tree.read_original_rules(open(args.original_rules)) grammar = read_rule_set(open(args.binarized_rules)) # for rule in grammar.unary_rules: # print rule X, Y = zip(*[(x, y) for x, y in zip(X, Y) if len(x.words) >= 5]) binarized_Y = [tree.binarize(orules, make_bounds(x.deps), y)[0] for x, y in zip(X, Y)] model = train.ReconstructionModel(feature_hash=int(1e7), joint_feature_format="fast", joint_feature_cache=False, part_feature_cache=False) model.set_grammar(grammar) model.initialize(X, binarized_Y) if args.test_load: print "LOAD" graphs = [] start = memory() for i in range(1000 -1): if len(X[i].words) < 5: continue x = X[i] path = "%s/graphs%s.graph"%(args.store_hypergraph_dir, i) encoder = LexicalizedCFGEncoder(x.words, x.tags, grammar) pre = memory() graph = pydecode.load(path) print i, memory() - pre, len(graph.edges), len(X[i].words), memory() - start pre = memory() encoder.load("%s/encoder%s.pickle"%( args.store_hypergraph_dir, i), graph) print i, memory() - pre graphs.append((graph, encoder)) elif args.save_hypergraph: print "SAVING" import time model.set_from_disk(None) for i in range(40000): if len(X[i].words) < 5: continue # if len(X[i].words) > 15: continue graph, encoder = model.dynamic_program(X[i]) # Sanity Check # print binarized_Y[i] # print encoder.structure_path(graph, binarized_Y[i]) if i % 100 == 0: print i pydecode.save("%s/graphs%s.graph"%( args.store_hypergraph_dir, X[i].index), graph) encoder.save("%s/encoder%s.pickle"%( args.store_hypergraph_dir, X[i].index), graph) del graph del encoder elif args.oracle: print "ORACLE" trees_out = open(os.path.join(output_dir, "oracle.txt"), 'w') model = train.ReconstructionModel(feature_hash=int(1e7), part_feature_cache=False, joint_feature_cache=False, joint_feature_format="sparse") model.set_grammar(grammar) model.initialize(X, binarized_Y) 
model.set_from_disk(None) X_test, Y_test = train.read_data_set( args.test_file, args.gold_file, args.test_limit) w = np.load(args.model) # GOLD TREES binarized_Y_test = [] for x, orig_y in zip(X_test, Y_test): y = tree.binarize(orules, orig_y) try: graph, encoder = model.dynamic_program(x) label_values = np.zeros(np.max(graph.labeling) + 1) label_values.fill(-1) possible = 0 brackets = set() for part in encoder.transform_structure(y): X = grammar.rule_nonterms(part[5])[0] brackets.add((part[0], part[2], X)) #print part if tuple(part) in encoder.encoder: label = encoder.encoder[tuple(part)] label_values[label] = 10.0 possible += 1 print "transform" label_weights = np.zeros(len(graph.labeling)) graph_labels = graph.labeling[graph.labeling != -1] parts = encoder.transform_labels(graph_labels) weights = [] for part in parts: X = grammar.rule_nonterms(part[5])[0] if part[1] != part[2] and X[0] != "Z": if (part[0], part[2], X) in brackets: weights.append(2.0) else: weights.append(-2.0) else: weights.append(0.0) label_weights = np.zeros(len(graph.labeling)) label_weights[graph.labeling != -1] = np.array(weights) # graph_labels = graph.labeling[graph.labeling != -1] # parts = encoder.transform_labels(graph_labels) # parts_features = model.parts_features(x, parts) # feature_indices = pydecode.model.sparse_feature_indices(parts_features, # model.temp_shape, # model.offsets, # model.feature_hash) # # Sum the feature weights for the features in each label row. 
# label_weights = np.zeros(len(graph.labeling)) # label_weights[graph.labeling != -1] = \ # np.sum(np.take(w, feature_indices, mode="clip"), axis=1) oracle_weights = pydecode.transform(graph, label_values) path = pydecode.best_path(graph, oracle_weights + label_weights) print "Match", oracle_weights.T * path.v, possible y_hat = encoder.transform_path(path) print >>trees_out, tree.remove_head(tree.unbinarize(y_hat)) \ .pprint(100000) except: print >>trees_out, "" print "error" continue elif args.test_file: print "TESTING" trees_out = open(os.path.join(output_dir, "trees.txt"), 'w') model = train.ReconstructionModel(feature_hash=int(1e7), part_feature_cache=False, joint_feature_cache=False, joint_feature_format="sparse") model.set_grammar(grammar) model.initialize(X, binarized_Y) model.set_from_disk(None) X_test, Y_test = train.read_data_set( args.test_file, args.gold_file, args.test_limit) w = np.load(args.model) # binarized_Y_test = [] # for i, y in enumerate(Y_test): # print i # binarized_Y_test.append(tree.binarize(orules, y)) # for x, y in zip(X_test, binarized_Y_test): for x in X_test: try: graph, encoder = model.dynamic_program(x) y_hat = model.inference(x, w) for part in encoder.transform_structure(y_hat): print part, grammar.rule_nonterms(part[-1]), model.score_part(x, w, part) a = w.T * model.joint_feature(x, y_hat) # b = w.T * model.joint_feature(x, y) # print a, b # if b > a: print "FAIL" print print tree.remove_head(y_hat) print print tree.remove_head(tree.unbinarize(y_hat))\ .pprint() # print tree.remove_head(tree.unbinarize(y))\ # .pprint() # print #)\tree.remove_head( print >>trees_out, tree.remove_head(tree.unbinarize(y_hat)) \ .pprint(100000) except: print "error" print >>trees_out, "" elif args.run_eval: test_file = os.path.join(output_dir, "oracle.txt") gold_file = args.gold_file print "Evaling", test_file, gold_file os.system("../evalb/EVALB/evalb -p ../evalb/EVALB/COLLINS.prm %s %s"%(gold_file, test_file)) else: print "TRAINING" 
model.set_from_disk(args.store_hypergraph_dir) sp = StructuredPerceptron(model, verbose=1, max_iter=5, average=False) import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") sp.fit(X, binarized_Y) np.save(os.path.join(output_dir, "params"), sp.w) w = sp.w
def best_path(self, label_scores):
    """Return the best path of ``self.graph`` for the given label scores.

    ``self.compute_weights(label_scores)`` is called first; the path is
    then decoded from ``self.weights`` as it stands after that call.
    """
    self.compute_weights(label_scores)
    graph, weights = self.graph, self.weights
    return pydecode.best_path(graph, weights)
def test_diff_potentials_fail():
    """Call best_path with weights drawn for a different hypergraph.

    Two independent random hypergraphs are generated; the first graph is
    decoded with the second graph's weights.
    """
    first_graph, _unused_weights = utils.random_hypergraph()
    _unused_graph, second_weights = utils.random_hypergraph()
    pydecode.best_path(first_graph, second_weights)