def parse_agenda(self):
    """Run the agenda-driven gluing loop and return the finished hypergraph.

    Items are popped from ``self.agenda`` one at a time.  Every neighboring
    pair reported by ``self.neighboring_pairs`` is combined through
    ``self.make_item`` unless the same pair of (nonterminal, box) signatures
    has been combined before.  Items newly accepted by ``self.chart_add``
    are pushed back on the agenda and recorded in ``self.neighbor_index``
    and ``self.glue_nodes``.  Once the agenda is empty, the node returned by
    ``self.final_glue()`` becomes the root of ``self.hg``, which is
    topologically sorted and returned.
    """

    def signature(node):
        # nonterminal plus the four box coordinates of an item
        return (node.nt, node.fi, node.fj, node.ei, node.ej)

    while self.agenda:
        current = self.agenda.pop()
        if logger.level >= 4:
            logger.writeln('pop: %s' % current)
        for left, right, inverted in self.neighboring_pairs(current):
            if logger.level >= 4:
                logger.writeln('neighbors: %s %s' % (left, right))
            # Avoid duplicated edges.  In ABC grammar the nonterminal of the
            # new item is fixed once the boxes of both children are given,
            # so the pair of signatures identifies the edge.
            edge_key = signature(left) + signature(right)
            if edge_key in self.edge_index:
                continue
            self.edge_index.add(edge_key)
            glued = self.make_item(left, right, inverted)
            if not self.chart_add(glued):
                continue
            self.agenda.append(glued)
            self.neighbor_index.add(glued)
            self.glue_nodes.append(glued)
            if logger.level >= 4:
                logger.writeln('push: %s' % glued)

    self.hg = Hypergraph(self.final_glue())
    self.hg.topo_sort()
    self.stats()
    return self.hg
def final_glue1(self):
    """Try to cover all phrases AND glue nodes with a single START edge.

    Candidates are sorted so that the root-most node is at the end of the
    list.  The last candidate is repeatedly taken as a root, and every
    candidate reachable from it is dropped.  All roots collected this way
    become the tails of one edge into a new START node spanning the whole
    sentence pair.

    Returns:
        The new top ``PhraseHGNode``.
    """
    # The parser stores its glue items in ``self.glue_nodes``; the former
    # ``self.glue_rules`` attribute is never assigned anywhere in the class.
    candidates = self.phrases + self.glue_nodes
    # topo sort. root node at the end
    candidates.sort()
    roots = []
    while candidates:
        root = candidates.pop()
        print('pop: %s' % root)
        roots.append(root)
        hg = Hypergraph(root)
        hg.find_reachable_nodes()
        # hg.found is queried by object id, so compare identities
        candidates = [n for n in candidates if id(n) not in hg.found]

    top_rule = Rule()
    top_rule.lhs = START
    top_edge = PhraseHGEdge(top_rule)
    for root in roots:
        top_rule.f.append(root.nt)
        top_rule.e.append(root.nt)
        top_edge.add_tail(root)
    # monotone alignment: the i-th e symbol maps to the i-th f symbol
    top_rule.e2f = list(range(len(top_rule.f)))
    top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
    top_node.add_incoming(top_edge)
    return top_node
def do_match(img1, img2, cang, crat, cdesc):
    """Match two images through their hypergraphs and display the result.

    Features are extracted from both images, one ``Hypergraph`` is built per
    image, and the hyperedges are matched with ``match``.  The point matches
    are sorted by their ``distance`` attribute and drawn, then the function
    blocks on ``cv2.waitKey()`` before closing all windows.

    Args:
        img1: Target image.
        img2: Reference image.
        cang, crat, cdesc: Forwarded unchanged to ``match``.
    """
    M = None
    # Get features and distances between every pair of points from both images
    kpts1, des1 = get_features(img1, M, 'target.jpg')
    kpts2, des2 = get_features(img2, M, 'reference.jpg')

    Hgt = Hypergraph(kpts1, des1)
    Hgr = Hypergraph(kpts2, des2)
    # Parenthesised single-argument prints behave identically on Python 2
    # and Python 3.
    print('Hypergraph construction done')

    edge_matches, point_matches = match(
        Hgt.E, Hgr.E, kpts1, kpts2, des1, des2,
        cang, crat, cdesc, 0.7, 0.75, True
    )
    print('Hyperedges matching done')

    point_matches = sorted(point_matches, key=lambda x: x.distance)
    draw.points_match(point_matches, kpts1, kpts2, img1, img2)

    cv2.waitKey()
    cv2.destroyAllWindows()
def __init__(self, dag1, dag2):
    """Wrap both input graphs in mappers and start with an empty hypergraph.

    ``dag1`` and ``dag2`` must be DirectedAcyclicGraphs as specified in the
    file datastructures.py.
    """
    # one mapper per input graph, created in argument order
    self.dag1_mapper = DirectedAcyclicGraphMapper(dag1)
    self.dag2_mapper = DirectedAcyclicGraphMapper(dag2)
    # the comparison result is accumulated here
    self.hypergraph = Hypergraph()
def build_mini_hypergraph(edges):
    """Build a hypergraph rooted at ``edges[0].head`` from atomic edges only.

    Edges with a non-empty ``composed_edges`` are not added themselves; their
    components are pushed on the work stack and expanded in turn.  Only
    edges with no components are added to the result.
    """
    hg = Hypergraph(edges[0].head)
    pending = list(edges[:])  # private work stack; the argument is untouched
    while pending:
        current = pending.pop()
        if len(current.composed_edges) != 0:
            # composite edge: expand into its parts instead of adding it
            pending.extend(current.composed_edges)
            continue
        hg.add(current)
    return hg
def final_glue(self):
    """Create the START node and attach the roots of the glued forest to it.

    Candidates (phrases plus glue nodes) are sorted so that the root-most
    node is last.  The last candidate is repeatedly taken as a root; roots
    whose box spans the whole sentence pair go to ``self.top_roots``, all
    others to ``self.other_roots``.  After each root is chosen, candidates
    reachable from it are discarded, as are non-PHRASE candidates that
    compare less than the root.

    Every top root gets its own unary START edge.  The remaining roots share
    a single START edge, which is added only when the module-level flag
    ``glue_missing_phrases`` is set or no top root exists.

    Returns:
        The new top ``PhraseHGNode``.
    """
    candidates = self.phrases + self.glue_nodes
    # topo sort. root node at the end
    candidates.sort()
    self.top_roots = []
    self.other_roots = []
    while candidates:
        root = candidates.pop()
        spans_everything = (root.fi == 0 and root.fj == self.n1 and
                            root.ei == 0 and root.ej == self.n2)
        if spans_everything:
            self.top_roots.append(root)
        else:
            self.other_roots.append(root)
        hg = Hypergraph(root)
        hg.find_reachable_nodes()
        candidates = [n for n in candidates
                      if id(n) not in hg.found and
                      (n.nt == PHRASE or not n < root)]

    top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
    # add one edge for each top root
    for root in self.top_roots:
        rule = Rule()
        rule.lhs = START
        rule.f = [root.nt]
        rule.e = [root.nt]
        rule.e2f = [0]
        edge = PhraseHGEdge(rule)
        edge.add_tail(root)
        top_node.add_incoming(edge)
    # add one edge for all other roots
    if ((glue_missing_phrases or len(self.top_roots) == 0) and
            len(self.other_roots) > 0):
        rule = Rule()
        rule.lhs = START
        edge = PhraseHGEdge(rule)
        for root in self.other_roots:
            rule.f.append(root.nt)
            rule.e.append(root.nt)
            edge.add_tail(root)
        # monotone alignment: the i-th e symbol maps to the i-th f symbol
        rule.e2f = list(range(len(rule.f)))
        top_node.add_incoming(edge)
    return top_node
def read_string_file(stream):
    """Yield one hypergraph per line read from ``stream``.

    Each line is read with ``stream.readline()`` until an empty result marks
    the end of input.  Byte strings are decoded as UTF-8; text lines are used
    as they are, so the function works with both binary and text streams.
    Surrounding whitespace is stripped before the line is handed to
    ``Hypergraph.from_surface_string``.

    A line containing ``[`` or ``]`` aborts the program: a message is written
    to stderr and ``sys.exit(1)`` is called.
    """
    while True:
        line = stream.readline()
        if not line:
            break
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        if '[' in line or ']' in line:
            sys.stderr.write('Square brackets found in input. Please escape '
                             'these to -LSB- and -RSB-\n')
            sys.exit(1)
        yield Hypergraph.from_surface_string(line.strip())
def read_tree_file(self, treefile):
    """Read an indented derivation tree from ``treefile`` into a hypergraph.

    Only lines containing ``|||`` or consisting of ``None`` are considered.
    The number of leading spaces gives a node's depth (two spaces per level).
    Each such line creates a ``Node`` that is attached as a tail of the first
    incoming edge of the node on top of the stack.  Lines other than ``None``
    are also parsed into a ``Rule``, wrapped in a ``PhraseHGEdge`` and added
    as the node's incoming edge.

    Returns:
        A topologically sorted ``Hypergraph`` rooted at the bottom of the
        stack.

    NOTE(review): a file with no nested line leaves the stack empty and
    ``stack[0]`` raises IndexError -- unchanged from the original behavior.
    """
    current_indent = 0
    indent_level = 2
    stack = []
    # the context manager closes the file even if parsing raises
    with open(treefile) as f:
        for line in f:
            # TODO: why are there None node lines?
            if not ('|||' in line or line.strip() == 'None'):
                continue
            indent = 0
            while line[indent] == ' ':
                indent += 1
            # TODO: hack. None lines have wrong indent in
            # max derivation viterbi trees
            if indent == current_indent + indent_level:
                # one level deeper: the previous node becomes the parent
                current_indent = indent
                stack.append(node)
            elif indent < current_indent:
                # moving back up: discard the parents we have left
                npop = (current_indent - indent) // indent_level
                current_indent = indent
                for _ in range(npop):
                    stack.pop()
            node = Node()
            if len(stack) > 0:
                stack[-1].incoming[0].add_tail(node)
            # TODO: why are there None nodes? these nodes have incoming
            # edges. they are just a nonterminal as a leaf.
            if line.strip() != 'None':
                rule = Rule()
                rule.fromstr(line)
                edge = PhraseHGEdge()
                edge.rule = rule
                node.add_incoming(edge)
    hg = Hypergraph(stack[0])
    hg.topo_sort()
    return hg
def combine_trees(trees_to_combine):
    """Merge scored trees into a single hypergraph.

    ``trees_to_combine`` is a sequence of ``(tree, score)`` pairs.  Scores
    are normalised by their sum; if the sum is zero, every tree gets the
    uniform weight ``1 / len(trees_to_combine)``.  Each tree is converted to
    a hypergraph and sanity-checked.  Once all hypergraphs are built, every
    later one is combined into the first, which is returned.

    Returns ``None`` for an empty input.
    """
    tree_count = len(trees_to_combine)
    if tree_count == 0:
        return None

    score_sum = sum(weight for _, weight in trees_to_combine)

    def normalise(weight):
        if score_sum != 0.0:
            return weight / score_sum
        return 1.0 / tree_count

    built = []
    for tree, weight in trees_to_combine:
        share = normalise(weight)
        computeSpans(tree)
        tree_hg = Hypergraph.from_tree(tree, share)
        tree_hg.sanity_check()
        built.append(tree_hg)

    merged = built[0]
    for other in built[1:]:
        merged.combine(other)
    return merged
def compare_communities(self, ):
    """Classify events between communities of consecutive time frames.

    For every time frame except the last, ``self.inclusions`` is reset and
    refilled by ``self.initialize_inclusions``.  Each community of the frame
    is then paired with each community of the next frame, and an ``Event``
    is classified for the pair.  Pairs classified as growing, shrinking or
    continuing become edges of the frame's directed graph.  Every classified
    pair is appended to ``self.results``.  The graph is finally wrapped in a
    ``Hypergraph`` and appended to ``self.hypergraphs``.
    """
    for index, interval in enumerate(self.graphs):
        if index >= len(self.graphs) - 1:
            # the last time frame has no successor to compare against
            continue
        successor = index + 1
        self.inclusions = {}
        Dhypergraph = nx.DiGraph(window='TF%s -> TF%s' % (index, successor))
        print('Initialize inclusions dict start...')
        Dhypergraph = self.initialize_inclusions(index, Dhypergraph)
        print('Initialize inclusions dict finish...')

        for community_t in interval:
            for community_t1 in self.graphs[successor]:
                cid_t = community_t.graph['cid']
                cid_t1 = community_t1.graph['cid']
                pair_info = self.inclusions[cid_t][cid_t1]
                event = Event(community_t, community_t1,
                              pair_info['inclusion'],
                              pair_info['inversed_inclusion'],
                              self.inclusions)
                result = event.classify()
                if result in ('growing', 'shrinking', 'continuing'):
                    Dhypergraph.add_edge(community_t, community_t1,
                                         event_type=result)
                self.results.append({
                    'network_t': cid_t,
                    'network_t1': cid_t1,
                    'resulted_event': result
                })

        self.hypergraphs.append(Hypergraph(Dhypergraph))
class ABCParser(object):
    """Bilingual parser that glues hiero rules into a hypergraph with ABC glue
    grammar."""

    def __init__(self, n1, n2, phrases):
        """Seed the chart, the agenda and the neighbor index with ``phrases``.

        n1: French length
        n2: English length
        phrases: a list of PhraseHGNodes that have been partially linked
            according to hiero rule extraction
        """
        self.n1 = n1
        self.n2 = n2
        # maps a box (fi, fj, ei, ej) to a dict {nonterminal: item}
        self.chart = {}
        self.neighbor_index = NeighborIndex()
        # signatures of child pairs that have already been combined
        self.edge_index = set()
        self.agenda = []
        self.phrases = phrases
        self.glue_nodes = []
        for phrase in phrases:
            box = (phrase.fi, phrase.fj, phrase.ei, phrase.ej)
            self.chart.setdefault(box, {})[phrase.nt] = phrase
            self.agenda.append(phrase)
            self.neighbor_index.add(phrase)

    # not used, too slow
    def parse(self):
        """CYK-style alternative to ``parse_agenda`` (unused, too slow).

        NOTE(review): this ends with ``self.stats()``, which reads
        ``self.hg``, ``self.top_roots`` and ``self.other_roots``; those are
        only assigned by ``parse_agenda`` / ``final_glue``.
        """
        self.glue_nodes = []
        for i1, j1, i2, j2 in bi_cyk_spans(self.n1, self.n2):
            for k1 in range(i1 + 1, j1):
                for k2 in range(i2 + 1, j2):
                    # straight combination: both sides keep their order
                    cell1 = self.chart.get((i1, k1, i2, k2), {})
                    cell2 = self.chart.get((k1, j1, k2, j2), {})
                    for item1 in cell1.values():
                        for item2 in cell2.values():
                            if item2.nt != STRAIGHT:
                                new_item = self.make_item(item1, item2, False)
                                self.chart_add(new_item)
                    # inverted combination: the e-side spans are swapped
                    cell1 = self.chart.get((i1, k1, k2, j2), {})
                    cell2 = self.chart.get((k1, j1, i2, k2), {})
                    for item1 in cell1.values():
                        for item2 in cell2.values():
                            if item2.nt != INVERTED:
                                new_item = self.make_item(item1, item2, True)
                                self.chart_add(new_item)
        self.stats()

    def parse_agenda(self):
        """Glue items with an agenda and return the resulting hypergraph.

        Each popped item is combined with its neighbors.  New items accepted
        by the chart are pushed on the agenda and recorded as glue nodes.
        When the agenda is empty, the START node from ``final_glue`` becomes
        the root of ``self.hg``, which is topologically sorted and returned.
        """
        while len(self.agenda) > 0:
            item = self.agenda.pop()
            if logger.level >= 4:
                logger.writeln('pop: %s' % item)
            for item1, item2, inverted in self.neighboring_pairs(item):
                # avoid duplicated edges. note that in ABC grammar,
                # if the boxes of item1 and item2 are given, the nt of the
                # new item is fixed
                if logger.level >= 4:
                    logger.writeln('neighbors: %s %s' % (item1, item2))
                key = (item1.nt, item1.fi, item1.fj, item1.ei, item1.ej,
                       item2.nt, item2.fi, item2.fj, item2.ei, item2.ej)
                if key not in self.edge_index:
                    self.edge_index.add(key)
                    new_item = self.make_item(item1, item2, inverted)
                    if self.chart_add(new_item):
                        self.agenda.append(new_item)
                        self.neighbor_index.add(new_item)
                        self.glue_nodes.append(new_item)
                        if logger.level >= 4:
                            logger.writeln('push: %s' % new_item)
        root = self.final_glue()
        self.hg = Hypergraph(root)
        self.hg.topo_sort()
        # stats() only builds and returns a report string; the value is
        # discarded here, exactly as before.
        self.stats()
        return self.hg

    def final_glue(self):
        """Create the START node and attach the roots of the forest to it.

        Candidates (phrases plus glue nodes) are sorted so the root-most node
        is last and are consumed from the end.  Roots spanning the whole
        sentence pair go to ``self.top_roots``, all others to
        ``self.other_roots``.  After each root is chosen, candidates reachable
        from it are dropped, as are non-PHRASE candidates that compare less
        than the root.

        Returns the new top ``PhraseHGNode``.
        """
        candidates = self.phrases + self.glue_nodes
        # topo sort. root node at the end
        candidates.sort()
        self.top_roots = []
        self.other_roots = []
        while len(candidates) > 0:
            root = candidates.pop()
            if (root.fi == 0 and root.fj == self.n1 and
                    root.ei == 0 and root.ej == self.n2):
                self.top_roots.append(root)
            else:
                self.other_roots.append(root)
            hg = Hypergraph(root)
            hg.find_reachable_nodes()
            candidates = [n for n in candidates
                          if id(n) not in hg.found and
                          (n.nt == PHRASE or not n < root)]
        top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
        # add one edge for each top root
        for root in self.top_roots:
            rule = Rule()
            rule.lhs = START
            rule.f = [root.nt]
            rule.e = [root.nt]
            rule.e2f = [0]
            edge = PhraseHGEdge(rule)
            edge.add_tail(root)
            top_node.add_incoming(edge)
        # add one edge for all other roots
        if ((glue_missing_phrases or len(self.top_roots) == 0) and
                len(self.other_roots) > 0):
            rule = Rule()
            rule.lhs = START
            edge = PhraseHGEdge(rule)
            for root in self.other_roots:
                rule.f.append(root.nt)
                rule.e.append(root.nt)
                edge.add_tail(root)
            rule.e2f = list(range(len(rule.f)))
            top_node.add_incoming(edge)
        return top_node

    # not used
    def final_glue1(self):
        """try to cover all phrases AND glue nodes"""
        # ``self.glue_nodes`` is the only glue container this class
        # maintains; the former ``self.glue_rules`` was never assigned.
        candidates = self.phrases + self.glue_nodes
        # topo sort. root node at the end
        candidates.sort()
        roots = []
        while len(candidates) > 0:
            root = candidates.pop()
            print('pop: %s' % root)
            roots.append(root)
            hg = Hypergraph(root)
            hg.find_reachable_nodes()
            candidates = [n for n in candidates if id(n) not in hg.found]
        top_rule = Rule()
        top_rule.lhs = START
        top_edge = PhraseHGEdge(top_rule)
        for root in roots:
            top_rule.f.append(root.nt)
            top_rule.e.append(root.nt)
            top_edge.add_tail(root)
        top_rule.e2f = list(range(len(top_rule.f)))
        top_node = PhraseHGNode(START, 0, self.n1, 0, self.n2)
        top_node.add_incoming(top_edge)
        return top_node

    def neighboring_pairs(self, item):
        """
        return value is items in the order they appear on f side, and whether
        they are inverted. The constraint of ABC grammar is also applied here.
        """
        for neighbor in self.neighbor_index.get((item.fi, item.ej), 0):
            if item.nt != INVERTED:
                yield neighbor, item, True
        for neighbor in self.neighbor_index.get((item.fi, item.ei), 1):
            if item.nt != STRAIGHT:
                yield neighbor, item, False
        for neighbor in self.neighbor_index.get((item.fj, item.ei), 2):
            if neighbor.nt != INVERTED:
                yield item, neighbor, True
        for neighbor in self.neighbor_index.get((item.fj, item.ej), 3):
            if neighbor.nt != STRAIGHT:
                yield item, neighbor, False

    def make_item(self, item1, item2, inverted):
        """item1 and item2 is always given in the order they appear
        on the f side"""
        rule = Rule()
        rule.f = [item1.nt, item2.nt]
        fi = item1.fi
        fj = item2.fj
        if inverted:
            rule.lhs = INVERTED
            rule.e = [item2.nt, item1.nt]
            rule.e2f = [1, 0]
            ei = item2.ei
            ej = item1.ej
        else:
            rule.lhs = STRAIGHT
            rule.e = [item1.nt, item2.nt]
            rule.e2f = [0, 1]
            ei = item1.ei
            ej = item2.ej
        edge = PhraseHGEdge(rule)
        edge.add_tail(item1)
        edge.add_tail(item2)
        new_item = PhraseHGNode(rule.lhs, fi, fj, ei, ej)
        new_item.add_incoming(edge)
        return new_item

    def chart_add(self, item):
        """Insert ``item`` into the chart cell of its box.

        If the cell already holds an item with the same nonterminal, the
        first incoming edge of ``item`` is added to that existing item and
        False is returned.  Otherwise ``item`` is stored and True is
        returned.
        """
        cell = self.chart.setdefault((item.fi, item.fj, item.ei, item.ej), {})
        added = False
        # the ABCParser applies only glue rules. this test says glue rules
        # are used only when a PHRASE is not already derived for the box
        # if PHRASE not in cell:
        old_item = cell.get(item.nt)
        if old_item:
            old_item.add_incoming(item.incoming[0])
        else:
            added = True
            cell[item.nt] = item
        return added

    def stats(self):
        """Return a multi-line text report about the parse.

        The report says whether the top box is in the chart, includes the
        hypergraph's own stats, counts hiero and glue edges, and lists the
        roots and the phrase rules that no hypergraph edge uses.
        """
        result = '--ABCParser Stats--\n'
        top_bin = self.chart.get((0, self.n1, 0, self.n2))
        if top_bin is None:
            result += 'parse failed\n'
        else:
            result += 'parse succeeded\n'
        result += self.hg.stats()
        hiero_rules = 0
        glue_rules = []
        for edge in self.hg.edges():
            if edge.rule.lhs == PHRASE:
                hiero_rules += 1
            else:
                glue_rules.append(edge)
        result += 'hiero rules: %s\n' % hiero_rules
        result += 'glue rules: %s\n' % len(glue_rules)
        rules = [edge.rule for node in self.phrases for edge in node.incoming]
        # rules are compared by identity, not by value
        hg_rules = set(id(edge.rule) for edge in self.hg.edges())
        unglued_rules = [rule for rule in rules if id(rule) not in hg_rules]
        roots = self.top_roots + self.other_roots
        result += 'roots: %s\n' % len(roots)
        for node in roots:
            result += '%s\n' % node
        result += 'unglued rules: %s\n' % len(unglued_rules)
        for rule in unglued_rules:
            result += '%s\n' % rule
        return result
def induced_graph(self, v, force_copy=False):
    """Return a primal view of the sub-hypergraph induced by vertices ``v``.

    When ``force_copy`` is false and ``v`` contains exactly the nodes of the
    current hypergraph, ``self`` is returned unchanged.  Otherwise a new
    ``Hypergraph`` is created on ``v``, edges are induced from the current
    edge set, and the result is wrapped in a ``HypergraphPrimalView``.
    """
    # NOTE(review): nodes are read through ``self.hg`` but edges through
    # ``self.__hg``; presumably both refer to the same hypergraph -- confirm.
    if force_copy or not (self.hg.nodes() == set(v)):
        induced = Hypergraph(vertices=v)
        induced.induce_edges(self.__hg.edges())
        return HypergraphPrimalView(induced)
    return self
def run(self):
    """Decode one sentence into an AMR string and write the per-sentence log.

    Steps, in order:
      1. update the per-sentence grammars;
      2. open the log file ``<run_dir>/log_<suffix>``;
      3. build a local grammar from OOV words and NER items, if any;
      4. decode with the method selected by ``FLAGS.decoding_method``;
      5. on success, extract the best path from the hypergraph, store the
         AMR output in ``self.out`` and the hypergraph in ``self.hg``;
      6. write output, statistics and timing to the log and output files.

    The log file is closed even if one of the steps raises.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not one of
            'agenda', 'cyk' or 'earley'.
    """
    # update per-sentence grammars, if there's any
    for g in self.grammars:
        g.update(self.id)

    self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
    try:
        if FLAGS.show_time:
            self.flog.write('running on %s\n\n' % socket.gethostname())
            self.flog.flush()

        fwords = self.line.strip().split()

        # added by freesunshine, build the local grammar for oov words for
        # each sentence
        rules = []
        if self.oov_idx is not None and len(self.oov_idx) > 0:
            oov_weight = 0.0001
            for idx in self.oov_idx:
                fw = fwords[idx]
                ew = "."
                rule_str = "[A0-0] ||| %s ||| %s ||| %lf %lf %lf" % (
                    fw, ew, oov_weight, oov_weight, oov_weight)
                rr = Rule()
                rr.fromstr(rule_str)
                rules.append(rr)

        if self.ner_items is not None and len(self.ner_items) > 0:
            for item in self.ner_items:
                concept_weight = 10.0
                ew = item[1]
                # The parsed value is unused, but int() still rejects a
                # malformed label.  NOTE(review): confirm before removing.
                int(ew[2])
                # Both branches of the former ``if fw == ';'`` appended the
                # same weights, so the test (and ``fw``) were dropped.
                rule_str = "%s ||| " % ew
                rule_str += "%lf %lf %lf" % (
                    concept_weight, concept_weight, concept_weight)
                rr = Rule()
                rr.fromstr(rule_str)
                rules.append(rr)

        local_gr = None
        if rules:
            local_gr = Grammar(FLAGS.rule_bin_size)
            local_gr.build(rules, self.grammars[0].features)

        if FLAGS.preprocess:
            self.fidx2replacement = {}
            j = 0
            for i, token in enumerate(fwords):
                if token in ('$number', '$date'):
                    self.fidx2replacement[i] = self.special[j][1]
                    j += 1

        self.flog.write('[%s][%s words] %s\n' %
                        (self.id, len(fwords), self.line))

        decoder = Decoder(fwords, self.grammars, self.features, local_gr)

        begin_time = time()
        if FLAGS.decoding_method == 'agenda':
            item = decoder.decode()
        elif FLAGS.decoding_method == 'cyk':
            item = decoder.decode_cyk()
        elif FLAGS.decoding_method == 'earley':
            item = decoder.decode_earley()
        else:
            # raised explicitly so the check survives ``python -O``
            raise AssertionError('"%s" not valid decoding option'
                                 % FLAGS.decoding_method)
        self.time = time() - begin_time

        if item is None:
            self.out = '[decoder failed to build a goal item]'
        else:
            # the decoder returns a (goal item, success flag) pair; only the
            # goal item is used
            item, succ = item
            hg = Hypergraph(item)
            hg.set_semiring(hypergraph.SHORTEST_PATH)
            hg.set_functions(lambda x: x.cost, None, None)
            hg.topo_sort()
            self.kbest = hg.root.best_paths()

            # @freesunshine target side string output
            self.flog.write('Decuction Tree:\n%s\n' % self.kbest[0].tree_str())
            self.out = self.kbest[0].translation.to_amr_format()[0]
            # collapse the multi-line AMR onto a single line
            lines = [x.strip() for x in self.out.split('\n')]
            self.out = "".join(lines)

            self.hg = hg
            if FLAGS.output_hypergraph:
                self.write_hypergraph()

        self.flog.write('%s\n' % self.out)
        self.flog.write('\n')
        self.flog.write(decoder.agenda_stats())
        self.flog.write('\n')
        self.flog.write(decoder.chart.stats())
        self.flog.write('\n')
        for dotchart in decoder.dotcharts:
            self.flog.write(dotchart.stats())
            self.flog.write('\n')

        if FLAGS.show_time:
            timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
            self.flog.write(timeline)
        self.write_output_file()
        if FLAGS.output_kbest:
            self.write_kbest_to_file()
    finally:
        self.flog.close()
def setUp(self):
    """Build the parse forest of 'John saw a girl with a telescope'.

    The forest contains both PP attachments under [VP,1,7] plus an
    unattached [NN,1,2] reading of 'saw'.  The resulting hypergraph is
    stored in ``self.hp``.
    """

    def nodes(*labels):
        return [ForestNode(label) for label in labels]

    # terminals, in sentence order
    w = nodes('John', 'saw', 'a', 'girl', 'with', 'a', 'telescope')
    # pre-terminals, named <label><start><end>
    nn01, vb12, nn12, dt23, nn34, in45, dt56, nn67 = nodes(
        'NN', 'VB', 'NN', 'DT', 'NN', 'IN', 'DT', 'NN')
    # phrases
    np24, np57, vp14, pp47, np27, vp17 = nodes(
        'NP', 'NP', 'VP', 'PP', 'NP', 'VP')
    root = ForestNode('S')

    # (head, probability, tails) -- one row per forest edge
    productions = (
        (nn01, 0.02, (w[0],)),        # [NN,0,1] -> John
        (vb12, 0.01, (w[1],)),        # [VB,1,2] -> saw
        (nn12, 0.01, (w[1],)),        # [NN,1,2] -> saw
        (dt23, 0.5, (w[2],)),         # [DT,2,3] -> a
        (nn34, 0.05, (w[3],)),        # [NN,3,4] -> girl
        (in45, 0.25, (w[4],)),        # [IN,4,5] -> with
        (dt56, 0.5, (w[5],)),         # [DT,5,6] -> a
        (nn67, 0.001, (w[6],)),       # [NN,6,7] -> telescope
        (np24, 0.7, (dt23, nn34)),    # [NP,2,4] -> [DT,2,3] [NN,3,4]
        (np57, 0.7, (dt56, nn67)),    # [NP,5,7] -> [DT,5,6] [NN,6,7]
        (vp14, 0.9, (vb12, np24)),    # [VP,1,4] -> [VB,1,2] [NP,2,4]
        (pp47, 1.0, (in45, np57)),    # [PP,4,7] -> [IN,4,5] [NP,5,7]
        (np27, 0.3, (np24, pp47)),    # [NP,2,7] -> [NP,2,4] [PP,4,7]
        (vp17, 0.5, (vb12, np27)),    # [VP,1,7] -> [VB,1,2] [NP,2,7]
        (vp17, 0.5, (vp14, pp47)),    # [VP,1,7] -> [VP,1,4] [PP,4,7]
        (root, 0.9, (nn01, vp17)),    # [S,0,7] -> [NN,0,1] [VP,1,7]
    )
    for head, prob, tails in productions:
        edge = ForestEdge()
        for tail in tails:
            edge.add_tail(tail)
        edge.prob = prob
        head.add_incoming(edge)

    self.hp = Hypergraph(root)
class InsideOutsideTest(TestCase):
    """Smoke tests for inside/outside and k-best on a small parse forest."""

    def setUp(self):
        """Build the forest of 'John saw a girl with a telescope'."""

        def nodes(*labels):
            return [ForestNode(label) for label in labels]

        # terminals, in sentence order
        w = nodes('John', 'saw', 'a', 'girl', 'with', 'a', 'telescope')
        # pre-terminals, named <label><start><end>
        nn01, vb12, nn12, dt23, nn34, in45, dt56, nn67 = nodes(
            'NN', 'VB', 'NN', 'DT', 'NN', 'IN', 'DT', 'NN')
        # phrases
        np24, np57, vp14, pp47, np27, vp17 = nodes(
            'NP', 'NP', 'VP', 'PP', 'NP', 'VP')
        root = ForestNode('S')

        # (head, probability, tails) -- one row per forest edge
        productions = (
            (nn01, 0.02, (w[0],)),        # [NN,0,1] -> John
            (vb12, 0.01, (w[1],)),        # [VB,1,2] -> saw
            (nn12, 0.01, (w[1],)),        # [NN,1,2] -> saw
            (dt23, 0.5, (w[2],)),         # [DT,2,3] -> a
            (nn34, 0.05, (w[3],)),        # [NN,3,4] -> girl
            (in45, 0.25, (w[4],)),        # [IN,4,5] -> with
            (dt56, 0.5, (w[5],)),         # [DT,5,6] -> a
            (nn67, 0.001, (w[6],)),       # [NN,6,7] -> telescope
            (np24, 0.7, (dt23, nn34)),    # [NP,2,4] -> [DT,2,3] [NN,3,4]
            (np57, 0.7, (dt56, nn67)),    # [NP,5,7] -> [DT,5,6] [NN,6,7]
            (vp14, 0.9, (vb12, np24)),    # [VP,1,4] -> [VB,1,2] [NP,2,4]
            (pp47, 1.0, (in45, np57)),    # [PP,4,7] -> [IN,4,5] [NP,5,7]
            (np27, 0.3, (np24, pp47)),    # [NP,2,7] -> [NP,2,4] [PP,4,7]
            (vp17, 0.5, (vb12, np27)),    # [VP,1,7] -> [VB,1,2] [NP,2,7]
            (vp17, 0.5, (vp14, pp47)),    # [VP,1,7] -> [VP,1,4] [PP,4,7]
            (root, 0.9, (nn01, vp17)),    # [S,0,7] -> [NN,0,1] [VP,1,7]
        )
        for head, prob, tails in productions:
            edge = ForestEdge()
            for tail in tails:
                edge.add_tail(tail)
            edge.prob = prob
            head.add_incoming(edge)

        self.hp = Hypergraph(root)

    def _configure(self, semiring, edge_weight):
        """Select the semiring and weight functions on the hypergraph."""
        self.hp.set_semiring(semiring)
        self.hp.set_functions(edge_weight, lambda x: 1, None)

    def _inside_outside(self, semiring, edge_weight, expectations):
        """Run inside/outside (optionally the *_exp passes) and log the dot."""
        self._configure(semiring, edge_weight)
        self.hp.inside()
        self.hp.outside()
        if expectations:
            self.hp.inside_exp()
            self.hp.outside_exp()
        logger.writeln(self.hp.dot())

    def test_inside_outside(self):
        self._inside_outside(INSIDE, lambda x: x.prob, False)

    def test_inside_exp_outside_exp(self):
        self._inside_outside(INSIDE, lambda x: x.prob, True)

    def test_inside_outside_log(self):
        self._inside_outside(LOGPROB, lambda x: log(x.prob), False)

    def test_inside_exp_outside_exp_log(self):
        self._inside_outside(LOGPROB, lambda x: log(x.prob), True)

    def test_best_paths(self):
        self._configure(INSIDE, lambda x: x.prob)
        self.hp.assert_done('topo_sort')
        # log tree and weight of the two best derivations
        for rank in (0, 1):
            logger.writeln(self.hp.root.best_paths()[rank].tree_str())
            logger.writeln(self.hp.root.best_paths()[rank].weight)
def setUp(self):
    """Give every test a fresh, empty hypergraph in ``self.a``."""
    self.a = Hypergraph()
def run(self):
    """Decode one sentence and write the per-sentence log and outputs.

    Steps, in order:
      1. update the per-sentence grammars;
      2. open the log file ``<run_dir>/log_<suffix>``;
      3. with ``FLAGS.preprocess``, remember which source positions hold
         ``$number`` / ``$date`` placeholders and their replacements;
      4. decode with the method selected by ``FLAGS.decoding_method``;
      5. on success, take the best path, substitute the placeholders back,
         strip the language-model boundary padding and store the result in
         ``self.out``;
      6. write output, statistics and timing to the log and output files.

    The log file is closed even if one of the steps raises.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not one of
            'agenda', 'cyk' or 'earley'.
    """
    # update per-sentence grammars, if there's any
    for g in self.grammars:
        g.update(self.id)

    self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
    try:
        if FLAGS.show_time:
            self.flog.write('running on %s\n\n' % socket.gethostname())
            self.flog.flush()

        fwords = self.line.split()
        if FLAGS.preprocess:
            self.fidx2replacement = {}
            j = 0
            for i, token in enumerate(fwords):
                if token in ('$number', '$date'):
                    self.fidx2replacement[i] = self.special[j][1]
                    j += 1

        self.flog.write('[%s][%s words] %s\n' %
                        (self.id, len(fwords), self.line))

        decoder = Decoder(fwords, self.grammars, self.features)

        begin_time = time()
        if FLAGS.decoding_method == 'agenda':
            item = decoder.decode()
        elif FLAGS.decoding_method == 'cyk':
            item = decoder.decode_cyk()
        elif FLAGS.decoding_method == 'earley':
            item = decoder.decode_earley()
        else:
            # raised explicitly so the check survives ``python -O``
            raise AssertionError('"%s" not valid decoding option'
                                 % FLAGS.decoding_method)
        self.time = time() - begin_time

        if item is None:
            self.out = '[decoder failed to build a goal item]'
        else:
            hg = Hypergraph(item)
            hg.set_semiring(hypergraph.SHORTEST_PATH)
            hg.set_functions(lambda x: x.cost, None, None)
            hg.topo_sort()
            self.kbest = hg.root.best_paths()
            output_tokens = self.kbest[0].translation[:]

            if FLAGS.preprocess:
                for i, token in enumerate(output_tokens):
                    if token in ('$number', '$date'):
                        fidx = self.kbest[0].composed_rule.we2f[i]
                        if fidx is not None:
                            output_tokens[i] = self.fidx2replacement[fidx]

            # Drop the lm_order-1 boundary tokens on each side.  An explicit
            # end index is used because ``tokens[0:0]`` (what the slice
            # ``[lm_order-1 : 1-lm_order]`` becomes for lm_order == 1) would
            # discard the whole translation.
            pad = FLAGS.lm_order - 1
            self.out = ' '.join(
                output_tokens[pad:len(output_tokens) - pad])
            self.hg = hg
            if FLAGS.output_hypergraph:
                self.write_hypergraph()

        self.flog.write('%s\n' % self.out)
        self.flog.write('\n')
        if item is not None:
            self.flog.write(self.kbest[0].tree_str())
            self.flog.write('\n')
            self.flog.write(hg.stats())
            self.flog.write('\n')
        self.flog.write(decoder.agenda_stats())
        self.flog.write('\n')
        self.flog.write(decoder.chart.stats())
        self.flog.write('\n')
        for dotchart in decoder.dotcharts:
            self.flog.write(dotchart.stats())
            self.flog.write('\n')

        if FLAGS.show_time:
            timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
            self.flog.write(timeline)
        self.write_output_file()
        if FLAGS.output_kbest:
            self.write_kbest_to_file()
    finally:
        self.flog.close()
class DirectedAcyclicGraphComparator:
    """
    The class performs the comparison between two different
    DirectedAcyclicGraphs.

    To use it call the function buildHyperGraph, it will return a hypergraph
    containing the comparison between the two dags.

    The contents of the hypergraph will be as follows:

    nodes: formed by one node of each graph storing the value of applying
           the cost function to both nodes.

    hyperedges: hyperedges are directed, the first node of the hyperedge is
                the source node of the transformation, the rest of the nodes
                of the hyperedges will contain the location of the variables.
                As a value it will store the sum of the cost of the variables
                plus applying the transformation function to the graphs
                without the nodes being substituted. Each hyperedge must be
                different.
    """

    def __init__(self, dag1, dag2):
        """
        The first and second parameters must be DirectedAcyclicGraphs as
        specified on the file datastructures.py
        """
        self.dag1_mapper = DirectedAcyclicGraphMapper(dag1)
        self.dag2_mapper = DirectedAcyclicGraphMapper(dag2)
        self.hypergraph = Hypergraph()

    def costAssembler(self, functions):
        """Placeholder; not implemented."""
        pass

    def __sort_by_num_of_variables(self, v):
        """Bucket the mappings in ``v`` by their number of variables.

        Returns a tuple of lists; bucket ``k`` holds the mappings with
        ``k + 1`` variables.  An empty input yields an empty tuple.

        NOTE(review): a mapping with zero variables would land in the last
        bucket (index -1); presumably the mapper never produces one --
        confirm.
        """
        # materialise once so that a generator argument is not exhausted by
        # the max() pass before the bucketing pass
        mappings = list(v)
        if not mappings:
            return ()
        max_num_of_variables = max(len(x.variables) for x in mappings)
        answers = tuple([] for _ in range(max_num_of_variables))
        for x in mappings:
            answers[len(x.variables) - 1].append(x)
        return answers

    def __iterate_over_sorted_maps(self, s1, s2):
        """Yield every (map1, map2) pair taken from same-index buckets."""
        for x1, x2 in zip(s1, s2):
            for map1 in x1:
                for map2 in x2:
                    yield (map1, map2)

    def buildHyperGraph(self, number_of_variables=float('inf')):
        """
        This function builds the hypergraph that will contain the comparison
        between the two dags and all its subgraphs.

        The function returns a hypergraph containing the comparison between
        the two dags.
        """
        # Compute the nodes of the hypergraph and its associated cost. Each
        # node is formed by each possible pair created using two random nodes
        # of each dag.
        g1 = self.dag1_mapper.dag
        g2 = self.dag2_mapper.dag
        # iterating the mapping directly yields its keys on Python 2 and 3
        for n1 in g1.links:
            for n2 in g2.links:
                value = t_cost_edit_distance_graphs_no_vars(g1, n1, g2, n2)
                self.hypergraph.addNode((n1, n2), value)

        # In the algorithm we don't allow to compute the cost function between
        # two subgraphs with different number of variables. Here
        # we sort both sequences of subgraphs by its number of variables to
        # assure that doesn't happen.
        map1_sorted_by_vars = self.__sort_by_num_of_variables(
            self.dag1_mapper.generateAllVariableMappings(
                number_of_variables=number_of_variables))
        map2_sorted_by_vars = self.__sort_by_num_of_variables(
            self.dag2_mapper.generateAllVariableMappings(
                number_of_variables=number_of_variables))

        # Thanks to its ordering coming from the Mapper class the hypergraph
        # will be built on a top down fashion.
        # map1 and map2 will always contain the same number of variables.
        for map1, map2 in self.__iterate_over_sorted_maps(
                map1_sorted_by_vars, map2_sorted_by_vars):
            # The node of the hypergraph.
            hypergraph_node = (map1.subgraph.root, map2.subgraph.root)

            # The current hyperedge, on this implementation the order
            # matters the first node will be the node acting as a root
            # and the rest the nodes that are going to be substituted
            # by variables.
            hyperedge = (hypergraph_node, ) + tuple(
                zip(map1.variables, map2.variables))

            # The cost of the node of the hypergraph.
            f1 = t_cost_edit_distance_graphs_with_vars(map1, map2)

            # This is for debugging purposes
            if DEBUG_MODE:
                print(stringifyGraph(map1.graph, map1.subgraph.root,
                                     map1.variables, map1.subgraph.nodes))
                print(stringifyGraph(map2.graph, map2.subgraph.root,
                                     map2.variables, map2.subgraph.nodes))
                print('Hyperedge %s' % (hyperedge, ))

            # Add the hyperedge to the graph
            # The hyperedges are directed and as the algorithm works
            # there shouldn't be any duplicates so there is no need to
            # check if it exists.
            subgraphs = (map1.subgraph, map2.subgraph)
            weight = f1
            self.hypergraph.addHyperedge(hyperedge, subgraphs, weight)

        if DEBUG_MODE:
            print("\nNodes:")
            print("==========================")
            self.hypergraph.printNodes()
            print("\nHyperedges:")
            print("==========================")
            self.hypergraph.printHyperedges()

        # the documented contract is that the built hypergraph is returned
        return self.hypergraph

    def buildHyperGraphDebug(self, number_of_variables=float('inf')):
        """
        Debugging function that uses the default computing cost function to
        build the hypergraph. Used for testing purposes.

        Returns the built hypergraph, like buildHyperGraph.
        """
        for n1 in self.dag1_mapper.dag.links:
            for n2 in self.dag2_mapper.dag.links:
                value = t_cost_default([n1], [n2])
                self.hypergraph.addNode((n1, n2), value)

        map1_sorted_by_vars = self.__sort_by_num_of_variables(
            self.dag1_mapper.generateAllVariableMappings(
                number_of_variables=number_of_variables))
        map2_sorted_by_vars = self.__sort_by_num_of_variables(
            self.dag2_mapper.generateAllVariableMappings(
                number_of_variables=number_of_variables))

        for map1, map2 in self.__iterate_over_sorted_maps(
                map1_sorted_by_vars, map2_sorted_by_vars):
            hypergraph_node = (map1.subgraph.root, map2.subgraph.root)
            hyperedge = (hypergraph_node, ) + tuple(
                zip(map1.variables, map2.variables))
            weight = t_cost_default(map1.subgraph.nodes, map2.subgraph.nodes)
            subgraphs = (map1.subgraph, map2.subgraph)
            self.hypergraph.addHyperedge(hyperedge, subgraphs, weight)

        return self.hypergraph
class TestHypergraph(unittest.TestCase):
    """Unit tests for node and hyperedge bookkeeping of ``Hypergraph``."""

    def setUp(self):
        self.a = Hypergraph()

    def _add_nodes(self, names):
        """Add ``names`` in order, weighted 1, 2, 3, ... by position."""
        for weight, name in enumerate(names, 1):
            self.a.addNode(name, weight)

    def _assert_weights(self, names):
        """Check that each name still carries its positional weight."""
        for weight, name in enumerate(names, 1):
            self.assertEqual(self.a.getNodeWeight(name), weight)

    def test_checkUnknownNode(self):
        self.assertRaises(ValueError, self.a.updateNode, "z", 2)

    def test_checkUnknownHyperedge(self):
        self.assertRaises(ValueError, self.a.updateHyperedgeLabel,
                          ("z", "s"), 2, 0)

    def test_addNode1(self):
        self._add_nodes('a')
        self._assert_weights('a')

    def test_addNode2(self):
        self._add_nodes('ab')
        self._assert_weights('ab')

    def test_addNode3(self):
        self._add_nodes('abc')
        self._assert_weights('abc')

    def test_updateNodeValue(self):
        self._add_nodes('ab')
        self.a.updateNode('a', 3)
        self.assertEqual(self.a.getNodeWeight('a'), 3)

    def test_addHyperedge1(self):
        he = ('a', 'b', 'c')
        self._add_nodes('abc')
        self.a.addHyperedge(he, "abc", 0)
        self.assertEqual(self.a.getHyperedgeLabel(he).data, "abc")

    def test_addHyperedge2(self):
        he = ('a', 'b', 'c')
        he2 = ('d', 'b', 'c')
        self._add_nodes('abcd')
        self.a.addHyperedge(he, "abc", 0)
        self.a.addHyperedge(he2, "dbc", 0)
        self.assertEqual(self.a.getHyperedgeLabel(he2).data, "dbc")

    def test_updateHyperedgeLabel(self):
        he = ('a', 'b', 'c')
        he2 = ('d', 'b', 'c')
        self._add_nodes('abcd')
        self.a.addHyperedge(he, "abc", 0)
        self.a.addHyperedge(he2, "dbc", 0)
        self.a.updateHyperedgeLabel(he, "test", 0)
        self.assertEqual(self.a.getHyperedgeLabel(he).data, "test")

    def test_checkHyperedgesAndNodes(self):
        he = ('a', 'b', 'c')
        he2 = ('d', 'b', 'c')
        solution = [he, he2]
        self._add_nodes('abcd')
        self.a.addHyperedge(he, "abc", 0)
        self.a.addHyperedge(he2, "dbc", 0)
        self.assertEqual(self.a.getHyperedgesFromNode('b'), solution)
parser.add_argument(
    "-n",
    "--nodes",
    action="store",
    default=10000,
    type=int,
    help="Select the number of nodes n (if the target dataset is 'model')",
)
args = parser.parse_args()

# short dataset key -> dataset name passed to Hypergraph
datasets_info = {'contact': 'contact-high-school',
                 'email': 'email-Eu-full',
                 'substances': 'NDC-substances-full',
                 'tags': 'tags-ask-ubuntu',
                 'threads': 'threads-math-sx',
                 'coauth': 'coauth-DBLP-full'}

# make sure the output directories exist before main() runs
for output_dir in ('../results', '../plots'):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

if args.dataset in datasets_info:
    graph = Hypergraph(datasets_info[args.dataset], args.dataset)
elif args.dataset == 'model':
    print("Generating hypergraph using HyperFF model...")
    graph = HyperFF(args.burning, args.expanding, args.nodes - 1)
else:
    print("Invalid arguments.")
    parser.print_help()
    # invalid input is a failure: report it with a non-zero exit status
    sys.exit(1)

main(graph)