def collate(collation, output="table", layout="horizontal", segmentation=True,
            near_match=False, astar=False, detect_transpositions=False,
            debug_scores=False, properties_filter=None, indent=False):
    """Collate the witnesses of *collation* and render the result.

    *collation* may be a Collation object or a JSON-style dict with a
    "witnesses" list; in the latter case a real Collation is built from it.
    *output* selects the rendering: "table" (default), "graph", "svg",
    "svg_simple", "json", "html", "html2", "xml", "tei", "csv" or "tsv".
    Raises SegmentationError when near_match is requested together with
    segmentation, and Exception for an unknown output type.
    """
    # Accept a JSON-style dict and promote it to a real Collation.
    if isinstance(collation, dict):
        real_collation = Collation()
        for witness in collation["witnesses"]:
            real_collation.add_witness(witness)
        collation = real_collation
    # Pick the alignment algorithm; no error trapping beyond this point.
    if astar:
        algorithm = ExperimentalAstarAligner(collation, near_match=False,
                                             debug_scores=debug_scores)
    else:
        algorithm = EditGraphAligner(collation, near_match=False,
                                     detect_transpositions=detect_transpositions,
                                     debug_scores=debug_scores,
                                     properties_filter=properties_filter)
    # Build and rank the variant graph.
    graph = VariantGraph()
    algorithm.collate(graph)
    ranking = VariantGraphRanking.of(graph)
    if near_match:
        # Near matching only works on an unsegmented graph.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        ranking = perform_near_match(graph, ranking)
    if segmentation:
        # Join parallel segments, then re-rank the joined graph.
        join(graph)
        ranking = VariantGraphRanking.of(graph)
    # Graph-shaped outputs need no alignment table.
    if output in ("svg", "svg_simple"):
        return display_variant_graph_as_svg(graph, output)
    if output == "graph":
        return graph
    # Table-shaped outputs.
    table = AlignmentTable(collation, graph, layout, ranking)
    if output == "json":
        return export_alignment_table_as_json(table)
    if output == "html":
        return display_alignment_table_as_html(table)
    if output == "html2":
        return visualize_table_vertically_with_colors(table, collation)
    if output == "table":
        return table
    if output == "xml":
        return export_alignment_table_as_xml(table)
    if output == "tei":
        return export_alignment_table_as_tei(table, indent)
    if output in ("csv", "tsv"):
        return display_alignment_table_as_csv(table, output)
    raise Exception("Unknown output type: " + output)
def display_variant_graph_as_svg(graph, output):
    """Render *graph* as an SVG via pygraphviz and show it inline (IPython).

    With output == "svg_simple" each node shows only its label; otherwise
    each node is drawn as a Graphviz HTML-like table listing the exact rank
    and the readings per witness.
    """
    a = pygraphviz.AGraph(directed=True, rankdir='LR')
    counter = 0
    mapping = {}
    ranking = VariantGraphRanking.of(graph)
    # add nodes
    for n in graph.graph.nodes():
        counter += 1
        mapping[n] = counter
        if output == "svg_simple":
            label = n.label
            if label == '':
                label = '#'  # placeholder so empty-label nodes stay visible
            a.add_node(mapping[n], label=label)
        else:
            rank = ranking.byVertex[n]
            readings = ["<TR><TD ALIGN='LEFT'><B>" + n.label +
                        "</B></TD><TD ALIGN='LEFT'>exact: " + str(rank) + "</TD></TR>"]
            reverse_dict = defaultdict(list)
            for key, value in n.tokens.items():
                # BUG FIX: '<' and '>' must be escaped as HTML entities inside
                # Graphviz HTML-like labels; the previous substitutions
                # replaced each character with itself (no-ops).
                reverse_dict["".join(
                    re.sub(r'>', r'&gt;', re.sub(r'<', r'&lt;', item.token_data["t"]))
                    for item in value)].append(key)
            for key, value in sorted(reverse_dict.items()):
                reading = ("<TR><TD ALIGN='LEFT'><FONT FACE='Bukyvede'>{}</FONT>"
                           "</TD><TD ALIGN='LEFT'>{}</TD></TR>").format(key, ', '.join(value))
                readings.append(reading)
            a.add_node(mapping[n], label='<<TABLE CELLSPACING="0">' + "".join(readings) + '</TABLE>>')
    # add edges
    for u, v, edge_data in graph.graph.edges_iter(data=True):
        # BUG FIX: AGraph.add_edge's third positional argument is the edge
        # *key*, not its label; pass the label as a keyword attribute so it
        # is actually rendered.
        a.add_edge(mapping[u], mapping[v], label=edge_data["label"])
    # keep vertices of equal rank on the same vertical line
    for key, value in ranking.byRank.items():
        a.add_subgraph([mapping[item] for item in value], rank='same')
    svg = a.draw(prog='dot', format='svg')
    # display using the IPython SVG module
    return display(SVG(svg))
def perform_near_match(graph, ranking):
    """Add near-match edges to *graph*, walking its ranking in reverse.

    For every vertex whose incoming edge spans more than one rank (and whose
    source has no outgoing edge landing exactly one rank ahead), the source is
    compared against all vertices in the intervening ranks by Levenshtein
    ratio and connected to the closest match.  Returns a freshly computed
    VariantGraphRanking for the updated graph.
    """
    for vertex in reversed(list(topological_sort(graph.graph))):
        for source, _target, _edgedata in graph.in_edges(vertex, data=True):
            rank_gap = ranking.byVertex[vertex] - ranking.byVertex[source]
            # A source may move only when (1) it sits more than one rank away
            # and (2) none of its outgoing edges lands exactly one rank ahead.
            out_gaps = [ranking.byVertex[tgt] - ranking.byVertex[src]
                        for (src, tgt) in graph.out_edges(source)]
            if rank_gap > 1 and 1 not in out_gaps:
                lo = ranking.byVertex[source]
                hi = ranking.byVertex[vertex]
                candidates = [item
                              for item in flatten([ranking.byRank[r] for r in range(lo, hi)])
                              if item is not source]
                by_ratio = defaultdict(list)
                for candidate in candidates:
                    by_ratio[Levenshtein.ratio(str(source), str(candidate))].append(candidate)
                weight = max(by_ratio)
                winner = by_ratio[weight][0]
                graph.connect_near(winner, source, weight)
                # Re-rank so the next iteration sees the updated graph.
                ranking = VariantGraphRanking.of(graph)
    # Fresh ranking table, passed along to creation of the alignment table.
    return VariantGraphRanking.of(graph)
def collate(collation, output="table", layout="horizontal", segmentation=True,
            near_match=False, astar=False, detect_transpositions=False,
            debug_scores=False, properties_filter=None, svg_output=None,
            indent=False, scheduler=None):
    """Collate the witnesses of *collation* and render the requested output.

    *collation* may be a Collation or a JSON-style dict with a "witnesses"
    list.  *scheduler* defaults to a fresh Scheduler per call.  Raises
    SegmentationError when near_match is combined with segmentation, and
    Exception for an unknown output type.
    """
    # BUG FIX: the signature previously used ``scheduler=Scheduler()``, which
    # is evaluated once at definition time and silently shares one Scheduler
    # across every call (mutable-default pitfall).  A fresh instance per call
    # keeps invocations independent; passing a scheduler explicitly still works.
    if scheduler is None:
        scheduler = Scheduler()
    # collation may be collation or json; if it's the latter, build a real collation
    if isinstance(collation, dict):
        json_collation = Collation()
        for witness in collation["witnesses"]:
            json_collation.add_witness(witness)
        collation = json_collation
    # assume collation is a Collation by now; no error trapping
    if not astar:
        algorithm = EditGraphAligner(collation, near_match=False,
                                     detect_transpositions=detect_transpositions,
                                     debug_scores=debug_scores,
                                     properties_filter=properties_filter)
    else:
        algorithm = ExperimentalAstarAligner(collation, near_match=False,
                                             debug_scores=debug_scores)
    # build graph
    graph = VariantGraph()
    algorithm.collate(graph, collation)
    ranking = VariantGraphRanking.of(graph)
    if near_match:
        # Segmentation not supported for near matching; reject it explicitly.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        highestRank = ranking.byVertex[graph.end]
        witnessCount = len(collation.witnesses)
        # do-while loop to avoid looping through ranking while modifying it
        rank = highestRank - 1
        condition = True
        while condition:
            rank = process_rank(scheduler, rank, collation, ranking, witnessCount)
            rank -= 1
            condition = rank > 0
    # join parallel segments
    if segmentation:
        join(graph)
        ranking = VariantGraphRanking.of(graph)
    # graph-shaped outputs need no alignment table
    if output == "svg" or output == "svg_simple":
        return display_variant_graph_as_SVG(graph, svg_output, output)
    if output == "graph":
        return graph
    # create alignment table
    table = AlignmentTable(collation, graph, layout, ranking)
    if output == "json":
        return export_alignment_table_as_json(table)
    if output == "html":
        return display_alignment_table_as_HTML(table)
    if output == "html2":
        return visualizeTableVerticallyWithColors(table, collation)
    if output == "table":
        return table
    if output == "xml":
        return export_alignment_table_as_xml(table)
    if output == "tei":
        return export_alignment_table_as_tei(table, indent)
    else:
        raise Exception("Unknown output type: " + output)
def collate(collation, output="table", layout="horizontal", segmentation=True,
            near_match=False, astar=False, detect_transpositions=False,
            debug_scores=False, properties_filter=None, indent=False):
    """Collate witnesses and render the requested output format.

    Accepts either a Collation instance or a JSON-style dict with a
    "witnesses" list.  Pre-tokenized collations (with segmentation=False)
    get their table cells rebuilt from the full token data.
    """
    # Promote a JSON-style dict to a real Collation.
    if isinstance(collation, dict):
        real_collation = Collation()
        for witness in collation["witnesses"]:
            real_collation.add_witness(witness)
        collation = real_collation
    # Choose the aligner; no further error trapping.
    if astar:
        algorithm = ExperimentalAstarAligner(collation, near_match=False,
                                             debug_scores=debug_scores)
    else:
        algorithm = EditGraphAligner(collation, near_match=False,
                                     detect_transpositions=detect_transpositions,
                                     debug_scores=debug_scores,
                                     properties_filter=properties_filter)
    # Build and rank the variant graph.
    graph = VariantGraph()
    algorithm.collate(graph)
    ranking = VariantGraphRanking.of(graph)
    if near_match:
        # Near matching requires segmentation to be off.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        ranking = perform_near_match(graph, ranking)
    if segmentation:
        # Join parallel segments, then re-rank.
        join(graph)
        ranking = VariantGraphRanking.of(graph)
    if output in ("svg", "svg_simple"):
        return display_variant_graph_as_svg(graph, output)
    if output == "graph":
        return graph
    # Build the alignment table.
    table = AlignmentTable(collation, graph, layout)
    if collation.pretokenized and not segmentation:
        token_list = [[tk.token_data for tk in witness.tokens()]
                      for witness in collation.witnesses]
        # Only with segmentation=False; get_tokenized_at could behave
        # differently if segmentation were True.
        table = get_tokenized_at(table, token_list,
                                 segmentation=segmentation, layout=layout)
        if output in ("table", "html"):
            # For display, keep only the token string 't' rather than the
            # full token_data dict.
            for row in table.rows:
                row.cells = [cell["t"] for cell in row.cells]
    if output == "json":
        return export_alignment_table_as_json(table, layout=layout)
    if output == "html":
        return display_alignment_table_as_html(table)
    if output == "html2":
        return visualize_table_vertically_with_colors(table, collation)
    if output == "table":
        return table
    if output == "xml":
        return export_alignment_table_as_xml(table)
    if output == "tei":
        return export_alignment_table_as_tei(table, indent)
    if output in ("csv", "tsv"):
        return display_alignment_table_as_csv(table, output)
    raise Exception("Unknown output type: " + output)
def collate(collation, output="table", layout="horizontal", segmentation=True,
            near_match=False, astar=False, detect_transpositions=False,
            debug_scores=False, properties_filter=None, svg_output=None,
            indent=False, scheduler=None):
    """Collate the witnesses of *collation* and render the requested output.

    *collation* may be a Collation or a JSON-style dict with a "witnesses"
    list.  *scheduler* defaults to a fresh Scheduler per call.  Raises
    SegmentationError when near_match is combined with segmentation, and
    Exception for an unknown output type.
    """
    # BUG FIX: ``scheduler=Scheduler()`` as the default was evaluated once at
    # definition time, so all calls shared one Scheduler (mutable-default
    # pitfall).  Create a fresh Scheduler per call; explicit schedulers passed
    # by callers behave exactly as before.
    if scheduler is None:
        scheduler = Scheduler()
    # collation may be collation or json; if it's the latter, build a real collation
    if isinstance(collation, dict):
        json_collation = Collation()
        for witness in collation["witnesses"]:
            json_collation.add_witness(witness)
        collation = json_collation
    # assume collation is a Collation by now; no error trapping
    if not astar:
        algorithm = EditGraphAligner(collation, near_match=False,
                                     detect_transpositions=detect_transpositions,
                                     debug_scores=debug_scores,
                                     properties_filter=properties_filter)
    else:
        algorithm = ExperimentalAstarAligner(collation, near_match=False,
                                             debug_scores=debug_scores)
    # build graph
    graph = VariantGraph()
    algorithm.collate(graph, collation)
    ranking = VariantGraphRanking.of(graph)
    if near_match:
        # Segmentation not supported for near matching.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        highestRank = ranking.byVertex[graph.end]
        witnessCount = len(collation.witnesses)
        # do-while loop to avoid looping through ranking while modifying it
        rank = highestRank - 1
        condition = True
        while condition:
            rank = process_rank(scheduler, rank, collation, ranking, witnessCount)
            rank -= 1
            condition = rank > 0
    # join parallel segments
    if segmentation:
        join(graph)
        ranking = VariantGraphRanking.of(graph)
    # check which output format is requested: graph or table
    if output == "svg" or output == "svg_simple":
        return display_variant_graph_as_SVG(graph, svg_output, output)
    if output == "graph":
        return graph
    # create alignment table
    table = AlignmentTable(collation, graph, layout, ranking)
    if output == "json":
        return export_alignment_table_as_json(table)
    if output == "html":
        return display_alignment_table_as_HTML(table)
    if output == "html2":
        return visualizeTableVerticallyWithColors(table, collation)
    if output == "table":
        return table
    if output == "xml":
        return export_alignment_table_as_xml(table)
    if output == "tei":
        return export_alignment_table_as_tei(table, indent)
    else:
        raise Exception("Unknown output type: " + output)
def _rank_the_graph(self, phrase_matches, base):
    """Compute and return a fresh VariantGraphRanking for *base*.

    NOTE(review): *phrase_matches* is currently unused — see the TODO below;
    it is kept in the signature for the intended future refinement.
    """
    # TODO: rank the graph based on only the first vertex of each of the phrasematches!
    return VariantGraphRanking.of(base)
def collate(collation, output="table", layout="horizontal", segmentation=True,
            near_match=False, astar=False, detect_transpositions=False,
            debug_scores=False, properties_filter=None, indent=False):
    """Align the witnesses in *collation* and emit the chosen *output* form.

    A JSON-style dict (with a "witnesses" list) is converted into a real
    Collation first.  Raises SegmentationError if near_match is combined with
    segmentation, and Exception for an unrecognised output type.
    """
    # Convert a JSON-style dict into a real Collation.
    if isinstance(collation, dict):
        built = Collation()
        for witness in collation["witnesses"]:
            built.add_witness(witness)
        collation = built
    # Select the aligner implementation; no error trapping from here on.
    algorithm = (ExperimentalAstarAligner(collation, near_match=False,
                                          debug_scores=debug_scores)
                 if astar else
                 EditGraphAligner(collation, near_match=False,
                                  detect_transpositions=detect_transpositions,
                                  debug_scores=debug_scores,
                                  properties_filter=properties_filter))
    # Build the variant graph and its ranking.
    graph = VariantGraph()
    algorithm.collate(graph)
    ranking = VariantGraphRanking.of(graph)
    if near_match:
        # Near matching is only defined on an unsegmented graph.
        if segmentation:
            raise SegmentationError('segmentation must be set to False for near matching')
        ranking = perform_near_match(graph, ranking)
    if segmentation:
        join(graph)
        ranking = VariantGraphRanking.of(graph)
    # Graph outputs short-circuit before the alignment table is built.
    if output in ("svg", "svg_simple"):
        return display_variant_graph_as_svg(graph, output)
    if output == "graph":
        return graph
    table = AlignmentTable(collation, graph, layout, ranking)
    if output == "json":
        return export_alignment_table_as_json(table)
    elif output == "html":
        return display_alignment_table_as_html(table)
    elif output == "html2":
        return visualize_table_vertically_with_colors(table, collation)
    elif output == "table":
        return table
    elif output == "xml":
        return export_alignment_table_as_xml(table)
    elif output == "tei":
        return export_alignment_table_as_tei(table, indent)
    elif output in ("csv", "tsv"):
        return display_alignment_table_as_csv(table, output)
    else:
        raise Exception("Unknown output type: " + output)
def perform_near_match(graph, ranking):
    """Insert near-match edges into *graph* and return an updated ranking.

    Vertices are visited in reverse topological order.  Whenever an incoming
    edge skips over at least one rank and its source has no outgoing edge to
    the immediately following rank, the source is compared (Levenshtein
    ratio) with every vertex in the skipped ranks and linked to the best one.
    """
    ordering = reversed(list(topological_sort(graph.graph)))
    for node in ordering:
        for origin, _dst, _data in graph.in_edges(node, data=True):
            node_rank = ranking.byVertex[node]
            origin_rank = ranking.byVertex[origin]
            if node_rank - origin_rank <= 1:
                continue  # origin already sits directly before node
            # Skip origins that already have an out-edge exactly one rank ahead.
            if any(ranking.byVertex[b] - ranking.byVertex[a] == 1
                   for (a, b) in graph.out_edges(origin)):
                continue
            pool = [c for c in flatten([ranking.byRank[r]
                                        for r in range(origin_rank, node_rank)])
                    if c is not origin]
            scored = defaultdict(list)
            for contender in pool:
                scored[Levenshtein.ratio(str(origin), str(contender))].append(contender)
            best = max(scored)
            graph.connect_near(scored[best][0], origin, best)
            # Update the ranking table for the next pass through the loop.
            ranking = VariantGraphRanking.of(graph)
    # Fresh ranking table (passed along to creation of the alignment table).
    return VariantGraphRanking.of(graph)
def display_variant_graph_as_svg(graph, output):
    """Render *graph* (plus its near-match edges) as SVG and display it.

    Uses the ``graphviz`` package.  output == "svg_simple" draws plain node
    labels; anything else draws each node as an HTML-like table of readings
    with their exact rank.
    """
    a = graphviz.Digraph(format="svg", graph_attr={'rankdir': 'LR'})
    counter = 0
    mapping = {}
    ranking = VariantGraphRanking.of(graph)
    # add nodes
    for n in graph.graph.nodes():
        counter += 1
        mapping[n] = str(counter)  # graphviz node ids must be strings
        if output == "svg_simple":
            label = n.label
            if label == '':
                label = '#'  # placeholder for empty labels
            a.node(mapping[n], label=label)
        else:
            rank = ranking.byVertex[n]
            readings = [
                "<TR><TD ALIGN='LEFT'><B>" + n.label +
                "</B></TD><TD ALIGN='LEFT'>exact: " + str(rank) + "</TD></TR>"
            ]
            reverse_dict = defaultdict(list)
            for key, value in n.tokens.items():
                # BUG FIX: '<' and '>' must become HTML entities inside
                # Graphviz HTML-like labels; the previous substitutions
                # replaced each character with itself (no-ops).
                reverse_dict["".join(
                    re.sub(r'>', r'&gt;', re.sub(r'<', r'&lt;', item.token_data["t"]))
                    for item in value)].append(key)
            for key, value in sorted(reverse_dict.items()):
                reading = (
                    "<TR><TD ALIGN='LEFT'><FONT FACE='Bukyvede'>{}</FONT></TD><TD ALIGN='LEFT'>{}</TD></TR>"
                ).format(key, ', '.join(value))
                readings.append(reading)
            a.node(mapping[n], label='<<TABLE CELLSPACING="0">' + "".join(readings) + '</TABLE>>')
    # add regular (token sequence) edges
    for u, v, edgedata in graph.graph.edges(data=True):
        a.edge(mapping[u], mapping[v], label=edgedata['label'])
    # add near-match edges
    # TODO: Show all near edges (currently), or just the top one?
    for u, v, edgedata in graph.near_graph.edges(data=True):
        a.edge(mapping[u], mapping[v], style='dashed',
               label=str('{:3.2f}'.format(edgedata['weight'])))
    # Add rank='same' information to keep vertices of a rank aligned.
    for key, value in ranking.byRank.items():
        tmp = graphviz.Digraph(graph_attr={'rank': 'same'})
        for name in [mapping[item] for item in value]:
            tmp.node(name)
        a.subgraph(tmp)
    # render() writes the SVG file and returns its path; IPython's SVG
    # object accepts a path for inline display.
    svg = a.render()
    return display(SVG(svg))
def collate(self, graph):
    """Collate all witnesses of self.collation into *graph*.

    :type graph: VariantGraph

    The first witness seeds the graph (one vertex per token).  Every further
    witness is aligned against the current graph via a table filled by
    fill_needleman_wunsch_table over the graph's ranks, then merged in.  When
    self.detect_transpositions is set, transposition detection runs after all
    witnesses are merged.
    """
    # prepare the token index
    self.token_index.prepare()
    self.vertex_array = [None] * len(self.token_index.token_array)
    # Build the variant graph for the first witness
    # this is easy: generate a vertex for every token
    first_witness = self.collation.witnesses[0]
    tokens = first_witness.tokens()
    token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
    self.update_token_position_to_vertex(token_to_vertex)
    self.update_token_to_vertex_array(tokens, first_witness, self.token_position_to_vertex)
    # align witness 2 - n
    for x in range(1, len(self.collation.witnesses)):
        witness = self.collation.witnesses[x]
        tokens = witness.tokens()
        # re-rank the graph as it stands before adding this witness
        variant_graph_ranking = VariantGraphRanking.of(graph)
        variant_graph_ranks = list(set(map(lambda v: variant_graph_ranking.byVertex.get(v), graph.vertices())))
        # we leave in the rank of the start vertex, but remove the rank of the end vertex
        # NOTE(review): pop() removes the *last* element of list(set(...));
        # this relies on the set of int ranks iterating in ascending order
        # (a CPython int-hashing detail) — TODO confirm.
        variant_graph_ranks.pop()
        # now the vertical stuff
        tokens_as_index_list = self.as_index_list(tokens)
        match_cube = MatchCube(self.token_index, witness, self.vertex_array, variant_graph_ranking, self.properties_filter)
        self.fill_needleman_wunsch_table(variant_graph_ranks, tokens_as_index_list, match_cube)
        aligned = self.align_matching_tokens(match_cube)
        # merge the aligned witness into the graph and extend the
        # token-to-vertex mapping with its newly generated vertices
        witness_token_to_generated_vertex = self.merge(graph, witness.sigil, witness.tokens(), aligned)
        token_to_vertex.update(witness_token_to_generated_vertex)
        self.update_token_position_to_vertex(token_to_vertex, aligned)
        # restrict the position-to-vertex map to this witness's token range
        witness_token_position_to_vertex = {}
        for p in self.token_index.get_range_for_witness(witness.sigil):
            witness_token_position_to_vertex[p] = self.token_position_to_vertex[p]
        self.update_token_to_vertex_array(tokens, witness, witness_token_position_to_vertex)
    # change superbase
    # superbase = self.new_superbase
    if self.detect_transpositions:
        detector = TranspositionDetection(self)
        detector.detect()