def make_haplotype_paths(
    graph_file_name,
    linear_ref_path_file_name,
    haplotype0_file_name,
    haplotype1_file_name,
    out_base_name,
    chromosome,
):
    """Write a linear reference fasta and, per haplotype, an interval and fasta.

    Steps, in order:

    1. Load the graph from ``graph_file_name`` and its sequences from
       ``graph_file_name + ".sequences"``.
    2. Take the first interval in ``linear_ref_path_file_name`` as the linear
       reference and write its sequence to ``linear_ref_<chromosome>.fasta``
       (a relative path, i.e. not prefixed with ``out_base_name``).
    3. Read the two haplotype json files and log how many nodes each covers.
    4. For haplotype 0 and 1, walk the graph from its single first block to a
       node without outgoing edges, and write the resulting path to
       ``<out_base_name>_<haplotype>.intervalcollection`` and
       ``<out_base_name>_<haplotype>.fasta``.

    Args:
        graph_file_name: Path of the graph file.
        linear_ref_path_file_name: Text interval collection file whose first
            interval is the linear reference path.
        haplotype0_file_name: vg json file for haplotype 0.
        haplotype1_file_name: vg json file for haplotype 1.
        out_base_name: Prefix for the per-haplotype output files.
        chromosome: Name used as fasta header and in the linear reference
            file name.

    Raises:
        AssertionError: If the graph does not have exactly one first block,
            if there are no candidate haplotype nodes, or if a node is its
            own successor on the haplotype.
        ValueError: If a node has successors but none of them is a candidate
            haplotype node or a linear reference node.
    """
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name, text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file. The context manager makes sure the
    # handle is closed even if writing fails.
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.write(">%s\n" % chrom)
        out_file.write(linear_ref_seq + "\n")
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d", len(linear_ref_nodes))

    # Collect the nodes covered by each haplotype json file (index 0 and 1).
    haplotype_nodes = [set(), set()]
    haplotype_file_names = (haplotype0_file_name, haplotype1_file_name)
    for haplotype, haplotype_file_name in enumerate(haplotype_file_names):
        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)
        for interval in intervals:
            haplotype_nodes[haplotype].update(interval.region_paths)

    logging.info("N nodes in haplotype 0: %d", len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d",
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d", len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d", len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d", haplotype)
        nodes = []
        node = first_nodes[0]

        # NOTE(review): the node sets read from the haplotype json files
        # (haplotype_nodes) are only used for the log lines above. The
        # traversal treats every node id below max(linear_ref_nodes) that is
        # not on the linear reference as a haplotype node, so both haplotypes
        # follow the same rule. This mirrors the existing behaviour (the
        # original assigned haplotype_nodes[haplotype] and immediately
        # overwrote it) -- confirm whether that is intended.
        nodes_in_haplotype = set(range(0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes", len(nodes_in_haplotype))
        assert len(nodes_in_haplotype) > 0, \
            "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:
            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d", i, node)
            i += 1

            next_nodes = set(graph.adj_list[node])
            if not next_nodes:
                logging.info("Reached end node %d with 0 edges", node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = next(iter(next_on_haplotype))
                assert next_node != node
                node = next_node
            elif not next_on_haplotype:
                logging.debug("No new haplotype node from %d. Will follow reference", node)
                # Choose reference with lowest id to avoid deletion
                next_on_reference = next_nodes.intersection(linear_ref_nodes)
                if not next_on_reference:
                    # Same exception type min() raised on an empty sequence,
                    # but with a message that says what went wrong.
                    raise ValueError(
                        "Node %d has outgoing edges, but none lead to a haplotype "
                        "or linear reference node" % node)
                node = min(next_on_reference)
            else:
                # More than one next node is on haplotype. Choose the one with
                # lowest id to avoid taking deletion
                node = min(next_on_haplotype)

        logging.info("Found %d nodes. %d on haplotype", len(nodes), n_haplotype_nodes)

        # The path covers every traversed node in full: from offset 0 in the
        # first node to the end of the last node.
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(), nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval]).to_file(
            file_base_name + ".intervalcollection", text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        with open(file_base_name + ".fasta", "w") as out_file:
            out_file.write(">%s\n" % chrom)
            out_file.write(sequence + "\n")
        logging.info("Wrote fasta sequence to %s.fasta", file_base_name)
def predict_path(self):
    """Predict a path through the graph and write it as interval and fasta.

    Starting from the graph's single first block, the traversal repeatedly
    moves to one successor until a node without outgoing edges is reached
    (or ``self.max_nodes_to_traverse`` nodes have been visited). When a node
    has several successors, the one with the highest edge count in
    ``self.edge_counts`` (keyed ``"<from>-<to>"``) wins; successors on the
    linear reference get ``self.linear_ref_bonus`` added to their count and
    also win ties.

    The resulting path is written to
    ``<out_file_base_name>_<chromosome>.intervalcollection`` and its sequence
    to ``<out_file_base_name>_<chromosome>.fasta``. Summary statistics are
    logged. Nothing is returned.

    Raises:
        AssertionError: If the graph does not have exactly one first block or
            that block is not on the linear reference.
        Exception: If no successor had any (bonus-adjusted) reads, a linear
            reference successor existed, and a non-reference node was chosen
            anyway.
    """
    logging.info("Using linear bonus %d on chromosome %s", self.linear_ref_bonus, self.chromosome)
    logging.info("Using linear out base name %s", self.out_file_base_name)

    # The fasta file is still opened before the traversal starts (so an
    # unwritable location fails early), but inside a context manager so the
    # handle is released even if the traversal raises.
    with open("%s_%s.fasta" % (self.out_file_base_name, self.chromosome), "w") as out_file:
        # Traverse
        first_nodes = self.graph.get_first_blocks()
        assert len(first_nodes) == 1
        logging.info("N nodes in graph: %d", len(self.graph.blocks))

        node = first_nodes[0]
        assert node in self.linear_path_nodes, "Start node should be in linear ref"

        path = []
        n_ambigious = 0
        n_special_case = 0
        i = 0

        while True:
            if i % 1000000 == 0:
                logging.info("%d nodes in graph traversed on chrom %s", i, self.chromosome)
            i += 1

            if self.max_nodes_to_traverse is not None and i > self.max_nodes_to_traverse:
                logging.warning("Stopped traversing before end because max node to traverse was set")
                break

            path.append(node)
            next_nodes = self.graph.adj_list[node]

            if len(next_nodes) == 0:
                logging.info("Done on node %d", node)
                break

            if len(next_nodes) == 1:
                # Only one way forward, nothing to decide.
                node = next_nodes[0]
                continue

            # Several successors: pick the one with the most read support.
            most_reads = 0
            most_reads_node = next_nodes[0]
            has_found_candidate_on_linear_ref = False

            for next_node in next_nodes:
                n_reads = self.edge_counts["%s-%s" % (node, next_node)]
                if next_node in self.linear_path_nodes:
                    n_reads += self.linear_ref_bonus

                # A successor is a candidate if it beats the current best, or
                # if it ties with the best and is on the linear reference.
                if n_reads > most_reads or (n_reads >= most_reads and next_node in self.linear_path_nodes):
                    if node not in self.linear_path_nodes:
                        n_special_case += 1

                    # If already found something on linear ref, and this does
                    # not have more reads or lower id (not insertion), ignore
                    if has_found_candidate_on_linear_ref and n_reads == most_reads and next_node > most_reads_node:
                        continue  # Ignore this alternative

                    most_reads_node = next_node
                    most_reads = n_reads
                    if next_node in self.linear_path_nodes:
                        has_found_candidate_on_linear_ref = True

            if most_reads == 0:
                n_ambigious += 1

            node = most_reads_node

            if most_reads == 0:
                # Assert we have taken linear ref path if exists
                if any(n in self.linear_path_nodes for n in next_nodes):
                    if node not in self.linear_path_nodes:
                        logging.error("Chose node %d as next, but it is not in linear ref.", node)
                        logging.error("Next nodes are: %s", next_nodes)
                        for next_node in next_nodes:
                            if next_node in self.linear_path_nodes:
                                logging.error("     Node %d is in linear ref", next_node)
                            else:
                                logging.error("     Node %d is not in linear ref", next_node)
                        raise Exception("Could not traverse correctly")

        # Find statistics of chosen nodes
        nodes_chosen = set(path)
        n_on_linear = len(nodes_chosen.intersection(self.linear_path_nodes))
        n_not_on_linear = len(nodes_chosen) - n_on_linear

        # The interval covers every node in the path in full: from offset 0
        # in the first node to the end of the last node.
        linear_ref_interval = Interval(0, self.graph.blocks[path[-1]].length(), path, self.graph)
        IntervalCollection([linear_ref_interval]).to_file(
            "%s_%s.intervalcollection" % (self.out_file_base_name, self.chromosome),
            text_file=True)

        logging.info("=== STATS FOR CHROMOSOME %s ===", self.chromosome)
        logging.info("N ambigious choices: %d", n_ambigious)
        logging.info("Total nodes in linear ref: %d", len(self.linear_path_nodes))
        logging.info("N nodes chosen that are not in linear ref: %d ", n_not_on_linear)
        logging.info("N nodes chosen that are in linear ref: %d ", n_on_linear)
        logging.info("N special case: %d", n_special_case)
        logging.info("N nodes in path: %d", len(path))
        logging.info("Linear path length: %d", linear_ref_interval.length())

        sequence = self.sequence_graph.get_interval_sequence(linear_ref_interval)
        out_file.write(">%s\n" % self.chromosome)
        out_file.write(sequence + "\n")