예제 #1
0
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write linear-reference and haplotype paths as fasta and interval files.

    Loads the graph from ``graph_file_name`` and its sequences from
    ``graph_file_name + ".sequences"``, takes the first interval found in
    ``linear_ref_path_file_name`` as the linear reference, and then, for
    haplotype 0 and 1, walks the graph from its single start node to a node
    without outgoing edges.

    Files written:
      * ``linear_ref_<chromosome>.fasta`` (in the current directory)
      * ``<out_base_name>_<h>.intervalcollection`` and
        ``<out_base_name>_<h>.fasta`` for ``h`` in ``0, 1``

    Args:
        graph_file_name: Path passed to ``Graph.from_file``.
        linear_ref_path_file_name: Text interval collection whose first
            interval is the linear reference path.
        haplotype0_file_name: vg json file with intervals for haplotype 0.
        haplotype1_file_name: vg json file with intervals for haplotype 1.
        out_base_name: Prefix for the per-haplotype output files.
        chromosome: Chromosome name; used as fasta header and in the name
            of the linear reference fasta file.

    Raises:
        AssertionError: If the graph does not have exactly one start node,
            if there are no candidate haplotype nodes, or if a node is its
            own only haplotype successor.
        ValueError: If a node has outgoing edges but none of them leads to
            a candidate haplotype node or to a linear reference node.
    """
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file. The context manager closes the file
    # even if writing fails.
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.write(">%s\n" % chrom)
        out_file.write(linear_ref_seq + "\n")
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" %
                 len(linear_ref_nodes))

    # Collect the nodes touched by each haplotype's intervals.
    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    haplotype_file_names = (haplotype0_file_name, haplotype1_file_name)
    for haplotype, haplotype_file_name in enumerate(haplotype_file_names):
        for interval in vg_json_file_to_intervals(haplotype_file_name, graph):
            haplotype_nodes[haplotype].update(interval.region_paths)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    # NOTE(review): the traversal does not use haplotype_nodes (they only
    # feed the log lines above). Every node id below the highest linear
    # reference id that is not on the linear reference is treated as a
    # haplotype node, so both haplotypes follow the same path. Confirm that
    # this is intended. The set does not depend on the haplotype, so it is
    # built once here.
    non_reference_nodes = set(range(
        0, max(linear_ref_nodes))).difference(linear_ref_nodes)

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)

        nodes_in_haplotype = non_reference_nodes
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))

        assert len(
            nodes_in_haplotype
        ) > 0, "There are no haplotype nodes. Check that haplotype json files are not empty"

        nodes = []
        node = first_nodes[0]
        # Only steps with exactly one haplotype successor are counted here.
        n_haplotype_nodes = 0
        i = 0
        while True:
            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])
            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                (next_node,) = next_on_haplotype
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                # Lazy formatting: this runs once per node in a hot loop.
                logging.debug(
                    "No new haplotype node from %d. Will follow reference",
                    node)
                next_on_reference = next_nodes.intersection(linear_ref_nodes)
                if not next_on_reference:
                    raise ValueError(
                        "Node %d has outgoing edges, but none to a haplotype "
                        "or linear reference node" % node)
                # Choose reference with lowest id to avoid deletion
                node = min(next_on_reference)
            else:
                # More than one next node is on haplotype. Choose the one
                # with lowest id to avoid taking deletion
                node = min(next_on_haplotype)

        logging.info("Found %d nodes. %d on haplotype" %
                     (len(nodes), n_haplotype_nodes))
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval
                            ]).to_file(file_base_name + ".intervalcollection",
                                       text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        fasta_file_name = file_base_name + ".fasta"
        with open(fasta_file_name, "w") as out_file:
            out_file.write(">%s\n" % chrom)
            out_file.write(sequence + "\n")
        logging.info("Wrote fasta sequence to %s" % fasta_file_name)
예제 #2
0
    def predict_path(self):
        """Predict a path through the graph and write it as fasta and interval.

        Starts at the graph's single start node and repeatedly moves to a
        successor until a node without outgoing edges is reached, or until
        ``self.max_nodes_to_traverse`` (when not None) is exceeded. When a
        node has several successors, each edge is scored by
        ``self.edge_counts["<node>-<next_node>"]``, plus
        ``self.linear_ref_bonus`` if the successor is in
        ``self.linear_path_nodes``. A successor replaces the current best
        when it scores strictly higher, or when it scores at least as high
        and is on the linear reference. Once a linear reference candidate
        has been accepted, an equally scored candidate with a higher id is
        ignored.

        Files written:
          * ``<out_file_base_name>_<chromosome>.fasta``
          * ``<out_file_base_name>_<chromosome>.intervalcollection``

        Raises:
            AssertionError: If the graph does not have exactly one start
                node, or the start node is not on the linear reference.
            Exception: If a branch with zero support selects a node outside
                the linear reference although a linear reference successor
                exists.
        """
        logging.info("Using linear bonus %d on chromosome %s" % (self.linear_ref_bonus, self.chromosome))
        logging.info("Using linear out base name %s" % self.out_file_base_name)

        # The output file is opened before traversing, as before. The
        # context manager closes it even when traversal raises.
        with open("%s_%s.fasta" % (self.out_file_base_name, self.chromosome), "w") as out_file:
            # Traverse
            first_nodes = self.graph.get_first_blocks()
            assert len(first_nodes) == 1

            logging.info("N nodes in graph: %d" % len(self.graph.blocks))

            node = first_nodes[0]
            assert node in self.linear_path_nodes, "Start node should be in linear ref"

            path = []
            n_ambigious = 0
            n_special_case = 0
            i = 0
            while True:
                if i % 1000000 == 0:
                    logging.info("%d nodes in graph traversed on chrom %s" % (i, self.chromosome))
                i += 1

                if self.max_nodes_to_traverse is not None and i > self.max_nodes_to_traverse:
                    logging.warning("Stopped traversing before end because max node to traverse was set")
                    break

                path.append(node)

                # len() rather than truthiness: the adjacency container may
                # be array-like.
                next_nodes = self.graph.adj_list[node]
                if len(next_nodes) == 0:
                    logging.info("Done on node %d" % node)
                    break
                if len(next_nodes) == 1:
                    node = next_nodes[0]
                    continue

                # Several successors: pick the best supported one.
                node_on_linear_ref = node in self.linear_path_nodes
                most_reads = 0
                most_reads_node = next_nodes[0]
                has_found_candidate_on_linear_ref = False

                for next_node in next_nodes:
                    n_reads = self.edge_counts["%s-%s" % (node, next_node)]
                    next_on_linear_ref = next_node in self.linear_path_nodes
                    if next_on_linear_ref:
                        n_reads += self.linear_ref_bonus

                    if n_reads > most_reads or (n_reads >= most_reads and next_on_linear_ref):
                        if not node_on_linear_ref:
                            n_special_case += 1

                        # If already found something on linear ref, and this does not
                        # have more reads or lower id (not insertion), ignore
                        if has_found_candidate_on_linear_ref and n_reads == most_reads \
                                and next_node > most_reads_node:
                            continue  # Ignore this alternative

                        most_reads_node = next_node
                        most_reads = n_reads

                        if next_on_linear_ref:
                            has_found_candidate_on_linear_ref = True

                node = most_reads_node

                if most_reads == 0:
                    n_ambigious += 1

                    # With no support at all, the linear ref path must have
                    # been taken if one exists.
                    if node not in self.linear_path_nodes and \
                            any(n in self.linear_path_nodes for n in next_nodes):
                        logging.error("Chose node %d as next, but it is not in linear ref." % node)
                        logging.error("Next nodes are: %s" % (next_nodes,))

                        for next_node in next_nodes:
                            if next_node in self.linear_path_nodes:
                                logging.error("    Node %d is in linear ref" % next_node)
                            else:
                                logging.error("    Node %d is not in linear ref" % next_node)

                        raise Exception("Could not traverse correctly")

            # Find statistics of chosen nodes
            nodes_chosen = set(path)
            n_on_linear = len(nodes_chosen.intersection(self.linear_path_nodes))
            n_not_on_linear = len(nodes_chosen) - n_on_linear

            linear_ref_interval = Interval(0, self.graph.blocks[path[-1]].length(), path, self.graph)
            IntervalCollection([linear_ref_interval]).to_file(
                "%s_%s.intervalcollection" % (self.out_file_base_name, self.chromosome),
                text_file=True)

            logging.info("=== STATS FOR CHROMOSOME %s ===" % self.chromosome)
            logging.info("N ambigious choices: %d" % n_ambigious)
            logging.info("Total nodes in linear ref: %d" % len(self.linear_path_nodes))
            logging.info("N nodes chosen that are not in linear ref: %d " % n_not_on_linear)
            logging.info("N nodes chosen that are in linear ref: %d " % n_on_linear)
            logging.info("N special case: %d" % n_special_case)
            logging.info("N nodes in path: %d" % len(path))
            logging.info("Linear path length: %d" % linear_ref_interval.length())

            sequence = self.sequence_graph.get_interval_sequence(linear_ref_interval)

            out_file.write(">%s\n" % self.chromosome)
            out_file.write(sequence + "\n")