Пример #1
0
 def __init__(self, tf_experiment_dir, data_dir):
     """Open the experiment's alignments and graph data, then check peaks.

     Args:
         tf_experiment_dir: Directory containing ``linear_alignments.bam``
             and ``5_alignments.pickle``.
         data_dir: Directory containing ``5_linear_pathv2.interval`` and
             ``5.nobg``.
     """
     self.experiment_dir = tf_experiment_dir
     self.data_dir = data_dir

     # Linear (BAM) alignments of the experiment, opened for binary reading.
     bam_file_name = self.experiment_dir + "/linear_alignments.bam"
     self.bam_file = pysam.AlignmentFile(bam_file_name, "rb")

     # Graph data; every file name carries the hard-coded prefix "5"
     # (presumably chromosome 5 -- TODO confirm).
     linear_path_file_name = self.data_dir + "/5_linear_pathv2.interval"
     self.linear_path = NumpyIndexedInterval.from_file(linear_path_file_name)

     graph_file_name = self.data_dir + "/5.nobg"
     self.graph = Graph.from_file(graph_file_name)

     # The alignment collection is loaded against the graph read above.
     collection_file_name = self.experiment_dir + "/5_alignments.pickle"
     self.alignment_collection = AlignmentCollection.from_file(
         collection_file_name, self.graph)

     self.check_peaks()
Пример #2
0
def macs_to_graph_peaks(folder, data_dir="/data/bioinf/tair2/",
                        chromosomes=("1", "2", "3", "4", "5")):
    """Convert per-chromosome MACS summit sequences to graph peak files.

    For every chromosome the graph ``<data_dir><chrom>.nobg`` is loaded, the
    peaks in ``<folder>/macs_sequences_chr<chrom>_summits_unique.fasta`` are
    read against it and written to
    ``<folder>/<chrom>_macs_unique_graph_summits.intervalcollection``.

    Args:
        folder: Directory holding the MACS fasta files; output is written
            to the same directory.
        data_dir: Prefix of the graph files. It is concatenated directly
            with the chromosome name, so it must end with a path separator.
            Defaults to the previously hard-coded location.
        chromosomes: Iterable of chromosome names to process. Defaults to
            the previously hard-coded chromosomes 1-5.
    """
    # The linear path used to be loaded here as well but was never used,
    # so that (potentially expensive) read has been dropped.
    for chrom in chromosomes:
        graph = Graph.from_file(data_dir + chrom + ".nobg")
        macs_peaks = PeakCollection.from_fasta_file(
            folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom,
            graph)
        # Second positional argument is kept as in the original call
        # (presumably the text_file flag -- TODO confirm).
        macs_peaks.to_file(
            folder +
            "/%s_macs_unique_graph_summits.intervalcollection" % chrom, True)
def run_predict_path(args):
    """Predict a path for every chromosome in parallel and merge the output.

    One process per chromosome runs ``run_predict_path_single_chromosome``.
    After all processes have finished, the per-chromosome
    ``<out_file_name>_<chromosome>.fasta`` files are concatenated into
    ``<out_file_name>.fa``, an indexed interval is written next to every
    ``<out_file_name>_<chromosome>.intervalcollection`` file and, unless
    ``args.skip_bwa_index`` is set, ``run_bwa_index`` is run on the merged
    fasta.

    Args:
        args: Object with the attributes ``chromosomes`` (comma-separated
            string), ``alignments``, ``data_dir`` (concatenated directly
            with the chromosome name, so it must end with a path
            separator), ``linear_ref_bonus``, ``out_file_name``,
            ``max_nodes_to_traverse`` and ``skip_bwa_index``.

    Raises:
        SystemExit: With status 1 when ``args.alignments`` is not a file.
        AssertionError: When an interval collection file does not contain
            exactly one interval.
    """
    chromosomes = args.chromosomes.split(",")
    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" %
                      args.alignments)
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    processes = []
    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced. The context manager makes
    # sure the merged file is flushed and closed before bwa index reads it.
    merged_fasta_name = args.out_file_name + ".fa"
    with open(merged_fasta_name, "w") as out_fasta:
        logging.info("Merging fasta files")
        for chromosome in tqdm(chromosomes):
            with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
                out_fasta.write(f.read())

    logging.info("Wrote resulting linear reference to %s" % merged_fasta_name)

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name,
                                                 text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(
            intervals) == 1, "Only a single interval in file is supported"
        indexed = intervals[0].to_numpy_indexed_interval()
        indexed_file_name = file_name + ".indexed"
        indexed.to_file(indexed_file_name)
        logging.info("Wrote indexed interval to file %s" % indexed_file_name)

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(merged_fasta_name)
    else:
        logging.info("Not creating bwa index")
Пример #4
0
def read_graphs(graph_dir, chromosomes):
    """Load the graph and sequence graph of every requested chromosome.

    Args:
        graph_dir: Prefix of the graph files. It is concatenated directly
            with the chromosome name, so it must end with a path separator.
        chromosomes: Iterable of chromosome names.

    Returns:
        A tuple ``(graphs, sequence_graphs, linear_ref_nodes)`` of dicts
        keyed by chromosome name, where chromosome ``"X"`` is stored under
        the key ``"23"``. Every value in ``linear_ref_nodes`` is ``None``.
    """
    logging.info("Reading graphs")
    graphs, sequence_graphs, linear_ref_nodes = {}, {}, {}

    for chromosome in chromosomes:
        # "X" is stored under "23"; file names still use the original name.
        key = "23" if chromosome == "X" else chromosome
        logging.info("Reading graphs for chromosome %s" % chromosome)

        graph_file_name = graph_dir + chromosome + ".nobg"
        graphs[key] = Graph.from_file(graph_file_name)

        sequence_file_name = graph_dir + chromosome + ".nobg.sequencesv2"
        sequence_graphs[key] = SequenceGraph.from_file(sequence_file_name)

        # Reading the linear reference nodes from
        # "<chromosome>_linear_pathv2.interval" is disabled; only a
        # placeholder is stored.
        linear_ref_nodes[key] = None

    return graphs, sequence_graphs, linear_ref_nodes
def run_predict_path_single_chromosome(alignment_file_name, chromosome,
                                       graph_dir, linear_ref_bonus,
                                       out_file_base_name,
                                       max_nodes_to_traverse):
    """Load the graph data of one chromosome and construct a PathPredicter.

    Args:
        alignment_file_name: Alignments file handed to ``PathPredicter``.
        chromosome: Chromosome name, used to build the input file names.
        graph_dir: Prefix of the graph files. The graph and sequence graph
            names are built by appending the chromosome directly to it.
        linear_ref_bonus: Forwarded to ``PathPredicter``.
        out_file_base_name: Forwarded to ``PathPredicter``.
        max_nodes_to_traverse: Forwarded to ``PathPredicter``.
    """
    sequence_file_name = graph_dir + chromosome + ".nobg.sequences"
    sequence_graph = SequenceGraph.from_file(sequence_file_name)

    graph_file_name = graph_dir + chromosome + ".nobg"
    graph = Graph.from_file(graph_file_name)

    linear_path_file_name = graph_dir + "/%s_linear_pathv2.interval" % chromosome
    linear_path = NumpyIndexedInterval.from_file(linear_path_file_name)

    # The instance is not kept; presumably the constructor does all the
    # work (TODO confirm against PathPredicter).
    PathPredicter(
        alignment_file_name,
        graph,
        sequence_graph,
        chromosome,
        linear_path,
        out_file_base_name,
        linear_ref_bonus=linear_ref_bonus,
        max_nodes_to_traverse=max_nodes_to_traverse,
    )
Пример #6
0
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write the linear reference and two haplotype paths to files.

    The linear reference sequence is written to
    ``linear_ref_<chromosome>.fasta`` in the current working directory. For
    each haplotype (0 and 1) the graph is traversed from its single first
    node to a node without outgoing edges, and the resulting path is written
    to ``<out_base_name>_<haplotype>.intervalcollection`` and
    ``<out_base_name>_<haplotype>.fasta``.

    Args:
        graph_file_name: Graph file; the sequence graph is read from the
            same name with ``.sequences`` appended.
        linear_ref_path_file_name: Text interval collection whose first
            interval is the linear reference path.
        haplotype0_file_name: vg json file with intervals of haplotype 0.
        haplotype1_file_name: vg json file with intervals of haplotype 1.
        out_base_name: Prefix of the per-haplotype output files.
        chromosome: Name used in the fasta headers and in the linear
            reference fasta file name.

    Raises:
        AssertionError: If the graph does not have exactly one first node,
            if there are no candidate haplotype nodes, or if a traversal
            step would stay on the same node.
        ValueError: If a node has outgoing edges but none of them leads to a
            haplotype or linear reference node.
    """
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.write(">%s\n" % chrom)
        out_file.write(linear_ref_seq + "\n")
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" %
                 len(linear_ref_nodes))

    # Collect the nodes touched by the intervals of each haplotype file
    haplotype_file_names = (haplotype0_file_name, haplotype1_file_name)
    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype, haplotype_file_name in enumerate(haplotype_file_names):
        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)
        for interval in intervals:
            haplotype_nodes[haplotype].update(interval.region_paths)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)

        nodes = []
        node = first_nodes[0]
        # NOTE(review): the node sets read from the haplotype json files
        # (haplotype_nodes[haplotype]) are NOT used for the traversal. As in
        # the original code, every node id below the highest linear
        # reference node that is not on the linear reference is treated as a
        # haplotype node, which makes both haplotypes follow the same path.
        # Confirm whether haplotype_nodes[haplotype] was intended here.
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))

        assert len(
            nodes_in_haplotype
        ) > 0, "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:

            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])

            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = next(iter(next_on_haplotype))
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference" %
                    node)
                reference_candidates = next_nodes.intersection(
                    linear_ref_nodes)
                if not reference_candidates:
                    # min() would raise the same exception type, but without
                    # saying which node the traversal got stuck on.
                    raise ValueError(
                        "Node %d has no haplotype or linear reference "
                        "successor" % node)
                # Choose reference with lowest id to avoid deletion
                node = min(reference_candidates)
            else:
                # More than one next node is on the haplotype. Choose the one
                # with lowest id to avoid taking a deletion.
                node = min(next_on_haplotype)

        logging.info("Found %d nodes. %d on haplotype" %
                     (len(nodes), n_haplotype_nodes))
        # The path covers every traversed node in full: it starts at offset
        # 0 of the first node and ends at the length of the last node.
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval
                            ]).to_file(file_base_name + ".intervalcollection",
                                       text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        fasta_file_name = file_base_name + ".fasta"
        with open(fasta_file_name, "w") as out_file:
            out_file.write(">%s\n" % chrom)
            out_file.write(sequence + "\n")
        logging.info("Wrote fasta sequence to %s" % fasta_file_name)
Пример #7
0
def count_variants_in_graph(graph, linear_path):
    """Count the extra outgoing edges of the nodes on the linear path.

    Every node of ``graph.blocks`` that is contained in
    ``linear_path.nodes_in_interval()`` contributes its number of outgoing
    edges minus one (never less than zero). Progress is printed for every
    millionth node and the total is printed before returning.

    Args:
        graph: Object providing ``blocks`` (iterable of node ids) and
            ``adj_list`` (node id -> outgoing neighbours).
        linear_path: Object providing ``nodes_in_interval()``.

    Returns:
        The total number of extra edges (variants) found.
    """
    reference_nodes = linear_path.nodes_in_interval()
    total = 0

    for index, node in enumerate(graph.blocks):
        if index % 1000000 == 0:
            print("Node #%d" % index)

        if node in reference_nodes:
            # Only edges beyond the first one count as variants.
            extra_edges = len(graph.adj_list[node]) - 1
            if extra_edges > 0:
                total += extra_edges

    print("Variants: %d" % total)
    return total


if __name__ == "__main__":
    # Command line: argv[1] is the directory with the graph files, argv[2] a
    # comma-separated list of chromosome names.
    total_variants = 0
    for chromosome in sys.argv[2].split(","):
        print("Chromosome %s" % chromosome)

        # Both input files share the prefix "<directory>/<chromosome>".
        file_prefix = sys.argv[1] + "/" + chromosome
        graph = Graph.from_file(file_prefix + "_pruned.nobg")
        linear_path = NumpyIndexedInterval.from_file(
            file_prefix + "_linear_pathv2.interval")

        total_variants += count_variants_in_graph(graph, linear_path)

    print("Total: %d" % total_variants)
Пример #8
0
import sys
from pyvg.alignmentcollection import AlignmentCollection
from offsetbasedgraph import Graph
import logging


# Command "create_alignment_collection":
#   argv[2] = vg json file, argv[3] = graph file, argv[4] = output file
if sys.argv[1] == "create_alignment_collection":
    vg_json_file_name = sys.argv[2]
    graph = Graph.from_file(sys.argv[3])
    collection = AlignmentCollection.from_vg_json_file(vg_json_file_name,
                                                       graph)

    out_file_name = sys.argv[4]
    collection.to_file(out_file_name)
    logging.info("Wrote to file %s" % out_file_name)
        minimizer_offset)
    return minimizer_start_position.region_path_id, minimizer_start_position.offset


if __name__ == "__main__":
    # Command line: argv[2] is the chromosome name, argv[3] the directory
    # holding the graph files of that chromosome.
    graph_dir = sys.argv[3]
    chromosome = sys.argv[2]

    kmer_cache = {}
    n_cache_hits = 0

    # Prepare the database for this chromosome, then open it.
    make_databse(chromosome)
    minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome)
    c = minimizer_db.cursor()

    graph = Graph.from_file(graph_dir + "/%s.nobg" % chromosome)
    # The "/" separator was missing here, unlike for the two other files
    # read from graph_dir, so the sequence graph could only be found when
    # graph_dir happened to end with a slash.
    sequence_graph = SequenceGraph.from_file(graph_dir +
                                             "/%s.nobg.sequences" % chromosome)
    linear_ref_path = NumpyIndexedInterval.from_file(
        graph_dir + "/%s_linear_pathv2.interval" % chromosome)

    # From here on the chromosome is an integer; "X" is mapped to 23.
    if chromosome == "X":
        chromosome = 23
    chromosome = int(chromosome)

    prev_minimizer_on_node = defaultdict(set)

    # State carried between iterations of the processing that follows.
    i = 0
    prev_kmer = ""
    prev_minimizer_hash = None
    prev_minimizer_pos = None
Пример #10
0
from graph_minimap.minimizer_finder import Minimizers, MinimizerFinder
from offsetbasedgraph import Graph, SequenceGraph, NumpyIndexedInterval
import sys
import pickle
from graph_minimap.find_minimizers_in_kmers import make_databse
import sqlite3

# Command line: argv[1] is the chromosome name, argv[2] the prefix of the
# graph files. The prefix is concatenated directly with the chromosome name,
# so it must end with a path separator.
chromosome = sys.argv[1]
graph_dir = sys.argv[2]

# Prepare the database for this chromosome, then open it.
make_databse(chromosome)
minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome)
c = minimizer_db.cursor()
graph = Graph.from_file(graph_dir + chromosome + ".nobg")
sequence_graph = SequenceGraph.from_file(graph_dir + chromosome +
                                         ".nobg.sequences")
linear_ref = NumpyIndexedInterval.from_file(graph_dir + chromosome +
                                            "_linear_pathv2.interval")

# NOTE(review): pickle.load executes arbitrary code from the file; only use
# critical_nodes files from a trusted source.
# The context manager closes the file handle, which was previously leaked.
with open(graph_dir + chromosome + ".critical_nodes", "rb") as critical_nodes_file:
    critical_nodes = pickle.load(critical_nodes_file)

finder = MinimizerFinder(graph,
                         sequence_graph,
                         critical_nodes,
                         linear_ref,
                         k=21,
                         w=10,
                         database=c,
                         chromosome=chromosome)
finder.find_minimizers()
print("Writing to db")
Пример #11
0
import sys
import numpy as np
from offsetbasedgraph import NumpyIndexedInterval, Graph, Interval, SequenceGraph
from pyvg.conversion import vg_json_file_to_interval_collection

import sys
# Command line: argv[1] is a vg json file with alignments, argv[2] the graph
# file (its sequence graph is read from "<argv[2]>.sequences"), argv[3] a
# text file listing one path name per line.
graph = Graph.from_file(sys.argv[2])
sequence_graph = SequenceGraph.from_file(sys.argv[2] + ".sequences")


alignments = vg_json_file_to_interval_collection(sys.argv[1], graph)
# Materialise the alignments so they can be counted.
alignment_intervals = list(alignments)
# Counter kept from the original loop (starts at 1, one step per alignment)
# in case code outside this view reads it.
i = 1 + len(alignment_intervals)

print("Found %d alignments" % len(alignment_intervals))

paths = []
j = 0
# The context manager closes the list file, which was previously leaked.
with open(sys.argv[3]) as f:
    for path in f:
        path_name = path.strip()
        if not path_name:
            # A blank line (e.g. at the end of the file) is not a path name.
            continue
        indexed = NumpyIndexedInterval.from_file(path_name + ".interval")
        interval = indexed.get_exact_subinterval(0, indexed.length())  # Hack to get interval
        interval.graph = graph
        paths.append(interval)
        j += 1

print("Found %d paths" % len(paths))
Пример #12
0
import sys
import logging
logging.basicConfig(level=logging.DEBUG)
from offsetbasedgraph import Graph, NumpyIndexedInterval
from offsetbasedgraph.vcfmap import load_variant_maps
from graph_peak_caller.postprocess.maxpaths import SparseMaxPaths
from graph_peak_caller.sparsediffs import SparseValues
from graph_peak_caller.peakcollection import PeakCollection

# Command line: argv[1] is the chromosome name, argv[2] the fragment length
# (the minimum length of a max path that is kept).
chrom = sys.argv[1]
fragment_length = int(sys.argv[2])

ref = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")


graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
direct = SparseValues.from_sparse_files(chrom + "_direct_pileup")
filtered_peaks = SparseValues.from_sparse_files(chrom + "_hole_cleaned")
variant_map = load_variant_maps(chrom, "/data/bioinf/tair2/")

max_paths, sub_graphs = SparseMaxPaths(filtered_peaks, graph, direct, ref, variant_map).run()
long_maxpaths = [path for path in max_paths if path.length() >= fragment_length]

# The loop used to iterate the undefined name `long_max_paths` and therefore
# always failed with a NameError; it now uses the list built above.
for max_path in long_maxpaths:
    assert max_path.length() > 0, "Max path %s has negative length" % max_path
    # NOTE(review): this line still cannot run as written. `self` does not
    # exist at module level and `np` is not imported in the visible import
    # block; the statement looks copied from a method. The q-value pileup
    # that should be used for scoring is not loaded anywhere in this script
    # and has to be supplied before the scores can be computed.
    score = np.max(self.q_values.get_interval_values(max_path))
    max_path.set_score(score)
    assert not np.isnan(score), "Score %s is nan" % score


PeakCollection(long_maxpaths).to_file(chrom + "_max_paths.intervalcollection", text_file=True)
Пример #13
0
from graph_peak_caller.control.snarls import SnarlGraphBuilder, SnarlGraph
from offsetbasedgraph import Graph, Block

# Input files, expected in the current working directory.
graph_file_name = "haplo1kg50-mhc.obg"
snarls_file_name = "haplo1kg50-mhc.snarls"

graph = Graph.from_file(graph_file_name)
print("N blocks: %d" % len(graph.blocks))

# Disabled debug output (previously kept inside string literals): printing
# graph.adj_list[...] for the nodes
#   598826, 485824, 485850, 485851, 485852, 485853, 485854
# and
#   126022, -126022, 126021, -126021, 126019, -126019, 126020

# Build the snarl graph(s) from the vg snarls file.
builder = SnarlGraphBuilder.from_vg_snarls(graph, snarls_file_name)
snarlgraph = builder.build_snarl_graphs()

# Disabled debug output: print(len(snarlgraph.blocks))
Пример #14
0
import matplotlib.pyplot as plt
import sys
import numpy as np
from offsetbasedgraph import NumpyIndexedInterval, Graph, Interval, SequenceGraph
from pyvg.conversion import vg_json_file_to_interval_collection

def _load_path(directory, path_name, graph):
    """Read ``<directory><path_name>.interval`` as an interval on *graph*."""
    indexed = NumpyIndexedInterval.from_file(directory + path_name +
                                             ".interval")
    interval = indexed.get_exact_subinterval(
        0, indexed.length())  # Hack to get interval
    interval.graph = graph
    return interval


# Command line: argv[1] is the dbla graph file, argv[2] the cidra graph file,
# argv[3] a text file listing one path name per line.
dbla_graph = Graph.from_file(sys.argv[1])
cidra_graph = Graph.from_file(sys.argv[2])

dbla_paths = []
cidra_paths = []
j = 0
# The context manager closes the list file, which was previously leaked.
with open(sys.argv[3]) as f:
    for path in f:
        path_name = path.strip()
        if not path_name:
            # A blank line (e.g. at the end of the file) is not a path name.
            continue
        # Every listed path exists once per graph, in its own directory.
        dbla_paths.append(_load_path("dbla_paths/", path_name, dbla_graph))
        cidra_paths.append(_load_path("cidra_paths/", path_name, cidra_graph))

        j += 1

print(dbla_paths)