Exemplo n.º 1
0
def test_simple():
    """Project two alignments that pass through node 3 onto the path 1-2-4.

    The graph has four blocks (sizes 10, 5, 10, 5) with edges 1->2, 1->3,
    2->4 and 3->4.  The linear path runs over nodes 1, 2 and 4, while the
    alignments run over nodes [1, 3] and [3, 4].
    """
    block_sizes = ((1, 10), (2, 5), (3, 10), (4, 5))
    blocks = {node_id: Block(size) for node_id, size in block_sizes}
    edges = {1: [2, 3], 2: [4], 3: [4]}
    graph = Graph(blocks, edges)
    graph.convert_to_numpy_backend()

    path_interval = Interval(0, 10, [1, 2, 4], graph)
    linear_path = NumpyIndexedInterval.from_interval(path_interval)

    alignments = [
        Interval(5, 5, [1, 3], graph),
        Interval(5, 5, [3, 4], graph),
    ]

    # Materialise the result so it can be indexed.
    projected = list(project_alignments(alignments, linear_path))

    assert projected[0] == (5, 15, "+")
    assert projected[1] == (15, 25, "+")
Exemplo n.º 2
0
def count_variants_in_graph(graph, linear_path):
    """Count the extra outgoing edges on nodes of the linear path.

    Every node in ``graph.blocks`` that is also among
    ``linear_path.nodes_in_interval()`` contributes its number of outgoing
    edges (``len(graph.adj_list[node])``) minus one, never less than zero.

    Progress is printed every 1,000,000 nodes and the total is printed
    before returning.

    Args:
        graph: object exposing ``blocks`` (iterable of node ids) and
            ``adj_list`` (mapping from node id to its outgoing edges).
        linear_path: object exposing ``nodes_in_interval()``.

    Returns:
        The summed count as an int.
    """
    reference_nodes = linear_path.nodes_in_interval()
    # Membership is tested once per graph node; make sure it is a hash
    # lookup even when the path hands back a list or an array.
    if not isinstance(reference_nodes, (set, frozenset)):
        reference_nodes = set(reference_nodes)

    n_variants = 0
    for i, node in enumerate(graph.blocks):
        if i % 1000000 == 0:
            print("Node #%d" % i)
        if node in reference_nodes:
            # A node with k outgoing edges contributes k - 1 (floored at 0).
            n_variants += max(0, len(graph.adj_list[node]) - 1)

    print("Variants: %d" % n_variants)
    return n_variants


if __name__ == "__main__":
    # argv[1]: directory holding the per-chromosome graph files.
    # argv[2]: comma-separated list of chromosome names.
    chromosome_names = sys.argv[2].split(",")
    total_variants = 0

    for chromosome in chromosome_names:
        print("Chromosome %s" % chromosome)
        file_prefix = sys.argv[1] + "/" + chromosome

        graph = Graph.from_file(file_prefix + "_pruned.nobg")
        linear_path = NumpyIndexedInterval.from_file(
            file_prefix + "_linear_pathv2.interval")

        # Accumulate the per-chromosome count into the grand total.
        total_variants += count_variants_in_graph(graph, linear_path)

    print("Total: %d" % total_variants)
if __name__ == "__main__":
    graph_dir = sys.argv[3]
    chromosome = sys.argv[2]

    kmer_cache = {}
    n_cache_hits = 0

    make_databse(chromosome)
    minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome)
    c = minimizer_db.cursor()

    graph = Graph.from_file(graph_dir + "/%s.nobg" % chromosome)
    sequence_graph = SequenceGraph.from_file(graph_dir +
                                             "%s.nobg.sequences" % chromosome)
    linear_ref_path = NumpyIndexedInterval.from_file(
        graph_dir + "/%s_linear_pathv2.interval" % chromosome)

    if chromosome == "X":
        chromosome = 23
    chromosome = int(chromosome)

    prev_minimizer_on_node = defaultdict(set)

    i = 0
    prev_kmer = ""
    prev_minimizer_hash = None
    prev_minimizer_pos = None
    ignored = 0
    #with open(sys.argv[1]) as kmer_file:
    with io.BufferedReader(gzip.open(sys.argv[1], "rb")) as kmer_file:
        for line in kmer_file:
Exemplo n.º 4
0
from offsetbasedgraph import Graph, SequenceGraph, NumpyIndexedInterval
import sys
import pickle
from graph_minimap.find_minimizers_in_kmers import make_databse
import sqlite3

# Command-line arguments: chromosome name, then graph directory.
chromosome = sys.argv[1]
graph_dir = sys.argv[2]

make_databse(chromosome)
minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome)
c = minimizer_db.cursor()

# graph_dir is concatenated directly with the chromosome name, so it has to
# end with a path separator.
graph = Graph.from_file(graph_dir + chromosome + ".nobg")
sequence_graph = SequenceGraph.from_file(graph_dir + chromosome +
                                         ".nobg.sequences")
linear_ref = NumpyIndexedInterval.from_file(graph_dir + chromosome +
                                            "_linear_pathv2.interval")

# Close the pickle file as soon as it has been read instead of leaking the
# handle.
# NOTE(review): unpickling can execute arbitrary code; only load
# critical-node files from a trusted source.
with open(graph_dir + chromosome + ".critical_nodes", "rb") as critical_nodes_file:
    critical_nodes = pickle.load(critical_nodes_file)

# NOTE(review): MinimizerFinder is not among the imports visible in this
# excerpt -- confirm it is imported elsewhere.
finder = MinimizerFinder(graph,
                         sequence_graph,
                         critical_nodes,
                         linear_ref,
                         k=21,
                         w=10,
                         database=c,
                         chromosome=chromosome)
finder.find_minimizers()
print("Writing to db")
minimizer_db.commit()
print("Done")
Exemplo n.º 5
0
# Collect every alignment read from the vg JSON file given as argv[1].
alignments = vg_json_file_to_interval_collection(sys.argv[1], graph)
i = 1
alignment_intervals = []
for alignment in alignments:
    alignment_intervals.append(alignment)
    i += 1

print("Found %d alignments" % len(alignment_intervals))

# Each line of the file given as argv[3] names a path; the matching
# "<name>.interval" file is loaded and bound to the graph.
paths = []
j = 0
# Use a context manager so the listing file is closed once it has been read.
with open(sys.argv[3]) as f:
    for path in f:
        indexed = NumpyIndexedInterval.from_file(path.strip() + ".interval")
        # Hack to get interval: take the subinterval spanning the whole
        # indexed interval.
        interval = indexed.get_exact_subinterval(0, indexed.length())
        interval.graph = graph
        paths.append(interval)
        j += 1

print("Found %d paths" % len(paths))

def show_sim_matrix():
    """Fill a square matrix with pairwise overlap fractions between paths.

    Reads the module-level ``paths`` list.  Entry ``[i, j]`` is set to
    ``paths[i].overlap(paths[j]) / paths[j].length()``.

    NOTE(review): the rest of this function is not part of the visible
    excerpt, so what happens to the matrix afterwards is not documented here.
    """
    # One row and one column per path, initialised to zero.
    similarities = np.zeros((len(paths), len(paths)))
    for i, path in enumerate(paths):
        print(i)  # prints the index of the row being filled
        for j, path2 in enumerate(paths):
            # Overlap with path2, divided by the length of path2.
            match = path.overlap(path2) / path2.length()
            similarities[i, j] = match
            # The body of this condition is not visible in this excerpt.
            if j >= 15:
Exemplo n.º 6
0
import sys
import logging
logging.basicConfig(level=logging.DEBUG)
from offsetbasedgraph import Graph, NumpyIndexedInterval
from offsetbasedgraph.vcfmap import load_variant_maps
from graph_peak_caller.postprocess.maxpaths import SparseMaxPaths
from graph_peak_caller.sparsediffs import SparseValues
from graph_peak_caller.peakcollection import PeakCollection

import numpy as np  # np is used below but was never imported

# Command-line arguments: chromosome name, then minimum path length.
chrom = sys.argv[1]
fragment_length = int(sys.argv[2])

ref = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")


graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
direct = SparseValues.from_sparse_files(chrom + "_direct_pileup")
filtered_peaks = SparseValues.from_sparse_files(chrom + "_hole_cleaned")
variant_map = load_variant_maps(chrom, "/data/bioinf/tair2/")

max_paths, sub_graphs = SparseMaxPaths(filtered_peaks, graph, direct, ref, variant_map).run()
# Keep only the paths that are at least fragment_length long.
long_maxpaths = [path for path in max_paths if path.length() >= fragment_length]

# The loop previously iterated over the undefined name `long_max_paths`.
for max_path in long_maxpaths:
    assert max_path.length() > 0, "Max path %s has negative length" % max_path
    # NOTE(review): this line used `self.q_values`, which does not exist at
    # module scope. `direct` is the pileup handed to SparseMaxPaths above and
    # is assumed to be the intended score source -- confirm.
    score = np.max(direct.get_interval_values(max_path))
    max_path.set_score(score)
    assert not np.isnan(score), "Score %s is nan" % score


PeakCollection(long_maxpaths).to_file(chrom + "_max_paths.intervalcollection", text_file=True)
Exemplo n.º 7
0
import matplotlib.pyplot as plt
import sys
import numpy as np
from offsetbasedgraph import NumpyIndexedInterval, Graph, Interval, SequenceGraph
from pyvg.conversion import vg_json_file_to_interval_collection

def _load_path_interval(directory, name, graph):
    """Load ``<directory><name>.interval`` and bind the interval to *graph*."""
    indexed = NumpyIndexedInterval.from_file(directory + name + ".interval")
    # Hack to get interval: take the subinterval spanning the whole indexed
    # interval.
    interval = indexed.get_exact_subinterval(0, indexed.length())
    interval.graph = graph
    return interval


# Command-line arguments: DBLa graph file, CIDRa graph file, and a text file
# with one path name per line.
dbla_graph = Graph.from_file(sys.argv[1])
cidra_graph = Graph.from_file(sys.argv[2])

dbla_paths = []
cidra_paths = []
j = 0
# Use a context manager so the listing file is closed once it has been read.
with open(sys.argv[3]) as f:
    for path in f:
        path_name = path.strip()
        # The same path name is looked up in both directories.
        dbla_paths.append(
            _load_path_interval("dbla_paths/", path_name, dbla_graph))
        cidra_paths.append(
            _load_path_interval("cidra_paths/", path_name, cidra_graph))
        j += 1

print(dbla_paths)