def __init__(self, tf_experiment_dir, data_dir):
    """Load the experiment's alignments and graph data, then check peaks.

    Args:
        tf_experiment_dir: Directory holding ``linear_alignments.bam`` and
            ``5_alignments.pickle``.
        data_dir: Directory holding ``5_linear_pathv2.interval`` and
            ``5.nobg``.

    All file names use the fixed prefix ``5`` (presumably chromosome 5 --
    TODO confirm). ``self.check_peaks()`` is invoked as the last step of
    construction.
    """
    self.experiment_dir = tf_experiment_dir
    self.data_dir = data_dir

    # Linear-reference alignments, opened in binary read mode.
    bam_file_name = self.experiment_dir + "/linear_alignments.bam"
    self.bam_file = pysam.AlignmentFile(bam_file_name, "rb")

    # Linear path and graph come from the data directory.
    linear_path_file_name = self.data_dir + "/5_linear_pathv2.interval"
    self.linear_path = NumpyIndexedInterval.from_file(linear_path_file_name)

    graph_file_name = self.data_dir + "/5.nobg"
    self.graph = Graph.from_file(graph_file_name)

    # The alignment collection is loaded against the graph read above.
    alignments_file_name = self.experiment_dir + "/5_alignments.pickle"
    self.alignment_collection = AlignmentCollection.from_file(
        alignments_file_name, self.graph)

    self.check_peaks()
def macs_to_graph_peaks(folder, data_dir="/data/bioinf/tair2/",
                        chromosomes=("1", "2", "3", "4", "5")):
    """Convert per-chromosome MACS summit fasta files to graph peak files.

    For every chromosome, the graph ``<data_dir><chrom>.nobg`` is loaded, the
    peaks in ``<folder>/macs_sequences_chr<chrom>_summits_unique.fasta`` are
    read against it, and the result is written to
    ``<folder>/<chrom>_macs_unique_graph_summits.intervalcollection``.

    Args:
        folder: Directory containing the MACS fasta files; output files are
            written to the same directory.
        data_dir: Prefix prepended directly to the graph file names, so it
            must end with a path separator. Defaults to the previously
            hard-coded location.
        chromosomes: Iterable of chromosome names to process. Defaults to
            the previously hard-coded chromosomes 1-5.
    """
    for chrom in chromosomes:
        # The linear path interval used to be loaded here as well, but it
        # was never used, so the (potentially expensive) read was dropped.
        graph = Graph.from_file(data_dir + chrom + ".nobg")
        macs_peaks = PeakCollection.from_fasta_file(
            folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom,
            graph)
        # Second positional argument is kept as-is; presumably it selects
        # text-file output -- TODO confirm against PeakCollection.to_file.
        macs_peaks.to_file(
            folder + "/%s_macs_unique_graph_summits.intervalcollection" % chrom,
            True)
def run_predict_path(args):
    """Predict a path per chromosome in parallel and post-process the output.

    Steps:
      1. Start one process per chromosome running
         ``run_predict_path_single_chromosome`` and wait for all of them.
      2. Concatenate the ``<out_file_name>_<chromosome>.fasta`` files into
         ``<out_file_name>.fa``.
      3. For each chromosome, read the single interval stored in
         ``<out_file_name>_<chromosome>.intervalcollection`` and write it as
         an indexed interval to ``<that file>.indexed``.
      4. Run ``run_bwa_index`` on the merged fasta unless
         ``args.skip_bwa_index`` is set.

    Args:
        args: Namespace-like object providing ``chromosomes`` (comma
            separated string), ``alignments``, ``data_dir``,
            ``linear_ref_bonus``, ``out_file_name``,
            ``max_nodes_to_traverse`` and ``skip_bwa_index``.

    Raises:
        SystemExit: With exit status 1 if ``args.alignments`` is not an
            existing file.
        AssertionError: If an interval collection file does not contain
            exactly one interval.
    """
    chromosomes = args.chromosomes.split(",")

    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" % args.alignments)
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    processes = []
    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced. The merged file is closed
    # (and thereby flushed) before anything else reads it, e.g. bwa index.
    merged_fasta_name = args.out_file_name + ".fa"
    logging.info("Merging fasta files")
    with open(merged_fasta_name, "w") as out_fasta:
        for chromosome in tqdm(chromosomes):
            with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
                out_fasta.write(f.read())
    logging.info("Wrote resulting linear reference to %s" % merged_fasta_name)

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        # NOTE: data_dir is concatenated without a separator here, so it is
        # expected to end with one.
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name, text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(intervals) == 1, "Only a single interval in file is supported"
        indexed = intervals[0].to_numpy_indexed_interval()
        indexed.to_file(file_name + ".indexed")
        logging.info("Wrote indexed interval to file %s" % (file_name + ".indexed"))

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(merged_fasta_name)
    else:
        logging.info("Not creating bwa index")
def read_graphs(graph_dir, chromosomes):
    """Read the graph and sequence graph of every requested chromosome.

    Args:
        graph_dir: Prefix prepended directly to the file names
            (``<chromosome>.nobg`` and ``<chromosome>.nobg.sequencesv2``).
        chromosomes: Iterable of chromosome names.

    Returns:
        A tuple ``(graphs, sequence_graphs, linear_ref_nodes)`` of dicts
        keyed by chromosome name, where chromosome ``"X"`` is stored under
        the key ``"23"``. Every value in ``linear_ref_nodes`` is ``None``.
    """
    logging.info("Reading graphs")
    graphs, sequence_graphs, linear_ref_nodes = {}, {}, {}

    for chromosome in chromosomes:
        # "X" is keyed as "23"; file names still use the original name.
        key = "23" if chromosome == "X" else chromosome
        logging.info("Reading graphs for chromosome %s" % chromosome)

        file_prefix = graph_dir + chromosome
        graphs[key] = Graph.from_file(file_prefix + ".nobg")
        sequence_graphs[key] = SequenceGraph.from_file(
            file_prefix + ".nobg.sequencesv2")
        # Linear reference nodes are currently not loaded; the disabled
        # source was:
        # NumpyIndexedInterval.from_file(graph_dir + chromosome + "_linear_pathv2.interval").nodes_in_interval()
        linear_ref_nodes[key] = None

    return graphs, sequence_graphs, linear_ref_nodes
def run_predict_path_single_chromosome(alignment_file_name, chromosome,
                                       graph_dir, linear_ref_bonus,
                                       out_file_base_name,
                                       max_nodes_to_traverse):
    """Load one chromosome's graph data and construct a ``PathPredicter``.

    Args:
        alignment_file_name: Passed through to ``PathPredicter``.
        chromosome: Chromosome name used to build the file names.
        graph_dir: Prefix for ``<chromosome>.nobg`` and
            ``<chromosome>.nobg.sequences``; the linear path is read from
            ``<graph_dir>/<chromosome>_linear_pathv2.interval``.
        linear_ref_bonus: Passed through to ``PathPredicter``.
        out_file_base_name: Passed through to ``PathPredicter``.
        max_nodes_to_traverse: Passed through to ``PathPredicter``.

    The constructed ``PathPredicter`` is not returned or stored.
    """
    graph_file_name = graph_dir + chromosome + ".nobg"

    # Same read order as before: sequences first, then graph, then path.
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")
    graph = Graph.from_file(graph_file_name)
    linear_path = NumpyIndexedInterval.from_file(
        graph_dir + "/%s_linear_pathv2.interval" % chromosome)

    PathPredicter(
        alignment_file_name,
        graph,
        sequence_graph,
        chromosome,
        linear_path,
        out_file_base_name,
        linear_ref_bonus=linear_ref_bonus,
        max_nodes_to_traverse=max_nodes_to_traverse,
    )
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    """Write linear-reference and haplotype paths as fasta and intervals.

    Writes ``linear_ref_<chromosome>.fasta`` with the sequence of the first
    interval in ``linear_ref_path_file_name``. Then, for haplotype 0 and 1,
    traverses the graph from its single first node and writes
    ``<out_base_name>_<haplotype>.intervalcollection`` and
    ``<out_base_name>_<haplotype>.fasta``.

    Args:
        graph_file_name: Graph file; sequences are read from
            ``<graph_file_name>.sequences``.
        linear_ref_path_file_name: Text interval collection whose first
            interval is the linear reference path.
        haplotype0_file_name: vg json file with intervals for haplotype 0.
        haplotype1_file_name: vg json file with intervals for haplotype 1.
        out_base_name: Prefix of the per-haplotype output files.
        chromosome: Name used in fasta headers and in the linear reference
            fasta file name.

    Raises:
        AssertionError: If the graph does not have exactly one first node,
            or if the set of haplotype nodes used for traversal is empty.
        ValueError: From ``min()`` if, during traversal, no next node is on
            the haplotype and none is on the linear reference either.
    """
    # Make a linear reference fasta and interval and haplotypes fasta and intervals
    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    with open("linear_ref_" + chrom + ".fasta", "w") as out_file:
        out_file.writelines([">%s\n" % chrom])
        out_file.writelines([linear_ref_seq + "\n"])
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d"
                 % len(linear_ref_nodes))

    # Collect the nodes touched by the intervals of each haplotype file.
    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    haplotype_file_names = (haplotype0_file_name, haplotype1_file_name)
    for haplotype, haplotype_file_name in enumerate(haplotype_file_names):
        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)
        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d"
                 % len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)
        nodes = []
        node = first_nodes[0]

        # NOTE(review): the node set read from the haplotype json file is
        # immediately replaced by "every id below the largest linear ref
        # node that is not on the linear ref". As written, both haplotypes
        # therefore traverse with the same node set and the json files only
        # affect the log output above. Behaviour kept as-is; confirm whether
        # the override is intentional.
        nodes_in_haplotype = haplotype_nodes[haplotype]
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))
        assert len(nodes_in_haplotype) > 0, \
            "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:
            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])
            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference"
                    % node)
                # Choose reference with lowest id to avoid deletion
                node = min(next_nodes.intersection(linear_ref_nodes))
            else:
                # More than one next node is on the haplotype. Choose the
                # one with lowest id to avoid taking a deletion.
                node = min(next_on_haplotype)

        logging.info("Found %d nodes. %d on haplotype"
                     % (len(nodes), n_haplotype_nodes))

        # The interval spans from offset 0 in the first node to the full
        # length of the last traversed node.
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval]).to_file(
            file_base_name + ".intervalcollection", text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        with open(file_base_name + ".fasta", "w") as out_file:
            out_file.writelines([">%s\n" % chrom])
            out_file.writelines([sequence + "\n"])
        logging.info("Wrote fasta sequence to %s" % (file_base_name + ".fasta"))
def count_variants_in_graph(graph, linear_path):
    """Count extra outgoing edges on the nodes of the linear path.

    Every node of ``graph.blocks`` that is contained in
    ``linear_path.nodes_in_interval()`` contributes its number of outgoing
    edges minus one (never less than zero) to the total.

    Progress is printed for every millionth node visited, and the total is
    printed before it is returned.

    Args:
        graph: Object exposing ``blocks`` (iterable of node ids) and
            ``adj_list`` (indexable by node id).
        linear_path: Object exposing ``nodes_in_interval()``.

    Returns:
        The number of variants counted.
    """
    reference_nodes = linear_path.nodes_in_interval()
    n_variants = 0

    for position, node in enumerate(graph.blocks):
        if position % 1000000 == 0:
            print("Node #%d" % position)

        if node in reference_nodes:
            n_extra_edges = len(graph.adj_list[node]) - 1
            n_variants += max(0, n_extra_edges)

    print("Variants: %d" % n_variants)
    return n_variants


if __name__ == "__main__":
    # Usage: <script> <graph_dir> <comma separated chromosomes>
    n_variants = 0
    for chromosome in sys.argv[2].split(","):
        print("Chromosome %s" % chromosome)
        file_prefix = sys.argv[1] + "/" + chromosome
        graph = Graph.from_file(file_prefix + "_pruned.nobg")
        linear_path = NumpyIndexedInterval.from_file(
            file_prefix + "_linear_pathv2.interval")
        n_variants += count_variants_in_graph(graph, linear_path)

    print("Total: %d" % n_variants)
import sys
from pyvg.alignmentcollection import AlignmentCollection
from offsetbasedgraph import Graph
import logging

# Command line entry point.
# Usage: <script> create_alignment_collection <vg_json_file> <graph_file> <out_file>
#
# NOTE(review): no logging configuration is visible in this script; with
# Python's default root logger level (WARNING) the info message below is not
# shown unless logging is configured elsewhere.
if sys.argv[1] == "create_alignment_collection":
    vg_json_file_name = sys.argv[2]
    graph = Graph.from_file(sys.argv[3])
    collection = AlignmentCollection.from_vg_json_file(vg_json_file_name, graph)

    out_file_name = sys.argv[4]
    collection.to_file(out_file_name)
    logging.info("Wrote to file %s" % out_file_name)
minimizer_offset) return minimizer_start_position.region_path_id, minimizer_start_position.offset if __name__ == "__main__": graph_dir = sys.argv[3] chromosome = sys.argv[2] kmer_cache = {} n_cache_hits = 0 make_databse(chromosome) minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome) c = minimizer_db.cursor() graph = Graph.from_file(graph_dir + "/%s.nobg" % chromosome) sequence_graph = SequenceGraph.from_file(graph_dir + "%s.nobg.sequences" % chromosome) linear_ref_path = NumpyIndexedInterval.from_file( graph_dir + "/%s_linear_pathv2.interval" % chromosome) if chromosome == "X": chromosome = 23 chromosome = int(chromosome) prev_minimizer_on_node = defaultdict(set) i = 0 prev_kmer = "" prev_minimizer_hash = None prev_minimizer_pos = None
from graph_minimap.minimizer_finder import Minimizers, MinimizerFinder
from offsetbasedgraph import Graph, SequenceGraph, NumpyIndexedInterval
import sys
import pickle
from graph_minimap.find_minimizers_in_kmers import make_databse
import sqlite3

# Finds minimizers for one chromosome, handing MinimizerFinder a cursor on the
# sqlite database "minimizers_chr<chromosome>.db".
# Usage: <script> <chromosome> <graph_dir>
# graph_dir is concatenated directly with the file names, so it must end with
# a path separator.
chromosome = sys.argv[1]
graph_dir = sys.argv[2]

make_databse(chromosome)
minimizer_db = sqlite3.connect("minimizers_chr%s.db" % chromosome)
c = minimizer_db.cursor()

graph = Graph.from_file(graph_dir + chromosome + ".nobg")
sequence_graph = SequenceGraph.from_file(graph_dir + chromosome + ".nobg.sequences")
linear_ref = NumpyIndexedInterval.from_file(graph_dir + chromosome + "_linear_pathv2.interval")

# The file is opened in a context manager so the handle is closed
# deterministically. NOTE: pickle can execute arbitrary code while loading;
# only use .critical_nodes files from a trusted source.
with open(graph_dir + chromosome + ".critical_nodes", "rb") as critical_nodes_file:
    critical_nodes = pickle.load(critical_nodes_file)

finder = MinimizerFinder(graph, sequence_graph, critical_nodes, linear_ref,
                         k=21, w=10, database=c, chromosome=chromosome)
finder.find_minimizers()
print("Writing to db")
import sys
import numpy as np
from offsetbasedgraph import NumpyIndexedInterval, Graph, Interval, SequenceGraph
from pyvg.conversion import vg_json_file_to_interval_collection

# Usage: <script> <vg_json_alignments> <graph_file> <file_listing_path_names>
graph = Graph.from_file(sys.argv[2])
sequence_graph = SequenceGraph.from_file(sys.argv[2] + ".sequences")

# Materialise all alignments read from the vg json file.
alignments = vg_json_file_to_interval_collection(sys.argv[1], graph)
alignment_intervals = list(alignments)
print("Found %d alignments" % len(alignment_intervals))

# Each line of the listing file names a path; the indexed interval is read
# from "<name>.interval". The listing file is closed once it has been read.
paths = []
with open(sys.argv[3]) as f:
    for path in f:
        indexed = NumpyIndexedInterval.from_file(path.strip() + ".interval")
        # Hack to get interval: take the subinterval covering the full length.
        interval = indexed.get_exact_subinterval(0, indexed.length())
        interval.graph = graph
        paths.append(interval)

print("Found %d paths" % len(paths))
import sys
import logging
# Logging is configured before the project imports below, as in the original.
logging.basicConfig(level=logging.DEBUG)
import numpy as np
from offsetbasedgraph import Graph, NumpyIndexedInterval
from offsetbasedgraph.vcfmap import load_variant_maps
from graph_peak_caller.postprocess.maxpaths import SparseMaxPaths
from graph_peak_caller.sparsediffs import SparseValues
from graph_peak_caller.peakcollection import PeakCollection

# Computes max paths for one chromosome and writes those that are at least
# fragment_length long to "<chrom>_max_paths.intervalcollection".
# Usage: <script> <chrom> <fragment_length>
chrom = sys.argv[1]
fragment_length = int(sys.argv[2])

ref = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval")
graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")
direct = SparseValues.from_sparse_files(chrom + "_direct_pileup")
filtered_peaks = SparseValues.from_sparse_files(chrom + "_hole_cleaned")
variant_map = load_variant_maps(chrom, "/data/bioinf/tair2/")

max_paths, sub_graphs = SparseMaxPaths(filtered_peaks, graph, direct, ref, variant_map).run()
long_maxpaths = [path for path in max_paths if path.length() >= fragment_length]

# The loop previously iterated over the undefined name ``long_max_paths``.
for max_path in long_maxpaths:
    assert max_path.length() > 0, "Max path %s has negative length" % max_path
    # FIXME(review): ``self`` is not defined at module level, so this line
    # raises NameError as soon as there is a long max path. It appears to
    # have been copied from a method; the q-value source to use here cannot
    # be determined from this script and must be supplied.
    score = np.max(self.q_values.get_interval_values(max_path))
    max_path.set_score(score)
    assert not np.isnan(score), "Score %s is nan" % score

PeakCollection(long_maxpaths).to_file(chrom + "_max_paths.intervalcollection", text_file=True)
from graph_peak_caller.control.snarls import SnarlGraphBuilder, SnarlGraph
from offsetbasedgraph import Graph, Block

# Loads the MHC graph, reports its size and builds snarl graphs from the
# matching vg snarls file.
graph = Graph.from_file("haplo1kg50-mhc.obg")
n_blocks = len(graph.blocks)
print("N blocks: %d" % n_blocks)

# Disabled debug output (previously kept in bare string literals):
# print(graph.adj_list[598826])
# print(graph.adj_list[485824])
# print(graph.adj_list[485850])
# print(graph.adj_list[485851])
# print(graph.adj_list[485852])
# print(graph.adj_list[485853])
# print(graph.adj_list[485854])
#
# print(graph.adj_list[126022])
# print(graph.adj_list[-126022])
# print(graph.adj_list[126021])
# print(graph.adj_list[-126021])
# print(graph.adj_list[126019])
# print(graph.adj_list[-126019])
# print(graph.adj_list[126020])

builder = SnarlGraphBuilder.from_vg_snarls(graph, "haplo1kg50-mhc.snarls")
snarlgraph = builder.build_snarl_graphs()
# print(len(snarlgraph.blocks))
import matplotlib.pyplot as plt
import sys
import numpy as np
from offsetbasedgraph import NumpyIndexedInterval, Graph, Interval, SequenceGraph
from pyvg.conversion import vg_json_file_to_interval_collection


def _read_full_interval(file_name, graph):
    """Read an indexed interval file and return it as an interval on graph."""
    indexed = NumpyIndexedInterval.from_file(file_name)
    # Hack to get interval: take the subinterval covering the full length.
    interval = indexed.get_exact_subinterval(0, indexed.length())
    interval.graph = graph
    return interval


# Usage: <script> <dbla_graph_file> <cidra_graph_file> <file_listing_path_names>
dbla_graph = Graph.from_file(sys.argv[1])
cidra_graph = Graph.from_file(sys.argv[2])

dbla_paths = []
cidra_paths = []

# Each line of the listing file names a path that exists both under
# dbla_paths/ and cidra_paths/. The listing file is closed once read.
with open(sys.argv[3]) as f:
    for path in f:
        path_name = path.strip()
        dbla_paths.append(
            _read_full_interval("dbla_paths/" + path_name + ".interval", dbla_graph))
        cidra_paths.append(
            _read_full_interval("cidra_paths/" + path_name + ".interval", cidra_graph))

print(dbla_paths)