def from_graph_peaks_in_fasta(cls, graph, vg_graph_json_file_name, chromosome,
                              fasta_file_name, regions_bed_file,
                              true_peaks_file):
    """Create an instance from peaks stored in a fasta file.

    The peaks are read with ``PeakCollection.from_fasta_file``. A linear
    path for ``chromosome`` is loaded from the file
    ``linear_path_<chromosome>.intervalcollection`` (relative to the
    current working directory) or, if that file does not exist, created
    with ``create_linear_path``, which is asked to write to that file.
    Every row of ``regions_bed_file`` whose first column equals
    ``"chr<chromosome>"`` is converted to a sub-interval of the linear path.

    Args:
        graph: Graph passed on to the peak reader and linear path creation.
        vg_graph_json_file_name: File given to
            ``pyvg.vg.Graph.create_from_file``.
        chromosome: Chromosome name without the ``chr`` prefix.
        fasta_file_name: Fasta file holding the peaks.
        regions_bed_file: Whitespace-separated file; the first three
            columns are read as chromosome, start and end.
        true_peaks_file: Passed unchanged to the ``cls`` constructor.

    Returns:
        ``cls(chromosome, reads, true_peaks_file)``.

    Raises:
        AssertionError: If no row of ``regions_bed_file`` matches
            ``chromosome``.
    """
    reads = PeakCollection.from_fasta_file(fasta_file_name, graph=graph)
    vg_graph = pyvg.vg.Graph.create_from_file(
        vg_graph_json_file_name, limit_to_chromosomes=chromosome)

    logging.info("Finding linear path")
    linear_path_file = "linear_path_%s.intervalcollection" % chromosome
    try:
        linear_path = obg.IntervalCollection.from_file(
            linear_path_file, text_file=True).intervals[0]
        linear_path = linear_path.to_indexed_interval()
    except FileNotFoundError:
        # No stored linear path for this chromosome: build one instead.
        linear_path = create_linear_path(graph, vg_graph,
                                         path_name=chromosome,
                                         write_to_file=linear_path_file)
    linear_path.graph = graph

    # Convert regions to intervals in graph
    logging.info("Converting regions to regions in graph")
    wanted_chromosome = "chr%s" % chromosome
    graph_regions = []
    # The context manager closes the BED file even if parsing fails.
    with open(regions_bed_file) as bed_file:
        for line in bed_file:
            print(line)
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            region_chromosome = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            if region_chromosome != wanted_chromosome:
                logging.info("Skipping %s, %d, %d"
                             % (region_chromosome, start, end))
                continue
            graph_regions.append(linear_path.get_subinterval(start, end))

    # %s rather than %d: chromosome is a name, and %d on a string would
    # raise TypeError instead of reporting the failed assertion.
    assert len(graph_regions) > 0, \
        "Found no graph regions for chr %s" % chromosome
    graph_regions = PeakCollection(graph_regions)

    # Filter out reads not overlapping with linear regions
    filtered_reads = []
    for read in reads:
        overlapping = graph_regions.get_overlapping_intervals(
            read, minimum_overlap=1)
        if overlapping:
            filtered_reads.append(read)

    logging.info("Found %d reads in graph regions" % len(filtered_reads))
    # NOTE(review): filtered_reads is only used for the log message above;
    # the unfiltered reads are what is handed to the constructor. Confirm
    # whether the filtered reads were meant to be passed instead.
    return cls(chromosome, reads, true_peaks_file)
def macs_to_graph_peaks(folder, chromosomes=("1", "2", "3", "4", "5"),
                        data_dir="/data/bioinf/tair2/"):
    """Write MACS summit peaks of each chromosome as an interval collection.

    For every chromosome the graph ``<data_dir>/<chrom>.nobg`` is loaded,
    the peaks in
    ``<folder>/macs_sequences_chr<chrom>_summits_unique.fasta`` are read
    with ``PeakCollection.from_fasta_file`` and written to
    ``<folder>/<chrom>_macs_unique_graph_summits.intervalcollection``.

    Args:
        folder: Directory holding the input fasta files; the output files
            are written to the same directory.
        chromosomes: Chromosome names to process. Defaults to
            ``"1"`` through ``"5"``.
        data_dir: Directory holding the ``<chrom>.nobg`` graph files.
    """
    import os

    for chrom in chromosomes:
        graph = Graph.from_file(os.path.join(data_dir, chrom + ".nobg"))
        macs_peaks = PeakCollection.from_fasta_file(
            folder + "/macs_sequences_chr%s_summits_unique.fasta" % chrom,
            graph)
        macs_peaks.to_file(
            folder + "/%s_macs_unique_graph_summits.intervalcollection"
            % chrom,
            True)
def test_intervals_to_fasta_from_fasta(self):
    """Write one peak to fasta via ``peaks_to_fasta`` and read it back."""
    intervals_file = "tests/testintervals.intervalcollection"
    fasta_file = "tests/testsequences.fasta"

    # The peaks_to_fasta command below reads
    # tests/testgraph.obg.sequences; create_ob_graph is run first.
    create_graph_args = [
        "create_ob_graph",
        "-o", "tests/testgraph.obg",
        "tests/vg_test_graph.json",
    ]
    run_argument_parser(create_graph_args)

    single_peak = Peak(0, 2, [1, 2], score=3)
    PeakCollection([single_peak]).to_file(intervals_file, text_file=True)

    to_fasta_args = [
        "peaks_to_fasta",
        "tests/testgraph.obg.sequences",
        intervals_file,
        fasta_file,
    ]
    run_argument_parser(to_fasta_args)

    read_back = PeakCollection.from_fasta_file(fasta_file)
    self.assertEqual(len(read_back.intervals), 1)
    sequence = read_back.intervals[0].sequence.lower()
    self.assertEqual(sequence, "tttcccctt")
def test_get_summits(self):
    """Run ``get_summits`` on one max-path peak and check the summit."""
    # Sparse q-value track of size 22, written where get_summits reads it.
    sparse_qvalues = SparseValues(np.array([0]), np.array([3]))
    sparse_qvalues.track_size = 22
    sparse_qvalues.to_sparse_files("tests/test_qvalues")

    create_graph_args = [
        "create_ob_graph",
        "-o", "tests/testgraph.obg",
        "tests/vg_test_graph.json",
    ]
    run_argument_parser(create_graph_args)

    # Write the single max path as the fasta input for get_summits.
    peaks = PeakCollection([Peak(0, 2, [1, 2], score=3)])
    fasta_writer = PeakFasta(self.correct_sequence_graph)
    fasta_writer.write_max_path_sequences("tests/test_max_paths.fasta", peaks)

    get_summits_args = [
        "get_summits",
        "-g", "tests/testgraph.obg",
        "tests/test_max_paths.fasta",
        "tests/test_qvalues",
        "2",
    ]
    run_argument_parser(get_summits_args)

    summits = PeakCollection.from_fasta_file(
        "tests/test_max_paths_summits.fasta")
    self.assertEqual(summits.intervals[0], Peak(2, 6, [1]))
    self.assertEqual(summits.intervals[0].sequence.lower(), "tccc")
return mapping out_file = open("motif_summary_graph_matching_macs.tsv", "w") for chrom in ["1", "2", "3", "4", "5"]: logging.info("Chromosome %s" % chrom) path = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom + "_linear_pathv2.interval") graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg") macs_peaks = NonGraphPeakCollection.from_fasta("macs_sequences_chr" + chrom + "_summits.fasta") macs_peaks = PeakCollection.create_from_nongraph_peak_collection( graph, macs_peaks, path) macs_peaks.create_node_index() graph_peaks = PeakCollection.from_fasta_file(chrom + "_sequences_summits.fasta") graph_peaks.create_node_index() macs_motif_matches = set([ line.split("\t")[2] for line in open("fimo_macs_chr" + chrom + "/fimo.txt") if not line.startswith("#") ]) graph_motif_matches = set([ line.split("\t")[2] for line in open("fimo_graph_chr" + chrom + "/fimo.txt") if not line.startswith("#") ]) mapping = get_id_mapping(graph_peaks, macs_peaks) motif_ids = [