Пример #1
0
    def from_graph_peaks_in_fasta(cls, graph, vg_graph_json_file_name,
                                  chromosome, fasta_file_name,
                                  regions_bed_file, true_peaks_file):
        """Create an instance from peaks stored in a fasta file.

        Reads the peaks from ``fasta_file_name``, obtains a linear path
        through ``graph`` for ``chromosome`` (loaded from
        ``linear_path_<chromosome>.intervalcollection`` when that file
        exists, otherwise created and written to it), converts the regions
        of ``regions_bed_file`` that lie on ``chr<chromosome>`` into graph
        intervals, and logs how many reads overlap those regions.

        :param graph: Graph passed to ``PeakCollection.from_fasta_file`` and
            attached to the linear path.
        :param vg_graph_json_file_name: File handed to
            ``pyvg.vg.Graph.create_from_file``.
        :param chromosome: Chromosome name without the ``chr`` prefix.
        :param fasta_file_name: Fasta file the reads are loaded from.
        :param regions_bed_file: Whitespace-separated file whose first three
            fields are chromosome, start and end.
        :param true_peaks_file: Passed through unchanged to ``cls``.
        :return: ``cls(chromosome, reads, true_peaks_file)``.
        :raises AssertionError: If no region on ``chr<chromosome>`` is found.
        """
        reads = PeakCollection.from_fasta_file(fasta_file_name, graph=graph)
        vg_graph = pyvg.vg.Graph.create_from_file(
            vg_graph_json_file_name, limit_to_chromosomes=chromosome)
        logging.info("Finding linear path")

        # The linear path is cached on disk; build it only on a cache miss.
        linear_path_file = "linear_path_%s.intervalcollection" % chromosome
        try:
            linear_path = obg.IntervalCollection.from_file(
                linear_path_file, text_file=True).intervals[0]
            linear_path = linear_path.to_indexed_interval()
        except FileNotFoundError:
            linear_path = create_linear_path(graph,
                                             vg_graph,
                                             path_name=chromosome,
                                             write_to_file=linear_path_file)

        linear_path.graph = graph

        # Convert regions to intervals in graph
        logging.info("Converting regions to regions in graph")
        wanted_chromosome = "chr%s" % chromosome
        graph_regions = []
        # Context manager so the BED file handle is always released.
        with open(regions_bed_file) as bed_file:
            for line in bed_file:
                print(line)
                fields = line.split()
                if not fields:
                    # Blank lines carry no region; skip them instead of
                    # failing with an IndexError below.
                    continue

                region_chromosome = fields[0]
                start = int(fields[1])
                end = int(fields[2])

                if region_chromosome != wanted_chromosome:
                    logging.info("Skipping %s, %d, %d",
                                 region_chromosome, start, end)
                    continue

                graph_regions.append(linear_path.get_subinterval(start, end))

        # %s (not %d): chromosome is used as a string name above, so %d would
        # raise TypeError instead of reporting the failed assertion.
        assert len(graph_regions
                   ) > 0, " Found not graph regions for chr %s" % chromosome
        graph_regions = PeakCollection(graph_regions)

        # Filter out reads not overlapping with linear regions
        filtered_reads = [
            read for read in reads
            if graph_regions.get_overlapping_intervals(read,
                                                       minimum_overlap=1)
        ]

        logging.info("Found %d reads in graph regions" % len(filtered_reads))

        # NOTE(review): the unfiltered reads are returned and filtered_reads
        # is only used for the log line above -- confirm this is intended.
        return cls(chromosome, reads, true_peaks_file)
Пример #2
0
def macs_to_graph_peaks(folder):
    """Convert the unique MACS summit fasta files in *folder* to graph peaks.

    For each of chromosomes 1-5 the graph is loaded from
    ``/data/bioinf/tair2/``, the peaks are read from
    ``macs_sequences_chr<chrom>_summits_unique.fasta`` in *folder*, and the
    resulting collection is written to
    ``<chrom>_macs_unique_graph_summits.intervalcollection`` in *folder*.
    """
    data_dir = "/data/bioinf/tair2/"
    for chrom in ("1", "2", "3", "4", "5"):
        # The linear path is loaded but not used further in this function.
        path = NumpyIndexedInterval.from_file(
            data_dir + chrom + "_linear_pathv2.interval")
        graph = Graph.from_file(data_dir + chrom + ".nobg")

        fasta_name = "/macs_sequences_chr%s_summits_unique.fasta" % chrom
        macs_peaks = PeakCollection.from_fasta_file(folder + fasta_name, graph)

        out_name = "/%s_macs_unique_graph_summits.intervalcollection" % chrom
        macs_peaks.to_file(folder + out_name, True)
    def test_intervals_to_fasta_from_fasta(self):
        """Write one peak to fasta via the CLI and read it back.

        Builds the test graph with ``create_ob_graph``, stores a single peak
        as a text interval collection, converts it with ``peaks_to_fasta``
        and checks the collection read from the fasta file has exactly one
        interval with the expected sequence.
        """
        graph_file = "tests/testgraph.obg"
        intervals_file = "tests/testintervals.intervalcollection"
        fasta_file = "tests/testsequences.fasta"

        run_argument_parser(
            ["create_ob_graph", "-o", graph_file, "tests/vg_test_graph.json"])

        peaks = PeakCollection([Peak(0, 2, [1, 2], score=3)])
        peaks.to_file(intervals_file, text_file=True)

        run_argument_parser([
            "peaks_to_fasta",
            graph_file + ".sequences",
            intervals_file,
            fasta_file,
        ])

        collection = PeakCollection.from_fasta_file(fasta_file)
        self.assertEqual(len(collection.intervals), 1)
        self.assertEqual(collection.intervals[0].sequence.lower(), "tttcccctt")
    def test_get_summits(self):
        """Run ``get_summits`` on a single max path and check the summit.

        Writes sparse q-values and a max-path fasta file, builds the test
        graph, runs the ``get_summits`` command and verifies the first
        interval (and its sequence) read from the resulting summits fasta.
        """
        graph_file = "tests/testgraph.obg"
        qvalues_base = "tests/test_qvalues"
        max_paths_fasta = "tests/test_max_paths.fasta"

        qvalues = SparseValues(np.array([0]), np.array([3]))
        qvalues.track_size = 22
        qvalues.to_sparse_files(qvalues_base)

        run_argument_parser(
            ["create_ob_graph", "-o", graph_file, "tests/vg_test_graph.json"])

        max_paths = PeakCollection([Peak(0, 2, [1, 2], score=3)])
        fasta_writer = PeakFasta(self.correct_sequence_graph)
        fasta_writer.write_max_path_sequences(max_paths_fasta, max_paths)

        run_argument_parser(
            ["get_summits", "-g", graph_file, max_paths_fasta, qvalues_base,
             "2"])

        result = PeakCollection.from_fasta_file(
            "tests/test_max_paths_summits.fasta")
        self.assertEqual(result.intervals[0], Peak(2, 6, [1]))
        self.assertEqual(result.intervals[0].sequence.lower(), "tccc")
    return mapping


# Summary output file. NOTE(review): it is left open here because it is
# presumably written to and closed further down -- not visible in this chunk.
out_file = open("motif_summary_graph_matching_macs.tsv", "w")
for chrom in ["1", "2", "3", "4", "5"]:
    logging.info("Chromosome %s" % chrom)
    path = NumpyIndexedInterval.from_file("/data/bioinf/tair2/" + chrom +
                                          "_linear_pathv2.interval")
    graph = Graph.from_file("/data/bioinf/tair2/" + chrom + ".nobg")

    # MACS peaks are read as non-graph peaks and then converted to graph
    # peaks using the graph and the linear path.
    macs_peaks = NonGraphPeakCollection.from_fasta("macs_sequences_chr" +
                                                   chrom + "_summits.fasta")
    macs_peaks = PeakCollection.create_from_nongraph_peak_collection(
        graph, macs_peaks, path)
    macs_peaks.create_node_index()
    graph_peaks = PeakCollection.from_fasta_file(chrom +
                                                 "_sequences_summits.fasta")
    graph_peaks.create_node_index()

    # Collect the third tab-separated field of every non-comment line of the
    # fimo output. The files are opened with a context manager so the handles
    # are closed instead of leaking one per chromosome.
    with open("fimo_macs_chr" + chrom + "/fimo.txt") as fimo_file:
        macs_motif_matches = {
            line.split("\t")[2]
            for line in fimo_file
            if not line.startswith("#")
        }
    with open("fimo_graph_chr" + chrom + "/fimo.txt") as fimo_file:
        graph_motif_matches = {
            line.split("\t")[2]
            for line in fimo_file
            if not line.startswith("#")
        }

    mapping = get_id_mapping(graph_peaks, macs_peaks)

    motif_ids = [