Exemplo n.º 1
0
    def _run(self, _config, temp):
        """Write one FASTA record per named group of BED intervals.

        BED records are read from ``self._intervals`` and grouped by contig
        and name. For each group the regions are fetched from the FASTA file
        ``self._reference`` in order of start position and concatenated; if
        the group lies on the "-" strand, the result is passed through
        ``sequences.reverse_complement``. The records are written to
        "sequences.fasta" inside ``temp``, which is then handed to
        ``move_file`` with ``self._outfile`` as the destination.
        """
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._intervals) as bedfile:
                intervals = text.parse_lines_by_contig(bedfile, pysam.asBed())
                for (contig, beds) in sorted(intervals.items()):
                    beds.sort(key=keyfunc)

                    # groupby only merges adjacent items; the sort above
                    # makes records sharing a name adjacent within a contig.
                    for (gene, gene_beds) in itertools.groupby(beds, lambda x: x.name):
                        gene_beds = tuple(gene_beds)
                        seq = "".join(fastafile.fetch(contig, bed.start, bed.end)
                                      for bed in gene_beds)

                        if any((bed.strand == "-") for bed in gene_beds):
                            # A group must not mix strands
                            assert all((bed.strand == "-") for bed in gene_beds)
                            seq = sequences.reverse_complement(seq)
                        seqs[(contig, gene)] = seq
        finally:
            # Release the FASTA handle even if parsing or fetching fails
            fastafile.close()

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Exemplo n.º 2
0
def main(argv):
    """Print FASTA records built from a genotype file and a BED file.

    Parses ``argv`` for --genotype, --intervals, --padding and
    --min-distance-to-indels, reads the BED intervals grouped by contig,
    and writes every (name, sequence) pair yielded by ``build_genes`` to
    stdout as a FASTA record.

    Returns 0. Invalid arguments (and --help) exit via argparse's
    SystemExit.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype",
                        help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    # argparse interpolates help text with the %-operator, so defaults must
    # be written as %(default)s; the optparse-style "%default" makes --help
    # fail while formatting.
    parser.add_argument("--padding",
                        type=int,
                        default=10,
                        help="Number of bases to expand intervals, when "
                        "filtering based on adjacent indels [%(default)s]")
    parser.add_argument("--min-distance-to-indels",
                        type=int,
                        default=5,
                        help="Variants closer than this distance from indels "
                        "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    try:
        with open(args.intervals) as bed_file:
            intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

        for (_, beds) in sorted(intervals.items()):
            for (name, sequence) in build_genes(args, genotype, beds):
                FASTA(name, None, sequence).write(sys.stdout)
    finally:
        # Close the Tabix handle even if reading or building fails
        genotype.close()

    return 0
Exemplo n.º 3
0
    def _run(self, _config, temp):
        """Collect the reference sequence of every named BED group as FASTA.

        Reads BED records from ``self._intervals``, sorts the records of
        each contig by (contig, name, start), and for every name joins the
        regions fetched from the FASTA file ``self._reference``. Groups on
        the "-" strand are passed through ``sequences.reverse_complement``.
        The result is written to "sequences.fasta" inside ``temp`` and that
        file is then passed to ``move_file`` together with
        ``self._outfile``.
        """
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        def by_name(bed):
            return bed.name

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._intervals) as bedfile:
                intervals = text.parse_lines_by_contig(bedfile,
                                                       pysam.asBed())
                for (contig, beds) in sorted(intervals.items()):
                    # Sorting places records with the same name next to
                    # each other, which groupby requires.
                    beds.sort(key=keyfunc)

                    for (gene, gene_beds) in itertools.groupby(beds, by_name):
                        gene_beds = tuple(gene_beds)
                        fragments = [
                            fastafile.fetch(contig, bed.start, bed.end)
                            for bed in gene_beds
                        ]

                        seq = "".join(fragments)
                        if any((bed.strand == "-") for bed in gene_beds):
                            # All records of a group must share the strand
                            assert all((bed.strand == "-")
                                       for bed in gene_beds)
                            seq = sequences.reverse_complement(seq)
                        seqs[(contig, gene)] = seq
        finally:
            # The handle was previously never closed
            fastafile.close()

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Exemplo n.º 4
0
def test_parse_lines__two_contigs():
    """Two lines with different first fields yield two separate keys."""
    def _parse(line, length):
        # The parser receives a length that must equal len(line)
        assert_equal(len(line), length)
        fields = line.split()
        return _RecordMock(*fields)

    lines = ["abc line1 \n", "def line2 \n"]

    expected = {}
    expected["abc"] = [_RecordMock("abc", "line1")]
    expected["def"] = [_RecordMock("def", "line2")]

    result = parse_lines_by_contig(lines, _parse)
    assert_equal(result, expected)
Exemplo n.º 5
0
def test_parse_lines__two_contigs():
    """Check that records from two contigs are stored under two keys."""
    def _parse(line, length):
        # Verify the length argument before splitting the line into fields
        assert_equal(len(line), length)
        return _RecordMock(*line.split())

    first_record = _RecordMock("abc", "line1")
    second_record = _RecordMock("def", "line2")
    expected = dict([("abc", [first_record]), ("def", [second_record])])

    observed = parse_lines_by_contig(["abc line1 \n", "def line2 \n"], _parse)
    assert_equal(observed, expected)
Exemplo n.º 6
0
def read_intervals(filename):
    """Read BED records from `filename` via ``text.parse_lines_by_contig``.

    Every record must have at least 6 fields.

    Returns the mapping produced by ``text.parse_lines_by_contig``, with
    each value replaced by a new list holding the same records. If any
    record has fewer than 6 fields, an error is written to stderr and None
    is returned instead.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

        # items() works on both Python 2 and 3 (iteritems() is Python 2
        # only); only values of existing keys are rebound below, which is
        # safe while iterating.
        for (key, beds) in intervals.items():
            bed_tuples = []
            for bed in beds:
                if len(bed) < 6:
                    sys.stderr.write(("ERROR: Invalid BED record '%r', must "
                                      "have at least 6 fields ...\n") % (bed,))
                    return None

                bed_tuples.append(bed)
            intervals[key] = bed_tuples

    return intervals
Exemplo n.º 7
0
def read_intervals(filename):
    """Load the BED records in `filename` and check their field count.

    The records are parsed with ``text.parse_lines_by_contig`` using
    ``BEDRecord``; each value of the resulting mapping is replaced by a
    fresh list of its records.

    Returns that mapping, or None if a record with fewer than 6 fields was
    found (in which case an error message is written to stderr).
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

        # dict.iteritems() does not exist on Python 3; items() behaves the
        # same here, since no keys are added or removed during the loop.
        for (key, beds) in intervals.items():
            bed_tuples = []
            for bed in beds:
                if len(bed) < 6:
                    sys.stderr.write(
                        ("ERROR: Invalid BED record '%r', must "
                         "have at least 6 fields ...\n") % (bed, ))
                    return None

                bed_tuples.append(bed)
            intervals[key] = bed_tuples

    return intervals
Exemplo n.º 8
0
    def _run(self, _config, temp):
        """Write one FASTA record per named group of BED records.

        BED records are read from ``self._bedfile`` as ``BEDRecord``
        objects, sorted per contig by (contig, name, start) and grouped by
        name. Each group is passed to ``self._collect_sequence`` together
        with the open FASTA file ``self._reference``. The results are
        written to "sequences.fasta" inside ``temp``, which is then handed
        to ``fileutils.move_file`` with ``self._outfile`` as destination.
        """
        def _by_name(bed):
            return bed.name

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._bedfile) as bedfile:
                bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)
                # items() instead of the Python 2-only iteritems()
                for (contig, beds) in sorted(bedrecords.items()):
                    # groupby needs records with equal names to be adjacent
                    beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                    for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                        gene_beds = tuple(gene_beds)
                        sequence = self._collect_sequence(fastafile, gene_beds)
                        seqs[(contig, gene)] = sequence

            temp_file = os.path.join(temp, "sequences.fasta")
            with open(temp_file, "w") as out_file:
                for ((_, gene), sequence) in sorted(seqs.items()):
                    FASTA(gene, None, sequence).write(out_file)
        finally:
            # Closed only after the records are written, in case the
            # collected sequences still depend on the open handle.
            fastafile.close()

        fileutils.move_file(temp_file, self._outfile)
Exemplo n.º 9
0
    def _run(self, _config, temp):
        """Collect a sequence for every named BED group and save as FASTA.

        BED records are read from ``self._bedfile`` with ``pysam.asBed()``,
        sorted per contig by (contig, name, start) and grouped by name.
        Each group is passed to ``self._collect_sequence`` along with the
        open FASTA file ``self._reference``. The sequences are written to
        "sequences.fasta" inside ``temp``; that file is then passed to
        ``fileutils.move_file`` together with ``self._outfile``.
        """
        def _by_name(bed):
            return bed.name

        def _sort_key(bed):
            return (bed.contig, bed.name, bed.start)

        seqs = {}
        fastafile = pysam.Fastafile(self._reference)
        try:
            with open(self._bedfile) as bedfile:
                bedrecords = text.parse_lines_by_contig(bedfile, pysam.asBed())
                # iteritems() is unavailable on Python 3; items() is portable
                for (contig, beds) in sorted(bedrecords.items()):
                    # Sort so that groupby sees equal names consecutively
                    beds.sort(key=_sort_key)

                    for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                        gene_beds = tuple(gene_beds)
                        sequence = self._collect_sequence(fastafile, gene_beds)
                        seqs[(contig, gene)] = sequence

            temp_file = os.path.join(temp, "sequences.fasta")
            with open(temp_file, "w") as out_file:
                for ((_, gene), sequence) in sorted(seqs.items()):
                    FASTA(gene, None, sequence).write(out_file)
        finally:
            # Release the FASTA handle once all records have been written,
            # or as soon as any step fails.
            fastafile.close()

        fileutils.move_file(temp_file, self._outfile)
Exemplo n.º 10
0
def main(argv):
    """Write FASTA records for the genes built from the given intervals.

    ``argv`` is parsed for --genotype, --intervals, --padding and
    --min-distance-to-indels. The BED intervals are read grouped by
    contig, and each (name, sequence) pair yielded by ``build_genes`` is
    written to stdout as a FASTA record.

    Returns 0; argparse exits with SystemExit on invalid arguments and on
    --help.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    # Help text is %-formatted by argparse: "%(default)s" inserts the
    # default value, whereas optparse's "%default" breaks --help.
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    try:
        with open(args.intervals) as bed_file:
            intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

        for (_, beds) in sorted(intervals.items()):
            for (name, sequence) in build_genes(args, genotype, beds):
                FASTA(name, None, sequence).write(sys.stdout)
    finally:
        # Do not leak the Tabix handle, whether or not processing succeeds
        genotype.close()

    return 0
Exemplo n.º 11
0
def read_intervals(filename):
    """Read BED6 records from `filename` as ``BEDTuple`` objects.

    Records are parsed with ``text.parse_lines_by_contig`` using
    ``pysam.asBed()``. Only the first 6 fields of each record are kept;
    fields 1, 2 and 4 (start, end and score) are converted to int.

    Returns the mapping produced by ``text.parse_lines_by_contig`` with
    each value replaced by a list of ``BEDTuple`` objects. If a record has
    fewer than 6 fields, an error is written to stderr and None is
    returned instead.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

        # items() is available on both Python 2 and 3, unlike iteritems();
        # replacing the value of an existing key is safe during iteration.
        for (key, beds) in intervals.items():
            bed_tuples = []
            for bed in beds:
                if len(bed) < 6:
                    sys.stderr.write(("ERROR: Invalid BED record '%s', must "
                                      "have at least 6 fields ...\n") %
                                     ("\\t".join(bed),))
                    return None

                # Transform to a named tuple, as Pysam has a tendency to
                # segfault if you do anything wrong
                bed = list(bed)[:6]   # BED6 only
                bed[1] = int(bed[1])  # start
                bed[2] = int(bed[2])  # end
                bed[4] = int(bed[4])  # score

                bed_tuples.append(BEDTuple(*bed))
            intervals[key] = bed_tuples

    return intervals