Exemplo n.º 1
0
def _read_sample_table(config, filename):
    """Read the table of samples in 'filename' into 'config.samples'.

    Each row that is neither empty nor a comment (first non-blank character
    is '#') must contain 2 or 3 non-empty tab-separated columns: a unique
    sample name, followed by the path(s) to either one or two BAM files,
    which must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns).

    'config.samples' is replaced by a new dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": <paths>}.

    Returns True on success. If a row is malformed or a name is repeated, an
    error is printed and None is returned; 'config.samples' then only holds
    the rows read before the error.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list: on Python 3 'filter' returns a lazy iterator
            # that supports neither len() nor the slicing used below.
            columns = line.rstrip('\r\n').split('\t')
            fields = [field for field in columns if field]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemplo n.º 2
0
def main(argv):
    """Split the records of a GTF file into one BED file per source/feature.

    The command-line arguments in 'argv' are parsed with 'parse_arguments'.
    Scaffold information is read only if a scaffolds file was given. For
    every source in the GTF table, sources whose name starts with "protein"
    are processed with 'build_coding_seqs_table' and all others with
    'build_noncoding_seqs_table'; each resulting feature is written to
    '<output_prefix>.<source>.<feature>.bed'.

    Always returns 0.
    """
    options = parse_arguments(argv)

    # The scaffolds file is optional; fall back to an empty mapping
    if options.scaffolds:
        print("Reading scaffolds information from %r" % (options.scaffolds, ))
        scaffold_info = read_scaffolds(options.scaffolds)
    else:
        scaffold_info = {}

    with open_ro(options.infile, "rb") as gtf_handle:
        print("Reading GTF from %r" % (options.infile, ))
        tables_by_source = read_gtf(gtf_handle, scaffold_info,
                                    options.contig_prefix)

    for (source, table) in tables_by_source.items():
        print("Writing tables for '%s'" % source)

        # Pick the table builder matching the kind of source
        if source.startswith("protein"):
            build_table = build_coding_seqs_table
        else:
            build_table = build_noncoding_seqs_table
        features = build_table(options, table)

        for feature in features:
            destination = "%s.%s.%s.bed" % (options.output_prefix,
                                            source, feature)

            print("\tWriting %ss to '%s'" % (feature, destination))
            write_bed(features[feature], destination)

    return 0
Exemplo n.º 3
0
def main(argv):
    """Split the records of a GTF file into one BED file per source/feature.

    The command-line arguments in 'argv' are parsed with 'parse_arguments'.
    Scaffold information is read only if a scaffolds file was given. For
    every source in the GTF table, sources whose name starts with "protein"
    are processed with 'build_coding_seqs_table' and all others with
    'build_noncoding_seqs_table'; each resulting feature is written to
    '<output_prefix>.<source>.<feature>.bed'.

    Always returns 0.
    """
    args = parse_arguments(argv)

    scaffolds = {}
    if args.scaffolds:
        print("Reading scaffolds information from %r" % (args.scaffolds,))
        scaffolds = read_scaffolds(args.scaffolds)

    with open_ro(args.infile) as gtf_file:
        print("Reading GTF from %r" % (args.infile,))
        src_table = read_gtf(gtf_file, scaffolds, args.contig_prefix)

    # 'items' exists on both Python 2 and 3; 'iteritems' was removed in 3
    for (source, table) in src_table.items():
        print("Writing tables for '%s' ..." % source)

        if source.startswith("protein"):
            features = build_coding_seqs_table(args, table)
        else:
            features = build_noncoding_seqs_table(args, table)

        for feature in features:
            fpath = "%s.%s.%s.bed" % (args.output_prefix, source, feature)

            print("\tWriting %ss to '%s' ..." % (feature, fpath, ))
            write_bed(features[feature], fpath)

    return 0
Exemplo n.º 4
0
def read_bed_file(filename, min_columns=3, contigs=None):
    """Parses a BED file opened via 'open_ro', and yields a sequence of
    records. Comments (lines starting with '#') and empty lines are skipped.

    Every record is validated before being yielded:
      - A line that cannot be parsed by BEDRecord raises a BEDError.
      - A record with fewer than 'min_columns' columns raises a BEDError.
      - Coordinates must satisfy 0 <= start < end. If a dictionary of
        {contig: length} is supplied as 'contigs', the contig must be
        present in it and 'end' may not exceed the contig length.

    Line numbers in error messages are 1-based.

    Raises ValueError if 'min_columns' is less than 3; as this function is a
    generator, that happens when the first record is requested.
    """
    if min_columns < 3:
        raise ValueError("'min_columns' must be >= 3 in 'read_bed_file'")

    infinite = float("inf")
    handle = None
    try:
        handle = open_ro(filename)

        # Count from 1 so that all error messages report the same,
        # human-readable line number for a given line.
        for (line_num, line) in enumerate(handle, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                raise BEDError("Error parsing line %i in regions file:\n"
                               "  Path = %r\n  Line = %r\n\n%s" %
                               (line_num, filename, line, error))

            if len(bed) < min_columns:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise BEDError("Region at line #%i (%s) does not "
                               "contain the expected number of fields; "
                               "the first %i fields are required. C.f. "
                               "defination at\n   %s\n\nPath = %r" %
                               (line_num, name, min_columns, url, filename))

            if contigs is None:
                # No reference lengths available; only check start < end
                contig_len = infinite
            else:
                contig_len = contigs.get(bed.contig)

            if contig_len is None:
                raise BEDError("Regions file contains contig not found "
                               "in reference:\n  Path = %r\n  Contig = "
                               "%r\n\nPlease ensure that all contig "
                               "names match the reference names!" %
                               (filename, bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                # The limit is formatted with %s rather than %i, since
                # "%i" % float("inf") raises an OverflowError that would
                # hide this error whenever 'contigs' is None.
                raise BEDError(
                    "Regions file contains invalid region:\n"
                    "  Path   = %r\n  Contig = %r\n"
                    "  Start  = %s\n  End    = %s\n\n"
                    "Expected 0 <= Start < End <= %s!" %
                    (filename, bed.contig, bed.start, bed.end, contig_len))

            yield bed
    finally:
        if handle:
            handle.close()
Exemplo n.º 5
0
def test_open_ro__bz2():
    # Reading a bzip2 compressed file through open_ro must return the
    # decompressed content as bytes.
    expected = (b'>This_is_BZ_FASTA!\nCGTNA\n'
                b'>This_is_ALSO_BZ_FASTA!\nACGTN\n')

    bz2_handle = open_ro(test_file('fasta_file.fasta.bz2'))
    try:
        observed = bz2_handle.read()
    finally:
        # Release the file even if reading failed
        bz2_handle.close()

    assert_equal(observed, expected)
Exemplo n.º 6
0
def test_open_ro__gz():
    # Reading a gzip compressed file through open_ro must return the
    # decompressed content as bytes.
    expected = (b'>This_is_GZipped_FASTA!\nACGTN\n'
                b'>This_is_ALSO_GZipped_FASTA!\nCGTNA\n')

    gz_handle = open_ro(test_file('fasta_file.fasta.gz'))
    try:
        observed = gz_handle.read()
    finally:
        # Release the file even if reading failed
        gz_handle.close()

    assert_equal(observed, expected)
Exemplo n.º 7
0
    def _parse_freq_table(cls, filename):
        """Yield one (chrom, snp, clst, mac, nchroms) tuple per table row.

        The first line of the file is discarded as a header. Every other
        line must consist of exactly 8 whitespace-separated columns; the
        4th to 6th columns are ignored, and the last two are converted to
        integers.
        """
        with fileutils.open_ro(filename) as table:
            # The first line holds the column names, not data
            table.readline()

            for row in table:
                columns = row.split()
                chrom, snp, clst, _, _, _, mac, nchroms = columns

                yield (chrom, snp, clst, int(mac), int(nchroms))
Exemplo n.º 8
0
def test_open_ro__gz():
    # open_ro is expected to transparently decompress gzipped files
    fasta_records = b'>This_is_GZipped_FASTA!\nACGTN\n' \
        b'>This_is_ALSO_GZipped_FASTA!\nCGTNA\n'

    stream = open_ro(test_file('fasta_file.fasta.gz'))
    try:
        content = stream.read()
    finally:
        # Always release the file, including when the read fails
        stream.close()

    assert_equal(content, fasta_records)
Exemplo n.º 9
0
    def _parse_freq_table(cls, filename):
        """Lazily parse the frequency table stored in 'filename'.

        The header (first line) is skipped. Each following line is split on
        whitespace into exactly 8 columns, of which the 1st, 2nd, 3rd, 7th
        and 8th are yielded as (chrom, snp, clst, mac, nchroms), with the
        last two values converted to int.
        """
        with fileutils.open_ro(filename) as freq_file:
            freq_file.readline()  # Header line; contains no data

            for record in freq_file:
                chrom, snp, clst, _, _, _, mac, nchroms = record.split()
                counts = (int(mac), int(nchroms))

                yield (chrom, snp, clst) + counts
Exemplo n.º 10
0
 def from_file(cls, filename):
     """Reads a MSA from the specified filename, which is opened using
     'open_ro'. See also 'MSA.from_lines'.

     Any MSAError raised while parsing is re-raised as a new MSAError
     whose message also names the file. The file is closed before
     returning, whether or not parsing succeeded."""
     fasta_file = open_ro(filename)
     try:
         # NOTE(review): closing the file below assumes that
         # 'MSA.from_lines' consumes all lines before returning.
         return MSA.from_lines(fasta_file)
     except MSAError as error:
         raise MSAError("%s in file %r" % (error, filename))
     finally:
         # Previously the handle was left open until garbage collected
         fasta_file.close()
Exemplo n.º 11
0
 def from_file(cls, filename):
     """Reads a MSA from the specified filename, which is opened using
     'open_ro'. See also 'MSA.from_lines'.

     Any MSAError raised while parsing is re-raised as a new MSAError
     whose message also names the file. The file is closed before
     returning, whether or not parsing succeeded."""
     fasta_file = open_ro(filename)
     try:
         # NOTE(review): closing the file below assumes that
         # 'MSA.from_lines' consumes all lines before returning.
         return MSA.from_lines(fasta_file)
     except MSAError as error:
         raise MSAError("%s in file %r" % (error, filename))
     finally:
         # Previously the handle was left open until garbage collected
         fasta_file.close()
Exemplo n.º 12
0
def test_open_ro__bz2():
    # open_ro is expected to transparently decompress bzip2 files
    fasta_records = b'>This_is_BZ_FASTA!\nCGTNA\n' \
        b'>This_is_ALSO_BZ_FASTA!\nACGTN\n'

    stream = open_ro(test_file('fasta_file.fasta.bz2'))
    try:
        content = stream.read()
    finally:
        # Always release the file, including when the read fails
        stream.close()

    assert_equal(content, fasta_records)
Exemplo n.º 13
0
 def from_file(cls, filename):
     """Lazily reads the FASTA file 'filename', opened using 'open_ro',
     and yields every record produced by 'FASTA.from_lines' for its
     lines. The file is closed once the records are exhausted, or if
     reading fails."""
     handle = open_ro(filename)
     try:
         records = FASTA.from_lines(handle)
         for entry in records:
             yield entry
     finally:
         # Runs on exhaustion, on errors, and when the generator is closed
         handle.close()
Exemplo n.º 14
0
 def from_file(cls, filename):
     """Generator yielding each record that 'FASTA.from_lines' parses
     from the file 'filename'. The file is opened using 'open_ro' and
     is closed when iteration ends, whether normally or by an error."""
     # The context manager closes the handle, as 'finally' did before
     with open_ro(filename) as fasta_lines:
         for fasta_record in FASTA.from_lines(fasta_lines):
             yield fasta_record
Exemplo n.º 15
0
def _read_sample_table(config, filename):
    """Read the table of samples in 'filename' into 'config.samples'.

    Each row that is neither empty nor a comment (first non-blank character
    is '#') must contain 2 or 3 non-empty tab-separated columns: a unique
    sample name, followed by the path(s) to either one or two BAM files,
    which must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Sample names may only contain ASCII
    letters, digits, and the characters '.', '-' and '_'.

    'config.samples' is replaced by a new dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": <paths>}.

    Returns True on success. On invalid input an error is logged and None
    is returned; 'config.samples' then only holds the rows read before the
    error.
    """
    log = logging.getLogger(__name__)
    log.info("Reading table of samples from %r", filename)
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Whitespace-only columns are dropped along with empty ones
            fields = [field for field in map(str.strip, line.split("\t")) if field]
            if len(fields) not in (2, 3):
                log.error(
                    "Error reading sample table (%r) at line %i: Expected 2 or 3 "
                    "columns, found %i; please correct file before continuing.",
                    filename,
                    linenum,
                    len(fields),
                )
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that
                # the same input always produces the same message.
                log.error(
                    "Error reading sample table (%r) at line %i: Sample name contains "
                    "illegal character(s). Only letters, numbers, and '-', '_', and "
                    "'.' are allowed, but found %r in name %r ",
                    filename,
                    linenum,
                    "".join(sorted(invalid_letters)),
                    name,
                )
                return
            elif name in samples:
                log.error(
                    "Duplicate name %r in sample table; names must be unique!",
                    name)
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:],
            }

    return True
Exemplo n.º 16
0
def _read_sample_table(config, filename):
    """Read the table of samples in 'filename' into 'config.samples'.

    Each row that is neither empty nor a comment (first non-blank character
    is '#') must contain 2 or 3 non-empty tab-separated columns: a unique
    sample name, followed by the path(s) to either one or two BAM files,
    which must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Sample names may only contain ASCII
    letters, digits, and the characters '.', '-' and '_'.

    'config.samples' is replaced by a new dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": <paths>}.

    Returns True on success. On invalid input an error is printed and None
    is returned; 'config.samples' then only holds the rows read before the
    error.
    """
    print_info("Reading table of samples from %r" % (filename, ))
    # 'string.letters' is locale dependent and was removed in Python 3;
    # 'ascii_letters' is available on both Python 2 and 3.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list: on Python 3 'filter' returns a lazy iterator
            # that supports neither len() nor the slicing used below.
            columns = [column.strip() for column in line.split('\t')]
            fields = [column for column in columns if column]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing." %
                          (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that
                # the same input always produces the same message.
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r " %
                          (filename, linenum,
                           "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:]
            }

    return True
Exemplo n.º 17
0
def _read_sample_table(config, filename):
    """Read the table of samples in 'filename' into 'config.samples'.

    Each row that is neither empty nor a comment (first non-blank character
    is '#') must contain 2 or 3 non-empty tab-separated columns: a unique
    sample name, followed by the path(s) to either one or two BAM files,
    which must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Sample names may only contain ASCII
    letters, digits, and the characters '.', '-' and '_'.

    'config.samples' is replaced by a new dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": <paths>}.

    Returns True on success. On invalid input an error is printed and None
    is returned; 'config.samples' then only holds the rows read before the
    error.
    """
    print_info("Reading table of samples from %r" % (filename,))
    # 'string.letters' is locale dependent and was removed in Python 3;
    # 'ascii_letters' is available on both Python 2 and 3.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list: on Python 3 'filter' returns a lazy iterator
            # that supports neither len() nor the slicing used below.
            columns = [column.strip() for column in line.split('\t')]
            fields = [column for column in columns if column]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that
                # the same input always produces the same message.
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemplo n.º 18
0
def _read_files(args):
    """Yield the parsed VCF records of every file in 'args.filenames'.

    Lines starting with '#' that occur before the very first record (across
    all files) are echoed to stdout; later '#' lines are dropped. Just before
    the first echoed line that does not start with '##', one '##FILTER' line
    is printed per filter returned by 'vcffilter.describe_filters(args)',
    sorted by ID.

    If 'args.reset_filter' is set, the filter field of each record is
    replaced by "." before the record is yielded.
    """
    parse_vcf = pysam.asVCF()
    # Both flags are shared by all files: once the first record is seen, no
    # further header lines are printed, and filters are described only once.
    reading_header = True
    filters_written = False

    for filename in args.filenames:
        with open_ro(filename, "rb") as handle:
            for raw_line in handle:
                if raw_line.startswith(b"#"):
                    if not reading_header:
                        continue

                    if not filters_written and not raw_line.startswith(b"##"):
                        filters_written = True
                        descriptions = vcffilter.describe_filters(args)
                        for filter_desc in sorted(descriptions.items()):
                            print('##FILTER=<ID=%s,Description="%s">'
                                  % filter_desc)

                    print(raw_line.decode("utf-8"), end="")
                    continue

                reading_header = False
                record_text = raw_line.rstrip(b"\n\r")
                record = parse_vcf(record_text, len(record_text))
                if args.reset_filter:
                    record.filter = "."

                yield record
Exemplo n.º 19
0
 def from_file(cls, filename):
     """Generator yielding each record that 'FASTQ.from_lines' parses
     from the file 'filename'. The file is opened using 'open_ro' and
     is closed when iteration ends, whether normally or by an error."""
     with open_ro(filename) as fastq_lines:
         records = FASTQ.from_lines(fastq_lines)
         # Delegate to the line parser while the file remains open
         yield from records
Exemplo n.º 20
0
def read_bed_file(filename, min_columns=3, contigs=None):
    """Parses a BED file opened via 'fileutils.open_ro', and yields a
    sequence of records. Comments (lines starting with '#') and empty lines
    are skipped.

    Every record is validated before being yielded:
      - A line that cannot be parsed by BEDRecord raises a BEDError.
      - A record with fewer than 'min_columns' columns raises a BEDError.
      - Coordinates must satisfy 0 <= start < end. If a dictionary of
        {contig: length} is supplied as 'contigs', the contig must be
        present in it and 'end' may not exceed the contig length.

    Line numbers in error messages are 1-based.

    Raises ValueError if 'min_columns' is less than 3; as this function is a
    generator, that happens when the first record is requested.
    """
    if min_columns < 3:
        raise ValueError("'min_columns' must be >= 3 in 'read_bed_file'")

    infinite = float("inf")
    handle = None
    try:
        handle = fileutils.open_ro(filename)

        # Count from 1 so that all error messages report the same,
        # human-readable line number for a given line.
        for (line_num, line) in enumerate(handle, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                # 'except X as e' is valid on Python 2.6+ and Python 3
                raise BEDError("Error parsing line %i in regions file:\n"
                               "  Path = %r\n  Line = %r\n\n%s"
                               % (line_num, filename, line, error))

            if len(bed) < min_columns:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise BEDError("Region at line #%i (%s) does not "
                               "contain the expected number of fields; "
                               "the first %i fields are required. C.f. "
                               "defination at\n   %s\n\nPath = %r"
                               % (line_num, name, min_columns,
                                  url, filename))

            if contigs is None:
                # No reference lengths available; only check start < end
                contig_len = infinite
            else:
                contig_len = contigs.get(bed.contig)

            if contig_len is None:
                raise BEDError("Regions file contains contig not found "
                               "in reference:\n  Path = %r\n  Contig = "
                               "%r\n\nPlease ensure that all contig "
                               "names match the reference names!"
                               % (filename, bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                # The limit is formatted with %s rather than %i, since
                # "%i" % float("inf") raises an OverflowError that would
                # hide this error whenever 'contigs' is None.
                raise BEDError("Regions file contains invalid region:\n"
                               "  Path   = %r\n  Contig = %r\n"
                               "  Start  = %s\n  End    = %s\n\n"
                               "Expected 0 <= Start < End <= %s!"
                               % (filename, bed.contig, bed.start,
                                  bed.end, contig_len))

            yield bed
    finally:
        if handle:
            handle.close()