Exemplo n.º 1
0
def _read_sample_table(config, filename):
    """Read the tab-separated table of samples in `filename`.

    Each row must contain 2 or 3 columns: a unique sample name in the first
    column, followed by the path(s) to either one or two BAM files, which
    must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Blank lines and lines starting with
    '#' are skipped, and empty fields are dropped before the columns are
    counted.

    `config.samples` is replaced by a dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": [paths]}.

    Returns True on success. If a row has the wrong number of columns, or a
    sample name is repeated, an error is printed and None is returned; rows
    parsed before the error remain in `config.samples`.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list instead of calling filter(): on Python 3
            # filter() returns an iterator, which supports neither len() nor
            # the indexing / slicing used below.
            fields = [field
                      for field in line.rstrip('\r\n').split('\t')
                      if field]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return None

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return None

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemplo n.º 2
0
    def validate_bam(self, filename):
        """Validates a sample BAM file, checking that it is either a valid
        mitochondrial BAM (aligned against one of the reference mt sequences),
        or that it is a valid nuclear BAM (aligned against the reference).

        Returns one of INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE.

        NOTE(review): if the file cannot be opened, the code below prints the
        error and returns None rather than INVALID_BAMFILE; confirm that
        callers only test the result for truthiness.
        """
        print_info("  - Validating BAM file %r ... " % (filename,))

        try:
            handle = pysam.Samfile(filename)
        # The "except ... as ..." form is valid on Python 2.6+ and Python 3;
        # the older "except ..., error" form is a syntax error on Python 3.
        except (ValueError, IOError) as error:
            print_err("Error reading BAM: %s" % (error,))
            return
Exemplo n.º 3
0
    def validate_bam(self, filename):
        """Validate a sample BAM file.

        The file is expected to be either a valid mitochondrial BAM (aligned
        against one of the reference mt sequences), or a valid nuclear BAM
        (aligned against the reference).

        Returns one of INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE.

        NOTE(review): when pysam fails to open the file, the visible code
        reports the error and returns None instead of INVALID_BAMFILE; verify
        that callers treat both values as "invalid".
        """
        print_info("  - Validating BAM file %r ... " % (filename,))

        try:
            handle = pysam.Samfile(filename)
        except (ValueError, IOError) as error:
            # Written with "as" (Python 2.6+ / 3.x); the comma-separated
            # spelling of this clause does not compile on Python 3.
            print_err("Error reading BAM: %s" % (error,))
            return
Exemplo n.º 4
0
def _validate_mito_bam(data, handle, info):
    if data.mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    references = handle.references
    min_length = min(
        (len(record.sequence)) for record in data.mitochondria.itervalues())

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in data.mitochondria:
            continue

        db_sequence = data.mitochondria[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)" %
                      (bam_contig, bam_length, db_length))
            return False

        if not os.path.exists(handle.filename + '.bai') \
                and not os.path.exists(swap_ext(handle.filename, '.bai')):
            print_info('    - Attempting to index BAM file %r!' %
                       (handle.filename, ))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        for line in "".join(pysam.idxstats(handle.filename)).split('\n'):
            line = line.strip()
            if not line:
                continue

            name, _, hits, _ = line.split('\t')
            if (name == bam_contig) and not int(hits):
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring an "
                          "phylogeny is not possible." %
                          (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True
    return True
Exemplo n.º 5
0
def _validate_mito_bam(data, handle, info):
    if data.mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    references = handle.references
    min_length = min((len(record.sequence))
                     for record in data.mitochondria.itervalues())

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in data.mitochondria:
            continue

        db_sequence = data.mitochondria[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (bam_contig, bam_length, db_length))
            return False

        if not os.path.exists(handle.filename + '.bai') \
                and not os.path.exists(swap_ext(handle.filename, '.bai')):
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        for line in "".join(pysam.idxstats(handle.filename)).split('\n'):
            line = line.strip()
            if not line:
                continue

            name, _, hits, _ = line.split('\t')
            if (name == bam_contig) and not int(hits):
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring an "
                          "phylogeny is not possible."
                          % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True
    return True
Exemplo n.º 6
0
def _process_samples(config):
    for name, info in sorted(config.samples.items()):
        files = {}

        if name == "-":
            print_info("Validating unnamed sample ...")
        else:
            print_info("Validating sample %r ..." % (name,))

        for filename in info.pop("Files"):
            filetype = config.database.validate_bam(filename)
            if not filetype:
                print_err("ERROR: File is not a valid BAM file: %r"
                          % (filename,))
                return False

            if filetype.is_nuclear and filetype.is_mitochondrial:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False
                elif "Mito" in files:
                    print_err("WARNING: Nuclear + mitochondrial BAM, and "
                              "mitochondrial BAM specified; the mitochondrial "
                              "genome in the first BAM will not be used!")

                files["Nuc"] = filename
                files.setdefault("Mito", filename)
            elif filetype.is_nuclear:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Nuc"] = filename
            elif filetype.is_mitochondrial:
                if "Mito" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Mito"] = filename
            else:
                print_err("ERROR: BAM does not contain usable nuclear "
                          "or mitochondrial contigs: %r" % (filename,))
                return False

        config.samples[name]["Files"] = files

    return True
Exemplo n.º 7
0
def _process_samples(config):
    for name, info in sorted(config.samples.items()):
        files = {}

        if name == "-":
            print_info("Validating unnamed sample ...")
        else:
            print_info("Validating sample %r ..." % (name, ))

        for filename in info.pop("Files"):
            filetype = config.database.validate_bam(filename)
            if not filetype:
                print_err("ERROR: File is not a valid BAM file: %r" %
                          (filename, ))
                return False

            if filetype.is_nuclear and filetype.is_mitochondrial:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False
                elif "Mito" in files:
                    print_err("WARNING: Nuclear + mitochondrial BAM, and "
                              "mitochondrial BAM specified; the mitochondrial "
                              "genome in the first BAM will not be used!")

                files["Nuc"] = filename
                files.setdefault("Mito", filename)
            elif filetype.is_nuclear:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Nuc"] = filename
            elif filetype.is_mitochondrial:
                if "Mito" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Mito"] = filename
            else:
                print_err("ERROR: BAM does not contain usable nuclear "
                          "or mitochondrial contigs: %r" % (filename, ))
                return False

        config.samples[name]["Files"] = files

    return True
Exemplo n.º 8
0
def _read_sample_table(config, filename):
    """Read the tab-separated table of samples in `filename`.

    Each row must contain 2 or 3 columns: a unique sample name in the first
    column, followed by the path(s) to either one or two BAM files, which
    must represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Blank lines and lines starting with
    '#' are skipped; fields are stripped of surrounding whitespace and empty
    fields are dropped before the columns are counted. Sample names may only
    contain ASCII letters, digits, and the characters '.', '-', and '_'.

    `config.samples` is replaced by a dict mapping each sample name to
    {"Root": <config.destination joined with the name>, "Files": [paths]}.

    Returns True on success. If a row has the wrong number of columns, or a
    sample name is invalid or repeated, an error is printed and None is
    returned; rows parsed before the error remain in `config.samples`.
    """
    print_info("Reading table of samples from %r" % (filename, ))
    # string.ascii_letters exists on both Python 2 and 3 and, unlike the
    # Python 2-only string.letters, does not depend on the current locale.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # A real list is required for len() / indexing below; on
            # Python 3 filter() and map() return iterators.
            stripped = [field.strip() for field in line.split('\t')]
            fields = [field for field in stripped if field]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing." %
                          (filename, linenum, len(fields)))
                return None

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sorted, so that the message does not depend on set order
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r " %
                          (filename, linenum,
                           "".join(sorted(invalid_letters)), name))
                return None
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return None

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:]
            }

    return True
Exemplo n.º 9
0
def _read_sample_table(config, filename):
    """Parse a 2 - 3 column tab-separated table of samples.

    The first column of each row contains the name to be used for a sample,
    and the remaining column(s) contain the path(s) to either one or two BAM
    files, which must represent a single nuclear or a single mitochondrial
    alignment (2 columns), or both (3 columns). Empty lines and lines
    starting with '#' are ignored. Whitespace around fields is removed, and
    empty fields are not counted as columns.

    Names must be unique and consist solely of ASCII letters, digits, '.',
    '-', and '_'. The parsed table is stored in `config.samples`, as a dict
    mapping names to {"Root": ..., "Files": [...]}, where "Root" is the
    sample name joined onto `config.destination`.

    Returns True if the whole table was read. Otherwise an error is printed
    and None is returned; entries read before the offending line are left
    in `config.samples`.
    """
    print_info("Reading table of samples from %r" % (filename,))
    # string.letters only exists on Python 2, where it is locale-dependent;
    # string.ascii_letters is available on Python 2 and 3.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # List comprehension instead of filter(None, map(...)), which
            # yields an iterator (no len() / indexing) on Python 3.
            fields = [field
                      for field in (part.strip() for part in line.split('\t'))
                      if field]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return None

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Characters are sorted to make the message deterministic
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return None
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return None

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemplo n.º 10
0
    def __init__(self, filename):
        """Load a Zonkey database from the tar archive at `filename`.

        Raises ZonkeyDBError if `filename` does not exist or is not a valid
        tar-file. Otherwise the settings, contigs, samples, mitochondrial
        sequences, simulations, and sample order are read from the archive,
        in that order, and stored as attributes of the same names, after
        which `self._cross_validate()` is called.
        """
        self.filename = filename

        # Guard clauses; each check raises, so no 'elif' is required
        if not os.path.exists(filename):
            raise ZonkeyDBError('Database file does not exist')
        if not tarfile.is_tarfile(filename):
            raise ZonkeyDBError('Database file is not a valid tar-file')

        print_info('Reading Zonkey database from %r ...' % (filename,))

        # Warn if file is gzip / bzip2 compressed; gives worse throughput
        _check_file_compression(filename)

        with tarfile.open(filename) as archive:
            print_info('  - Reading settings ...')
            settings = self._read_settings(archive, "settings.yaml")
            self.settings = settings

            print_info('  - Reading list of contigs ...')
            contigs = self._read_contigs_table(archive, "contigs.txt")
            self.contigs = contigs

            print_info('  - Reading list of samples ...')
            samples = self._read_samples_table(archive, "samples.txt")
            self.samples = samples

            print_info('  - Reading mitochondrial sequences ...')
            mitochondria = self._read_mitochondria(archive,
                                                   "mitochondria.fasta")
            self.mitochondria = mitochondria

            print_info('  - Reading emperical admixture distribution ...')
            simulations = self._read_simulations(archive, "simulations.txt")
            self.simulations = simulations

            print_info('  - Determining sample order ...')
            sample_order = self._read_sample_order(archive, "genotypes.txt")
            self.sample_order = sample_order

        self._cross_validate()
Exemplo n.º 11
0
    def __init__(self, filename):
        """Read the Zonkey database stored in the tar-file `filename`.

        A ZonkeyDBError is raised if the file is missing or is not a valid
        tar-file. The individual tables are then read from the archive one
        at a time, each preceded by a progress message, and assigned to the
        corresponding attribute; finally `self._cross_validate()` is run.
        """
        self.filename = filename

        if not os.path.exists(filename):
            raise ZonkeyDBError('Database file does not exist')
        if not tarfile.is_tarfile(filename):
            raise ZonkeyDBError('Database file is not a valid tar-file')

        print_info('Reading Zonkey database from %r ...' % (filename,))

        # Warn if file is gzip / bzip2 compressed; gives worse throughput
        _check_file_compression(filename)

        # (progress message, attribute, reader method, archive member), in
        # the order in which the members are to be read
        steps = (
            ('  - Reading settings ...',
             'settings', '_read_settings', "settings.yaml"),
            ('  - Reading list of contigs ...',
             'contigs', '_read_contigs_table', "contigs.txt"),
            ('  - Reading list of samples ...',
             'samples', '_read_samples_table', "samples.txt"),
            ('  - Reading mitochondrial sequences ...',
             'mitochondria', '_read_mitochondria', "mitochondria.fasta"),
            ('  - Reading emperical admixture distribution ...',
             'simulations', '_read_simulations', "simulations.txt"),
            ('  - Determining sample order ...',
             'sample_order', '_read_sample_order', "genotypes.txt"),
        )

        with tarfile.open(filename) as tar_handle:
            for message, attribute, reader, member in steps:
                print_info(message)
                # The reader is looked up only after its message is printed
                value = getattr(self, reader)(tar_handle, member)
                setattr(self, attribute, value)

        self._cross_validate()