Example #1
def extract_from_bam(bam, chromosome, keep_supplementary=True):
    """Extracts metrics from bam.

    Worker function per chromosome
    loop over a bam file and create list with tuples containing metrics:
    -qualities
    -aligned qualities
    -lengths
    -aligned lengths
    -mapping qualities
    -edit distances to the reference genome scaled by read length
    """
    samfile = pysam.AlignmentFile(bam, "rb")
    if keep_supplementary:
        return [(read.query_name, nanomath.ave_qual(read.query_qualities),
                 nanomath.ave_qual(read.query_alignment_qualities),
                 read.query_length, read.query_alignment_length,
                 read.mapping_quality, get_pID(read))
                for read in samfile.fetch(reference=chromosome,
                                          multiple_iterators=True)
                if not read.is_secondary and not read.is_unmapped]
    else:
        return [(read.query_name, nanomath.ave_qual(read.query_qualities),
                 nanomath.ave_qual(read.query_alignment_qualities),
                 read.query_length, read.query_alignment_length,
                 read.mapping_quality, get_pID(read))
                for read in samfile.fetch(reference=chromosome,
                                          multiple_iterators=True)
                if not read.is_secondary and not read.is_unmapped
                and not read.is_supplementary]
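A minimal usage sketch for the worker above, assuming pysam, nanomath, get_pID and pandas are importable and that "alignment.bam" is a placeholder for a coordinate-sorted, indexed bam; the per-chromosome tuples are pooled into one DataFrame with illustrative column names.

import pandas as pd
import pysam

bam = "alignment.bam"  # assumed coordinate-sorted and indexed
chromosomes = pysam.AlignmentFile(bam, "rb").references
# Run the worker once per chromosome and pool the resulting tuples.
rows = [tup for chrom in chromosomes
        for tup in extract_from_bam(bam, chrom, keep_supplementary=False)]
df = pd.DataFrame(rows, columns=["readIDs", "quals", "aligned_quals", "lengths",
                                 "aligned_lengths", "mapQ", "percentIdentity"])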
Example #2
def process_fastq_rich(fastq, **kwargs):
    """Extract metrics from a richer fastq file.

    Extract information from fastq files generated by albacore or MinKNOW,
    containing richer information in the header (key-value pairs)
    read=<int> [72]
    ch=<int> [159]
    start_time=<timestamp> [2016-07-15T14:23:22Z]  # UTC ISO 8601 / RFC 3339 timestamp
    Z indicates UTC time, T is the delimiter between the date and the time
    dateutil.parser.parse("2016-07-15T14:23:22Z") imported as dparse
    -> datetime.datetime(2016, 7, 15, 14, 23, 22, tzinfo=tzutc())
    """
    logging.info(
        "Nanoget: Starting to collect statistics from rich fastq file.")
    inputfastq = handle_compressed_input(fastq)
    res = []
    for record in SeqIO.parse(inputfastq, "fastq"):
        try:
            read_info = info_to_dict(record.description)
            res.append(
                (nanomath.ave_qual(record.letter_annotations["phred_quality"]),
                 len(record), read_info["ch"], read_info["start_time"],
                 read_info["runid"]))
        except KeyError:
            logging.error("Nanoget: keyerror when processing record {}".format(
                record.description))
            sys.exit("Unexpected fastq identifier:\n{}\n\n \
            missing one or more of expected fields 'ch', 'start_time' or 'runid'"
                     .format(record.description))
    df = pd.DataFrame(
        data=res,
        columns=["quals", "lengths", "channelIDs", "timestamp",
                 "runIDs"]).dropna()
    df["channelIDs"] = df["channelIDs"].astype("int64")
    return ut.reduce_memory_usage(df)
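The key-value header parsing done by info_to_dict is not shown here; the sketch below uses a hypothetical stand-in to illustrate the header format described in the docstring, including parsing the start_time field with dateutil (dparse).

from dateutil.parser import parse as dparse

def parse_rich_description(description):
    # Hypothetical stand-in for info_to_dict: keep only the key=value fields of
    # the description line and split them into a dict.
    return dict(field.split("=", 1) for field in description.split() if "=" in field)

info = parse_rich_description("readid read=72 ch=159 start_time=2016-07-15T14:23:22Z runid=abc")
dparse(info["start_time"])  # datetime.datetime(2016, 7, 15, 14, 23, 22, tzinfo=tzutc())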
Example #3
def extract_all_from_fastq(rec):
    """Extract metrics from a fastq file.

    Return identifier, read length, average quality and median quality
    """
    return (rec.id, len(rec),
            nanomath.ave_qual(rec.letter_annotations["phred_quality"]),
            nanomath.median_qual(rec.letter_annotations["phred_quality"]))
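An illustrative way to use the per-record extractor: run it over every read returned by SeqIO.parse ("reads.fastq" is a placeholder path) and collect the tuples into a DataFrame; the column names are chosen here for illustration.

import pandas as pd
from Bio import SeqIO

records = SeqIO.parse("reads.fastq", "fastq")
df = pd.DataFrame([extract_all_from_fastq(rec) for rec in records],
                  columns=["readIDs", "lengths", "quals", "median_quals"])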
Example #4
def extract_from_fastq(fq):
    """Extract metrics from a fastq file.

    Return average quality and read length
    """
    for rec in SeqIO.parse(fq, "fastq"):
        yield nanomath.ave_qual(
            rec.letter_annotations["phred_quality"]), len(rec)
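Since the generator yields one (average quality, length) tuple per read, the metrics can be fed straight into a DataFrame; a small sketch, with "reads.fastq" as a placeholder path and illustrative column names.

import pandas as pd

df = pd.DataFrame(list(extract_from_fastq("reads.fastq")),
                  columns=["quals", "lengths"])
print(df["lengths"].describe())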
Example #5
def splitFq(fq, args):
    '''
    Split a fastq file into a pass file and a fail file based on average read quality.
    Optionally trim a number of nucleotides from the beginning and end.
    '''
    prefix = os.path.join(
        args.outdir,
        os.path.basename(args.fastqfile.name).replace('.fastq', '').replace(
            '.gz', '').replace('.fq', ''))
    p, f = 0, 0
    with gzip.open(prefix + ".pass.fastq.gz",
                   'wt') as passed, gzip.open(prefix + ".fail.fastq.gz",
                                              'wt') as failed:
        for record in SeqIO.parse(fq, "fastq"):
            if ave_qual(record.letter_annotations["phred_quality"]
                        ) >= args.quality:
                p += 1
                passed.write(record.format("fastq"))
            else:
                failed.write(record.format("fastq"))
                f += 1
    print("Split the file in {} reads in <pass> and {} reads in <fail>".format(
        p, f))
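A hypothetical invocation of splitFq(): the function only relies on the outdir, fastqfile and quality attributes of the argparse namespace, so the sketch below fakes such a namespace directly; the attribute values are assumptions about the surrounding CLI, not part of the original code.

import argparse
import os

args = argparse.Namespace(outdir="out",
                          fastqfile=open("reads.fastq"),  # placeholder input file
                          quality=7)                      # minimum average read quality
os.makedirs(args.outdir, exist_ok=True)
splitFq("reads.fastq", args)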
Example #6
def process_ubam(bam, **kwargs):
    """Extracting metrics from unaligned bam format
    Extracting lengths
    """
    logging.info(
        "Nanoget: Starting to collect statistics from ubam file {}.".format(
            bam))
    samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
    if not samfile.has_index():
        pysam.index(bam)
        # Need to reload the samfile after creating index
        samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
        logging.info(
            "Nanoget: No index for bam file could be found, created index.")
    datadf = pd.DataFrame(
        data=[(read.query_name, nanomath.ave_qual(read.query_qualities), read.query_length)
              for read in samfile.fetch(until_eof=True)],
        columns=["readIDs", "quals", "lengths"]) \
        .dropna(axis='columns', how='all') \
        .dropna(axis='index', how='any')
    logging.info("Nanoget: ubam {} contains {} reads.".format(
        bam, datadf["lengths"].size))
    return ut.reduce_memory_usage(datadf)
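Illustrative use, with "reads.ubam" as a placeholder path: the returned, memory-reduced DataFrame exposes the readIDs, quals and lengths columns built above.

datadf = process_ubam("reads.ubam")
print(datadf["lengths"].describe())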
Example #7
    def test_ave_qual(self):
        """Test average quality calculation."""
        quals = list(range(128 + 1)) * 100
        mq = nm.ave_qual(quals, qround=True)
        self.assertEqual(mq, 14)
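For context on the expected value, here is a minimal sketch of the averaging the test relies on, assuming nm.ave_qual converts each Phred score to an error probability, averages those probabilities and converts back (which is why 0..128 averages to 14 rather than to the arithmetic mean of 64); this is an assumption about nanomath's behaviour, not its actual implementation.

from math import log10

def ave_qual_sketch(quals, qround=False):
    # Average in probability space: Phred Q -> error probability 10^(-Q/10).
    prob = sum(10 ** (q / -10) for q in quals) / len(quals)
    q = -10 * log10(prob)
    return round(q) if qround else q

assert ave_qual_sketch(list(range(128 + 1)) * 100, qround=True) == 14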