Example #1
def process_cram(cram, **kwargs):
    """Combines metrics from cram after extraction.

    Processing function: calls a pool of worker functions
    to extract the following metrics from a cram file:
    - lengths
    - aligned lengths
    - qualities
    - aligned qualities
    - mapping qualities
    - edit distances to the reference genome, scaled by read length
    The metrics are returned in a pandas DataFrame.
    """
    logging.info(
        "Nanoget: Starting to collect statistics from cram file {}.".format(
            cram))
    samfile = check_bam(cram, samtype="cram")
    chromosomes = samfile.references
    params = zip([cram] * len(chromosomes), chromosomes)
    with cfutures.ProcessPoolExecutor(
            max_workers=kwargs["threads"]) as executor:
        datadf = pd.DataFrame(
            data=[res for sublist in executor.map(extract_from_bam, params) for res in sublist],
            columns=["readIDs", "quals", "aligned_quals", "lengths",
                     "aligned_lengths", "mapQ", "percentIdentity"]) \
            .dropna(axis='columns', how='all') \
            .dropna(axis='index', how='any')
    logging.info("Nanoget: cram {} contains {} primary alignments.".format(
        cram, datadf["lengths"].size))
    return ut.reduce_memory_usage(datadf)
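A minimal usage sketch for this function follows; the cram path and thread count are placeholders, and the call assumes the surrounding nanoget module (check_bam, extract_from_bam, ut) is importable.

# Hypothetical call; "threads" is the keyword this function reads from kwargs.
cram_stats = process_cram("alignments.cram", threads=4)
print(cram_stats.describe())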
Example #2
def process_fastq_rich(fastq, **kwargs):
    """Extract metrics from a richer fastq file.

    Extract information from fastq files generated by albacore or MinKNOW,
    which contain richer information in the header (key-value pairs), e.g.:
    read=<int> [72]
    ch=<int> [159]
    start_time=<timestamp> [2016-07-15T14:23:22Z]  # UTC ISO 8601 / RFC 3339 timestamp
    Z indicates UTC time; T is the delimiter between the date and the time expression.
    dateutil.parser.parse (imported as dparse) converts this timestamp:
    dparse("2016-07-15T14:23:22Z")
    -> datetime.datetime(2016, 7, 15, 14, 23, 22, tzinfo=tzutc())
    """
    logging.info(
        "Nanoget: Starting to collect statistics from rich fastq file.")
    inputfastq = handle_compressed_input(fastq)
    res = []
    for record in SeqIO.parse(inputfastq, "fastq"):
        try:
            read_info = info_to_dict(record.description)
            res.append(
                (ut.ave_qual(record.letter_annotations["phred_quality"]),
                 len(record), read_info["ch"], read_info["start_time"],
                 read_info["runid"]))
        except KeyError:
            logging.error("Nanoget: KeyError when processing record {}".format(
                record.description))
            sys.exit("Unexpected fastq identifier:\n{}\n\n"
                     "missing one or more of the expected fields "
                     "'ch', 'start_time' or 'runid'".format(record.description))
    df = pd.DataFrame(
        data=res,
        columns=["quals", "lengths", "channelIDs", "timestamp",
                 "runIDs"]).dropna()
    df["channelIDs"] = df["channelIDs"].astype("int64")
    return ut.reduce_memory_usage(df)
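To make the header parsing described in the docstring concrete, here is a small illustrative sketch; the description string is made up, and the library's own info_to_dict helper (not shown in this listing) may differ in detail.

from dateutil.parser import parse as dparse

description = "read=72 ch=159 start_time=2016-07-15T14:23:22Z runid=abc123"
# Split whitespace-separated key=value pairs into a dict (illustrative stand-in for info_to_dict).
read_info = dict(field.split("=", 1) for field in description.split() if "=" in field)
timestamp = dparse(read_info["start_time"])
print(read_info["ch"], timestamp)  # 159 2016-07-15 14:23:22+00:00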
Example #3
def process_fasta(fasta, **kwargs):
    """Combine metrics extracted from a fasta file."""
    logging.info("Nanoget: Starting to collect statistics from a fasta file.")
    inputfasta = handle_compressed_input(fasta, file_type="fasta")
    return ut.reduce_memory_usage(
        pd.DataFrame(
            data=[len(rec) for rec in SeqIO.parse(inputfasta, "fasta")],
            columns=["lengths"]).dropna())
Example #4
def process_fastq_plain(fastq, **kwargs):
    """Combine metrics extracted from a fastq file."""
    logging.info(
        "Nanoget: Starting to collect statistics from plain fastq file.")
    inputfastq = handle_compressed_input(fastq)
    return ut.reduce_memory_usage(
        pd.DataFrame(
            data=[res for res in extract_from_fastq(inputfastq) if res],
            columns=["quals", "lengths"]).dropna())
Example #5
def process_fastq_minimal(fastq, **kwargs):
    """Swiftly extract minimal features (length and timestamp) from a rich fastq file"""
    infastq = handle_compressed_input(fastq)
    try:
        df = pd.DataFrame(data=[rec for rec in fq_minimal(infastq) if rec],
                          columns=["timestamp", "lengths"])
    except IndexError:
        logging.error("Fatal: Incorrect file structure for fastq_minimal")
        sys.exit(
            "Error: file does not match expected structure for fastq_minimal")
    return ut.reduce_memory_usage(df)
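fq_minimal is likewise not shown here; as an illustration, a generator with the same (timestamp, length) output could look roughly like the following. The 4-line record layout and the start_time= header field are assumptions about the input, not the library's code.

def fq_minimal_sketch(fq):
    # Illustrative only: yield (start_time, read length) per 4-line fastq record.
    while True:
        header = fq.readline()
        if not header:
            break
        seq = fq.readline()
        fq.readline()  # '+' separator line
        fq.readline()  # quality line
        time_fields = [f for f in header.split() if f.startswith("start_time=")]
        if time_fields:
            yield time_fields[0].split("=", 1)[1], len(seq.rstrip())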
Example #6
def process_bam(bam, **kwargs):
    """Combines metrics from bam after extraction.

    Processing function: calls a pool of worker functions
    to extract the following metrics from a bam file:
    - lengths
    - aligned lengths
    - qualities
    - aligned qualities
    - mapping qualities
    - edit distances to the reference genome, scaled by read length
    The metrics are returned in a pandas DataFrame.
    """
    logging.info(
        "Nanoget: Starting to collect statistics from bam file {}.".format(
            bam))
    samfile = check_bam(bam)
    chromosomes = samfile.references
    if len(chromosomes) > 100 or kwargs["huge"]:
        logging.info(
            "Nanoget: lots of contigs (>100) or --huge, not running in separate processes"
        )
        datadf = pd.DataFrame(
            data=extract_from_bam(bam, None, kwargs["keep_supp"]),
            columns=["readIDs", "quals", "aligned_quals", "lengths",
                     "aligned_lengths", "mapQ", "percentIdentity"]) \
            .dropna(axis='columns', how='all') \
            .dropna(axis='index', how='any')

    else:
        unit = chromosomes
        with cfutures.ProcessPoolExecutor(
                max_workers=kwargs["threads"]) as executor:
            datadf = pd.DataFrame(
                data=[res for sublist in executor.map(extract_from_bam,
                                                      repeat(bam),
                                                      unit,
                                                      repeat(kwargs["keep_supp"]))
                      for res in sublist],
                columns=["readIDs", "quals", "aligned_quals", "lengths",
                         "aligned_lengths", "mapQ", "percentIdentity"]) \
                .dropna(axis='columns', how='all') \
                .dropna(axis='index', how='any')
    logging.info(
        f"Nanoget: bam {bam} contains {datadf['lengths'].size} primary alignments."
    )
    return ut.reduce_memory_usage(datadf)
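A hedged usage sketch; the bam path is a placeholder, and 'threads', 'huge' and 'keep_supp' are the keyword arguments this function reads from kwargs.

# Hypothetical call with placeholder values.
bam_stats = process_bam("alignments.bam", threads=8, huge=False, keep_supp=True)
print(bam_stats["lengths"].describe())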
Example #7
def process_ubam(bam, **kwargs):
    """Extracting metrics from unaligned bam format
    Extracting lengths
    """
    logging.info(
        "Nanoget: Starting to collect statistics from ubam file {}.".format(
            bam))
    samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
    if not samfile.has_index():
        pysam.index(bam)
        # Need to reload the samfile after creating index
        samfile = pysam.AlignmentFile(bam, "rb", check_sq=False)
        logging.info(
            "Nanoget: No index for bam file could be found, created index.")
    datadf = pd.DataFrame(
        data=[(read.query_name, ut.ave_qual(read.query_qualities), read.query_length)
              for read in samfile.fetch(until_eof=True)],
        columns=["readIDs", "quals", "lengths"]) \
        .dropna(axis='columns', how='all') \
        .dropna(axis='index', how='any')
    logging.info("Nanoget: ubam {} contains {} reads.".format(
        bam, datadf["lengths"].size))
    return ut.reduce_memory_usage(datadf)
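For reference, the minimal pysam pattern this function builds on is shown below with a placeholder file name; check_sq=False allows opening a bam without @SQ header lines, and until_eof=True iterates records without a region lookup.

import pysam

with pysam.AlignmentFile("reads.ubam", "rb", check_sq=False) as samfile:
    for read in samfile.fetch(until_eof=True):
        print(read.query_name, read.query_length)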
Example #8
def process_summary(summaryfile, **kwargs):
    """Extracting information from an albacore summary file.

    Only reads which have a >0 length are returned.

    The fields below may or may not exist, depending on the type of sequencing performed.
    Fields 1-14 are for 1D sequencing.
    Fields 1-23 for 2D sequencing.
    Fields 24-27, 2-5, 22-23 for 1D^2 (1D2) sequencing
    Fields 28-38 for barcoded workflows
     1  filename
     2  read_id
     3  run_id
     4  channel
     5  start_time
     6  duration
     7  num_events
     8  template_start
     9  num_events_template
    10  template_duration
    11  num_called_template
    12  sequence_length_template
    13  mean_qscore_template
    14  strand_score_template
    15  complement_start
    16  num_events_complement
    17  complement_duration
    18  num_called_complement
    19  sequence_length_complement
    20  mean_qscore_complement
    21  strand_score_complement
    22  sequence_length_2d
    23  mean_qscore_2d
    24  filename1
    25  filename2
    26  read_id1
    27  read_id2
    28  barcode_arrangement
    29  barcode_score
    30  barcode_full_arrangement
    31  front_score
    32  rear_score
    33  front_begin_index
    34  front_foundseq_length
    35  rear_end_index
    36  rear_foundseq_length
    37  kit
    38  variant
    """
    logging.info(
        "Nanoget: Collecting metrics from summary file {} for {} sequencing".
        format(summaryfile, kwargs["readtype"]))
    ut.check_existance(summaryfile)
    if kwargs["readtype"] == "1D":
        cols = [
            "channel", "start_time", "duration", "sequence_length_template",
            "mean_qscore_template"
        ]
    elif kwargs["readtype"] in ["2D", "1D2"]:
        cols = [
            "channel", "start_time", "duration", "sequence_length_2d",
            "mean_qscore_2d"
        ]
    if kwargs["barcoded"]:
        cols.append("barcode_arrangement")
        logging.info("Nanoget: Extracting metrics per barcode.")
    try:
        datadf = pd.read_csv(
            filepath_or_buffer=summaryfile,
            sep="\t",
            usecols=cols,
        )
    except ValueError:
        logging.error(
            "Nanoget: did not find expected columns in summary file {}:\n {}".
            format(summaryfile, ', '.join(cols)))
        sys.exit("ERROR: expected columns in summary file {} not found:\n {}".
                 format(summaryfile, ', '.join(cols)))
    if kwargs["barcoded"]:
        datadf.columns = [
            "channelIDs", "time", "duration", "lengths", "quals", "barcode"
        ]
    else:
        datadf.columns = ["channelIDs", "time", "duration", "lengths", "quals"]
    logging.info(
        "Nanoget: Finished collecting statistics from summary file {}".format(
            summaryfile))
    return ut.reduce_memory_usage(datadf.loc[datadf["lengths"] != 0].copy())
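A hedged usage sketch to close out; the summary file name is a placeholder, and 'readtype' and 'barcoded' are the keyword arguments this function consults.

# Hypothetical call; readtype must be "1D", "2D" or "1D2", barcoded adds the barcode column.
summary_stats = process_summary("sequencing_summary.txt", readtype="1D", barcoded=False)
print(summary_stats.head())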