Пример #1
0
def parse_fast5_chunk(dn, cs, is_upper=False):
    """Yield reads from the ``*.fast5`` files of a directory in size-bounded chunks.

    Every top-level key starting with ``read_`` in each fast5 file is converted
    to a FASTQ record through ``lq_nanopore.get_fastq_from_multi_fast5``.

    Args:
        dn: Directory that is listed for entries ending in ``.fast5``.
        cs: Chunk threshold, compared against the accumulated
            ``sys.getsizeof`` of the name, sequence and quality strings.
        is_upper: When true, the stored sequence is upper-cased.

    Yields:
        Tuples ``(reads, n_seqs, n_bases)``. ``reads`` is a list of
        ``[name, sequence, quality]`` entries gathered since the previous
        yield. ``n_seqs`` and ``n_bases`` are running totals over everything
        parsed so far; they are not reset between chunks. A final tuple is
        always yielded, so its ``reads`` list may be empty.
    """
    fast5_paths = []
    for entry in os.listdir(dn):
        if entry.endswith(".fast5"):
            fast5_paths.append(os.path.join(dn, entry))

    batch = []
    batch_bytes = 0
    total_reads = 0
    total_bases = 0

    for path in fast5_paths:
        handle = lq_nanopore.open_fast5(path)
        read_keys = (key for key in lq_nanopore.list_toplevel(handle)
                     if key.startswith('read_'))
        for key in read_keys:
            record = lq_nanopore.get_fastq_from_multi_fast5(
                handle, key).splitlines()
            # The read name is the first space-separated token of the header line.
            read_id = record[0].split(" ")[0]
            bases = record[1]
            quals = record[3]

            batch.append([read_id, bases.upper() if is_upper else bases, quals])

            # Shallow object sizes decide when the current chunk is full.
            batch_bytes += (sys.getsizeof(read_id) + sys.getsizeof(bases) +
                            sys.getsizeof(quals))
            total_bases += len(bases)
            total_reads += 1

            if batch_bytes >= cs:
                yield (batch, total_reads, total_bases)
                # Start a new chunk; the totals keep accumulating.
                batch = []
                batch_bytes = 0

    yield (batch, total_reads, total_bases)
Пример #2
0
def guess_format(fn):
    """Guess the format of the sequence input at path ``fn``.

    A directory is scanned for entries ending in ``.fast5`` and only the first
    one found is inspected. A regular file is classified by its leading bytes;
    anything that is not recognised as BAM is delegated to
    ``__guess_sam_fastx``.

    Args:
        fn: Path to a file, or to a directory expected to contain fast5 files.

    Returns:
        0 for BAM (uncompressed, or inside a gzip stream), 4 for a directory
        whose first fast5 file has no ``/UniqueGlobalKey`` entry, -1 when the
        input cannot be opened or read, when the directory holds no fast5 file
        or when that file contains ``/UniqueGlobalKey``; otherwise whatever
        ``__guess_sam_fastx`` returns.
    """
    # assume fast5 is given in a dir.
    if os.path.isdir(fn):
        logger.info(
            "not a file but a direcory %s is given. looking for fast5 files.."
            % fn)
        for f in os.listdir(fn):
            if f.endswith(".fast5"):
                f5 = lq_nanopore.open_fast5(os.path.join(fn, f))
                if '/UniqueGlobalKey' in f5:
                    logger.error(
                        "single read fast5 is included? it's not supported for sampleqc."
                    )
                    return -1
                return 4

        logger.error("no fast5 is found.")
        return -1

    # Bail out on I/O failure instead of continuing with an unbound handle.
    try:
        fh = open(fn, 'rb')
    except (IOError, OSError):
        logger.error("cannot open %s" % fn)
        return -1

    try:
        with fh:
            magic = fh.read(4)
    except (IOError, OSError):
        logger.error("cannot read %s" % fn)
        return -1

    # pybam and/or biopython way
    # The file is read as bytes, so the magic must be a bytes literal.
    if magic == b'BAM\x01':
        logger.debug("%s is an uncompressed BAM." % fn)
        return 0

    # The gzip magic only counts at the very start of the file.
    if magic.startswith(b'\x1f\x8b'):
        # YF memo: 1f 8b 08 04 code can exist in fq.gz either.
        # So the decompressed payload decides between BAM and sam/fastx.
        try:
            with gzip.open(fn, 'rb') as gz:
                head = gz.read(4)
        except (IOError, OSError, EOFError):
            logger.error("cannot read %s" % fn)
            return -1

        # Compare raw bytes: decoding arbitrary payload bytes may fail.
        if head == b'BAM\x01':
            logger.debug("%s is a compressed BAM." % fn)
            return 0
        return __guess_sam_fastx(fn, isgzip=True)

    return __guess_sam_fastx(fn, isgzip=False)
Пример #3
0
def guess_format(fn):
    """Guess the format of the sequence input at path ``fn``.

    A directory is scanned for entries ending in ``.fast5`` and only the first
    one found is inspected. A regular file is first checked for a BAM or
    ``1f 8b 08 04`` gzip signature, then read as text and classified from its
    first informative lines.

    Args:
        fn: Path to a file, or to a directory expected to contain fast5 files.

    Returns:
        0 for BAM (uncompressed magic, or the ``1f 8b 08 04`` signature),
        1 for SAM, 2 for FASTQ, 3 for FASTA, 4 for a directory whose first
        fast5 file has no ``/UniqueGlobalKey`` entry, and -1 when the input
        cannot be opened, read or decoded, or matches none of the above.
    """
    # assume fast5 is given in a dir.
    if os.path.isdir(fn):
        logger.info(
            "not a file but a direcory %s is given. looking for fast5 files.."
            % fn)
        for f in os.listdir(fn):
            if f.endswith(".fast5"):
                f5 = lq_nanopore.open_fast5(os.path.join(fn, f))
                if '/UniqueGlobalKey' in f5:
                    logger.error(
                        "single read fast5 is included? it's not supported for sampleqc."
                    )
                    return -1
                return 4

        logger.error("no fast5 is found.")
        return -1

    # Bail out on I/O failure instead of continuing with an unbound handle.
    try:
        fh = open(fn, 'rb')
    except (IOError, OSError):
        logger.error("cannot open %s" % fn)
        return -1

    try:
        with fh:
            magic = fh.read(4)
    except (IOError, OSError):
        logger.error("cannot read %s" % fn)
        return -1

    # pybam and/or biopython way
    # The file is read as bytes, so the magic must be a bytes literal.
    if magic == b'BAM\x01':
        return 0
    if magic == b'\x1f\x8b\x08\x04':
        # compressed bam
        return 0

    try:
        fh = open(fn, 'r')
    except (IOError, OSError):
        logger.error("cannot open %s" % fn)
        return -1

    # assume sam, fastx
    at_line_cnt = 0  # consecutive lines starting with '@' seen so far
    try:
        with fh:
            for line in fh:
                if line[0] == '@':
                    at_line_cnt += 1
                    continue

                # SAM records have 11 mandatory tab-separated fields and may
                # carry optional tag columns after them.
                n_cols = len(line.split("\t"))

                if at_line_cnt > 0:
                    # Several '@' lines form a SAM header; a single '@' line is
                    # a SAM header only if a SAM-shaped record follows it,
                    # otherwise it is a FASTQ read name.
                    if at_line_cnt > 1 or n_cols >= 11:
                        return 1
                    return 2

                if line[0] == '>':
                    # fasta
                    return 3

                if n_cols >= 11:
                    # header-less sam
                    return 1
    except UnicodeDecodeError:
        # Binary content that cannot be decoded as text is not sam/fastx.
        logger.error("cannot read %s" % fn)
        return -1

    # something else
    return -1