Python stream_fastq 예제들, roblib.stream_fastq Python 예제들

예제 #1

0

파일 보기

파일: find_mates.py 프로젝트: linsalrob/EdwardsLab

def _write_missed_reads(fqfile, outfile, suffix, miss):
    """
    Copy the reads in fqfile whose id (without mate suffix) is in miss to outfile
    :param fqfile: the fastq file to read
    :param outfile: the fastq file to create
    :param suffix: the mate suffix carried by the read ids in fqfile (".1" or ".2")
    :param miss: the set of read ids (leading @ and mate suffix removed) to keep
    :return:
    """

    with open(outfile, 'w') as out:
        for c, (sid, allid, seq, qual) in enumerate(stream_fastq(fqfile), 1):
            # progress indicator: one dot per 100,000 reads
            if not c % 100000:
                sys.stderr.write(".")
                sys.stderr.flush()
            cut = sid.rfind(suffix)
            if cut < 0:
                # without the mate suffix we cannot derive the shared read id
                sys.stderr.write(f"Skipping {sid}: no {suffix} suffix in the read id\n")
                continue
            test = sid[:cut].replace('@', '', 1)
            if test in miss:
                out.write("@{}\n{}\n+\n{}\n".format(allid, seq, qual))


def print_reads(miss, fq1, fq2):
    """
    Print the missing reads from the two fastq files

    The output files are named <base>_missed_1.fastq and <base>_missed_2.fastq,
    where <base> is parsed from fq1 (which must look like .../<base>_pass_1.fastq),
    and they are written relative to the current working directory. The program
    exits with -1 if the base name can not be parsed or either output exists.

    :param miss: the set of read ids (no leading @, no .1/.2 suffix) to extract
    :param fq1: the first fastq file (read ids end .1)
    :param fq2: the second fastq file (read ids end .2)
    :return:
    """

    # raw string so \w is a regex class, and an escaped dot so only ".fastq" matches
    bn = re.search(r'/(\w+)_pass_1\.fastq', fq1)
    if not bn:
        sys.stderr.write(f"Can't parse the base filename from {fq1}\n")
        sys.exit(-1)

    fqo1 = bn.group(1) + "_missed_1.fastq"
    fqo2 = bn.group(1) + "_missed_2.fastq"
    # check both outputs before writing anything so we never clobber a file
    for fqo in (fqo1, fqo2):
        if os.path.exists(fqo):
            sys.stderr.write(f"Not overwrting {fqo}\n")
            sys.exit(-1)

    sys.stderr.write("Finding reads from {}\n".format(fq1))
    _write_missed_reads(fq1, fqo1, ".1", miss)

    sys.stderr.write("\nFinding reads from {}\n".format(fq2))
    _write_missed_reads(fq2, fqo2, ".2", miss)
    sys.stderr.write("\n")

예제 #2

0

파일 보기

파일: filter_fastq_by_blast.py 프로젝트: linsalrob/EdwardsLab

def filter_fastq(fqf, br, matchout=None, nomatchout=None, verbose=False):
    """
    Filter the fastq file and print out matches or no matches
    :param fqf: The fastq file to filter
    :param br: the set of query blast results
    :param matchout: The file to write matches to (optional; skipped if None)
    :param nomatchout: the file to write no matches to (optional; skipped if None)
    :param verbose: more output
    :return: nothing
    """

    # only open the outputs that were requested: open(None, 'w') raises TypeError
    mo = None
    nmo = None
    matches = 0
    nonmatches = 0
    try:
        if matchout:
            mo = open(matchout, 'w')
        if nomatchout:
            nmo = open(nomatchout, 'w')

        for sid, allid, seq, qual in stream_fastq(fqf):
            if sid in br:
                if mo:
                    mo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
                matches += 1
            else:
                if nmo:
                    nmo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
                # count every non-match, whether or not it is written out
                nonmatches += 1
    finally:
        # always release the handles, even if reading the fastq file fails
        for fh in (mo, nmo):
            if fh:
                fh.close()

    sys.stderr.write(f"{bcolors.GREEN}FINISHED:{bcolors.ENDC} Sequences Matched: {matches} Sequences without match {nonmatches}\n")

예제 #3

0

파일 보기

def fq_ids(fqdir, verbose=False):
    """
    Map every read id found in the fastq files of a directory to its file
    :param fqdir: directory of fastq files
    :param verbose: report each file as it is read
    :return: a dict of read id -> file name with '.fastq' removed
    """

    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    fastqs = (name for name in os.listdir(fqdir) if name.endswith('fastq'))
    for fqf in fastqs:
        if verbose:
            sys.stderr.write("\t{}\n".format(fqf))
        label = fqf.replace('.fastq', '')
        for _, fullid, _, _ in stream_fastq(os.path.join(fqdir, fqf)):
            seen_in = fqids.get(fullid)
            if seen_in == label:
                # already recorded for this very file
                continue
            if seen_in is not None:
                sys.stderr.write(
                    "WARNING: {} is not a unique id. It is in {} and {}\n".
                    format(fullid, seen_in, fqf))
            fqids[fullid] = label
            # also store the id with spaces converted to underscores
            if ' ' in fullid:
                fqids[fullid.replace(' ', '_')] = label

    return fqids

예제 #4

0

파일 보기

파일: fastq2color_strip.py 프로젝트: linsalrob/EdwardsLab

def fq_ids(fqdir, verbose=False):
    """
    Build a lookup of fastq read ids for the fastq files in fqdir
    :param fqdir: directory of fastq files
    :param verbose: write progress to stderr
    :return: a dict whose keys are read ids and whose values are the
             file name (minus '.fastq') the id came from
    """

    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for fqf in os.listdir(fqdir):
        if fqf.endswith('fastq'):
            if verbose:
                sys.stderr.write("\t{}\n".format(fqf))
            source = fqf.replace('.fastq', '')
            records = stream_fastq(os.path.join(fqdir, fqf))
            for _seqid, fullid, _seq, _qual in records:
                if fullid in fqids:
                    previous = fqids[fullid]
                    if previous == source:
                        # same id seen again in the same file: nothing to do
                        continue
                    sys.stderr.write("WARNING: {} is not a unique id. It is in {} and {}\n".format(fullid, previous, fqf))
                fqids[fullid] = source
                if ' ' in fullid:
                    # keep an underscore version of ids that contain spaces
                    underscored = fullid.replace(' ', '_')
                    fqids[underscored] = source

    return fqids

예제 #5

0

파일 보기

파일: count_trailing_kmers.py 프로젝트: bcpd/EdwardsLab

def count_kmers(fqf, kmer, verbose=False):
    """
    Count the frequency of bases in the last kmer bp of each sequence

    Sequences that contain an N are ignored. Sequences shorter than kmer
    only contribute to the positions they actually have.

    :param fqf: fastq file
    :param kmer: length to count
    :param verbose: more output
    :return: a list of kmer entries, each a list of four counts. The final
             entry corresponds to the last base of the reads. The column for
             each base comes from the module level lookup `k`.
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {fqf}{bcolors.ENDC}\n")
    counts = [[0, 0, 0, 0] for x in range(kmer)]
    for sid, seqid, seq, qual in stream_fastq(fqf):
        if 'N' in seq or 'n' in seq:
            continue
        seq = seq[::-1]  # reverse the sequence so position 0 is the last base
        # never index past the end of a read that is shorter than kmer
        for x in range(min(kmer, len(seq))):
            try:
                counts[x][k[seq[x]]] += 1
            except KeyError as e:
                base = e.args[0]
                if base.upper() != "N":
                    sys.stderr.write(
                        f'{bcolors.PINK}Unknown base {base}{bcolors.ENDC}\n')
    # put the positions back into read order
    counts.reverse()
    return counts

예제 #6

0

파일 보기

파일: extract_pcr_reads_from_fq.py 프로젝트: linsalrob/EdwardsLab

def read_fastqs(fastqdir, fname, seqids, verbose=True):
    """
    Read the fastq files and store the sequences we want to save

    Only files in fastqdir whose name contains fname are read. Reads are
    matched on the ids "@<seqid>.1" (left) and "@<seqid>.2" (right).

    :param fastqdir: the directory with fastq files
    :param fname: the likely filename
    :param seqids: the seqids we want to save
    :param verbose: more output
    :return : a dict of seqids: left, left qual, right, right qual
    """

    seqs = {x: [None, None, None, None] for x in seqids}
    # map each read id we want straight to its seqid and the slot where its
    # sequence goes (quality goes in the next slot). This replaces the per-read
    # regex (which used an invalid '\d' escape in a non-raw string).
    wanted = {}
    for s in seqids:
        wanted[f"@{s}.1"] = (s, 0)
        wanted[f"@{s}.2"] = (s, 2)

    for f in os.listdir(fastqdir):
        if fname not in f:
            continue
        if verbose:
            sys.stderr.write("Reading {}\n".format(os.path.join(fastqdir, f)))
        for seqid, header, seq, qualscores in stream_fastq(os.path.join(fastqdir, f)):
            hit = wanted.get(seqid)
            if hit is None:
                continue
            s, slot = hit
            seqs[s][slot] = seq
            seqs[s][slot + 1] = qualscores
    return seqs

예제 #7

0

파일 보기

def filter_fastq(fqf, br, matchout=None, nomatchout=None, verbose=False):
    """
    Filter the fastq file and print out matches or no matches
    :param fqf: The fastq file to filter
    :param br: the set of query blast results
    :param matchout: The file to write matches to (optional; skipped if None)
    :param nomatchout: the file to write no matches to (optional; skipped if None)
    :param verbose: more output
    :return: nothing
    """

    matches = 0
    nonmatches = 0
    # the outputs are optional, so only open what was asked for
    # (open(None, 'w') raises TypeError)
    mo = open(matchout, 'w') if matchout else None
    nmo = None
    try:
        nmo = open(nomatchout, 'w') if nomatchout else None
        for sid, allid, seq, qual in stream_fastq(fqf):
            if sid in br:
                matches += 1
                if mo:
                    mo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
            else:
                # count the non-match even when it is not being written
                nonmatches += 1
                if nmo:
                    nmo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
    finally:
        # close whatever was opened, even if streaming the fastq file failed
        if mo:
            mo.close()
        if nmo:
            nmo.close()

    sys.stderr.write(
        f"{bcolors.GREEN}FINISHED:{bcolors.ENDC} Sequences Matched: {matches} Sequences without match {nonmatches}\n"
    )

예제 #8

0

파일 보기

파일: cpgs.py 프로젝트: linsalrob/EdwardsLab

def countcpgs(fqfile):
    """
    Tally how many sequences have each number of CpGs
    :param fqfile: the fastq file
    :return: a dict of CG count per sequence -> number of sequences
    """

    tally = {}
    for _, _, dna, _ in stream_fastq(fqfile):
        n = dna.count('CG')
        if n in tally:
            tally[n] += 1
        else:
            tally[n] = 1
    return tally

예제 #9

0

파일 보기

def countcpgs(fqfile):
    """
    Build a histogram of the number of 'CG' dinucleotides per sequence
    :param fqfile: the fastq file
    :return: a dict keyed by the CG count, valued by how many sequences had it
    """

    histogram = {}
    for record in stream_fastq(fqfile):
        _seqid, _header, bases, _qual = record
        cpgs = bases.count('CG')
        try:
            histogram[cpgs] += 1
        except KeyError:
            # first sequence with this many CpGs
            histogram[cpgs] = 1
    return histogram

예제 #10

0

파일 보기

def write_sequences(reads, outdir, leftfq, rightfq, singlefq = None, verbose = False):
    """
    Write the sequences out to a file

    For every bin listed in reads for a sequence, the pair is appended to
    <outdir>/<bin>.R1.fastq and <outdir>/<bin>.R2.fastq, and single reads
    are appended to <outdir>/<bin>.single.fastq.

    :param reads: the dict of reads and bins
    :param outdir: the output dir to write to (created if needed)
    :param leftfq: the left reads
    :param rightfq: the right reads
    :param singlefq: the single reads (optional)
    :param verbose: more output
    :return:
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Writing sequences\n{bcolors.ENDC}")

    # makedirs also copes with nested output directories
    os.makedirs(outdir, exist_ok=True)

    files = {}
    singlefiles = {}
    psc = 0
    sc = 0
    try:
        for seqid, header1, seq1, qualscores1, header2, seq2, qualscores2 in stream_paired_fastq(leftfq, rightfq):
            if seqid not in reads:
                continue
            for clst in reads[seqid]:
                if clst not in files:
                    # register each handle as soon as it is open so that the
                    # finally block can always close it
                    files[clst] = []
                    files[clst].append(open(os.path.join(outdir, clst + ".R1.fastq"), 'w'))
                    files[clst].append(open(os.path.join(outdir, clst + ".R2.fastq"), 'w'))
                files[clst][0].write(f"@{header1}\n{seq1}\n+\n{qualscores1}\n")
                files[clst][1].write(f"@{header2}\n{seq2}\n+\n{qualscores2}\n")
            psc += 1

        if singlefq:
            for seqid, header, seq, qualscores in stream_fastq(singlefq):
                if seqid not in reads:
                    continue
                for clst in reads[seqid]:
                    if clst not in singlefiles:
                        singlefiles[clst] = open(os.path.join(outdir, clst + ".single.fastq"), 'w')
                    singlefiles[clst].write(f"@{header}\n{seq}\n+\n{qualscores}\n")
                # only count the single reads that were written, like psc
                sc += 1
    finally:
        for handles in files.values():
            for fh in handles:
                fh.close()
        for fh in singlefiles.values():
            fh.close()

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Wrote {psc} paired end sequences and {sc} single reads\n{bcolors.ENDC}")

예제 #11

0

파일 보기

파일: read_ends.py 프로젝트: bcpd/EdwardsLab

def split_fastq(fqf, outdir, frac, verbose=False):
    """
    Split a fastq file
    :param fqf: fastq file
    :param outdir: output directory to write all the files to
    :param frac: fraction of the sequence for each end
    :param verbose: more output
    :return: nothing
    """

    if not os.path.exists(outdir):
        os.path.mkdir(outdir)

    for seqid, header, seq, qual in stream_fastq(fqf):
        with open (os.path.join(outdir, seq + ".left.fna"), 'w') as out:

예제 #12

0

파일 보기

파일: filter_from_blast.py 프로젝트: maggishaggy/EdwardsLab

def read_fastq(fqfile, blast, verbose=False):
    """
    Stream the fastq file and print every sequence that is not in blast
    :param fqfile:  The fastq file
    :param blast: the reads that had a blast match (ie. reads to drop)
    :param verbose: more output
    :return:
    """

    for seqid, fullid, seq, qual in stream_fastq(fqfile):
        # compare on the id without its leading @
        bare = seqid[1:] if seqid.startswith('@') else seqid
        if bare not in blast and fullid not in blast:
            print("@{}\n{}\n+\n{}".format(fullid, seq, qual))

예제 #13

0

파일 보기

def _tally_kmers(kmers, seq, k):
    """
    Add every k-mer of seq, and of its reverse complement, to kmers
    :param kmers: the dict of kmer counts to update in place
    :param seq: the sequence
    :param k: kmer size
    :return:
    """

    rcseq = rc(seq)
    # a sequence of length n has n - k + 1 k-mers, starting at 0 .. n - k
    for posn in range(len(seq) - k + 1):
        fwd = seq[posn:posn + k]
        rev = rcseq[posn:posn + k]
        kmers[fwd] = kmers.get(fwd, 0) + 1
        kmers[rev] = kmers.get(rev, 0) + 1


def count_kmers(faf, type, k, jsonout=None, verbose=False):
    """
    Count the kmers
    :param faf: fasta file
    :param type: str either fasta or fastq (anything else counts nothing)
    :param k: kmer size
    :param jsonout: optional file to write the counts to as json, keyed by faf
    :param verbose: more output
    :return: a dict of kmers
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Counting kmers (k={k}) in {faf}{bcolors.ENDC}\n")

    kmers = {}

    if type == "fasta":
        for id, seq in stream_fasta(faf):
            _tally_kmers(kmers, seq, k)
    elif type == "fastq":
        for id, fullid, seq, qual in stream_fastq(faf):
            _tally_kmers(kmers, seq, k)

    if jsonout:
        if verbose:
            sys.stderr.write(f"{bcolors.BLUE}\tWriting to {jsonout}{bcolors.ENDC}\n")
        with open(jsonout, 'w') as out:
            json.dump({faf: kmers}, out)

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}\tDone counting kmers (k={k}) in {faf}{bcolors.ENDC}\n")

    return kmers

예제 #14

0

파일 보기

파일: color_based_on_fastq.py 프로젝트: linsalrob/EdwardsLab

def fq_ids(fnames):
    """
    Map each fastq id to the file it was read from
    :param fnames: a list of files
    :return: a dict of id -> file, with every id stored both as it appears
             and with its spaces replaced by underscores
    """

    res = {}
    for f in fnames:
        for _, fullid, _, _ in stream_fastq(f):
            # phylosift munges the ids, so keep both spellings
            for key in (fullid, fullid.replace(' ', '_')):
                res[key] = f

    return res

예제 #15

0

파일 보기

파일: color_based_on_fastq.py 프로젝트: maggishaggy/EdwardsLab

def fq_ids(fnames):
    """
    Collect the fastq ids in each of the files in fnames
    :param fnames: a list of files
    :return: a dict whose keys are the ids (original and underscore-for-space
             versions) and whose values are the file holding that id
    """

    res = {}
    for f in fnames:
        for record in stream_fastq(f):
            original = record[1]
            munged = original.replace(' ', '_')
            # store both versions as phylosift does some munging on the ids
            res.update({original: f, munged: f})

    return res

예제 #16

0

파일 보기

파일: fastq_not_in_bam.py 프로젝트: maggishaggy/EdwardsLab

def extract_fastq(fqf, reads, verbose):
    """
    Print the reads in the fastq file that are not in reads
    :param fqf: fastq file
    :param reads: set of reads to ignore
    :param verbose: report each read that is kept or skipped on stderr
    :return:  nada
    """

    for sid, label, seq, qual in stream_fastq(fqf):
        # ids are compared without their leading @
        sid = sid[1:] if sid.startswith('@') else sid
        if sid in reads:
            if verbose:
                sys.stderr.write("Skipping: {}  -->  {}\n".format(sid, label))
            continue
        if verbose:
            sys.stderr.write("Keeping: {}  -->  {}\n".format(sid, label))
        print("@{}\n{}\n+\n{}".format(label, seq, qual))

예제 #17

0

파일 보기

파일: index_in_fastq.py 프로젝트: bcpd/EdwardsLab

def print_locations(fastqf, s, pl):
    """
    Print every position at which s occurs in each read of fastqf
    :param fastqf: the fastq file
    :param s: the string to look for (overlapping hits are all reported)
    :param pl: also print the sequence length
    :return:
    """

    for seqid, header, seq, qual in stream_fastq(fastqf):
        r = -1
        while True:
            # resume the search one base past the previous hit
            r = seq.find(s, r + 1)
            if r < 0:
                break
            if pl:
                print(f"{seqid}\t{r}\t{len(seq)}")
            else:
                print(f"{seqid}\t{r}")

예제 #18

0

파일 보기

def fq_ids(fnames, verbose=False):
    """
    Map each fastq id to the base name of the file it came from
    :param fnames: a list of files
    :param verbose: more output
    :return: a dict of ids, stored both raw and passed through clean_newick_id
    """

    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for f in fnames:
        for _, fullid, _, _ in stream_fastq(f):
            basename = f.split(os.path.sep)[-1]
            # phylosift munges the ids, so store the cleaned version as well
            for key in (fullid, clean_newick_id(fullid)):
                fqids[key] = basename

    return fqids

예제 #19

0

파일 보기

파일: fastq2ids.py 프로젝트: linsalrob/EdwardsLab

def fq_ids(fnames, verbose=False):
    """
    Collect the fastq ids from each of the files in fnames
    :param fnames: a list of files
    :param verbose: more output
    :return: a dict of id -> last path component of the file; each id is
             stored as read and as returned by clean_newick_id
    """

    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for f in fnames:
        for record in stream_fastq(f):
            raw_id = record[1]
            source = f.split(os.path.sep)[-1]
            fqids[raw_id] = source
            # we keep several versions of the id as phylosift munges them
            cleaned = clean_newick_id(raw_id)
            fqids[cleaned] = source

    return fqids

예제 #20

0

파일 보기

def fq_ids(fnames, verbose=False):
    """
    Map every fastq id in the files in fnames to its file
    :param fnames: a list of files
    :param verbose: more output
    :return: a dict of ids, each stored as read and with spaces as underscores
    """

    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for f in fnames:
        for _seqid, fullid, _seq, _qual in stream_fastq(f):
            # phylosift munges ids, so record the underscore version too
            fqids[fullid] = f
            fqids[fullid.replace(' ', '_')] = f

    return fqids

예제 #21

0

파일 보기

파일: pcr_fastq_coverage.py 프로젝트: maggishaggy/EdwardsLab

def parse_dir(dir, verbose=False):
    """
    Parse the directory of files and total the sequence length per primer

    File names must look like <srr>_<PrimerX>_...; files that do not are
    reported on stderr and skipped.

    :param dir: the directory of fastq files
    :param verbose: more output (currently unused)
    :return: a dict of srr -> dict of primer -> total bp of sequence
    """

    lengths = {}
    for f in os.listdir(dir):
        # raw string so \w is a regex class and not a string escape
        m = re.search(r'^(\w+)_(Primer\w)_', f)
        if not m:
            sys.stderr.write("Error: can't parse {}\n".format(f))
            continue
        (srr, primer) = m.groups()
        if srr not in lengths:
            lengths[srr] = {'PrimerA' : 0, 'PrimerB' : 0, 'PrimerC' : 0}
        total = sum(len(seq) for seqid, header, seq, qualscores in stream_fastq(os.path.join(dir, f)))
        # the regex accepts any Primer<x>, so don't assume it is A, B or C
        lengths[srr][primer] = lengths[srr].get(primer, 0) + total
    return lengths

예제 #22

0

파일 보기

파일: extract_pcr_reads_from_fq.py 프로젝트: maggishaggy/EdwardsLab

def read_fastqs(fastqdir, fname, seqids, verbose=True):
    """
    Read the fastq files and store the sequences we want to save

    Every file in fastqdir with fname in its name is read, and the reads
    called "@<seqid>.1" and "@<seqid>.2" are kept as the left and right
    read for that seqid.

    :param fastqdir: the directory with fastq files
    :param fname: the likely filename
    :param seqids: the seqids we want to save
    :param verbose: more output
    :return : a dict of seqids: left, left qual, right, right qual
    """

    seqs = {x: [None, None, None, None] for x in seqids}
    # Look the read id up directly instead of stripping it with a regex on
    # every read: the old pattern '.\d$' was a non-raw string with an invalid
    # escape and an unescaped dot. The value is (seqid, index of the sequence).
    wanted = {}
    for s in seqids:
        wanted[f"@{s}.1"] = (s, 0)
        wanted[f"@{s}.2"] = (s, 2)

    for f in os.listdir(fastqdir):
        if fname not in f:
            continue
        fqpath = os.path.join(fastqdir, f)
        if verbose:
            sys.stderr.write("Reading {}\n".format(fqpath))
        for seqid, header, seq, qualscores in stream_fastq(fqpath):
            if seqid not in wanted:
                continue
            s, idx = wanted[seqid]
            seqs[s][idx] = seq
            seqs[s][idx + 1] = qualscores
    return seqs

예제 #23

0

파일 보기

__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    # Split a fastq file into one file per leading tag. A read belongs to a
    # tag when one of the known tag sequences starts within its first 25 bp;
    # the file is named after everything up to and including that tag.
    # Reads without a tag go to none.fastq.
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fastq file', required=True)
    parser.add_argument('-o', help='output directory', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # the longer tag comes first so it wins when both are present
    tags = ['GGTTCACTTGAGACAC', 'CTTGAGACAC']

    os.makedirs(args.o, exist_ok=True)
    outfiles = {}
    try:
        outfiles['none'] = open(os.path.join(args.o, "none.fastq"), 'w')

        for seqid, header, seq, qual in stream_fastq(args.f):
            tag = 'none'
            for t in tags:
                # search once per tag rather than `in` plus two index() calls
                posn = seq.find(t)
                if -1 < posn < 25:
                    tag = seq[0:posn + len(t)]
                    break
            if tag not in outfiles:
                outfiles[tag] = open(os.path.join(args.o, f"{tag}.fastq"),
                                     'w')
            outfiles[tag].write(f"@{header}\n{seq}\n+\n{qual}\n")
    finally:
        # close every handle, even if reading or writing failed part way
        for t in outfiles:
            outfiles[t].close()

예제 #24

0

파일 보기

파일: pair_fastq_files.py 프로젝트: shaferab/EdwardsLab

"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Check paired end files and make sure the pairs match up')
    parser.add_argument(
        '-l',
        help='The file where the reads end /1 (the option is a lowercase L)',
        required=True)
    parser.add_argument('-r',
                        help='The file where the reads end /2',
                        required=True)
    args = parser.parse_args()

    # left reads, keyed by the read id without its /1 marker
    lseq = {}
    for (seqid, header, seq, qual) in stream_fastq(args.l):
        if not seqid.endswith('/1'):
            sys.stderr.write(
                "Sequence {} in {} does not appear to be a read /1\n".format(
                    seqid, args.l))
            continue
        # remove only the trailing /1: replace() would also strip any
        # other /1 that happens to be inside the id
        seqid = seqid[:-2]
        lseq[seqid] = [header, seq, qual]

    rseq = {}
    for (seqid, header, seq, qual) in stream_fastq(args.r):
        if not seqid.endswith('/2'):
            # this read came from the -r file, so report that file name
            sys.stderr.write(
                "Sequence {} in {} does not appear to be a read /2\n".format(
                    seqid, args.r))
            continue

예제 #25

0

파일 보기

파일: countfastq.py 프로젝트: shaferab/EdwardsLab

"""

import os
import sys
import argparse
from roblib import stream_fastq

__author__ = 'Rob Edwards'

if __name__ == "__main__":
    # Read every sequence length from the fastq file given with -f and
    # derive summary thresholds (n50, n75) from the cumulative length.
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='fasta file', required=True)
    args = parser.parse_args()

    # the length of every read, sorted ascending (shortest first)
    lens = []
    for (sid, label, seq, qual) in stream_fastq(args.f):
        lens.append(len(seq))
    lens.sort()
    # total number of bases in the file
    length=sum(lens)

    len_so_far = 0
    n50 = None
    n75 = None

    # walk the sorted lengths and record the read length at which the running
    # total first reaches 50% and 75% of all bases
    # NOTE(review): the running total starts from the shortest reads; N50 is
    # conventionally accumulated from the longest - confirm this is intended
    for i in lens:
        len_so_far += i
        if not n50 and len_so_far >= length * 0.5:
            n50 = i
        if not n75 and len_so_far >= length * 0.75:
            n75 = i

예제 #26

0

파일 보기

파일: countfastq.py 프로젝트: bcpd/EdwardsLab

        files = []

    if args.d:
        for subdir in args.d:
            for f in os.listdir(subdir):
                files.append(os.path.join(subdir, f))

    overall = {'number': 0, 'total': 0, 'shortest': 1e6, 'longest': 0}
    for faf in files:
        if not os.path.exists(faf):
            sys.stderr.write(
                f"{bcolors.RED}FATAL: {faf} not found{bcolors.ENDC}\n")
            sys.exit(1)

        lens = []
        for (sid, label, seq, qual) in stream_fastq(faf):
            lens.append(len(seq))
        lens.sort()
        length = sum(lens)

        len_so_far = 0
        n50 = None
        n75 = None
        auN = 0
        for i in lens:
            len_so_far += i
            if not n50 and len_so_far >= length * 0.5:
                n50 = i
            if not n75 and len_so_far >= length * 0.75:
                n75 = i
            auN += i**2

예제 #27

0

파일 보기

        sys.stderr.write(f"{bcolors.GREEN}Filtering on length{bcolors.ENDC}\n")

    fqnew = []
    for s in fq:
        if len(s[2]) > length:
            fqnew.append(s)
    return fqnew


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-f', help='fastq file', required=True)
    parser.add_argument(
        '-m',
        type=int,
        help='filter based on sequence length. Supply minimum length')
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # hold the whole file in memory as [id, header, sequence, scores] records
    # so that the filters can be applied one after another
    fq = [[seqid, header, seq, scores]
          for seqid, header, seq, scores in stream_fastq(args.f)]

    if args.m:
        fq = filter_len(fq, args.m, args.v)

    # write whatever survived the filters back out as fastq
    for _, header, seq, scores in fq:
        print(f"@{header}\n{seq}\n+\n{scores}")

예제 #28

0

파일 보기

                        help='base file name. Everything upto the _R1',
                        required=True)
    parser.add_argument('-q',
                        help='QC dir (default: %(default)s',
                        default='QC')
    parser.add_argument('-o', help='output directory', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    os.makedirs(args.o, exist_ok=True)
    dna = {}
    qual = {}
    header = {}

    # initially didn't plan to keep all these :)
    for seqid, hd, seq, qualscores in stream_fastq(args.f):
        dna[seqid] = seq.upper()
        qual[seqid] = qualscores
        header[seqid] = hd

    changed = set()
    deleted = set()
    for step in range(1, 10):
        if args.v:
            message(f"Working on step {step}", "GREEN")
        fqf = os.path.join(args.q, f"step_{step}",
                           f"{args.n}.s{step}.out.fastq")
        if not os.path.exists(fqf):
            message(f"FQ File {fqf} not found", "RED")
            continue
        seqs = []

예제 #29

0

파일 보기

    if not args.forward and not args.reverse:
        message(
            "Either --forward or --reverse primer must be specified otherwise nothing will be removed"
        )
        sys.exit(-1)

    fwd = None
    rev = None
    if args.forward:
        fwd = args.forward.upper()
    if args.reverse:
        rev = args.reverse.upper()

    with open(args.o, 'w') as out:
        for sid, seqid, seq, qual in stream_fastq(args.f):
            original = [seq, qual]
            trimmed = False
            if fwd and fwd in seq.upper():
                idx = seq.upper().index(fwd)
                if idx < args.maxfwd:
                    if idx > 10:
                        message(
                            f"WARNING: Trimming forward primer {fwd} from {sid} starting at position {idx}",
                            "PINK")
                    seq = seq[idx + len(args.forward):]
                    qual = qual[idx + len(args.forward):]
                    trimmed = True
                else:
                    if args.v:
                        message(