def extract_reads_for_lane(fastq,lane):
    """
    Yield the reads in a Fastq file that belong to one lane

    Generator function: the reads are obtained from
    'getreads_regex' using a pattern that skips three
    colon-terminated fields from the start of the text
    and then requires the lane followed by a colon.
    Each matching read is joined into a single
    newline-separated string before being yielded.

    Example usage:

    >>> for r in extract_reads_for_lane('illumina_R1.fq',2):
    ...     print(r)

    Arguments:
      fastq (str): path to Fastq (can be gzipped)
      lane (int): lane number to select; it is
        interpolated into the regular expression as-is

    Yields:
      String: matching read record as a string.
    """
    # Lane is the fourth colon-delimited field
    lane_pattern = r"^([^:]*:){3}%s:" % lane
    matching_reads = getreads_regex(fastq,lane_pattern)
    for record in matching_reads:
        yield '\n'.join(record)
# Example #2
0
def extract_reads_for_lane(fastq, lane):
    """
    Iterate over the reads from a single lane of a Fastq file

    Generator function which asks 'getreads_regex' for the
    reads matching a lane-specific pattern (three
    colon-terminated fields, then the lane number and a
    colon) and yields each one as a single string with its
    lines joined by newlines.

    Example usage:

    >>> for r in extract_reads_for_lane('illumina_R1.fq',2):
    ...     print(r)

    Arguments:
      fastq (str): path to Fastq (can be gzipped)
      lane (int): lane number to select; it is inserted
        into the regular expression without escaping

    Yields:
      String: matching read record as a string.
    """
    lane_regex = r"^([^:]*:){3}%s:" % lane
    records = ('\n'.join(lines)
               for lines in getreads_regex(fastq, lane_regex))
    for record in records:
        yield record
def main(args=None):
    """
    Command line driver: extract subsets of reads from input files

    Two modes are supported:

    * '-m PATTERN': for each input file, write the reads returned
      by 'getreads_regex' to '<name>.subset_regex.fq'
    * '-n NREADS': pick a random set of read indices (NREADS is
      either a count or a percentage such as '50%') and, for each
      input file, write the reads returned by 'getreads_subset'
      for those indices to '<name>.subset_<N>.fq'

    '<name>' is the input file's basename with any trailing '.gz'
    and then the final extension removed; output files are opened
    relative to the current working directory.

    Arguments:
      args (list): command line arguments (defaults to
        'sys.argv[1:]' if not supplied)

    Raises:
      SystemExit: with status 2 on invalid command lines (via the
        option parser), or status 1 if the input files hold
        differing numbers of reads or the requested subset is
        larger than the number of reads.
    """
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = optparse.OptionParser(usage="%prog -m PATTERN |-n NREADS infile "
                              "[ infile ... ]",
                              version="%prog "+__version__,
                              description=__description__)
    p.add_option('-m','--match',action='store',dest='pattern',default=None,
                 help="extract records that match Python regular "
                 "expression PATTERN")
    p.add_option('-n',action='store',dest='n',default=None,
                 help="extract N random reads from the input file(s). "
                 "If multiple files are supplied (e.g. R1/R2 pair) then "
                 "the same subsets will be extracted for each. "
                 "(Optionally a percentage can be supplied instead e.g. "
                 "'50%' to extract a subset of half the reads.)")
    p.add_option('-s','--seed',action='store',dest='seed',default=None,
                 help="specify seed for random number generator (used "
                 "for -n option; using the same seed should produce the "
                 "same 'random' sample of reads)")
    opts,args = p.parse_args(args)
    if len(args) < 1:
        p.error("Need to supply at least one input file")

    def output_name(path,suffix):
        # Strip directory, optional '.gz' and the final extension,
        # then append the subset-specific suffix
        if path.endswith('.gz'):
            path = path[:-3]
        return os.path.basename(os.path.splitext(path)[0]) + suffix

    # Pattern matching option
    if opts.pattern is not None:
        if opts.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print("Extracting reads matching '%s'" % opts.pattern)
        for f in args:
            outfile = output_name(f,'.subset_regex.fq')
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,opts.pattern):
                    fp.write('\n'.join(read) + '\n')
        return
    # Random subset option: validate -n before reading any files so
    # that bad command lines fail quickly with a usage message
    # instead of an uncaught exception
    if opts.n is None:
        p.error("Need to supply one of -n or -m options")
    spec = str(opts.n)
    nsubset = None
    percent = None
    try:
        nsubset = int(spec)
    except ValueError:
        if spec.endswith('%'):
            try:
                percent = float(spec[:-1])
            except ValueError:
                percent = None
        # The chained comparison also rejects NaN and infinity
        if percent is None or not (0.0 <= percent < float('inf')):
            p.error("Invalid value for -n: '%s' (expected a number of "
                    "reads or a percentage e.g. '50%%')" % spec)
    else:
        if nsubset < 0:
            p.error("Invalid value for -n: '%s' (cannot be negative)"
                    % spec)
    # Seed random number generator
    if opts.seed is not None:
        random.seed(opts.seed)
    # Count the reads
    nreads = sum(1 for i in getreads(args[0]))
    print("Number of reads: %s" % nreads)
    if len(args) > 1:
        print("Verifying read numbers match between files")
    for f in args[1:]:
        if sum(1 for i in getreads(f)) != nreads:
            print("Inconsistent numbers of reads between files")
            sys.exit(1)
    # Convert a percentage into a number of reads
    if percent is not None:
        nsubset = int(percent*nreads/100.0)
    if nsubset > nreads:
        print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                  nreads))
        sys.exit(1)
    # Generate a subset of read indices to extract
    print("Generating set of %s random indices" % nsubset)
    try:
        index_range = xrange # Python 2: avoids building a full list
    except NameError:
        index_range = range
    subset_indices = random.sample(index_range(nreads),nsubset)
    # Extract the reads to separate files
    for f in args:
        outfile = output_name(f,'.subset_%s.fq' % nsubset)
        print("Extracting to %s" % outfile)
        with open(outfile,'w') as fp:
            for read in getreads_subset(f,subset_indices):
                fp.write('\n'.join(read) + '\n')
# Example #4
0
def main(args=None):
    """
    Command line driver: extract subsets of reads from input files

    Two modes are supported:

    * '-m PATTERN': for each input file, write the reads returned
      by 'getreads_regex' to '<name>.subset_regex.fq'
    * '-n NREADS': pick a random set of read indices (NREADS is
      either a count or a percentage such as '50%') and, for each
      input file, write the reads returned by 'getreads_subset'
      for those indices to '<name>.subset_<N>.fq'

    '<name>' is the input file's basename with any trailing '.gz'
    and then the final extension removed; output files are opened
    relative to the current working directory.

    Arguments:
      args (list): command line arguments (defaults to
        'sys.argv[1:]' if not supplied)

    Raises:
      SystemExit: with status 2 on invalid command lines (via the
        argument parser), or status 1 if the input files hold
        differing numbers of reads or the requested subset is
        larger than the number of reads.
    """
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = argparse.ArgumentParser(description=__description__)
    # The 'version' keyword of ArgumentParser is not accepted by
    # Python 3; an explicit 'version' action provides the same
    # -v/--version options on both Python 2.7 and 3
    p.add_argument('-v','--version',action='version',
                   version="%(prog)s "+__version__)
    p.add_argument('-m','--match',action='store',dest='pattern',
                   default=None,
                   help="extract records that match Python regular "
                   "expression PATTERN")
    p.add_argument('-n',action='store',dest='n',default=None,
                   help="extract N random reads from the input file(s). "
                   "If multiple files are supplied (e.g. R1/R2 pair) then "
                   "the same subsets will be extracted for each. "
                   "(Optionally a percentage can be supplied instead e.g. "
                   "'50%%' to extract a subset of half the reads.)")
    p.add_argument('-s','--seed',action='store',dest='seed',default=None,
                   help="specify seed for random number generator (used "
                   "for -n option; using the same seed should produce the "
                   "same 'random' sample of reads)")
    p.add_argument('infiles',metavar='infile',nargs='+',
                   help="input FASTQ, CSFASTA, or QUAL file")
    args = p.parse_args(args)

    def output_name(path,suffix):
        # Strip directory, optional '.gz' and the final extension,
        # then append the subset-specific suffix
        if path.endswith('.gz'):
            path = path[:-3]
        return os.path.basename(os.path.splitext(path)[0]) + suffix

    # Pattern matching option
    if args.pattern is not None:
        if args.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print("Extracting reads matching '%s'" % args.pattern)
        for f in args.infiles:
            outfile = output_name(f,'.subset_regex.fq')
            print("Extracting to %s" % outfile)
            with open(outfile,'w') as fp:
                for read in getreads_regex(f,args.pattern):
                    fp.write('\n'.join(read) + '\n')
        return
    # Random subset option: validate -n before reading any files so
    # that bad command lines fail quickly with a usage message
    # instead of an uncaught exception
    if args.n is None:
        p.error("Need to supply one of -n or -m options")
    spec = str(args.n)
    nsubset = None
    percent = None
    try:
        nsubset = int(spec)
    except ValueError:
        if spec.endswith('%'):
            try:
                percent = float(spec[:-1])
            except ValueError:
                percent = None
        # The chained comparison also rejects NaN and infinity
        if percent is None or not (0.0 <= percent < float('inf')):
            p.error("Invalid value for -n: '%s' (expected a number of "
                    "reads or a percentage e.g. '50%%')" % spec)
    else:
        if nsubset < 0:
            p.error("Invalid value for -n: '%s' (cannot be negative)"
                    % spec)
    # Seed random number generator
    if args.seed is not None:
        random.seed(args.seed)
    # Count the reads
    nreads = sum(1 for i in getreads(args.infiles[0]))
    print("Number of reads: %s" % nreads)
    if len(args.infiles) > 1:
        print("Verifying read numbers match between files")
    for f in args.infiles[1:]:
        if sum(1 for i in getreads(f)) != nreads:
            print("Inconsistent numbers of reads between files")
            sys.exit(1)
    # Convert a percentage into a number of reads
    if percent is not None:
        nsubset = int(percent*nreads/100.0)
    if nsubset > nreads:
        print("Requested subset (%s) is larger than file (%s)" % (nsubset,
                                                                  nreads))
        sys.exit(1)
    # Generate a subset of read indices to extract
    print("Generating set of %s random indices" % nsubset)
    try:
        index_range = xrange # Python 2: avoids building a full list
    except NameError:
        index_range = range
    subset_indices = random.sample(index_range(nreads),nsubset)
    # Extract the reads to separate files
    for f in args.infiles:
        outfile = output_name(f,'.subset_%s.fq' % nsubset)
        print("Extracting to %s" % outfile)
        with open(outfile,'w') as fp:
            for read in getreads_subset(f,subset_indices):
                fp.write('\n'.join(read) + '\n')