def combine_dual_indices(file1, file2):
    # Merge two index-read streams into single records, joining the barcode
    # sequences with '+' and keeping both quality strings.
    Seq = namedtuple('Seq', ['id', 'seq', 'qual', 'qual2'])
    for i1, i2 in zip_longest(fastqlite(file1), fastqlite(file2)):
        # zip_longest pads with None, so mismatched file lengths fail here
        # rather than being silently truncated
        assert i1.id == i2.id
        yield Seq(id=i1.id, seq=i1.seq + '+' + i2.seq,
                  qual=i1.qual, qual2=i2.qual)
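# A quick demonstration of combine_dual_indices (a sketch, not part of the
# tool, assuming fastqlite accepts any iterable of lines): two in-memory
# fastq snippets stand in for real index files, and the record contents are
# invented for illustration.
def _demo_combine_dual_indices():
    import io
    index1 = io.StringIO('@read1 lane1\nACGT\n+\nIIII\n')
    index2 = io.StringIO('@read1 lane1\nTTAA\n+\nJJJJ\n')
    for rec in combine_dual_indices(index1, index2):
        # rec.seq is the two barcodes joined with '+': 'ACGT+TTAA'
        print(rec.id, rec.seq, rec.qual, rec.qual2)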
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'fastq',
        help='reads to count in fastq format',
        metavar='file.fastq[.bz2|.gz]',
        type=Opener())
    parser.add_argument(
        'read_counts',
        help='tabulate read counts and store as a CSV',
        metavar='FILE',
        type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    # count the records and write a single CSV row: file name, count, count
    count = sum(1 for _ in fastqlite(args.fastq))
    read_counts_writer = csv.writer(args.read_counts)
    read_counts_writer.writerow([args.fastq.name, count, count])
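# Usage sketch (file names hypothetical): main() takes its argument list
# explicitly, so the read-count step can be driven directly:
#
#   main(['reads.fastq.gz', 'counts.csv'])
#
# which writes one CSV row (file name, count, count) to counts.csv.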
def main():
    args_parser = argparse.ArgumentParser(
        description="""Given a set of fasta / fastq files (min 2), combine
        them into one fasta (or fastq) file. At minimum, check that there
        are no overlapping IDs. Optionally check for overlapping sequences
        too.
        """)
    args_parser.add_argument(
        'files',
        help='Files to be combined. Minimum of 2 required.',
        nargs='+',
        type=fastalite.Opener(mode='r'))
    args_parser.add_argument(
        '--fastq', '-q',
        help='inputs and output are fastq (not fasta)',
        action='store_true')
    args_parser.add_argument(
        '--check-seq', '-s',
        help=('Also check that sequences are not repeated. '
              'Default is to only check for repeated IDs.'),
        action='store_true')
    args_parser.add_argument(
        '--output', '-o',
        help='File into which we should place our combined reads',
        required=True,
        type=fastalite.Opener(mode='w'))
    args = args_parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if len(args.files) < 2:
        logging.error("Only one file given. Nothing to do.")
        return -1

    out_h = args.output
    reader_fn = fastalite.fastqlite if args.fastq else fastalite.fastalite
    seq_ids = set()
    seqs = set()
    for file_h in args.files:
        for sr in reader_fn(file_h):
            # skip records whose ID (or, with --check-seq, sequence) has
            # already been seen
            if sr.id in seq_ids or (args.check_seq and sr.seq in seqs):
                continue
            seq_ids.add(sr.id)
            if args.check_seq:
                seqs.add(sr.seq)
            if args.fastq:
                out_h.write("@{} {}\n{}\n+\n{}\n".format(
                    sr.id, sr.description, sr.seq, sr.qual))
            else:
                out_h.write(">{} {}\n{}\n".format(
                    sr.id, sr.description, sr.seq))
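# Usage sketch (script and file names hypothetical): merge three fasta
# files into one, dropping repeated IDs and, because of --check-seq,
# repeated sequences as well:
#
#   combine_seqs.py --check-seq -o combined.fasta a.fasta b.fasta c.fasta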
def main(arguments=None):
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index', nargs='+', type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument(
        '-f', '--fastq', type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='reads to filter in fastq format')
    parser.add_argument(
        '-o', '--outfile', default=sys.stdout, type=Opener('w'),
        help='output fastq')
    parser.add_argument(
        '--snifflimit', type=int, default=10000, metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument(
        '--head', type=int, metavar='N',
        help='limit the output file to N records')
    parser.add_argument(
        '--min-pct-assignment', type=float, default=90.0, metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the most
        common barcode represents less than PERCENT of the total
        [%(default)s]"""))
    parser.add_argument(
        '--strict', action='store_true', default=False,
        help='fail if the conditions of --min-pct-assignment are not met')
    parser.add_argument(
        '--invert', action='store_true', default=False,
        help='include only sequences *not* matching the most common barcode')
    # parser.add_argument('--format', choices=['fasta', 'fastq'], default='fastq')
    parser.add_argument(
        '-c', '--show-counts', action='store_true', default=False,
        help='tabulate barcode counts and exit')
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='minimize messages to stderr')
    parser.add_argument(
        '-V', '--version', action=VersionAction, version=__version__,
        help='print the version number and exit')
    args = parser.parse_args(arguments)

    logging.basicConfig(
        format='%(message)s',
        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
    elif len(args.index) == 2:
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        sys.exit(1)  # bcseqs would otherwise be undefined below

    bc1, bc2 = tee(bcseqs, 2)

    # determine the most common barcode
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    barcodes, counts = zip(*barcode_counts.most_common())
    most_common_bc = barcodes[0]
    most_common_pct = 100 * float(counts[0]) / sum(counts)
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], sum(counts), most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    seqs = fastqlite(args.fastq)
    # 'filter' here is presumably a module-level helper defined elsewhere
    # (its four-argument signature rules out the builtin): it yields reads
    # whose index matches most_common_bc (or does not, with --invert)
    filtered = islice(filter(bc2, seqs, most_common_bc, args.invert),
                      args.head)
    for seq in filtered:
        args.outfile.write(as_fastq(seq))
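# Usage sketch for the CLI above (file names hypothetical). Single index:
#
#   barcodecop index_I1.fastq.gz -f reads_R1.fastq.gz -o filtered_R1.fastq
#
# Dual index, keeping only reads that do NOT match the most common barcode:
#
#   barcodecop I1.fastq.gz I2.fastq.gz --invert -f R1.fastq.gz -o out.fastq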
def combine_dual_indices(file1, file2):
    # Variant of the dual-index merge that keeps only IDs and sequences,
    # with no quality strings.
    Seq = namedtuple('Seq', ['id', 'seq'])
    # izip was Python 2 only; the builtin zip is its Python 3 equivalent
    for i1, i2 in zip(fastqlite(file1), fastqlite(file2)):
        assert i1.id == i2.id
        yield Seq(id=i1.id, seq=i1.seq + '+' + i2.seq)
def main(arguments=None):
    parser = argparse.ArgumentParser(
        prog='barcodecop',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'index', nargs='+', type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='one or two files containing index reads in fastq format')
    parser.add_argument(
        '-f', '--fastq', type=Opener(),
        metavar='file.fastq[.bz2|.gz]',
        help='reads to filter in fastq format')
    parser.add_argument(
        '-o', '--outfile', default=sys.stdout, type=Opener('w'),
        help='output fastq')
    parser.add_argument(
        '--snifflimit', type=int, default=10000, metavar='N',
        help='read no more than N records from the index file [%(default)s]')
    parser.add_argument(
        '--head', type=int, metavar='N',
        help='limit the output file to N records')
    parser.add_argument(
        '--invert', action='store_true', default=False,
        help='include only sequences failing the filtering criteria')
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='minimize messages to stderr')
    parser.add_argument(
        '-V', '--version', action=VersionAction, version=__version__,
        help='print the version number and exit')

    match_options = parser.add_argument_group('Barcode matching options')
    match_options.add_argument(
        '--match-filter', action='store_true', default=False,
        help=('filter reads based on exact match to the most common barcode '
              '[default: no match filter]'))
    match_options.add_argument(
        '--min-pct-assignment', type=float, default=90.0, metavar='PERCENT',
        help=("""warn (or fail with an error; see --strict) if the most
        common barcode represents less than PERCENT of the total
        [%(default)s]"""))
    match_options.add_argument(
        '--strict', action='store_true', default=False,
        help='fail if the conditions of --min-pct-assignment are not met')
    match_options.add_argument(
        '-c', '--show-counts', action='store_true', default=False,
        help='tabulate barcode counts and exit')

    qual_options = parser.add_argument_group(
        'Barcode quality filtering options')
    qual_options.add_argument(
        '--qual-filter', action='store_true', default=False,
        help=('filter reads based on minimum index quality '
              '[default: no quality filter]'))
    qual_options.add_argument(
        '-p', '--min-qual', type=int, default=MIN_QUAL,
        help="""reject seqs with mean barcode quality score less than this
        value; for dual indices, both barcodes must meet the threshold
        [%(default)s]""")
    qual_options.add_argument(
        '--encoding', default='phred', choices=['phred'],
        help="""quality score encoding; see
        https://en.wikipedia.org/wiki/FASTQ_format [%(default)s]""")
    args = parser.parse_args(arguments)

    logging.basicConfig(
        format='%(message)s',
        level=logging.ERROR if args.quiet else logging.INFO)
    log = logging.getLogger(__name__)

    # When provided with dual barcodes, concatenate into a single
    # namedtuple with attributes qual and qual2; generate a filter
    # function appropriate for either case.
    if len(args.index) == 1:
        bcseqs = fastqlite(args.index[0])
        qual_filter = get_qual_filter(args.min_qual, args.encoding)
    elif len(args.index) == 2:
        qual_filter = get_qual_filter(args.min_qual, args.encoding,
                                      paired=True)
        bcseqs = combine_dual_indices(*args.index)
    else:
        log.error('error: please specify either one or two index files')
        sys.exit(1)  # bcseqs would otherwise be undefined below

    # use bc1 to determine the most common barcode
    bc1, bc2 = tee(bcseqs, 2)
    barcode_counts = Counter(
        [str(seq.seq) for seq in islice(bc1, args.snifflimit)])
    barcodes, counts = zip(*barcode_counts.most_common())
    most_common_bc = barcodes[0]
    most_common_pct = 100 * float(counts[0]) / sum(counts)
    log.info('most common barcode: {} ({}/{} = {:.2f}%)'.format(
        most_common_bc, counts[0], sum(counts), most_common_pct))

    if args.show_counts:
        for bc, count in barcode_counts.most_common():
            print('{}\t{}\t{}'.format(bc, seqdiff(most_common_bc, bc), count))
        return None

    if most_common_pct < args.min_pct_assignment:
        msg = 'frequency of most common barcode is less than {}%'.format(
            args.min_pct_assignment)
        if args.strict:
            log.error('Error: ' + msg)
            sys.exit(1)
        else:
            log.warning('Warning: ' + msg)

    if not args.fastq:
        log.error('specify a fastq format file to filter using -f/--fastq')
        sys.exit(1)

    ifilterfun = filterfalse if args.invert else filter
    seqs = fastqlite(args.fastq)
    # zip_longest pads with None if the read and index files differ in
    # length, so the id check below fails loudly on a mismatch
    filtered = zip_longest(seqs, bc2)
    if args.match_filter:
        filtered = ifilterfun(get_match_filter(most_common_bc), filtered)
    if args.qual_filter:
        filtered = ifilterfun(qual_filter, filtered)

    for seq, bc in islice(filtered, args.head):
        assert seq.id == bc.id
        args.outfile.write(as_fastq(seq))
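# get_match_filter and get_qual_filter are defined elsewhere in the module;
# the calls above imply that each returns a predicate over the (seq, bc)
# pairs produced by zip_longest(seqs, bc2). Minimal sketches of those shapes
# (assuming phred+33 quality encoding; the real implementations may differ):

def _sketch_get_match_filter(most_common_bc):
    def match_filter(pair):
        # keep the read if its index sequence exactly matches the most
        # common barcode
        _, bc = pair
        return bc.seq == most_common_bc
    return match_filter

def _sketch_get_qual_filter(min_qual, encoding='phred', paired=False):
    def mean_qual(qual):
        # decode phred+33 ASCII quality characters and average them
        scores = [ord(c) - 33 for c in qual]
        return sum(scores) / len(scores)

    def qual_filter(pair):
        _, bc = pair
        if mean_qual(bc.qual) < min_qual:
            return False
        # for dual indices, both barcodes must meet the threshold
        if paired and mean_qual(bc.qual2) < min_qual:
            return False
        return True
    return qual_filter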
def main():
    args_parser = argparse.ArgumentParser(
        description="""Given set(s) of paired reads in fastq format, combine
        them all into one pair of fastq files, concurrently confirming that
        every R1 has a matching R2.
        """)
    args_parser.add_argument(
        '--in-1', '-1',
        help='Read 1 files to be combined.',
        nargs='+',
        required=True,
        type=fastalite.Opener(mode='r'))
    args_parser.add_argument(
        '--in-2', '-2',
        help='Read 2 files to be combined. Must be in the same order as --in-1.',
        nargs='+',
        required=True,
        type=fastalite.Opener(mode='r'))
    args_parser.add_argument(
        '--out-1', '-o1',
        help='File into which we should place our combined R1',
        required=True,
        type=fastalite.Opener(mode='w'))
    args_parser.add_argument(
        '--out-2', '-o2',
        help='File into which we should place our combined R2',
        required=True,
        type=fastalite.Opener(mode='w'))
    args_parser.add_argument(
        '--normalize-ids', '-ni',
        help='Normalize IDs for pairs by stripping /x from the end',
        action='store_true')
    args = args_parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    assert len(args.in_1) == len(args.in_2), \
        "Mismatched number of forward and reverse read files."

    # Pass 1: identify ALL R1 and R2 IDs across all files, warning about
    # IDs duplicated between files.
    IDs_R1 = set()
    IDs_R2 = set()
    logging.info("Looping through files to identify all sequence IDs")
    for r1_h, r2_h in zip(args.in_1, args.in_2):
        # Not a comprehension, so malformed records (ValueError from
        # fastqlite) can be skipped instead of aborting the run.
        file_ids_r1 = set()
        r1_reader = fastalite.fastqlite(r1_h)
        try:
            while True:
                try:
                    sr = next(r1_reader)
                    file_ids_r1.add(get_seq_id(sr.id, args.normalize_ids))
                except ValueError:
                    pass
        except StopIteration:
            pass
        file_ids_r2 = set()
        r2_reader = fastalite.fastqlite(r2_h)
        try:
            while True:
                try:
                    sr = next(r2_reader)
                    file_ids_r2.add(get_seq_id(sr.id, args.normalize_ids))
                except ValueError:
                    pass
        except StopIteration:
            pass
        if len(IDs_R1.intersection(file_ids_r1)) > 0:
            logging.warning(
                "{:,} of {:,} R1 read IDs from this file overlap with others"
                .format(len(IDs_R1.intersection(file_ids_r1)),
                        len(file_ids_r1)))
        if len(IDs_R2.intersection(file_ids_r2)) > 0:
            logging.warning(
                "{:,} of {:,} R2 read IDs from this file overlap with others"
                .format(len(IDs_R2.intersection(file_ids_r2)),
                        len(file_ids_r2)))
        IDs_R1.update(file_ids_r1)
        IDs_R2.update(file_ids_r2)
        # rewind both handles for pass 2
        r1_h.seek(0)
        r2_h.seek(0)

    overlapped_ids = IDs_R1.intersection(IDs_R2)
    starting_num_ids = len(overlapped_ids)
    logging.info(
        "There are {:,} overlapping IDs from {:,} forward read IDs "
        "and {:,} reverse read IDs".format(
            starting_num_ids, len(IDs_R1), len(IDs_R2)))

    # Pass 2: stream each file pair again, writing out reads whose IDs
    # appear in both R1 and R2.
    for r1_h, r2_h in zip(args.in_1, args.in_2):
        srs_r1 = fastalite.fastqlite(r1_h)
        srs_r2 = fastalite.fastqlite(r2_h)
        sr_1 = None
        sr_2 = None
        try:
            # prime each stream with its first parseable record
            while sr_1 is None:
                try:
                    sr_1 = next(srs_r1)
                except ValueError:
                    sr_1 = None
            while sr_2 is None:
                try:
                    sr_2 = next(srs_r2)
                except ValueError:
                    sr_2 = None
            while len(overlapped_ids) > 0:
                if len(overlapped_ids) % 10000 == 0:
                    logging.info("{:,} of {:,} pairs remaining".format(
                        len(overlapped_ids), starting_num_ids))
                # advance each stream until it reaches a read whose ID is
                # still wanted
                while (sr_1 is None) or (
                        get_seq_id(sr_1.id, args.normalize_ids)
                        not in overlapped_ids):
                    try:
                        sr_1 = next(srs_r1)
                    except ValueError:
                        sr_1 = None
                while (sr_2 is None) or (
                        get_seq_id(sr_2.id, args.normalize_ids)
                        not in overlapped_ids):
                    try:
                        sr_2 = next(srs_r2)
                    except ValueError:
                        sr_2 = None
                assert get_seq_id(sr_1.id, args.normalize_ids) == get_seq_id(
                    sr_2.id, args.normalize_ids), \
                    "Read order mismatch between R1 and R2 files"
                # If we get here, the pair is present in both streams.
                # Remove it from the target list (takes care of duplicates)
                overlapped_ids.remove(
                    get_seq_id(sr_1.id, args.normalize_ids))
                # Write out the pair...
                write_fastq(sr_1, args.out_1)
                write_fastq(sr_2, args.out_2)
                # ...and move to the next records
                try:
                    sr_1 = next(srs_r1)
                except ValueError:
                    pass
                try:
                    sr_2 = next(srs_r2)
                except ValueError:
                    pass
        except StopIteration:
            pass
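# get_seq_id and write_fastq are helpers defined elsewhere in this script;
# the calls above imply the following shapes. Minimal sketches (assumption:
# per the --normalize-ids help text, "normalize" strips a trailing /1 or /2
# style suffix so that R1/R2 IDs compare equal):
import re

def _sketch_get_seq_id(seq_id, normalize=False):
    # strip a trailing '/1' or '/2' pair suffix when normalizing
    return re.sub(r'/[12]$', '', seq_id) if normalize else seq_id

def _sketch_write_fastq(sr, out_h):
    # emit one four-line fastq record from a fastqlite-style namedtuple
    out_h.write("@{} {}\n{}\n+\n{}\n".format(
        sr.id, sr.description, sr.seq, sr.qual))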