def main():
    """Copy the records of a FASTQ input to the output without qualities.

    Parses the command line, iterates over ``args.input_sequence`` with
    screed, removes the ``'quality'`` entry from every record and writes the
    record with ``write_record``.  Records whose sequence contains ``'N'``
    are skipped (and counted) unless ``args.n_keep`` is set.  Progress and a
    final summary are printed to stderr.
    """
    args = sanitize_help(get_parser()).parse_args()
    source = args.input_sequence

    print('fastq from ', source, file=sys.stderr)
    writer = get_file_writer(args.output, args.gzip, args.bzip)

    dropped = 0
    for position, read in enumerate(screed.open(source)):
        # Progress marker every 10000 records (including the very first).
        if position % 10000 == 0:
            print('...', position, file=sys.stderr)

        bases = read['sequence']
        if not args.n_keep and 'N' in bases:
            dropped += 1
            continue

        # Dropping the quality entry before handing the record to the writer.
        del read['quality']
        write_record(read, writer)

    print('\n' + 'lines from ' + source, file=sys.stderr)

    if args.n_keep:
        print('No lines dropped from file.', file=sys.stderr)
    else:
        print(str(dropped) + ' lines dropped.', file=sys.stderr)

    print('Wrote output to', describe_file_handle(args.output),
          file=sys.stderr)
def main():
    """Interleave the records of two read files into one output.

    The left (``args.left``) and right (``args.right``) inputs are walked in
    lockstep.  Unless ``args.no_reformat`` is set, names that fail
    ``check_is_left`` / ``check_is_right`` get ``'/1'`` / ``'/2'`` appended
    and each pair is validated with ``check_is_pair``.  Every pair is written
    with ``write_record_pair``.

    Exits with status 1 when the inputs hold a different number of records
    or when a reformatted pair does not look like paired data.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    left_path, right_path = args.left, args.right

    print("Interleaving:\n\t%s\n\t%s" % (left_path, right_path),
          file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    pairs_seen = 0
    left_reads = screed.open(left_path)
    right_reads = screed.open(right_path)

    for left, right in zip_longest(left_reads, right_reads):
        # zip_longest pads the shorter input with None, which is how a
        # record-count mismatch is detected.
        if left is None or right is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if pairs_seen % 100000 == 0:
            print('...', pairs_seen, 'pairs', file=sys.stderr)
        pairs_seen += 1

        left_name = left.name
        right_name = right.name

        if not args.no_reformat:
            if not check_is_left(left_name):
                left_name += '/1'
            if not check_is_right(right_name):
                right_name += '/2'

            left.name = left_name
            right.name = right_name

            if not check_is_pair(left, right):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (left.name, right.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(left, right, outfp)

    print('final: interleaved %d pairs' % pairs_seen, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main():
    """Entry point: interleave two read files into a single paired stream.

    Steps:
      1. Parse arguments and run the input-file / disk-space checks on
         ``args.left`` and ``args.right``.
      2. Iterate both inputs in lockstep; exit with status 1 if one input
         runs out of records before the other.
      3. Unless ``args.no_reformat`` is set, append ``'/1'`` / ``'/2'`` to
         names that fail ``check_is_left`` / ``check_is_right``, store the
         names back on the records, and exit with status 1 if
         ``check_is_pair`` rejects the pair.
      4. Write each pair with ``write_record_pair`` and report the total
         on stderr.
    """
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        # A None on either side means that input ended first.
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        # Progress report is emitted before the increment, so the first
        # line printed is "... 0 pairs".
        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Builds (or loads, when ``args.loadgraph`` is given) a countgraph, wraps
    a ``Normalizer`` in ``WithDiagnostics`` and streams each input through
    it, writing the records it yields.  Output goes either to the single
    file named by ``args.single_output_file`` or to ``<basename>.keep`` per
    input.  Exits with status 1 on duplicate input basenames (per-file
    output mode only) or when stdin is used without a single output file.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    # Per-file outputs are named after the input's basename, so two inputs
    # sharing a basename would collide; with one output file the check is
    # unnecessary.
    filenames = list(args.input_filenames)
    if not args.single_output_file:
        seen_basenames = set()
        for path in filenames:
            base = os.path.basename(path)
            if base in seen_basenames:
                log_error('ERROR: Duplicate filename--Cannot handle this!')
                log_error('** Exiting!')
                sys.exit(1)
            seen_basenames.add(base)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # one object handles diginorm (plus reporting) for all files
    normalizer = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(normalizer, args.report,
                                       args.report_frequency)

    # [filename, require_paired] for every input; the optional
    # unpaired-reads file is never required to be paired.
    files = [[name, args.paired] for name in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    elif '-' in filenames or '/dev/stdin' in filenames:
        print("Accepting input from stdin; output filename must "
              "be provided with '-o'.", file=sys.stderr)
        sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        if not args.single_output_file:
            keep_name = os.path.basename(filename) + '.keep'
            raw_handle = open(keep_name, 'wb')
            outfp = get_file_writer(raw_handle, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            cleaned = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(cleaned,
                                          min_length=args.ksize,
                                          force_single=args.force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is None:
                    continue
                write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(countgraph, False,
                                             max_false_pos=.8)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and corrupt_files:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main():
    """Split an interleaved read file into separate /1 and /2 outputs.

    Output locations default to ``<basename>.1`` / ``<basename>.2`` (inside
    ``args.output_directory`` when given) and can be overridden with
    ``args.output_first`` / ``args.output_second``; both overrides are
    mandatory when reading from stdin.  When ``args.output_orphaned`` is
    set, unpaired records are written there; otherwise pairing is required
    and an ``UnpairedReadsError`` terminates the run with status 1.
    """
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not args.output_first or not args.output_second:
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        target_dir = args.output_directory
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        out1 = os.path.join(target_dir, basename + '.1')
        out2 = os.path.join(target_dir, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    keep_orphans = args.output_orphaned
    if keep_orphans:
        fp_out0 = get_file_writer(keep_orphans, args.gzip, args.bzip)
        out0 = describe_file_handle(keep_orphans)

    n_left = n_right = n_orphans = 0

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(ReadParser(infile),
                                       require_paired=not keep_orphans)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                n_left += 1
                write_record(record2, fp_out2)
                n_right += 1
            elif keep_orphans:
                write_record(record1, fp_out0)
                n_orphans += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (n_left + n_right, n_left, n_right, n_orphans), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if keep_orphans:
        print("orphans in %s" % out0, file=sys.stderr)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Digitally normalize the input files against a shared countgraph.

    The countgraph is loaded with ``Countgraph.load`` when
    ``args.loadgraph`` is set and created from the arguments otherwise.
    Each input is read in broken-paired mode, passed through the
    diagnostics-wrapped ``Normalizer`` and the yielded records are written
    to ``args.single_output_file`` or, failing that, to ``<basename>.keep``.
    Exits with status 1 on clashing input basenames (per-file output only)
    or when stdin is an input and no single output file was named.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    single_output = args.single_output_file

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    known_basenames = set()
    for candidate in args.input_filenames:
        filenames.append(candidate)
        if not single_output:
            stem = os.path.basename(candidate)
            if stem in known_basenames:
                log_error('ERROR: Duplicate filename--Cannot handle this!')
                log_error('** Exiting!')
                sys.exit(1)
            known_basenames.add(stem)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        check_space_for_graph(args.savegraph,
                              calculate_graphsize(args, 'countgraph'),
                              args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    with_diagnostics = WithDiagnostics(Normalizer(args.cutoff, countgraph),
                                       args.report, args.report_frequency)

    # Work list of [filename, require_paired]; the separate unpaired-reads
    # file, when given, is appended with pairing not required.
    files = [[entry, args.paired] for entry in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None

    if single_output:
        outfp = get_file_writer(single_output, args.gzip, args.bzip)
    else:
        reads_stdin = '-' in filenames or '/dev/stdin' in filenames
        if reads_stdin:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        if not single_output:
            outfp = get_file_writer(
                open(os.path.basename(filename) + '.keep', 'wb'),
                args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, single_output,
                             args.force, corrupt_files):
            reader = broken_paired_reader(
                clean_input_reads(screed.open(filename)),
                min_length=args.ksize,
                force_single=args.force_single,
                require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not single_output:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(
        countgraph, False, max_false_pos=.8)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    had_errors = bool(corrupt_files)
    if args.force and had_errors:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main():
    """Separate interleaved reads into a left (/1) and a right (/2) file.

    Default output names are ``<basename>.1`` and ``<basename>.2``, placed
    in ``args.output_directory`` if one is given; ``args.output_first`` and
    ``args.output_second`` override them and are both required when the
    input is stdin.  Orphaned records go to ``args.output_orphaned`` when
    that is set; without it the reader is asked to require pairing and an
    ``UnpairedReadsError`` ends the program with status 1.
    """
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)
    from_stdin = infile in ('/dev/stdin', '-')

    # decide where to put output files - specific directory? or just default?
    if from_stdin:
        both_named = args.output_first and args.output_second
        if not both_named:
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2; otherwise fall back to
    # the default filenames chosen above.
    if not args.output_first:
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    else:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name

    if not args.output_second:
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
    else:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    left_total = 0
    right_total = 0
    orphan_total = 0

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(
        screed.open(infile), require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                left_total += 1
                write_record(record2, fp_out2)
                right_total += 1
                continue

            # Non-pairs are kept only when an orphan output was requested.
            if args.output_orphaned:
                write_record(record1, fp_out0)
                orphan_total += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    summary = (left_total + right_total, left_total, right_total,
               orphan_total)
    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          summary, file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)