def main():
    """Groom a FASTQ file: read it in ``input_type`` format and re-emit it
    in ``output_type`` format, optionally summarizing the input.

    Command-line arguments (sys.argv[1:7]):
        input_filename, input_type, output_filename, output_type,
        force_quality_encoding ('None' disables forcing), and a literal
        'summarize_input' flag that enables aggregation statistics.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    aggregator = fastqAggregator()
    out = fastqWriter(open(output_filename, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    # Keep an explicit handle on the input so it can be closed (the original
    # leaked it); close the writer in finally so errors don't leak it either.
    input_file = open(input_filename)
    try:
        for read_count, fastq_read in enumerate(reader(input_file, format=input_type, apply_galaxy_conventions=True)):
            if summarize_input:
                aggregator.consume_read(fastq_read)
            out.write(fastq_read)
    finally:
        input_file.close()
        out.close()
    if read_count is not None:
        print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
        if input_type != output_type and 'solexa' in [input_type, output_type]:
            print("Converted between Solexa and PHRED scores.")
        if summarize_input:
            print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
            ascii_range = aggregator.get_ascii_range()
            decimal_range = aggregator.get_decimal_range()
            # Print using repr, since \x00 (null) causes info truncation in galaxy when printed.
            print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
            print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
    else:
        print("No valid FASTQ reads were provided.")
def main():
    """Compute per-column FASTQ summary statistics and write a
    tab-separated report.

    Command-line arguments: input_filename, output_filename, input_type
    (falls back to 'sanger' when the argument is empty).
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'
    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    # Keep an explicit handle on the input so it can be closed (was leaked).
    input_file = open(input_filename)
    try:
        for num_reads, fastq_read in enumerate(fastqReader(input_file, format=input_type)):
            aggregator.consume_read(fastq_read)
    finally:
        input_file.close()
    # 'with' guarantees the report file is closed even if a statistics
    # lookup below raises (the original leaked it in that case).
    with open(output_filename, 'wb') as out:
        valid_nucleotides = VALID_NUCLEOTIDES
        if fastq_read:
            # The last read seen determines base space vs color space.
            if fastq_read.sequence_space == 'base':
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
            else:
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
                valid_nucleotides = VALID_COLOR_SPACE
        for i in range(aggregator.get_max_read_length()):
            column_stats = aggregator.get_summary_statistics_for_column(i)
            out.write('%i\t' % (i + 1))
            out.write('%s\t' * len(SUMMARY_STAT_ORDER) % tuple([column_stats[key] for key in SUMMARY_STAT_ORDER]))
            out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
            base_counts = aggregator.get_base_counts_for_column(i)
            for nuc in valid_nucleotides:
                out.write("%s\t" % base_counts.get(nuc, 0))
            extra_nucs = sorted([nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides])
            out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))
    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # Print using repr, since \x00 (null) causes info truncation in galaxy when printed.
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def main():
    """Groom a FASTQ dataset from the command line.

    Positional arguments: input path, input format, output path, output
    format, forced quality encoding ('None' disables forcing), and an
    optional literal 'summarize_input' flag.
    """
    src_path = sys.argv[1]
    src_format = sys.argv[2]
    dst_path = sys.argv[3]
    dst_format = sys.argv[4]
    forced_encoding = sys.argv[5]
    want_summary = sys.argv[6] == 'summarize_input'
    if forced_encoding == 'None':
        forced_encoding = None
    stats = fastqAggregator()
    writer = fastqWriter(path=dst_path, format=dst_format, force_quality_encoding=forced_encoding)
    total = None
    reader_cls = fastqVerboseErrorReader if want_summary else fastqReader
    for total, record in enumerate(reader_cls(path=src_path, format=src_format, apply_galaxy_conventions=True)):
        if want_summary:
            stats.consume_read(record)
        writer.write(record)
    writer.close()
    if total is None:
        print("No valid FASTQ reads were provided.")
        return
    print("Groomed %i %s reads into %s reads." % (total + 1, src_format, dst_format))
    if src_format != dst_format and 'solexa' in [src_format, dst_format]:
        print("Converted between Solexa and PHRED scores.")
    if want_summary:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(stats.get_valid_formats()) or "None"))
        ascii_span = stats.get_ascii_range()
        decimal_span = stats.get_decimal_range()
        # repr() keeps \x00 (null) from truncating the message in galaxy.
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_span[0]), ord(ascii_span[0]), repr(ascii_span[1]), ord(ascii_span[1])))
        print("Input decimal range: %i - %i" % (decimal_span[0], decimal_span[1]))
def partition(input_filename, temp_output_filename, fileCount, quality_encoding, verbose):
    """Groom one partition of the input FASTQ into a temporary output file.

    Formats come from the module-level ARGV (input type at ARGV[1], output
    type at ARGV[3]).  When ``verbose`` is set, per-read statistics are
    aggregated and pickled to ``temp_output_filename + "_summary"``.

    ``fileCount`` identifies this worker; it is only used for debugging
    output (currently commented out) and is otherwise unused.
    """
    input_type = ARGV[1]
    output_type = ARGV[3]
    force_quality_encoding = quality_encoding
    summarize_input = verbose
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    aggregator = fastqAggregator()
    temp_process_file = fastqWriter(open(temp_output_filename, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    # Keep an explicit handle on the input so it can be closed (the original
    # leaked it); close the writer in finally so errors don't leak it either.
    input_file = open(input_filename, 'rb')
    try:
        for read_count, fastq_read in enumerate(reader(input_file, format=input_type, apply_galaxy_conventions=True)):
            if summarize_input:
                aggregator.consume_read(fastq_read)
            temp_process_file.write(fastq_read)
    finally:
        input_file.close()
        temp_process_file.close()
    if read_count is not None:
        if input_type != output_type and 'solexa' in [input_type, output_type]:
            print("Converted between Solexa and PHRED scores.")
        if summarize_input:
            # Persist the aggregator state; presumably merged later by the
            # caller from the per-partition "_summary" files — confirm.
            with open(temp_output_filename + "_summary", 'w') as summaryLogFile:
                pickle.dump(aggregator, summaryLogFile)
    else:
        print("No valid FASTQ reads were provided.")
def main():
    """Write per-column FASTQ summary statistics as a tab-separated report.

    Command-line arguments: input path, output path, and input format
    (falls back to 'sanger' when the argument is empty).
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'
    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        aggregator.consume_read(fastq_read)
    # 'with' guarantees the report file is closed even if a statistics
    # lookup below raises (the original leaked it in that case).
    with open(output_filename, 'w') as out:
        valid_nucleotides = VALID_NUCLEOTIDES
        if fastq_read:
            # The last read seen determines base space vs color space.
            if fastq_read.sequence_space == 'base':
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
            else:
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
                valid_nucleotides = VALID_COLOR_SPACE
        for i in range(aggregator.get_max_read_length()):
            column_stats = aggregator.get_summary_statistics_for_column(i)
            out.write('%d\t' % (i + 1))
            out.write("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\t%d\t%d\t" % tuple(column_stats[key] for key in SUMMARY_STAT_ORDER))
            out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
            base_counts = aggregator.get_base_counts_for_column(i)
            for nuc in valid_nucleotides:
                out.write("%s\t" % base_counts.get(nuc, 0))
            extra_nucs = sorted(nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides)
            out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))
    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # Print using repr, since \x00 (null) causes info truncation in galaxy when printed.
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def run(self):
    """Groom the configured input and write it to the configured output.

    Aggregates per-read statistics when ``self.summarize_input`` is set,
    then delegates reporting to ``self._print_output``.
    """
    stats = fastqAggregator()
    reader_cls = fastqVerboseErrorReader if self.summarize_input else fastqReader
    n_seen = None
    writer = fastqWriter(path=self.output_filename, format=self.output_type, force_quality_encoding=self.force_quality_encoding)
    reader = reader_cls(fh=self.file_handle, path=self.input_filename, format=self.input_type, apply_galaxy_conventions=True, fix_id=self.fix_id)
    # Both objects are context managers, so they are closed even on error.
    with writer, reader:
        for n_seen, record in enumerate(reader):
            if self.summarize_input:
                stats.consume_read(record)
            writer.write(record)
    self._print_output(n_seen, stats)
def main():
    """Entry point for grooming a FASTQ dataset.

    Positional arguments: input path, input format, output path, output
    format, forced quality encoding ('None' disables forcing), an optional
    literal 'summarize_input' flag, and an optional literal 'fix_id' flag
    (repairs inconsistent identifiers, e.g. from SRA data dumps).
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    fix_id = False  # fix inconsistent identifiers (SRA data dumps)
    if len(sys.argv) > 7:
        fix_id = sys.argv[7] == 'fix_id'
    aggregator = fastqAggregator()
    out = fastqWriter(path=output_filename, format=output_type, force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader_type = fastqVerboseErrorReader
    else:
        reader_type = fastqReader
    reader = reader_type(path=input_filename, format=input_type, apply_galaxy_conventions=True, fix_id=fix_id)
    # Use the context-manager protocol (as run() already does with the same
    # constructors) so both reader and writer are closed even if grooming
    # raises; the original closed only the writer, and only on success.
    with out, reader:
        for read_count, fastq_read in enumerate(reader):
            if summarize_input:
                aggregator.consume_read(fastq_read)
            out.write(fastq_read)
    _print_output(read_count, input_type, output_type, summarize_input, aggregator)