Example #1
import sys

# FASTQ helpers from Galaxy's sequence utilities (galaxy_sequence_utils)
from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader, fastqVerboseErrorReader, fastqWriter


def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    
    aggregator = fastqAggregator()
    out = fastqWriter( open( output_filename, 'wb' ), format = output_type, force_quality_encoding = force_quality_encoding )
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type, apply_galaxy_conventions = True ) ):
        if summarize_input:
            aggregator.consume_read( fastq_read )
        out.write( fastq_read )
    out.close()
    
    if read_count is not None:
        print "Groomed %i %s reads into %s reads." % ( read_count + 1, input_type, output_type )
        if input_type != output_type and 'solexa' in [ input_type, output_type ]:
            print "Converted between Solexa and PHRED scores."
        if summarize_input:
            print "Based upon quality and sequence, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() )  or "None" )
            ascii_range = aggregator.get_ascii_range()
            decimal_range =  aggregator.get_decimal_range()
            print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
            print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )        
    else:
        print "No valid FASTQ reads were provided."
Example #2
import sys

from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader

# VALID_NUCLEOTIDES, VALID_COLOR_SPACE and SUMMARY_STAT_ORDER are module-level
# constants in the original script; see the sketch after this example.


def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(
            fastqReader(open(input_filename), format=input_type)):
        aggregator.consume_read(fastq_read)
    out = open(output_filename, 'wb')
    valid_nucleotides = VALID_NUCLEOTIDES
    if fastq_read:
        if fastq_read.sequence_space == 'base':
            out.write(
                '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n'
            )
        else:
            out.write(
                '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n'
            )
            valid_nucleotides = VALID_COLOR_SPACE
    for i in range(aggregator.get_max_read_length()):
        column_stats = aggregator.get_summary_statistics_for_column(i)
        out.write('%i\t' % (i + 1))
        out.write('%s\t' * len(SUMMARY_STAT_ORDER) %
                  tuple([column_stats[key] for key in SUMMARY_STAT_ORDER]))
        out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
        base_counts = aggregator.get_base_counts_for_column(i)
        for nuc in valid_nucleotides:
            out.write("%s\t" % base_counts.get(nuc, 0))
        extra_nucs = sorted([
            nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides
        ])
        out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(
            str(base_counts[nuc]) for nuc in extra_nucs)))
    out.close()
    if num_reads is None:
        print "No valid fastq reads could be processed."
    else:
        print "%i fastq reads were processed." % (num_reads + 1)
        print "Based upon quality values and sequence characters, the input data is valid for: %s" % (
            ", ".join(aggregator.get_valid_formats()) or "None")
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        print "Input ASCII range: %s(%i) - %s(%i)" % (
            repr(ascii_range[0]), ord(ascii_range[0]), repr(
                ascii_range[1]), ord(ascii_range[1])
        )  #print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print "Input decimal range: %i - %i" % (decimal_range[0],
                                                decimal_range[1])
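Example #2 relies on three module-level names that the snippet does not define: VALID_NUCLEOTIDES, VALID_COLOR_SPACE and SUMMARY_STAT_ORDER. Their likely shape can be read off the header rows it writes; the sketch below is an inference, not the original definitions.

# Inferred from the header columns in Example #2 (hypothetical):
VALID_NUCLEOTIDES = ['A', 'C', 'G', 'T', 'N']
VALID_COLOR_SPACE = ['0', '1', '2', '3', '4', '5', '6', '.']
# The eleven per-column statistics in header order (count .. rW); the actual
# key names used by fastqAggregator may differ.
SUMMARY_STAT_ORDER = ['count', 'min', 'max', 'sum', 'mean', 'q1', 'med', 'q3', 'iqr', 'lW', 'rW']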
Example #3
import sys

from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader, fastqVerboseErrorReader, fastqWriter


def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    out = fastqWriter(path=output_filename,
                      format=output_type,
                      force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    for read_count, fastq_read in enumerate(
            reader(path=input_filename,
                   format=input_type,
                   apply_galaxy_conventions=True)):
        if summarize_input:
            aggregator.consume_read(fastq_read)
        out.write(fastq_read)
    out.close()

    if read_count is not None:
        print("Groomed %i %s reads into %s reads." %
              (read_count + 1, input_type, output_type))
        if input_type != output_type and 'solexa' in [input_type, output_type]:
            print("Converted between Solexa and PHRED scores.")
        if summarize_input:
            print(
                "Based upon quality and sequence, the input data is valid for: %s"
                % (", ".join(aggregator.get_valid_formats()) or "None"))
            ascii_range = aggregator.get_ascii_range()
            decimal_range = aggregator.get_decimal_range()
            print(
                "Input ASCII range: %s(%i) - %s(%i)" %
                (repr(ascii_range[0]), ord(ascii_range[0]), repr(
                    ascii_range[1]), ord(ascii_range[1]))
            )  # print using repr, since \x00 (null) causes info truncation in galaxy when printed
            print("Input decimal range: %i - %i" %
                  (decimal_range[0], decimal_range[1]))
    else:
        print("No valid FASTQ reads were provided.")
Example #4
import pickle

from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader, fastqVerboseErrorReader, fastqWriter


def partition(input_filename, temp_output_filename, fileCount,
              quality_encoding, verbose):
    #    print 'Starting Thread: ' + str(fileCount)
    input_type = ARGV[1]  # ARGV: presumably a module-level copy of sys.argv in the original script
    output_type = ARGV[3]
    force_quality_encoding = quality_encoding
    summarize_input = verbose
    if force_quality_encoding == 'None':
        force_quality_encoding = None
    aggregator = fastqAggregator()
    temp_process_file = fastqWriter(
        open(temp_output_filename, 'wb'),
        format=output_type,
        force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    for read_count, fastq_read in enumerate(
            reader(open(input_filename, 'rb'),
                   format=input_type,
                   apply_galaxy_conventions=True)):
        if summarize_input:
            aggregator.consume_read(fastq_read)
        temp_process_file.write(fastq_read)
        # print "Just wrote (%d): " % read_count + str(fastq_read)
    temp_process_file.close()
    if read_count is not None:
        if input_type != output_type and 'solexa' in [input_type, output_type]:
            print "Converted between Solexa and PHRED scores."
        if summarize_input:
            with open(temp_output_filename + "_summary",
                      'w') as summaryLogFile:
                pickle.dump(aggregator, summaryLogFile)
    else:
        print "No valid FASTQ reads were provided."
Example #5
import sys

from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader


def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        aggregator.consume_read(fastq_read)
    out = open(output_filename, 'w')
    valid_nucleotides = VALID_NUCLEOTIDES
    if fastq_read:
        if fastq_read.sequence_space == 'base':
            out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
        else:
            out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
            valid_nucleotides = VALID_COLOR_SPACE
    for i in range(aggregator.get_max_read_length()):
        column_stats = aggregator.get_summary_statistics_for_column(i)
        out.write('%d\t' % (i + 1))
        out.write("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\t%d\t%d\t" % tuple(column_stats[key] for key in SUMMARY_STAT_ORDER))
        out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
        base_counts = aggregator.get_base_counts_for_column(i)
        for nuc in valid_nucleotides:
            out.write("%s\t" % base_counts.get(nuc, 0))
        extra_nucs = sorted(nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides)
        out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))
    out.close()
    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))  # print(using repr, since \x00 (null) causes info truncation in galaxy when printed)
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
Example #6
    def run(self):
        aggregator = fastqAggregator()
        reader_class = fastqReader
        if self.summarize_input:
            reader_class = fastqVerboseErrorReader
        read_count = None

        writer = fastqWriter(
            path=self.output_filename,
            format=self.output_type,
            force_quality_encoding=self.force_quality_encoding)
        reader = reader_class(fh=self.file_handle,
                              path=self.input_filename,
                              format=self.input_type,
                              apply_galaxy_conventions=True,
                              fix_id=self.fix_id)
        with writer, reader:
            for read_count, fastq_read in enumerate(reader):
                if self.summarize_input:
                    aggregator.consume_read(fastq_read)
                writer.write(fastq_read)

        self._print_output(read_count, aggregator)
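run() in Example #6 only makes sense on an object that carries the groomer's configuration. The sketch below is a hypothetical wrapper listing the attributes the method reads (names taken from the method body); the real class in the original code is not shown and may differ.

class FastqGroomer(object):
    # Hypothetical container for the attributes Example #6's run() expects.
    # run() also calls self._print_output(read_count, aggregator) for reporting.
    def __init__(self, input_filename, input_type, output_filename, output_type,
                 force_quality_encoding=None, summarize_input=False, fix_id=False,
                 file_handle=None):
        self.input_filename = input_filename
        self.input_type = input_type
        self.output_filename = output_filename
        self.output_type = output_type
        self.force_quality_encoding = force_quality_encoding
        self.summarize_input = summarize_input
        self.fix_id = fix_id
        self.file_handle = file_handle  # optional already-open handle handed to the reader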
Example #7
import sys

from galaxy_utils.sequence.fastq import fastqAggregator, fastqReader, fastqVerboseErrorReader, fastqWriter


def main():
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    fix_id = False  # fix inconsistent identifiers (SRA data dumps)
    if len(sys.argv) > 7:
        fix_id = sys.argv[7] == 'fix_id'

    aggregator = fastqAggregator()
    out = fastqWriter(path=output_filename,
                      format=output_type,
                      force_quality_encoding=force_quality_encoding)
    read_count = None
    if summarize_input:
        reader_type = fastqVerboseErrorReader
    else:
        reader_type = fastqReader

    reader = reader_type(path=input_filename,
                         format=input_type,
                         apply_galaxy_conventions=True,
                         fix_id=fix_id)
    for read_count, fastq_read in enumerate(reader):
        if summarize_input:
            aggregator.consume_read(fastq_read)
        out.write(fastq_read)
    out.close()

    _print_output(read_count, input_type, output_type, summarize_input,
                  aggregator)
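Example #7 delegates all reporting to a module-level _print_output() helper that is not included in the snippet. Its behaviour can be inferred from the inline reporting in Examples #1 and #3; the sketch below follows that pattern and may differ from the original helper.

def _print_output(read_count, input_type, output_type, summarize_input, aggregator):
    # Sketch only: mirrors the reporting done inline in Examples #1 and #3.
    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return
    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s"
              % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # repr() avoids truncation when control characters such as \x00 are printed
        print("Input ASCII range: %s(%i) - %s(%i)"
              % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))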