def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Splits the reads of a fastq file into one output file per index, plus one file for unmatched reads.

    Arguments:
     - args: a two-element sequence, (infile, outfile_base).  The outputs are written to
        "<outfile_base>_<index>.fq" for each index and "<outfile_base>_unmatched.fq" for all other reads.
     - options: should be generated by an optparse parser.  The attributes read here are:
        - index_list: comma-separated string of indexes.
        - index_in_sequence: if true, the index of a read is the first index_len bases of its sequence;
           each written read then has ":<index>" appended to its name and the first index_len positions
           removed from both sequence and quality (this applies to unmatched reads as well).
           If false, the index is the last ':'-separated field of the read name, and reads are written unchanged.
        - quiet: if false, a summary of read counts per output is printed at the end.

    Exits through sys.exit (after printing the parser help) if args does not unpack into exactly two values;
     raises Exception if index_in_sequence is set and the indexes are not all the same length.
    """
    try:
        infile, outfile_base = args
    except ValueError:
        parser.print_help()
        # wrap args in a one-element tuple, so that formatting works even if args itself is a tuple
        sys.exit("\nError: one infile and one outfile base name are required! Got %s"%(args,))

    indexes = options.index_list.split(',')

    # validate the indexes and pick the extraction/output functions BEFORE opening any output files,
    #  so that a bad index list doesn't leave empty output files behind.
    if options.index_in_sequence:
        index_lengths = set(len(x) for x in indexes)
        if len(index_lengths) > 1:
            raise Exception("Indexes need to all have the same lengths! Found lengths %s from indexes %s"%(index_lengths, indexes))
        index_len = index_lengths.pop()

        def get_index(name, seq):
            return seq[:index_len]

        def make_output(name, seq, qual, index):
            return ("%s:%s"%(name,index), seq[index_len:], qual[index_len:])
    else:
        def get_index(name, seq):
            return name.split(':')[-1]

        def make_output(name, seq, qual, index):
            return (name, seq, qual)

    index_counts = {index: 0 for index in indexes}
    unmatched_count = 0

    index_OUTFILES = {}
    unmatched_OUTFILE = None
    try:
        for index in indexes:
            # a repeated index maps to the same filename - open it only once, instead of re-truncating it
            #  and losing track of the first file object.
            if index not in index_OUTFILES:
                index_OUTFILES[index] = open("%s_%s.fq"%(outfile_base, index), 'w')
        unmatched_OUTFILE = open("%s_unmatched.fq"%outfile_base, 'w')

        for (name,seq,qual) in basic_seq_utilities.parse_fastq(infile):
            index = get_index(name, seq)
            if index in index_OUTFILES:
                OUTFILE = index_OUTFILES[index]
                index_counts[index] += 1
            else:
                OUTFILE = unmatched_OUTFILE
                unmatched_count += 1
            basic_seq_utilities.write_fastq_line(*make_output(name, seq, qual, index), OUTFILE=OUTFILE)
    finally:
        # always close whatever was successfully opened, even if reading/writing failed partway through
        for opened_file in index_OUTFILES.values():
            opened_file.close()
        if unmatched_OUTFILE is not None:
            unmatched_OUTFILE.close()

    if not options.quiet:
        total = unmatched_count + sum(index_counts.values())
        per_index_lines = '\n'.join("%s %s"%(value_and_percentages(count, [total]), index)
                                    for (index,count) in index_counts.items())
        # single parenthesized argument: prints the same text whether print is a statement or a function
        print("%s reads:\n%s unmatched\n%s"%(total, value_and_percentages(unmatched_count, [total]), per_index_lines))
# Parse the command line; exactly two positional arguments (input file, output file) are required.
(options, args) = parser.parse_args()
try:
    [infile,outfile] = args
except ValueError:
    # wrong number of positional arguments: show the usage text and exit with an error message
    parser.print_help()
    sys.exit("\nError: exactly one input and output file required!")
# upper limit on how many distinct removed field values are remembered per field
max_values_kept = 10
# the sets of removed values are only created for the fields that will actually be removed
if options.remove_first_header_field:
    first_field_values = set()
if options.remove_last_header_field:
    last_field_values = set()
# go over all sequences, strip the requested header fields, write all the sequences to the output file
with open(outfile,'w') as OUTFILE:
    for (header,sequence,quality) in parse_fastq(infile):
        if options.remove_first_header_field:
            # the first field is everything before the first ':'
            # NOTE(review): the unpacking raises ValueError if the header contains no ':' - confirm inputs always have one
            first_field,header = header.split(':',1)
            if len(first_field_values)<max_values_kept:
                first_field_values.add(first_field)
        if options.remove_last_header_field:
            # the last field is everything after the last '#'
            # NOTE(review): the unpacking raises ValueError if the header contains no '#' - confirm inputs always have one
            header,last_field = header.rsplit('#',1)
            if len(last_field_values)<max_values_kept:
                last_field_values.add(last_field)
        # write the record as four lines: "@" + header, sequence, a bare "+", quality
        OUTFILE.write( "@%s\n"%header)
        OUTFILE.write(sequence + '\n')
        OUTFILE.write( "+\n")
        OUTFILE.write(quality + '\n')
def print_info_line(field_name, field_values):
    global max_values_kept