def main(infiles, args):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    The options argument should be generated by an optparse parser.
    Reads every record from all infiles, then runs check_pair on every pair of sequences,
    printing whatever each check reports; prints "NO REPEATS." if no pair reported anything.
    NOTE(review): the body reads the module-level `options` and `parser` globals rather than
    the `args` parameter - presumably `args` IS the optparse options object and was meant to
    be used here; confirm against the caller before relying on the signature.
    """
    if not infiles:
        parser.print_help()
        sys.exit("\nError: at least one infile and exactly one outfile are required!")
    all_names_and_seqs = []
    # collect a (record name, plain sequence string) pair for every record in every infile
    for infile in infiles:
        # file format (fasta/fastq) is auto-detected per input file
        seq_format = check_fasta_fastq_format(infile)
        with open(infile) as INFILE:
            for sequence in SeqIO.parse(INFILE, seq_format):
                # using seq.tostring() to convert Biopython Seq objects to plain strings - Seq objects aren't hashable correctly
                all_names_and_seqs.append((sequence.name, sequence.seq.tostring()))
    no_repeats = True
    # all-vs-all pairwise comparison - O(N^2) in the total number of sequences read
    for (nameA, seqA), (nameB, seqB) in combinations(all_names_and_seqs, 2):
        result = check_pair(seqA, seqB, nameA, nameB,
                            options.exact_identity_only, options.forward_only, options.ignore_empty_sequences)
        # a truthy result is treated as "this pair is a repeat" and printed verbatim
        if result:
            print result
            no_repeats = False
    if no_repeats:
        print "NO REPEATS."
def main(infiles, total_seq_number_only=False, input_collapsed_to_unique=False, include_zeros=False, verbosity=1, OUTPUT=sys.stdout):
    """ Given a list of fastq/fasta files, return total seq number, a length:N dict and formatted info (optionally print).

    If total_seq_number_only is True, only return/print total seq count.
    If input_collapsed_to_unique is True, program assumes infile was preprocessed with fastx_collapser,
     and attempts to give original pre-collapsing seq_counts (based on headers).
    If include_zeros is False (default), only print non-zero seq counts; if True, print seq counts
     for all lengths between min and max length, even if they're 0.
    Verbosity: if >1, print filetype and seqcount for each input file; if 0, don't print header or summary.
    Prints to stdout by default; to print to file, pass open file object as OUTPUT; to suppress printing, pass None.

    Returns a (total_seqcount, total_seqlen_dict, formatted_output_lines) tuple.
    """
    # running totals over all infiles: total read count, and a {seq_length: count} dict
    # (fixed a stale comment here that called the plain dict "a counter with a default value of 0")
    total_seqcount, total_seqlen_dict = 0, {}
    formatted_output = []
    # add the numbers from each file to total_seqlen_dict
    for infile in infiles:
        # detect filetype based on extension
        # MAYBE-TODO add command-line options that force the format to fasta/fastq instead of checking by extension?
        seq_format = check_fasta_fastq_format(infile, verbosity>1)
        # note: just using plain "fastq" quality encoding, because we're not dealing with qualities so it doesn't matter
        with open(infile) as INFILE:
            file_seqcount, file_seqlen_dict = seq_count_and_lengths(SeqIO.parse(INFILE, seq_format),
                                                                    total_seq_number_only, input_collapsed_to_unique)
            total_seqcount += file_seqcount
            total_seqlen_dict = add_dicts_of_ints(total_seqlen_dict, file_seqlen_dict)
    # format and print (optionally) and return the output
    if total_seq_number_only:
        formatted_output.append("Total %s seqs\n"%total_seqcount)
    else:
        formatted_output += _format_lengths(total_seqlen_dict, include_zeros, verbosity)
    # idiom fix: was "if not OUTPUT is None" - same truth table, clearer precedence
    if OUTPUT is not None:
        for line in formatted_output:
            OUTPUT.write(line)
    return total_seqcount, total_seqlen_dict, formatted_output
def main(infiles, args): """ Run the main functionality of the module (see module docstring for more information), excluding testing. The options argument should be generated by an optparse parser. """ if not infiles: parser.print_help() sys.exit( "\nError: at least one infile and exactly one outfile are required!" ) if options.seq_length is None: seqlen_info = '' elif options.seq_length > 0: seqlen_info = ' first %sbp' % options.seq_length elif options.seq_length < 0: seqlen_info = ' last %sbp' % (-options.seq_length) for infile in infiles: seq_format = check_fasta_fastq_format(infile) with open(infile) as INFILE: seq_counter = subsequence_counts( SeqIO.parse(INFILE, seq_format), options.seq_length, options.input_collapsed_to_unique) seq_list_by_count = sorted( seq_counter.items(), key=lambda (s, c): c, reverse=True) total_seqs = sum(seq_counter.values()) # if not using the min_percent_to_print option, just print the top N sequences from each file if options.min_percent_to_print is None: seq_data_list = [] for i in range(min(options.n_to_print, len(seq_list_by_count))): seq, count = seq_list_by_count[i] percent = count * 100.0 / total_seqs # "%.2g" is significant-digit-based formatting of floats!! So 92.12345 is 92%, but 0.00045 is 0.00045%. percent_2_sig_digits = str(float("%.2g" % percent)) if percent_2_sig_digits.endswith(".0"): percent_2_sig_digits = percent_2_sig_digits[:-2] seq_data_list.append( "%s%% %s (%d)" % (percent_2_sig_digits, seq, count)) print " * %s (%s seqs, %s unique%s):" % ( infile, total_seqs, len(seq_list_by_count), seqlen_info) print ', '.join(seq_data_list) # if using the min_percent_to_print option, just print the top N sequences from each file else: print "min_percent_to_print NOT IMPLEMENTED!"
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    The options argument should be generated by an optparse parser.
    Overall flow: validate that -B (other_bowtie_options) doesn't conflict with options this
    program sets itself; run bowtie on the single infile against the genome index (and optionally
    a cassette index, or reuse pre-existing alignment files if options.bowtie_aln_file_* given);
    stream the alignment file(s) read-by-read, keep only best alignments (cassette preferred on
    ties), categorize each read as unaligned / cassette / multiple-genomic / genomic-unique and
    write it to the matching output file (or one combined file); write all run info, category
    counts and the preprocessing metadata into an *_info.txt file.
    """
    # exactly one infile: the single-element unpacking raises ValueError otherwise
    try:
        [infile] = args
    except ValueError:
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s" % (len(args), args))
    # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...
    # reject -B contents that would conflict with options this program composes itself
    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B! Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in ('-m -k -a --all'.split(' '))]):
        raise Exception("Cannot include -m/-a bowtie options in -B! Use separate -m option for that.")
    specific_bowtie_options = '-v %s' % options.allowed_errors
    # if the caller didn't force the input format via -B, auto-detect fasta/fastq
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta':
            specific_bowtie_options += ' -f'
        elif infile_format == 'fastq':
            specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" % infile_format)
    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:
        multiple_bowtie_option = '-a'
    else:
        multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)
    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
            + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'
    with open(infofile, 'w') as INFOFILE:
        ### write header data
        write_header_data(INFOFILE, options)
        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        # (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        # an error, so I can see the error message! Or I could try to detect whether there was an error or not
        # based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, multiple_bowtie_option,
                                                options.other_bowtie_options, options.genome_bowtie_index,
                                                infile, tmpfile_genome)
        if options.bowtie_aln_file_genome is None:
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
        else:
            # reusing a pre-existing alignment file instead of running bowtie; keep it afterwards
            # (the provided file stands in for tmpfile_genome, so it must not be deleted below)
            options.keep_tmpfiles = True
            if not os.access(options.bowtie_aln_file_genome, os.R_OK):
                raise Exception("Can't read provided options.bowtie_aln_file_genome %s!"
                                % options.bowtie_aln_file_genome)
            text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (command,
                                                                              options.bowtie_aln_file_genome)
            print text
            INFOFILE.write('\n' + text + '\n')
            tmpfile_genome = options.bowtie_aln_file_genome
        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            # note: the cassette run always uses '--all' rather than multiple_bowtie_option
            command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, '--all',
                                                    options.other_bowtie_options, options.cassette_bowtie_index,
                                                    infile, tmpfile_cassette)
            if options.bowtie_aln_file_cassette is None:
                run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
            else:
                # same pre-existing-alignment-file shortcut as for the genome run above
                options.keep_tmpfiles = True
                if not os.access(options.bowtie_aln_file_cassette, os.R_OK):
                    raise Exception("Can't read provided options.bowtie_aln_file_cassette %s!"
                                    % options.bowtie_aln_file_cassette)
                text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (command,
                                                                                  options.bowtie_aln_file_cassette)
                print text
                INFOFILE.write('\n' + text + '\n')
                tmpfile_cassette = options.bowtie_aln_file_cassette
        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text % (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text % (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1? Hard - bowtie is unfortunately ANNOYING
        # and uses stderr both for normal output and for errors, AND gives no returncode.
        ### Parse the two alignment files in parallel, and merge them together (remove sub-optimal alignments,
        # (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        # Do all this WITHOUT reading the entire files into memory! A bit tricky.
        if options.cassette_bowtie_index != 'NONE':
            aln_list_generator = aln_generator_from_two_samfiles_parallel(tmpfile_genome, tmpfile_cassette)
        else:
            aln_list_generator = aln_generator_from_single_samfile(tmpfile_genome)
        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            # all four category handles alias one combined output file
            GENOMIC_UNIQUE_FILE = MULTIPLE_GENOMIC_FILE = CASSETTE_FILE = UNALIGNED_FILE = open(outfile_all, 'w')
            unaligned_as_fasta = False
        else:
            UNALIGNED_FILE = open(outfile_unaligned, 'w')
            CASSETTE_FILE = open(outfile_cassette, 'w')
            MULTIPLE_GENOMIC_FILE = open(outfile_multiple_genomic, 'w')
            GENOMIC_UNIQUE_FILE = open(outfile_genomic_unique, 'w')
            unaligned_as_fasta = True
        # per-category read counters, filled in by categorize_reads_print_to_files
        category_readcounts = {'unaligned': 0, 'cassette': 0, 'multiple-genomic': 0,
                               'genomic-unique': 0, 'cassette-multiple': 0}
        for (readname, full_aln_list) in aln_list_generator:
            reduced_aln_list = reduce_alignment_list(full_aln_list)
            final_aln_list = prioritize_cassette_reads(reduced_aln_list,
                                                       if_cassette_function=is_cassette_chromosome)
            categorize_reads_print_to_files(readname, final_aln_list, category_readcounts,
                                            UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                            GENOMIC_UNIQUE_FILE,
                                            unaligned_as_fasta=unaligned_as_fasta,
                                            multiple_to_write=options.multiple_to_show,
                                            input_collapsed_to_unique=options.input_collapsed_to_unique,
                                            no_multi_cassette_warnings=options.no_multi_cassette_warnings)
        if options.dont_split_by_category:
            # all files are actually the same pointer, so only close once
            GENOMIC_UNIQUE_FILE.close()
        else:
            UNALIGNED_FILE.close()
            CASSETTE_FILE.close()
            MULTIPLE_GENOMIC_FILE.close()
            GENOMIC_UNIQUE_FILE.close()
        # delete alignment tmpfiles now that they've been parsed
        if not options.keep_tmpfiles:
            os.remove(tmpfile_genome)
            if options.cassette_bowtie_index != 'NONE':
                os.remove(tmpfile_cassette)
        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        # 'cassette-multiple' is a warning counter, not a category - remove it before totaling
        cassette_multiple = category_readcounts.pop('cassette-multiple')
        total_reads = sum(category_readcounts.values())
        text2 = "# total reads: %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_readcounts.items()):
            text = "# %s: %s" % (category, value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet:
                print text
        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X.fa, but for X_5prime.txt it can be either X_5prime.txt or X.txt, so try both.
                # (in the new preprocessing version all files are X_*prime.txt and the info files are X_info.txt;
                # in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing! Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    # NOTE: '_3prime' and '_5prime' have the same length, so this slice works for both suffixes
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    The options argument should be generated by an optparse parser.
    Overall flow (in-memory variant): validate that -B (other_bowtie_options) doesn't conflict
    with options this program sets itself; run bowtie on the single infile against the genome
    index (and optionally a cassette index); read ALL alignments into a dict, keep only best
    alignments (cassette preferred on ties), delete the tmpfiles; categorize each read as
    unaligned / cassette / multiple-genomic / genomic-unique and write it to the matching
    output file (or one combined file); write run info, category counts and the preprocessing
    metadata into an *_info.txt file.
    """
    # exactly one infile: the single-element unpacking raises ValueError otherwise
    try:
        [infile] = args
    except ValueError:
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s"%(len(args), args))
    # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...
    # reject -B contents that would conflict with options this program composes itself
    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B! Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in ('-m -k -a --all'.split(' '))]):
        raise Exception("Cannot include -m/-a bowtie options in -B! Use separate -m option for that.")
    specific_bowtie_options = '-v %s'%options.allowed_errors
    # if the caller didn't force the input format via -B, auto-detect fasta/fastq
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format=='fasta':
            specific_bowtie_options += ' -f'
        elif infile_format=='fastq':
            specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!"%infile_format)
    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:
        multiple_bowtie_option = '-a'
    else:
        multiple_bowtie_option = '-k %s'%max(options.multiple_to_show, 2)
    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S','--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
            + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'
    with open(infofile,'w') as INFOFILE:
        ### write header data
        write_header_data(INFOFILE,options)
        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        # (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        # an error, so I can see the error message! Or I could try to detect whether there was an error or not
        # based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, multiple_bowtie_option,
                                              options.other_bowtie_options, options.genome_bowtie_index,
                                              infile, tmpfile_genome)
        run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            # note: the cassette run always uses '--all' rather than multiple_bowtie_option
            command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, '--all',
                                                  options.other_bowtie_options, options.cassette_bowtie_index,
                                                  infile, tmpfile_cassette)
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text%(options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text%(options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1? Hard - bowtie is unfortunately ANNOYING
        # and uses stderr both for normal output and for errors, AND gives no returncode.
        ### Parse the two alignment files, and merge them together (remove sub-optimal alignments,
        # (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_cassette, starting_dict=readname_to_aln_list)
        # MAYBE-TODO right now I'm reading the entire files into memory before merging and processing them,
        # which takes a fair amount of memory - could instead write something that would read both alignment files
        # in parallel and do the merging and output-writing read-by-read. Do that if I start getting memory issues.
        # NOTE(review): return values discarded, so these presumably modify the dict in place - confirm.
        reduce_alignment_dict(readname_to_aln_list)
        prioritize_cassette_reads(readname_to_aln_list, if_cassette_function=is_cassette_chromosome)
        # delete alignment tmpfiles now that they've been parsed
        os.remove(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            os.remove(tmpfile_cassette)
        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            # single combined output: the same handle is passed for all four categories
            with open(outfile_all,'w') as ALL_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list,
                                                                  ALL_FILE, ALL_FILE, ALL_FILE, ALL_FILE,
                                                                  unaligned_as_fasta=False,
                                                                  multiple_to_write=options.multiple_to_show,
                                                                  input_collapsed_to_unique=options.input_collapsed_to_unique,
                                                                  no_warnings=options.quiet)
        else:
            with open(outfile_unaligned, 'w') as UNALIGNED_FILE:
                with open(outfile_cassette, 'w') as CASSETTE_FILE:
                    with open(outfile_multiple_genomic, 'w') as MULTIPLE_GENOMIC_FILE:
                        with open(outfile_genomic_unique, 'w') as GENOMIC_UNIQUE_FILE:
                            category_counts = categorize_reads_print_to_files(readname_to_aln_list,
                                                                              UNALIGNED_FILE, CASSETTE_FILE,
                                                                              MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE,
                                                                              unaligned_as_fasta=True,
                                                                              multiple_to_write=options.multiple_to_show,
                                                                              input_collapsed_to_unique=options.input_collapsed_to_unique,
                                                                              no_warnings=options.quiet)
        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        # 'cassette-multiple' is a warning counter, not a category - remove it before totaling
        cassette_multiple = category_counts.pop('cassette-multiple')
        total_reads = sum(category_counts.values())
        text2 = "# total reads: %s"%total_reads
        if options.input_collapsed_to_unique:
            text2 +=" (uncollapsed readcounts)"
        lines = [text1, text2]
        for category,count in sorted(category_counts.items()):
            text = "# %s: %s"%(category, value_and_percentages(count, [total_reads]))
            if category=='cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)'%cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet:
                print text
        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X.fa, but for X_5prime.txt it can be either X_5prime.txt or X.txt, so try both.
                # (in the new preprocessing version all files are X_*prime.txt and the info files are X_info.txt;
                # in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing! Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    # NOTE: '_3prime' and '_5prime' have the same length, so this slice works for both suffixes
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n'%options.input_metadata_file
            INFOFILE.write(text+'\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)