def save_dataset_files(dataset, outfile, verbosity_level=0, if_pickle=True, count_cassette=True, count_other=True,
                       merge_boundary_features=True, sort_data_by='position', N_sequences_per_mutant=5, options="N/A"):
    """ Print summary and data to output file; optionally print summary to stdout; optionally pickle dataset to picklefile.

    The options argument is only used to be printed in the header to make it clear how the file was generated -
     it should be the applicable optparse options object if there is one, or a text message otherwise.
    """
    # print summary info to stdout if desired
    if verbosity_level > 1:
        print "\nDATA SUMMARY:"
    if verbosity_level > 0:
        dataset.print_summary(merge_boundary_features=merge_boundary_features, count_cassette=count_cassette,
                              count_other=count_other)
    # print full data to outfile
    if verbosity_level > 1:
        print "printing output - time %s." % time.ctime()
    with open(outfile, 'w') as OUTFILE:
        write_header_data(OUTFILE, options)
        OUTFILE.write("### SUMMARY:\n")
        dataset.print_summary(OUTFILE, line_prefix="# ", header_prefix="## ",
                              merge_boundary_features=merge_boundary_features, count_cassette=count_cassette,
                              count_other=count_other)
        OUTFILE.write("### HEADER AND DATA:\n")
        dataset.print_data(OUTPUT=OUTFILE, sort_data_by=sort_data_by, N_sequences=N_sequences_per_mutant,
                           header_line=True, header_prefix='# ')
    # print pickled dataset to picklefile, if desired
    if if_pickle:
        outfile_basename = os.path.splitext(outfile)[0]
        pickled_outfile = outfile_basename + '.pickle'
        with open(pickled_outfile, 'w') as PICKLEFILE:
            pickle.dump(dataset, PICKLEFILE, 0)
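
# Usage sketch for save_dataset_files (a minimal sketch, not part of the original module:
#  the filenames are made up, and the dataset argument is assumed to come from this
#  codebase's mutant_analysis_classes, e.g. via read_mutant_file as used further down).
def _example_save_dataset_files(dataset):
    """ Sketch: write summary+data outfile plus a pickle, echoing the summary to stdout. """
    # verbosity_level=1 prints the dataset summary to stdout but skips progress messages;
    #  if_pickle=True additionally writes example_out.pickle next to example_out.txt
    save_dataset_files(dataset, 'example_out.txt', verbosity_level=1, if_pickle=True,
                       options="manual example run, all defaults")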

def main(infiles, outfile, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.

    Print final dataset to outfile (if given); return final multi-dataset object and the list of dataset names
     in order.  The options argument should be generated by an optparse parser.
    """
    # parse all infiles, print summaries to stdout if requested
    all_datasets = {}
    if options.dataset_names:
        dataset_names = options.dataset_names.split(',')
        if not len(dataset_names) == len(infiles):
            raise ValueError("If dataset names are provided via -D option, you must provide the same number of names "
                             + "as the total number of infiles! We have %s names and %s infiles."
                             % (len(dataset_names), len(infiles)))
    else:
        dataset_names = [os.path.splitext(os.path.basename(infile))[0] for infile in infiles]
    for dataset_name, infile in zip(dataset_names, infiles):
        if options.verbosity_level > 1:
            print "parsing input file %s - time %s." % (infile, time.ctime())
        if infile.endswith('.pickle'):
            current_dataset = unpickle(infile)
        else:
            current_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(infile=infile)
            current_dataset.count_adjacent_mutants(OUTPUT=None)
            # note - read_data_from_file doesn't deal with merging/counting info, so that will be wrong/missing
        all_datasets[dataset_name] = current_dataset
        if options.verbosity_level > 1:
            current_dataset.print_summary()
        elif options.verbosity_level > 0:
            print "%s mutants in file %s" % (len(current_dataset), infile)

    # merge datasets into one multi-dataset object
    if options.verbosity_level > 1:
        print "merging the mutant data into combined dataset - time %s." % time.ctime()
    multi_dataset = mutant_analysis_classes.Insertional_mutant_pool_dataset(multi_dataset=True)
    multi_dataset.populate_multi_dataset(all_datasets, overwrite=False, check_gene_data=True)
    # make sure the datasets are in the same order as they were given on the command-line
    #  (using all_datasets to initialize multi_dataset didn't give an order, since all_datasets is a dictionary)
    multi_dataset.dataset_order = dataset_names

    # print varying amounts of summary data to stdout
    if options.verbosity_level > 1:
        multi_dataset.print_summary()
    elif options.verbosity_level > 0:
        print "total %s mutants present in combined dataset" % len(multi_dataset)

    ### optionally remove mutants based on another dataset
    if options.remove_mutants_from_file:
        other_dataset = mutant_analysis_classes.read_mutant_file(options.remove_mutants_from_file)
        old_N = len(multi_dataset)
        multi_dataset.remove_mutants_based_on_other_dataset(other_dataset,
                                                            readcount_min=options.remove_mutants_readcount_min,
                                                            perfect_reads=options.remove_mutants_min_is_perfect)
        if options.verbosity_level > 0:
            new_N = len(multi_dataset)
            print "removed %s mutants based on %s - %s mutants remaining in combined dataset" % (
                    old_N - new_N, options.remove_mutants_from_file, new_N)

    # if requested, add gene annotation info from separate file
    if options.gene_annotation_file:
        if options.verbosity_level > 1:
            print "adding gene annotation from file %s - time %s." % (options.gene_annotation_file, time.ctime())
        multi_dataset.add_gene_annotation(options.gene_annotation_file,
                                          if_standard_Cre_file=options.annotation_file_is_standard)

    # print full data to outfile, unless there is no outfile name given
    if outfile:
        if options.verbosity_level > 1:
            print "printing combined dataset output to file %s - time %s." % (outfile, time.ctime())
        with open(outfile, 'w') as OUTFILE:
            write_header_data(OUTFILE, options)
            OUTFILE.write("### DATASET SUMMARIES:\n")
            multi_dataset.print_summary(OUTPUT=OUTFILE, line_prefix="# ", header_prefix="## ")
            OUTFILE.write("### HEADER AND DATA:\n")
            multi_dataset.print_data(OUTPUT=OUTFILE, sort_data_by=options.sort_data_key, header_line=True)
    # TODO make a *.pickle outfile as well?

    return multi_dataset, dataset_names
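
# Usage sketch for the dataset-merging main() above (a minimal sketch, not part of the
#  original module: the filenames are made up, and it assumes this codebase's
#  define_option_parser(), which the single-infile main() variants below also rely on).
def _example_merge_datasets():
    """ Sketch: merge a text-format and a pickled dataset into combined.txt. """
    parser = define_option_parser()
    # -D gives explicit dataset names matching the two infiles, as checked by main() above
    (options, infiles) = parser.parse_args(['-D', 'sampleA,sampleB', 'sampleA.txt', 'sampleB.pickle'])
    return main(infiles, 'combined.txt', options)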

def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles?  Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
                 % adapter_options + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    infile_suffix = os.path.splitext(infile)[1]
    outfile_suffix = '.fa'
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    ends = "5' 3'".split()
    outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime") for end in ends}
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa' % end.replace("'", "prime")
                            for end in ends}
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                         for end in ends}
    cutadapt_tmpfiles_original = cutadapt_tmpfiles

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #  (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount + untrimmed_readcount == starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    " (%s+%s != %s)" % (trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  into the standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfile = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seq) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                if not adapter_seq.replace('"', '').replace("'", '').replace(' ', ''):
                    continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                full_cutadapt_options = '-a %s %s' % (adapter_seq, options.other_cutadapt_options)
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category,
                                                                      no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE,
                                                               bool(options.verbosity > 1), "cutadapt output",
                                                               options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        " (%s+%s != %s)" % (end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                            trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    " (%s+%s != %s)" % (sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):
                    continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                        FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1:
                    print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity:
                print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                        FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason!  ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"
                        % (infile, ', '.join(outfiles.values()))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'
                                % value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'
                                % value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                                    % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                       [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                                % value_and_percentages(no_cassette_readcount,
                                                        [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                                % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                   [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
                            % value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                    [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type + '(%% of read count):\t%s\n'
                                    % value_and_percentages(collapsed_readcount[end_type],
                                                            [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0:
                print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
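
# The no-cassette merge in step 2 above is a plain set intersection over FASTA headers.
#  A standalone toy sketch of that idea (made-up reads, not pipeline data):
def _example_no_cassette_overlap():
    """ Toy demo: a read is no-cassette only if BOTH cutadapt runs left it untrimmed. """
    untrimmed_5prime = {'read1': 'ACGT', 'read2': 'GGCC', 'read3': 'TTAA'}
    untrimmed_3prime = {'read2': 'GGCC', 'read3': 'TTAA', 'read4': 'CCGG'}
    overlapping_headers = set(untrimmed_5prime) & set(untrimmed_3prime)
    return sorted(overlapping_headers)      # ['read2', 'read3']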

def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args    # TODO multiple infiles would be nice!
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required!")
    # MAYBE-TODO implement option with multiple infiles?  Need to make sure they're the same fa/fq type etc...

    ### check inputs
    adapter_options = '-a --adapter -b --anywhere -g --front'
    if any([x in options.other_cutadapt_options for x in adapter_options.split()]):
        sys.exit("Error: --other_cutadapt_options value shouldn't contain any adapter seq options (%s)"
                 % adapter_options + " - use -5/-3 options to specify adapters instead!")

    ### outfile and tmpfile names
    # outfile suffix is always fa because we always discard quality info right now,
    #  even when not forced to do that by collapsing to unique!  MAYBE-TODO change that?
    #infile_suffix = os.path.splitext(infile)[1]
    #outfile_suffix = '.fa' if options.collapse_to_unique else infile_suffix
    outfile_suffix = '.fa'
    infofile = options.outfile_basename + '_info.txt'
    wrong_start_file = options.outfile_basename + '_wrong-start.fa'
    no_cassette_file = options.outfile_basename + '_no-cassette.fa'
    trimmed_tmpfile = trimmed_tmpfile_original = options.outfile_basename + '_trimmed-tmpfile.fa'
    # outfiles and tmpfiles should be split by end ONLY if cutadapt is being run!
    if options.other_cutadapt_options == 'NONE' or not (options.adapter_5prime or options.adapter_3prime):
        outfiles = {'': options.outfile_basename + '.fa'}
        no_cassette_tmpfiles = {'': options.outfile_basename + '_no-cassette-tmpfile.fa'}
        cutadapt_tmpfiles = {'': options.outfile_basename + '_cutadapt-tmpfile.fa'}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)
    else:
        ends = "5' 3'".split()
        outfiles = {end: options.outfile_basename + '_%s.fa' % end.replace("'", "prime") for end in ends}
        no_cassette_tmpfiles = {end: options.outfile_basename + '_no-cassette-tmpfile_%s.fa'
                                % end.replace("'", "prime") for end in ends}
        cutadapt_tmpfiles = {end: options.outfile_basename + '_cutadapt-tmpfile_%s.fa' % end.replace("'", "prime")
                             for end in ends}
        cutadapt_tmpfiles_original = dict(cutadapt_tmpfiles)

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)
        INFOFILE.write('\n')

        ### 0. look at the infile; make sure it's readable, etc
        #  (check_readcount uses seq_count_and_lengths, which uses HTSeq and autodetects fa/fq format)
        starting_readcount = check_readcount(infile, INFOFILE, bool(options.verbosity > 1), "original input",
                                             options.total_read_number_only, False)

        ### 1. Trim the first bases (from adapter)
        # MAYBE-TODO I could do this with cutadapt again, instead of with my own trim_prefix function...
        #  Would that be faster, or better in any other way?
        # MAYBE-TODO could also do it with a multiplexing barcode-splitting tool (like fastx_barcode_splitter.pl),
        #  since that's the eventual point of having those constant first bases there...
        if options.first_bases_to_trim == 'NONE':
            text = "### Not trimming first bases, since NONE was passed to -F option.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            trimmed_tmpfile = infile
            trimmed_readcount = starting_readcount
            untrimmed_readcount = 0
        else:
            trim_prefix(options.first_bases_to_trim, infile, trimmed_tmpfile, wrong_start_file, INFOFILE,
                        options.verbosity)
            trimmed_readcount = check_readcount(trimmed_tmpfile, INFOFILE, bool(options.verbosity > 1),
                                                "first-base-trimming output", options.total_read_number_only, False)
            untrimmed_readcount = check_readcount(wrong_start_file, None, False, True, False)
            assert trimmed_readcount + untrimmed_readcount == starting_readcount,\
                    "Trimmed/untrimmed readcounts don't add up to starting readcount - check tmpfile!"\
                    " (%s+%s != %s)" % (trimmed_readcount, untrimmed_readcount, starting_readcount)

        ### 2. run cutadapt to strip cassette sequence
        # NOTE: this currently requires my version of cutadapt, cutadapt_mod (based on some older cutadapt version),
        #  to deal with too-long seqs correctly - LATER-TODO submit my modification as a patch to cutadapt to get it
        #  into the standard install!  Or wait until the cutadapt maintainer does it (I submitted it as an issue)
        #  (see ~/experiments/basic_programs/cutadapt_modifications/).
        if_running_cutadapt = True
        if options.other_cutadapt_options == 'NONE':
            if_running_cutadapt = False
            text = "### Not running cutadapt, since NONE was passed to -A option.\n"
        elif not (options.adapter_5prime or options.adapter_3prime):
            if_running_cutadapt = False
            text = "### Not running cutadapt, since empty sequences were passed to -5 and -3 options.\n"
        # if not running it, just skip it
        if not if_running_cutadapt:
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            cutadapt_tmpfiles[''] = trimmed_tmpfile
            cutadapt_readcount = {'all': trimmed_readcount}
            no_cassette_readcount = 0
        # otherwise run the 5' and 3' ends separately
        else:
            cutadapt_readcount = {}
            for (end_type, adapter_seqs) in [("5'", options.adapter_5prime), ("3'", options.adapter_3prime)]:
                assert end_type in ends
                # if the adapter sequence for that side is empty, skip
                adapter_seqs = adapter_seqs.replace('"', '').replace("'", '').replace(' ', '')
                if not adapter_seqs:
                    continue
                cutadapt_tmpfile = cutadapt_tmpfiles[end_type]
                all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
                full_cutadapt_options = all_adapter_options + ' ' + options.other_cutadapt_options
                for extra_seq_category in ('untrimmed', 'too-short', 'too-long'):
                    if not extra_seq_category in full_cutadapt_options:
                        full_cutadapt_options += ' --%s-output %s' % (extra_seq_category,
                                                                      no_cassette_tmpfiles[end_type])
                command = "cutadapt_mod %s -o %s %s" % (full_cutadapt_options, cutadapt_tmpfile, trimmed_tmpfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="cutadapt for %s" % end_type)
                cutadapt_readcount[end_type] = check_readcount(cutadapt_tmpfile, INFOFILE,
                                                               bool(options.verbosity > 1), "cutadapt output",
                                                               options.total_read_number_only, False)
                tmp_no_cassette_readcount = check_readcount(no_cassette_tmpfiles[end_type], None, False, True, False)
                assert cutadapt_readcount[end_type] + tmp_no_cassette_readcount == trimmed_readcount,\
                        "%s cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                        " (%s+%s != %s)" % (end_type, cutadapt_readcount[end_type], tmp_no_cassette_readcount,
                                            trimmed_readcount)
            # make an actual no_cassette_file based on the overlap of the two no_cassette_tmpfiles!
            text = "### Merging the 5' and 3' cutadapt untrimmed outputs to get single no-cassette file.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            no_cassette_seqs = []
            for no_cassette_tmpfile in no_cassette_tmpfiles.values():
                try:
                    no_cassette_seqs.append(dict(parse_fasta(no_cassette_tmpfile)))
                except IOError:
                    pass
            # the real no-cassette seqs are the intersection of the seq headers from both no_cassette_tmpfile sets
            overlapping_no_cassette_headers = set.intersection(*[set(d.keys()) for d in no_cassette_seqs])
            no_cassette_readcount = len(overlapping_no_cassette_headers)
            with open(no_cassette_file, 'w') as NO_CASSETTE_FILE:
                for header in sorted(overlapping_no_cassette_headers):
                    # some fastx_toolkit tools give errors on lowercase bases, so make everything uppercase
                    write_fasta_line(header, no_cassette_seqs[0][header].upper(), NO_CASSETTE_FILE)
            assert no_cassette_readcount + sum(cutadapt_readcount.values()) == trimmed_readcount,\
                    "Final cassette/no-cassette readcounts don't add up to trimmed readcount - check tmpfile!"\
                    " (%s+%s != %s)" % (sum(cutadapt_readcount.values()), no_cassette_readcount, trimmed_readcount)
            # remove the original no_cassette_tmpfiles
            for tmpfile in no_cassette_tmpfiles.values():
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)

        ### 3. run fastx_collapser to collapse the sequences to unique
        if not options.collapse_to_unique:
            text = "### Not running fastx_collapser, since -C option was not used.\n"
            if options.verbosity > 0:
                print text
            INFOFILE.write(text + '\n')
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                if os.path.exists(cutadapt_tmpfile):
                    os.rename(cutadapt_tmpfile, outfiles[end_type])
            collapsed_readcount = cutadapt_readcount
            # Note for fastx_collapser, but also for the others - NONE is necessary here, can't just use '', because
            #  fastx_collapser works fine with no options, so '' is a sensible input and can't be used to turn it off.
        else:
            collapsed_readcount, uncollapsed_readcount = {}, {}
            for (end_type, cutadapt_tmpfile) in cutadapt_tmpfiles.items():
                outfile = outfiles[end_type]
                # if there is no file for that end, skip
                if not os.path.exists(cutadapt_tmpfile):
                    continue
                command = "fastx_collapser -v %s -i %s -o %s" % (
                        FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], cutadapt_tmpfile, outfile)
                run_command_print_info_output(command, INFOFILE, options.verbosity, shell=True,
                                              program_name="fastx_collapser for %s" % end_type)
                INFOFILE.write('\n')
                collapsed_readcount[end_type] = check_readcount(outfile, INFOFILE, bool(options.verbosity > 1),
                                                                "fastx_collapser output",
                                                                options.total_read_number_only,
                                                                input_collapsed_to_unique=False)
                # make sure uncollapsed readcount is the same as before collapsing
                uncollapsed_readcount[end_type] = check_readcount(outfile, None, False, "", True,
                                                                  input_collapsed_to_unique=True)
                if not uncollapsed_readcount[end_type] == cutadapt_readcount[end_type]:
                    text = "ERROR: the uncollapsed read-count after fastx_collapser isn't the same as the before-collapser count! Collapsing went wrong somehow, or the way fastx_collapser works changed since this program was written?\n"
                else:
                    text = "(checked that all the reads are still there if you uncollapse the numbers using header info)\n"
                if options.verbosity > 1:
                    print text
                INFOFILE.write(text + '\n')
            # also run fastx_collapser on wrong_start_file and no_cassette_file
            text = "### Running fastx_collapser on the \"bad\" output files. Not printing the output to info file.\n"
            if options.verbosity:
                print text
            INFOFILE.write(text + '\n')
            extra_collapsed_readcounts = {}
            for extra_file in (wrong_start_file, no_cassette_file):
                command = "fastx_collapser -v %s -i %s -o tmp.fa" % (
                        FASTQ_ENCODINGS_FASTX_TOOLKIT[options.fastq_encoding], extra_file)
                retcode = run_command_print_info_output(command, None, options.verbosity - 1, shell=True)
                # note: actually fastx_collapser doesn't give proper retcodes, so just check if outfile exists
                #  (also it chokes on empty files, AND on lowercase bases!  That's a bit ridiculous...)
                #  it also apparently sometimes changes the order of the sequences for no good reason!  ARGH.
                if retcode in (0, None) and os.path.exists('tmp.fa'):
                    os.remove(extra_file)
                    os.rename('tmp.fa', extra_file)
                extra_collapsed_readcounts[extra_file] = check_readcount(extra_file, None, False, "", True,
                                                                         input_collapsed_to_unique=False)

        ### Final readcount check
        final_output = ["### Final read count info for %s (main output files %s)\n"
                        % (infile, ', '.join(outfiles.values()))]
        final_output.append("# starting total read count:\t%s\n" % starting_readcount)
        if not options.first_bases_to_trim == 'NONE':
            final_output.append('# "good" read count after start trimming (%% of total):\t%s\n'
                                % value_and_percentages(trimmed_readcount, [starting_readcount]))
            final_output.append('# "bad" read count (wrong-start) (%% of total):\t%s\n'
                                % value_and_percentages(untrimmed_readcount, [starting_readcount]))
        if if_running_cutadapt:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s read count after cassette stripping (%% of total, %% of trimmed):\t%s\n'
                                    % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                       [starting_readcount, trimmed_readcount])))
            final_output.append('# "bad" read count (no-cassette) (%% of total, %% of trimmed):\t%s\n'
                                % value_and_percentages(no_cassette_readcount,
                                                        [starting_readcount, trimmed_readcount]))
        for end_type in cutadapt_readcount.keys():
            final_output.append('## final "good" %s reads (in main output file) (%% of total):\t%s\n'
                                % (end_type, value_and_percentages(cutadapt_readcount[end_type],
                                                                   [starting_readcount])))
        final_output.append('## final "bad" reads (in _wrong-start and/or _no-cassette files) (%% of total):\t%s\n'
                            % value_and_percentages(starting_readcount - sum(cutadapt_readcount.values()),
                                                    [starting_readcount]))
        if options.collapse_to_unique:
            for end_type in cutadapt_readcount.keys():
                final_output.append('# "good" %s unique sequence count after collapsing reads to unique sequences '
                                    % end_type + '(%% of read count):\t%s\n'
                                    % value_and_percentages(collapsed_readcount[end_type],
                                                            [cutadapt_readcount[end_type]]))
            if not options.first_bases_to_trim == 'NONE':
                final_output.append('# wrong-start unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[wrong_start_file],
                                                            [untrimmed_readcount]))
            if if_running_cutadapt:
                final_output.append('# no-cassette unique sequence count after collapsing (%% of read count):\t%s\n'
                                    % value_and_percentages(extra_collapsed_readcounts[no_cassette_file],
                                                            [no_cassette_readcount]))
        for line in final_output:
            INFOFILE.write(line)
            if options.verbosity > 0:
                print line,

    ### Remove tmpfiles
    # need to use the tmpfile*_original names here because I do "trimmed_tmpfile = infile" etc if skipping steps,
    #  and I don't want to remove the infile!
    if not options.keep_tmpfiles:
        for tmpfile in [trimmed_tmpfile_original] + cutadapt_tmpfiles_original.values():
            if os.path.exists(tmpfile):
                os.remove(tmpfile)
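
# The multi-adapter handling in step 2 above expands a comma-separated -5/-3 option value
#  into repeated cutadapt -a flags.  A standalone sketch (toy adapter sequences, not real
#  cassette ends):
def _example_adapter_option_expansion():
    """ Toy demo: a '-5 ACGTACGT,TTTTCCCC' value becomes '-a ACGTACGT -a TTTTCCCC'. """
    adapter_seqs = ' ACGTACGT,TTTTCCCC '.replace('"', '').replace("'", '').replace(' ', '')
    all_adapter_options = ' '.join(['-a %s' % seq for seq in adapter_seqs.split(',')])
    return all_adapter_options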

def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s" % (len(args), args))
    # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in '-v -e --maqerr -n --seedmms -l --seedlen'.split(' ')]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B! Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in '-m -k -a --all'.split(' ')]):
        raise Exception("Cannot include -m/-a bowtie options in -B! Use separate -m option for that.")
    specific_bowtie_options = '-v %s' % options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta':
            specific_bowtie_options += ' -f'
        elif infile_format == 'fastq':
            specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" % infile_format)
    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:
        multiple_bowtie_option = '-a'
    else:
        multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)

    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                + ('.fa' if options.multiple_to_show == 0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        # (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        #  an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #  based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, multiple_bowtie_option,
                                                options.other_bowtie_options, options.genome_bowtie_index,
                                                infile, tmpfile_genome)
        if options.bowtie_aln_file_genome is None:
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
        else:
            options.keep_tmpfiles = True
            if not os.access(options.bowtie_aln_file_genome, os.R_OK):
                raise Exception("Can't read provided options.bowtie_aln_file_genome %s!"
                                % options.bowtie_aln_file_genome)
            text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (command,
                                                                               options.bowtie_aln_file_genome)
            print text
            INFOFILE.write('\n' + text + '\n')
            tmpfile_genome = options.bowtie_aln_file_genome

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, '--all', options.other_bowtie_options,
                                                    options.cassette_bowtie_index, infile, tmpfile_cassette)
            if options.bowtie_aln_file_cassette is None:
                run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)
            else:
                options.keep_tmpfiles = True
                if not os.access(options.bowtie_aln_file_cassette, os.R_OK):
                    raise Exception("Can't read provided options.bowtie_aln_file_cassette %s!"
                                    % options.bowtie_aln_file_cassette)
                text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (command,
                                                                                   options.bowtie_aln_file_cassette)
                print text
                INFOFILE.write('\n' + text + '\n')
                tmpfile_cassette = options.bowtie_aln_file_cassette

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text % (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text % (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately
        #  ANNOYING and uses stderr both for normal output and for errors, AND gives no returncode.

        ### Parse the two alignment files in parallel, and merge them together (remove sub-optimal alignments,
        #   and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        #   Do all this WITHOUT reading the entire files into memory!  A bit tricky.
        if options.cassette_bowtie_index != 'NONE':
            aln_list_generator = aln_generator_from_two_samfiles_parallel(tmpfile_genome, tmpfile_cassette)
        else:
            aln_list_generator = aln_generator_from_single_samfile(tmpfile_genome)

        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            GENOMIC_UNIQUE_FILE = MULTIPLE_GENOMIC_FILE = CASSETTE_FILE = UNALIGNED_FILE = open(outfile_all, 'w')
            unaligned_as_fasta = False
        else:
            UNALIGNED_FILE = open(outfile_unaligned, 'w')
            CASSETTE_FILE = open(outfile_cassette, 'w')
            MULTIPLE_GENOMIC_FILE = open(outfile_multiple_genomic, 'w')
            GENOMIC_UNIQUE_FILE = open(outfile_genomic_unique, 'w')
            unaligned_as_fasta = True
        category_readcounts = {'unaligned': 0, 'cassette': 0, 'multiple-genomic': 0, 'genomic-unique': 0,
                               'cassette-multiple': 0}
        for (readname, full_aln_list) in aln_list_generator:
            reduced_aln_list = reduce_alignment_list(full_aln_list)
            final_aln_list = prioritize_cassette_reads(reduced_aln_list,
                                                       if_cassette_function=is_cassette_chromosome)
            categorize_reads_print_to_files(readname, final_aln_list, category_readcounts, UNALIGNED_FILE,
                                            CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE,
                                            unaligned_as_fasta=unaligned_as_fasta,
                                            multiple_to_write=options.multiple_to_show,
                                            input_collapsed_to_unique=options.input_collapsed_to_unique,
                                            no_multi_cassette_warnings=options.no_multi_cassette_warnings)
        if options.dont_split_by_category:
            # all the filehandles are actually the same pointer, so only close it once
            GENOMIC_UNIQUE_FILE.close()
        else:
            UNALIGNED_FILE.close()
            CASSETTE_FILE.close()
            MULTIPLE_GENOMIC_FILE.close()
            GENOMIC_UNIQUE_FILE.close()
        # delete alignment tmpfiles now that they've been parsed
        if not options.keep_tmpfiles:
            os.remove(tmpfile_genome)
            if options.cassette_bowtie_index != 'NONE':
                os.remove(tmpfile_cassette)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_readcounts.pop('cassette-multiple')
        total_reads = sum(category_readcounts.values())
        text2 = "# total reads: %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_readcounts.items()):
            text = "# %s: %s" % (category, value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet:
                print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.fa is X_info.txt, but for X_5prime.fa it can be either
                #  X_5prime_info.txt or X_info.txt, so try both.
                # (in the new preprocessing version all files are X_*prime.fa and the info files are X_info.txt;
                #  in the old version it was just X.fa and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
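
# The AUTO metadata lookup above falls back to stripping a _5prime/_3prime suffix when the
#  exact _info.txt file is missing.  A standalone sketch of that filename logic
#  (hypothetical filenames):
def _example_metadata_name_fallback(infile):
    """ Toy demo: 'X_5prime.fa' -> 'X_5prime_info.txt' if present, else 'X_info.txt'. """
    basename = os.path.splitext(infile)[0]
    metadata_file = basename + '_info.txt'
    if not os.path.exists(metadata_file):
        if basename.endswith('_3prime') or basename.endswith('_5prime'):
            # '_3prime' and '_5prime' have the same length, so one slice handles both
            metadata_file = basename[:-len('_3prime')] + '_info.txt'
    return metadata_file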

def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """
    try:
        [infile] = args
    except ValueError:
        parser = define_option_parser()
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s" % (len(args), args))
    # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in '-v -e --maqerr -n --seedmms -l --seedlen'.split(' ')]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B! Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in '-m -k -a --all'.split(' ')]):
        raise Exception("Cannot include -m/-a bowtie options in -B! Use separate -m option for that.")
    specific_bowtie_options = '-v %s' % options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta':
            specific_bowtie_options += ' -f'
        elif infile_format == 'fastq':
            specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" % infile_format)
    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:
        multiple_bowtie_option = '-a'
    else:
        multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)

    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                + ('.fa' if options.multiple_to_show == 0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        # (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        #  an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #  based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, multiple_bowtie_option,
                                                options.other_bowtie_options, options.genome_bowtie_index,
                                                infile, tmpfile_genome)
        run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s" % (specific_bowtie_options, '--all', options.other_bowtie_options,
                                                    options.cassette_bowtie_index, infile, tmpfile_cassette)
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text % (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text % (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately
        #  ANNOYING and uses stderr both for normal output and for errors, AND gives no returncode.

        ### Parse the two alignment files, and merge them together (remove sub-optimal alignments,
        #   and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_cassette,
                                                              starting_dict=readname_to_aln_list)
        # MAYBE-TODO right now I'm reading the entire files into memory before merging and processing them,
        #  which takes a fair amount of memory - could instead write something that would read both alignment files
        #  in parallel and do the merging and output-writing read-by-read.  Do that if I start getting memory issues.
        reduce_alignment_dict(readname_to_aln_list)
        prioritize_cassette_reads(readname_to_aln_list, if_cassette_function=is_cassette_chromosome)
        # delete alignment tmpfiles now that they've been parsed
        os.remove(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            os.remove(tmpfile_cassette)

        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            with open(outfile_all, 'w') as ALL_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list,
                                            ALL_FILE, ALL_FILE, ALL_FILE, ALL_FILE, unaligned_as_fasta=False,
                                            multiple_to_write=options.multiple_to_show,
                                            input_collapsed_to_unique=options.input_collapsed_to_unique,
                                            no_warnings=options.quiet)
        else:
            with open(outfile_unaligned, 'w') as UNALIGNED_FILE:
                with open(outfile_cassette, 'w') as CASSETTE_FILE:
                    with open(outfile_multiple_genomic, 'w') as MULTIPLE_GENOMIC_FILE:
                        with open(outfile_genomic_unique, 'w') as GENOMIC_UNIQUE_FILE:
                            category_counts = categorize_reads_print_to_files(readname_to_aln_list,
                                            UNALIGNED_FILE, CASSETTE_FILE, MULTIPLE_GENOMIC_FILE,
                                            GENOMIC_UNIQUE_FILE, unaligned_as_fasta=True,
                                            multiple_to_write=options.multiple_to_show,
                                            input_collapsed_to_unique=options.input_collapsed_to_unique,
                                            no_warnings=options.quiet)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        cassette_multiple = category_counts.pop('cassette-multiple')
        total_reads = sum(category_counts.values())
        text2 = "# total reads: %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_counts.items()):
            text = "# %s: %s" % (category, value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet:
                print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.fa is X_info.txt, but for X_5prime.fa it can be either
                #  X_5prime_info.txt or X_info.txt, so try both.
                # (in the new preprocessing version all files are X_*prime.fa and the info files are X_info.txt;
                #  in the old version it was just X.fa and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
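
# The FINAL ALIGNMENT CATEGORY COUNTS block above is plain string formatting over the
#  category-count dict.  A standalone sketch with made-up counts (inline percentage math
#  stands in for the value_and_percentages helper used by the real code):
def _example_category_report():
    """ Toy demo of the per-category report lines. """
    category_counts = {'unaligned': 10, 'cassette': 5, 'multiple-genomic': 25, 'genomic-unique': 60}
    total_reads = sum(category_counts.values())
    lines = ["# total reads: %s" % total_reads]
    for category, count in sorted(category_counts.items()):
        lines.append("# %s: %s (%.0f%%)" % (category, count, 100.0 * count / total_reads))
    return lines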