def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """Prepare qsub text files.

    commands: list of commands
    job_prefix: a short, descriptive name for the job.
    queue: name of the queue to submit to
    jobs_dir: path to directory where job submission scripts are written
    walltime: the maximal walltime
    ncpus: number of cpus
    nodes: number of nodes
    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither
    """
    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        fd, job_name = mkstemp(dir=jobs_dir, prefix=job_prefix + "_",
                               suffix=".txt")
        close(fd)
        out_fh = open(job_name, "w")
        out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                  keep_output, command))
        out_fh.close()
        filenames.append(job_name)
    return filenames
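# Hedged usage sketch (not from the original source): the command strings,
# queue name, and job prefix below are hypothetical; make_jobs, create_dir,
# and the QSUB_TEXT template are assumed to come from the surrounding module.
example_commands = ['echo job one', 'echo job two']  # hypothetical commands
job_fps = make_jobs(example_commands, job_prefix='demo', queue='friendlyq',
                    jobs_dir='jobs/')
# one qsub script is written per command, e.g. ['jobs/demo_XXXXXX.txt', ...]
print(job_fps)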
def write_checkpoint(current_key, ctr, cluster_mapping, ids, bestscores,
                     order, out_fp):
    """Write intermediate results to a checkpoint file.

    current_key: the identifier of the current denoiser round
    ctr: a unique counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    order: a list of ids, which defines the order in which flowgrams are
           clustered
    bestscores: a dict of
    """
    checkpoint_dir = out_fp + "/checkpoints/"

    if not exists(checkpoint_dir):
        create_dir(checkpoint_dir)

    out_fp = checkpoint_dir + "/checkpoint%d.pickle" % ctr
    out_fh = open(out_fp, "w")
    pickle.dump((current_key, ctr, cluster_mapping, ids, bestscores, order),
                out_fh)
    return out_fp
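# Hedged companion sketch (not in the original source): reading back a
# checkpoint written by write_checkpoint(). The function name is hypothetical;
# the tuple order mirrors exactly what write_checkpoint() pickles above.
import pickle

def read_checkpoint(checkpoint_fp):
    with open(checkpoint_fp) as fh:
        (current_key, ctr, cluster_mapping,
         ids, bestscores, order) = pickle.load(fh)
    return current_key, ctr, cluster_mapping, ids, bestscores, order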
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    ignore_pass_filter = opts.ignore_pass_filter

    for lane in lanes:
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        read1_fps = sorted(glob('%s/s_%s_%d_*qseq.txt' %
                                (input_dir, lane.replace(',', ''), read)))
        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        output_f = open(output_fp, 'w')
        for read1_fp in read1_fps:
            for record in iter_split_lines(open(read1_fp, 'U')):
                fastq_s, pass_filter = illumina_data_to_fastq(
                    record, number_of_bases=bases)
                if ignore_pass_filter or pass_filter != 0:
                    output_f.write('%s\n' % fastq_s)
        output_f.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation:
        if not opts.mapping_fp:
            option_parser.error("To use --attempt_read_reorientation, one must "
                                "supply a mapping file that contains both "
                                "LinkerPrimerSequence and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end":
        if not opts.fastq2:
            option_parser.error("To use input_type of barcode_paired_end, "
                                "a second fastq file must be specified with "
                                "--fastq2")

    if not opts.fastq2:
        disable_header_match = True
    else:
        disable_header_match = opts.disable_header_match

    fastq1 = qiime_open(opts.fastq1)
    if opts.fastq2:
        fastq2 = qiime_open(opts.fastq2)
    else:
        fastq2 = None
    create_dir(opts.output_dir)
    if opts.mapping_fp:
        map_fp = qiime_open(opts.mapping_fp)
    else:
        map_fp = None

    extract_barcodes(fastq1, fastq2, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1,
                     opts.rev_comp_bc2, opts.char_delineator,
                     opts.switch_bc_order, map_fp,
                     opts.attempt_read_reorientation, disable_header_match)
def test_create_dir(self):
    """create_dir creates dir and fails meaningfully."""
    # create a directory
    tmp_dir_path = mkdtemp()

    # create a random temporary directory name
    tmp_dir_path2 = join(mkdtemp(), str(uuid4()))
    tmp_dir_path3 = join(mkdtemp(), str(uuid4()))

    self.dirs_to_remove += [tmp_dir_path, tmp_dir_path2, tmp_dir_path3]

    # create on existing dir raises OSError if fail_on_exist=True
    self.assertRaises(OSError, create_dir, tmp_dir_path, fail_on_exist=True)
    self.assertEqual(create_dir(tmp_dir_path, fail_on_exist=True,
                                handle_errors_externally=True), 1)

    # return should be 1 if dir exists and fail_on_exist=False
    self.assertEqual(create_dir(tmp_dir_path, fail_on_exist=False), 1)

    # if the dir is not there, make it and always return 0
    self.assertEqual(create_dir(tmp_dir_path2), 0)
    self.assertEqual(create_dir(tmp_dir_path3, fail_on_exist=True), 0)
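# Hedged sketch (not part of the test suite) of the return-code semantics the
# test above exercises: create_dir returns 0 when it makes the directory and
# 1 when the directory already exists (handle_errors_externally=True
# suppresses the OSError). The 'results/' path is hypothetical and create_dir
# is assumed to be imported as in the tests.
status = create_dir('results/', fail_on_exist=True,
                    handle_errors_externally=True)
if status == 1:
    print("'results/' already exists; reusing it")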
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode, correlation_type=opts.correlation_type,
        tail_type=opts.tail_type, num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their filenames
    # on the original input filenames. If the filenames are the same, append
    # a number to each filename.
    same_filenames = False
    if basename(opts.taxa_summary_fps[0]) == \
            basename(opts.taxa_summary_fps[1]):
        same_filenames = True

    for orig_ts_fp, filled_ts_lines, file_num in zip(opts.taxa_summary_fps,
                                                     results[:2], range(0, 2)):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        filled_ts_f = open(join(opts.output_dir, filled_ts_fp), 'w')
        filled_ts_f.write(filled_ts_lines)
        filled_ts_f.close()

    # Write the overall comparison result.
    overall_comp_f = open(join(opts.output_dir, 'overall_comparison.txt'), 'w')
    overall_comp_f.write(results[2])
    overall_comp_f.close()

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        corr_vec_f = open(join(opts.output_dir, 'detailed_comparisons.txt'),
                          'w')
        corr_vec_f.write(results[3])
        corr_vec_f.close()
def main():
    args = handle_program_options()

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    metagenomes = []
    if args.metagenome_id is not None:
        metagenomes.append(args.metagenome_id)
    elif args.metagenome_file is not None:
        metagenomes.extend(parse_metagenome_file(args.metagenome_file))

    if args.verbose:
        msg = 'Processing requested for {} metagenome(s) found in: {}'
        print(msg.format(len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()

        derepp_rsp = mgapi.mgrast_request('download', mg_id,
                                          {'file': derep_passed},
                                          auth_key=args.auth_key)
        derepp_sc = SequenceCollection.read(StringIO(derepp_rsp.text),
                                            format='fastq',
                                            variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(derepp_sc)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()

        screenp_rsp = mgapi.mgrast_request('download', mg_id,
                                           {'file': screen_passed},
                                           auth_key=args.auth_key)
        screenp_ids = extract_seq_ids(screenp_rsp.text, fmt='fastq',
                                      variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screenp_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derepp_sc, screenp_ids)
        if args.verbose:
            nsp = len(screenp_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(nsp))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
def main():
    with open('/data/input/AppSession.json', 'U') as fd_json:
        app = json.load(fd_json)

    jobs = '1'
    # get command attributes, etc
    for item in app['Properties']['Items']:
        if item['Name'] == 'Input.Projects':
            project_id = item['Items'][0]['Id']
        if item['Name'] == 'Input.number-of-jobs':
            jobs = item['Content']

    # from BaseSpace's documentation
    input_dir = '/data/input/samples/'
    base = join('/data/output/appresults/', project_id)
    output_dir = join(base, 'sl-out')

    # for sanity
    create_dir(output_dir)

    # split libraries
    cmd = ("multiple_split_libraries_fastq.py "
           "-i '{input_dir}' -o '{output_dir}'")
    params = {'input_dir': input_dir, 'output_dir': output_dir}
    system_call(cmd.format(**params))

    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    # OTU picking
    input_dir = join(output_dir, 'seqs.fna')
    output_dir = join(base, 'closed-ref')
    cmd = ("pick_closed_reference_otus.py "
           "-i '{input_seqs}' -o '{output_dir}'")
    params = {'input_seqs': input_dir, 'output_dir': output_dir, 'jobs': jobs}

    # see https://github.com/biocore/qiime/issues/2034
    if jobs != '1':
        cmd += ' -a -O {jobs}'
    system_call(cmd.format(**params))

    for log_file in glob(join(output_dir, 'log_*')):
        with open(log_file, 'U') as fd_log:
            print fd_log.read()

    input_dir = join(output_dir, 'otu_table.biom')
    output_dir = join(base, 'closed-ref', 'table-summary.txt')
    cmd = ("biom summarize-table "
           "-i '{input_table}' -o '{output_summary}'")
    params = {'input_table': input_dir, 'output_summary': output_dir}
    system_call(cmd.format(**params))

    return 0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field

    results = make_distance_boxplots(
        dm_f, map_f, fields, width=opts.width, height=opts.height,
        suppress_all_within=opts.suppress_all_within,
        suppress_all_between=opts.suppress_all_between,
        suppress_individual_within=opts.suppress_individual_within,
        suppress_individual_between=opts.suppress_individual_between,
        y_min=opts.y_min, y_max=opts.y_max,
        whisker_length=opts.whisker_length, box_width=opts.box_width,
        box_color=opts.box_color,
        color_individual_within_by_field=color_individual_within_by_field,
        sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir,
                              "%s_Distances.%s" % (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w')
            sig_tests_results = all_pairs_t_test(
                plot_labels, plot_data, tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)
            sig_tests_f.close()

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            raw_data_f = open(raw_data_fp, 'w')
            for label, data in zip(plot_labels, plot_data):
                raw_data_f.write(label.replace(" ", "_") + "\t")
                raw_data_f.write("\t".join(map(str, data)))
                raw_data_f.write("\n")
            raw_data_f.close()
def setUp(self):
    # create the temporary input files that will be used
    self.iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]',
                  'Y': '[CT]', 'S': '[GC]', 'W': '[AT]', 'K': '[GT]',
                  'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]', 'H': '[ACT]',
                  'V': '[ACG]', 'N': '[ACGT]'}

    self.output_dir = mkdtemp()
    self.output_dir += '/'

    create_dir(self.output_dir)
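# Illustrative sketch (not part of the test): the IUPAC mapping above can be
# used to expand a degenerate primer into a regular expression. The primer
# sequence shown here is hypothetical.
import re

iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]',
         'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
         'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

primer = "GTGYCAGCMGCCGCGGTAA"  # hypothetical degenerate primer
primer_pattern = re.compile(''.join(iupac[base] for base in primer))
print(bool(primer_pattern.match("GTGTCAGCAGCCGCGGTAA")))  # True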
def write_xml_file(self, element, fp):
    """Writes an XML file after calling one of the XML generation functions

    Parameters
    ----------
    element : ET.Element
        The Element to be written
    fp : str
        The filepath to which the XML will be written
    """
    create_dir(self.xml_dir)
    ET.ElementTree(element).write(fp, encoding='UTF-8')
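# Hedged standalone sketch (not from the original class) of the ElementTree
# write pattern used by write_xml_file above. The 'SUBMISSION'/'ACTIONS' tags
# and the 'submission.xml' filename are hypothetical.
import xml.etree.ElementTree as ET

root = ET.Element('SUBMISSION')
ET.SubElement(root, 'ACTIONS')
ET.ElementTree(root).write('submission.xml', encoding='UTF-8')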
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_dir = opts.output_dir
    categories = opts.categories.split(",")

    # Create the output dir if it doesn't already exist.
    try:
        if not exists(out_dir):
            create_dir(out_dir)
    except:
        option_parser.error("Could not create or access output directory '%s' "
                            "specified with the -o option." % out_dir)

    compare_categories(opts.input_dm, opts.mapping_file, opts.method,
                       categories, opts.num_permutations, out_dir)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.num_permutations < 1:
        option_parser.error(
            "--num_permutations must be greater than or equal to 1.")

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in fields_to_dict(
            open(opts.sample_id_map_fp, "U")).items()])

    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    if opts.method == 'mantel':
        output_f = open(path.join(opts.output_dir, 'mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('mantel', input_dm_fps, distmats,
                                       opts.num_permutations, opts.tail_type,
                                       comment_mantel_pmantel,
                                       sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        output_f = open(path.join(opts.output_dir,
                                  'partial_mantel_results.txt'), 'w')
        output_f.write(run_mantel_test('partial_mantel', input_dm_fps,
                                       distmats, opts.num_permutations,
                                       opts.tail_type, comment_mantel_pmantel,
                                       control_dm_fp=opts.control_dm,
                                       control_dm=parse_distmat(
                                           open(opts.control_dm, 'U')),
                                       sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        output_f = open(path.join(opts.output_dir,
                                  'mantel_correlogram_results.txt'), 'w')
        result_str, correlogram_fps, correlograms = run_mantel_correlogram(
            input_dm_fps, distmats, opts.num_permutations, comment_corr,
            opts.alpha, sample_id_map=sample_id_map,
            variable_size_distance_classes=opts.variable_size_distance_classes)

        output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir,
                                   corr_fp + opts.image_type),
                         format=opts.image_type)

    output_f.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c

    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open
        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        sequence_output_f = open(sequence_output_fp, 'w')
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir,
                                                      input_basename)
        barcode_output_f = open(barcode_output_fp, 'w')

        for line in open_f(input_fp):
            common_fields, sequence, sequence_qual, barcode, barcode_qual = \
                iseq_to_qseq_fields(line, barcode_in_header, barcode_length,
                                    barcode_qual_c)

            sequence_s, pass_filter_s = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7],
                 sequence, sequence_qual))

            barcode_s, pass_filter_b = illumina_data_to_fastq(
                (common_fields[0], common_fields[1], common_fields[2],
                 common_fields[3], common_fields[4], common_fields[5],
                 common_fields[6], common_fields[7],
                 barcode, barcode_qual), barcode_length)

            if pass_filter_s != 0:
                sequence_output_f.write('%s\n' % sequence_s)
                barcode_output_f.write('%s\n' % barcode_s)

        sequence_output_f.close()
        barcode_output_f.close()
def setUp(self):
    # create the temporary input files that will be used
    self._files_to_remove = []

    fd, self.qual_fp = mkstemp(prefix='qual_scores_', suffix='.qual')
    close(fd)
    seq_file = open(self.qual_fp, 'w')
    seq_file.write(qual_scores)
    seq_file.close()

    self.output_dir = mkdtemp()
    self.output_dir += '/'

    create_dir(self.output_dir)

    self.expected_output_text_file = expected_output_text_file

    self._files_to_remove = [self.qual_fp]
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """Split infile into files with seqs_per_file sequences in each

    infile: list of fasta lines or open file object
    seqs_per_file: the number of sequences to include in each file
    outfile_prefix: string used to create output filepath - output filepaths
     are <out_prefix>.<i>.fasta where i runs from 0 to number of output files
    working_dir: directory to prepend to temp filepaths (defaults to empty
     string -- files written to cwd)

    List of output filepaths is returned.
    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    for seq_id, seq in parse_fasta(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' % (working_dir, outfile_prefix,
                                                len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1

        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0

    if not current_out_file.closed:
        current_out_file.close()

    return out_files
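# Minimal usage sketch (not part of the library): split a small in-memory
# FASTA record list into chunks of two sequences per file. The record
# contents and the 'chunks' directory are hypothetical; parse_fasta and
# create_dir are assumed to be importable as in the function above.
fasta_lines = ['>seq1', 'ACGT', '>seq2', 'GGCC', '>seq3', 'TTAA']
chunk_fps = split_fasta(fasta_lines, seqs_per_file=2,
                        outfile_prefix='demo', working_dir='chunks')
print(chunk_fps)  # e.g. ['chunks/demo.0.fasta', 'chunks/demo.1.fasta']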
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    output_dir = opts.output_dir
    try:
        create_dir(output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o/--output_dir option.")

    otu_table_fp = opts.otu_table_fp
    table = load_table(otu_table_fp)

    estimator = ObservationRichnessEstimator(table,
                                             Chao1MultinomialPointEstimator)
    results = estimator(opts.min, opts.max, opts.num_steps,
                        opts.confidence_level)

    out_fp = join(output_dir, 'estimates_table.txt')
    with open(out_fp, 'w') as out_f:
        results.toTable(out_f)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        dist_matrix_header, dist_matrix = parse_distmat(
            open(opts.distance_matrix_fp, 'U'))
    except:
        option_parser.error("This does not look like a valid distance matrix "
                            "file. Please supply a valid distance matrix "
                            "file using the -d option.")
    try:
        mapping, mapping_header, mapping_comments = parse_mapping_file(
            open(opts.mapping_fp, 'U'))
    except QiimeParseError:
        option_parser.error("This does not look like a valid metadata "
                            "mapping file. Please supply a valid mapping "
                            "file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    comparison_field_states = opts.comparison_groups
    comparison_field_states = map(strip, comparison_field_states.split(','))
    comparison_field_states = [field_state.strip('"').strip("'")
                               for field_state in comparison_field_states]
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        try:
            num_x = float(x)
            num_y = float(y)
            return int(num_x - num_y)
        except:
            if x < y:
                return -1
            elif x > y:
                return 1
            else:
                return 0

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except:
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(
                comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB())
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image "
                            "must be greater than zero.")

    plot_figure = grouped_distributions(
        opts.plot_type, plot_data, x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors, x_label=plot_x_label,
        y_label=plot_y_label, title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation, y_min=y_min,
        y_max=y_max, whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width, figure_width=width,
        figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(opts.output_dir, "%s_Distance_Comparisons.%s" %
                          (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), 'w')

        # Rearrange the plot data into a format suitable for all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data,
                                                plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' % (data_point_label,
                                                      comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels, sig_tests_data, tail_type=opts.tail_type,
            num_permutations=opts.num_permutations)
        sig_tests_f.write(sig_tests_results)
        sig_tests_f.close()

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\
            "labels do not match the number of points along the x-axis."
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        raw_data_f = open(raw_data_fp, 'w')

        raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
        for label, data in zip(plot_x_axis_labels, plot_data):
            assert (len(comparison_field_states) == len(data)), "The " +\
                "number of specified comparison groups does not match " +\
                "the number of groups found at the current point along " +\
                "the x-axis."
            for comp_field_state, comp_grp_data in zip(
                    comparison_field_states, data):
                raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                 "\t".join(map(str, comp_grp_data)) + "\n")
        raw_data_f.close()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase "
                            "--num_fraction_for_core_steps.")
    fractions_for_core = np.linspace(opts.min_fraction_for_core,
                                     opts.max_fraction_for_core,
                                     opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        sample_ids = sample_ids_from_metadata_description(
            open(mapping_fp, 'U'), valid_states)
        if len(sample_ids) < 1:
            option_parser.error("--valid_states pattern didn't match any "
                                "entries in mapping file: \"%s\"" %
                                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    input_table = parse_biom_table(open(input_fp, 'U'))

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(output_dir,
                         'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,
                               'core_table_%s.biom' % fraction_for_core_str)
        output_f = open(output_fp, 'w')

        try:
            core_table = filter_table_to_core(input_table, sample_ids,
                                              fraction_for_core)
        except TableException:
            output_f.write("# No OTUs present in %s %% of samples." %
                           fraction_for_core_str)
            output_f.close()
            otu_counts.append(0)
            continue

        # write some header information to file
        if sample_ids is None:
            output_f.write("# Core OTUs across %s %% of samples.\n" %
                           fraction_for_core_str)
        else:
            output_f.write("# Core OTUs across %s %% of samples matching the "
                           "sample metadata pattern \"%s\":\n# %s\n" %
                           (fraction_for_core_str, valid_states,
                            ' '.join(sample_ids)))

        # write the otu id and corresponding metadata for all core otus
        otu_count = 0
        for value, id_, md in core_table.iter(axis='observation'):
            output_f.write('%s\t%s\n' % (id_, md[otu_md]))
            otu_count += 1
        output_f.close()

        # write the core biom table
        write_biom_table(core_table, output_table_fp)

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel("Fraction of samples that OTU must be observed in to be "
           "considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    verbose = opts.verbose

    otu_table_fp = opts.otu_table_fp
    output_dir = opts.output_dir
    mapping_fp = opts.mapping_fp
    tree_fp = opts.tree_fp
    verbose = opts.verbose
    print_only = opts.print_only
    seqs_per_sample = opts.seqs_per_sample

    parallel = opts.parallel
    # No longer checking that jobs_to_start > 2, but
    # commenting as we may change our minds about this.
    # if parallel: raise_error_on_parallel_unavailable()

    if opts.parameter_fp:
        try:
            parameter_f = open(opts.parameter_fp, 'U')
        except IOError:
            raise IOError("Can't open parameters file (%s). Does it exist? "
                          "Do you have read access?" % opts.parameter_fp)
        params = parse_qiime_parameters(parameter_f)
        parameter_f.close()
    else:
        params = parse_qiime_parameters([])
        # empty list returns empty defaultdict for now

    jobs_to_start = opts.jobs_to_start
    default_jobs_to_start = qiime_config['jobs_to_start']
    validate_and_set_jobs_to_start(params, jobs_to_start,
                                   default_jobs_to_start, parallel,
                                   option_parser)

    create_dir(output_dir, fail_on_exist=not opts.force)

    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    run_beta_diversity_through_plots(
        otu_table_fp=otu_table_fp, mapping_fp=mapping_fp,
        output_dir=output_dir, command_handler=command_handler,
        params=params, qiime_config=qiime_config,
        color_by_interesting_fields_only=not opts.color_by_all_fields,
        sampling_depth=seqs_per_sample, tree_fp=tree_fp, parallel=parallel,
        suppress_emperor_plots=opts.suppress_emperor_plots,
        status_update_callback=status_update_callback)
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None is passed, it defaults to
        `[70]`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the other
        database search results. If None is passed, it defaults to `[50]`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the first
        database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        second database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """
    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    db_a = open(interest_fp, 'U')
    db_b = open(other_fp, 'U')

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts
    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both "
                               "databases should be the same length: %s - %s"
                               % (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases
    total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                    interest_alg_lens)
    parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")
        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(['interest db (%s)' %
                                     basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']
        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):
            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])
        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
def _get_job_commands(self, input_fp, output_dir, params, job_prefix,
                      working_dir, command_prefix='/bin/bash; ',
                      command_suffix='; exit'):
    """Generate beta diversity commands, splitting one OTU table across jobs

    full_tree=True is faster: beta_diversity.py -f will make things go
    faster, but be sure you already have the correct minimal tree.
    """
    commands = []
    result_filepaths = []

    sids = load_table(input_fp).ids()

    if params['full_tree']:
        full_tree_str = '-f'
    else:
        full_tree_str = ''

    if params['tree_path']:
        tree_str = '-t %s' % params['tree_path']
    else:
        tree_str = ''

    metrics = params['metrics']

    # this is a little bit of an abuse of _merge_to_n_commands, so may
    # be worth generalizing that method - this determines the correct
    # number of samples to process in each command
    sample_id_groups = self._merge_to_n_commands(sids,
                                                 params['jobs_to_start'],
                                                 delimiter=',',
                                                 command_prefix='',
                                                 command_suffix='')

    for i, sample_id_group in enumerate(sample_id_groups):
        working_dir_i = join(working_dir, str(i))
        create_dir(working_dir_i)
        output_dir_i = join(output_dir, str(i))
        create_dir(output_dir_i)
        result_filepaths.append(output_dir_i)

        input_dir, input_fn = split(input_fp)
        input_basename, input_ext = splitext(input_fn)
        sample_id_desc = sample_id_group.replace(',', '_')
        output_fns = ['%s_%s.txt' % (metric, input_basename)
                      for metric in metrics.split(',')]
        rename_command, current_result_filepaths = self._get_rename_command(
            output_fns, working_dir_i, output_dir_i)

        result_filepaths += current_result_filepaths

        bdiv_command = '%s -i %s -o %s %s -m %s %s -r %s' % \
            (self._script_name, input_fp, working_dir_i, tree_str,
             params['metrics'], full_tree_str, sample_id_group)

        shell_script_fp = '%s/%s%d.sh' % (working_dir_i, job_prefix, i)
        shell_script_commands = [bdiv_command] + rename_command.split(';')
        self._commands_to_shell_script(shell_script_commands,
                                       shell_script_fp)
        commands.append('bash %s' % shell_script_fp)

    commands = self._merge_to_n_commands(commands,
                                         params['jobs_to_start'],
                                         command_prefix=command_prefix,
                                         command_suffix=command_suffix)

    return commands, result_filepaths
            interest_taxonomy = sequences_from_query(open(tax_fp, 'U'), query)
        except (PlatypusValueError, PlatypusParseError), e:
            raise BadParameter(e.message)

        if len(interest_taxonomy) == 0:
            raise BadParameter('The query could not retrieve any results, try '
                               'a different one.')
    else:
        interest_taxonomy = {l.strip().split('\t')[0].strip(): ''
                             for l in open(split_fp, 'U')}

    if not interest_taxonomy:
        raise BadParameter('The split_fp is empty!')

    create_dir(output_fp, False)

    interest_fp = open(join(output_fp, 'interest.fna'), 'w')
    rest_fp = open(join(output_fp, 'rest.fna'), 'w')

    for record in read(seqs_fp, format='fasta'):
        full_name = record.id
        seq = record.sequence
        name = full_name.strip().split(' ')[0].strip()

        if name in interest_taxonomy:
            interest_fp.write(">%s\n%s\n" % (full_name, seq))
        else:
            rest_fp.write(">%s\n%s\n" % (full_name, seq))
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) read_arguments_from_file = opts.read_arguments_from_file # these arguments can optionally be read from a file, reasoning is to # allow arguments that would span over hundreds of samples and would be # prohibitive to execute as a command line call if read_arguments_from_file: # sample_ids is the only one of these arguments that's returned as a # string, the rest of them are lists if opts.sample_ids: opts.sample_ids = ','.join(parse_items(opts.sample_ids)) if opts.sequence_read_fps: opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0]) if opts.barcode_read_fps: opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0]) if opts.mapping_fps: opts.mapping_fps = parse_items(opts.mapping_fps[0]) sequence_read_fps = opts.sequence_read_fps barcode_read_fps = opts.barcode_read_fps sample_ids = None if opts.sample_ids is not None: sample_ids = opts.sample_ids.split(',') mapping_fps = opts.mapping_fps phred_quality_threshold = opts.phred_quality_threshold retain_unassigned_reads = opts.retain_unassigned_reads min_per_read_length_fraction = opts.min_per_read_length_fraction max_bad_run_length = opts.max_bad_run_length rev_comp = opts.rev_comp rev_comp_barcode = opts.rev_comp_barcode rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes seq_max_N = opts.sequence_max_n start_seq_id = opts.start_seq_id # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD # opts.filter_bad_illumina_qual_digit filter_bad_illumina_qual_digit = False store_qual_scores = opts.store_qual_scores store_demultiplexed_fastq = opts.store_demultiplexed_fastq barcode_type = opts.barcode_type max_barcode_errors = opts.max_barcode_errors # if this is not a demultiplexed run, if barcode_type == 'not-barcoded': if sample_ids is None: option_parser.error("If not providing barcode reads (because " "your data is not multiplexed), must provide --sample_ids.") if len(sample_ids) != len(sequence_read_fps): option_parser.error("If providing --sample_ids (because " "your data is not multiplexed), must provide the same number " "of sample ids as sequence read filepaths.") barcode_read_fps = [None] * len(sequence_read_fps) mapping_fps = [None] * len(sequence_read_fps) elif barcode_read_fps is None: option_parser.error("Must provide --barcode_read_fps if " "--barcode_type is not 'not-barcoded'") elif mapping_fps is None: option_parser.error("Must provide --mapping_fps if " "--barcode_type is not 'not-barcoded'") phred_offset = opts.phred_offset if phred_offset is not None: try: phred_offset = int(phred_offset) except ValueError: # shouldn't be able to get here... option_parser.error( "If --phred_offset is provided, it must be a valid integer.") if opts.last_bad_quality_char is not None: option_parser.error('--last_bad_quality_char is no longer supported. ' 'Use -q instead (see option help text by passing -h)') if not (0 < min_per_read_length_fraction <= 1): option_parser.error('--min_per_read_length_fraction must be greater ' 'than 0 and less than or equal to 1. You passed ' '%1.5f.' 
% min_per_read_length_fraction) barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None) if len(mapping_fps) == 1 and len(sequence_read_fps) > 1: mapping_fps = mapping_fps * len(sequence_read_fps) if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1: option_parser.error("Same number of sequence, barcode, and mapping " "files must be provided.") output_dir = opts.output_dir create_dir(output_dir) output_fp_temp = '%s/seqs.fna.incomplete' % output_dir output_fp = '%s/seqs.fna' % output_dir output_f = open(output_fp_temp, 'w') qual_fp_temp = '%s/qual.fna.incomplete' % output_dir qual_fp = '%s/seqs.qual' % output_dir output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir output_fastq_fp = '%s/seqs.fastq' % output_dir if store_qual_scores: qual_f = open(qual_fp_temp, 'w') # define a qual writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def qual_writer(h, q): qual_f.write('>%s\n%s\n' % (h, q)) else: def qual_writer(h, q): pass if store_demultiplexed_fastq: output_fastq_f = open(output_fastq_fp_temp, 'w') # define a fastq writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def fastq_writer(h, s, q): output_fastq_f.write(format_fastq_record(h, s, q)) else: def fastq_writer(h, s, q): pass log_fp = '%s/split_library_log.txt' % output_dir log_f = open(log_fp, 'w') histogram_fp = '%s/histograms.txt' % output_dir histogram_f = open(histogram_fp, 'w') for i in range(len(sequence_read_fps)): sequence_read_fp = sequence_read_fps[i] barcode_read_fp = barcode_read_fps[i] mapping_fp = mapping_fps[i] if mapping_fp is not None: mapping_f = open(mapping_fp, 'U') _, _, barcode_to_sample_id, _, _, _, _ = check_map(mapping_f, disable_primer_check=True, has_barcodes=barcode_read_fp is not None) else: mapping_f = None barcode_to_sample_id = {} if rev_comp_mapping_barcodes: barcode_to_sample_id = {str(DNA(k).rc()): v for k, v in barcode_to_sample_id.iteritems()} if barcode_type == 'golay_12': invalid_golay_barcodes = get_invalid_golay_barcodes( barcode_to_sample_id.keys()) if len(invalid_golay_barcodes) > 0: option_parser.error("Some or all barcodes are not valid golay " "codes. Do they need to be reverse complemented? If these " "are not golay barcodes pass --barcode_type 12 to disable " "barcode error correction, or pass --barcode_type # if " "the barcodes are not 12 base pairs, where # is the size " "of the barcodes. 
Invalid codes:\n\t%s" % ' '.join(invalid_golay_barcodes)) log_f.write("Input file paths\n") if mapping_fp is not None: log_f.write('Mapping filepath: %s (md5: %s)\n' % (mapping_fp, safe_md5(open(mapping_fp)).hexdigest())) log_f.write('Sequence read filepath: %s (md5: %s)\n' % (sequence_read_fp, str(safe_md5(open(sequence_read_fp)).hexdigest()))) if sequence_read_fp.endswith('.gz'): sequence_read_f = gzip_open(sequence_read_fp) else: sequence_read_f = open(sequence_read_fp, 'U') seq_id = start_seq_id if barcode_read_fp is not None: log_f.write('Barcode read filepath: %s (md5: %s)\n\n' % (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest())) if barcode_read_fp.endswith('.gz'): barcode_read_f = gzip_open(barcode_read_fp) else: barcode_read_f = open(barcode_read_fp, 'U') seq_generator = process_fastq_single_end_read_file( sequence_read_f, barcode_read_f, barcode_to_sample_id, store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, barcode_correction_fn=barcode_correction_fn, max_barcode_errors=max_barcode_errors, phred_offset=phred_offset) else: seq_generator = process_fastq_single_end_read_file_no_barcode( sequence_read_f, sample_ids[i], store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, phred_offset=phred_offset) for fasta_header, sequence, quality, seq_id in seq_generator: output_f.write('>%s\n%s\n' % (fasta_header, sequence)) qual_writer(fasta_header, quality) fastq_writer(fasta_header, sequence, quality) start_seq_id = seq_id + 1 log_f.write('\n---\n\n') output_f.close() rename(output_fp_temp, output_fp) # process the optional output files, as necessary if store_qual_scores: qual_f.close() rename(qual_fp_temp, qual_fp) if store_demultiplexed_fastq: output_fastq_f.close() rename(output_fastq_fp_temp, output_fastq_fp)
def pick_subsampled_open_reference_otus( input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=None, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, suppress_index_page=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust'] allowed_reference_otu_picking_methods = [ 'uclust_ref', 'usearch61_ref', 'sortmerna' ] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\ "Unknown de novo OTU picking method: %s. Known methods are: %s"\ % (denovo_otu_picking_method, ','.join(allowed_denovo_otu_picking_methods)) assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\ "Unknown reference OTU picking method: %s. Known methods are: %s"\ % (reference_otu_picking_method, ','.join(allowed_reference_otu_picking_methods)) # Prepare some variables for the later steps index_links = [] input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: log_fp = generate_log_fp(output_dir) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) close_logger_on_success = True index_links.append( ('Run summary data', log_fp, _index_headers['run_summary'])) else: close_logger_on_success = False if not suppress_md5: log_input_md5s( logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. 
this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = '%s/step1_otus' % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = '%s/prefilter_otus/' % output_dir prefilter_failures_list_fp = '%s/%s_failures.txt' % \ (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id) commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)]) prefiltered_input_fp = '%s/prefiltered_%s%s' %\ (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\ (input_fp, prefiltered_input_fp, prefilter_failures_list_fp) commands.append([('Filter prefilter failures from input', filter_fasta_cmd)]) index_links.append( ('Pre-filtered sequence identifiers ' '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id) * 100), prefilter_failures_list_fp, _index_headers['sequences'])) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? 
Are you using the " "correct reference file?") # Build the OTU picking command step1_dir = \ '%s/step1_otus' % output_dir step1_otu_map_fp = \ '%s/%s_otus.txt' % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger) commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = '%s/%s_failures.txt' % \ (step1_dir, input_basename) step1_failures_fasta_fp = \ '%s/failures.fasta' % step1_dir step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp, step1_failures_list_fp, step1_failures_fasta_fp) commands.append([('Generate full failures fasta file', step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = \ '%s/step1_rep_set.fna' % step1_dir step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([('Pick rep set', step1_pick_rep_set_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # name the final otu map merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir # count number of sequences in step 1 failures fasta file with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f: num_failure_seqs, mean, std = count_seqs_from_file( step1_failures_fasta_f) # number of failures sequences is greater than the threshold, # continue to step 2,3 and 4 run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold if run_step_2_and_3: # Subsample the failures fasta file to retain (roughly) the # percent_subsample step2_dir = '%s/step2_otus/' % output_dir create_dir(step2_dir) step2_input_fasta_fp = \ '%s/subsampled_failures.fasta' % step2_dir subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample) logger.write('# Subsample the failures fasta file using API \n' + 'python -c "import qiime; qiime.util.subsample_fasta' + '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp), abspath(step2_input_fasta_fp), percent_subsample)) # Prep the OTU picking command for the subsampled failures step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger) step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir commands.append([('Pick de novo OTUs for new clusters', step2_cmd)]) # Prep the rep set picking command for the subsampled failures step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp) commands.append([('Pick representative set for subsampled failures', step2_rep_set_cmd)]) step3_dir = '%s/step3_otus/' % output_dir step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir # remove the indexed reference database from the dictionary of # parameters as it must be forced to build a new database # using the step2_repset_fasta_fp if reference_otu_picking_method == 'sortmerna': if 'sortmerna_db' in params['pick_otus']: del params['pick_otus']['sortmerna_db'] step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir, reference_otu_picking_method, step2_repset_fasta_fp, parallel, params, 
logger) commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)]) index_links.append(( 'Final map of OTU identifier to sequence identifers (i.e., "OTU map")', merged_otu_map_fp, _index_headers['otu_maps'])) if not suppress_step4: step4_dir = '%s/step4_otus/' % output_dir if run_step_2_and_3: step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp) commands.append([('Create fasta file of step3 failures', step3_filter_fasta_cmd)]) failures_fp = step3_failures_fasta_fp failures_otus_fp = 'failures_failures_otus.txt' failures_step = 'step3' else: failures_fp = step1_failures_fasta_fp failures_otus_fp = 'failures_otus.txt' failures_step = 'step1' step3_otu_map_fp = "" step4_cmd = pick_denovo_otus(failures_fp, step4_dir, '.'.join([new_ref_set_id, 'CleanUp']), denovo_otu_picking_method, params, logger) step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp) commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)]) # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = 'cat %s %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp) commands.append([('Pick representative set for subsampled failures', step4_rep_set_cmd)]) else: # Merge the otu maps, note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created if run_step_2_and_3: failures_fp = step3_failures_list_fp else: failures_fp = step1_failures_list_fp step3_otu_map_fp = "" cat_otu_tables_cmd = 'cat %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) # Move the step 3 failures file to the top-level directory commands.append([ ('Move final failures file to top-level directory', 'mv %s %s/final_failures.txt' % (failures_fp, output_dir)) ]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] otu_fp = merged_otu_map_fp # Filter singletons from the otu map otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size) otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size) index_links.append( ('Final map of OTU identifier to sequence identifers excluding ' 'OTUs with fewer than %d sequences' % min_otu_size, otu_no_singletons_fp, _index_headers['otu_maps'])) logger.write( '# Filter singletons from the otu map using API \n' + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' + '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size)) # make the final representative seqs file and a new refseqs file that # could be used in subsequent otu picking runs. # this is clunky. first, we need to do this without singletons to match # the otu map without singletons. next, there is a difference in what # we need the reference set to be and what we need the repseqs to be. 
# the reference set needs to be a superset of the input reference set # to this set. the repset needs to be only the sequences that were observed # in this data set, and we want reps for the step1 reference otus to be # reads from this run so we don't hit issues building a tree using # sequences of very different lengths. so... final_repset_fp = '%s/rep_set.fna' % output_dir index_links.append(('OTU representative sequences', final_repset_fp, _index_headers['sequences'])) final_repset_f = open(final_repset_fp, 'w') new_refseqs_fp = '%s/new_refseqs.fna' % output_dir index_links.append(( 'New reference sequences (i.e., OTU representative sequences plus input ' 'reference sequences)', new_refseqs_fp, _index_headers['sequences'])) # write non-singleton otus representative sequences from step1 to the # final rep set file for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) logger.write('# Write non-singleton otus representative sequences ' + 'from step1 to the final rep set file: %s\n\n' % final_repset_fp) # copy the full input refseqs file to the new refseqs_fp copyfile(refseqs_fp, new_refseqs_fp) new_refseqs_f = open(new_refseqs_fp, 'a') new_refseqs_f.write('\n') logger.write( '# Copy the full input refseqs file to the new refseq file\n' + 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp)) # iterate over all representative sequences from step2 and step4 and write # those corresponding to non-singleton otus to the final representative set # file and the new reference sequences file. if run_step_2_and_3: for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) if not suppress_step4: for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) new_refseqs_f.close() final_repset_f.close() # steps 1-4 executed if run_step_2_and_3: logger.write( '# Write non-singleton otus representative sequences from ' + 'step 2 and step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # only steps 1 and 4 executed else: logger.write( '# Write non-singleton otus representative sequences from ' + 'step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # Prep the make_otu_table.py command otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\ (otu_no_singletons_fp, otu_table_fp) commands.append([("Make the otu table", make_otu_table_cmd)]) index_links.append( ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size, otu_table_fp, _index_headers['otu_tables'])) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. 
if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp index_links.append(( 'OTU table exluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append(( 'OTU table exluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) index_links.append(( 'OTU table exluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append(( 'OTU table exluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) index_links.append(('OTU taxonomic assignments', taxonomy_fp, _index_headers['taxa_assignments'])) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: rep_set_tree_fp = join(output_dir, 'rep_set.tre') index_links.append(('OTU phylogenetic tree', rep_set_tree_fp, _index_headers['trees'])) if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." 
% pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table( table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if close_logger_on_success: logger.close() if not suppress_index_page: index_fp = '%s/index.html' % output_dir generate_index_page(index_links, index_fp)
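# The workflow above merges the step 1/3/4 OTU maps and then drops OTUs with
# fewer than min_otu_size member sequences before building the OTU table.
# Below is a minimal, hypothetical sketch of that filtering step for
# illustration only: it is not the qiime.filter.filter_otus_from_otu_map
# implementation, and it assumes a tab-delimited OTU map in which each line
# is an OTU identifier followed by its member sequence identifiers.
def example_filter_otu_map(otu_map_fp, output_fp, min_otu_size=2):
    otus_to_keep = set()
    with open(otu_map_fp, 'U') as in_f, open(output_fp, 'w') as out_f:
        for line in in_f:
            fields = line.strip().split('\t')
            if not fields or not fields[0]:
                continue
            otu_id, seq_ids = fields[0], fields[1:]
            # keep only OTUs with at least min_otu_size member sequences
            if len(seq_ids) >= min_otu_size:
                out_f.write('%s\t%s\n' % (otu_id, '\t'.join(seq_ids)))
                otus_to_keep.add(otu_id)
    return otus_to_keep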
def iterative_pick_subsampled_open_reference_otus( input_fps, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, prefilter_percent_id=None, min_otu_size=2, run_assign_tax=True, run_align_and_tree=True, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs and handle processing of the results. """ create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False # if the user has not passed a different reference collection for the pre-filter, # used the input refseqs_fp for all iterations. we want to pre-filter all data against # the input data as lower percent identity searches with uclust can be slow, so we # want the reference collection to stay at a reasonable size. if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp otu_table_fps = [] repset_fasta_fps = [] for i, input_fp in enumerate(input_fps): iteration_output_dir = '%s/%d/' % (output_dir, i) if iteration_output_exists(iteration_output_dir, min_otu_size): # if the output from an iteration already exists, skip that # iteration (useful for continuing failed runs) log_input_md5s(logger, [input_fp, refseqs_fp]) logger.write( 'Iteration %d (input file: %s) output data already exists. ' 'Skipping and moving to next.\n\n' % (i, input_fp)) else: pick_subsampled_open_reference_otus( input_fp=input_fp, refseqs_fp=refseqs_fp, output_dir=iteration_output_dir, percent_subsample=percent_subsample, new_ref_set_id='.'.join([new_ref_set_id, str(i)]), command_handler=command_handler, params=params, qiime_config=qiime_config, run_assign_tax=False, run_align_and_tree=False, prefilter_refseqs_fp=prefilter_refseqs_fp, prefilter_percent_id=prefilter_percent_id, min_otu_size=min_otu_size, step1_otu_map_fp=step1_otu_map_fp, step1_failures_fasta_fp=step1_failures_fasta_fp, parallel=parallel, suppress_step4=suppress_step4, logger=logger, suppress_md5=suppress_md5, suppress_index_page=True, denovo_otu_picking_method=denovo_otu_picking_method, reference_otu_picking_method=reference_otu_picking_method, status_update_callback=status_update_callback, minimum_failure_threshold=minimum_failure_threshold) # perform post-iteration file shuffling whether the previous iteration's # data previously existed or was just computed. # step1 otu map and failures can only be used for the first iteration # as subsequent iterations need to use updated refseqs files step1_otu_map_fp = step1_failures_fasta_fp = None new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir refseqs_fp = new_refseqs_fp otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size)) repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir) # Merge OTU tables - check for existence first as this step has historically # been a frequent failure, so is sometimes run manually in failed runs. 
otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0): merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\ (','.join(otu_table_fps), otu_table_fp) commands.append([("Merge OTU tables", merge_cmd)]) # Build master rep set final_repset_fp = '%s/rep_set.fna' % output_dir final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." % pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table( table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] logger.close()
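# Hypothetical skeleton of the iteration pattern implemented above: each
# iteration's new_refseqs.fna becomes the reference collection for the next
# input file. pick_fn stands in for the per-iteration workflow call and is
# not a QIIME function; this is a sketch of the control flow only.
def example_iterate_refseqs(input_fps, initial_refseqs_fp, output_dir, pick_fn):
    refseqs_fp = initial_refseqs_fp
    iteration_output_dirs = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        pick_fn(input_fp, refseqs_fp, iteration_output_dir)
        # chain the expanded reference set into the next iteration
        refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        iteration_output_dirs.append(iteration_output_dir)
    return iteration_output_dirs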
def _get_job_commands(self, input_fp, output_dir, params, job_prefix, working_dir, command_prefix='/bin/bash; ', command_suffix='; exit'): """Generate beta diversity to split single OTU table to multiple jobs full_tree=True is faster: beta_diversity.py -f will make things go faster, but be sure you already have the correct minimal tree. """ commands = [] result_filepaths = [] sids = load_table(input_fp).ids() if params['full_tree']: full_tree_str = '-f' else: full_tree_str = '' if params['tree_path']: tree_str = '-t %s' % params['tree_path'] else: tree_str = '' metrics = params['metrics'] # this is a little bit of an abuse of _merge_to_n_commands, so may # be worth generalizing that method - this determines the correct # number of samples to process in each command sample_id_groups = self._merge_to_n_commands(sids, params['jobs_to_start'], delimiter=',', command_prefix='', command_suffix='') for i, sample_id_group in enumerate(sample_id_groups): working_dir_i = join(working_dir, str(i)) create_dir(working_dir_i) output_dir_i = join(output_dir, str(i)) create_dir(output_dir_i) result_filepaths.append(output_dir_i) input_dir, input_fn = split(input_fp) input_basename, input_ext = splitext(input_fn) sample_id_desc = sample_id_group.replace(',', '_') output_fns = [ '%s_%s.txt' % (metric, input_basename) for metric in metrics.split(',') ] rename_command, current_result_filepaths = self._get_rename_command( output_fns, working_dir_i, output_dir_i) result_filepaths += current_result_filepaths bdiv_command = '%s -i %s -o %s %s -m %s %s -r %s' %\ (self._script_name, input_fp, working_dir_i, tree_str, params['metrics'], full_tree_str, sample_id_group) shell_script_fp = '%s/%s%d.sh' % (working_dir_i, job_prefix, i) shell_script_commands = [bdiv_command] + rename_command.split(';') self._commands_to_shell_script(shell_script_commands, shell_script_fp) commands.append('bash %s' % shell_script_fp) commands = self._merge_to_n_commands(commands, params['jobs_to_start'], command_prefix=command_prefix, command_suffix=command_suffix) return commands, result_filepaths
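# Illustrative helper (not the parallel framework's _merge_to_n_commands):
# split a list of sample identifiers into at most n roughly equal,
# comma-joined groups, which is how the method above decides which samples
# each beta diversity job processes.
def example_group_sample_ids(sample_ids, n):
    if not sample_ids:
        return []
    n = max(1, min(n, len(sample_ids)))
    group_size = -(-len(sample_ids) // n)  # ceiling division
    return [','.join(sample_ids[i:i + group_size])
            for i in range(0, len(sample_ids), group_size)]

# e.g. example_group_sample_ids(['s1', 's2', 's3', 's4', 's5'], 2)
# returns ['s1,s2,s3', 's4,s5']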
try: interest_taxonomy = sequences_from_query(open(tax_fp, 'U'), query) except (PlatypusValueError, PlatypusParseError), e: raise BadParameter(e.message) if len(interest_taxonomy) == 0: raise BadParameter('The query could not retrieve any results, try ' 'a different one.') else: interest_taxonomy = {l.strip().split('\t')[0].strip(): '' for l in open(split_fp, 'U')} if not interest_taxonomy: raise BadParameter('The split_fp is empty!') create_dir(output_fp, False) interest_fp = open(join(output_fp, 'interest.fna'), 'w') rest_fp = open(join(output_fp, 'rest.fna'), 'w') for record in read(seqs_fp, format='fasta'): full_name = record.id seq = record.sequence name = full_name.strip().split(' ')[0].strip() if name in interest_taxonomy: interest_fp.write(">%s\n%s\n" % (full_name, seq)) else: rest_fp.write(">%s\n%s\n" % (full_name, seq))
def main(commandline_args=None): parser, opts, args = parse_command_line_parameters(**script_info) if(opts.checkpoint_fp): bp_fp = opts.checkpoint_fp if not exists(bp_fp): parser.error( 'Specified checkpoint file does not exist: %s' % bp_fp) # peek into sff.txt files to make sure they are parseable # cat_sff_fles is lazy and only reads header flowgrams, header = cat_sff_files(map(open, opts.sff_fps)) if(opts.split and opts.preprocess_fp): parser.error('Options --split and --preprocess_fp are exclusive') if(opts.preprocess_fp): pp_fp = opts.preprocess_fp if not exists(opts.preprocess_fp): parser.error( 'Specified preprocess directory does not exist: %s' % opts.preprocess_fp) if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta' % (pp_fp, pp_fp)): parser.error('Specified preprocess directory does not contain expected files: ' + 'prefix_mapping.txt and prefix_dereplicated.fasta') if opts.titanium: opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat' opts.low_cutoff = 4 opts.high_cutoff = 5 if not exists(opts.error_profile): parser.error( 'Specified error profile %s does not exist' % opts.error_profile) if opts.output_dir: # make sure it always ends on / tmpoutdir = opts.output_dir + "/" create_dir(tmpoutdir, not opts.force) else: # make random dir in current dir tmpoutdir = mkdtemp(dir="", prefix="denoiser_", suffix="/") log_fp = 'denoiser.log' if opts.split: denoise_per_sample( opts.sff_fps, opts.fasta_fp, tmpoutdir, opts.cluster, opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium) else: denoise_seqs( opts.sff_fps, opts.fasta_fp, tmpoutdir, opts.preprocess_fp, opts.cluster, opts.num_cpus, opts.squeeze, opts.percent_id, opts.bail, opts.primer, opts.low_cutoff, opts.high_cutoff, log_fp, opts.low_memory, opts.verbose, opts.error_profile, opts.max_num_iter, opts.titanium, opts.checkpoint_fp)
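# Illustrative stand-in for the files_exist() check used above, which takes a
# comma-separated list of filepaths and verifies that every one of them is
# present. This is an assumption-based sketch, not the denoiser's helper.
from os.path import exists as _example_exists

def example_files_exist(comma_separated_fps):
    return all(_example_exists(fp) for fp in comma_separated_fps.split(','))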
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) input_fps = opts.input_fps output_dir = opts.output_dir create_dir(output_dir) barcode_length = opts.barcode_length barcode_in_header = opts.barcode_in_header barcode_qual_c = opts.barcode_qual_c for input_fp in input_fps: if input_fp.endswith('.gz'): open_f = gzip_open input_basename = split(splitext(splitext(input_fp)[0])[0])[1] else: input_basename = split(splitext(input_fp)[0])[1] open_f = open sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename) sequence_output_f = open(sequence_output_fp, 'w') barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir, input_basename) barcode_output_f = open(barcode_output_fp, 'w') for line in open_f(input_fp): common_fields, sequence, sequence_qual, barcode, barcode_qual =\ iseq_to_qseq_fields( line, barcode_in_header, barcode_length, barcode_qual_c) sequence_s, pass_filter_s = illumina_data_to_fastq( (common_fields[0], common_fields[ 1], common_fields[ 2], common_fields[ 3], common_fields[ 4], common_fields[ 5], common_fields[ 6], common_fields[ 7], sequence, sequence_qual)) barcode_s, pass_filter_b = illumina_data_to_fastq( (common_fields[0], common_fields[ 1], common_fields[ 2], common_fields[ 3], common_fields[ 4], common_fields[ 5], common_fields[ 6], common_fields[ 7], barcode, barcode_qual), barcode_length) if pass_filter_s != 0: sequence_output_f.write('%s\n' % sequence_s) barcode_output_f.write('%s\n' % barcode_s) sequence_output_f.close() barcode_output_f.close()
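# The conversion above ultimately writes four-line FASTQ records. A generic,
# hypothetical sketch of that formatting is shown here; it is not QIIME's
# illumina_data_to_fastq or format_fastq_record, and the Phred+33 offset is
# assumed for the quality encoding example.
def example_format_fastq_record(header, sequence, quality):
    # header, sequence, '+' separator, and quality string
    return '@%s\n%s\n+\n%s\n' % (header, sequence, quality)

def example_encode_phred33(scores):
    # encode a list of integer Phred scores as an ASCII string (offset 33)
    return ''.join([chr(q + 33) for q in scores])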
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) verbose = opts.verbose otu_table_fp = opts.otu_table_fp output_dir = opts.output_dir mapping_fp = opts.mapping_fp tree_fp = opts.tree_fp verbose = opts.verbose print_only = opts.print_only seqs_per_sample = opts.seqs_per_sample parallel = opts.parallel # No longer checking that jobs_to_start > 2, but # commenting as we may change our minds about this. #if parallel: raise_error_on_parallel_unavailable() if opts.parameter_fp: try: parameter_f = open(opts.parameter_fp, 'U') except IOError: raise IOError( "Can't open parameters file (%s). Does it exist? Do you have read access?" % opts.parameter_fp) params = parse_qiime_parameters(parameter_f) parameter_f.close() else: params = parse_qiime_parameters([]) # empty list returns empty defaultdict for now jobs_to_start = opts.jobs_to_start default_jobs_to_start = qiime_config['jobs_to_start'] validate_and_set_jobs_to_start(params, jobs_to_start, default_jobs_to_start, parallel, option_parser) create_dir(output_dir, fail_on_exist=not opts.force) if print_only: command_handler = print_commands else: command_handler = call_commands_serially if verbose: status_update_callback = print_to_stdout else: status_update_callback = no_status_updates run_beta_diversity_through_plots( otu_table_fp=otu_table_fp, mapping_fp=mapping_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, color_by_interesting_fields_only=not opts.color_by_all_fields, sampling_depth=seqs_per_sample, tree_fp=tree_fp, parallel=parallel, suppress_emperor_plots=opts.suppress_emperor_plots, status_update_callback=status_update_callback)
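# The workflow above reads per-script options from a QIIME parameters file
# whose lines look like "pick_otus:enable_rev_strand_match True" (as
# referenced in the prefilter error message earlier). A minimal, hypothetical
# parser for that format is sketched below; it is not QIIME's
# parse_qiime_parameters.
def example_parse_parameters(lines):
    params = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, _, value = line.partition(' ')
        script, _, parameter = key.partition(':')
        params.setdefault(script, {})[parameter] = value.strip()
    return params

# e.g. example_parse_parameters(['pick_otus:enable_rev_strand_match True'])
# returns {'pick_otus': {'enable_rev_strand_match': 'True'}}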
def main(): spreadsheet_key = None with open('/data/input/AppSession.json', 'U') as fd_json: app = json.load(fd_json) # get command attributes, etc for item in app['Properties']['Items']: if item['Name'] == 'Input.Projects': project_id = item['Items'][0]['Id'] if item['Name'] == 'Input.spreadsheet-key': spreadsheet_key = item['Content'] if item['Name'] == 'Input.app-result-id': results_id = item['Content']['Id'] if item['Name'] == 'Input.rarefaction-depth': depth = item['Content'] if item['Name'] == 'Input.number-of-jobs': jobs = item['Content'] # from BaseSpace's documentation input_dir = '/data/input/appresults/' base = join('/data/output/appresults/', project_id) create_dir(base) # OTU picking input_dir = join(input_dir, results_id) mapping_fp = join(base, 'mapping-file.txt') cmd = ("load_remote_mapping_file.py " "-k {spreadsheet_key} -o {mapping_fp}") params = {'spreadsheet_key': spreadsheet_key, 'mapping_fp': mapping_fp} system_call(cmd.format(**params)) biom_fp = join(input_dir, 'otu_table.biom') tree_fp = glob(join(input_dir, '*.tree'))[0] output_dir = join(base, 'corediv-out') bt = load_table(biom_fp) if bt.is_empty(): logging.error('BIOM table is empty, cannot perform diversity ' 'analyses.') return 11 params_fp = join(base, 'alpha-params.txt') with open(params_fp, 'w') as alpha_fp: alpha_fp.write('alpha_diversity:metrics shannon,PD_whole_tree,' 'chao1,observed_species') cmd = ("core_diversity_analyses.py " "-i {biom_fp} -o {output_dir} -m {mapping_fp} -e {depth} " "-t {tree_fp} -p {params_fp}") params = {'biom_fp': biom_fp, 'output_dir': output_dir, 'mapping_fp': mapping_fp, 'depth': depth, 'jobs': jobs, 'tree_fp': tree_fp, 'params_fp': params_fp} # see https://github.com/biocore/qiime/issues/2034 if jobs != '1': cmd += ' -a -O {jobs}' system_call(cmd.format(**params)) for log_file in glob(join(output_dir, 'log_*')): with open(log_file, 'U') as fd_log: print fd_log.read() return 0
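# main() above walks app['Properties']['Items'] and pulls each property's
# 'Content' (or nested 'Items') by name. A hypothetical helper with the same
# lookup logic is sketched here for illustration; the JSON layout is taken
# from the code above and nothing else is assumed about BaseSpace.
def example_get_app_properties(app, wanted_names):
    found = {}
    for item in app.get('Properties', {}).get('Items', []):
        if item.get('Name') in wanted_names:
            found[item['Name']] = item.get('Content')
    return found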
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) read_arguments_from_file = opts.read_arguments_from_file # these arguments can optionally be read from a file, reasoning is to # allow arguments that would span over hundreds of samples and would be # prohibitive to execute as a command line call if read_arguments_from_file: # sample_ids is the only one of these arguments that's returned as a # string, the rest of them are lists if opts.sample_ids: opts.sample_ids = ','.join(parse_items(opts.sample_ids)) if opts.sequence_read_fps: opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0]) if opts.barcode_read_fps: opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0]) if opts.mapping_fps: opts.mapping_fps = parse_items(opts.mapping_fps[0]) sequence_read_fps = opts.sequence_read_fps barcode_read_fps = opts.barcode_read_fps sample_ids = None if opts.sample_ids is not None: sample_ids = opts.sample_ids.split(',') mapping_fps = opts.mapping_fps phred_quality_threshold = opts.phred_quality_threshold retain_unassigned_reads = opts.retain_unassigned_reads min_per_read_length_fraction = opts.min_per_read_length_fraction max_bad_run_length = opts.max_bad_run_length rev_comp = opts.rev_comp rev_comp_barcode = opts.rev_comp_barcode rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes seq_max_N = opts.sequence_max_n start_seq_id = opts.start_seq_id # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD # opts.filter_bad_illumina_qual_digit filter_bad_illumina_qual_digit = False store_qual_scores = opts.store_qual_scores store_demultiplexed_fastq = opts.store_demultiplexed_fastq barcode_type = opts.barcode_type max_barcode_errors = opts.max_barcode_errors # if this is not a demultiplexed run, if barcode_type == 'not-barcoded': if sample_ids is None: option_parser.error( "If not providing barcode reads (because " "your data is not multiplexed), must provide --sample_ids.") if len(sample_ids) != len(sequence_read_fps): option_parser.error( "If providing --sample_ids (because " "your data is not multiplexed), must provide the same number " "of sample ids as sequence read filepaths.") barcode_read_fps = [None] * len(sequence_read_fps) mapping_fps = [None] * len(sequence_read_fps) elif barcode_read_fps is None: option_parser.error("Must provide --barcode_read_fps if " "--barcode_type is not 'not-barcoded'") elif mapping_fps is None: option_parser.error("Must provide --mapping_fps if " "--barcode_type is not 'not-barcoded'") phred_offset = opts.phred_offset if phred_offset is not None: try: phred_offset = int(phred_offset) except ValueError: # shouldn't be able to get here... option_parser.error( "If --phred_offset is provided, it must be a valid integer.") if opts.last_bad_quality_char is not None: option_parser.error( '--last_bad_quality_char is no longer supported. ' 'Use -q instead (see option help text by passing -h)') if not (0 < min_per_read_length_fraction <= 1): option_parser.error('--min_per_read_length_fraction must be greater ' 'than 0 and less than or equal to 1. You passed ' '%1.5f.' 
% min_per_read_length_fraction) barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None) if len(mapping_fps) == 1 and len(sequence_read_fps) > 1: mapping_fps = mapping_fps * len(sequence_read_fps) if len( set([ len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps) ])) > 1: option_parser.error("Same number of sequence, barcode, and mapping " "files must be provided.") output_dir = opts.output_dir create_dir(output_dir) output_fp_temp = '%s/seqs.fna.incomplete' % output_dir output_fp = '%s/seqs.fna' % output_dir output_f = open(output_fp_temp, 'w') qual_fp_temp = '%s/qual.fna.incomplete' % output_dir qual_fp = '%s/seqs.qual' % output_dir output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir output_fastq_fp = '%s/seqs.fastq' % output_dir if store_qual_scores: qual_f = open(qual_fp_temp, 'w') # define a qual writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def qual_writer(h, q): qual_f.write('>%s\n%s\n' % (h, q)) else: def qual_writer(h, q): pass if store_demultiplexed_fastq: output_fastq_f = open(output_fastq_fp_temp, 'w') # define a fastq writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def fastq_writer(h, s, q): output_fastq_f.write(format_fastq_record(h, s, q)) else: def fastq_writer(h, s, q): pass log_fp = '%s/split_library_log.txt' % output_dir log_f = open(log_fp, 'w') histogram_fp = '%s/histograms.txt' % output_dir histogram_f = open(histogram_fp, 'w') for i in range(len(sequence_read_fps)): sequence_read_fp = sequence_read_fps[i] barcode_read_fp = barcode_read_fps[i] mapping_fp = mapping_fps[i] if mapping_fp is not None: mapping_f = open(mapping_fp, 'U') _, _, barcode_to_sample_id, _, _, _, _ = check_map( mapping_f, disable_primer_check=True, has_barcodes=barcode_read_fp is not None) else: mapping_f = None barcode_to_sample_id = {} if rev_comp_mapping_barcodes: barcode_to_sample_id = { str(DNA(k).rc()): v for k, v in barcode_to_sample_id.iteritems() } if barcode_type == 'golay_12': invalid_golay_barcodes = get_invalid_golay_barcodes( barcode_to_sample_id.keys()) if len(invalid_golay_barcodes) > 0: option_parser.error( "Some or all barcodes are not valid golay " "codes. Do they need to be reverse complemented? If these " "are not golay barcodes pass --barcode_type 12 to disable " "barcode error correction, or pass --barcode_type # if " "the barcodes are not 12 base pairs, where # is the size " "of the barcodes. 
Invalid codes:\n\t%s" % ' '.join(invalid_golay_barcodes)) log_f.write("Input file paths\n") if mapping_fp is not None: log_f.write('Mapping filepath: %s (md5: %s)\n' % (mapping_fp, safe_md5(open(mapping_fp)).hexdigest())) log_f.write('Sequence read filepath: %s (md5: %s)\n' % (sequence_read_fp, str(safe_md5(open(sequence_read_fp)).hexdigest()))) if sequence_read_fp.endswith('.gz'): sequence_read_f = gzip_open(sequence_read_fp) else: sequence_read_f = open(sequence_read_fp, 'U') seq_id = start_seq_id if barcode_read_fp is not None: log_f.write( 'Barcode read filepath: %s (md5: %s)\n\n' % (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest())) if barcode_read_fp.endswith('.gz'): barcode_read_f = gzip_open(barcode_read_fp) else: barcode_read_f = open(barcode_read_fp, 'U') seq_generator = process_fastq_single_end_read_file( sequence_read_f, barcode_read_f, barcode_to_sample_id, store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, barcode_correction_fn=barcode_correction_fn, max_barcode_errors=max_barcode_errors, phred_offset=phred_offset) else: seq_generator = process_fastq_single_end_read_file_no_barcode( sequence_read_f, sample_ids[i], store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, phred_offset=phred_offset) for fasta_header, sequence, quality, seq_id in seq_generator: output_f.write('>%s\n%s\n' % (fasta_header, sequence)) qual_writer(fasta_header, quality) fastq_writer(fasta_header, sequence, quality) start_seq_id = seq_id + 1 log_f.write('\n---\n\n') output_f.close() rename(output_fp_temp, output_fp) # process the optional output files, as necessary if store_qual_scores: qual_f.close() rename(qual_fp_temp, qual_fp) if store_demultiplexed_fastq: output_fastq_f.close() rename(output_fastq_fp_temp, output_fastq_fp)
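# The demultiplexer above binds qual_writer/fastq_writer once, before the
# per-record loop, so the loop never re-checks whether optional output is
# enabled. A small, hypothetical version of that pattern (mirroring the
# fasta-style qual_writer) is sketched below.
def example_make_optional_writer(store_output, out_f):
    if store_output:
        def writer(header, data):
            out_f.write('>%s\n%s\n' % (header, data))
    else:
        def writer(header, data):
            # no-op writer: optional output is disabled
            pass
    return writer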
def compare(interest_fp, other_fp, output_dir='blast-results-compare', interest_pcts=None, interest_alg_lens=None, other_pcts=None, other_alg_lens=None, hits_to_first=False, hits_to_second=False): """Compare two databases and write the outputs Parameters ---------- interest_fp : str BLAST results when searching against the database of interest. other_fp : str BLAST results when searching against the other database. output_dir : str, optional Name of the output file path. interest_pcts : list, optional Minimum percentage identity to be considered as a valid result in the interest database search results. If None is passed, it defaults to `[70]`. interest_alg_lens : list, optional Minimum alignment length to be considered a valid result in the interest database search results. If None is passed, it defaults to `[50]`. other_pcts : list, optional Minimum percentage identity to be considered as a valid result in the other database search results. If None is passed, it defaults to `[70]`. other_alg_lens : list, optional Minimum alignment length to be considered a valid result in the other database search results. If None is passed, it defaults to `[50]`. hits_to_first : bool, optional defaults to False Outputs the labels and counts of the sequences being hit in the first database. hits_to_second : bool, optional defaults to False Outputs the labels and counts of the sequences being hit in the second database. Raises ------ click.BadParameter If the `interest_pcts` and the `other_pcts` lists are of different length. If the `interest_alg_lens` and the `other_alg_lens` lists are of different length. """ if interest_pcts is None: interest_pcts = [70] if interest_alg_lens is None: interest_alg_lens = [50] db_a = open(interest_fp, 'U') db_b = open(other_fp, 'U') # try to create the output directory, if it exists, just continue create_dir(output_dir, False) # run some validations on the input parameters if other_pcts: if len(interest_pcts) != len(other_pcts): raise BadParameter("The percentage values for both databases " "should be the same length: %s - %s" % (interest_pcts, other_pcts)) else: other_pcts = interest_pcts if other_alg_lens: if len(interest_alg_lens) != len(other_alg_lens): raise BadParameter("The alignment length values for both databases" " should be the same length: %s - %s" % (interest_alg_lens, other_alg_lens)) else: other_alg_lens = interest_alg_lens # process databases total_queries, best_hits = parse_first_database(db_a, interest_pcts, interest_alg_lens) parse_second_database(db_b, best_hits, other_pcts, other_alg_lens) # parse results results = process_results(interest_pcts, interest_alg_lens, other_pcts, other_alg_lens, best_hits) # Collating output and writing full results for i, item in enumerate(results): filename = join(output_dir, "summary_" + item['filename'] + ".txt") with open(filename, 'w') as fd: fd.write('\n'.join(item['summary'])) if i == 0: combined_results = [] combined_results.append(['filename']) combined_results.append( ['interest db (%s)' % basename(interest_fp)]) combined_results.append(['other db (%s)' % basename(other_fp)]) combined_results.append(['only interest']) combined_results.append(['both dbs']) combined_results.append(['no hits in interest db']) no_hits = total_queries - item['db_interest'] - item['db_other'] - \ item['perfect_interest'] - item['equal'] combined_results[0].append(item['filename']) combined_results[1].append(str(item['db_interest'])) combined_results[2].append(str(item['db_other']))
combined_results[3].append(str(item['perfect_interest'])) combined_results[4].append(str(item['equal'])) combined_results[5].append(str(no_hits)) # tiny helper function to save hits files def save_hits(data, name): s_hits = sorted(data, key=itemgetter(1), reverse=True) filename = join(output_dir, name) with open(filename, 'w') as fd: fd.write('\n'.join( ['%s\t%d' % (k, v) for k, v in s_hits if v != 0])) if hits_to_first: save_hits(item['db_seqs_counts']['a'].items(), "hits_to_first_db_%s.txt" % item['filename']) if hits_to_second: save_hits(item['db_seqs_counts']['b'].items(), "hits_to_second_db_%s.txt" % item['filename']) # saving collated results with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output: compiled_output.write('\n'.join( ['\t'.join(item) for item in combined_results])) fn = join(output_dir, "compile_output_no_nohits.txt") with open(fn, 'w') as fd: fd.write('\n'.join(['\t'.join(item) for item in combined_results[:-1]]))
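# compare() above keeps only BLAST hits that meet minimum percent identity
# and alignment length thresholds. The sketch below applies those two
# thresholds to a tab-delimited BLAST table, assuming the legacy tabular
# column order (percent identity in column 3, alignment length in column 4).
# It is illustrative only, not parse_first_database/parse_second_database.
def example_filter_blast_hits(lines, min_pct_id, min_aln_len):
    kept = []
    for line in lines:
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.rstrip('\n').split('\t')
        pct_id = float(fields[2])
        aln_len = int(fields[3])
        if pct_id >= min_pct_id and aln_len >= min_aln_len:
            kept.append(fields)
    return kept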
def pick_subsampled_open_reference_otus(input_fp, refseqs_fp, output_dir, percent_subsample, new_ref_set_id, command_handler, params, qiime_config, prefilter_refseqs_fp=None, run_assign_tax=True, run_align_and_tree=True, prefilter_percent_id=None, min_otu_size=2, step1_otu_map_fp=None, step1_failures_fasta_fp=None, parallel=False, suppress_step4=False, logger=None, suppress_md5=False, suppress_index_page=False, denovo_otu_picking_method='uclust', reference_otu_picking_method='uclust_ref', status_update_callback=print_to_stdout, minimum_failure_threshold=100000): """ Run the data preparation steps of Qiime The steps performed by this function are: - Pick reference OTUs against refseqs_fp - Subsample the failures to n sequences. - Pick OTUs de novo on the n failures. - Pick representative sequences for the resulting OTUs. - Pick reference OTUs on all failures using the representative set from step 4 as the reference set. """ # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust'] allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref', 'sortmerna'] assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\ "Unknown de novo OTU picking method: %s. Known methods are: %s"\ % (denovo_otu_picking_method, ','.join(allowed_denovo_otu_picking_methods)) assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\ "Unknown reference OTU picking method: %s. Known methods are: %s"\ % (reference_otu_picking_method, ','.join(allowed_reference_otu_picking_methods)) # Prepare some variables for the later steps index_links = [] input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) create_dir(output_dir) commands = [] if logger is None: log_fp = generate_log_fp(output_dir) logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config) close_logger_on_success = True index_links.append( ('Run summary data', log_fp, _index_headers['run_summary'])) else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp]) # if the user has not passed a different reference collection for the pre-filter, # used the main refseqs_fp. 
this is useful if the user wants to provide a smaller # reference collection, or to use the input reference collection when running in # iterative mode (rather than an iteration's new refseqs) if prefilter_refseqs_fp is None: prefilter_refseqs_fp = refseqs_fp # Step 1: Closed-reference OTU picking on the input file (if not already # complete) if step1_otu_map_fp and step1_failures_fasta_fp: step1_dir = '%s/step1_otus' % output_dir create_dir(step1_dir) logger.write("Using pre-existing reference otu map and failures.\n\n") else: if prefilter_percent_id is not None: prefilter_dir = '%s/prefilter_otus/' % output_dir prefilter_failures_list_fp = '%s/%s_failures.txt' % \ (prefilter_dir, input_basename) prefilter_pick_otu_cmd = pick_reference_otus( input_fp, prefilter_dir, reference_otu_picking_method, prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id) commands.append( [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)]) prefiltered_input_fp = '%s/prefiltered_%s%s' %\ (prefilter_dir, input_basename, input_ext) filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\ (input_fp, prefiltered_input_fp, prefilter_failures_list_fp) commands.append( [('Filter prefilter failures from input', filter_fasta_cmd)]) index_links.append( ('Pre-filtered sequence identifiers ' '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100), prefilter_failures_list_fp, _index_headers['sequences'])) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] input_fp = prefiltered_input_fp input_dir, input_filename = split(input_fp) input_basename, input_ext = splitext(input_filename) if getsize(prefiltered_input_fp) == 0: raise ValueError( "All sequences were discarded by the prefilter. " "Are the input sequences in the same orientation " "in your input file and reference file (you can " "add 'pick_otus:enable_rev_strand_match True' to " "your parameters file if not)? 
Are you using the " "correct reference file?") # Build the OTU picking command step1_dir = \ '%s/step1_otus' % output_dir step1_otu_map_fp = \ '%s/%s_otus.txt' % (step1_dir, input_basename) step1_pick_otu_cmd = pick_reference_otus( input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger) commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)]) # Build the failures fasta file step1_failures_list_fp = '%s/%s_failures.txt' % \ (step1_dir, input_basename) step1_failures_fasta_fp = \ '%s/failures.fasta' % step1_dir step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (input_fp, step1_failures_list_fp, step1_failures_fasta_fp) commands.append([('Generate full failures fasta file', step1_filter_fasta_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] step1_repset_fasta_fp = \ '%s/step1_rep_set.fna' % step1_dir step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step1_otu_map_fp, step1_repset_fasta_fp, input_fp) commands.append([('Pick rep set', step1_pick_rep_set_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # name the final otu map merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir # count number of sequences in step 1 failures fasta file with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f: num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f) # number of failures sequences is greater than the threshold, # continue to step 2,3 and 4 run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold if run_step_2_and_3: # Subsample the failures fasta file to retain (roughly) the # percent_subsample step2_dir = '%s/step2_otus/' % output_dir create_dir(step2_dir) step2_input_fasta_fp = \ '%s/subsampled_failures.fasta' % step2_dir subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample) logger.write('# Subsample the failures fasta file using API \n' + 'python -c "import qiime; qiime.util.subsample_fasta' + '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp), abspath( step2_input_fasta_fp), percent_subsample)) # Prep the OTU picking command for the subsampled failures step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger) step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir commands.append([('Pick de novo OTUs for new clusters', step2_cmd)]) # Prep the rep set picking command for the subsampled failures step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp) commands.append( [('Pick representative set for subsampled failures', step2_rep_set_cmd)]) step3_dir = '%s/step3_otus/' % output_dir step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir # remove the indexed reference database from the dictionary of # parameters as it must be forced to build a new database # using the step2_repset_fasta_fp if reference_otu_picking_method == 'sortmerna': if 'sortmerna_db' in params['pick_otus']: del params['pick_otus']['sortmerna_db'] step3_cmd = pick_reference_otus( step1_failures_fasta_fp, step3_dir, reference_otu_picking_method, step2_repset_fasta_fp, parallel, 
params, logger) commands.append([ ('Pick reference OTUs using de novo rep set', step3_cmd)]) index_links.append( ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")', merged_otu_map_fp, _index_headers['otu_maps'])) if not suppress_step4: step4_dir = '%s/step4_otus/' % output_dir if run_step_2_and_3: step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\ (step1_failures_fasta_fp, step3_failures_list_fp, step3_failures_fasta_fp) commands.append([('Create fasta file of step3 failures', step3_filter_fasta_cmd)]) failures_fp = step3_failures_fasta_fp failures_otus_fp = 'failures_failures_otus.txt' failures_step = 'step3' else: failures_fp = step1_failures_fasta_fp failures_otus_fp = 'failures_otus.txt' failures_step = 'step1' step3_otu_map_fp = "" step4_cmd = pick_denovo_otus(failures_fp, step4_dir, '.'.join([new_ref_set_id, 'CleanUp']), denovo_otu_picking_method, params, logger) step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp) commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)]) # Merge the otu maps; note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created cat_otu_tables_cmd = 'cat %s %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\ (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp) commands.append( [('Pick representative set for step4 failures', step4_rep_set_cmd)]) else: # Merge the otu maps; note that we are explicitly using the '>' operator # otherwise passing the --force flag on the script interface would # append the newly created maps to the map that was previously created if run_step_2_and_3: failures_fp = step3_failures_list_fp else: failures_fp = step1_failures_list_fp step3_otu_map_fp = "" cat_otu_tables_cmd = 'cat %s %s > %s' %\ (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp) commands.append([('Merge OTU maps', cat_otu_tables_cmd)]) # Move the step 3 failures file to the top-level directory commands.append([('Move final failures file to top-level directory', 'mv %s %s/final_failures.txt' % (failures_fp, output_dir))]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] otu_fp = merged_otu_map_fp # Filter singletons from the otu map otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir, min_otu_size) otus_to_keep = filter_otus_from_otu_map( otu_fp, otu_no_singletons_fp, min_otu_size) index_links.append(('Final map of OTU identifier to sequence identifiers excluding ' 'OTUs with fewer than %d sequences' % min_otu_size, otu_no_singletons_fp, _index_headers['otu_maps'])) logger.write('# Filter singletons from the otu map using API \n' + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' + '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp), abspath( otu_no_singletons_fp), min_otu_size)) # make the final representative seqs file and a new refseqs file that # could be used in subsequent otu picking runs. # this is clunky. first, we need to do this without singletons to match # the otu map without singletons. next, there is a difference in what # we need the reference set to be and what we need the repseqs to be. 
# the reference set needs to be a superset of the input reference set # to this run. the repset needs to be only the sequences that were observed # in this data set, and we want reps for the step1 reference otus to be # reads from this run so we don't hit issues building a tree using # sequences of very different lengths. so... final_repset_fp = '%s/rep_set.fna' % output_dir index_links.append( ('OTU representative sequences', final_repset_fp, _index_headers['sequences'])) final_repset_f = open(final_repset_fp, 'w') new_refseqs_fp = '%s/new_refseqs.fna' % output_dir index_links.append( ('New reference sequences (i.e., OTU representative sequences plus input ' 'reference sequences)', new_refseqs_fp, _index_headers['sequences'])) # write non-singleton otus representative sequences from step1 to the # final rep set file for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) logger.write('# Write non-singleton otus representative sequences ' + 'from step1 to the final rep set file: %s\n\n' % final_repset_fp) # copy the full input refseqs file to the new refseqs_fp copyfile(refseqs_fp, new_refseqs_fp) new_refseqs_f = open(new_refseqs_fp, 'a') new_refseqs_f.write('\n') logger.write('# Copy the full input refseqs file to the new refseq file\n' + 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp)) # iterate over all representative sequences from step2 and step4 and write # those corresponding to non-singleton otus to the final representative set # file and the new reference sequences file. if run_step_2_and_3: for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) if not suppress_step4: for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')): if otu_id.split()[0] in otus_to_keep: new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq)) final_repset_f.write('>%s\n%s\n' % (otu_id, seq)) new_refseqs_f.close() final_repset_f.close() # steps 1-4 executed if run_step_2_and_3: logger.write('# Write non-singleton otus representative sequences from ' + 'step 2 and step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # only steps 1 and 4 executed else: logger.write('# Write non-singleton otus representative sequences from ' + 'step 4 to the final representative set and the new reference' + ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp)) # Prep the make_otu_table.py command otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size) make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\ (otu_no_singletons_fp, otu_table_fp) commands.append([("Make the otu table", make_otu_table_cmd)]) index_links.append( ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size, otu_table_fp, _index_headers['otu_tables'])) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] # initialize output file names - these differ based on what combination of # taxonomy assignment and alignment/tree building is happening. 
if run_assign_tax and run_align_and_tree: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) align_and_tree_input_otu_table = otu_table_w_tax_fp index_links.append( ('OTU table excluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table excluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) elif run_assign_tax: tax_input_otu_table_fp = otu_table_fp otu_table_w_tax_fp = \ '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table excluding OTUs with fewer than %d sequences and including OTU ' 'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp, _index_headers['otu_tables'])) elif run_align_and_tree: align_and_tree_input_otu_table = otu_table_fp pynast_failure_filtered_otu_table_fp = \ '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir, min_otu_size) index_links.append( ('OTU table excluding OTUs with fewer than %d sequences and sequences that ' 'fail to align with PyNAST' % min_otu_size, pynast_failure_filtered_otu_table_fp, _index_headers['otu_tables'])) if run_assign_tax: if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0: logger.write( "Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp) else: # remove files from partially completed runs remove_files([otu_table_w_tax_fp], error_on_missing=False) taxonomy_fp = assign_tax( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) index_links.append( ('OTU taxonomic assignments', taxonomy_fp, _index_headers['taxa_assignments'])) # Add taxa to otu table add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\ (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp) commands.append([("Add taxa to OTU table", add_metadata_cmd)]) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if run_align_and_tree: rep_set_tree_fp = join(output_dir, 'rep_set.tre') index_links.append( ('OTU phylogenetic tree', rep_set_tree_fp, _index_headers['trees'])) if exists(pynast_failure_filtered_otu_table_fp) and\ getsize(pynast_failure_filtered_otu_table_fp) > 0: logger.write("Final output file exists (%s). Will not rebuild." 
% pynast_failure_filtered_otu_table_fp) else: # remove files from partially completed runs remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False) pynast_failures_fp = align_and_tree( repset_fasta_fp=final_repset_fp, output_dir=output_dir, command_handler=command_handler, params=params, qiime_config=qiime_config, parallel=parallel, logger=logger, status_update_callback=status_update_callback) # Build OTU table without PyNAST failures table = load_table(align_and_tree_input_otu_table) filtered_otu_table = filter_otus_from_otu_table(table, get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')), 0, inf, 0, inf, negate_ids_to_keep=True) write_biom_table(filtered_otu_table, pynast_failure_filtered_otu_table_fp) command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False) commands = [] if close_logger_on_success: logger.close() if not suppress_index_page: index_fp = '%s/index.html' % output_dir generate_index_page(index_links, index_fp)
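# The workflow above relies on filter_otus_from_otu_map to drop OTUs smaller
# than min_otu_size from the merged map. The helper below is only an
# illustrative sketch of that idea (not QIIME's implementation); it assumes the
# usual OTU-map layout of one tab-delimited line per OTU, with the OTU id
# followed by its member sequence ids.
def _sketch_filter_otu_map(otu_map_fp, filtered_otu_map_fp, min_otu_size=2):
    """Keep OTUs with at least min_otu_size member sequences; return their ids."""
    kept_otu_ids = set()
    with open(otu_map_fp, 'U') as in_f, open(filtered_otu_map_fp, 'w') as out_f:
        for line in in_f:
            fields = line.rstrip('\n').split('\t')
            otu_id, member_seq_ids = fields[0], fields[1:]
            if len(member_seq_ids) >= min_otu_size:
                out_f.write(line)
                kept_otu_ids.add(otu_id)
    return kept_otu_ids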
def split_db(tax_fp, seqs_fp, query, output_fp, split_fp): """Split a database into a part that matches a query and a part that doesn't Parameters ---------- tax_fp : str Tab-delimited file with two columns, the name/identifier of the sequence and its taxonomy. The sequence identifier is the longest string before a space in the header of the sequence. seqs_fp : str Path to a FASTA formatted file to split into sequences of interest and the rest. Note: sequence identifiers must match the ones in the taxonomy file. query : str The query used to split the database, for example: salmonella. The query must be an exact match (no wildcards); it can contain spaces and is case insensitive. output_fp : str Output folder path where the results are stored. split_fp : str The tab-delimited query file, where each line describes a different sequence and the first column is the sequence id. Raises ------ BadParameter If the taxonomy file is empty. If the query retrieved no results. """ if query is not None: # query the taxonomy file for the required sequence identifiers try: interest_taxonomy = sequences_from_query(open(tax_fp, 'U'), query) except (PlatypusValueError, PlatypusParseError) as e: raise BadParameter(e.message) if len(interest_taxonomy) == 0: raise BadParameter('The query could not retrieve any results, try ' 'a different one.') else: interest_taxonomy = { l.strip().split('\t')[0].strip(): '' for l in open(split_fp, 'U') } if not interest_taxonomy: raise BadParameter('The split_fp is empty!') create_dir(output_fp, False) interest_fp = open(join(output_fp, 'interest.fna'), 'w') rest_fp = open(join(output_fp, 'rest.fna'), 'w') for record in read(seqs_fp, format='fasta'): full_name = record.id seq = record.sequence name = full_name.strip().split(' ')[0].strip() if name in interest_taxonomy: interest_fp.write(">%s\n%s\n" % (full_name, seq)) else: rest_fp.write(">%s\n%s\n" % (full_name, seq)) interest_fp.close() rest_fp.close()
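# A minimal, hypothetical call to split_db; the file names below are
# placeholders for illustration only, not files shipped with this code.
def _example_split_db_call():
    # Partition a reference database on a single case-insensitive query.
    split_db(tax_fp='taxonomy.txt',    # two columns: sequence id <tab> taxonomy
             seqs_fp='reference.fna',  # FASTA whose ids match the taxonomy file
             query='salmonella',       # matched exactly, no wildcards
             output_fp='split_out',    # interest.fna and rest.fna written here
             split_fp=None)            # only read when query is None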
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # Create the output dir if it doesn't already exist. out_dir = opts.output_dir try: create_dir(out_dir) except: option_parser.error("Could not create or access output directory " "specified with the -o option.") map_f = open(opts.mapping_fp, 'U') dm_f = open(opts.distance_matrix_fp, 'U') fields = map(strip, opts.fields.split(',')) fields = [field.strip('"').strip("'") for field in fields] color_individual_within_by_field = opts.color_individual_within_by_field results = make_distance_boxplots( dm_f, map_f, fields, width=opts.width, height=opts.height, suppress_all_within=opts.suppress_all_within, suppress_all_between=opts.suppress_all_between, suppress_individual_within=opts.suppress_individual_within, suppress_individual_between=opts.suppress_individual_between, y_min=opts.y_min, y_max=opts.y_max, whisker_length=opts.whisker_length, box_width=opts.box_width, box_color=opts.box_color, color_individual_within_by_field=color_individual_within_by_field, sort=opts.sort) for field, plot_figure, plot_data, plot_labels, plot_colors in results: output_plot_fp = join(out_dir, "%s_Distances.%s" % (field, opts.imagetype)) plot_figure.savefig(output_plot_fp, format=opts.imagetype, transparent=opts.transparent) if not opts.suppress_significance_tests: sig_tests_f = open(join(out_dir, "%s_Stats.txt" % field), 'w') sig_tests_results = all_pairs_t_test( plot_labels, plot_data, tail_type=opts.tail_type, num_permutations=opts.num_permutations) sig_tests_f.write(sig_tests_results) sig_tests_f.close() if opts.save_raw_data: # Write the raw plot data into a tab-delimited file. assert (len(plot_labels) == len(plot_data)) raw_data_fp = join(out_dir, "%s_Distances.txt" % field) raw_data_f = open(raw_data_fp, 'w') for label, data in zip(plot_labels, plot_data): raw_data_f.write(label.replace(" ", "_") + "\t") raw_data_f.write("\t".join(map(str, data))) raw_data_f.write("\n") raw_data_f.close()
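# The script above normalizes a comma-separated, optionally quoted option value
# (opts.fields) before using it. A condensed sketch of that parsing;
# parse_option_list is a made-up name, not part of the QIIME API.
def parse_option_list(value):
    """Example: '"Treatment", DOB' -> ['Treatment', 'DOB']"""
    items = [item.strip() for item in value.split(',')]
    return [item.strip('"').strip("'") for item in items]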
def main(): spreadsheet_key = None with open('/data/input/AppSession.json', 'U') as fd_json: app = json.load(fd_json) # get command attributes, etc for item in app['Properties']['Items']: if item['Name'] == 'Input.Projects': project_id = item['Items'][0]['Id'] if item['Name'] == 'Input.spreadsheet-key': spreadsheet_key = item['Content'] if item['Name'] == 'Input.app-result-id': results_id = item['Content']['Id'] if item['Name'] == 'Input.rarefaction-depth': depth = item['Content'] if item['Name'] == 'Input.number-of-jobs': jobs = item['Content'] # from BaseSpace's documentation input_dir = '/data/input/appresults/' base = join('/data/output/appresults/', project_id) create_dir(base) # OTU picking input_dir = join(input_dir, results_id) mapping_fp = join(base, 'mapping-file.txt') cmd = ("load_remote_mapping_file.py " "-k {spreadsheet_key} -o {mapping_fp}") params = {'spreadsheet_key': spreadsheet_key, 'mapping_fp': mapping_fp} system_call(cmd.format(**params)) biom_fp = join(input_dir, 'otu_table.biom') tree_fp = glob(join(input_dir, '*.tree'))[0] output_dir = join(base, 'corediv-out') bt = load_table(biom_fp) if bt.is_empty(): logging.error('BIOM table is empty, cannot perform diversity ' 'analyses.') return 11 params_fp = join(base, 'alpha-params.txt') with open(params_fp, 'w') as alpha_fp: alpha_fp.write('alpha_diversity:metrics shannon,PD_whole_tree,' 'chao1,observed_species') cmd = ("core_diversity_analyses.py " "-i {biom_fp} -o {output_dir} -m {mapping_fp} -e {depth} " "-t {tree_fp} -p {params_fp}") params = { 'biom_fp': biom_fp, 'output_dir': output_dir, 'mapping_fp': mapping_fp, 'depth': depth, 'jobs': jobs, 'tree_fp': tree_fp, 'params_fp': params_fp } # see https://github.com/biocore/qiime/issues/2034 if jobs != '1': cmd += ' -a -O {jobs}' system_call(cmd.format(**params)) for log_file in glob(join(output_dir, 'log_*')): with open(log_file, 'U') as fd_log: print fd_log.read() return 0
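# The AppSession parsing above walks Properties -> Items and matches entries by
# Name. A condensed sketch of that lookup against a toy payload; the structure
# shown is only an assumption about the fields this script relies on.
def _example_appsession_lookup():
    app = {'Properties': {'Items': [
        {'Name': 'Input.spreadsheet-key', 'Content': 'abc123'},
        {'Name': 'Input.rarefaction-depth', 'Content': '1000'},
    ]}}
    items_by_name = {item['Name']: item for item in app['Properties']['Items']}
    return items_by_name['Input.spreadsheet-key']['Content']  # -> 'abc123'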
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # Create the output dir if it doesn't already exist. try: create_dir(opts.output_dir) except: option_parser.error("Could not create or access output directory " "specified with the -o option.") # Parse the distance matrix and mapping file. try: dist_matrix_header, dist_matrix = parse_distmat( open(opts.distance_matrix_fp, 'U')) except: option_parser.error( "This does not look like a valid distance matrix " "file. Please supply a valid distance matrix file using the -d " "option.") try: mapping, mapping_header, mapping_comments = parse_mapping_file( open(opts.mapping_fp, 'U')) except QiimeParseError: option_parser.error( "This does not look like a valid metadata mapping " "file. Please supply a valid mapping file using the -m option.") # Make sure the y_min and y_max options make sense, as they can be either # 'auto' or a number. y_min = opts.y_min y_max = opts.y_max try: y_min = float(y_min) except ValueError: if y_min == 'auto': y_min = None else: option_parser.error("The --y_min option must be either a number " "or 'auto'.") try: y_max = float(y_max) except ValueError: if y_max == 'auto': y_max = None else: option_parser.error("The --y_max option must be either a number " "or 'auto'.") # Parse the field states that will be compared to every other field state. comparison_field_states = opts.comparison_groups comparison_field_states = map(strip, comparison_field_states.split(',')) comparison_field_states = [ field_state.strip('"').strip("'") for field_state in comparison_field_states ] if comparison_field_states is None: option_parser.error("You must provide at least one field state to " "compare (using the -c option).") # Get distance comparisons between each field state and each of the # comparison field states. field = opts.field comparison_groupings = get_field_state_comparisons( dist_matrix_header, dist_matrix, mapping_header, mapping, field, comparison_field_states) # Grab a list of all field states that had the comparison field states # compared against them. These will be plotted along the x-axis. field_states = comparison_groupings.keys() def custom_comparator(x, y): try: num_x = float(x) num_y = float(y) return int(num_x - num_y) except: if x < y: return -1 elif x > y: return 1 else: return 0 # Sort the field states as numbers if the elements are numbers, else sort # them lexically. field_states.sort(custom_comparator) # If the label type is numeric, get a list of all field states in sorted # numeric order. These will be used to determine the spacing of the # field state 'points' along the x-axis. x_spacing = None if opts.label_type == "numeric": try: x_spacing = sorted(map(float, field_states)) except: option_parser.error("The 'numeric' label type is invalid because " "not all field states could be converted into " "numbers. Please specify a different label " "type.") # Accumulate the data for each field state 'point' along the x-axis. plot_data = [] plot_x_axis_labels = [] for field_state in field_states: field_state_data = [] for comp_field_state in comparison_field_states: field_state_data.append( comparison_groupings[field_state][comp_field_state]) plot_data.append(field_state_data) plot_x_axis_labels.append(field_state) # Plot the data and labels. plot_title = "Distance Comparisons" plot_x_label = field plot_y_label = "Distance" # If we are creating a bar chart or box plot, grab a list of good data # colors to use. 
plot_type = opts.plot_type plot_colors = None if plot_type == "bar" or plot_type == "box": plot_colors = [ matplotlib_rgb_color(data_colors[color].toRGB()) for color in data_color_order ] assert plot_data, "Error: there is no data to plot!" width = opts.width height = opts.height if width <= 0 or height <= 0: option_parser.error("The specified width and height of the image must " "be greater than zero.") plot_figure = grouped_distributions( opts.plot_type, plot_data, x_values=x_spacing, data_point_labels=plot_x_axis_labels, distribution_labels=comparison_field_states, distribution_markers=plot_colors, x_label=plot_x_label, y_label=plot_y_label, title=plot_title, x_tick_labels_orientation=opts.x_tick_labels_orientation, y_min=y_min, y_max=y_max, whisker_length=opts.whisker_length, error_bar_type=opts.error_bar_type, distribution_width=opts.distribution_width, figure_width=width, figure_height=height) # Save the plot in the specified format. output_plot_fp = join( opts.output_dir, "%s_Distance_Comparisons.%s" % (field, opts.imagetype)) plot_figure.savefig(output_plot_fp, format=opts.imagetype, transparent=opts.transparent) if not opts.suppress_significance_tests: sig_tests_f = open(join(opts.output_dir, "%s_Stats.txt" % field), 'w') # Rearrange the plot data into a format suitable for all_pairs_t_test. sig_tests_labels = [] sig_tests_data = [] for data_point, data_point_label in zip(plot_data, plot_x_axis_labels): for dist, comp_field in zip(data_point, comparison_field_states): sig_tests_labels.append('%s vs %s' % (data_point_label, comp_field)) sig_tests_data.append(dist) sig_tests_results = all_pairs_t_test( sig_tests_labels, sig_tests_data, tail_type=opts.tail_type, num_permutations=opts.num_permutations) sig_tests_f.write(sig_tests_results) sig_tests_f.close() if opts.save_raw_data: # Write the raw plot data into a tab-delimited file, where each line # has the distances between a comparison group and another field state # 'point' along the x-axis. assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\ "labels do not match the number of points along the x-axis." raw_data_fp = join(opts.output_dir, "%s_Distance_Comparisons.txt" % field) raw_data_f = open(raw_data_fp, 'w') raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n") for label, data in zip(plot_x_axis_labels, plot_data): assert (len(comparison_field_states) == len(data)), "The " +\ "number of specified comparison groups does not match " +\ "the number of groups found at the current point along " +\ "the x-axis." for comp_field_state, comp_grp_data in zip(comparison_field_states, data): raw_data_f.write(comp_field_state + "\t" + label + "\t" + "\t".join(map(str, comp_grp_data)) + "\n") raw_data_f.close()
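# field_states above are sorted with a cmp-style comparator whose numeric branch
# returns int(num_x - num_y); that truncates toward zero, so values that differ
# by less than 1 (e.g. 1.2 vs 1.5) compare as equal. A key-based alternative
# that keeps the numeric-then-lexical intent is sketched below (an illustration,
# not a drop-in replacement for the code above).
def _numeric_or_lexical_key(value):
    try:
        return (0, float(value), '')
    except ValueError:
        return (1, 0.0, value)

# Example: sorted(['10', '2', 'control'], key=_numeric_or_lexical_key)
# -> ['2', '10', 'control']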