def setUp(self):
    self.mapping_file_data = MAPPING_FILE_DATA
    self.mapping_file_headers = ['SampleID', 'BarcodeSequence',
                                 'LinkerPrimerSequence', 'Treatment', 'DOB',
                                 'Description']
    self.valid_columns = ['Treatment', 'DOB']
    self.support_files_filename = get_qiime_temp_dir()
    self.support_files_filename_spaces = join(
        get_qiime_temp_dir(), 'Directory With Spaces/AndNoSpaces')

    # data for the custom axes, contains columns that are gradients
    self.mapping_file_data_gradient = MAPPING_FILE_DATA_GRADIENT
    self.mapping_file_headers_gradient = ['SampleID', 'Treatment', 'Time',
                                          'Weight', 'Description']

    self.coords_header = ['PC.355', 'PC.635', 'PC.636', 'PC.354']
    self.coords_data = COORDS_DATA
    self.coords_eigenvalues = array([1, 2, 3, 4])
    self.coords_pct = array([40, 30, 20, 10])

    # jackknifed test data
    self.jk_mapping_file_headers = ['SampleID', 'C2', 'C3', 'C4']
    self.jk_mapping_file_data = [['1', 'a', 'b', 'c'],
                                 ['2', 'd', 'e', 'f'],
                                 ['3', 'g', 'h', 'i']]
    self.jk_coords_header = [['1', '2', '3'], ['1', '2', '3'],
                             ['1', '2', '3'], ['1', '2', '3']]
    self.jk_coords_data = [array([[1.2, 0.1, -1.2], [-2.5, -4.0, 4.5]]),
                           array([[-1.4, 0.05, 1.3], [2.6, 4.1, -4.7]]),
                           array([[-1.5, 0.05, 1.6], [2.4, 4.0, -4.8]]),
                           array([[-1.5, 0.05, 1.6], [2.4, 4.0, -4.8]])]
    self.jk_coords_eigenvalues = [array([0.80, .11, 0.09]),
                                  array([0.76, .20, 0.04]),
                                  array([0.84, .14, 0.02]),
                                  array([0.84, .11, 0.05])]
    self.jk_coords_pcts = [array([0.80, .10, 0.10]),
                           array([0.76, .21, 0.03]),
                           array([0.84, .11, 0.05]),
                           array([0.84, .15, 0.01])]

    self.jk_mapping_file_data_gradient = MAPPING_FILE_DATA_GRADIENT
    self.jk_mapping_file_headers_gradient = ['SampleID', 'Treatment', 'Time',
                                             'Weight', 'Description']
    self.jk_coords_header_gradient = [['PC.354', 'PC.355', 'PC.635', 'PC.636'],
                                      ['PC.354', 'PC.355', 'PC.635', 'PC.636'],
                                      ['PC.354', 'PC.355', 'PC.635', 'PC.636'],
                                      ['PC.354', 'PC.355', 'PC.635', 'PC.636']]
    self.jk_coords_data_gradient = [
        array([[1.2, 0.1, -1.2, 1.1], [-2.5, -4.0, 4.5, 0.3],
               [.5, -0.4, 3.5, 1.001], [0.67, 0.23, 1.01, 2.2]]),
        array([[1.2, 1, -0.2, 0.1], [-2.5, -4.0, 4.5, 3.2],
               [.5, -0.4, 3.5, 1.00], [0.57, 0.27, 0.95, 2.1]]),
        array([[1.0, 1, -1.2, 1.1], [-2.1, -2.0, 3.5, 0.3],
               [.5, 3, 3.5, 2], [0.60, 0.33, 1.3, 2.0]]),
        array([[1.2, 0.1, -1.2, 1.1], [-2.5, -4.0, 4.5, 0.3],
               [.5, -0.4, 3.5, 1.001], [0.69, 0.20, 1.01, 2.2]])]
    self.jk_coords_eigenvalues_gradient = [array([0.80, .11, 0.09, 0.0]),
                                           array([0.76, .20, 0.04, 0.0]),
                                           array([0.84, .14, 0.02, 0.0]),
                                           array([0.84, .11, 0.05, 0.0])]
    self.jk_coords_pcts_gradient = [array([0.80, .10, 0.10, 0.0]),
                                    array([0.76, .21, 0.03, 0.0]),
                                    array([0.84, .11, 0.05, 0.0]),
                                    array([0.84, .15, 0.01, 0])]

    self.broken_mapping_file_data = BROKEN_MAPPING_FILE
    self.broken_mapping_file_data_2_values = BROKEN_MAPPING_FILE_2_VALUES
def __call__(self, query_fasta_fp, database_fasta_fp, output_dir,
             observation_metadata_fp=None, params=None, HALT_EXEC=False):
    """ Call the DatabaseMapper """
    if params is None:
        params = {}

    create_dir(output_dir)
    raw_output_fp = self._get_raw_output_fp(output_dir, params)
    output_observation_map_fp = '%s/observation_map.txt' % output_dir
    output_biom_fp = '%s/observation_table.biom' % output_dir
    log_fp = '%s/observation_table.log' % output_dir

    self._assign_dna_reads_to_database(
        query_fasta_fp=query_fasta_fp,
        database_fasta_fp=database_fasta_fp,
        raw_output_fp=raw_output_fp,
        temp_dir=get_qiime_temp_dir(),
        params=params,
        HALT_EXEC=HALT_EXEC)
    self._process_raw_output(raw_output_fp, log_fp,
                             output_observation_map_fp)
    self._generate_biom_output(output_observation_map_fp, output_biom_fp,
                               observation_metadata_fp)
def _generate_training_files(self):
    """Returns a tuple of file objects suitable for passing to the
    RdpTrainer application controller.
    """
    tmp_dir = get_qiime_temp_dir()
    training_set = RdpTrainingSet()
    reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
    id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')

    for seq_id, seq in MinimalFastaParser(reference_seqs_file):
        training_set.add_sequence(seq_id, seq)

    for line in id_to_taxonomy_file:
        seq_id, lineage_str = map(strip, line.split('\t'))
        training_set.add_lineage(seq_id, lineage_str)

    training_set.dereplicate_taxa()

    rdp_taxonomy_file = NamedTemporaryFile(
        prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
    rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
    rdp_taxonomy_file.seek(0)

    rdp_training_seqs_file = NamedTemporaryFile(
        prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
        dir=tmp_dir)
    for rdp_id, seq in training_set.get_training_seqs():
        rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
    rdp_training_seqs_file.seek(0)

    self._training_set = training_set

    return rdp_taxonomy_file, rdp_training_seqs_file
def setUp(self): """ """ self.test_data = get_test_data_fps() self.files_to_remove = [] self.dirs_to_remove = [] # Create example output directory tmp_dir = get_qiime_temp_dir() self.test_out = mkdtemp(dir=tmp_dir, prefix='core_qiime_analyses_test_', suffix='') self.dirs_to_remove.append(self.test_out) self.qiime_config = load_qiime_config() self.params = parse_qiime_parameters(params_f1) # suppress stderr during tests (one of the systems calls in the # workflow prints a warning, and we can't suppress that warning with # warnings.filterwarnings) here because it comes from within the code # executed through the system call. Found this trick here: # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python self.saved_stderr = sys.stderr sys.stderr = StringIO() initiate_timeout(180)
def multiple_file_DA_DESeq2(input_dir, output_dir, mapping_fp,
                            mapping_category, subcategory_1, subcategory_2,
                            DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a
    directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or isdir(fname))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname + '.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                    subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        outfile = join(output_dir, 'DESeq2_DA_' + base_fname + '.txt')
        outfile_diagnostic = join(output_dir, 'DESeq2_diagnostic_plots_' +
                                  base_fname + '.pdf')

        with tempfile.NamedTemporaryFile(
                dir=get_qiime_temp_dir(),
                prefix='QIIME-differential-abundance-temp-table-',
                suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, mapping_category,
                       subcategory_1, subcategory_2,
                       DESeq2_diagnostic_plots, outfile_diagnostic)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = mkdtemp(dir=tmp_dir, prefix='qiime_parallel_tests_', suffix='') self.dirs_to_remove.append(self.test_out) fd, self.template_fp = mkstemp(dir=self.test_out, prefix='qiime_template', suffix='.fasta') close(fd) template_f = open(self.template_fp, 'w') template_f.write(pynast_test1_template_fasta) template_f.close() self.files_to_remove.append(self.template_fp) fd, self.inseqs1_fp = mkstemp(dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') close(fd) inseqs1_f = open(self.inseqs1_fp, 'w') inseqs1_f.write(inseqs1) inseqs1_f.close() self.files_to_remove.append(self.inseqs1_fp) initiate_timeout(60)
def setUp(self):
    self.files_to_remove = []
    self.dirs_to_remove = []

    # Create example output directory
    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Create example input file
    self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    inseqs1_f = open(self.inseqs1_fp, 'w')
    inseqs1_f.write(inseqs1)
    inseqs1_f.close()
    self.files_to_remove.append(self.inseqs1_fp)

    # Define number of seconds a test can run for before timing out
    # and failing
    initiate_timeout(60)
def multiple_file_DA_fitZIG(input_dir, output_dir, mapping_fp,
                            mapping_category, subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential
    abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or isdir(fname))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname + '.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                    subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        # make temporary json biom version - R currently does not have hdf5
        outfile = join(output_dir, 'fitZIG_DA_' + base_fname + '.txt')

        with tempfile.NamedTemporaryFile(
                dir=get_qiime_temp_dir(),
                prefix='QIIME-differential-abundance-temp-table-',
                suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_fitZIG(temp_fh.name, outfile, mapping_category,
                       subcategory_1, subcategory_2)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = mkdtemp(dir=tmp_dir, prefix='qiime_parallel_blaster_tests_', suffix='') self.dirs_to_remove.append(self.test_out) fd, self.tmp_seq_filepath = mkstemp(dir=self.test_out, prefix='qiime_parallel_blaster_tests_input', suffix='.fasta') close(fd) seq_file = open(self.tmp_seq_filepath, 'w') seq_file.write(blast_test_seqs) seq_file.close() self.files_to_remove.append(self.tmp_seq_filepath) self.reference_seqs_file = NamedTemporaryFile( prefix='qiime_parallel_blaster_tests_ref_seqs', suffix='.fasta', dir=tmp_dir) self.reference_seqs_file.write(blast_ref_seqs) self.reference_seqs_file.seek(0) initiate_timeout(60)
def setUp(self):
    self.files_to_remove = []
    self.dirs_to_remove = []

    # Create example output directory
    tmp_dir = get_qiime_temp_dir()
    self.test_out = mkdtemp(dir=tmp_dir,
                            prefix='core_qiime_analyses_test_',
                            suffix='')
    self.dirs_to_remove.append(self.test_out)

    # Get input data
    self.test_data = get_test_data_fps()

    self.qiime_config = load_qiime_config()
    self.qiime_config['jobs_to_start'] = 2
    self.qiime_config['seconds_to_sleep'] = 1

    # Suppress stderr during tests (one of the system calls in the
    # workflow prints a warning, and we can't suppress that warning with
    # warnings.filterwarnings here because it comes from within the code
    # executed through the system call). Found this trick here:
    # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
    self.saved_stderr = sys.stderr
    sys.stderr = StringIO()

    # Define number of seconds a test can run for before timing out
    # and failing
    initiate_timeout(600)
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta funcs always catches all seqs
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 through 999
    for i in range(1, 1000):
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        # remove the files now, so if the test fails they still get
        # cleaned up
        remove_files(actual)

        # building seq collections from infile and the split files result
        # in equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename(tmp_dir=tmp_dir, prefix='qiime_parallel_taxonomy_assigner_tests_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) self.tmp_seq_filepath = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_parallel_taxonomy_assigner_tests_input', suffix='.fasta') seq_file = open(self.tmp_seq_filepath, 'w') seq_file.write(blast_test_seqs.toFasta()) seq_file.close() self.files_to_remove.append(self.tmp_seq_filepath) self.id_to_taxonomy_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy', suffix='.txt',dir=tmp_dir) self.id_to_taxonomy_file.write(blast_id_to_taxonomy) self.id_to_taxonomy_file.seek(0) self.reference_seqs_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs', suffix='.fasta',dir=tmp_dir) self.reference_seqs_file.write(blast_reference_seqs.toFasta()) self.reference_seqs_file.seek(0) initiate_timeout(60)
def setUp(self): """Defines data that will be used by the tests.""" self.files_to_remove = [] self.dirs_to_remove = [] # Create temp directory to hold input and output. self.test_dir = mkdtemp(dir=get_qiime_temp_dir(), prefix='qiime_compare_categories_tests_') self.dirs_to_remove.append(self.test_dir) # Create input files under our temp dir. self.dm_fp = join(self.test_dir, 'dm.txt') dm_f = open(self.dm_fp, 'w') dm_f.write(dm_str) dm_f.close() self.files_to_remove.append(self.dm_fp) self.invalid_dm_fp = join(self.test_dir, 'invalid_dm.txt') invalid_dm_f = open(self.invalid_dm_fp, 'w') invalid_dm_f.write(invalid_dm_str) invalid_dm_f.close() self.files_to_remove.append(self.invalid_dm_fp) self.map_fp = join(self.test_dir, 'map.txt') map_f = open(self.map_fp, 'w') map_f.write(map_str) map_f.close() self.files_to_remove.append(self.map_fp) self.cat_methods = ['adonis', 'anosim', 'mrpp', 'permanova', 'permdisp', 'dbrda'] self.num_methods = ['best', 'morans_i'] self.cat_categories = ['Treatment'] self.num_categories = ['DOB'] self.num_perms = 42
def test_mothur_supported_version(self):
    """mothur is in path and version is supported
    """
    acceptable_version = (1, 25, 0)
    self.assertTrue(
        which("mothur"),
        "mothur not found. This may or may not be a problem depending on "
        "which components of QIIME you plan to use.")

    # mothur creates a log file in cwd, so create a tmp and cd there first
    log_file = join(get_qiime_temp_dir(), "mothur.log")
    command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
    stdout, stderr, exit_status = qiime_system_call(command)

    # remove log file
    remove_files([log_file], error_on_missing=False)

    version_string = stdout.strip().split(" ")[1].strip("v.")
    try:
        version = tuple(map(int, version_string.split(".")))
        pass_test = version == acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout

    self.assertTrue(
        pass_test,
        "Unsupported mothur version. %s is required, but running %s."
        % (".".join(map(str, acceptable_version)), version_string))
def test_temp_dir(self):
    """temp_dir is set to a valid path"""
    temp_dir = get_qiime_temp_dir()
    self.assertTrue(exists(temp_dir),
                    "temp_dir does not exist: %s" % temp_dir)
    self.assertTrue(isdir(temp_dir),
                    "temp_dir is not a directory: %s" % temp_dir)
    self.assertTrue(access(temp_dir, W_OK),
                    "temp_dir is not writable: %s" % temp_dir)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename(tmp_dir=tmp_dir, prefix='qiime_parallel_tests_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) self.template_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_template', suffix='.fasta') template_f = open(self.template_fp,'w') template_f.write(pynast_test1_template_fasta) template_f.close() self.files_to_remove.append(self.template_fp) self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') inseqs1_f = open(self.inseqs1_fp,'w') inseqs1_f.write(inseqs1) inseqs1_f.close() self.files_to_remove.append(self.inseqs1_fp) initiate_timeout(60)
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']
    actual = split_fasta(infile, 2, filename_prefix)

    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def setUp(self): """Define some sample data that will be used by the tests.""" # Standard recipients file with two recipients, one with multiple email # addresses. self.recipients = ["# a comment", " ", " foo1\[email protected] ", "foo2\t [email protected], [email protected],[email protected] "] # An empty recipients file. self.empty_recipients = ["# a comment", " ", "\n\t\t\t\t"] # Standard participants list. self.participants = ["# a comment", " ", " foo1 ", "foo2"] # Invalid (duplicate) participants list. self.duplicate_participants = ["foo1", "foo2", "foo1"] # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'my_microbes_tests_' self.start_dir = getcwd() self.dirs_to_remove = [] self.files_to_remove = [] self.tmp_dir = get_qiime_temp_dir() if not exists(self.tmp_dir): makedirs(self.tmp_dir) # If test creates the temp dir, also remove it. self.dirs_to_remove.append(self.tmp_dir) # Set up temporary input and output directories. self.output_dir = mkdtemp(dir=self.tmp_dir, prefix='%soutput_dir_' % self.prefix) self.dirs_to_remove.append(self.output_dir) # Set up temporary input and output directories. self.input_dir = mkdtemp(dir=self.tmp_dir, prefix='%sinput_dir_' % self.prefix) self.dirs_to_remove.append(self.input_dir) # Data that will be used by the tests. self.otu_cat_sig_gut_fp = join(self.input_dir, 'otu_cat_sig_gut.txt') otu_cat_sig_gut_f = open(self.otu_cat_sig_gut_fp, 'w') otu_cat_sig_gut_f.write(otu_cat_sig_gut_text) otu_cat_sig_gut_f.close() self.files_to_remove.append(self.otu_cat_sig_gut_fp) self.otu_cat_sig_palm_fp = join(self.input_dir, 'otu_cat_sig_palm.txt') otu_cat_sig_palm_f = open(self.otu_cat_sig_palm_fp, 'w') otu_cat_sig_palm_f.write(otu_cat_sig_gut_text) otu_cat_sig_palm_f.close() self.files_to_remove.append(self.otu_cat_sig_palm_fp) self.rep_seqs_fp = join(self.input_dir, 'rep_seqs.fna') rep_seqs_f = open(self.rep_seqs_fp, 'w') rep_seqs_f.write(rep_seqs_text) rep_seqs_f.close() self.files_to_remove.append(self.rep_seqs_fp)
def setUp(self): """ """ self.files_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_fp = get_tmp_filename(tmp_dir=tmp_dir, prefix='bufWriterTest', suffix='.txt') self.files_to_remove.append(self.test_fp)
def __call__(self, seq_path, result_path=None, log_path=None):
    """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified, dumps the result
        to the desired path instead of returning it.
    log_path: path to log, which should include dump of params.
    """
    if log_path:
        self.writeLog(log_path)

    reference_sequences_fp = self.Params["reference_sequences_fp"]
    assert reference_sequences_fp, \
        "Must provide reference_sequences_fp when calling an RtaxTaxonAssigner."

    id_to_taxonomy_fp = self.Params["id_to_taxonomy_fp"]
    assert id_to_taxonomy_fp, \
        "Must provide id_to_taxonomy_fp when calling an RtaxTaxonAssigner."

    # delimiter = self.Params['delimiter']

    read_1_seqs_fp = self.Params["read_1_seqs_fp"]
    assert read_1_seqs_fp, \
        "Must provide read_1_seqs_fp when calling an RtaxTaxonAssigner."

    # following params may all be null
    read_2_seqs_fp = self.Params["read_2_seqs_fp"]
    single_ok = self.Params["single_ok"]
    no_single_ok_generic = self.Params["no_single_ok_generic"]

    header_id_regex = self.Params["header_id_regex"]
    assert header_id_regex, (
        "Must not provide empty header_id_regex when calling an "
        "RtaxTaxonAssigner; leave unset to use default if in doubt.")

    read_id_regex = self.Params["read_id_regex"]
    amplicon_id_regex = self.Params["amplicon_id_regex"]

    # seq_file = open(seq_path, 'r')
    results = rtax.assign_taxonomy(
        seq_path, reference_sequences_fp, id_to_taxonomy_fp,
        read_1_seqs_fp, read_2_seqs_fp, single_ok=single_ok,
        no_single_ok_generic=no_single_ok_generic,
        header_id_regex=header_id_regex, read_id_regex=read_id_regex,
        amplicon_id_regex=amplicon_id_regex, output_fp=result_path,
        log_path=log_path, base_tmp_dir=get_qiime_temp_dir())

    return results
def setUp(self): """ """ self.files_to_remove = [] tmp_dir = get_qiime_temp_dir() fd, self.test_fp = mkstemp(dir=tmp_dir, prefix='bufWriterTest', suffix='.txt') close(fd) self.files_to_remove.append(self.test_fp)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    # load candidate sequences
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' characters with '-' characters
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    try:
        template_alignment = LoadSeqs(data=template_alignment, moltype=DNA,
                                      aligned=DenseAlignment)
    except KeyError as e:
        raise KeyError(('Only ACGT-. characters can be contained in template '
                        'alignments. The offending character was: %s') % e)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        for seq in pynast_failed:
            fail_file.write(seq.toFasta())
            fail_file.write('\n')
        fail_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        for seq in pynast_aligned:
            result_file.write(seq.toFasta())
            result_file.write('\n')
        result_file.close()
        return None
    else:
        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            return {}
def normalize_DESeq2(input_path, out_path, DESeq_negatives_to_zero):
    """performs DESeq2VS normalization on a single raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-normalize-table-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, DESeq_negatives_to_zero)
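# A minimal usage sketch for normalize_DESeq2 above (not from the source; the
# file paths are hypothetical). The helper round-trips the table through a
# temporary JSON BIOM file because the downstream R script cannot read HDF5.
normalize_DESeq2('otu_table.biom', 'otu_table_DESeq2.biom',
                 DESeq_negatives_to_zero=True)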
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    # load candidate sequences
    seq_file = open(seq_path, 'U')
    candidate_sequences = parse_fasta(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in parse_fasta(open(template_alignment_fp)):
        # replace '.' characters with '-' characters
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    template_alignment = Alignment.from_fasta_records(
        template_alignment, DNASequence, validate=True)

    # initialize_logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    for i, seq in enumerate(pynast_failed):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_failed[i] = skb_seq
    pynast_failed = SequenceCollection(pynast_failed)

    for i, seq in enumerate(pynast_aligned):
        skb_seq = DNASequence(str(seq), id=seq.Name)
        pynast_aligned[i] = skb_seq
    pynast_aligned = Alignment(pynast_aligned)

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        fail_file.write(pynast_failed.to_fasta())
        fail_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        result_file.write(pynast_aligned.to_fasta())
        result_file.close()
        return None
    else:
        return pynast_aligned
def __call__(self, seq_path, result_path=None, log_path=None):
    """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified, dumps the result
        to the desired path instead of returning it.
    log_path: path to log, which should include dump of params.
    """
    tmp_dir = get_qiime_temp_dir()

    min_conf = self.Params["Confidence"]
    training_data_properties_fp = self.Params["training_data_properties_fp"]
    reference_sequences_fp = self.Params["reference_sequences_fp"]
    id_to_taxonomy_fp = self.Params["id_to_taxonomy_fp"]
    max_memory = self.Params["max_memory"]

    seq_file = open(seq_path, "U")
    if reference_sequences_fp and id_to_taxonomy_fp:
        # Train and assign taxonomy
        taxonomy_file, training_seqs_file = self._generate_training_files()
        results = rdp_classifier.train_rdp_classifier_and_assign_taxonomy(
            training_seqs_file,
            taxonomy_file,
            seq_file,
            min_confidence=min_conf,
            classification_output_fp=result_path,
            max_memory=max_memory,
            tmp_dir=tmp_dir)
        if result_path is None:
            results = self._training_set.fix_results(results)
        else:
            self._training_set.fix_output_file(result_path)
    else:
        # Just assign taxonomy, using properties file if passed
        if training_data_properties_fp:
            fix_ranks = False
        else:
            fix_ranks = True
        results = rdp_classifier.assign_taxonomy(
            seq_file,
            min_confidence=min_conf,
            output_fp=result_path,
            training_data_fp=training_data_properties_fp,
            max_memory=max_memory,
            fixrank=fix_ranks,
            tmp_dir=tmp_dir)

    if log_path:
        self.writeLog(log_path)

    return results
def run_fitZIG(input_path, out_path, mapping_category, subcategory_1,
               subcategory_2):
    """Run metagenomeSeq's fitZIG algorithm through Rscript
    """
    # set options
    command_args = ['-i %s -o %s -c %s -x %s -y %s'
                    % (input_path, out_path, mapping_category,
                       subcategory_1, subcategory_2)]
    # instantiate the object
    rsl = RExecutor(TmpDir=get_qiime_temp_dir())
    # run the app
    app_result = rsl(command_args=command_args, script_name='fitZIG.r')

    return app_result
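# A minimal usage sketch for run_fitZIG above (the paths and category values
# are hypothetical). The input table must already be JSON BIOM, as the
# multiple-file wrapper in this collection arranges before calling it.
app_result = run_fitZIG('temp_table.biom', 'fitZIG_DA_Treatment.txt',
                        'Treatment', 'Control', 'Fast')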
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error("Must pass -m if passing -s. (Sorry about this, "
                            "it's for backwards-compatibility.)")

    min_args = 2
    if len(args) < min_args:
        option_parser.error("Exactly two arguments are required.")

    output_dir = get_qiime_temp_dir()
    run_commands(output_dir, open(args[0]), args[1],
                 submit_jobs=opts.submit_jobs, keep_temp=True)
def setUp(self): """ """ tmp_dir = get_qiime_temp_dir() self.test_out = mkdtemp(dir=tmp_dir, prefix='qiime_parallel_tests_', suffix='') self.dirs_to_remove = [self.test_out] self.output_fp = join(self.test_out, 'fmap.txt') self.failure_fp = join(self.test_out, 'fail.txt') self.usearch_fp = join(self.test_out, 'out.uc') self.bl6_fp = join(self.test_out, 'out.bl6') self.log_fp = join(self.test_out, 'fmap.log') self.files_to_remove = [self.output_fp, self.failure_fp, self.usearch_fp, self.log_fp, self.bl6_fp] fd, self.refseqs1_fp = mkstemp(dir=self.test_out, prefix='qiime_refseqs', suffix='.fasta') close(fd) refseqs1_f = open(self.refseqs1_fp, 'w') refseqs1_f.write(refseqs1) refseqs1_f.close() self.files_to_remove.append(self.refseqs1_fp) fd, self.refseqs2_fp = mkstemp(dir=self.test_out, prefix='qiime_refseqs', suffix='.fasta') close(fd) refseqs2_f = open(self.refseqs2_fp, 'w') refseqs2_f.write(refseqs2) refseqs2_f.close() self.files_to_remove.append(self.refseqs2_fp) fd, self.inseqs1_fp = mkstemp(dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') close(fd) inseqs1_f = open(self.inseqs1_fp, 'w') inseqs1_f.write(inseqs1) inseqs1_f.close() self.files_to_remove.append(self.inseqs1_fp) fd, self.inseqs2_fp = mkstemp(dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') close(fd) inseqs2_f = open(self.inseqs2_fp, 'w') inseqs2_f.write(inseqs2) inseqs2_f.close() self.files_to_remove.append(self.inseqs2_fp) initiate_timeout(60)
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    if not params['blast_db']:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['template_fp'],
                                           output_dir=get_qiime_temp_dir())
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db

    if params['min_length'] < 0:
        params['min_length'] = compute_min_alignment_length(
            open(input_fp, 'U'))
def DA_fitZIG(input_path, out_path, mapping_fp, mapping_category,
              subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential
    abundance testing"""
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-differential-abundance-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_fitZIG(temp_fh.name, out_path, mapping_category,
                   subcategory_1, subcategory_2)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename(tmp_dir=tmp_dir, prefix='qiime_parallel_tests_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) self.refseqs1_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_refseqs', suffix='.fasta') refseqs1_f = open(self.refseqs1_fp, 'w') refseqs1_f.write(refseqs1) refseqs1_f.close() self.files_to_remove.append(self.refseqs1_fp) self.refseqs2_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_refseqs', suffix='.fasta') refseqs2_f = open(self.refseqs2_fp, 'w') refseqs2_f.write(refseqs2) refseqs2_f.close() self.files_to_remove.append(self.refseqs2_fp) self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') inseqs1_f = open(self.inseqs1_fp, 'w') inseqs1_f.write(inseqs1) inseqs1_f.close() self.files_to_remove.append(self.inseqs1_fp) self.inseqs2_fp = get_tmp_filename(tmp_dir=self.test_out, prefix='qiime_inseqs', suffix='.fasta') inseqs2_f = open(self.inseqs2_fp, 'w') inseqs2_f.write(inseqs2) inseqs2_f.close() self.files_to_remove.append(self.inseqs2_fp) initiate_timeout(60)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename( tmp_dir=tmp_dir, prefix='qiime_parallel_taxonomy_assigner_tests_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) # Temporary input file self.tmp_seq_filepath = get_tmp_filename( tmp_dir=self.test_out, prefix='qiime_parallel_taxonomy_assigner_tests_input', suffix='.fasta') seq_file = open(self.tmp_seq_filepath, 'w') seq_file.write(rdp_test_seqs) seq_file.close() self.files_to_remove.append(self.tmp_seq_filepath) self.id_to_taxonomy_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy', suffix='.txt', dir=tmp_dir) self.id_to_taxonomy_file.write(rdp_id_to_taxonomy) self.id_to_taxonomy_file.seek(0) self.reference_seqs_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs', suffix='.fasta', dir=tmp_dir) self.reference_seqs_file.write(rdp_reference_seqs) self.reference_seqs_file.seek(0) jar_fp = getenv('RDP_JAR_PATH') jar_basename = basename(jar_fp) if '2.2' not in jar_basename: raise ApplicationError( "RDP_JAR_PATH does not point to version 2.2 of the " "RDP Classifier.") initiate_timeout(60)
def setUp(self): """Set up files/environment that will be used by the tests.""" # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'generate_taxa_compare_table_tests' self.start_dir = getcwd() self.dirs_to_remove = [] self.files_to_remove = [] self.tmp_dir = get_qiime_temp_dir() if not exists(self.tmp_dir): makedirs(self.tmp_dir) # if test creates the temp dir, also remove it self.dirs_to_remove.append(self.tmp_dir) # setup temporary root input directory self.root_dir = mkdtemp(dir=self.tmp_dir, prefix='%s_root_dir_' % self.prefix) self.dirs_to_remove.append(self.root_dir) L18S_dir = '/L18S-1/blast_1.0/' makedirs(self.root_dir + L18S_dir) self.L18S_fp = self.root_dir + L18S_dir + '/otu_table_mc2_w_taxa_L5.txt' with open(self.L18S_fp, 'w') as f: f.writelines(L18S_L5_blast_one_multiple_assign_output) self.files_to_remove.append(self.L18S_fp) # setup temporary key directory self.key_dir = mkdtemp(dir=self.tmp_dir, prefix='%s_key_dir_' % self.prefix) self.dirs_to_remove.append(self.key_dir) self.key_fp = self.key_dir + '/L18S_key.txt' with open(self.key_fp, 'w') as f: f.writelines(L18S_key) self.files_to_remove.append(self.key_fp) self.bad_key = self.key_dir + '/L18S_key.txt' # setup temporary output directory self.output_dir = mkdtemp(dir=self.tmp_dir, prefix='%s_output_dir_' % self.prefix) self.dirs_to_remove.append(self.output_dir) initiate_timeout(60)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) < min_args:
        option_parser.error('Exactly two arguments are required.')

    output_dir = get_qiime_temp_dir()
    run_commands(output_dir, open(args[0]), args[1],
                 submit_jobs=opts.submit_jobs, keep_temp=True,
                 queue_name=opts.queue_name)
def normalize_CSS(input_path, out_path, output_CSS_statistics):
    """performs metagenomeSeq's CSS normalization on a single raw abundance
    OTU matrix
    """
    tmp_bt = load_table(input_path)
    if output_CSS_statistics:
        base_fname, ext = splitext(out_path)
        output_CSS_statistics = base_fname + '_CSS_statistics.txt'

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-normalize-table-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_CSS(temp_fh.name, out_path,
                output_CSS_statistics=output_CSS_statistics)
def setUp(self): """Set up files/environment that will be used by the tests.""" # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'tax2tree_controller_tests' self.start_dir = getcwd() self.dirs_to_remove = [] self.files_to_remove = [] self.tmp_dir = get_qiime_temp_dir() if not exists(self.tmp_dir): makedirs(self.tmp_dir) # if test creates the temp dir, also remove it self.dirs_to_remove.append(self.tmp_dir) initiate_timeout(60)
def setUp(self): """Define some test data.""" self.tmp_dir = get_qiime_temp_dir() self.otu_table1 = Table(data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T, sample_ids=list('XYZ'), observation_ids=list('abcd')) fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir, prefix='alpha_diversity_tests', suffix='.biom') close(fd) write_biom_table(self.otu_table1, self.otu_table1_fp) self.otu_table2 = Table(data=array([[2, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]]).T, sample_ids=list('XYZ'), observation_ids=['a', 'b', 'c', 'd_']) fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir, prefix='alpha_diversity_tests', suffix='.biom') close(fd) write_biom_table(self.otu_table2, self.otu_table2_fp) self.single_sample_otu_table = Table( data=array([[2, 0, 0, 1]]).T, sample_ids=list('X'), observation_ids=list( 'abcd')) fd, self.single_sample_otu_table_fp = mkstemp( dir=self.tmp_dir, prefix='alpha_diversity_tests', suffix='.biom') close(fd) write_biom_table(self.single_sample_otu_table, self.single_sample_otu_table_fp) self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);') self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);") self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp, self.single_sample_otu_table_fp]
def setUp(self):
    self.tmp_dir = get_qiime_temp_dir()

    # Temporary input file
    fd, self.tmp_otu_fp = mkstemp(dir=self.tmp_dir,
                                  prefix='R_test_otu_table_',
                                  suffix='.biom')
    close(fd)
    seq_file = open(self.tmp_otu_fp, 'w')
    seq_file.write(test_otu_table)
    seq_file.close()

    self.tmp_otu_fp_out_CSS = '%s/R_test_otu_table_out_CSS.biom' % (
        str(self.tmp_dir))
    self.tmp_otu_fp_out_DESeq = '%s/R_test_otu_table_out_DESeq.biom' % (
        str(self.tmp_dir))

    self.files_to_remove = [self.tmp_otu_fp,
                            self.tmp_otu_fp_out_CSS,
                            self.tmp_otu_fp_out_DESeq]
def setUp(self): """ """ self.test_data = get_test_data_fps() self.files_to_remove = [] self.dirs_to_remove = [] # Create example output directory tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename(tmp_dir=tmp_dir, prefix='core_qiime_analyses_test_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) self.qiime_config = load_qiime_config() self.params = parse_qiime_parameters([]) initiate_timeout(60)
def setUp(self):
    self.tmp_dir = get_qiime_temp_dir()

    self.otu_table_data = np.array([[2, 1, 0],
                                    [0, 5, 0],
                                    [0, 3, 0],
                                    [1, 2, 0]])
    self.sample_names = list('YXZ')
    self.taxon_names = list('bacd')
    self.otu_metadata = [{'domain': 'Archaea'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'},
                         {'domain': 'Bacteria'}]

    self.otu_table = Table(self.otu_table_data,
                           self.taxon_names,
                           self.sample_names)
    self.otu_table_meta = Table(self.otu_table_data,
                                self.taxon_names,
                                self.sample_names,
                                observation_metadata=self.otu_metadata)

    fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                    prefix='test_rarefaction',
                                    suffix='.biom')
    close(fd)
    fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='test_rarefaction',
                                         suffix='.biom')
    close(fd)

    self.rare_dir = mkdtemp(dir=self.tmp_dir,
                            prefix='test_rarefaction_dir',
                            suffix='')

    write_biom_table(self.otu_table, self.otu_table_fp)
    write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

    self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
    self._dirs_to_clean_up = [self.rare_dir]
def DA_DESeq2(input_path, out_path, mapping_fp, mapping_category,
              subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a
    raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')
    base_fname, ext = splitext(out_path)
    outfile_diagnostic = join(base_fname + '_diagnostic_plots.pdf')

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-differential-abundance-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, mapping_category, subcategory_1,
                   subcategory_2, DESeq2_diagnostic_plots,
                   outfile_diagnostic)
def multiple_file_normalize_DESeq2(input_dir, output_dir,
                                   DESeq_negatives_to_zero):
    """performs DESeq2VS normalization on a directory of raw abundance OTU
    matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or isdir(fname))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname + '.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        outfile = join(output_dir, 'DESeq2_' + base_fname + '.biom')

        with tempfile.NamedTemporaryFile(
                dir=get_qiime_temp_dir(),
                prefix='QIIME-normalize-table-temp-table-',
                suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, DESeq_negatives_to_zero)
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file
    """
    filename_prefix = get_tmp_filename(tmp_dir=get_qiime_temp_dir(),
                                       prefix='split_fasta_tests',
                                       suffix='',
                                       result_constructor=str)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']
    actual = split_fasta(infile, 1, filename_prefix)

    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    self.assertEqual(actual, expected)
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']
    actual = split_fasta(infile, 1, filename_prefix)

    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    self.assertEqual(actual, expected)
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = get_tmp_filename( tmp_dir=tmp_dir, prefix='qiime_parallel_taxonomy_assigner_tests_', suffix='', result_constructor=str) self.dirs_to_remove.append(self.test_out) create_dir(self.test_out) self.tmp_seq_filepath = get_tmp_filename( tmp_dir=self.test_out, prefix='qiime_parallel_taxonomy_assigner_tests_input', suffix='.fasta') seq_file = open(self.tmp_seq_filepath, 'w') seq_file.write(uclust_test_seqs.toFasta()) seq_file.close() self.files_to_remove.append(self.tmp_seq_filepath) self.id_to_taxonomy_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy', suffix='.txt', dir=tmp_dir) self.id_to_taxonomy_file.write(uclust_id_to_taxonomy) self.id_to_taxonomy_file.seek(0) self.reference_seqs_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs', suffix='.fasta', dir=tmp_dir) self.reference_seqs_file.write(uclust_reference_seqs.toFasta()) self.reference_seqs_file.seek(0) initiate_timeout(60)
def submit_jobs(commands, prefix):
    """submit jobs using exe pointed to by cluster_jobs_fp.

    commands: List of commands (strings) that should be executed
    prefix: A unique prefix used to name the submit script
    """
    qiime_config = load_qiime_config()
    CLUSTER_JOBS_SCRIPT = qiime_config['cluster_jobs_fp']

    if not CLUSTER_JOBS_SCRIPT:
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not set in config file!")
    if not (exists(CLUSTER_JOBS_SCRIPT) or app_path(CLUSTER_JOBS_SCRIPT)):
        raise ApplicationNotFoundError(
            "cluster_jobs_fp not in $PATH or provided as full path!")

    outfilename = join(get_qiime_temp_dir(), "%s_commands.txt" % prefix)
    fh = open(outfilename, "w")
    fh.write("\n".join(commands))
    fh.close()

    cmd = '%s -ms %s %s' % (CLUSTER_JOBS_SCRIPT, outfilename, prefix)
    system(cmd)
    remove(outfilename)
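# A minimal usage sketch for submit_jobs above (the shell commands are
# hypothetical and cluster_jobs_fp must be configured in qiime_config);
# each string becomes one line of the submit script handed to the
# cluster-jobs executable.
submit_jobs(['pick_otus.py -i seqs1.fasta', 'pick_otus.py -i seqs2.fasta'],
            prefix='POTU')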
def setUp(self): """setup the test values""" # define test data self.fasta_seqs_of_rand_bcs = fasta_seqs_of_rand_bcs self.fasta_seqs_for_cluster_ratio = fasta_seqs_for_cluster_ratio self.fasta_seqs_for_consensus = fasta_seqs_for_consensus self.fwd_read_data = fwd_read_data.split() self.rev_read_data = rev_read_data.split() self.mapping_data = mapping_data self.fasta_seq_for_primer = fasta_seq_for_primer self.possible_primers = possible_primers self.fasta_seqs_for_consensus_tie_G_C = \ fasta_seqs_for_consensus_tie_G_C self.fasta_seqs_for_consensus_unequal_length = \ fasta_seqs_for_consensus_unequal_length self.min_difference_in_clusters = min_difference_in_clusters self.temp_dir = get_qiime_temp_dir() self.mapping_fp = NamedTemporaryFile( delete=False, mode='w', dir=self.temp_dir) self.mapping_fp.write(self.mapping_data) self.mapping_fp_name = self.mapping_fp.name self.mapping_fp.close() self.mapping_fp = open(self.mapping_fp_name, 'r') self.seqs_with_no_consensus = seqs_with_no_consensus self.false_primers = false_primers self.barcode_len = barcode_len self.barcode_correction_fn = barcode_correction_fn self.max_barcode_errors = max_barcode_errors self.fwd_length = fwd_length self.rev_length = fwd_length self.bc_to_sid = bc_to_sid self.bc_to_fwd_primers = bc_to_fwd_primers self.bc_to_rev_primers = bc_to_rev_primers self.min_difference_in_bcs = min_difference_in_bcs self.min_reads_per_random_bc = min_reads_per_random_bc self.max_cluster_ratio = max_cluster_ratio
def run_DESeq2(input_path, out_path, mapping_category, subcategory_1,
               subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic):
    """Run DESeq2 negative binomial Wald algorithm through Rscript
    """
    # set options
    if DESeq2_diagnostic_plots:
        command_args = ['-i %s -o %s -c %s -x %s -y %s -d %s -e %s'
                        % (input_path, out_path, mapping_category,
                           subcategory_1, subcategory_2,
                           DESeq2_diagnostic_plots, outfile_diagnostic)]
    else:
        command_args = ['-i %s -o %s -c %s -x %s -y %s'
                        % (input_path, out_path, mapping_category,
                           subcategory_1, subcategory_2)]
    # instantiate the object
    rsl = RExecutor(TmpDir=get_qiime_temp_dir())
    # run the app
    app_result = rsl(command_args=command_args,
                     script_name='DESeq2_nbinom.r')

    return app_result
def setUp(self): """Set up files/environment that will be used by the tests.""" # The prefix to use for temporary files. This prefix may be added to, # but all temp dirs and files created by the tests will have this # prefix at a minimum. self.prefix = 'multiple_assign_taxonomy_tests' self.start_dir = getcwd() self.dirs_to_remove = [] self.files_to_remove = [] self.tmp_dir = get_qiime_temp_dir() if not exists(self.tmp_dir): makedirs(self.tmp_dir) # if test creates the temp dir, also remove it self.dirs_to_remove.append(self.tmp_dir) # setup temporary output directories self.output_dir = mkdtemp(dir=self.tmp_dir, prefix='%s_output_dir_' % self.prefix) self.dirs_to_remove.append(self.output_dir) initiate_timeout(60)
def setUp(self): """ """ self.files_to_remove = [] self.dirs_to_remove = [] tmp_dir = get_qiime_temp_dir() self.test_out = mkdtemp( dir=tmp_dir, prefix='qiime_parallel_taxonomy_assigner_tests_', suffix='') self.dirs_to_remove.append(self.test_out) fd, self.tmp_seq_filepath = mkstemp( dir=self.test_out, prefix='qiime_parallel_taxonomy_assigner_tests_input', suffix='.fasta') close(fd) seq_file = open(self.tmp_seq_filepath, 'w') seq_file.write(blast_test_seqs.to_fasta()) seq_file.close() self.files_to_remove.append(self.tmp_seq_filepath) self.id_to_taxonomy_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy', suffix='.txt', dir=tmp_dir) self.id_to_taxonomy_file.write(blast_id_to_taxonomy) self.id_to_taxonomy_file.seek(0) self.reference_seqs_file = NamedTemporaryFile( prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs', suffix='.fasta', dir=tmp_dir) self.reference_seqs_file.write(blast_reference_seqs.to_fasta()) self.reference_seqs_file.seek(0) initiate_timeout(60)
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """Attempts to select true barcodes from a set of barcodes,
    i.e. removes barcodes that might be artifacts due to sequencing
    errors. Uses uclust to remove barcodes that are more similar than
    the given threshold.

    Parameters
    ----------
    rand_bcs: list
    unique_threshold: float

    Returns
    -------
    unique_rand_bcs: set
        set of unique random barcodes.
    """
    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(dir=temp_dir,
                                            prefix='tmp', suffix='.fas')
    rand_bcs = set(rand_bcs)

    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        for rand_bc in rand_bcs:
            fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))
        fasta_tempfile.close()

    _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
        fasta_tempfile_name,
        original_fasta_path=None,
        percent_ID=unique_threshold,
        save_uc_files=False,
        output_dir=temp_dir)

    unique_rand_bcs = set(unique_rand_bcs)
    remove_files([fasta_tempfile_name])
    return unique_rand_bcs
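# A minimal usage sketch for select_unique_rand_bcs above (the barcodes are
# made-up examples): barcodes that uclust clusters at or above 86% identity
# collapse to a single representative.
unique_bcs = select_unique_rand_bcs(['AGCTACGT', 'AGCTACGA', 'TTGGCCAA'],
                                    unique_threshold=0.86)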
def setUp(self): """Defines data that will be used by the tests.""" self.files_to_remove = [] self.dirs_to_remove = [] # Create temp directory to hold input and output. self.test_dir = mkdtemp(dir=get_qiime_temp_dir(), prefix='qiime_compare_categories_tests_') self.dirs_to_remove.append(self.test_dir) # Create input files under our temp dir. self.dm_fp = join(self.test_dir, 'dm.txt') dm_f = open(self.dm_fp, 'w') dm_f.write(dm_str) dm_f.close() self.files_to_remove.append(self.dm_fp) self.invalid_dm_fp = join(self.test_dir, 'invalid_dm.txt') invalid_dm_f = open(self.invalid_dm_fp, 'w') invalid_dm_f.write(invalid_dm_str) invalid_dm_f.close() self.files_to_remove.append(self.invalid_dm_fp) self.map_fp = join(self.test_dir, 'map.txt') map_f = open(self.map_fp, 'w') map_f.write(map_str) map_f.close() self.files_to_remove.append(self.map_fp) self.cat_methods = [ 'adonis', 'anosim', 'mrpp', 'permanova', 'permdisp', 'dbrda' ] self.num_methods = ['best', 'morans_i'] self.cat_categories = ['Treatment'] self.num_categories = ['DOB'] self.num_perms = 42
def test_mothur_supported_version(self):
    """mothur is in path and version is supported
    """
    acceptable_version = (1, 25, 0)
    self.assertTrue(
        which('mothur'),
        "mothur not found. This may or may not be a problem depending on "
        "which components of QIIME you plan to use.")

    # mothur creates a log file in cwd, so create a tmp and cd there first
    log_file = join(get_qiime_temp_dir(), 'mothur.log')
    command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
    stdout, stderr, exit_status = qiime_system_call(command)

    # remove log file
    remove_files([log_file], error_on_missing=False)

    version_string = stdout.strip().split(' ')[1].strip('v.')
    try:
        version = tuple(map(int, version_string.split('.')))
        pass_test = version == acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout

    self.assertTrue(
        pass_test,
        "Unsupported mothur version. %s is required, but running %s."
        % ('.'.join(map(str, acceptable_version)), version_string))
            action='store_true', dest='no_clean', default=False,
            help="If set, don't delete files generated by formatdb after "
                 "running [default: %default]."),
make_option("--blastmatroot", dest='blastmatroot', default=None,
            type="existing_dirpath",
            help="Path to a folder containing blast matrices "
                 "[default: %default]."),
make_option("--working_dir", dest='working_dir',
            default=get_qiime_temp_dir(), type="existing_dirpath",
            help="Working dir for BLAST [default: %default]."),
make_option("-m", "--max_hits", type='int', dest='max_hits', default=100,
            help="Max hits parameter for BLAST. CAUTION: Because filtering "
                 "on alignment percentage occurs after BLAST, a max hits "
                 "value of 1 in combination with an alignment percent "
                 "filter could miss valid contaminants. "
                 "[default: %default]"),
make_option("-w", "--word_size", type='int', dest='wordsize',
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only
            be considered if method is 'best', otherwise only the first
            category will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used "
                         "as a numeric category in Moran's I or BEST "
                         "analyses). Please use a different metadata column "
                         "to perform statistical tests on.")

    # Parse the mapping file and distance matrix.
    md_map = MetadataMap.parseMetadataMap(open(map_fp, 'U'))
    dm = DistanceMatrix.parseDistanceMatrix(open(dm_fp, 'U'))

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.SampleIds, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here
        # before running the R commands. The pure-Python implementations
        # perform all validation in the classes in the stats module.

        # Make sure the input distance matrix is symmetric and hollow.
        if not dm.is_symmetric_and_hollow():
            raise ValueError("The distance matrix must be symmetric and "
                             "hollow.")

        # Check to make sure all categories passed in are in mapping file
        # and are not all the same value.
        for category in categories:
            if not category in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)
            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)."
                                 % (category, method))

        # Build the command arguments string.
        command_args = ['-d %s -m %s -c %s -o %s'
                        % (dm_fp, map_fp, categories[0], out_dir)]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError("The category '%s' is not numeric. Not "
                                    "all values could be converted to "
                                    "numbers." % category)
        else:
            # The rest require groups of samples, so the category values
            # cannot all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are "
                                     "unique. This statistical method "
                                     "cannot operate on a category with "
                                     "unique values (e.g. there are no "
                                     "'within' distances because each "
                                     "group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be "
                                 "greater than or equal to zero.")
            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_anosim_results(anosim_results))
        out_f.close()
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_best_results(best_results))
        out_f.close()
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_permanova_results(permanova_results))
        out_f.close()
    else:
        raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                         % (method, methods))
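# A minimal usage sketch for compare_categories above (the paths are
# hypothetical): run ANOSIM on the 'Treatment' category with 999
# permutations; results are written to anosim_out/anosim_results.txt.
compare_categories('dm.txt', 'map.txt', 'anosim', ['Treatment'], 999,
                   'anosim_out')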
def __call__(self, seq_path, result_path=None, uc_path=None, log_path=None,
             HALT_EXEC=False):
    """Returns mapping of each seq to (tax, consensus fraction, n)

    Results:
    If result_path is specified, the results will be written to file
    as tab-separated lines of:
        query_id <tab> tax <tab> consensus fraction <tab> n

    If result_path is None (default), the results will be returned
    as a dict of:
        {'query_id': (tax, consensus fraction, n)}

    In both cases, the values are:
        tax: the consensus taxonomy assignment
        consensus fraction: the fraction of the assignments for the
            query that contained the lowest level tax assignment that is
            included in tax (e.g., if the assignment goes to genus level,
            this will be the fraction of assignments that had the
            consensus genus assignment)
        n: the number of assignments that were considered when
            constructing the consensus

    Parameters:
    seq_path: path to file of query sequences
    result_path: path where results should be written. If None (default),
        returns results as a dict
    uc_path: path where .uc file should be saved. If None (default), and
        log_path is specified, the .uc contents will be appended to the
        log file.
    log_path: path where run log should be written. If None (default), no
        log file is written.
    HALT_EXEC: debugging parameter. If passed, will exit just before the
        uclust command is issued, and will print the command that would
        have been called to stdout.
    """
    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # set the user-defined parameters
    params = {'--id': self.Params['similarity'],
              '--maxaccepts': self.Params['max_accepts']}
    # initialize the application controller object
    app = Uclust(params, HALT_EXEC=HALT_EXEC)

    # Configure for consensus taxonomy assignment
    app.Parameters['--rev'].on()
    app.Parameters['--lib'].on(self.Params['reference_sequences_fp'])
    app.Parameters['--libonly'].on()
    app.Parameters['--allhits'].on()

    if uc_path is None:
        uc = NamedTemporaryFile(prefix='UclustConsensusTaxonAssigner_',
                                suffix='.uc',
                                dir=get_qiime_temp_dir())
        uc_path = uc.name
        store_uc_in_log = True
    else:
        store_uc_in_log = False

    app_result = app({'--input': seq_path, '--uc': uc_path})
    result = self._uc_to_assignment(app_result['ClusterFile'])

    if result_path is not None:
        # if the user provided a result_path, write the results to file
        of = open(result_path, 'w')
        for seq_id, (assignment, consensus_fraction, n) in result.items():
            assignment_str = ';'.join(assignment)
            of.write('%s\t%s\t%1.2f\t%d\n'
                     % (seq_id, assignment_str, consensus_fraction, n))
        of.close()
        result = None
        logger.info('Result path: %s' % result_path)
    else:
        # If no result_path was provided, the result dict is returned
        # as-is.
        logger.info('Result path: None, returned as dict.')

    if store_uc_in_log:
        # This is a little hackish, but we don't have a good way to pass
        # the uc_path value right now through the assign_taxonomy.py
        # script, so writing the contents to the user-specified log file
        # (since this is being stored for logging purposes).
        app_result['ClusterFile'].seek(0)
        logger.info('\n.uc file contents:\n')
        for line in app_result['ClusterFile']:
            logger.info(line.strip())

    return result
def setUp(self):
    self.tmp_dir = get_qiime_temp_dir()

    self.map_file = """#SampleID	Day	time	Description
#This is some comment about the study
1	090809	1200	some description of sample1
2	090809	1800	some description of sample2
3	090909	1200	some description of sample3
4	090909	1800	some description of sample4
5	091009	1200	some description of sample5"""

    self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                          "2": [("Day", "090809"), ("time", "1800")],
                          "3": [("Day", "090909"), ("time", "1200")],
                          "4": [("Day", "090909"), ("time", "1800")],
                          "5": [("Day", "091009"), ("time", "1200")]}

    self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                          ("Day", "090909"): ["3", "4"],
                          ("Day", "091009"): ["5"],
                          ("time", "1200"): ["1", "3", "5"],
                          ("time", "1800"): ["2", "4"]}

    self.num_cats = 2

    self.meta_dict = {"1": ["090809 1200", 0],
                      "2": ["090809 1800", 0],
                      "3": ["090909 1200", 0],
                      "4": ["090909 1800", 0],
                      "5": ["091009 1200", 0]}

    self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
    self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                        "weighted_degree", "consensus_lin", "Day", "time"]
    self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

    self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                 [2, 0, 0, 0, 0],
                                 [0, 0, 3, 1, 0],
                                 [0, 0, 0, 0, 5],
                                 [0, 4, 2, 0, 0],
                                 [3, 6, 0, 0, 0],
                                 [0, 0, 4, 2, 0],
                                 [0, 0, 0, 0, 3],
                                 [2, 0, 0, 5, 0],
                                 [0, 2, 0, 4, 0]])

    otu_table = Table(self.otu_table_vals,
                      ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
                       'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
                      ['1', '2', '3', '4', '5'],
                      [{"taxonomy": ["Bacteria", "Actinobacteria",
                                     "Coriobacteridae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Bacteroidaceae"]},
                       {"taxonomy": ["Bacteria", "Firmicutes",
                                     "Clostridia", "Clostridiales"]},
                       {"taxonomy": ["Bacteria", "Spirochaetes",
                                     "Spirochaetales", "Spirochaetaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Rikenellaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Odoribacteriaceae"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Bacteroidetes",
                                     "Bacteroidales", "Dysgonomonaceae",
                                     "otu_425"]},
                       {"taxonomy": ["Bacteria", "Firmicutes", "Mollicutes",
                                     "Clostridium_aff_innocuum_CM970"]}],
                      [None, None, None, None, None])

    fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                    prefix='test_make_otu_network_otu_table',
                                    suffix='.biom')
    close(fd)
    write_biom_table(otu_table, self.otu_table_fp)

    self.otu_sample_file = """#Full OTU Counts
#OTU ID 1 2 3 4 5 Consensus Lineage
otu_1 0 1 0 0 6 Bacteria; Actinobacteria; Coriobacteridae
otu_2 2 0 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3 0 0 3 1 0 Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4 0 0 0 0 5 Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5 0 4 2 0 0 Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6 3 6 0 0 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7 0 0 4 2 0 Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8 0 0 0 0 3 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9 2 0 0 5 0 Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10 0 2 0 4 0 Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

    self.con_by_sample = {'1': set(['2', '4']),
                          '2': set(['5', '3', '1', '4']),
                          '3': set(['4', '2']),
                          '4': set(['3', '1', '2']),
                          '5': set(['2'])}

    self.edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 otu_2 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 otu_4 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "5 otu_8 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 091009 1200",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]

    self.node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "otu_2 otu_node 1 2.0 Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "otu_4 otu_node 1 5.0 Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_8 otu_node 1 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    self.red_edge_file_str = [
        "2 otu_1 1.0 Bacteria:Actinobacteria:Coriobacteridae 090809 1800",
        "5 otu_1 6.0 Bacteria:Actinobacteria:Coriobacteridae 091009 1200",
        "1 @1 1.0 missed 090809 1200",
        "3 otu_3 3.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1200",
        "4 otu_3 1.0 Bacteria:Firmicutes:Clostridia:Clostridiales 090909 1800",
        "5 @5 1.0 missed 091009 1200",
        "2 otu_5 4.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090809 1800",
        "3 otu_5 2.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae 090909 1200",
        "1 otu_6 3.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1200",
        "2 otu_6 6.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae 090809 1800",
        "3 otu_7 4.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1200",
        "4 otu_7 2.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae 090909 1800",
        "1 otu_9 2.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090809 1200",
        "4 otu_9 5.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 090909 1800",
        "2 otu_10 2.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090809 1800",
        "4 otu_10 4.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 090909 1800"]

    self.red_node_file_str = [
        "1 1 user_node 3 7.0 other 090809 1200",
        "2 2 user_node 4 13.0 other 090809 1800",
        "3 3 user_node 3 9.0 other 090909 1200",
        "4 4 user_node 4 12.0 other 090909 1800",
        "5 5 user_node 3 14.0 other 091009 1200",
        "otu_1 otu_node 2 7.0 Bacteria:Actinobacteria:Coriobacteridae otu otu",
        "@1 otu_collapsed 1 1.0 other otu otu",
        "otu_3 otu_node 2 4.0 Bacteria:Firmicutes:Clostridia:Clostridiales otu otu",
        "@5 otu_collapsed 2 2.0 other otu otu",
        "otu_5 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae otu otu",
        "otu_6 otu_node 2 9.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae otu otu",
        "otu_7 otu_node 2 6.0 Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae otu otu",
        "otu_9 otu_node 2 7.0 Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425 otu otu",
        "otu_10 otu_node 2 6.0 Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970 otu otu"]

    self.otu_dc = {1: 3, 2: 7}
    self.sample_dc = {3: 3, 4: 2}
    self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}
    self.num_con_cat = {"Day": 2, "time": 1}
    self.num_con = 6
    self.num_cat = {"Day": 2, "time": 4}
    self.num_cat_less = {"Day": 1, "time": 3}
    self._paths_to_clean_up = [self.otu_table_fp]
    self._dir_to_clean_up = ''
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (one of 'anosim',
            'permanova', 'bioenv', 'adonis', 'morans_i', 'mrpp', 'permdisp',
            or 'dbrda')
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only
            be considered if method is 'bioenv', otherwise only the first
            category will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter
            will be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used "
                         "as a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_fn = anosim
            elif method == 'permanova':
                method_fn = permanova

            results = method_fn(dm, df, column=categories[0],
                                permutations=num_perms)
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)

        results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the
        # distance matrix (important for validation checks). Use strict=True
        # so that an error is raised if the distance matrix contains any
        # samples that aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here
        # before running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in the mapping
            # file and are not all the same value.
            for category in categories:
                if category not in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping "
                                     "file columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)."
                                     % (category, method))

            # Build the command arguments string.
            command_args = ['-d %s -m %s -c %s -o %s'
                            % (dm_fp, map_fp, categories[0], out_dir)]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted "
                                        "to numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if (md_map.hasUniqueCategoryValues(category) and
                        not (method == 'adonis' and
                             md_map.isNumericCategory(category))):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")
                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method)
        else:
            raise ValueError("Unrecognized method '%s'. Valid methods: %r"
                             % (method, methods))
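# --- Usage sketch (not part of the original module) ---
# A hedged example of calling compare_categories above. The distance matrix
# and mapping file names, and the column names, are hypothetical
# placeholders. Each call writes a '<method>_results.txt' file into out_dir.
def example_compare_categories(out_dir):
    # ANOSIM on a single grouping category with 999 permutations.
    compare_categories('unweighted_unifrac_dm.txt', 'map.txt', 'anosim',
                       ['Treatment'], 999, out_dir)
    # BIO-ENV accepts multiple (numeric) categories; num_perms is ignored
    # here because BIO-ENV is not a permutation-based method.
    compare_categories('unweighted_unifrac_dm.txt', 'map.txt', 'bioenv',
                       ['Time', 'Weight'], 999, out_dir)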
def get_cluster_ratio(fasta_seqs, min_difference_in_clusters):
    """Uses uclust to calculate the cluster ratio

    The cluster ratio is the number of sequences in the largest cluster
    divided by the number of sequences in the second-largest cluster.

    Parameters
    ----------
    fasta_seqs: str
        query sequences as a single fasta-formatted string
    min_difference_in_clusters: float
        percent identity threshold for cluster formation

    Returns
    ----------
    cluster_ratio: float
        cluster ratio of the sequences using uclust
        cluster_ratio = num_of_seq_in_cluster_with_max_seq /
        num_of_seq_in_cluster_with_second_highest_seq
    """
    cluster_percent_id = min_difference_in_clusters
    temp_dir = get_qiime_temp_dir()
    fd_uc, uclust_tempfile_name = mkstemp(dir=temp_dir, suffix='.uc')
    close(fd_uc)
    fd_fas, fasta_tempfile_name = mkstemp(dir=temp_dir, suffix='.fas')
    close(fd_fas)
    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        fasta_tempfile.write(fasta_seqs)

    # Pass the caller's identity threshold through to uclust.
    command = "uclust --usersort --input {} --uc {} --id {}".format(
        fasta_tempfile_name, uclust_tempfile_name, cluster_percent_id)

    # This function calls uclust a large number of times. Initially it used
    # bfillings.get_clusters_from_fasta_filepath, but due to issue
    # biocore/bfillings#31 it has temporarily reverted to qiime_system_call.
    count_lookup = defaultdict(int)

    qiime_system_call(command)

    # Cluster ('C') records in the .uc file carry the cluster number in
    # column 2 and the cluster size in column 3.
    with open(uclust_tempfile_name, 'r') as uclust_tempfile:
        for line in uclust_tempfile:
            if search(r'^C', line):
                pieces = line.split('\t')
                count_lookup[pieces[1]] += int(pieces[2])

    remove_files([uclust_tempfile_name, fasta_tempfile_name])

    sorted_counts_in_clusters = sorted(count_lookup.iteritems(),
                                       key=lambda x: x[1], reverse=True)
    try:
        max_cluster_count = float(sorted_counts_in_clusters[0][1])
        second_cluster_count = float(sorted_counts_in_clusters[1][1])
        return max_cluster_count / second_cluster_count
    except IndexError:
        # Only a single cluster was formed.
        return 1
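# --- Usage sketch (not part of the original module) ---
# A hedged example of calling get_cluster_ratio above, assuming a uclust
# binary is available on the PATH. Three toy reads form two clusters at a
# 97% identity threshold; the size ratio of the largest to second-largest
# cluster is returned (1 when only a single cluster forms).
def example_cluster_ratio():
    fasta_seqs = ('>read1\nACGTACGTACGTACGT\n'
                  '>read2\nACGTACGTACGTACGA\n'
                  '>read3\nTTTTCCCCGGGGAAAA\n')
    return get_cluster_ratio(fasta_seqs, min_difference_in_clusters=0.97)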
make_option("-o","--outputdir",dest='outputdir',default = None, type="new_dirpath", help="The output directory") ] script_info['optional_options']=[\ make_option("-e","--e_value",type='float',dest='e_value',\ default = 1e-10,\ help="The e-value cutoff for blast queries [default: %default]."),\ make_option("-p","--percent_aligned",type='float',\ dest='percent_aligned',default = 0.97,\ help="The % alignment cutoff for blast queries [default: %default]."),\ make_option("--no_clean",action = 'store_true',\ dest='no_clean',default = False,\ help="If set, don't delete files generated by formatdb after running [default: %default]."),\ make_option("--blastmatroot",dest='blastmatroot',default = None, type="existing_dirpath",\ help="Path to a folder containing blast matrices [default: %default]."),\ make_option("--working_dir",dest='working_dir',default = get_qiime_temp_dir(), type="existing_dirpath",\ help="Working dir for BLAST [default: %default]."),\ make_option("-m","--max_hits",type='int',dest='max_hits',\ default = 100,\ help="""Max hits parameter for BLAST. CAUTION: Because filtering on alignment percentage occurs after BLAST, a max hits value of 1 in combination with an alignment percent filter could miss valid contaminants. [default: %default]"""),\ make_option("-w","--word_size",type='int',dest='wordsize',\ default = 28,\ help="Word size to use for BLAST search [default: %default]"),\ make_option("-n","--no_format_db",dest = 'no_format_db', action = "store_true",\ default = False,\ help="""If this flag is specified, format_db will not be called on the subject database (formatdb will be set to False). This is useful if you have already formatted the database and a) it took a very long time or b) you want to run the script in parallel on the pre-formatted database [default: %default]""") ] script_info['version'] = __version__ FORMAT_BAR = """------------------------------""" * 2