Example #1
    def setUp(self):
        self.mapping_file_data = MAPPING_FILE_DATA
        self.mapping_file_headers = ['SampleID', 'BarcodeSequence',
            'LinkerPrimerSequence', 'Treatment', 'DOB', 'Description']
        self.valid_columns = ['Treatment', 'DOB']
        self.support_files_filename = get_qiime_temp_dir()
        self.support_files_filename_spaces = join(get_qiime_temp_dir(),
            'Directory With Spaces/AndNoSpaces')

        # data for the custom axes, contains columns that are gradients
        self.mapping_file_data_gradient = MAPPING_FILE_DATA_GRADIENT
        self.mapping_file_headers_gradient = ['SampleID', 'Treatment', 'Time',
            'Weight', 'Description']

        self.coords_header = ['PC.355', 'PC.635', 'PC.636', 'PC.354']
        self.coords_data = COORDS_DATA
        self.coords_eigenvalues = array([1, 2, 3, 4])
        self.coords_pct = array([40, 30, 20, 10])

        # jackknifed test data
        self.jk_mapping_file_headers = ['SampleID', 'C2', 'C3', 'C4']
        self.jk_mapping_file_data = [['1', 'a', 'b', 'c'], ['2', 'd', 'e', 'f'],
            ['3', 'g', 'h', 'i']]
        self.jk_coords_header = [['1', '2', '3'], ['1', '2', '3'],
            ['1', '2', '3'], ['1', '2', '3']]
        self.jk_coords_data = [array([[1.2, 0.1, -1.2], [-2.5, -4.0, 4.5]]),
            array([[-1.4, 0.05, 1.3], [2.6, 4.1, -4.7]]),
            array([[-1.5, 0.05, 1.6], [2.4, 4.0, -4.8]]),
            array([[-1.5, 0.05, 1.6], [2.4, 4.0, -4.8]])]
        self.jk_coords_eigenvalues = [array([0.80, 0.11, 0.09]),
            array([0.76, 0.20, 0.04]), array([0.84, 0.14, 0.02]),
            array([0.84, 0.11, 0.05])]
        self.jk_coords_pcts = [array([0.80, 0.10, 0.10]),
            array([0.76, 0.21, 0.03]), array([0.84, 0.11, 0.05]),
            array([0.84, 0.15, 0.01])]

        self.jk_mapping_file_data_gradient = MAPPING_FILE_DATA_GRADIENT
        self.jk_mapping_file_headers_gradient = ['SampleID', 'Treatment',
            'Time', 'Weight', 'Description']
        self.jk_coords_header_gradient = [
            ['PC.354', 'PC.355', 'PC.635', 'PC.636'],
            ['PC.354', 'PC.355', 'PC.635', 'PC.636'],
            ['PC.354', 'PC.355', 'PC.635', 'PC.636'],
            ['PC.354', 'PC.355', 'PC.635', 'PC.636']]
        self.jk_coords_data_gradient = [
            array([[1.2, 0.1, -1.2, 1.1], [-2.5, -4.0, 4.5, 0.3],
                   [0.5, -0.4, 3.5, 1.001], [0.67, 0.23, 1.01, 2.2]]),
            array([[1.2, 1, -0.2, 0.1], [-2.5, -4.0, 4.5, 3.2],
                   [0.5, -0.4, 3.5, 1.00], [0.57, 0.27, 0.95, 2.1]]),
            array([[1.0, 1, -1.2, 1.1], [-2.1, -2.0, 3.5, 0.3],
                   [0.5, 3, 3.5, 2], [0.60, 0.33, 1.3, 2.0]]),
            array([[1.2, 0.1, -1.2, 1.1], [-2.5, -4.0, 4.5, 0.3],
                   [0.5, -0.4, 3.5, 1.001], [0.69, 0.20, 1.01, 2.2]])]
        self.jk_coords_eigenvalues_gradient = [array([0.80, 0.11, 0.09, 0.0]),
            array([0.76, 0.20, 0.04, 0.0]), array([0.84, 0.14, 0.02, 0.0]),
            array([0.84, 0.11, 0.05, 0.0])]
        self.jk_coords_pcts_gradient = [array([0.80, 0.10, 0.10, 0.0]),
            array([0.76, 0.21, 0.03, 0.0]), array([0.84, 0.11, 0.05, 0.0]),
            array([0.84, 0.15, 0.01, 0])]

        self.broken_mapping_file_data = BROKEN_MAPPING_FILE
        self.broken_mapping_file_data_2_values = BROKEN_MAPPING_FILE_2_VALUES
Example #2
    def __call__(self,
                 query_fasta_fp,
                 database_fasta_fp,
                 output_dir,
                 observation_metadata_fp=None,
                 params=None,
                 HALT_EXEC=False):

        """ Call the DatabaseMapper """
        if params is None:
            params = {}

        create_dir(output_dir)
        raw_output_fp = self._get_raw_output_fp(output_dir,
                                                params)
        output_observation_map_fp = '%s/observation_map.txt' % output_dir
        output_biom_fp = '%s/observation_table.biom' % output_dir
        log_fp = '%s/observation_table.log' % output_dir

        self._assign_dna_reads_to_database(
            query_fasta_fp=query_fasta_fp,
            database_fasta_fp=database_fasta_fp,
            raw_output_fp=raw_output_fp,
            temp_dir=get_qiime_temp_dir(),
            params=params,
            HALT_EXEC=HALT_EXEC)

        self._process_raw_output(raw_output_fp,
                                 log_fp,
                                 output_observation_map_fp)

        self._generate_biom_output(output_observation_map_fp,
                                   output_biom_fp,
                                   observation_metadata_fp)
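
Note: the __call__ method above makes a DatabaseMapper instance usable like a function. A minimal, hedged usage sketch (the subclass name and paths are hypothetical placeholders, not part of the example):

mapper = SomeDatabaseMapper()  # hypothetical concrete subclass
mapper(query_fasta_fp='reads.fna',
       database_fasta_fp='reference.fna',
       output_dir='mapper_out')
# mapper_out/ will then contain observation_map.txt,
# observation_table.biom and observation_table.log,
# as constructed in the method above.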
Example #3
    def _generate_training_files(self):
        """Returns a tuple of file objects suitable for passing to the
        RdpTrainer application controller.
        """
        tmp_dir = get_qiime_temp_dir()
        training_set = RdpTrainingSet()
        reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
        id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')

        for seq_id, seq in MinimalFastaParser(reference_seqs_file):
            training_set.add_sequence(seq_id, seq)

        for line in id_to_taxonomy_file:
            seq_id, lineage_str = map(strip, line.split('\t'))
            training_set.add_lineage(seq_id, lineage_str)

        training_set.dereplicate_taxa()

        rdp_taxonomy_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
        rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
        rdp_taxonomy_file.seek(0)

        rdp_training_seqs_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
            dir=tmp_dir)
        for rdp_id, seq in training_set.get_training_seqs():
            rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
        rdp_training_seqs_file.seek(0)

        self._training_set = training_set

        return rdp_taxonomy_file, rdp_training_seqs_file
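
Note: both returned NamedTemporaryFile objects are rewound with seek(0) so downstream consumers can read them from the beginning; in this module they are passed to rdp_classifier.train_rdp_classifier_and_assign_taxonomy (see Example #25).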
Example #4
    def setUp(self):
        """ """
        self.test_data = get_test_data_fps()
        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(dir=tmp_dir,
                                prefix='core_qiime_analyses_test_',
                                suffix='')
        self.dirs_to_remove.append(self.test_out)

        self.qiime_config = load_qiime_config()
        self.params = parse_qiime_parameters(params_f1)

        # suppress stderr during tests (one of the system calls in the
        # workflow prints a warning that we can't suppress with
        # warnings.filterwarnings because it comes from within the code
        # executed through the system call). Found this trick here:
        # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
        self.saved_stderr = sys.stderr
        sys.stderr = StringIO()

        initiate_timeout(180)
Example #5
def multiple_file_DA_DESeq2(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    # skip hidden files and directories (note: isdir must test the full path)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or
                          isdir(join(input_dir, fname)))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        outfile = join(output_dir, 'DESeq2_DA_'+base_fname+'.txt') 
        outfile_diagnostic = join(output_dir, 'DESeq2_diagnostic_plots_'+base_fname+'.pdf') 

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic) 
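
Note: the detour through a temporary JSON table is deliberate. As the comment in Example #8 points out, the R code invoked by run_DESeq2 cannot read HDF5 BIOM files, so the table is re-serialized with to_json('forR') before being handed to Rscript.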
Example #6
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(dir=tmp_dir,
                                prefix='qiime_parallel_tests_',
                                suffix='')
        self.dirs_to_remove.append(self.test_out)

        fd, self.template_fp = mkstemp(dir=self.test_out,
                                      prefix='qiime_template',
                                      suffix='.fasta')
        close(fd)
        template_f = open(self.template_fp, 'w')
        template_f.write(pynast_test1_template_fasta)
        template_f.close()
        self.files_to_remove.append(self.template_fp)

        fd, self.inseqs1_fp = mkstemp(dir=self.test_out,
                                     prefix='qiime_inseqs',
                                     suffix='.fasta')
        close(fd)
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        initiate_timeout(60)
Example #7
    def setUp(self):

        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        # Create example input file
        self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        # Define number of seconds a test can run for before timing out
        # and failing
        initiate_timeout(60)
Example #8
def multiple_file_DA_fitZIG(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    # skip hidden files and directories (note: isdir must test the full path)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or
                          isdir(join(input_dir, fname)))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile) 
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        # make temporary json biom version - R currently does not have hdf5
        outfile = join(output_dir, 'fitZIG_DA_'+base_fname+'.txt')

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_fitZIG(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2) 
Example #9
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(dir=tmp_dir,
                                prefix='qiime_parallel_blaster_tests_',
                                suffix='')
        self.dirs_to_remove.append(self.test_out)

        fd, self.tmp_seq_filepath = mkstemp(dir=self.test_out,
                                           prefix='qiime_parallel_blaster_tests_input',
                                           suffix='.fasta')
        close(fd)
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(blast_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_blaster_tests_ref_seqs',
            suffix='.fasta', dir=tmp_dir)
        self.reference_seqs_file.write(blast_ref_seqs)
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #10
    def setUp(self):

        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(dir=tmp_dir,
                                prefix='core_qiime_analyses_test_',
                                suffix='')
        self.dirs_to_remove.append(self.test_out)

        # Get input data
        self.test_data = get_test_data_fps()

        self.qiime_config = load_qiime_config()
        self.qiime_config['jobs_to_start'] = 2
        self.qiime_config['seconds_to_sleep'] = 1

        # suppress stderr during tests (one of the system calls in the
        # workflow prints a warning that we can't suppress with
        # warnings.filterwarnings because it comes from within the code
        # executed through the system call). Found this trick here:
        # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
        self.saved_stderr = sys.stderr
        sys.stderr = StringIO()

        # Define number of seconds a test can run for before timing out
        # and failing
        initiate_timeout(600)
Example #11
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files results
            # in equivalent seq collections
            self.assertEqual(
                LoadSeqs(data=infile, aligned=False),
                LoadSeqs(data=actual_seqs, aligned=False))
Example #12
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_taxonomy_assigner_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.tmp_seq_filepath = get_tmp_filename(tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(blast_test_seqs.toFasta())
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt', dir=tmp_dir)
        self.id_to_taxonomy_file.write(blast_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta', dir=tmp_dir)
        self.reference_seqs_file.write(blast_reference_seqs.toFasta())
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #13
    def setUp(self):
        """Defines data that will be used by the tests."""
        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create temp directory to hold input and output.
        self.test_dir = mkdtemp(dir=get_qiime_temp_dir(),
                                prefix='qiime_compare_categories_tests_')
        self.dirs_to_remove.append(self.test_dir)

        # Create input files under our temp dir.
        self.dm_fp = join(self.test_dir, 'dm.txt')
        dm_f = open(self.dm_fp, 'w')
        dm_f.write(dm_str)
        dm_f.close()
        self.files_to_remove.append(self.dm_fp)

        self.invalid_dm_fp = join(self.test_dir, 'invalid_dm.txt')
        invalid_dm_f = open(self.invalid_dm_fp, 'w')
        invalid_dm_f.write(invalid_dm_str)
        invalid_dm_f.close()
        self.files_to_remove.append(self.invalid_dm_fp)

        self.map_fp = join(self.test_dir, 'map.txt')
        map_f = open(self.map_fp, 'w')
        map_f.write(map_str)
        map_f.close()
        self.files_to_remove.append(self.map_fp)

        self.cat_methods = ['adonis', 'anosim', 'mrpp', 'permanova',
                            'permdisp', 'dbrda']
        self.num_methods = ['best', 'morans_i']
        self.cat_categories = ['Treatment']
        self.num_categories = ['DOB']
        self.num_perms = 42
Example #14
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported """
        acceptable_version = (1, 25, 0)
        self.assertTrue(
            which("mothur"),
            "mothur not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.",
        )
        # mothur creates a log file in cwd, so create a tmp and cd there first
        log_file = join(get_qiime_temp_dir(), "mothur.log")
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, stderr, exit_status = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        version_string = stdout.strip().split(" ")[1].strip("v.")
        try:
            version = tuple(map(int, version_string.split(".")))
            pass_test = version == acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported mothur version. %s is required, but running %s."
            % (".".join(map(str, acceptable_version)), version_string),
        )
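
Note: for concreteness, here is what the parsing above produces for a typical banner line (the banner text is an assumed example, not captured output):

line = 'mothur v.1.25.0'                                  # assumed grep output
version_string = line.strip().split(' ')[1].strip('v.')   # '1.25.0'
version = tuple(map(int, version_string.split('.')))      # (1, 25, 0)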
Example #15
    def test_temp_dir(self):
        """temp_dir is set to a valid path"""
        temp_dir = get_qiime_temp_dir()

        self.assertTrue(exists(temp_dir), "temp_dir does not exist: %s" % temp_dir)
        self.assertTrue(isdir(temp_dir), "temp_dir is not a directory: %s" % temp_dir)
        self.assertTrue(access(temp_dir, W_OK), "temp_dir is not writable: %s" % temp_dir)
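
Note: every example on this page hinges on get_qiime_temp_dir. A rough behavioral sketch, assuming it reads the temp_dir setting from the QIIME config and falls back to /tmp/ when unset (an approximation, not the actual qiime.util source):

def get_qiime_temp_dir_sketch():
    qiime_config = load_qiime_config()
    temp_dir = qiime_config.get('temp_dir')  # may be None if unset
    return temp_dir if temp_dir else '/tmp/'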
Example #16
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.template_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_template',
                                            suffix='.fasta')
        template_f = open(self.template_fp, 'w')
        template_f.write(pynast_test1_template_fasta)
        template_f.close()
        self.files_to_remove.append(self.template_fp)

        self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        initiate_timeout(60)
Example #17
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files results in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #18
    def setUp(self):
        """Define some sample data that will be used by the tests."""
        # Standard recipients file with two recipients, one with multiple email
        # addresses.
        self.recipients = ["# a comment", " ", " foo1\[email protected]  ",
                           "foo2\t [email protected],  [email protected],[email protected] "]

        # An empty recipients file.
        self.empty_recipients = ["# a comment", " ", "\n\t\t\t\t"]

        # Standard participants list.
        self.participants = ["# a comment", " ", " foo1  ", "foo2"]

        # Invalid (duplicate) participants list.
        self.duplicate_participants = ["foo1", "foo2", "foo1"]
        
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'my_microbes_tests_'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # If test creates the temp dir, also remove it.
            self.dirs_to_remove.append(self.tmp_dir)

        # Set up temporary output directory.
        self.output_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%soutput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.output_dir)
        
        # Set up temporary input directory.
        self.input_dir = mkdtemp(dir=self.tmp_dir,
                                 prefix='%sinput_dir_' % self.prefix)
        self.dirs_to_remove.append(self.input_dir)

        # Data that will be used by the tests.
        self.otu_cat_sig_gut_fp = join(self.input_dir, 'otu_cat_sig_gut.txt')
        otu_cat_sig_gut_f = open(self.otu_cat_sig_gut_fp, 'w')
        otu_cat_sig_gut_f.write(otu_cat_sig_gut_text)
        otu_cat_sig_gut_f.close()
        self.files_to_remove.append(self.otu_cat_sig_gut_fp)
        
        self.otu_cat_sig_palm_fp = join(self.input_dir, 'otu_cat_sig_palm.txt')
        otu_cat_sig_palm_f = open(self.otu_cat_sig_palm_fp, 'w')
        otu_cat_sig_palm_f.write(otu_cat_sig_gut_text)
        otu_cat_sig_palm_f.close()
        self.files_to_remove.append(self.otu_cat_sig_palm_fp)

        self.rep_seqs_fp = join(self.input_dir, 'rep_seqs.fna')
        rep_seqs_f = open(self.rep_seqs_fp, 'w')
        rep_seqs_f.write(rep_seqs_text)
        rep_seqs_f.close()
        self.files_to_remove.append(self.rep_seqs_fp)
Example #19
    def setUp(self):
        """ """
        self.files_to_remove = []
        tmp_dir = get_qiime_temp_dir()
        self.test_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                        prefix='bufWriterTest',
                                        suffix='.txt')
        self.files_to_remove.append(self.test_fp)
Example #20
    def __call__(self, seq_path, result_path=None, log_path=None):
        """Returns dict mapping {seq_id:(taxonomy, confidence)} for
        each seq.

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified, dumps the
            result to the desired path instead of returning it.
        log_path: path to log, which should include dump of params.
        """

        if log_path:
            self.writeLog(log_path)

        reference_sequences_fp = self.Params["reference_sequences_fp"]
        assert reference_sequences_fp, "Must provide reference_sequences_fp when calling an RtaxTaxonAssigner."

        id_to_taxonomy_fp = self.Params["id_to_taxonomy_fp"]
        assert id_to_taxonomy_fp, "Must provide id_to_taxonomy_fp when calling an RtaxTaxonAssigner."

        # delimiter = self.Params['delimiter']
        read_1_seqs_fp = self.Params["read_1_seqs_fp"]
        assert read_1_seqs_fp, "Must provide read_1_seqs_fp when calling an RtaxTaxonAssigner."

        # following params may all be null

        read_2_seqs_fp = self.Params["read_2_seqs_fp"]
        single_ok = self.Params["single_ok"]
        no_single_ok_generic = self.Params["no_single_ok_generic"]
        header_id_regex = self.Params["header_id_regex"]
        assert header_id_regex, (
            "Must not provide empty header_id_regex when calling an "
            "RtaxTaxonAssigner; leave unset to use default if in doubt."
        )

        read_id_regex = self.Params["read_id_regex"]
        amplicon_id_regex = self.Params["amplicon_id_regex"]

        # seq_file = open(seq_path, 'r')

        results = rtax.assign_taxonomy(
            seq_path,
            reference_sequences_fp,
            id_to_taxonomy_fp,
            read_1_seqs_fp,
            read_2_seqs_fp,
            single_ok=single_ok,
            no_single_ok_generic=no_single_ok_generic,
            header_id_regex=header_id_regex,
            read_id_regex=read_id_regex,
            amplicon_id_regex=amplicon_id_regex,
            output_fp=result_path,
            log_path=log_path,
            base_tmp_dir=get_qiime_temp_dir(),
        )

        return results
Example #21
    def setUp(self):
        """ """
        self.files_to_remove = []
        tmp_dir = get_qiime_temp_dir()
        fd, self.test_fp = mkstemp(dir=tmp_dir,
                                   prefix='bufWriterTest',
                                   suffix='.txt')
        close(fd)
        self.files_to_remove.append(self.test_fp)
Example #22
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        try:
            template_alignment = LoadSeqs(data=template_alignment, moltype=DNA,
                                          aligned=DenseAlignment)
        except KeyError as e:
            raise KeyError('Only ACGT-. characters can be contained in template alignments.' +
                           ' The offending character was: %s' % e)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            for seq in pynast_failed:
                fail_file.write(seq.toFasta())
                fail_file.write('\n')
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            for seq in pynast_aligned:
                result_file.write(seq.toFasta())
                result_file.write('\n')
            result_file.close()
            return None
        else:
            try:
                return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
            except ValueError:
                return {}
Example #23
def normalize_DESeq2(input_path, out_path, DESeq_negatives_to_zero):
    """performs DESeq2VS normalization on a single raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path) 
    with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                     prefix='QIIME-normalize-table-temp-table-',
                                     suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, DESeq_negatives_to_zero)
Example #24
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        template_alignment = Alignment.from_fasta_records(
                    template_alignment, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        for i, seq in enumerate(pynast_failed):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_failed[i] = skb_seq
        pynast_failed = SequenceCollection(pynast_failed)

        for i, seq in enumerate(pynast_aligned):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_aligned[i] = skb_seq
        pynast_aligned = Alignment(pynast_aligned)

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            fail_file.write(pynast_failed.to_fasta())
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(pynast_aligned.to_fasta())
            result_file.close()
            return None
        else:
            return pynast_aligned
Example #25
    def __call__(self, seq_path, result_path=None, log_path=None):
        """Returns dict mapping {seq_id:(taxonomy, confidence)} for
        each seq.

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified, dumps the
            result to the desired path instead of returning it.
        log_path: path to log, which should include dump of params.
        """
        tmp_dir = get_qiime_temp_dir()
        min_conf = self.Params["Confidence"]
        training_data_properties_fp = self.Params["training_data_properties_fp"]
        reference_sequences_fp = self.Params["reference_sequences_fp"]
        id_to_taxonomy_fp = self.Params["id_to_taxonomy_fp"]
        max_memory = self.Params["max_memory"]

        seq_file = open(seq_path, "U")
        if reference_sequences_fp and id_to_taxonomy_fp:
            # Train and assign taxonomy
            taxonomy_file, training_seqs_file = self._generate_training_files()
            results = rdp_classifier.train_rdp_classifier_and_assign_taxonomy(
                training_seqs_file,
                taxonomy_file,
                seq_file,
                min_confidence=min_conf,
                classification_output_fp=result_path,
                max_memory=max_memory,
                tmp_dir=tmp_dir,
            )

            if result_path is None:
                results = self._training_set.fix_results(results)
            else:
                self._training_set.fix_output_file(result_path)
        else:
            # Just assign taxonomy, using properties file if passed
            if training_data_properties_fp:
                fix_ranks = False
            else:
                fix_ranks = True
            results = rdp_classifier.assign_taxonomy(
                seq_file,
                min_confidence=min_conf,
                output_fp=result_path,
                training_data_fp=training_data_properties_fp,
                max_memory=max_memory,
                fixrank=fix_ranks,
                tmp_dir=tmp_dir,
            )

        if log_path:
            self.writeLog(log_path)

        return results
Example #26
def run_fitZIG(input_path, out_path, mapping_category, subcategory_1, subcategory_2):
    """Run metagenomeSeq's fitZIG algorithm through Rscript
    """
    # set options
    command_args = ['-i %s -o %s -c %s -x %s -y %s' % (input_path, out_path, mapping_category, subcategory_1, subcategory_2)]
    # instantiate the object
    rsl = RExecutor(TmpDir=get_qiime_temp_dir())
    # run the app
    app_result = rsl(command_args=command_args, script_name='fitZIG.r')

    return app_result
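
Note: a hedged usage sketch of the function above (the paths and category values are hypothetical):

run_fitZIG('table_forR.biom', 'fitZIG_out.txt',
           'Treatment', 'Fast', 'Control')

This mirrors what the with-block in Example #8 does with each temporary JSON table.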
Example #27
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error("Must pass -m if passing -s. (Sorry about this, " + "it's for backwards-compatibility.)")

    min_args = 2
    if len(args) < min_args:
        option_parser.error("Exactly two arguments are required.")

    output_dir = get_qiime_temp_dir()
    run_commands(output_dir, open(args[0]), args[1], submit_jobs=opts.submit_jobs, keep_temp=True)
Example #28
    def setUp(self):
        """
        """
        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(dir=tmp_dir,
                                prefix='qiime_parallel_tests_',
                                suffix='')
        self.dirs_to_remove = [self.test_out]

        self.output_fp = join(self.test_out, 'fmap.txt')
        self.failure_fp = join(self.test_out, 'fail.txt')
        self.usearch_fp = join(self.test_out, 'out.uc')
        self.bl6_fp = join(self.test_out, 'out.bl6')
        self.log_fp = join(self.test_out, 'fmap.log')
        self.files_to_remove = [self.output_fp, self.failure_fp,
                                self.usearch_fp, self.log_fp, self.bl6_fp]

        fd, self.refseqs1_fp = mkstemp(dir=self.test_out,
                                      prefix='qiime_refseqs',
                                      suffix='.fasta')
        close(fd)
        refseqs1_f = open(self.refseqs1_fp, 'w')
        refseqs1_f.write(refseqs1)
        refseqs1_f.close()
        self.files_to_remove.append(self.refseqs1_fp)

        fd, self.refseqs2_fp = mkstemp(dir=self.test_out,
                                      prefix='qiime_refseqs',
                                      suffix='.fasta')
        close(fd)
        refseqs2_f = open(self.refseqs2_fp, 'w')
        refseqs2_f.write(refseqs2)
        refseqs2_f.close()
        self.files_to_remove.append(self.refseqs2_fp)

        fd, self.inseqs1_fp = mkstemp(dir=self.test_out,
                                     prefix='qiime_inseqs',
                                     suffix='.fasta')
        close(fd)
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        fd, self.inseqs2_fp = mkstemp(dir=self.test_out,
                                      prefix='qiime_inseqs',
                                      suffix='.fasta')
        close(fd)
        inseqs2_f = open(self.inseqs2_fp, 'w')
        inseqs2_f.write(inseqs2)
        inseqs2_f.close()
        self.files_to_remove.append(self.inseqs2_fp)

        initiate_timeout(60)
Example #29
    def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
        if not params['blast_db']:
            # Build the blast database from the reference_seqs_fp -- all procs
            # will then access one db rather than create one per proc
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(params['template_fp'],
                                               output_dir=get_qiime_temp_dir())
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db

        if params['min_length'] < 0:
            params['min_length'] = compute_min_alignment_length(
                open(input_fp, 'U'))
Example #30
def DA_fitZIG(input_path, out_path, mapping_fp, mapping_category,
              subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential abundance testing"""
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')

    with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                     prefix='QIIME-differential-abundance-temp-table-',
                                     suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_fitZIG(temp_fh.name, out_path, mapping_category, subcategory_1, subcategory_2)
Example #31
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='qiime_parallel_tests_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.refseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_refseqs',
                                            suffix='.fasta')
        refseqs1_f = open(self.refseqs1_fp, 'w')
        refseqs1_f.write(refseqs1)
        refseqs1_f.close()
        self.files_to_remove.append(self.refseqs1_fp)

        self.refseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                            prefix='qiime_refseqs',
                                            suffix='.fasta')
        refseqs2_f = open(self.refseqs2_fp, 'w')
        refseqs2_f.write(refseqs2)
        refseqs2_f.close()
        self.files_to_remove.append(self.refseqs2_fp)

        self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs1_f = open(self.inseqs1_fp, 'w')
        inseqs1_f.write(inseqs1)
        inseqs1_f.close()
        self.files_to_remove.append(self.inseqs1_fp)

        self.inseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                           prefix='qiime_inseqs',
                                           suffix='.fasta')
        inseqs2_f = open(self.inseqs2_fp, 'w')
        inseqs2_f.write(inseqs2)
        inseqs2_f.close()
        self.files_to_remove.append(self.inseqs2_fp)

        initiate_timeout(60)
Example #32
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(
            tmp_dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='',
            result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        # Temporary input file
        self.tmp_seq_filepath = get_tmp_filename(
            tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(rdp_test_seqs)
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(rdp_reference_seqs)
        self.reference_seqs_file.seek(0)

        jar_fp = getenv('RDP_JAR_PATH')
        jar_basename = basename(jar_fp)
        if '2.2' not in jar_basename:
            raise ApplicationError(
                "RDP_JAR_PATH does not point to version 2.2 of the "
                "RDP Classifier.")

        initiate_timeout(60)
Example #33
    def setUp(self):
        """Set up files/environment that will be used by the tests."""
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'generate_taxa_compare_table_tests'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        # setup temporary root input directory
        self.root_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='%s_root_dir_' % self.prefix)
        self.dirs_to_remove.append(self.root_dir)

        L18S_dir = '/L18S-1/blast_1.0/'
        makedirs(self.root_dir + L18S_dir)
        self.L18S_fp = self.root_dir + L18S_dir + '/otu_table_mc2_w_taxa_L5.txt'
        with open(self.L18S_fp, 'w') as f:
            f.writelines(L18S_L5_blast_one_multiple_assign_output)
        self.files_to_remove.append(self.L18S_fp)

        # setup temporary key directory
        self.key_dir = mkdtemp(dir=self.tmp_dir,
                               prefix='%s_key_dir_' % self.prefix)
        self.dirs_to_remove.append(self.key_dir)
        self.key_fp = self.key_dir + '/L18S_key.txt'
        with open(self.key_fp, 'w') as f:
            f.writelines(L18S_key)
        self.files_to_remove.append(self.key_fp)
        self.bad_key = self.key_dir + '/L18S_key.txt'

        # setup temporary output directory
        self.output_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%s_output_dir_' % self.prefix)
        self.dirs_to_remove.append(self.output_dir)

        initiate_timeout(60)
Example #34
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.submit_jobs and not opts.make_jobs:
        option_parser.error('Must pass -m if passing -s. (Sorry about this, '
                            'it\'s for backwards-compatibility.)')

    min_args = 2
    if len(args) < min_args:
        option_parser.error('Exactly two arguments are required.')

    output_dir = get_qiime_temp_dir()
    run_commands(output_dir,
                 open(args[0]),
                 args[1],
                 submit_jobs=opts.submit_jobs,
                 keep_temp=True,
                 queue_name=opts.queue_name)
Example #35
def normalize_CSS(input_path, out_path, output_CSS_statistics):
    """performs metagenomeSeq's CSS normalization on a single raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)

    if output_CSS_statistics:
        base_fname, ext = splitext(out_path)
        output_CSS_statistics = base_fname + '_CSS_statistics.txt'

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-normalize-table-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_CSS(temp_fh.name,
                out_path,
                output_CSS_statistics=output_CSS_statistics)
Example #36
    def setUp(self):
        """Set up files/environment that will be used by the tests."""
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'tax2tree_controller_tests'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        initiate_timeout(60)
Example #37
    def setUp(self):
        """Define some test data."""
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table1 = Table(data=array([[2, 0, 0, 1],
                                            [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                sample_ids=list('XYZ'),
                                observation_ids=list('abcd'))
        fd, self.otu_table1_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='alpha_diversity_tests',
                                         suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table1, self.otu_table1_fp)

        self.otu_table2 = Table(data=array([[2, 0, 0, 1],
                                            [1, 1, 1, 1],
                                            [0, 0, 0, 0]]).T,
                                sample_ids=list('XYZ'),
                                observation_ids=['a', 'b', 'c', 'd_'])
        fd, self.otu_table2_fp = mkstemp(dir=self.tmp_dir,
                                         prefix='alpha_diversity_tests',
                                         suffix='.biom')
        close(fd)
        write_biom_table(self.otu_table2, self.otu_table2_fp)

        self.single_sample_otu_table = Table(
            data=array([[2, 0, 0, 1]]).T,
            sample_ids=list('X'),
            observation_ids=list(
                'abcd'))
        fd, self.single_sample_otu_table_fp = mkstemp(
            dir=self.tmp_dir,
            prefix='alpha_diversity_tests',
            suffix='.biom')
        close(fd)
        write_biom_table(self.single_sample_otu_table,
                         self.single_sample_otu_table_fp)

        self.tree1 = parse_newick('((a:2,b:3):2,(c:1,d:2):7);')
        self.tree2 = parse_newick("((a:2,'b':3):2,(c:1,'d_':2):7);")

        self.files_to_remove = [self.otu_table1_fp, self.otu_table2_fp,
                                self.single_sample_otu_table_fp]
Example #38
    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        #Temporary input file
        fd, self.tmp_otu_fp = mkstemp(dir=self.tmp_dir,
                                      prefix='R_test_otu_table_',
                                      suffix='.biom')
        close(fd)
        seq_file = open(self.tmp_otu_fp, 'w')
        seq_file.write(test_otu_table)
        seq_file.close()

        self.tmp_otu_fp_out_CSS = '%s/R_test_otu_table_out_CSS.biom' % self.tmp_dir
        self.tmp_otu_fp_out_DESeq = '%s/R_test_otu_table_out_DESeq.biom' % self.tmp_dir

        self.files_to_remove = \
            [self.tmp_otu_fp, self.tmp_otu_fp_out_CSS, self.tmp_otu_fp_out_DESeq]
Example #39
    def setUp(self):
        """ """
        self.test_data = get_test_data_fps()
        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create example output directory
        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                         prefix='core_qiime_analyses_test_',
                                         suffix='',
                                         result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.qiime_config = load_qiime_config()
        self.params = parse_qiime_parameters([])

        initiate_timeout(60)
Example #40
    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.otu_table_data = np.array([[2, 1, 0], [0, 5, 0], [0, 3, 0],
                                        [1, 2, 0]])
        self.sample_names = list('YXZ')
        self.taxon_names = list('bacd')
        self.otu_metadata = [{'domain': 'Archaea'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'},
                             {'domain': 'Bacteria'}]

        self.otu_table = Table(self.otu_table_data, self.taxon_names,
                               self.sample_names)

        self.otu_table_meta = Table(self.otu_table_data,
                                    self.taxon_names,
                                    self.sample_names,
                                    observation_metadata=self.otu_metadata)

        fd, self.otu_table_fp = mkstemp(dir=self.tmp_dir,
                                        prefix='test_rarefaction',
                                        suffix='.biom')
        close(fd)
        fd, self.otu_table_meta_fp = mkstemp(dir=self.tmp_dir,
                                             prefix='test_rarefaction',
                                             suffix='.biom')
        close(fd)

        self.rare_dir = mkdtemp(dir=self.tmp_dir,
                                prefix='test_rarefaction_dir',
                                suffix='')

        write_biom_table(self.otu_table, self.otu_table_fp)
        write_biom_table(self.otu_table_meta, self.otu_table_meta_fp)

        self._paths_to_clean_up = [self.otu_table_fp, self.otu_table_meta_fp]
        self._dirs_to_clean_up = [self.rare_dir]
Example #41
def DA_DESeq2(input_path, out_path, mapping_fp, mapping_category,
              subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')
    base_fname, ext = splitext(out_path)
    outfile_diagnostic = base_fname + '_diagnostic_plots.pdf'

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-differential-abundance-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, mapping_category, subcategory_1,
                   subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic)
Example #42
def multiple_file_normalize_DESeq2(input_dir, output_dir, DESeq_negatives_to_zero):
    """performs DESeq2VS normalization on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    # skip hidden files and directories (note: isdir must test the full path)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or
                          isdir(join(input_dir, fname)))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile) 
        outfile = join(output_dir, 'DESeq2_'+base_fname+'.biom')

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-normalize-table-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, DESeq_negatives_to_zero)
Example #43
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        filename_prefix = get_tmp_filename(tmp_dir=get_qiime_temp_dir(),
                                           prefix='split_fasta_tests',
                                           suffix='',
                                           result_constructor=str)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #44
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                     prefix='split_fasta_tests',
                                     suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
Example #45
    def setUp(self):
        """
        """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(
            tmp_dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='',
            result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        self.tmp_seq_filepath = get_tmp_filename(
            tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(uclust_test_seqs.toFasta())
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(uclust_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(uclust_reference_seqs.toFasta())
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #46
0
def submit_jobs(commands, prefix):
    """submit jobs using exe pointed to by cluster_jobs_fp.

    commands: List of commands (strings) that should be executed

    prefix: A unique prefix used to name the submit script
    """
    qiime_config = load_qiime_config()
    CLUSTER_JOBS_SCRIPT = qiime_config['cluster_jobs_fp']

    if not CLUSTER_JOBS_SCRIPT:
        raise ApplicationNotFoundError("cluster_jobs_fp not set in config file!")
    if not (exists(CLUSTER_JOBS_SCRIPT) or app_path(CLUSTER_JOBS_SCRIPT)):
        raise ApplicationNotFoundError("cluster_jobs_fp not in $PATH or "
                                       "provided as full path!")

    outfilename = join(get_qiime_temp_dir(), "%s_commands.txt" % prefix)
    fh = open(outfilename, "w")
    fh.write("\n".join(commands))
    fh.close()
    cmd = '%s -ms %s %s' % (CLUSTER_JOBS_SCRIPT, outfilename, prefix)
    system(cmd)
    remove(outfilename)
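Usage is just a list of shell command strings plus a prefix naming the batch. A sketch, assuming cluster_jobs_fp is configured; the commands themselves are hypothetical:

commands = ['pick_otus.py -i seqs.%d.fasta -o otus_%d/' % (i, i)
            for i in range(4)]
submit_jobs(commands, prefix='OTU_BATCH')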
Example #47
0
    def setUp(self):
        """setup the test values"""
        # define test data
        self.fasta_seqs_of_rand_bcs = fasta_seqs_of_rand_bcs
        self.fasta_seqs_for_cluster_ratio = fasta_seqs_for_cluster_ratio
        self.fasta_seqs_for_consensus = fasta_seqs_for_consensus
        self.fwd_read_data = fwd_read_data.split()
        self.rev_read_data = rev_read_data.split()
        self.mapping_data = mapping_data
        self.fasta_seq_for_primer = fasta_seq_for_primer
        self.possible_primers = possible_primers
        self.fasta_seqs_for_consensus_tie_G_C = \
            fasta_seqs_for_consensus_tie_G_C
        self.fasta_seqs_for_consensus_unequal_length = \
            fasta_seqs_for_consensus_unequal_length
        self.min_difference_in_clusters = min_difference_in_clusters
        self.temp_dir = get_qiime_temp_dir()
        self.mapping_fp = NamedTemporaryFile(
            delete=False,
            mode='w',
            dir=self.temp_dir)
        self.mapping_fp.write(self.mapping_data)
        self.mapping_fp_name = self.mapping_fp.name
        self.mapping_fp.close()
        self.mapping_fp = open(self.mapping_fp_name, 'r')
        self.seqs_with_no_consensus = seqs_with_no_consensus
        self.false_primers = false_primers
        self.barcode_len = barcode_len
        self.barcode_correction_fn = barcode_correction_fn
        self.max_barcode_errors = max_barcode_errors
        self.fwd_length = fwd_length
        self.rev_length = rev_length  # assumed; the snippet reused fwd_length here
        self.bc_to_sid = bc_to_sid
        self.bc_to_fwd_primers = bc_to_fwd_primers
        self.bc_to_rev_primers = bc_to_rev_primers
        self.min_difference_in_bcs = min_difference_in_bcs
        self.min_reads_per_random_bc = min_reads_per_random_bc
        self.max_cluster_ratio = max_cluster_ratio
Example #48
0
def run_DESeq2(input_path, out_path, mapping_category, subcategory_1,
               subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic):
    """Run DESeq2 negative binomial Wald algorithm through Rscript
    """
    # set options
    if DESeq2_diagnostic_plots:
        command_args = [
            '-i %s -o %s -c %s -x %s -y %s -d %s -e %s' %
            (input_path, out_path, mapping_category, subcategory_1,
             subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic)
        ]
    else:
        command_args = [
            '-i %s -o %s -c %s -x %s -y %s' %
            (input_path, out_path, mapping_category, subcategory_1,
             subcategory_2)
        ]
    # instantiate the object
    rsl = RExecutor(TmpDir=get_qiime_temp_dir())
    # run the app
    app_result = rsl(command_args=command_args, script_name='DESeq2_nbinom.r')

    return app_result
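A call sketch; the filepaths and category values are hypothetical placeholders:

app_result = run_DESeq2('otu_table.biom', 'DESeq2_otu_table.biom',
                        'Treatment', 'Control', 'Fast',
                        DESeq2_diagnostic_plots=False,
                        outfile_diagnostic=None)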
Example #49
0
    def setUp(self):
        """Set up files/environment that will be used by the tests."""
        # The prefix to use for temporary files. This prefix may be added to,
        # but all temp dirs and files created by the tests will have this
        # prefix at a minimum.
        self.prefix = 'multiple_assign_taxonomy_tests'

        self.start_dir = getcwd()
        self.dirs_to_remove = []
        self.files_to_remove = []

        self.tmp_dir = get_qiime_temp_dir()
        if not exists(self.tmp_dir):
            makedirs(self.tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(self.tmp_dir)

        # setup temporary output directories
        self.output_dir = mkdtemp(dir=self.tmp_dir,
                                  prefix='%s_output_dir_' % self.prefix)
        self.dirs_to_remove.append(self.output_dir)

        initiate_timeout(60)
Example #50
0
    def setUp(self):
        """ """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = mkdtemp(
            dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='')
        self.dirs_to_remove.append(self.test_out)

        fd, self.tmp_seq_filepath = mkstemp(
            dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        close(fd)
        seq_file = open(self.tmp_seq_filepath, 'w')
        seq_file.write(blast_test_seqs.to_fasta())
        seq_file.close()
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(blast_id_to_taxonomy)
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(blast_reference_seqs.to_fasta())
        self.reference_seqs_file.seek(0)

        initiate_timeout(60)
Example #51
0
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    Uses uclust to remove barcodes that are more similar than the
    given threshold.
    Parameters
    ----------
    rand_bcs: list
    unique_threshold: float
    Returns
    ----------
    unique_rand_bcs: set
        set of unique random barcodes.
    """
    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(dir=temp_dir,
                                            prefix='tmp',
                                            suffix='.fas')
    close(fasta_fd)  # only the path is needed; release the raw descriptor
    rand_bcs = set(rand_bcs)

    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        for rand_bc in rand_bcs:
            fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

    _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
        fasta_tempfile_name,
        original_fasta_path=None,
        percent_ID=unique_threshold,
        save_uc_files=False,
        output_dir=temp_dir)

    unique_rand_bcs = set(unique_rand_bcs)
    remove_files([fasta_tempfile_name])
    return unique_rand_bcs
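A usage sketch, assuming uclust is available on the PATH; the barcodes and threshold are made-up values:

rand_bcs = ['ACGTACGT', 'ACGTACGA', 'TTTTCCCC', 'TTTTCCCC']
unique_bcs = select_unique_rand_bcs(rand_bcs, unique_threshold=0.86)
# barcodes more similar than 86% identity collapse to one representative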
Example #52
0
    def setUp(self):
        """Defines data that will be used by the tests."""
        self.files_to_remove = []
        self.dirs_to_remove = []

        # Create temp directory to hold input and output.
        self.test_dir = mkdtemp(dir=get_qiime_temp_dir(),
                                prefix='qiime_compare_categories_tests_')
        self.dirs_to_remove.append(self.test_dir)

        # Create input files under our temp dir.
        self.dm_fp = join(self.test_dir, 'dm.txt')
        dm_f = open(self.dm_fp, 'w')
        dm_f.write(dm_str)
        dm_f.close()
        self.files_to_remove.append(self.dm_fp)

        self.invalid_dm_fp = join(self.test_dir, 'invalid_dm.txt')
        invalid_dm_f = open(self.invalid_dm_fp, 'w')
        invalid_dm_f.write(invalid_dm_str)
        invalid_dm_f.close()
        self.files_to_remove.append(self.invalid_dm_fp)

        self.map_fp = join(self.test_dir, 'map.txt')
        map_f = open(self.map_fp, 'w')
        map_f.write(map_str)
        map_f.close()
        self.files_to_remove.append(self.map_fp)

        self.cat_methods = [
            'adonis', 'anosim', 'mrpp', 'permanova', 'permdisp', 'dbrda'
        ]
        self.num_methods = ['best', 'morans_i']
        self.cat_categories = ['Treatment']
        self.num_categories = ['DOB']
        self.num_perms = 42
Example #53
0
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported """
        acceptable_version = (1, 25, 0)
        self.assertTrue(which('mothur'),
                        "mothur not found. This may or may not be a problem depending on " +
                        "which components of QIIME you plan to use.")
        # mothur writes its log file to the cwd by default, so point it at a
        # file in the temp dir instead
        log_file = join(get_qiime_temp_dir(), 'mothur.log')
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, stderr, exit_status = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        version_string = stdout.strip().split(' ')[1].strip('v.')
        try:
            version = tuple(map(int, version_string.split('.')))
            pass_test = version == acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(pass_test,
                        "Unsupported mothur version. %s is required, but running %s."
                        % ('.'.join(map(str, acceptable_version)), version_string))
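The version check reduces to parsing a string like 'mothur v.1.25.0' into an integer tuple; since tuples compare element-wise, a >= test would also accept newer releases, although the test above requires exact equality:

stdout_line = 'mothur v.1.25.0'
version_string = stdout_line.split(' ')[1].strip('v.')  # '1.25.0'
version = tuple(map(int, version_string.split('.')))    # (1, 25, 0)
assert version == (1, 25, 0)
assert (1, 26, 0) >= (1, 25, 0)  # lexicographic tuple comparison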
Example #54
0
     action='store_true',
     dest='no_clean',
     default=False,
     help=
     "If set, don't delete files generated by formatdb after running [default: %default]."
 ),
 make_option(
     "--blastmatroot",
     dest='blastmatroot',
     default=None,
     type="existing_dirpath",
     help="Path to a folder containing blast matrices [default: %default]."
 ),
 make_option("--working_dir",
             dest='working_dir',
             default=get_qiime_temp_dir(),
             type="existing_dirpath",
             help="Working dir for BLAST [default: %default]."),
 make_option(
     "-m",
     "--max_hits",
     type='int',
     dest='max_hits',
     default=100,
     help=
     """Max hits parameter for BLAST. CAUTION: Because filtering on alignment percentage occurs after BLAST, a max hits value of 1 in combination with an alignment percent filter could miss valid contaminants. [default: %default]"""
 ),
 make_option("-w",
             "--word_size",
             type='int',
             dest='wordsize',
Example #55
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'adonis',
            'anosim')
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'best', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'best' or 'morans_i', this parameter will be
            ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """

    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BEST analyses). "
                         "Please use a different metadata column to perform "
                         "statistical tests on.")

    # Parse the mapping file and distance matrix.
    md_map = MetadataMap.parseMetadataMap(open(map_fp, 'U'))
    dm = DistanceMatrix.parseDistanceMatrix(open(dm_fp, 'U'))

    # Remove any samples from the mapping file that aren't in the distance
    # matrix (important for validation checks). Use strict=True so that an
    # error is raised if the distance matrix contains any samples that aren't
    # in the mapping file.
    md_map.filterSamples(dm.SampleIds, strict=True)

    # Run the specified statistical method.
    if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
        # These methods are run in R. Input validation must be done here before
        # running the R commands. The pure-Python implementations perform all
        # validation in the classes in the stats module.

        # Make sure the input distance matrix is symmetric and hollow.
        if not dm.is_symmetric_and_hollow():
            raise ValueError("The distance matrix must be symmetric and "
                             "hollow.")

        # Check to make sure all categories passed in are in mapping file and
        # are not all the same value.
        for category in categories:
            if category not in md_map.CategoryNames:
                raise ValueError("Category '%s' not found in mapping file "
                                 "columns." % category)

            if md_map.hasSingleCategoryValue(category):
                raise ValueError("All values in category '%s' are the "
                                 "same. The statistical method '%s' cannot "
                                 "operate on a category that creates only "
                                 "a single group of samples (e.g. there "
                                 "are no 'between' distances because "
                                 "there is only a single group)." %
                                 (category, method))

        # Build the command arguments string.
        command_args = [
            '-d %s -m %s -c %s -o %s' % (dm_fp, map_fp, categories[0], out_dir)
        ]

        if method == 'morans_i':
            # Moran's I requires only numeric categories.
            for category in categories:
                if not md_map.isNumericCategory(category):
                    raise TypeError(
                        "The category '%s' is not numeric. Not "
                        "all values could be converted to numbers." % category)
        else:
            # The rest require groups of samples, so the category values cannot
            # all be unique.
            for category in categories:
                if md_map.hasUniqueCategoryValues(category):
                    raise ValueError("All values in category '%s' are unique. "
                                     "This statistical method cannot operate "
                                     "on a category with unique values (e.g. "
                                     "there are no 'within' distances because "
                                     "each group of samples contains only a "
                                     "single sample)." % category)

            # Only Moran's I doesn't accept a number of permutations.
            if num_perms < 0:
                raise ValueError("The number of permutations must be greater "
                                 "than or equal to zero.")

            command_args[0] += ' -n %d' % num_perms

        rex = RExecutor(TmpDir=get_qiime_temp_dir())
        rex(command_args, '%s.r' % method, output_dir=out_dir)
    elif method == 'anosim':
        anosim = Anosim(md_map, dm, categories[0])
        anosim_results = anosim(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_anosim_results(anosim_results))
        out_f.close()
    elif method == 'best':
        best = Best(dm, md_map, categories)
        best_results = best()

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_best_results(best_results))
        out_f.close()
    elif method == 'permanova':
        permanova = Permanova(md_map, dm, categories[0])
        permanova_results = permanova(num_perms)

        out_f = open(join(out_dir, '%s_results.txt' % method), 'w+')
        out_f.write(format_permanova_results(permanova_results))
        out_f.close()
    else:
        # 'methods' is assumed to be a module-level list of the supported
        # method names defined elsewhere in the script
        raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                         (method, methods))
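A call sketch for one of the pure-Python branches; the filepaths are hypothetical and out_dir must already exist, per the docstring:

compare_categories(dm_fp='unweighted_unifrac_dm.txt',
                   map_fp='map.txt',
                   method='anosim',
                   categories=['Treatment'],
                   num_perms=999,
                   out_dir='anosim_out')
# writes anosim_out/anosim_results.txt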
Example #56
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 uc_path=None,
                 log_path=None,
                 HALT_EXEC=False):
        """Returns mapping of each seq to (tax, consensus fraction, n)

        Results:
        If result_path is specified, the results will be written to file
         as tab-separated lines of:
          query_id <tab> tax <tab> consensus fraction <tab> n
        If result_path is None (default), the results will be returned
         as a dict of:
          {'query_id': (tax, consensus fraction, n)}
        In both cases, the values are:
         tax: the consensus taxonomy assignment
         consensus fraction: the fraction of the assignments for the
          query that contained the lowest level tax assignment that is
          included in tax (e.g., if the assignment goes to genus level,
          this will be the fraction of assignments that had the consensus
          genus assignment)
         n: the number of assignments that were considered when constructing
          the consensus

        Parameters:
        seq_path: path to file of query sequences
        result_path: path where results should be written. If None (default),
         returns results as a dict
        uc_path: path where .uc file should be saved. If None (default), and
         log_path is specified, the .uc contents will be appended to the
         log file.
        log_path: path where run log should be written. If None (default), no
         log file is written.
        HALT_EXEC: debugging parameter. If passed, will exit just before the
         uclust command is issued, and will print the command that would have
         been called to stdout.
        """

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # set the user-defined parameters
        params = {
            '--id': self.Params['similarity'],
            '--maxaccepts': self.Params['max_accepts']
        }

        # initialize the application controller object
        app = Uclust(params, HALT_EXEC=HALT_EXEC)

        # Configure for consensus taxonomy assignment
        app.Parameters['--rev'].on()
        app.Parameters['--lib'].on(self.Params['reference_sequences_fp'])
        app.Parameters['--libonly'].on()
        app.Parameters['--allhits'].on()

        if uc_path is None:
            uc = NamedTemporaryFile(prefix='UclustConsensusTaxonAssigner_',
                                    suffix='.uc',
                                    dir=get_qiime_temp_dir())
            uc_path = uc.name
            store_uc_in_log = True
        else:
            store_uc_in_log = False

        app_result = app({'--input': seq_path, '--uc': uc_path})
        result = self._uc_to_assignment(app_result['ClusterFile'])
        if result_path is not None:
            # if the user provided a result_path, write the
            # results to file
            of = open(result_path, 'w')
            for seq_id, (assignment, consensus_fraction, n) in result.items():
                assignment_str = ';'.join(assignment)
                of.write('%s\t%s\t%1.2f\t%d\n' %
                         (seq_id, assignment_str, consensus_fraction, n))
            of.close()
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # If no result_path was provided, the result dict is
            # returned as-is.
            logger.info('Result path: None, returned as dict.')

        if store_uc_in_log:
            # This is a little hackish, but we don't have a good way
            # to pass the uc_path value right now through the
            # assign_taxonomy.py script, so writing the contents to the
            # user-specified log file (since this is being stored for logging
            # purposes).
            app_result['ClusterFile'].seek(0)
            logger.info('\n.uc file contents:\n')
            for line in app_result['ClusterFile']:
                logger.info(line.strip())

        return result
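A hedged usage sketch: the constructor parameters are inferred from the self.Params lookups in the body ('similarity', 'max_accepts', 'reference_sequences_fp'), and the filepaths are placeholders:

assigner = UclustConsensusTaxonAssigner({
    'similarity': 0.90,
    'max_accepts': 3,
    'reference_sequences_fp': 'ref_seqs.fasta'})
# with result_path=None the assignments come back as a dict:
# {'query_id': (tax, consensus_fraction, n)}
assignments = assigner('query_seqs.fasta')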
Example #57
0
    def setUp(self):
        self.tmp_dir = get_qiime_temp_dir()

        self.map_file = """#SampleID	Day	time	Description
#This is some comment about the study
1	090809	1200	some description of sample1
2	090809	1800	some description of sample2
3	090909	1200	some description of sample3
4	090909	1800	some description of sample4
5	091009	1200	some description of sample5"""
        self.cat_by_sample = {"1": [("Day", "090809"), ("time", "1200")],
                              "2": [("Day", "090809"), ("time", "1800")],
                              "3": [("Day", "090909"), ("time", "1200")],
                              "4": [("Day", "090909"), ("time", "1800")],
                              "5": [("Day", "091009"), ("time", "1200")]}
        self.sample_by_cat = {("Day", "090809"): ["1", "2"],
                              ("Day", "090909"): ["3", "4"],
                              ("Day", "091009"): ["5"],
                              ("time", "1200"): ["1", "3", "5"],
                              ("time", "1800"): ["2", "4"]}

        self.num_cats = 2
        self.meta_dict = {"1": ["090809	1200", 0],
                          "2": ["090809	1800", 0],
                          "3": ["090909	1200", 0],
                          "4": ["090909	1800", 0],
                          "5": ["091009	1200", 0]}
        self.labels = ["from", "to", "eweight", "consensus_lin", "Day", "time"]
        self.node_labels = ["node_name", "node_disp_name", "ntype", "degree",
                            "weighted_degree", "consensus_lin", "Day", "time"]
        self.label_list = [["090809", "090909", "091009"], ["1200", "1800"]]

        self.otu_table_vals = array([[0, 1, 0, 0, 6],
                                     [2, 0, 0, 0, 0],
                                     [0, 0, 3, 1, 0],
                                     [0, 0, 0, 0, 5],
                                     [0, 4, 2, 0, 0],
                                     [3, 6, 0, 0, 0],
                                     [0, 0, 4, 2, 0],
                                     [0, 0, 0, 0, 3],
                                     [2, 0, 0, 5, 0],
                                     [0, 2, 0, 4, 0]])

        otu_table = Table(self.otu_table_vals,
                          ['otu_1', 'otu_2', 'otu_3', 'otu_4', 'otu_5',
                           'otu_6', 'otu_7', 'otu_8', 'otu_9', 'otu_10'],
                          ['1', '2', '3', '4', '5'],
                          [{"taxonomy": ["Bacteria", "Actinobacteria",
                                         "Coriobacteridae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Bacteroidaceae"]},
                           {"taxonomy": ["Bacteria", "Firmicutes",
                                         "Clostridia", "Clostridiales"]},
                           {"taxonomy": ["Bacteria", "Spirochaetes",
                                         "Spirochaetales", "Spirochaetaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Rikenellaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales",
                                         "Odoribacteriaceae"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae",
                                         "otu_425"]},
                           {"taxonomy": ["Bacteria", "Bacteroidetes",
                                         "Bacteroidales", "Dysgonomonaceae",
                                         "otu_425"]},
                           {"taxonomy": ["Bacteria", "Firmicutes",
                                         "Mollicutes",
                                         "Clostridium_aff_innocuum_CM970"]}],
                          [None, None, None, None, None])

        fd, self.otu_table_fp = mkstemp(
            dir=self.tmp_dir, prefix='test_make_otu_network_otu_table',
            suffix='.biom')
        close(fd)
        write_biom_table(otu_table, self.otu_table_fp)

        self.otu_sample_file = """#Full OTU Counts
#OTU ID	1	2	3	4	5	Consensus Lineage
otu_1	0	1	0	0	6	Bacteria; Actinobacteria; Coriobacteridae
otu_2	2	0	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Bacteroidaceae
otu_3	0	0	3	1	0	Bacteria; Firmicutes; Clostridia; Clostridiales
otu_4	0	0	0	0	5	Bacteria; Spirochaetes; Spirochaetales; Spirochaetaceae
otu_5	0	4	2	0	0	Bacteria; Bacteroidetes; Bacteroidales; Rikenellaceae
otu_6	3	6	0	0	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae
otu_7	0	0	4	2	0	Bacteria; Bacteroidetes; Bacteroidales; Odoribacteriaceae
otu_8	0	0	0	0	3	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_9	2	0	0	5	0	Bacteria; Bacteroidetes; Bacteroidales; Dysgonomonaceae; otu_425
otu_10	0	2	0	4	0	Bacteria; Firmicutes; Mollicutes; Clostridium_aff_innocuum_CM970"""

        self.con_by_sample = {
            '1': set(['2', '4']), '2': set(['5', '3', '1', '4']),
            '3': set(['4', '2']), '4': set(['3', '1', '2']),
            '5': set(['2'])}

        self.edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	otu_2	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	otu_4	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "5	otu_8	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	091009	1200",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                              "2	2	user_node	4	13.0	other	090809	1800",
                              "3	3	user_node	3	9.0	other	090909	1200",
                              "4	4	user_node	4	12.0	other	090909	1800",
                              "5	5	user_node	3	14.0	other	091009	1200",
                              "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                              "otu_2		otu_node	1	2.0	Bacteria:Bacteroidetes:Bacteroidales:Bacteroidaceae	otu	otu",
                              "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                              "otu_4		otu_node	1	5.0	Bacteria:Spirochaetes:Spirochaetales:Spirochaetaceae	otu	otu",
                              "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                              "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                              "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                              "otu_8		otu_node	1	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                              "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.red_edge_file_str = [
            "2	otu_1	1.0	Bacteria:Actinobacteria:Coriobacteridae	090809	1800",
            "5	otu_1	6.0	Bacteria:Actinobacteria:Coriobacteridae	091009	1200",
            "1	@1	1.0	missed	090809	1200",
            "3	otu_3	3.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1200",
            "4	otu_3	1.0	Bacteria:Firmicutes:Clostridia:Clostridiales	090909	1800",
            "5	@5	1.0	missed	091009	1200",
            "2	otu_5	4.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090809	1800",
            "3	otu_5	2.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	090909	1200",
            "1	otu_6	3.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1200",
            "2	otu_6	6.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	090809	1800",
            "3	otu_7	4.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1200",
            "4	otu_7	2.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	090909	1800",
            "1	otu_9	2.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090809	1200",
            "4	otu_9	5.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	090909	1800",
            "2	otu_10	2.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090809	1800",
            "4	otu_10	4.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	090909	1800"]

        self.red_node_file_str = ["1	1	user_node	3	7.0	other	090809	1200",
                                  "2	2	user_node	4	13.0	other	090809	1800",
                                  "3	3	user_node	3	9.0	other	090909	1200",
                                  "4	4	user_node	4	12.0	other	090909	1800",
                                  "5	5	user_node	3	14.0	other	091009	1200",
                                  "otu_1		otu_node	2	7.0	Bacteria:Actinobacteria:Coriobacteridae	otu	otu",
                                  "@1		otu_collapsed	1	1.0	other	otu	otu",
                                  "otu_3		otu_node	2	4.0	Bacteria:Firmicutes:Clostridia:Clostridiales	otu	otu",
                                  "@5		otu_collapsed	2	2.0	other	otu	otu",
                                  "otu_5		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Rikenellaceae	otu	otu",
                                  "otu_6		otu_node	2	9.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae	otu	otu",
                                  "otu_7		otu_node	2	6.0	Bacteria:Bacteroidetes:Bacteroidales:Odoribacteriaceae	otu	otu",
                                  "otu_9		otu_node	2	7.0	Bacteria:Bacteroidetes:Bacteroidales:Dysgonomonaceae:otu_425	otu	otu",
                                  "otu_10		otu_node	2	6.0	Bacteria:Firmicutes:Mollicutes:Clostridium_aff_innocuum_CM970	otu	otu"]

        self.otu_dc = {1: 3, 2: 7}
        self.sample_dc = {3: 3, 4: 2}
        self.degree_counts = {1: 3, 2: 7, 3: 3, 4: 2}

        self.num_con_cat = {"Day": 2,
                            "time": 1}
        self.num_con = 6
        self.num_cat = {"Day": 2,
                        "time": 4}
        self.num_cat_less = {"Day": 1,
                             "time": 3}
        self._paths_to_clean_up = [self.otu_table_fp]
        self._dir_to_clean_up = ''
Example #58
0
def compare_categories(dm_fp, map_fp, method, categories, num_perms, out_dir):
    """Runs the specified statistical method using the category of interest.

    This method does not return anything; all output is written to results
    files in out_dir.

    Arguments:
        dm_fp - filepath to the input distance matrix
        map_fp - filepath to the input metadata mapping file
        method - name of the statistical method to run (e.g. 'anosim',
            'bioenv')
        categories - list of categories in the metadata mapping file to
            consider in the statistical test. Multiple categories will only be
            considered if method is 'bioenv', otherwise only the first category
            will be considered
        num_perms - the number of permutations to use when calculating the
            p-value. If method is 'bioenv' or 'morans_i', this parameter will
            be ignored as they are not permutation-based methods
        out_dir - path to the output directory where results files will be
            written. It is assumed that this directory already exists and we
            have write permissions to it
    """
    # Make sure we were passed a list of categories, not a single string.
    if not isinstance(categories, ListType):
        raise TypeError("The supplied categories must be a list of "
                        "strings.")

    # Special case: we do not allow SampleID as it is not a category, neither
    # in data structure representation nor in terms of a statistical test (no
    # groups are formed since all entries are unique IDs).
    if 'SampleID' in categories:
        raise ValueError("Cannot use SampleID as a category because it is a "
                         "unique identifier for each sample, and thus does "
                         "not create groups of samples (nor can it be used as "
                         "a numeric category in Moran's I or BIO-ENV "
                         "analyses). Please choose a different metadata "
                         "column to perform statistical tests on.")

    dm = DistanceMatrix.read(dm_fp)

    if method in ('anosim', 'permanova', 'bioenv'):
        with open(map_fp, 'U') as map_f:
            md_dict = parse_mapping_file_to_dict(map_f)[0]
        df = pd.DataFrame.from_dict(md_dict, orient='index')

        out_fp = join(out_dir, '%s_results.txt' % method)

        if method in ('anosim', 'permanova'):
            if method == 'anosim':
                method_fn = anosim
            elif method == 'permanova':
                method_fn = permanova

            results = method_fn(dm,
                                df,
                                column=categories[0],
                                permutations=num_perms)
        elif method == 'bioenv':
            results = bioenv(dm, df, columns=categories)

        results.to_csv(out_fp, sep='\t')
    else:
        # Remove any samples from the mapping file that aren't in the distance
        # matrix (important for validation checks). Use strict=True so that an
        # error is raised if the distance matrix contains any samples that
        # aren't in the mapping file.
        with open(map_fp, 'U') as map_f:
            md_map = MetadataMap.parseMetadataMap(map_f)
        md_map.filterSamples(dm.ids, strict=True)

        # These methods are run in R. Input validation must be done here before
        # running the R commands.
        if method in ['adonis', 'morans_i', 'mrpp', 'permdisp', 'dbrda']:
            # Check to make sure all categories passed in are in mapping file
            # and are not all the same value.
            for category in categories:
                if category not in md_map.CategoryNames:
                    raise ValueError("Category '%s' not found in mapping file "
                                     "columns." % category)

                if md_map.hasSingleCategoryValue(category):
                    raise ValueError("All values in category '%s' are the "
                                     "same. The statistical method '%s' "
                                     "cannot operate on a category that "
                                     "creates only a single group of samples "
                                     "(e.g. there are no 'between' distances "
                                     "because there is only a single group)." %
                                     (category, method))

            # Build the command arguments string.
            command_args = [
                '-d %s -m %s -c %s -o %s' %
                (dm_fp, map_fp, categories[0], out_dir)
            ]

            if method == 'morans_i':
                # Moran's I requires only numeric categories.
                for category in categories:
                    if not md_map.isNumericCategory(category):
                        raise TypeError("The category '%s' is not numeric. "
                                        "Not all values could be converted to "
                                        "numbers." % category)
            else:
                # The rest require groups of samples, so the category values
                # cannot all be unique.
                for category in categories:
                    if (md_map.hasUniqueCategoryValues(category)
                            and not (method == 'adonis'
                                     and md_map.isNumericCategory(category))):
                        raise ValueError("All values in category '%s' are "
                                         "unique. This statistical method "
                                         "cannot operate on a category with "
                                         "unique values (e.g. there are no "
                                         "'within' distances because each "
                                         "group of samples contains only a "
                                         "single sample)." % category)

                # Only Moran's I doesn't accept a number of permutations.
                if num_perms < 0:
                    raise ValueError("The number of permutations must be "
                                     "greater than or equal to zero.")

                command_args[0] += ' -n %d' % num_perms

            rex = RExecutor(TmpDir=get_qiime_temp_dir())
            rex(command_args, '%s.r' % method)
        else:
            # 'methods' is assumed to be a module-level list of the supported
            # method names defined elsewhere in the script
            raise ValueError("Unrecognized method '%s'. Valid methods: %r" %
                             (method, methods))
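This newer variant routes anosim/permanova/bioenv through scikit-bio; bioenv is the one method that consumes more than one category, and it requires numeric ones. A call sketch with hypothetical paths:

compare_categories(dm_fp='dm.txt',
                   map_fp='map.txt',
                   method='bioenv',
                   categories=['DOB', 'Weight'],  # must be numeric for bioenv
                   num_perms=0,  # ignored: bioenv is not permutation-based
                   out_dir='bioenv_out')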
Example #59
0
def get_cluster_ratio(fasta_seqs, min_difference_in_clusters):
    """
    Uses uclust to calculate the cluster ratio:
    cluster_ratio =
    num_of_seq_in_cluster_with_max_seq
    divided by
    num_of_seq_in_cluster_with_second_highest_seq
    Parameters
    ----------
    fasta_seqs: list
        list of fasta sequences
    min_difference_in_clusters: float
        percent identity threshold for cluster formation
    Returns
    ----------
    cluster_ratio: float
        cluster ratio of the sequences using uclust
        cluster_ratio =
        num_of_seq_in_cluster_with_max_seq /
        num_of_seq_in_cluster_with_second_highest_seq
    """
    cluster_percent_id = min_difference_in_clusters
    temp_dir = get_qiime_temp_dir()
    fd_uc, uclust_tempfile_name = mkstemp(dir=temp_dir, suffix='.uc')
    close(fd_uc)
    fd_fas, fasta_tempfile_name = mkstemp(dir=temp_dir, suffix='.fas')
    close(fd_fas)
    with open(fasta_tempfile_name, 'w') as fasta_tempfile:
        fasta_tempfile.write(fasta_seqs)

    # Cluster at the caller-supplied identity threshold (the snippet
    # originally hardcoded --id 0.98).
    command = "uclust --usersort --input {} --uc {} --id {}".format(
        fasta_tempfile_name, uclust_tempfile_name, cluster_percent_id)
    # This function is called a large number of times. It originally used
    # bfillings.get_clusters_from_fasta_filepath, but due to an upstream
    # issue (biocore/bfillings#31) it temporarily reverts to
    # qiime_system_call.

    count_lookup = defaultdict(int)

    qiime_system_call(command)
    # 'C' records in the .uc output give the final size of each cluster
    with open(uclust_tempfile_name, 'r') as uclust_tempfile:
        for line in uclust_tempfile:
            if search(r'^C', line):
                pieces = line.split('\t')
                count_lookup[pieces[1]] += int(pieces[2])
    # clean up both temp files
    remove_files([uclust_tempfile_name, fasta_tempfile_name])

    sorted_counts_in_clusters = sorted(
        count_lookup.iteritems(),
        key=lambda x: x[1], reverse=True)
    try:
        max_cluster_count = float(sorted_counts_in_clusters[0][1])
        second_cluster_count = float(sorted_counts_in_clusters[1][1])
        return max_cluster_count / second_cluster_count
    except IndexError:
        return 1
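A usage sketch, assuming uclust is on the PATH; the FASTA string is a toy example:

fasta_seqs = ">a\nACGTACGT\n>b\nACGTACGT\n>c\nTTTTTTTT\n"
ratio = get_cluster_ratio(fasta_seqs, min_difference_in_clusters=0.98)
# two clusters of sizes 2 and 1 -> cluster ratio 2.0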
Example #60
0
make_option("-o","--outputdir",dest='outputdir',default = None, type="new_dirpath",
        help="The output directory")
                                 ]
script_info['optional_options']=[\
    make_option("-e","--e_value",type='float',dest='e_value',\
        default = 1e-10,\
        help="The e-value cutoff for blast queries [default: %default]."),\
    make_option("-p","--percent_aligned",type='float',\
        dest='percent_aligned',default = 0.97,\
        help="The % alignment cutoff for blast queries [default: %default]."),\
    make_option("--no_clean",action = 'store_true',\
        dest='no_clean',default = False,\
        help="If set, don't delete files generated by formatdb after running [default: %default]."),\
    make_option("--blastmatroot",dest='blastmatroot',default = None, type="existing_dirpath",\
            help="Path to a folder containing blast matrices [default: %default]."),\
    make_option("--working_dir",dest='working_dir',default = get_qiime_temp_dir(), type="existing_dirpath",\
        help="Working dir for BLAST [default: %default]."),\
    make_option("-m","--max_hits",type='int',dest='max_hits',\
        default = 100,\
        help="""Max hits parameter for BLAST. CAUTION: Because filtering on alignment percentage occurs after BLAST, a max hits value of 1 in combination with an alignment percent filter could miss valid contaminants. [default: %default]"""),\
    make_option("-w","--word_size",type='int',dest='wordsize',\
        default = 28,\
        help="Word size to use for BLAST search [default: %default]"),\
    make_option("-n","--no_format_db",dest = 'no_format_db', action = "store_true",\
        default = False,\
        help="""If this flag is specified, format_db will not be called on the subject database (formatdb will be set to False).  This is  useful if you have already formatted the database and a) it took a very long time or b) you want to run the script in parallel on the pre-formatted database [default: %default]""")
                                 ]
script_info['version'] = __version__

FORMAT_BAR = """------------------------------""" * 2