Пример #1
0
class ConfigFileHandler(DefaultValues):
    """
    Read pipeline settings from a config file and write them back out.

    Most options are only taken from the config file when the matching
    attribute is still None; see _read_config for the exceptions.

    @type _list_of_communities: list[Community]
    """
    # internal variables not set in config
    _file_name_config = "config.ini"
    _ncbi_ref_files = ["nodes.dmp", "merged.dmp", "names.dmp"]

    def __init__(self,
                 label="ConfigFileHandler",
                 logfile=None,
                 verbose=False,
                 debug=False):
        """
        Initialise the base class and create the validator used by _read_config.

        @param label: passed through to the base class constructor
        @param logfile: passed to the base class and to the Validator
        @param verbose: passed to the base class and to the Validator
        @param debug: passed to the base class and to the Validator
        """
        super(ConfigFileHandler, self).__init__(label=label,
                                                logfile=logfile,
                                                verbose=verbose,
                                                debug=debug)
        self._validator = Validator(logfile=logfile,
                                    verbose=verbose,
                                    debug=debug)

    def _read_config(self, file_path_config):
        """
        Read parameters from a configuration file into instance attributes.

        Attributes that are already set (not None) are left untouched, except
        for the 'gsa', 'pooled_gsa', 'compress' and 'anonymous' options, which
        are always taken from the config file.

        @param file_path_config: path to the configuration file

        @return: False when the config file fails validation or when a
            community section lacks a string 'metadata' or
            'id_to_genome_file' value; True otherwise
        @rtype: bool
        """
        # TODO: check that all keys options make sense
        self._config = ConfigParserWrapper(logfile=self._logfile,
                                           verbose=self._verbose)
        if not self._validator.validate_file(file_path_config,
                                             key="Configuration file"):
            self._valid_args = False
            # Explicit False (not a bare return) so the documented bool
            # return type also holds on this path.
            return False
        self._config.read(file_path_config)

        # ##########
        # [Main]
        # ##########

        if self._phase is None:
            self._phase = self._config.get_value("phase",
                                                 is_digit=True,
                                                 silent=True)

        if self._seed is None:
            self._seed = self._config.get_value("seed", silent=True)

        if self._max_processors is None:
            self._max_processors = self._config.get_value("max_processors",
                                                          is_digit=True,
                                                          silent=True)

        if self._dataset_id is None:
            self._dataset_id = self._config.get_value("dataset_id",
                                                      silent=True)

        if self._directory_output is None:
            self._directory_output = self._config.get_value("output_directory",
                                                            is_path=True)

        if self._tmp_dir is None:
            config_value = self._config.get_value("temp_directory",
                                                  is_path=True,
                                                  silent=True)
            if config_value is not None:
                # NOTE(review): this check disappears under "python -O";
                # kept as an assert so the raised exception type is unchanged.
                assert self._validator.validate_dir(config_value)
                self._tmp_dir = config_value

        # The following four options always come from the config file.
        self._phase_gsa = self._config.get_value("gsa",
                                                 is_boolean=True,
                                                 silent=True)
        self._phase_pooled_gsa = self._config.get_value("pooled_gsa",
                                                        is_boolean=True,
                                                        silent=True)

        self._compresslevel = self._config.get_value("compress",
                                                     is_digit=True,
                                                     silent=True)

        self._phase_anonymize = self._config.get_value("anonymous",
                                                       is_boolean=True,
                                                       silent=True)

        # ##########
        # [ReadSimulator]
        # ##########

        if self._sample_size_in_base_pairs is None:
            config_value = self._config.get_value("size",
                                                  is_digit=True,
                                                  silent=True)
            if config_value is not None:
                # The config stores the size in units of
                # _base_pairs_multiplication_factor base pairs.
                self._sample_size_in_base_pairs = (
                    config_value * self._base_pairs_multiplication_factor)

        if self._read_simulator_type is None:
            self._read_simulator_type = self._config.get_value("type",
                                                               silent=True)

        if self._executable_samtools is None:
            self._executable_samtools = self._config.get_value("samtools",
                                                               is_path=True,
                                                               silent=True)

        if self._executable_readsim is None:
            self._executable_readsim = self._config.get_value("readsim",
                                                              silent=True,
                                                              is_path=True)

        if self._directory_error_profiles is None:
            self._directory_error_profiles = self._config.get_value(
                "error_profiles", silent=True, is_path=True)

        if self._error_profile is None:
            self._error_profile = self._config.get_value("profile",
                                                         silent=True)

        if self._custom_profile_filename is None:
            self._custom_profile_filename = self._config.get_value(
                "base_profile_name", silent=True)

        if self._custom_readlength is None:
            self._custom_readlength = self._config.get_value(
                "profile_read_length", is_digit=True, silent=True)

        if self._fragment_size_standard_deviation_in_bp is None:
            self._fragment_size_standard_deviation_in_bp = self._config.get_value(
                "fragment_size_standard_deviation", is_digit=True, silent=True)

        if self._fragments_size_mean_in_bp is None:
            self._fragments_size_mean_in_bp = self._config.get_value(
                "fragments_size_mean", is_digit=True, silent=True)

        # ##########
        # [CommunityDesign]
        # ##########

        if self._input_list_of_file_paths_distributions is None:
            distribution_paths = self._config.get_value(
                "distribution_file_paths", is_path=True, silent=True)
            if distribution_paths is not None:
                # Stored in the config as one comma separated string.
                self._input_list_of_file_paths_distributions = distribution_paths.split(',')

        if self._directory_ncbi_taxdump is None:
            self._directory_ncbi_taxdump = self._config.get_value(
                "ncbi_taxdump", is_path=True, silent=True)

        if self._strain_simulation_template is None:
            self._strain_simulation_template = self._config.get_value(
                "strain_simulation_template", is_path=True, silent=True)

        if self._number_of_samples is None:
            self._number_of_samples = self._config.get_value(
                "number_of_samples", is_digit=True, silent=True)

        # NOTE: 'number_of_communities' is not read from the config; it is
        # derived below from the community sections that were found.

        # Any section containing at least one of these options is treated as
        # a community section.
        community_key_options = {
            "genomes_total", 'num_real_genomes', 'max_strains_per_otu',
            'ratio', 'log_mu', 'log_sigma', 'gauss_mu', 'gauss_sigma'
        }
        community_sections = set()
        for key_options in community_key_options:
            community_sections = community_sections.union(
                self._config.search_sections_of(key_options))

        self._list_of_communities = []
        is_valid = True
        for community_section in community_sections:
            file_path_metadata_table = self._config.get_value(
                'metadata', community_section, is_path=True)
            file_path_genome_locations = self._config.get_value(
                'id_to_genome_file', community_section, is_path=True)
            file_path_gff_locations = self._config.get_value('id_to_gff_file',
                                                             community_section,
                                                             is_path=True,
                                                             silent=True)
            mode = self._config.get_value('mode',
                                          community_section,
                                          silent=True)

            if not (isinstance(file_path_metadata_table, str)
                    and isinstance(file_path_genome_locations, str)):
                # Remember the failure for the return value, but judge every
                # section on its own: previously one bad section caused all
                # later sections to be skipped, which depended on the
                # arbitrary iteration order of the set.
                is_valid = False
                continue

            # 'id_to_gff_file' and 'mode' are optional.
            assert file_path_gff_locations is None or isinstance(
                file_path_gff_locations, str)
            assert mode is None or isinstance(mode, str)
            new_community = Community(
                identifier=community_section,
                genomes_total=self._config.get_value('genomes_total',
                                                     community_section,
                                                     is_digit=True),
                genomes_real=self._config.get_value('num_real_genomes',
                                                    community_section,
                                                    is_digit=True,
                                                    silent=True),
                limit_per_otu=self._config.get_value('max_strains_per_otu',
                                                     community_section,
                                                     is_digit=True,
                                                     silent=True),
                file_path_metadata_table=file_path_metadata_table,
                file_path_genome_locations=file_path_genome_locations,
                file_path_gff_locations=file_path_gff_locations,
                ratio=self._config.get_value('ratio',
                                             community_section,
                                             is_digit=True,
                                             silent=True),
                mode=mode,
                log_mu=self._config.get_value('log_mu',
                                              community_section,
                                              is_digit=True,
                                              silent=True),
                log_sigma=self._config.get_value('log_sigma',
                                                 community_section,
                                                 is_digit=True,
                                                 silent=True),
                gauss_mu=self._config.get_value('gauss_mu',
                                                community_section,
                                                is_digit=True,
                                                silent=True),
                gauss_sigma=self._config.get_value('gauss_sigma',
                                                   community_section,
                                                   is_digit=True,
                                                   silent=True),
                verbose=self._config.get_value('view',
                                               community_section,
                                               is_boolean=True,
                                               silent=True))
            self._list_of_communities.append(new_community)

        # Only updated when at least one community was created, so an existing
        # value is left alone otherwise.
        if self._list_of_communities:
            self._number_of_communities = len(self._list_of_communities)
        return is_valid

    def _stream_main(self, output_stream=sys.stdout):
        """
        Write the [Main] section to a stream.

        @param output_stream: object with a write() method, stdout by default
        """
        output_stream.write("[Main]\n")
        output_stream.write("seed={}\n".format(self._seed or ""))
        output_stream.write("phase={}\n".format(self._phase))
        output_stream.write("max_processors={}\n".format(self._max_processors))
        output_stream.write("dataset_id={}\n".format(self._dataset_id))
        output_stream.write("output_directory={}\n".format(
            self._directory_output or ""))
        output_stream.write("temp_directory={}\n".format(self._tmp_dir or ""))
        output_stream.write("gsa={}\n".format(self._phase_gsa))
        output_stream.write("pooled_gsa={}\n".format(self._phase_pooled_gsa))
        output_stream.write("anonymous={}\n".format(self._phase_anonymize))
        output_stream.write("compress={}\n".format(self._compresslevel))

    def _stream_read_simulator(self, output_stream=sys.stdout):
        """
        Write the [ReadSimulator] section to a stream.

        @param output_stream: object with a write() method, stdout by default
        """
        # Convert back to the unit used in the config file; an unset size is
        # written as an empty value instead of failing on None arithmetic.
        if self._sample_size_in_base_pairs is None:
            size = ""
        else:
            size = (self._sample_size_in_base_pairs /
                    self._base_pairs_multiplication_factor)

        output_stream.write("[ReadSimulator]\n")
        output_stream.write("readsim={}\n".format(self._executable_readsim))
        output_stream.write("error_profiles={}\n".format(
            self._directory_error_profiles or ""))
        output_stream.write("samtools={}\n".format(self._executable_samtools))
        output_stream.write("profile={}\n".format(self._error_profile))
        output_stream.write("base_profile_name={}\n".format(
            self._custom_profile_filename or ""))
        output_stream.write("profile_read_length={}\n".format(
            self._custom_readlength or ""))
        output_stream.write("size={}\n".format(size))
        output_stream.write("type={}\n".format(self._read_simulator_type))
        output_stream.write("fragments_size_mean={}\n".format(
            self._fragments_size_mean_in_bp))
        output_stream.write("fragment_size_standard_deviation={}\n".format(
            self._fragment_size_standard_deviation_in_bp))

    def _stream_community_design(self, output_stream=sys.stdout):
        """
        Write the [CommunityDesign] section to a stream.

        @param output_stream: object with a write() method, stdout by default
        """
        # _read_config parses this option with split(','), so a list has to be
        # written comma separated rather than as a Python list repr.
        distribution_paths = self._input_list_of_file_paths_distributions or ""
        if isinstance(distribution_paths, (list, tuple)):
            distribution_paths = ",".join(map(str, distribution_paths))

        output_stream.write("[CommunityDesign]\n")
        output_stream.write("distribution_file_paths={}\n".format(
            distribution_paths))
        output_stream.write("ncbi_taxdump={}\n".format(
            self._directory_ncbi_taxdump or ""))
        output_stream.write("strain_simulation_template={}\n".format(
            self._strain_simulation_template or ""))
        output_stream.write("number_of_samples={}\n".format(
            self._number_of_samples))

    def _stream_communities(self, output_stream=sys.stdout):
        """
        Write one config section per entry of _list_of_communities.

        @param output_stream: object with a write() method, stdout by default
        """
        for community in self._list_of_communities:
            output_stream.write("[{}]\n".format(community.id))
            output_stream.write("metadata={}\n".format(
                community.file_path_metadata_table))
            output_stream.write("id_to_genome_file={}\n".format(
                community.file_path_genome_locations or ""))
            output_stream.write("id_to_gff_file={}\n".format(
                community.file_path_gff_locations or ""))
            output_stream.write("genomes_total={}\n".format(
                community.genomes_total))
            output_stream.write("num_real_genomes={}\n".format(
                community.genomes_real))
            output_stream.write("max_strains_per_otu={}\n".format(
                community.limit_per_otu))
            output_stream.write("ratio={}\n".format(community.ratio))
            output_stream.write("mode={}\n".format(community.mode))
            output_stream.write("log_mu={}\n".format(community.log_mu))
            output_stream.write("log_sigma={}\n".format(community.log_sigma))
            output_stream.write("gauss_mu={}\n".format(community.gauss_mu))
            output_stream.write("gauss_sigma={}\n".format(
                community.gauss_sigma))
            output_stream.write("view={}\n".format(community.verbose))
            output_stream.write("\n")

    def write_config(self, file_path):
        """
        Write all sections to a file, separated by blank lines.

        @param file_path: path of the file to create or overwrite
        """
        with open(file_path, 'w') as write_handler:
            self._stream_main(write_handler)
            write_handler.write("\n")
            self._stream_read_simulator(write_handler)
            write_handler.write("\n")
            self._stream_community_design(write_handler)
            write_handler.write("\n")
            self._stream_communities(write_handler)
Пример #2
0
    def _read_config(self, file_path_config):
        """
        Read parameters from a configuration file into instance attributes.

        Attributes that are already set (not None) are left untouched, except
        for the 'gsa', 'pooled_gsa', 'compress' and 'anonymous' options, which
        are always taken from the config file.

        @param file_path_config: path to the configuration file

        @return: False when the config file fails validation or when a
            community section lacks a string 'metadata' or
            'id_to_genome_file' value; True otherwise
        @rtype: bool
        """
        # TODO: check that all keys options make sense
        self._config = ConfigParserWrapper(logfile=self._logfile,
                                           verbose=self._verbose)
        if not self._validator.validate_file(file_path_config,
                                             key="Configuration file"):
            self._valid_args = False
            # Explicit False (not a bare return) so the documented bool
            # return type also holds on this path.
            return False
        self._config.read(file_path_config)

        # ##########
        # [Main]
        # ##########

        if self._phase is None:
            self._phase = self._config.get_value("phase",
                                                 is_digit=True,
                                                 silent=True)

        if self._seed is None:
            self._seed = self._config.get_value("seed", silent=True)

        if self._max_processors is None:
            self._max_processors = self._config.get_value("max_processors",
                                                          is_digit=True,
                                                          silent=True)

        if self._dataset_id is None:
            self._dataset_id = self._config.get_value("dataset_id",
                                                      silent=True)

        if self._directory_output is None:
            self._directory_output = self._config.get_value("output_directory",
                                                            is_path=True)

        if self._tmp_dir is None:
            config_value = self._config.get_value("temp_directory",
                                                  is_path=True,
                                                  silent=True)
            if config_value is not None:
                # NOTE(review): this check disappears under "python -O";
                # kept as an assert so the raised exception type is unchanged.
                assert self._validator.validate_dir(config_value)
                self._tmp_dir = config_value

        # The following four options always come from the config file.
        self._phase_gsa = self._config.get_value("gsa",
                                                 is_boolean=True,
                                                 silent=True)
        self._phase_pooled_gsa = self._config.get_value("pooled_gsa",
                                                        is_boolean=True,
                                                        silent=True)

        self._compresslevel = self._config.get_value("compress",
                                                     is_digit=True,
                                                     silent=True)

        self._phase_anonymize = self._config.get_value("anonymous",
                                                       is_boolean=True,
                                                       silent=True)

        # ##########
        # [ReadSimulator]
        # ##########

        if self._sample_size_in_base_pairs is None:
            config_value = self._config.get_value("size",
                                                  is_digit=True,
                                                  silent=True)
            if config_value is not None:
                # The config stores the size in units of
                # _base_pairs_multiplication_factor base pairs.
                self._sample_size_in_base_pairs = (
                    config_value * self._base_pairs_multiplication_factor)

        if self._read_simulator_type is None:
            self._read_simulator_type = self._config.get_value("type",
                                                               silent=True)

        if self._executable_samtools is None:
            self._executable_samtools = self._config.get_value("samtools",
                                                               is_path=True,
                                                               silent=True)

        if self._executable_readsim is None:
            self._executable_readsim = self._config.get_value("readsim",
                                                              silent=True,
                                                              is_path=True)

        if self._directory_error_profiles is None:
            self._directory_error_profiles = self._config.get_value(
                "error_profiles", silent=True, is_path=True)

        if self._error_profile is None:
            self._error_profile = self._config.get_value("profile",
                                                         silent=True)

        if self._custom_profile_filename is None:
            self._custom_profile_filename = self._config.get_value(
                "base_profile_name", silent=True)

        if self._custom_readlength is None:
            self._custom_readlength = self._config.get_value(
                "profile_read_length", is_digit=True, silent=True)

        if self._fragment_size_standard_deviation_in_bp is None:
            self._fragment_size_standard_deviation_in_bp = self._config.get_value(
                "fragment_size_standard_deviation", is_digit=True, silent=True)

        if self._fragments_size_mean_in_bp is None:
            self._fragments_size_mean_in_bp = self._config.get_value(
                "fragments_size_mean", is_digit=True, silent=True)

        # ##########
        # [CommunityDesign]
        # ##########

        if self._input_list_of_file_paths_distributions is None:
            distribution_paths = self._config.get_value(
                "distribution_file_paths", is_path=True, silent=True)
            if distribution_paths is not None:
                # Stored in the config as one comma separated string.
                self._input_list_of_file_paths_distributions = distribution_paths.split(',')

        if self._directory_ncbi_taxdump is None:
            self._directory_ncbi_taxdump = self._config.get_value(
                "ncbi_taxdump", is_path=True, silent=True)

        if self._strain_simulation_template is None:
            self._strain_simulation_template = self._config.get_value(
                "strain_simulation_template", is_path=True, silent=True)

        if self._number_of_samples is None:
            self._number_of_samples = self._config.get_value(
                "number_of_samples", is_digit=True, silent=True)

        # NOTE: 'number_of_communities' is not read from the config; it is
        # derived below from the community sections that were found.

        # Any section containing at least one of these options is treated as
        # a community section.
        community_key_options = {
            "genomes_total", 'num_real_genomes', 'max_strains_per_otu',
            'ratio', 'log_mu', 'log_sigma', 'gauss_mu', 'gauss_sigma'
        }
        community_sections = set()
        for key_options in community_key_options:
            community_sections = community_sections.union(
                self._config.search_sections_of(key_options))

        self._list_of_communities = []
        is_valid = True
        for community_section in community_sections:
            file_path_metadata_table = self._config.get_value(
                'metadata', community_section, is_path=True)
            file_path_genome_locations = self._config.get_value(
                'id_to_genome_file', community_section, is_path=True)
            file_path_gff_locations = self._config.get_value('id_to_gff_file',
                                                             community_section,
                                                             is_path=True,
                                                             silent=True)
            mode = self._config.get_value('mode',
                                          community_section,
                                          silent=True)

            if not (isinstance(file_path_metadata_table, str)
                    and isinstance(file_path_genome_locations, str)):
                # Remember the failure for the return value, but judge every
                # section on its own: previously one bad section caused all
                # later sections to be skipped, which depended on the
                # arbitrary iteration order of the set.
                is_valid = False
                continue

            # 'id_to_gff_file' and 'mode' are optional.
            assert file_path_gff_locations is None or isinstance(
                file_path_gff_locations, str)
            assert mode is None or isinstance(mode, str)
            new_community = Community(
                identifier=community_section,
                genomes_total=self._config.get_value('genomes_total',
                                                     community_section,
                                                     is_digit=True),
                genomes_real=self._config.get_value('num_real_genomes',
                                                    community_section,
                                                    is_digit=True,
                                                    silent=True),
                limit_per_otu=self._config.get_value('max_strains_per_otu',
                                                     community_section,
                                                     is_digit=True,
                                                     silent=True),
                file_path_metadata_table=file_path_metadata_table,
                file_path_genome_locations=file_path_genome_locations,
                file_path_gff_locations=file_path_gff_locations,
                ratio=self._config.get_value('ratio',
                                             community_section,
                                             is_digit=True,
                                             silent=True),
                mode=mode,
                log_mu=self._config.get_value('log_mu',
                                              community_section,
                                              is_digit=True,
                                              silent=True),
                log_sigma=self._config.get_value('log_sigma',
                                                 community_section,
                                                 is_digit=True,
                                                 silent=True),
                gauss_mu=self._config.get_value('gauss_mu',
                                                community_section,
                                                is_digit=True,
                                                silent=True),
                gauss_sigma=self._config.get_value('gauss_sigma',
                                                   community_section,
                                                   is_digit=True,
                                                   silent=True),
                verbose=self._config.get_value('view',
                                               community_section,
                                               is_boolean=True,
                                               silent=True))
            self._list_of_communities.append(new_community)

        # Only updated when at least one community was created, so an existing
        # value is left alone otherwise.
        if self._list_of_communities:
            self._number_of_communities = len(self._list_of_communities)
        return is_valid
Пример #3
0
    def _read_config(self):
        """
        Load pipeline arguments from the configuration file.

        If the file at self._file_path_config does not pass validate_file,
        self._valid_args is set to False and nothing else is read.
        self._phase and self._max_processors are only read when they are
        still None; every other option is always taken from the file.

        @rtype: None
        """
        config_is_usable = self.validate_file(self._file_path_config,
                                              key="Configuration file")
        if not config_is_usable:
            self._valid_args = False
            return

        self._config = ConfigParserWrapper(self._file_path_config,
                                           logfile=self._logfile,
                                           verbose=self._verbose)
        # Short alias; all lookups go through the same wrapper method.
        read = self._config.get_value

        # [Main]
        if self._phase is None:
            self._phase = read("phase", is_digit=True, silent=False)
        self._directory_temp = read("temp_directory",
                                    is_path=True,
                                    silent=False)
        self._directory_output = read("output_directory",
                                      is_path=True,
                                      silent=False)
        if self._max_processors is None:
            self._max_processors = read("max_processors", is_digit=True)
        self._validate_genomes = read("validate_genomes", is_boolean=True)

        # [MarkerGeneExtraction]
        self._binary_rnammer = read("rnammer", is_path=True)
        self._hmmerBinDir = read("hmmerBinDir", is_path=True)
        self._rnaHmmInstallDir = read("rnaHmmInstallDir", is_path=True)
        self._file_path_reference_genome_locations = read(
            "reference_genomes_file", is_path=True)
        self._file_path_map_reference_genome_id_to_tax_id = read(
            "reference_genomes_map_file", is_path=True)
        self._file_path_reference_markergene = read(
            "input_reference_fna_file", is_path=True)
        self._hmmer = read("hmmer", is_digit=True)
        self._file_path_query_genomes_location_file = read(
            "input_genomes_file", is_path=True)

        # [MarkerGeneClustering]
        self._binary_mothur = read("mothur", is_path=True)
        self._metadata_table_in = read("metadata_table_in", is_path=True)
        self._silva_reference_directory = read("silva_reference_directory",
                                               is_path=True)
        self._cluster_method = read("cluster_method")
        self._distance_cutoff = read("max_threshold", is_digit=True)
        self._otu_distance = read("otu_distance", is_digit=True)
        self._classification_distance_minimum = read(
            "classification_distance", is_digit=True)

        # [MarkerGeneAnnotation]
        self._ncbi_reference_directory = read("ncbi_reference_directory",
                                              is_path=True)
        self._file_path_nucmer = read("nucmer", is_path=True)
        self._annotate_classify = read("classify", is_boolean=True)
        self._annotate_novelty = read("novelty", is_boolean=True)
        self._annotate_otu = read("otu", is_boolean=True)
        self._annotate_ani = read("ani", is_boolean=True)
Пример #4
0
def main():
    """
    Command line entry point: prepare the working files and run the 16S search.

    Steps, in order:
      1. parse the command line and hand the opened configuration file to
         ConfigParserWrapper,
      2. create the 'working' and 'output' directory trees below the output
         folder (the output folder itself must already exist),
      3. write id-based working copies of the input fasta file,
      4. run the HMM based rRNA search if '-n' was given.

    Problems with the arguments are printed and end the function early;
    nothing is returned.

    NOTE(review): '-g' is accepted and satisfies the "what to do" check, but
    no code in this function acts on it.

    @raise AssertionError: if not running on a posix system

    @rtype: None
    """
    # external commands will be executed in Shell in Unix/Linux
    assert os.name == 'posix', str(
        'The pipeline runs only on "posix" systems (i.e. Unix/Linux compatible). '
        + 'Your system is "' + os.name + '"')

    parser = argparse.ArgumentParser(
        description='''PhyloPythiaS Plus is an extension of PhyloPythiaS.''',
        epilog='''Read the user documentation for more details.''')

    # 'file' only exists as a builtin in Python 2; argparse.FileType yields the
    # same kind of readable file object and works on Python 2.7 and 3 alike.
    parser.add_argument('-c',
                        '--config',
                        type=argparse.FileType('r'),
                        required=True,
                        help='configuration file of the pipeline',
                        metavar='config.cfg',
                        dest='config')

    parser.add_argument('-i',
                        '--input_fasta_file',
                        default=None,
                        required=False,
                        help='path to fasta file',
                        metavar='fasta_file.fn',
                        dest='i')

    parser.add_argument('-out',
                        '--output_folder',
                        default=None,
                        required=False,
                        help='path to output folder',
                        dest='out')

    parser.add_argument(
        '-n',
        '--runn-rrna16S',
        action='store_true',
        help=
        'run hidden markov model searching for 16S, 23S, and 5S marker genes',
        dest='n')

    parser.add_argument(
        '-g',
        '--run-marker-gene-analysis',
        action='store_true',
        help=
        'run hidden markov model and classify according to the "31" marker genes',
        dest='g')

    parser.add_argument("-hmmer",
                        "--hmmer",
                        default=3,
                        type=int,
                        help="'2': rnammer; '3': hmmsearch using hmmer 3.0")

    parser.add_argument("-log",
                        "--logfile",
                        type=str,
                        default=None,
                        help="pipeline output will written to this log file")

    args = parser.parse_args()

    # read configuration
    logfile = args.logfile
    config = ConfigParserWrapper(args.config, logfile=logfile, verbose=True)

    # pipeline directory
    pipeline_dir = args.out
    if pipeline_dir is None or not os.path.isdir(pipeline_dir):
        print("Pipeline directory doesn't exist: ", pipeline_dir)
        return

    # create the following directories in the working/output directory if they don't exist
    working_dir = os.path.join(pipeline_dir, 'working')
    mg_working_dir = os.path.join(working_dir, 'mgWorking')
    output_dir = os.path.join(pipeline_dir, 'output')
    dir_array = [
        working_dir, output_dir,
        os.path.join(working_dir, 'projectDir'),
        os.path.join(working_dir, 'sampleSpecificDir'), mg_working_dir
    ]
    for dirPath in dir_array:
        if not os.path.isdir(dirPath):
            try:
                os.mkdir(dirPath)
            except OSError:
                print("Can't create directory", dirPath)
                return

    input_fasta_file = args.i
    if input_fasta_file is None or not os.path.isfile(input_fasta_file):
        print("The input fasta file %s doesn't exist" % input_fasta_file)
        return
    # read input fasta: contigs, scaffolds, mapping

    # no scaffold information is available from the command line
    input_fasta_scaffolds_file = None
    scaffolds_to_contigs_map_file = None

    # create input id files (contigs, scaffolds, mappings)
    fasta_file_ids = common.createTagFilePath(working_dir, input_fasta_file,
                                              'ids')
    seq_name_seq_id_file = common.createTagFilePath(working_dir,
                                                    input_fasta_file, 'cToIds')

    scaffold_contig_map_ids_file = common.createTagFilePath(
        working_dir, input_fasta_file, 'mapSCIds')

    taxonomic_ranks = taxonomy_ncbi.TAXONOMIC_RANKS[1:]  # without root
    try:
        min_seq_len = config.get_value("MarkerGeneExtraction",
                                       "minSeqLen",
                                       is_digit=True)
    except Exception:
        print(
            "Can't parse configuration entry (minSeqLen), make sure that it's an integer number"
        )
        # bare raise keeps the original traceback
        raise

    # generates working fasta files always when the configuration file is newer than particular files to be generated
    sequences = Sequences(input_fasta_file, input_fasta_scaffolds_file,
                          scaffolds_to_contigs_map_file, taxonomic_ranks,
                          min_seq_len)
    if sequences.get_sequence_count() == -1:
        print("WARNING: [16S detector] File contains no valid sequences: {}".
              format(input_fasta_file))
        return

    sequences.writeSequences(fasta_file_ids)
    # print('Working contigs input fasta file created: %s' % fasta_file_ids)
    sequences.writeSeqNameSeqId(seq_name_seq_id_file)
    # print('Ids mapping for the working contigs fasta file created: %s' % seq_name_seq_id_file)
    sequences.writeScaffoldContigMap(scaffold_contig_map_ids_file)
    # print('Scaffolds -> contigs map ids file created: %s' % scaffold_contig_map_ids_file)
    # assert Common.seqFileCmp(inputFastaFile, fastaFileIds), 'The fasta IDs file contains different sequences!'

    # is it specified what to do?
    if not (args.n or args.g):
        print('Choose what do you want to do!')
        # print_help() writes the help text itself and returns None,
        # so it must not be wrapped in print()
        parser.print_help()
        return

    # run 16S analysis
    if args.n:
        hmmer = args.hmmer
        rrna = RRNA16S(config, None, working_dir)
        print('run Hidden Markov Model for (16S, 23S, 5S)')
        rrna.runHMM(fasta_file_ids,
                    outLog=logfile,
                    hmmer=hmmer,
                    moltypes="ssu",
                    kingdoms="arc,bac")
Пример #5
0
class ArgumentHandler(SequenceValidator):
    """
    Reading pipeline configuration from file and from passed arguments.

    The class-level attributes below are defaults; most of them are
    overwritten per instance from the command line options and the
    configuration file.
    """
    _label = "ArgumentHandler"
    _file_path_config = None
    _directory_pipeline = None
    _directory_temp = None

    # [main]
    _phase = 0
    _max_processors = 1

    # [MarkerGeneExtraction]
    _hmmer = None
    _file_path_reference_genome_locations = None
    _file_path_reference_markergene = None
    _file_path_query_genomes_location_file = None
    _file_path_map_reference_genome_id_to_tax_id = None
    _directory_output = None

    _mg_analyse_executable = None
    _binary_rnammer = None
    _hmmerBinDir = None  # 16S mg analysis
    _rnaHmmInstallDir = None  # 16S mg analysis

    # [MarkerGeneClustering]
    # valid clustering method names are taken over from MGCluster
    _cluster_method_choices = MGCluster._cluster_method_choices
    _binary_mothur = None
    _metadata_table_in = None
    _cluster_method = None
    _distance_cutoff = None
    _silva_reference_directory = None
    _precision = 1000

    # [MarkerGeneAnnotation]
    _otu_distance = None
    _classification_distance_minimum = None
    _ncbi_reference_directory = None
    _ani_minimum_alignment = 0.8

    # subfolder/files
    # file names that must be present in the SILVA / NCBI reference directories
    _silva_ref_files = MGCluster._silva_ref_files
    _ncbi_ref_files = ["nodes.dmp", "merged.dmp", "names.dmp"]

    # meta table columns  'OTU', 'novelty_category'
    _separator = "\t"
    _column_name_genome_id = "genome_ID"
    _column_name_cutoff = "prediction_threshold"
    _column_name_otu_id = "OTU"
    _column_name_cluster_prediction = "NCBI_ID"
    _column_name_cluster_scientific_name = "SCIENTIFIC_NAME"
    _column_name_cluster_novelty = "novelty_category"
    _column_name_ani = "ANI"
    _column_name_ani_novelty = "ANI_NOVELTY_CATEGORY"
    _column_name_ani_compare = "ANI_TAXONOMIC_COMPARE"
    _column_name_ani_scientific_name = "ANI_SCIENTIFIC_NAME"

    def __init__(self,
                 args=None,
                 version="Prototype",
                 separator="\t",
                 column_name_genome_id="genome_ID",
                 column_name_otu="OTU",
                 column_name_novelty_category="novelty_category",
                 column_name_ncbi="NCBI_ID"):
        """
        Parse the arguments, read the configuration file and validate all values.

        Whenever a stage fails, '_valid_args' is set to False and the
        constructor returns early, leaving the instance only partially
        initialised.

        @param args: Passed arguments like sys.args
        @type args: list[]
        @param version: Version of main Program
        @type version: str|unicode
        @param separator: Expected separator for data tables
        @type separator: str|unicode
        @param column_name_genome_id: Column name of genome ids
        @type column_name_genome_id: str|unicode
        @param column_name_otu: Column name of otu ids
        @type column_name_otu: str|unicode
        @param column_name_novelty_category: Column name of genome novelty
        @type column_name_novelty_category: str|unicode
        @param column_name_ncbi: Column name of taxonomic classification
        @type column_name_ncbi: str|unicode
        """
        assert args is None or isinstance(args, list)
        assert isinstance(version, str)
        assert isinstance(separator, str)
        assert isinstance(column_name_genome_id, str)
        assert isinstance(column_name_otu, str)
        assert isinstance(column_name_novelty_category, str)
        assert isinstance(column_name_ncbi, str)
        self._separator = separator
        self._column_name_genome_id = column_name_genome_id
        self._column_name_otu_id = column_name_otu
        self._column_name_cluster_novelty = column_name_novelty_category
        # NOTE(review): this stores '_column_name_ncbi'; confirm it is the
        # attribute that readers of the taxonomic classification column use.
        self._column_name_ncbi = column_name_ncbi
        # called before the parent constructor has run
        self._directory_pipeline = self._get_directory_pipeline()

        if not os.path.isabs(self._directory_pipeline):
            self._directory_pipeline = os.path.expanduser(
                self._directory_pipeline)
            self._directory_pipeline = os.path.realpath(
                self._directory_pipeline)

        self._valid_args = True

        # read parsed arguments
        options = self._get_parser_options(args, version)

        # the log file location must be known before the parent is initialised
        logfile = options.logfile
        if logfile is not None:
            logfile = self.get_full_path(logfile)
        super(ArgumentHandler, self).__init__(logfile=logfile)

        self._read_options(options)
        if not self._valid_args:
            return

        # set log level read from arguments
        self.set_log_level(verbose=self._verbose, debug=self._debug)

        # read config options
        self._read_config()
        if not self._valid_args:
            return

        # (sanity) check values
        # NOTE(review): unlike the two stages above, '_valid_args' is not
        # checked after this call, so the asserts below still run when the
        # checks failed - confirm this is intended.
        self._check_values()

        tmp_dir = self._directory_temp
        directory_output = self._directory_output
        assert isinstance(tmp_dir, str)
        assert isinstance(directory_output, str)
        if not self.validate_dir(tmp_dir) or not self.validate_dir(
                directory_output, only_parent=True):
            self._valid_args = False
            return

        self._project_file_folder_handler = ProjectFileFolderHandle(
            tmp_dir=tmp_dir,
            output_dir=directory_output,
            time_stamp=None,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

    def _get_mg_analyse_executable(self):
        """
        Build the location of the marker gene analysis main script.

        The script is expected at 'rnahmm/run.py' below the pipeline directory.

        @return: File path
        @rtype: str|unicode
        """
        path_components = (self._directory_pipeline, "rnahmm", "run.py")
        return os.path.join(*path_components)

    def _get_directory_pipeline(self):
        """
        Derive the pipeline location from the location of the running script.

        @return: Location of pipeline
        @rtype: str | unicode
        """
        # sys.argv[0] is the script that was started; symlinks are resolved
        script_path = os.path.realpath(sys.argv[0])
        script_directory = os.path.dirname(script_path)
        return self.get_full_path(script_directory)

    def to_file(self, file_path):
        """
        Write the textual summary of the arguments (see to_string) to a file.

        If the parent directory of the file does not exist, an error is
        logged and nothing is written.

        @param file_path: Location of the file to be written
        @type file_path: str | unicode

        @rtype: None
        """
        assert self.validate_dir(file_path, only_parent=True)
        parent_directory = os.path.dirname(file_path)
        if os.path.isdir(parent_directory):
            with open(file_path, 'w') as stream_output:
                stream_output.write(self.to_string())
            return
        self._logger.error(
            "Directory does not exist: '{}'".format(parent_directory))

    def to_string(self):
        """
		Return arguments as string

		@return: arguments as string
		@rtype: str | unicode
		"""
        result_string = """Parameter:
		_Main_
		Config file:\t\t'{config}'
		Pipeline directory:\t'{pipe}'
		Output directory:\t'{out}'
		Phase:\t\t\t{stage}
		Processors:\t\t{pool}

		_MarkerGeneExtraction_
		Ref. genomes:\t\t'{ir}'
		Ref. 16S:\t\t'{irf}'
		New genomes:\t\t'{i}'

		_MarkerGeneClustering_
		Ref. SILVA:\t\t'{silva}
		Metadata Table in:\t'{im}
		Metadata Table out:\t'{om}
		Distance Cutoff:\t\t{th}

		_MarkerGeneClassification_
		Ref. NCBI:\t\t'{ncbi}'
		OTU dist.:\t\t{otu}
		Min. Clas. dist.:\t{mcd}

""".format(config=self._file_path_config,
           pipe=self._directory_pipeline,
           out=self._directory_output,
           stage=self._phase,
           pool=self._max_processors,
           ir=self._file_path_reference_genome_locations,
           irf=self._file_path_reference_markergene,
           i=self._file_path_query_genomes_location_file,
           im=self._metadata_table_in,
           om=self._project_file_folder_handler.get_file_path_meta_data_table(
           ),
           th=self._distance_cutoff,
           silva=self._silva_reference_directory,
           ncbi=self._ncbi_reference_directory,
           otu=self._otu_distance,
           mcd=self._classification_distance_minimum)
        return result_string

    def _input_valid(self):
        """
        Tell whether all arguments passed validation so far.

        @return: True if arguments valid
        @rtype: bool
        """
        is_valid = self._valid_args
        return is_valid

    def _validate_genome_ids(self):
        """
        Validate the genome ids of reference, query and SILVA data.

        Reference ids and query ids must each be free of duplicates, and the
        query ids must share no id with the reference ids or the SILVA ids.
        On the first violation '_valid_args' is set to False, an error is
        logged and the method returns.

        @rtype: None
        """
        path_reference = self._file_path_reference_genome_locations
        path_query = self._file_path_query_genomes_location_file
        directory_silva = self._silva_reference_directory
        assert isinstance(path_reference, str)
        assert isinstance(path_query, str)
        assert isinstance(directory_silva, str)

        def read_column(file_path, column_index):
            # load one table and hand back a single column of it
            table = MetadataTable(separator=self._separator,
                                  logfile=self._logfile,
                                  verbose=self._verbose)
            table.read(file_path)
            return table.get_column(column_index)

        ids_reference = read_column(path_reference, 0)
        unique_ids_reference = set(ids_reference)
        if len(ids_reference) != len(unique_ids_reference):
            self._valid_args = False
            self._logger.error("Reference genome ids are not unique")
            return

        ids_query = read_column(path_query, 0)
        unique_ids_query = set(ids_query)
        if len(ids_query) != len(unique_ids_query):
            self._valid_args = False
            self._logger.error("Query genome ids are not unique")
            return

        # silva ids are allowed to be not unique
        path_silva_map = os.path.join(directory_silva,
                                      MGCluster.get_file_name_of_map())
        unique_ids_silva = set(read_column(path_silva_map, 1))

        overlap_checks = (
            (unique_ids_reference,
             "Reference and query genomes ids must be unique!"),
            (unique_ids_silva, "Silva and query genomes ids must be unique!"),
        )
        for other_ids, message in overlap_checks:
            if unique_ids_query.isdisjoint(other_ids):
                continue
            self._valid_args = False
            self._logger.error(message)
            return

    def _check_values(self):
        """
        Validate all values read from the arguments and the configuration.

        If any check fails, '_valid_args' is set to False and the genome id
        validation is skipped. Otherwise the genome ids are validated last,
        which may itself set '_valid_args' to False.

        @rtype: None
        """
        if not self._all_values_valid():
            self._valid_args = False
            return
        self._validate_genome_ids()

    def _all_values_valid(self):
        """
        Run the individual value checks in order, stopping at the first failure.

        Side effects: '_directory_temp' falls back to the system temp
        directory and '_phase' falls back to 0 if they are None.

        @return: True if every check passed
        @rtype: bool
        """
        if not self.validate_dir(self._directory_output,
                                 only_parent=True,
                                 key="Output directory"):
            return False

        if not self.validate_dir(self._ncbi_reference_directory,
                                 file_names=self._ncbi_ref_files,
                                 key="NCBI reference directory"):
            return False

        if self._directory_temp is None:
            self._directory_temp = tempfile.gettempdir()
        if not self.validate_dir(self._directory_temp, key="Temp directory"):
            return False

        # Neither command line nor config may have supplied a phase; comparing
        # None with an int raises TypeError on Python 3. The command line help
        # documents 0 (full run) as the default.
        if self._phase is None:
            self._phase = 0

        if self._phase < 2 and not self._extraction_values_valid():
            return False

        if (self._phase == 0
                or self._phase > 1) and not self._clustering_values_valid():
            return False

        if not self.validate_file(self._file_path_query_genomes_location_file,
                                  key="Query genome locations"):
            return False

        if self._max_processors is None:
            self._logger.error("A number of available processors is required!")
            return False
        if not self.validate_number(
                self._max_processors, minimum=1, key="Available processors"):
            return False

        # the same size estimate is used for the temp and the output location
        expected_output_size_gb = self._expected_output_size_in_giga_byte()
        if not self.validate_free_space(
                directory=self._directory_temp,
                required_space_in_gb=expected_output_size_gb):
            return False

        directory_output = self._directory_output
        assert isinstance(directory_output, str)
        if not os.path.exists(directory_output):
            # output directory not created yet, check its parent instead
            directory_output = os.path.dirname(directory_output)
        if not self.validate_free_space(
                directory=directory_output,
                required_space_in_gb=expected_output_size_gb):
            return False

        # nucmer is optional, but a given path must point to an executable;
        # the result of this check used to be discarded
        if self._file_path_nucmer and not self.validate_file(
                self._file_path_nucmer, executable=True):
            return False

        return True

    def _extraction_values_valid(self):
        """
        Check the values needed for the marker gene extraction.

        @return: True if every check passed
        @rtype: bool
        """
        if not self.validate_dir(self._rnaHmmInstallDir,
                                 key="rnaHmmInstallDir"):
            return False

        if not self.validate_dir(self._rnaHmmInstallDir,
                                 file_names=["rna_hmm2.py", "rna_hmm3.py"]):
            return False

        directory_rna_hmm = self._rnaHmmInstallDir
        assert isinstance(directory_rna_hmm, str)
        if self._hmmer == 3:
            if not self.validate_dir(self._hmmerBinDir,
                                     file_names=["hmmsearch"]):
                return False
            directory = self._hmmerBinDir
            assert isinstance(directory, str)
            executable = os.path.join(directory, "hmmsearch")
            if not self.validate_file(executable, executable=True):
                return False
            rna_hmm_wrapper = os.path.join(directory_rna_hmm, "rna_hmm3.py")
        elif self._hmmer == 2:
            if not self.validate_file(
                    self._binary_rnammer, executable=True, key="rnammer"):
                return False
            rna_hmm_wrapper = os.path.join(directory_rna_hmm, "rna_hmm2.py")
        else:
            # no wrapper script exists for any other value; report it instead
            # of passing None on to the file validation
            self._logger.error(
                "'hmmer' must be 2 (rnammer) or 3 (hmmsearch), got: '{}'".
                format(self._hmmer))
            return False

        if not self.validate_file(rna_hmm_wrapper,
                                  executable=True,
                                  key="hmmer{}".format(self._hmmer)):
            return False

        if self._file_path_reference_genome_locations is None and self._file_path_reference_markergene is None:
            self._logger.error(
                "'-ir' or '-irf' Reference genome maping file is required!")
            return False

        # the genome location file takes precedence over the marker gene file
        file_path = self._file_path_reference_genome_locations or self._file_path_reference_markergene
        return bool(self.validate_file(file_path, key="reference genome"))

    def _clustering_values_valid(self):
        """
        Check the values needed for clustering and annotation.

        @return: True if every check passed
        @rtype: bool
        """
        if not self.validate_file(self._metadata_table_in,
                                  key="Metadata file"):
            return False

        if not self.validate_dir(self._silva_reference_directory,
                                 file_names=self._silva_ref_files,
                                 key="SILVA reference directory"):
            return False

        if not self.validate_file(
                self._binary_mothur, executable=True, key="mothur"):
            return False

        # (value, message if missing, name used by the number validation)
        thresholds = (
            (self._distance_cutoff, "A max distance threshold is required!",
             "Max distance threshold"),
            (self._otu_distance, "A threshold is required for otus!",
             "OTU distance threshold"),
            (self._classification_distance_minimum,
             "A minimum classification distance threshold is required!",
             "Minimum classification distance threshold"),
        )
        for value, message_missing, key in thresholds:
            if value is None:
                self._logger.error(message_missing)
                return False
            if not self.validate_number(
                    value, minimum=0, maximum=1, zero=False, key=key):
                return False

        if self._cluster_method is None or self._cluster_method not in self._cluster_method_choices:
            self._logger.error(
                "A clustering method must be chosen: {}!".format(', '.join(
                    self._cluster_method_choices)))
            return False

        return True

    # read the configuration file
    def _read_config(self):
        """
        Read arguments from the configuration file.

        'phase' and 'max_processors' are only taken from the file if they
        are still None; every other value is always read from the file.
        If the configuration file does not validate, '_valid_args' is set to
        False and nothing is read.

        @rtype: None
        """
        if not self.validate_file(self._file_path_config,
                                  key="Configuration file"):
            self._valid_args = False
            return

        config = ConfigParserWrapper(self._file_path_config,
                                     logfile=self._logfile,
                                     verbose=self._verbose)
        self._config = config

        # [Main]
        if self._phase is None:
            self._phase = config.get_value("phase",
                                           is_digit=True,
                                           silent=False)
        self._directory_temp = config.get_value("temp_directory",
                                                is_path=True,
                                                silent=False)
        self._directory_output = config.get_value("output_directory",
                                                  is_path=True,
                                                  silent=False)
        if self._max_processors is None:
            self._max_processors = config.get_value("max_processors",
                                                    is_digit=True)
        self._validate_genomes = config.get_value("validate_genomes",
                                                  is_boolean=True)

        as_path = {"is_path": True}
        as_digit = {"is_digit": True}
        as_boolean = {"is_boolean": True}
        # (attribute to set, option name in the file, value conversion),
        # read in exactly this order
        remaining_options = (
            # [MarkerGeneExtraction]
            ("_binary_rnammer", "rnammer", as_path),
            ("_hmmerBinDir", "hmmerBinDir", as_path),
            ("_rnaHmmInstallDir", "rnaHmmInstallDir", as_path),
            ("_file_path_reference_genome_locations",
             "reference_genomes_file", as_path),
            ("_file_path_map_reference_genome_id_to_tax_id",
             "reference_genomes_map_file", as_path),
            ("_file_path_reference_markergene", "input_reference_fna_file",
             as_path),
            ("_hmmer", "hmmer", as_digit),
            ("_file_path_query_genomes_location_file", "input_genomes_file",
             as_path),
            # [MarkerGeneClustering]
            ("_binary_mothur", "mothur", as_path),
            ("_metadata_table_in", "metadata_table_in", as_path),
            ("_silva_reference_directory", "silva_reference_directory",
             as_path),
            ("_cluster_method", "cluster_method", {}),
            ("_distance_cutoff", "max_threshold", as_digit),
            ("_otu_distance", "otu_distance", as_digit),
            ("_classification_distance_minimum", "classification_distance",
             as_digit),
            # [MarkerGeneAnnotation]
            ("_ncbi_reference_directory", "ncbi_reference_directory",
             as_path),
            ("_file_path_nucmer", "nucmer", as_path),
            ("_annotate_classify", "classify", as_boolean),
            ("_annotate_novelty", "novelty", as_boolean),
            ("_annotate_otu", "otu", as_boolean),
            ("_annotate_ani", "ani", as_boolean),
        )
        for attribute_name, option_name, conversion in remaining_options:
            setattr(self, attribute_name,
                    config.get_value(option_name, **conversion))

    @staticmethod
    def _expected_output_size_in_giga_byte():
        """
        Get expected output size of the data in giga bytes

        Placeholder: no estimate is made yet, the result is always 0.
        @todo: Write a good predicting algorithm

        @return: Expected output size of the data in giga bytes
        @rtype: int|long
        """
        return 0

    def _read_options(self, options):
        """
        Copy the values parsed by argparse into the instance.

        The configuration file path is expanded to a full path unless it is
        None.

        @param options: parser.parse_args()
        @type options: Any

        @rtype: None
        """
        path_config = options.config_file
        self._file_path_config = (None if path_config is None else
                                  self.get_full_path(path_config))
        self._verbose = options.verbose
        self._debug = options.debug_mode
        self._phase = options.phase
        self._max_processors = options.max_processors

    @staticmethod
    def _get_parser_options(args=None, version="Prototype"):
        """
		Parsing of passed arguments.

		@param args: Passed arguemnts

		@return: any
		"""
        description = """
	#######################################
	#    GenomeAnnotationPipeline         #
	#    Version {}#
	#######################################

	Pipeline for the extraction of marker genes, clustering and taxonomic classification""".format(
            version.ljust(25))
        parser = argparse.ArgumentParser(
            usage="python %(prog)s configuration_file_path",
            version="MetagenomeSimulationPipeline TC {}".format(version),
            description=description,
            formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument("-verbose",
                            "--verbose",
                            action='store_true',
                            default=False,
                            help="display more information!")
        parser.add_argument("-debug",
                            "--debug_mode",
                            action='store_true',
                            default=False,
                            help="tmp folders will not be deleted!")
        parser.add_argument(
            "-log",
            "--logfile",
            type=str,
            default=None,
            help="pipeline output will written to this log file")

        group_input = parser.add_argument_group('optional config arguments')
        group_input.add_argument("-p",
                                 "--max_processors",
                                 default=None,
                                 type=int,
                                 help="number of available processors")
        group_input.add_argument("-s",
                                 "--phase",
                                 default=None,
                                 type=int,
                                 choices=[0, 1, 2, 3],
                                 help='''
0 -> Full run (Default)
1 -> Marker gene extraction
2 -> Gene alignment and clustering
3 -> Annotation of Genomes
''')
        group_input = parser.add_argument_group('required')
        group_input.add_argument(
            "config_file",
            type=str,
            default=None,
            help="path to the configuration file of the pipeline")

        if args is None:
            return parser.parse_args()
        else:
            return parser.parse_args(args)