Example #1
    def _get_genome_id_to_path_map(
            self, file_path_of_file_mapping_genome_id_to_paths,
            list_of_drawn_genome_id):
        """
		Get a dictionary mapping genome id to the path of their genome

		@param file_path_of_file_mapping_genome_id_to_paths: File path to file with format 'id \t path'
		@type file_path_of_file_mapping_genome_id_to_paths: str | unicode
		@param list_of_drawn_genome_id: List of genome identifiers
		@type list_of_drawn_genome_id: list[str|unicode]

		@return: genome ids mapped to their gnome file path
		@rtype: dict[str|unicode, str|unicode]
		"""
        genome_id_to_path_map = {}
        mdt = MetadataTable(logfile=self._logfile, verbose=self._verbose)
        mdt.read(file_path_of_file_mapping_genome_id_to_paths)
        if mdt.get_number_of_rows() > 0:
            genome_id_to_path_map = mdt.get_map(0, 1, unique_key=True)
        msg = "'{}' is missing one or more genome ids".format(
            os.path.basename(file_path_of_file_mapping_genome_id_to_paths))
        assert set(genome_id_to_path_map.keys()).issuperset(
            list_of_drawn_genome_id), msg
        return {
            genome_id: genome_id_to_path_map[genome_id]
            for genome_id in list_of_drawn_genome_id
        }
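
The mapping file used above is a plain two-column, tab-separated list in the 'id \t path' format. The following stand-alone sketch builds the same restricted dictionary without MetadataTable; the function name is hypothetical and only illustrates the expected file format.

# Hypothetical stand-alone equivalent of the lookup in Example #1.
# Assumes a tab-separated file with one 'genome_id<TAB>path' entry per line.
def load_genome_paths(file_path, list_of_drawn_genome_id):
    genome_id_to_path = {}
    with open(file_path) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if not line:
                continue
            genome_id, path = line.split("\t", 1)
            genome_id_to_path[genome_id] = path
    missing = set(list_of_drawn_genome_id) - set(genome_id_to_path)
    assert not missing, "missing genome ids: '{}'".format("', '".join(sorted(missing)))
    return {genome_id: genome_id_to_path[genome_id] for genome_id in list_of_drawn_genome_id}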
Example #2
    def _generate_gsa_pooled(self):
        """
        Create a perfect assembly of the reads of all samples.
            merge all sample bam files and create a assembly of all of them
            - create folder reads_on_genomes wherever you are
            - merge bamfiles from list_of_bamdirs into this dirs
            - run gsa for reads_on_genomes
            - create mapping

        @return: file paths of assembly
        @rtype: str|unicode
        """
        meta_data_table = MetadataTable(separator=self._separator,
                                        logfile=self._logfile,
                                        verbose=self._verbose)

        gs_handler = GoldStandardAssembly(
            file_path_samtools=self._executable_samtools,
            max_processes=self._max_processors,
            tmp_dir=self._project_file_folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose)

        file_path_genome_locations = self._project_file_folder_handler.get_genome_location_file_path()
        meta_data_table.read(file_path_genome_locations)
        dict_id_to_file_path_fasta = meta_data_table.get_map(0, 1)

        # list_of_directory_bam = [
        #     self._project_file_folder_handler.get_bam_dir(str(sample_index))
        #  for sample_index in range(self._number_of_samples)]
        list_of_directory_bam = self._project_file_folder_handler.get_bam_dirs()
        list_of_sample_folders = [
            os.path.basename(os.path.dirname(directory_bam))
            for directory_bam in list_of_directory_bam
        ]
        self._logger.info("Samples used for pooled assembly: '{}'".format(
            "', '".join(list_of_sample_folders)))

        file_path_output_gsa_pooled = gs_handler.pooled_gold_standard_by_dir(
            list_of_directory_bam, dict_id_to_file_path_fasta)

        if not self._phase_anonymize:
            gsa_pooled_output = self._project_file_folder_handler.get_gsa_pooled_file_path()
            if self._phase_compress:
                self._list_tuple_archive_files.append(
                    (file_path_output_gsa_pooled, gsa_pooled_output + ".gz"))
            else:
                shutil.move(file_path_output_gsa_pooled, gsa_pooled_output)

        return file_path_output_gsa_pooled
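
When compression is enabled above, the pooled assembly is not compressed immediately; a (source, destination + ".gz") tuple is queued in self._list_tuple_archive_files for a later archiving step. A minimal sketch of what such a consumer could look like, using only the standard library (the function is hypothetical, not the pipeline's actual archiving code):

import gzip
import shutil

def compress_archive_files(list_tuple_archive_files):
    # Gzip each queued (source, destination) pair, e.g. 'pooled.fasta' -> '<target>.gz'.
    for file_path_source, file_path_gz in list_tuple_archive_files:
        with open(file_path_source, "rb") as reader, gzip.open(file_path_gz, "wb") as writer:
            shutil.copyfileobj(reader, writer)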
Example #3
    def get_dict_gid_to_genome_file_path(self):
        """
        Get map genome id to genome file path

        @return: Genome id to geone file path
        @rtype: dict[str|unicode, str|unicode]
        """
        meta_data_table = MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)

        file_path_genome_locations = self._project_file_folder_handler.get_genome_location_file_path()
        if not self._validator.validate_file(file_path_genome_locations, silent=True):
            msg = "Required file not found! Was design of communities not completed?"
            raise RuntimeError(msg)
        meta_data_table.read(file_path_genome_locations)
        return meta_data_table.get_map(0, 1)
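
A hypothetical usage of the accessor above; 'community_design' merely stands in for whatever object exposes get_dict_gid_to_genome_file_path():

try:
    dict_gid_to_fasta = community_design.get_dict_gid_to_genome_file_path()
except RuntimeError as error:
    # Raised when the genome location file does not exist yet,
    # i.e. the design of communities was not completed.
    print("Community design incomplete: {}".format(error))
else:
    for genome_id, file_path_fasta in dict_gid_to_fasta.items():
        print("{}\t{}".format(genome_id, file_path_fasta))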
Example #4
    def get_dict_unique_id_to_genome_file_path(self, file_path_mapping):
        """
            Get a map, original sequence name to anonymous sequence name from a mapping file.

            @attention: anonymous name in second column

            @param file_path_mapping: File path to mapping file
            @type file_path_mapping: str | unicode

            @return: Mapping of original sequence name to anonymous sequence name
            @rtype: dict[str | unicode, str | unicode]
        """
        assert isinstance(file_path_mapping, str)

        table = MetadataTable(logfile=self._logfile, verbose=self._verbose)
        table.read(file_path_mapping, separator=self._separator)
        dict_mapping = table.get_map(0, 1)
        return dict_mapping
Example #5
    def _get_genome_id_to_path_map(self, file_path):
        """
		Get a map of genome_id to genome path

		@param file_path: File path
		@type file_path: str | unicode

		@return: map of genome_id to genome path
		@rtype: dict[str|unicode, str|unicode]
		"""
        assert self.validate_file(file_path)

        data_table = MetadataTable(separator=self._separator,
                                   logfile=self._logfile,
                                   verbose=self._verbose)
        data_table.read(file_path, column_names=False)
        if data_table.get_number_of_rows() == 0:
            self._logger.warning("No data in file '{}'.".format(file_path))
            return {}
        dict_genome_id_to_path = data_table.get_map(0, 1)
        return dict_genome_id_to_path
Example #6
    def get_dict_genome_id_to_tax_id(self, file_path_metadata):
        """
            Get a map, genome id to taxonomic id from a metadata file.

            @attention: "genome_ID" and "NCBI_ID" assumed default column names.

            @param file_path_metadata: File path to metadata file
            @type file_path_metadata: str | unicode

            @return: Mapping of genome id to taxonomic id
            @rtype: dict[str | unicode, str | unicode]
        """
        assert isinstance(file_path_metadata, str)
        assert isinstance(self._column_name_gid, str)
        assert isinstance(self._column_name_ncbi, str)
        assert isinstance(self._separator, str)

        table = MetadataTable(logfile=self._logfile, verbose=self._verbose)
        table.read(file_path_metadata, separator=self._separator, column_names=True)
        dict_genome_id_to_tax_id = table.get_map(self._column_name_gid, self._column_name_ncbi)
        return dict_genome_id_to_tax_id
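
For reference, the metadata file read here is tab-separated with a header row, and the relevant column names are whatever self._column_name_gid and self._column_name_ncbi hold ("genome_ID" and "NCBI_ID" by the docstring's assumption). A hypothetical call; the file name and the 'annotator' object are placeholders:

# metadata.tsv (hypothetical, tab-separated, with header row):
#   genome_ID<TAB>NCBI_ID
#   Genome1.0<TAB>562
#   Genome2.0<TAB>1280
dict_genome_id_to_tax_id = annotator.get_dict_genome_id_to_tax_id("metadata.tsv")
# -> {'Genome1.0': '562', 'Genome2.0': '1280'}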
Example #7
    def _get_genome_id_to_path_map(
            self, file_path_of_file_mapping_genome_id_to_paths,
            list_of_drawn_genome_id):
        """
		Get a dictionary mapping genome id to the path of their genome

		@param file_path_of_file_mapping_genome_id_to_paths: File path to file with format 'id \t path'
		@type file_path_of_file_mapping_genome_id_to_paths: str | unicode
		@param list_of_drawn_genome_id: List of genome identifiers
		@type list_of_drawn_genome_id: list[str|unicode]

		@return: genome ids mapped to their gnome file path
		@rtype: dict[str|unicode, str|unicode]
		"""
        mdt = MetadataTable(logfile=self._logfile, verbose=self._verbose)
        mdt.read(file_path_of_file_mapping_genome_id_to_paths)
        genome_id_to_path_map = mdt.get_map(0, 1, unique_key=True)
        assert set(
            genome_id_to_path_map.keys()).issuperset(list_of_drawn_genome_id)
        return {
            genome_id: genome_id_to_path_map[genome_id]
            for genome_id in list_of_drawn_genome_id
        }
Example #8
    def marker_gene_annotation(self):
        """As the third step, the unpublished genomes are classified based on the clusters they are found in.
		Since clusters were made in 0.01 distance steps, the classification can be done using the smallest clusters first, using bigger ones if a classification can not be made.
		If a marker gene of an unpublished genome is found in a cluster together with references, a common taxon that 90% of sequences agree with will be the predicted taxon.
		The 90% is arbitrary chosen and is required because of taxonomic inconsistencies.
		When a specific rank is checked for agreement, sequences with unknown classification on that rank are ignored.
		TODO: check for taxonomic consitency on higher ranks for those!
		Novelty prediction is based on the predicted taxon's rank. a high rank (phylum, order, class) with low distance can be a strong indicator for taxonomic inconsistencies.
		But it could also be caused by sequences that are not fully classified, yet.
		input:
		- meta data table with a list of the genomes that are to be classified
		- working directory where the results will be saved and which contains the mothur formatted file with the clusters
		output:
		- meta data table with a list of the genomes, with columns added that contain cluster based tax prediction, rank and novelty prediction

		@rtype: None
		"""
        # set of taxonomic ids of well known genomes
        data_table = MetadataTable(separator=self._separator,
                                   logfile=self._logfile,
                                   verbose=self._verbose)
        data_table.read(self._file_path_map_reference_genome_id_to_tax_id)
        list_of_reference_ncbi_id = data_table.get_column(1)

        # mapping of all internal ids
        # data_table_iid_mapping_silva = MetadataTable(
        # 	separator=self._separator, logfile=self._logfile, verbose=self._verbose)
        # file_path_silva_map = os.path.join(self._silva_reference_directory, MGCluster.get_file_name_of_map())
        # data_table_iid_mapping_silva.read(file_path_silva_map)
        data_table_iid_mapping = MetadataTable(separator=self._separator,
                                               logfile=self._logfile,
                                               verbose=self._verbose)
        data_table_iid_mapping.read(
            self._project_file_folder_handler.get_file_path_internal_id_map())
        # data_table_iid_mapping.concatenate(data_table_iid_mapping_silva, strict=False)

        mg_annotate = MGAnnotate(
            # ncbi_reference_directory=self._ncbi_reference_directory,
            file_path_query_genomes_location=self._file_path_query_genomes_location_file,
            file_path_reference_genomes_location=self._file_path_reference_genome_locations,
            file_path_reference_taxid_map=self._file_path_map_reference_genome_id_to_tax_id,
            file_path_nucmer=self._file_path_nucmer,
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu_id,
            column_name_novelty_category=self._column_name_cluster_novelty,
            column_name_ncbi=self._column_name_ncbi,
            column_name_scientific_name=self._column_name_cluster_scientific_name,
            column_name_ani=self._column_name_ani,
            column_name_ani_novelty=self._column_name_ani_novelty,
            column_name_ani_ncbi=self._column_name_ani_compare,
            column_name_ani_scientific_name=self._column_name_ani_scientific_name,
            temp_directory=self._directory_temp,
            max_processors=self._max_processors,
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        metadata_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)
        metadata_table.read(self._metadata_table_in, column_names=True)
        metadata_table.remove_empty_columns()

        list_query_gid = metadata_table.get_column(self._column_name_genome_id)
        if list_query_gid is None:
            msg = "Meta data file does not contain the required header '{}'".format(
                self._column_name_genome_id)
            self._logger.error(msg)
            raise IOError(msg)

        taxonomy = NcbiTaxonomy(self._ncbi_reference_directory,
                                verbose=self._verbose,
                                logfile=self._logfile)

        mothur_cluster = MothurCluster(
            self._precision,
            iid_gid_mapping=data_table_iid_mapping.get_map(0, 1),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)
        mothur_cluster.read(
            self._project_file_folder_handler.get_file_path_cluster_mg_16s(),
            list_query_gid)

        taxonomy_cluster = TaxonomicCluster(
            mothur_cluster,
            taxonomy,
            iid_tid_map=data_table_iid_mapping.get_map(0, 2),
            set_reference_genome_ncbi=set(list_of_reference_ncbi_id),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        if self._annotate_classify:
            self._logger.info("Taxonomic classification")
            # also, novelty based clustering
            mg_annotate.taxonomic_classification(
                metadata_table, mothur_cluster, taxonomy_cluster, taxonomy,
                self._classification_distance_minimum)
            self._logger.info("Taxonomic classification Done")

        if self._annotate_novelty:
            self._logger.info("Novelty categorisation")
            # novelty by comparing with reference taxonomic ids
            mg_annotate.novelty_categorisation(taxonomy,
                                               set(list_of_reference_ncbi_id),
                                               metadata_table)
            self._logger.info("Novelty categorisation Done")

        if self._annotate_otu:
            self._logger.info("OTU")
            mg_annotate.set_otu_id(metadata_table, mothur_cluster,
                                   self._otu_distance)
            self._logger.info("OTU Done")

        if self._annotate_ani:
            self._logger.info("Calculating ANI")
            mg_annotate.calculate_ani(mothur_cluster, taxonomy, metadata_table,
                                      self._distance_cutoff,
                                      self._ani_minimum_alignment)
            self._logger.info("Calculating ANI Done")
        metadata_table.write(
            self._project_file_folder_handler.get_file_path_meta_data_table(),
            column_names=True)
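
The 90% agreement rule described in the docstring can be sketched in isolation as follows; this is a simplified illustration, not the actual MGAnnotate/TaxonomicCluster implementation:

from collections import Counter

def predict_taxon_at_rank(taxa_at_rank, agreement=0.9):
    # Sequences with unknown classification at the checked rank are ignored;
    # a taxon is only returned if at least `agreement` of the remaining
    # sequences in the cluster share it.
    known = [taxon for taxon in taxa_at_rank if taxon is not None]
    if not known:
        return None
    taxon, count = Counter(known).most_common(1)[0]
    if count >= agreement * len(known):
        return taxon
    return None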
Example #9
    def __init__(self,
                 mg_analyse_executable,
                 file_path_query_genome_file_paths,
                 file_path_reference_genome_file_paths,
                 file_path_name_reference_marker_genes,
                 config_path,
                 file_path_map_reference_genome_id_to_tax_id=None,
                 max_processors=1,
                 temp_directory=None,
                 separator="\t",
                 logfile=None,
                 verbose=False,
                 debug=False):
        """
		Constructor

		@param mg_analyse_executable: File path to modified tool of Ivan
		@type mg_analyse_executable: str | unicode
		@param file_path_query_genome_file_paths: File path to file with the location of genomes to be classified
		@type file_path_query_genome_file_paths: str | unicode
		@param file_path_reference_genome_file_paths: File path to file with the location of reference genomes
		@type file_path_reference_genome_file_paths: str | unicode
		@param file_path_name_reference_marker_genes: File path to fasta file with list of marker gene sequences
		@type file_path_name_reference_marker_genes: str | unicode
		@param config_path: File path to configuration file
		@type config_path: str | unicode
		@param file_path_map_reference_genome_id_to_tax_id: Mapping of Reference genome_id to their taxonomic assignment
		@type file_path_map_reference_genome_id_to_tax_id: str | unicode
		@param max_processors: Amount of available processors
		@type max_processors: int | long
		@param temp_directory: File path to temporary storage
		@type temp_directory: str | unicode
		@param separator: Separator of metadata files
		@type separator: str | unicode
		"""
        super(MGExtract, self).__init__(logfile=logfile,
                                        verbose=verbose,
                                        debug=debug)
        assert file_path_map_reference_genome_id_to_tax_id is None or self.validate_file(
            file_path_map_reference_genome_id_to_tax_id)
        assert self.validate_file(file_path_query_genome_file_paths)
        assert file_path_reference_genome_file_paths is None or self.validate_file(
            file_path_reference_genome_file_paths)
        assert file_path_name_reference_marker_genes is None or self.validate_file(
            file_path_name_reference_marker_genes)
        assert self.validate_file(config_path)
        assert self.validate_file(mg_analyse_executable, executable=True)
        assert self.validate_number(max_processors, minimum=1)
        assert self.validate_dir(temp_directory)
        self._temp_directory = temp_directory
        self._mg_analyse_executable = mg_analyse_executable
        self._file_path_query_genome_file_paths = file_path_query_genome_file_paths
        self._file_path_reference_genome_file_paths = file_path_reference_genome_file_paths
        self._file_path_reference_marker_genes = file_path_name_reference_marker_genes
        self._config_path = config_path
        self._max_processors = max_processors
        self._debug = debug
        self._working_dirs = {}
        self._genome_id_to_tax_id = None
        self._separator = separator
        if file_path_map_reference_genome_id_to_tax_id is None:
            return
        meta_data_table = MetadataTable(separator=self._separator,
                                        logfile=logfile,
                                        verbose=verbose)
        meta_data_table.read(file_path_map_reference_genome_id_to_tax_id,
                             column_names=False)
        self._genome_id_to_tax_id = meta_data_table.get_map(0, 1)
        del meta_data_table
Example #10
    def __init__(self,
                 file_path_query_genomes_location,
                 file_path_reference_genomes_location,
                 file_path_reference_taxid_map,
                 file_path_nucmer="nucmer",
                 minimum_alignment=0.8,
                 separator='\t',
                 temp_directory=None,
                 max_processors=1,
                 logfile=None,
                 verbose=False,
                 debug=False):
        """
		Constructor

		@param file_path_query_genomes_location:
		@type file_path_query_genomes_location: str|unicode
		@param file_path_reference_genomes_location:
		@type file_path_reference_genomes_location: str|unicode
		@param file_path_reference_taxid_map:
		@type file_path_reference_taxid_map: str|unicode
		@param file_path_nucmer:
		@type file_path_nucmer: str|unicode
		@param minimum_alignment:
		@type minimum_alignment: str|unicode|int|long|float
		@param separator:
		@type separator: str|unicode
		@param temp_directory:
		@type temp_directory: str|unicode
		@param max_processors:
		@type max_processors: int|long
		@param logfile: file handler or file path to a log file
		@type logfile: file | FileIO | StringIO | basestring
		@param verbose: Not verbose means that only warnings and errors will be past to stream
		@type verbose: bool
		@param debug: Display debug messages
		@type debug: bool

		@rtype: None
		"""
        assert self.validate_file(file_path_query_genomes_location)
        assert self.validate_file(file_path_reference_genomes_location)
        assert self.validate_file(file_path_reference_taxid_map)
        assert self.validate_file(file_path_nucmer, executable=True)
        assert temp_directory is None or self.validate_dir(temp_directory)
        assert isinstance(minimum_alignment, (int, float))
        assert self.validate_number(minimum_alignment, minimum=0, maximum=1)
        assert isinstance(separator, basestring)
        assert isinstance(max_processors, (int, long))
        assert self.validate_number(max_processors, minimum=1)
        super(ANIm, self).__init__(logfile=logfile,
                                   verbose=verbose,
                                   debug=debug)
        self._CUM_RETVALS = 0
        self._max_processors = max_processors
        self._file_path_nucmer = file_path_nucmer
        self._tmp_dir = temp_directory
        self._separator = separator
        if temp_directory is None:
            self._tmp_dir = tempfile.mkdtemp()
        else:
            self._tmp_dir = tempfile.mkdtemp(dir=temp_directory)
        self._cmd_lines = []
        data_table = MetadataTable(separator=self._separator,
                                   logfile=self._logfile,
                                   verbose=self._verbose)
        data_table.read(file_path_query_genomes_location)
        self._query_gid_to_location = data_table.get_map(0, 1)
        data_table.read(file_path_reference_genomes_location)
        self._reference_gid_to_location = data_table.get_map(0, 1)
        data_table.read(file_path_reference_taxid_map)
        self._reference_gid_to_taxid = data_table.get_map(0, 1)
        self._total_lengths = {}
        self._minimum_alignment = minimum_alignment
        self._used_file_names = {}