示例#1
0
    def _read_genome_location_file(self, file_path):
        """
        Read the file that lists the file paths of genomes

        Every row of the file is unpacked into exactly two values: a genome id
        and the path of the file that belongs to this genome.

        @param file_path: File with a genome id and the file path of that genome in each row
        @type file_path: str | unicode

        @return: Dictionary of genome id to file path
        @rtype: dict[str|unicode, str|unicode]

        @raise AssertionError: If 'file_path' or a genome file path does not validate,
            if a genome id or a genome file path is empty, or if a genome id occurs more than once
        """
        self._logger.info('Reading genome location file')
        assert self.validate_file(file_path)
        dict_id_file_path = {}
        metadata_table = MetadataTable(logfile=self._logfile,
                                       verbose=self._verbose,
                                       separator=self._separator)
        genome_rows = metadata_table.parse_file(file_path, as_list=True)
        for genome_id, file_path_genome in genome_rows:
            assert genome_id != '', "Invalid genomid: '{}'".format(genome_id)

            # Report the offending path together with the genome it belongs to;
            # printing only the genome id hides which path was rejected.
            invalid_path_message = "Invalid file path for genome '{}': '{}'".format(
                genome_id, file_path_genome)
            assert file_path_genome != '', invalid_path_message
            assert self.validate_file(file_path_genome), invalid_path_message

            # check uniqueness
            assert genome_id not in dict_id_file_path, "Genome '{}' not unique in the genome location file!".format(
                genome_id)
            dict_id_file_path[genome_id] = file_path_genome
        return dict_id_file_path
    def write_taxonomic_profile_from_abundance_files(self,
                                                     metadata_table,
                                                     list_of_file_paths,
                                                     directory_output,
                                                     sample_id=""):
        """
        Create one taxonomic profile file per relative abundance file

        The name of every profile file is built from the filename template of
        this object and the position of the abundance file in 'list_of_file_paths'.

        @param metadata_table: Contains metadata of all communities
        @type metadata_table: MetadataTable
        @param list_of_file_paths: List of abundance file paths
        @type list_of_file_paths: list[str | unicode]
        @param directory_output: Profiles are written in this directory
        @type directory_output: str | unicode
        @param sample_id: Identifier of a sample
        @type sample_id: str | unicode
        """
        abundance_parser = MetadataTable(logfile=self._logfile,
                                         verbose=self._verbose)
        for position, abundance_file_path in enumerate(list_of_file_paths):
            abundances = abundance_parser.parse_file(abundance_file_path,
                                                     column_names=False)
            # One output file per input file, numbered by input position.
            profile_file_name = self._filename_taxonomic_profile.format(
                sample_index=position)
            profile_file_path = os.path.join(directory_output,
                                             profile_file_name)
            with open(profile_file_path, 'w') as profile_stream:
                self.write_taxonomic_profile(
                    abundances, profile_stream, metadata_table, sample_id)
示例#3
0
    def _read_distribution_file(self, file_path):
        """
        Read file with the distribution of a sample

        Every row of the file is unpacked into exactly two values: a genome id
        and the abundance of this genome. The abundances are divided by their
        sum, so the returned values add up to 1.

        @param file_path: File with a genome id and the abundance of that genome in each row
        @type file_path: str | unicode

        @return: Dictionary of genome id to normalised abundance
        @rtype: dict[str|unicode, float]

        @raise AssertionError: If 'file_path' does not validate, if a genome id or an abundance
            is empty or does not validate, or if a genome id occurs more than once
        @raise ZeroDivisionError: If abundances were read but their sum is zero
        """
        self._logger.info('Reading distribution file')
        assert self.validate_file(file_path)
        dict_id_abundance = {}
        metadata_table = MetadataTable(logfile=self._logfile, verbose=self._verbose, separator=self._separator)
        iterator_distributions = metadata_table.parse_file(file_path, as_list=True)
        abundance_sum = 0.
        for genome_id, abundance in iterator_distributions:
            assert genome_id != '', "Invalid genom id: '{}'".format(genome_id)

            # Built before the float conversion so the message shows the raw value from the file.
            invalid_abundance_message = "Invalid abundance for genome '{}': '{}'".format(genome_id, abundance)
            assert abundance != '', invalid_abundance_message
            abundance = float(abundance)
            assert self.validate_number(abundance, zero=True), invalid_abundance_message

            assert genome_id not in dict_id_abundance, "Genome '{}' not unique in the distribution file!".format(genome_id)
            dict_id_abundance[genome_id] = abundance
            abundance_sum += abundance

        if not dict_id_abundance:
            # Nothing was read, so there is nothing to normalise.
            return dict_id_abundance
        if abundance_sum == 0:
            # Zero abundances are accepted per row, so the total can be zero.
            # Keep the exception type of the plain division but say what went wrong.
            raise ZeroDivisionError(
                "Abundances in '{}' sum up to zero and can not be normalised".format(file_path))

        # normalise to 1
        return {
            genome_id: abundance / abundance_sum
            for genome_id, abundance in dict_id_abundance.items()}
    def merge_communities(self, list_of_communities,
                          list_of_comunity_distribution_file_paths,
                          index_sample, file_path_output):
        """
        Combine distributions of communities and adjust them according to their ratio.

        The distribution files are parsed twice. The first pass sums up, per
        community, the abundances found in column 'index_sample + 1' and makes
        sure that no genome id (column 0) occurs twice across all files.
        From these totals a factor 'ratio / (community total / sample total)'
        is calculated for each community. The files are then parsed again and
        passed, together with the factors, to '_write_joined_community', which
        writes to 'file_path_output'.

        @param list_of_communities: List of community inputs
        @type list_of_communities: list[Community]
        @param list_of_comunity_distribution_file_paths: List of distributions
        @type list_of_comunity_distribution_file_paths: list[str | unicode]
        @param index_sample: Index of sample
        @type index_sample: int | long
        @param file_path_output: Sample distribution file path
        @type file_path_output: str | unicode

        @return: Nothing
        @rtype: None

        @raise AssertionError: If 'list_of_communities' is not a list of Community
        @raise ValueError: If a genome id occurs more than once
        @raise ZeroDivisionError: If the sample or one of its communities has no abundance
        """
        assert isinstance(list_of_communities, list)
        for community in list_of_communities:
            assert isinstance(community, Community)

        metadata_table_community = MetadataTable(logfile=self._logfile,
                                                 verbose=self._verbose)

        # first pass: total abundance of each community in this sample
        list_of_community_total_abundance = [0] * len(list_of_communities)
        genomes = set()
        for index_community, file_path in enumerate(
                list_of_comunity_distribution_file_paths):
            community_distribution = metadata_table_community.parse_file(
                file_path, column_names=False)
            try:
                for row in community_distribution:
                    genome_id = row[0]
                    if genome_id in genomes:
                        raise ValueError(
                            "Genome id '{}' not unique".format(genome_id))
                    genomes.add(genome_id)
                    abundance = row[index_sample + 1]
                    list_of_community_total_abundance[index_community] += float(
                        abundance)
            finally:
                # Release the parser even if a row is rejected half way through the file.
                community_distribution.close()

        sample_total_abundance = sum(list_of_community_total_abundance)

        # factor that scales each community to its requested ratio
        list_of_community_factor = [0.0] * len(list_of_communities)
        for index_community in range(
                len(list_of_comunity_distribution_file_paths)):
            ratio = float(list_of_communities[index_community].ratio)
            community_total_abundance = float(
                list_of_community_total_abundance[index_community])
            if sample_total_abundance == 0:
                raise ZeroDivisionError(
                    "Total abundance of sample {} is zero".format(
                        index_sample))
            current_proportion_in_sample = community_total_abundance / float(
                sample_total_abundance)
            if current_proportion_in_sample == 0:
                raise ZeroDivisionError(
                    "Community {} has no abundance in sample {}".format(
                        index_community, index_sample))
            list_of_community_factor[
                index_community] = ratio / current_proportion_in_sample

        # join communities: the files are parsed a second time for writing
        communities = [
            metadata_table_community.parse_file(file_path, column_names=False)
            for file_path in list_of_comunity_distribution_file_paths]
        try:
            with open(file_path_output, 'w') as stream_output:
                self._write_joined_community(communities,
                                             list_of_community_factor,
                                             index_sample, stream_output)
        finally:
            # Close every parsed distribution, as is done after the first pass.
            for community_distribution in communities:
                community_distribution.close()