Пример #1
0
    def _write(self):
        """Write the closest reference hit of every genome to ``self.path``.

        The output is a tab-separated table: a header line followed by one row
        per genome ID in ``self.genomes`` (in sorted order). For each genome,
        hits with an ``af`` below ``self.min_af`` are discarded and, of the
        rest, the hit with the highest ``ani`` (ties broken by the highest
        ``af``) is reported together with the reference taxonomy. The last
        column is the result of testing the hit against the ANI radius
        returned by ``self.gtdb_radii.get_rep_ani`` and ``self.gtdb_min_af``.

        Genomes that have no entry in ``self.results``, or no hit passing the
        ``af`` threshold, get ``no result`` in every column after the genome
        ID, so every row has as many columns as the header.
        """
        columns = ('user_genome', 'reference_genome', 'fastani_ani', 'fastani_af',
                   'reference_taxonomy', 'satisfies_gtdb_circumscription_criteria')
        # One placeholder per column after the genome ID; derived from the
        # header so both "no result" cases stay aligned with it.
        no_result = '\t'.join(['no result'] * (len(columns) - 1))

        with open(self.path, 'w') as fh:
            fh.write('\t'.join(columns) + '\n')
            for gid in sorted(self.genomes):
                candidates = []
                if gid in self.results:
                    candidates = [(ref_gid, hit)
                                  for ref_gid, hit in self.results[gid].items()
                                  if hit['af'] >= self.min_af]
                if not candidates:
                    fh.write(f'{gid}\t{no_result}\n')
                    continue

                # Best hit: highest ANI first, then highest AF. min() returns
                # the first of several equally good hits, like a stable sort.
                ref_gid, hit = min(candidates,
                                   key=lambda x: (-x[1]['ani'], -x[1]['af']))
                canonical_rid = canonical_gid(ref_gid)
                taxonomy_str = ';'.join(self.taxonomy[canonical_rid])
                gtdb_ani_radius = self.gtdb_radii.get_rep_ani(canonical_rid)
                closest_ani = hit['ani']
                closest_af = hit['af']
                satisfies = closest_ani >= gtdb_ani_radius and closest_af >= self.gtdb_min_af

                fh.write(f'{gid}\t{ref_gid}\t{closest_ani}\t{closest_af}'
                         f'\t{taxonomy_str}\t{satisfies}\n')
        self.logger.info(f'Closest representative hits saved to: {self.path}')
Пример #2
0
 def _read(self):
     """Load ``self.path`` into the representative and species indices.

     Every line must consist of exactly three tab-separated fields: species,
     genome ID and ANI. The genome ID is passed through ``canonical_gid`` and
     the ANI is converted to ``float``. Afterwards ``self._rep_idx`` maps each
     genome to its species and ANI, and ``self._species_idx`` maps each
     species to its genome and ANI.
     """
     # Start from empty indices; they are reset even if the file cannot be opened.
     self._rep_idx = dict()
     self._species_idx = dict()

     with open(self.path) as fh:
         rows = fh.readlines()
         for row in rows:
             fields = row.strip().split('\t')
             species, raw_gid, raw_ani = fields
             rep_gid = canonical_gid(raw_gid)
             ani_value = float(raw_ani)
             self._rep_idx[rep_gid] = dict(species=species, ani=ani_value)
             self._species_idx[species] = dict(rep=rep_gid, ani=ani_value)
Пример #3
0
 def _write(self):
     """Write every query/reference hit in ``self.results`` to ``self.path``.

     The output is a tab-separated table with a header line. Query genomes are
     written in sorted order; the hits of each query are ordered by descending
     ``af``, then descending ``ani``, then ascending reference ID. The
     taxonomy column is the ``;``-joined entry of ``self.taxonomy`` for the
     reference ID after it has been passed through ``canonical_gid``.
     """
     def hit_order(item):
         # Negated scores make an ascending sort put the best hits first.
         rid, hit = item
         return -hit['af'], -hit['ani'], rid

     with open(self.path, 'w') as fh:
         fh.write('user_genome\treference_genome\tfastani_ani\tfastani_af\treference_taxonomy\n')
         for qry_gid, ref_hits in sorted(self.results.items()):
             ordered_hits = sorted(ref_hits.items(), key=hit_order)
             for ref_gid, ref_hit in ordered_hits:
                 lineage = ';'.join(self.taxonomy[canonical_gid(ref_gid)])
                 fh.write(f'{qry_gid}\t{ref_gid}\t{ref_hit["ani"]}\t{ref_hit["af"]}\t{lineage}\n')
     self.logger.info(f'Summary of results saved to: {self.path}')
Пример #4
0
    def read(self,
             taxonomy_file: str,
             canonical_ids: bool = False) -> Dict[str, List[str]]:
        """Read Greengenes-style taxonomy file.

        Expected format is:
            <id>\t<taxonomy string>

        where the taxonomy string has the formats:
            d__; p__; c__; o__; f__; g__; s__

        Parameters
        ----------
        taxonomy_file : str
            Path to a Greengenes-style taxonomy file.
        canonical_ids : bool
            True if to use the canonical ID format, False otherwise.

        Returns
        -------
        Dict[str, List[str]]
            d[unique_id] -> list of the ``;``-separated taxa, each stripped of
            surrounding whitespace.

        Raises
        ------
        GTDBTkExit
            If a line does not consist of exactly two tab-separated fields.

        Any exception raised while parsing a line is logged together with the
        1-based line number and then re-raised. Errors raised while opening
        or reading the file propagate unchanged.
        """

        # Read the file before entering the parse handler: a failure here has
        # no line number to report and must not be hidden by the handler.
        with open(taxonomy_file, 'r') as f:
            lines = f.readlines()

        d = {}
        row = -1  # always bound, so the handler below can never fail itself
        try:
            for row, line in enumerate(lines):
                line_split = line.split('\t')

                if len(line_split) != 2:
                    raise GTDBTkExit(f'Not a tab-separated line: {line}')

                unique_id = line_split[0]
                if canonical_ids:
                    unique_id = canonical_gid(unique_id)

                tax_str = line_split[1].rstrip()
                # NOTE: an empty taxonomy string raises IndexError here and is
                # reported through the handler below.
                if tax_str[-1] == ';':
                    # remove trailing semicolons which sometimes
                    # appear in Greengenes-style taxonomy files
                    tax_str = tax_str[0:-1]

                d[unique_id] = [x.strip() for x in tax_str.split(';')]
        except BaseException:
            # Same breadth as the original bare ``except``: log, then re-raise.
            self.logger.error('Failed to parse taxonomy file on line %d' %
                              (row + 1))
            raise

        return d
Пример #5
0
    def read(self, taxonomy_file, canonical_ids=False):
        """Read Greengenes-style taxonomy file.

        Expected format is:
            <id>\t<taxonomy string>

        where the taxonomy string has the formats:
            d__; p__; c__; o__; f__; g__; s__

        Parameters
        ----------
        taxonomy_file : str
            Greengenes-style taxonomy file.
        canonical_ids : bool
            If True, each ID is passed through ``canonical_gid`` before being
            used as a key of the returned dictionary.

        Returns
        -------
        dict[str, list[str]]
            d[unique_id] -> [d__<taxon>, ..., s__<taxon>]

        Any exception raised while parsing a line is logged together with the
        1-based line number and then re-raised. Errors raised while opening
        or reading the file propagate unchanged.
        """

        # Read the file before entering the parse handler: a failure here has
        # no line number to report and must not be hidden by the handler.
        with open(taxonomy_file, 'r') as f:
            lines = f.readlines()

        d = {}
        row = -1  # always bound, so the handler below can never fail itself
        try:
            for row, line in enumerate(lines):
                line_split = line.split('\t')
                unique_id = line_split[0]
                if canonical_ids:
                    unique_id = canonical_gid(unique_id)

                # NOTE: a line without a tab, or with an empty taxonomy
                # string, raises IndexError and is reported below.
                tax_str = line_split[1].rstrip()
                if tax_str[-1] == ';':
                    # remove trailing semicolons which sometimes
                    # appear in Greengenes-style taxonomy files
                    tax_str = tax_str[0:-1]

                d[unique_id] = [x.strip() for x in tax_str.split(';')]
        except BaseException:
            # Same breadth as the original bare ``except``: log, then re-raise.
            self.logger.error('Failed to parse taxonomy file on line %d' %
                              (row + 1))
            raise

        return d
Пример #6
0
 def _write(self):
     """Write the closest reference hit of every genome to ``self.path``.

     The output is a tab-separated table: a header line followed by one row
     per genome ID in ``self.genomes`` (in sorted order). For each genome,
     hits with an ``af`` below ``self.min_af`` are discarded and, of the rest,
     the hit with the highest ``ani`` (ties broken by the highest ``af``) is
     reported together with the reference taxonomy.

     Genomes that have no entry in ``self.results``, or no hit passing the
     ``af`` threshold, get ``no result`` in every column after the genome ID,
     so every row has as many columns as the header.
     """
     columns = ('user_genome', 'reference_genome', 'fastani_ani', 'fastani_af',
                'reference_taxonomy')
     # One placeholder per column after the genome ID; derived from the
     # header so both "no result" cases stay aligned with it.
     no_result = '\t'.join(['no result'] * (len(columns) - 1))

     with open(self.path, 'w') as fh:
         fh.write('\t'.join(columns) + '\n')
         for gid in sorted(self.genomes):
             candidates = []
             if gid in self.results:
                 candidates = [(ref_gid, hit)
                               for ref_gid, hit in self.results[gid].items()
                               if hit['af'] >= self.min_af]
             if not candidates:
                 fh.write(f'{gid}\t{no_result}\n')
                 continue

             # Best hit: highest ANI first, then highest AF. min() returns
             # the first of several equally good hits, like a stable sort.
             ref_gid, hit = min(candidates,
                                key=lambda x: (-x[1]['ani'], -x[1]['af']))
             canonical_rid = canonical_gid(ref_gid)
             taxonomy_str = ';'.join(self.taxonomy[canonical_rid])
             fh.write(f'{gid}\t{ref_gid}\t{hit["ani"]}\t{hit["af"]}\t{taxonomy_str}\n')
     self.logger.info(f'Closest representative hits saved to: {self.path}')