Пример #1
0
    def diff(self, options):
        """Compare two taxonomy files and print how they differ.

        Taxon identifiers are processed in sorted order so that the printed
        report is reproducible from run to run.

        Parameters
        ----------
        options : object
            Parsed options. The attributes read here are:
            input_taxonomy1, input_taxonomy2 : paths of the taxonomy files;
            rank : a label from Taxonomy.rank_labels selecting the rank
                to compare;
            report_missing_taxa : if true, report identifiers found in only
                one of the two taxonomies;
            report_missing_ranks : if true, report differing taxa even when
                one side has nothing after its first three characters.

        Raises
        ------
        ValueError
            If options.rank is not in Taxonomy.rank_labels.
        """

        check_file_exists(options.input_taxonomy1)
        check_file_exists(options.input_taxonomy2)

        taxonomy1 = Taxonomy().read(options.input_taxonomy1)
        taxonomy2 = Taxonomy().read(options.input_taxonomy2)

        rank_index = Taxonomy.rank_labels.index(options.rank)

        # set iteration order is arbitrary, so sort to keep the report stable
        for taxon_id in sorted(set(taxonomy1).union(taxonomy2)):
            in_taxonomy1 = taxon_id in taxonomy1
            in_taxonomy2 = taxon_id in taxonomy2

            if not (in_taxonomy1 and in_taxonomy2):
                # every identifier comes from the union, so it is missing
                # from exactly one of the two taxonomies
                if options.report_missing_taxa:
                    if not in_taxonomy1:
                        print('Missing in taxonomy 1: %s' % taxon_id)
                    else:
                        print('Missing in taxonomy 2: %s' % taxon_id)
                continue

            taxon1 = taxonomy1[taxon_id][rank_index]
            taxon2 = taxonomy2[taxon_id][rank_index]
            if taxon1 == taxon2:
                continue

            # taxon[3:] drops the first three characters (presumably the rank
            # prefix such as 'g__'); an empty remainder means no name is set
            both_named = taxon1[3:] and taxon2[3:]
            if options.report_missing_ranks or both_named:
                print('Different taxon for %s: %s %s' %
                      (taxon_id, taxon1, taxon2))

        print('Done.')
Пример #2
0
    def run(self,
            input_taxonomy,
            genome_path_file,
            metadata_file,
            max_genomes,
            min_comp,
            max_cont,
            min_quality,
            max_contigs,
            min_N50,
            max_ambiguous,
            max_gap_length,
            output_dir):
        """Calculate ANI for named species.

        Genomes failing the filtering criteria are removed from the taxonomy,
        then every species left with more than one genome is queued for the
        worker processes; a single writer process consumes their results.

        Parameters
        ----------
        input_taxonomy : str
            Taxonomy file read with Taxonomy().read().
        genome_path_file : str
            Tab-separated file with a genome identifier in the first column
            and the genome directory in the second. Blank lines are skipped.
        metadata_file : str
            Metadata file passed to filter_genomes() and to each worker.
        max_genomes : int
            Passed unchanged to each worker.
        min_comp, max_cont, min_quality, max_contigs, min_N50,
        max_ambiguous, max_gap_length
            Filtering thresholds passed unchanged to filter_genomes().
        output_dir : str
            Output directory passed to the writer process.

        Raises
        ------
        BaseException
            Any error (including KeyboardInterrupt) raised while the child
            processes are running is re-raised after the children that are
            still alive have been terminated.
        """

        # get genomes passing filtering criteria
        filtered_genome_ids = filter_genomes(metadata_file,
                                             min_comp,
                                             max_cont,
                                             min_quality,
                                             max_contigs,
                                             min_N50,
                                             max_ambiguous,
                                             max_gap_length)

        # get genomes in each named species, ignoring filtered genomes
        taxonomy = Taxonomy().read(input_taxonomy)
        genome_ids_to_remove = set(taxonomy.keys()) - filtered_genome_ids
        for genome_id in genome_ids_to_remove:
            del taxonomy[genome_id]

        named_species = Taxonomy().extant_taxa_for_rank('species', taxonomy)

        # get path to nucleotide files
        nt_files = {}
        with open(genome_path_file) as path_file:
            for line in path_file:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline)
                    continue

                line_split = line.strip().split('\t')

                gtdb_id = line_split[0]
                # file names use the identifier without its GB_/RS_ prefix
                genome_id = gtdb_id.replace('GB_', '').replace('RS_', '')

                genome_dir = line_split[1]

                nt_file = os.path.join(genome_dir,
                                       'prodigal',
                                       genome_id + '_protein.fna')
                nt_files[gtdb_id] = nt_file

        # populate worker queue with data to process
        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        num_species = 0
        for species, genome_ids in named_species.items():
            if len(genome_ids) > 1:
                worker_queue.put((species, genome_ids))
                num_species += 1

        # one (None, None) sentinel per worker marks the end of the queue
        for _ in range(self.cpus):
            worker_queue.put((None, None))

        # create the processes before entering the try block so the cleanup
        # code below can always refer to them
        worker_proc = [mp.Process(target=self.__worker, args=(metadata_file,
                                                              nt_files,
                                                              max_genomes,
                                                              worker_queue,
                                                              writer_queue))
                       for _ in range(self.cpus)]
        write_proc = mp.Process(target=self.__writer, args=(num_species,
                                                            output_dir,
                                                            writer_queue))

        try:
            write_proc.start()

            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

            # all workers are done; tell the writer to finish
            writer_queue.put((None, None, None, None, None))
            write_proc.join()
        except BaseException:
            # stop any child that is still running, then let the caller see
            # the failure instead of silently returning partial results
            for p in worker_proc + [write_proc]:
                if p.is_alive():
                    p.terminate()
            raise
Пример #3
0
   def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
       """Tabulate differences between two taxonomies.

       Only genomes present in both taxonomies are compared. Two tables are
       written to the output directory, both named after the first taxonomy
       file: '<name>.tax_diff.tsv' with one row per genome and rank that
       differs, and '<name>.tax_diff_summary.tsv' with per-rank counts and
       percentages. The per-rank counts are also printed to stdout.

       Each differing rank is classified as:
         active     - both taxonomies name the taxon, but differently
         passive    - the second taxonomy has only the bare rank prefix
         unresolved - the first taxonomy has only the bare rank prefix

       Parameters
       ----------
       tax1_file : str
           First taxonomy file.
       tax2_file : str
           Second taxonomy file.
       include_user_taxa : boolean
           Flag indicating if User genomes should be considered.
       output_dir : str
           Output directory.
       """

       tax1 = Taxonomy().read(tax1_file)
       tax2 = Taxonomy().read(tax2_file)

       if not include_user_taxa:
           # User genomes are recognised by their 'U_' identifier prefix
           tax1 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax1.items()
                   if not genome_id.startswith('U_')}
           tax2 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax2.items()
                   if not genome_id.startswith('U_')}

       common_taxa = set(tax1).intersection(tax2)

       self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
       self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
       self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

       # identify differences between taxonomies
       tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
       tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]
       output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)

       unchanged = defaultdict(int)           # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
       active_change = defaultdict(int)       # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
       passive_change = defaultdict(int)      # T2 = g__??? -> T1 = g__Jane
       unresolved_change = defaultdict(int)   # T2 = g__Box -> T1 = g__???

       with open(output_table, 'w') as fout:
           fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

           # sorted so rows appear in the same order on every run
           for taxa in sorted(common_taxa):
               t1 = tax1[taxa]
               t2 = tax2[taxa]

               for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                   if taxon1 == taxon2:
                       unchanged[rank] += 1
                       continue

                   rank_prefix = Taxonomy.rank_prefixes[rank]
                   if taxon1 != rank_prefix and taxon2 != rank_prefix:
                       change, counts = 'active', active_change
                   elif taxon2 == rank_prefix:
                       change, counts = 'passive', passive_change
                   else:
                       # the taxa differ and taxon2 is named, so taxon1 must
                       # be the bare rank prefix
                       change, counts = 'unresolved', unresolved_change

                   counts[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                       change,
                                                       Taxonomy.rank_labels[rank],
                                                       ';'.join(t1),
                                                       ';'.join(t2)))

       # report results
       output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)

       with open(output_table, 'w') as fout:
           # NOTE: the space before 'Active (%)' is part of the existing
           # header and is kept so the file format does not change
           fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
           print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
           for rank in range(len(Taxonomy.rank_prefixes)):
               total = (unchanged[rank]
                        + active_change[rank]
                        + passive_change[rank]
                        + unresolved_change[rank])
               if total == 0:
                   # no common genome reached this rank; nothing to report
                   continue

               fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                          (Taxonomy.rank_labels[rank],
                           unchanged[rank], unchanged[rank] * 100.0 / total,
                           active_change[rank], active_change[rank] * 100.0 / total,
                           passive_change[rank], passive_change[rank] * 100.0 / total,
                           unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
               print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                 unchanged[rank],
                                                 active_change[rank],
                                                 passive_change[rank],
                                                 unresolved_change[rank],
                                                 total))
Пример #4
0
   def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
       """Tabulate differences between two taxonomies.

       Only genomes present in both taxonomies are compared. Two tables are
       written to the output directory, both named after the first taxonomy
       file: '<name>.tax_diff.tsv' with one row per genome and rank that
       differs, and '<name>.tax_diff_summary.tsv' with per-rank counts and
       percentages. The per-rank counts are also printed to stdout.

       Each differing rank is classified as:
         active     - both taxonomies name the taxon, but differently
         passive    - the second taxonomy has only the bare rank prefix
         unresolved - the first taxonomy has only the bare rank prefix

       Parameters
       ----------
       tax1_file : str
           First taxonomy file.
       tax2_file : str
           Second taxonomy file.
       include_user_taxa : boolean
           Flag indicating if User genomes should be considered.
       output_dir : str
           Output directory.
       """

       tax1 = Taxonomy().read(tax1_file)
       tax2 = Taxonomy().read(tax2_file)

       if not include_user_taxa:
           # User genomes are recognised by their 'U_' identifier prefix
           tax1 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax1.items()
                   if not genome_id.startswith('U_')}
           tax2 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax2.items()
                   if not genome_id.startswith('U_')}

       common_taxa = set(tax1).intersection(tax2)

       self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
       self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
       self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

       # identify differences between taxonomies
       tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
       tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]
       output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)

       unchanged = defaultdict(int)           # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
       active_change = defaultdict(int)       # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
       passive_change = defaultdict(int)      # T2 = g__??? -> T1 = g__Jane
       unresolved_change = defaultdict(int)   # T2 = g__Box -> T1 = g__???

       with open(output_table, 'w') as fout:
           fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

           # sorted so rows appear in the same order on every run
           for taxa in sorted(common_taxa):
               t1 = tax1[taxa]
               t2 = tax2[taxa]

               for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                   if taxon1 == taxon2:
                       unchanged[rank] += 1
                       continue

                   rank_prefix = Taxonomy.rank_prefixes[rank]
                   if taxon1 != rank_prefix and taxon2 != rank_prefix:
                       change, counts = 'active', active_change
                   elif taxon2 == rank_prefix:
                       change, counts = 'passive', passive_change
                   else:
                       # the taxa differ and taxon2 is named, so taxon1 must
                       # be the bare rank prefix
                       change, counts = 'unresolved', unresolved_change

                   counts[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                       change,
                                                       Taxonomy.rank_labels[rank],
                                                       ';'.join(t1),
                                                       ';'.join(t2)))

       # report results
       output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)

       with open(output_table, 'w') as fout:
           # NOTE: the space before 'Active (%)' is part of the existing
           # header and is kept so the file format does not change
           fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
           print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
           for rank in range(len(Taxonomy.rank_prefixes)):
               total = (unchanged[rank]
                        + active_change[rank]
                        + passive_change[rank]
                        + unresolved_change[rank])
               if total == 0:
                   # no common genome reached this rank; nothing to report
                   continue

               fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                          (Taxonomy.rank_labels[rank],
                           unchanged[rank], unchanged[rank] * 100.0 / total,
                           active_change[rank], active_change[rank] * 100.0 / total,
                           passive_change[rank], passive_change[rank] * 100.0 / total,
                           unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
               print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                 unchanged[rank],
                                                 active_change[rank],
                                                 passive_change[rank],
                                                 unresolved_change[rank],
                                                 total))