def _appraise_inexactly(self, metagenome_otu_table_collection,
                        found_otu_collection,
                        sequence_identity):
    '''Given a metagenome sample collection and OTUs 'found' either by binning
    or assembly, return a dict of sample name to AppraisalBuildingBlock
    representing the OTUs that have been found, using inexact matching.
    '''
    found_otu_table = OtuTable()
    found_otu_table.add(found_otu_collection)
    found_collection = OtuTableCollection()
    found_collection.otu_table_objects = [found_otu_table]

    sample_to_building_block = {}
    for uc in SequenceSearcher().global_search(
            metagenome_otu_table_collection, found_collection,
            sequence_identity):
        q = uc.query
        if q.sample_name in sample_to_building_block:
            appraisal = sample_to_building_block[q.sample_name]
        else:
            appraisal = AppraisalBuildingBlock()
            sample_to_building_block[q.sample_name] = appraisal
        if uc.target is not None:
            appraisal.num_found += q.count
            appraisal.found_otus.append(q)
    return sample_to_building_block
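# Design note: the get-or-create pattern above can also be expressed with the
# stdlib's collections.defaultdict, using the class itself as the factory.
# A minimal equivalent sketch (not part of the original code):
from collections import defaultdict

sample_to_building_block = defaultdict(AppraisalBuildingBlock)
# appraisal = sample_to_building_block[q.sample_name]  # constructed on first access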
def rarefy(self, otu_table_collection, num_to_sample, random_generator=random):
    '''Return an OtuTable rarefied so that only num_to_sample sequences are
    present in each sample. Samples not containing sufficient sequences are
    ignored with a warning.

    This is not a true rarefaction technique because sequences not chosen in
    the rarefaction can still influence the output table through the LCA or
    arbitrary choice operation that has been carried out on the input table.
    Also, the rarefier operates on counts rather than predicted coverage,
    skewing the results toward OTUs that lack inserts. But not by a lot,
    presumably.

    Parameters
    ----------
    otu_table_collection: OtuTableCollection
        OTU tables iterable
    num_to_sample: int
        number of sequences to sample from each sample/gene combination
    '''
    sample_to_gene_to_otu = {}
    to_return = OtuTable()
    for otu in otu_table_collection:
        sample_name = otu.sample_name
        gene = otu.marker
        if sample_name not in sample_to_gene_to_otu:
            sample_to_gene_to_otu[sample_name] = {}
        if gene not in sample_to_gene_to_otu[sample_name]:
            sample_to_gene_to_otu[sample_name][gene] = {}
        if otu.sequence in sample_to_gene_to_otu[sample_name][gene]:
            raise Exception(
                "Found duplicate sequence in OTU table in sample %s, gene %s" %
                (sample_name, gene))
        sample_to_gene_to_otu[sample_name][gene][otu.sequence] = otu

    for sample_name in sample_to_gene_to_otu.keys():
        for gene in sample_to_gene_to_otu[sample_name].keys():
            sequences_to_sample = []
            for sequence, otu in sample_to_gene_to_otu[sample_name][gene].items():
                for _ in range(otu.count):
                    sequences_to_sample.append(sequence)
            if len(sequences_to_sample) < num_to_sample:
                logging.warn(
                    "Sample %s gene %s only contains %i sequences, so cannot be "
                    "rarefied. Ignoring this sample/gene combination" %
                    (sample_name, gene, len(sequences_to_sample)))
                continue
            else:
                sequences_sampled = random_generator.sample(
                    sequences_to_sample, num_to_sample)
            sequence_counts = {}
            for seq in sequences_sampled:
                try:
                    sequence_counts[seq] += 1
                except KeyError:
                    sequence_counts[seq] = 1
            for seq, count in sequence_counts.items():
                otu = sample_to_gene_to_otu[sample_name][gene][seq]
                e = copy.copy(otu)
                e.count = count
                to_return.add([e])
    return to_return
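# Usage sketch for rarefy: passing a seeded random.Random as random_generator
# gives reproducible subsampling. `collection` is assumed to be a pre-loaded
# OtuTableCollection; names here are illustrative only.
import random

rarefied_table = Rarefier().rarefy(collection, 100,
                                   random_generator=random.Random(42))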
def collapse_coupled(self):
    '''Return an OTU table that is collapsed in 2 ways: duplicate sequences are
    collapsed together, and sample names are modified, removing r'.1$' and
    r'.2$'.
    '''
    sample_to_sequence_to_otus = OrderedDict()
    reg = re.compile(r'.[12]$')
    for otu in self:
        new_sample = reg.sub('', otu.sample_name)
        otu.sample_name = new_sample
        if new_sample not in sample_to_sequence_to_otus:
            sample_to_sequence_to_otus[new_sample] = OrderedDict()
        if otu.sequence not in sample_to_sequence_to_otus[new_sample]:
            sample_to_sequence_to_otus[new_sample][otu.sequence] = []
        sample_to_sequence_to_otus[new_sample][otu.sequence].append(otu)

    otu_table = OtuTable()
    for sample, seq_otus in sample_to_sequence_to_otus.items():
        for seq, otus in seq_otus.items():
            if len(otus) == 1:
                otu_table.add(otus)
            else:
                o = OtuTableEntry()
                o.marker = otus[0].marker
                o.sample_name = sample
                o.sequence = seq
                o.count = sum([otu.count for otu in otus])
                o.coverage = sum([otu.coverage for otu in otus])
                o.taxonomy = otus[0].taxonomy  # TODO: Make this more of a 'median' taxonomy.
                otu_table.add([o])
    return otu_table
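# Worked example of the sample-name collapsing regex above. Note that '.' in
# r'.[12]$' matches any character, not just a literal dot, so an underscore
# separator collapses too:
import re

reg = re.compile(r'.[12]$')
print(reg.sub('', 'sampleA.1'))  # => sampleA
print(reg.sub('', 'sampleA_2'))  # => sampleA
print(reg.sub('', 'sampleA'))    # => sampleA (no read suffix, left unchanged)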
def write_rarefied_otu_table(**kwargs):
    output_table_io = kwargs.pop('output_table_io')
    table_collection = kwargs.pop('table_collection')
    number_to_choose = kwargs.pop('number_to_choose', None)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    if number_to_choose is None:
        counts = {}
        for otu in table_collection:
            key = "%s_singlem_RAND8_%s" % (otu.sample_name, otu.marker)
            try:
                counts[key] += otu.count
            except KeyError:
                counts[key] = otu.count
        number_to_choose = min(counts.values())
        logging.info(
            "Minimum number of sequences detected is %i, rarefying all "
            "sample/gene combinations to this level" % number_to_choose)
    logging.info(
        "Rarefying OTU table to max %i sequences per sample/gene combination "
        "and writing to %s" % (number_to_choose, output_table_io.name))
    OtuTable.write_otus_to(
        Rarefier().rarefy(table_collection, number_to_choose),
        output_table_io)
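# Usage sketch: when number_to_choose is omitted, the minimum per sample/gene
# count across the collection is detected and used. `collection` is assumed
# to be a pre-loaded OtuTableCollection:
with open('rarefied_otu_table.tsv', 'w') as out:
    write_rarefied_otu_table(output_table_io=out, table_collection=collection)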
def __iter__(self):
    '''Iterate over all the OTUs from all the tables. This can only be done
    once since the data is streamed in.
    '''
    for io in self._archive_table_io_objects:
        for otu in ArchiveOtuTable.read(io):
            yield otu
    for io in self._otu_table_io_objects:
        for otu in OtuTable.each(io):
            yield otu
    for file_path in self._archive_table_file_paths:
        for otu in ArchiveOtuTable.read(open(file_path)):
            yield otu
    for file_path in self._otu_table_file_paths:
        for otu in OtuTable.each(open(file_path)):
            yield otu
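# Because iteration streams straight from the underlying IO objects, a
# collection backed by open streams is exhausted after one pass. Cache the
# entries first if several passes are needed (a sketch; `collection` is a
# pre-loaded OtuTableCollection):
otus = list(collection)  # drains any stream-backed tables
total_count = sum(otu.count for otu in otus)
num_unique_sequences = len(set(otu.sequence for otu in otus))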
def write_clustered_otu_table(**kwargs):
    output_table_io = kwargs.pop('output_table_io')
    table_collection = kwargs.pop('table_collection')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Writing clustered OTU table")
    output_table_io.write(
        "\t".join(
            OtuTable.DEFAULT_OUTPUT_FIELDS +
            ['representative', 'total_num_reads', 'total_coverage',
             'num_sub_otus', 'max_sub_otu_abundance']) + "\n")
    for d in table_collection:
        for otu in d.otus:
            output_table_io.write("\t".join(
                [OtuTable._to_printable(cell) for cell in [
                    otu.marker,
                    otu.sample_name,
                    otu.sequence,
                    otu.count,
                    otu.coverage,
                    otu.taxonomy,
                    d.sequence,
                    d.count,
                    d.coverage,
                    len(d.otus),
                    max([sub_otu.count for sub_otu in d.otus])
                ]]) + "\n")
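# Sketch of the input shape this writer expects: table_collection yields
# cluster objects, each carrying the representative sequence/count/coverage
# plus the member OTUs. The Cluster namedtuple below is hypothetical, for
# illustration only; the real clustering result class may differ.
import collections
import sys

Cluster = collections.namedtuple('Cluster', ['sequence', 'count', 'coverage', 'otus'])

member = OtuTableEntry()
member.marker = 'example_gene'
member.sample_name = 'sample1'
member.sequence = 'ATGAAA'
member.count = 5
member.coverage = 10.2
member.taxonomy = 'Root; d__Bacteria'
representative = Cluster(sequence='ATGAAA', count=5, coverage=10.2, otus=[member])
write_clustered_otu_table(output_table_io=sys.stdout,
                          table_collection=[representative])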
def write_otu_table(**kwargs):
    output_table_io = kwargs.pop('output_table_io')
    table_collection = kwargs.pop('table_collection')
    output_extras = kwargs.pop('output_extras')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    if hasattr(output_table_io, 'name'):
        logging.info("Writing %s" % output_table_io.name)
    else:
        logging.info("Writing an OTU table")
    if output_extras:
        OtuTable.write_otus_to(
            table_collection, output_table_io,
            fields_to_print=table_collection.example_field_names())
    else:
        OtuTable.write_otus_to(table_collection, output_table_io)
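# Usage sketch: output_extras=True prints every field present in the input
# tables (via example_field_names()) rather than just the default columns.
# `collection` is assumed to be a pre-loaded OtuTableCollection:
import sys

write_otu_table(output_table_io=sys.stdout, table_collection=collection,
                output_extras=False)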
def print_samples(self, **kwargs):
    db = SequenceDatabase.acquire(kwargs.pop('db'))
    sample_names = kwargs.pop('sample_names')
    taxonomy = kwargs.pop('taxonomy')
    output_io = kwargs.pop('output_io')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    dbm = self._connect_to_sqlite(db)
    max_set_size = 999  # Cannot query sqlite with > 999 '?' entries, so
                        # query in batches.
    if sample_names:
        query_chunks = set(sample_names)
    else:
        query_chunks = [taxonomy]
    otus = OtuTable()
    total_printed = 0
    for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
        if sample_names:
            it = dbm.table('otus').where_in(
                'sample_name',
                [sample for sample in chunk if sample is not None]).get()
        elif taxonomy:
            it = dbm.table('otus').where(
                'taxonomy', 'like', "%%%s%%" % taxonomy).get()
        else:
            raise Exception("Programming error")
        for entry in it:
            otu = OtuTableEntry()
            otu.marker = entry.marker
            otu.sample_name = entry.sample_name
            otu.sequence = entry.sequence
            otu.count = entry.num_hits
            otu.coverage = entry.coverage
            otu.taxonomy = entry.taxonomy
            otus.add([otu])
            total_printed += 1
    otus.write_to(output_io)
    logging.info("Printed %i OTU table entries" % total_printed)
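# Why max_set_size is 999: sqlite historically caps a single statement at 999
# '?' placeholders (SQLITE_MAX_VARIABLE_NUMBER), hence the batched where_in
# queries above. A self-contained sketch of the same batching using only the
# stdlib (names here are illustrative):
import sqlite3

def chunks(items, n):
    for i in range(0, len(items), n):
        yield items[i:i + n]

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE otus (sample_name TEXT, sequence TEXT)')
conn.executemany('INSERT INTO otus VALUES (?, ?)',
                 [('sample%i' % i, 'ATG') for i in range(2000)])
wanted = ['sample%i' % i for i in range(1500)]
hits = []
for chunk in chunks(wanted, 999):
    placeholders = ','.join('?' * len(chunk))
    hits.extend(conn.execute(
        'SELECT sample_name, sequence FROM otus WHERE sample_name IN (%s)'
        % placeholders, chunk))
print(len(hits))  # => 1500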
def add_otu_table(self, input_otu_table_io):
    '''Add a regular style OTU table to the collection.

    Parameters
    ----------
    input_otu_table_io: IO
        an open stream of OTU table data

    Returns
    -------
    None
    '''
    self.otu_table_objects.append(OtuTable.read(input_otu_table_io))
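# Usage sketch: any open stream works, so an in-memory table can be parsed
# with io.StringIO. The header below follows the regular output fields used
# elsewhere in this codebase (gene, sample, sequence, num_hits, coverage,
# taxonomy); the exact header expected by OtuTable.read is an assumption:
from io import StringIO

table_io = StringIO(
    "gene\tsample\tsequence\tnum_hits\tcoverage\ttaxonomy\n"
    "example_gene\tsample1\tATGAAA\t5\t10.2\tRoot; d__Bacteria\n")
collection = OtuTableCollection()
collection.add_otu_table(table_io)
for otu in collection:
    print(otu.sample_name, otu.marker, otu.count)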
def run_to_otu_table(self, **kwargs):
    '''Run the pipe, returning an OtuTable object (or None if no reads were
    identified in any sample).'''
    forward_read_files = kwargs.pop('sequences')
    num_threads = kwargs.pop('threads')
    known_otu_tables = kwargs.pop('known_otu_tables')
    singlem_assignment_method = kwargs.pop('assignment_method')
    output_jplace = kwargs.pop('output_jplace')
    evalue = kwargs.pop('evalue')
    min_orf_length = kwargs.pop('min_orf_length')
    restrict_read_length = kwargs.pop('restrict_read_length')
    filter_minimum_protein = kwargs.pop('filter_minimum_protein')
    filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
    include_inserts = kwargs.pop('include_inserts')
    singlem_packages = kwargs.pop('singlem_packages')
    assign_taxonomy = kwargs.pop('assign_taxonomy')
    known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')
    working_directory = kwargs.pop('working_directory')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    self._num_threads = num_threads
    self._evalue = evalue
    self._min_orf_length = min_orf_length
    self._restrict_read_length = restrict_read_length
    self._filter_minimum_protein = filter_minimum_protein
    self._filter_minimum_nucleotide = filter_minimum_nucleotide

    hmms = HmmDatabase(singlem_packages)
    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
        graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
    else:
        graftm_assignment_method = singlem_assignment_method

    if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
        self._graftm_verbosity = '5'
    else:
        self._graftm_verbosity = '2'

    if not assign_taxonomy:
        singlem_assignment_method = NO_ASSIGNMENT_METHOD

    using_temporary_working_directory = working_directory is None
    if using_temporary_working_directory:
        shared_mem_directory = '/dev/shm'
        if os.path.exists(shared_mem_directory):
            logging.debug("Using shared memory as a base directory")
            tmp = tempdir.TempDir(basedir=shared_mem_directory)
            tempfiles_path = os.path.join(tmp.name, 'tempfiles')
            os.mkdir(tempfiles_path)
            os.environ['TEMP'] = tempfiles_path
        else:
            logging.debug("Shared memory directory not detected, "
                          "using default temporary directory instead")
            tmp = tempdir.TempDir()
        working_directory = tmp.name
    else:
        if os.path.exists(working_directory):
            if force:
                logging.info("Overwriting directory %s" % working_directory)
                shutil.rmtree(working_directory)
                os.mkdir(working_directory)
            else:
                raise Exception(
                    "Working directory '%s' already exists, not continuing" %
                    working_directory)
        else:
            os.mkdir(working_directory)
    logging.debug("Using working directory %s" % working_directory)
    self._working_directory = working_directory
    extracted_reads = None

    def return_cleanly():
        if using_temporary_working_directory:
            tmp.dissolve()
        logging.info("Finished")

    #### Search
    self._singlem_package_database = hmms
    search_result = self._search(hmms, forward_read_files)
    sample_names = search_result.samples_with_hits()
    if len(sample_names) == 0:
        logging.info("No reads identified in any samples, stopping")
        return_cleanly()
        return None
    logging.debug("Recovered %i samples with at least one hit e.g. '%s'" %
                  (len(sample_names), sample_names[0]))

    #### Alignment
    align_result = self._align(search_result)

    ### Extract reads that have already known taxonomy
    if known_otu_tables:
        logging.info("Parsing known taxonomy OTU tables")
        known_taxes = KnownOtuTable()
        known_taxes.parse_otu_tables(known_otu_tables)
        logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
    else:
        known_taxes = []

    if known_sequence_taxonomy:
        logging.debug("Parsing sequence-wise taxonomy..")
        tax1 = GreenGenesTaxonomy.read(open(known_sequence_taxonomy)).taxonomy
        known_sequence_tax = {}
        for seq_id, tax in tax1.items():
            known_sequence_tax[seq_id] = '; '.join(tax)
        logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" %
                     len(known_sequence_tax))

    ### Extract other reads which do not have known taxonomy
    extracted_reads = self._extract_relevant_reads(
        align_result, include_inserts, known_taxes)
    logging.info("Finished extracting aligned sequences")

    #### Taxonomic assignment
    if assign_taxonomy:
        logging.info("Running taxonomic assignment with GraftM..")
        assignment_result = self._assign_taxonomy(
            extracted_reads, graftm_assignment_method)

    #### Process taxonomically assigned reads
    # get the sequences out for each of them
    otu_table_object = OtuTable()
    if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
        package_to_taxonomy_bihash = {}

    for readset in extracted_reads:
        sample_name = readset.sample_name
        singlem_package = readset.singlem_package
        known_sequences = readset.known_sequences

        def add_info(infos, otu_table_object, known_tax):
            for info in infos:
                to_print = [
                    singlem_package.graftm_package_basename(),
                    sample_name,
                    info.seq,
                    info.count,
                    info.coverage,
                    info.taxonomy,
                    info.names,
                    info.aligned_lengths,
                    known_tax]
                otu_table_object.data.append(to_print)

        known_infos = self._seqs_to_counts_and_taxonomy(
            known_sequences, NO_ASSIGNMENT_METHOD,
            known_taxes, known_sequence_taxonomy, None)
        add_info(known_infos, otu_table_object, True)

        if len(readset.unknown_sequences) > 0:
            # if any sequences were aligned (not just already known)
            tmpbase = readset.tmpfile_basename

            if assign_taxonomy:
                is_known_taxonomy = False
                aligned_seqs = list(itertools.chain(
                    readset.unknown_sequences, readset.known_sequences))

                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.diamond_assignment_file(
                        sample_name, singlem_package, tmpbase)
                    taxonomies = DiamondResultParser(tax_file)
                elif singlem_assignment_method == DIAMOND_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.read_tax_file(
                        sample_name, singlem_package, tmpbase)
                    if not os.path.isfile(tax_file):
                        logging.warn(
                            "Unable to find tax file for gene %s from sample %s "
                            "(likely due to min length filtering), skipping" %
                            (os.path.basename(singlem_package.base_directory()),
                             sample_name))
                        taxonomies = {}
                    else:
                        taxonomies = TaxonomyFile(tax_file)
                elif singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
                    bihash_key = singlem_package.base_directory()
                    if bihash_key in package_to_taxonomy_bihash:
                        taxonomy_bihash = package_to_taxonomy_bihash[bihash_key]
                    else:
                        taxtastic_taxonomy = singlem_package.graftm_package(
                        ).taxtastic_taxonomy_path()
                        logging.debug("Reading taxtastic taxonomy from %s" %
                                      taxtastic_taxonomy)
                        with open(taxtastic_taxonomy) as f:
                            taxonomy_bihash = TaxonomyBihash.parse_taxtastic_taxonomy(f)
                        package_to_taxonomy_bihash[bihash_key] = taxonomy_bihash
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    jplace_file = os.path.join(base_dir, "placements.jplace")
                    logging.debug("Attempting to read jplace output from %s" %
                                  jplace_file)
                    if os.path.exists(jplace_file):
                        with open(jplace_file) as f:
                            jplace_json = json.loads(f.read())
                        placement_parser = PlacementParser(
                            jplace_json, taxonomy_bihash, 0.5)
                    else:
                        # Sometimes alignments are filtered out.
                        placement_parser = None
                    taxonomies = {}
                elif singlem_assignment_method == NO_ASSIGNMENT_METHOD:
                    taxonomies = {}
                else:
                    raise Exception("Programming error")

            else:  # Taxonomy has not been assigned.
                aligned_seqs = readset.unknown_sequences
                if known_sequence_taxonomy:
                    taxonomies = known_sequence_tax
                else:
                    taxonomies = {}
                is_known_taxonomy = True

            new_infos = list(self._seqs_to_counts_and_taxonomy(
                aligned_seqs, singlem_assignment_method,
                known_sequence_tax if known_sequence_taxonomy else {},
                taxonomies,
                placement_parser
                if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD
                else None))
            add_info(new_infos, otu_table_object, is_known_taxonomy)

            if output_jplace:
                base_dir = assignment_result._base_dir(
                    sample_name, singlem_package, tmpbase)
                input_jplace_file = os.path.join(base_dir, "placements.jplace")
                output_jplace_file = "%s_%s_%s.jplace" % (
                    output_jplace, sample_name,
                    singlem_package.graftm_package_basename())
                logging.info("Writing jplace file '%s'" % output_jplace_file)
                logging.debug("Converting jplace file %s to singlem jplace file %s" %
                              (input_jplace_file, output_jplace_file))
                with open(output_jplace_file, 'w') as output_jplace_io:
                    self._write_jplace_from_infos(
                        open(input_jplace_file), new_infos, output_jplace_io)

    return_cleanly()
    return otu_table_object
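# Usage sketch for run_to_otu_table: every keyword below is required by the
# kwargs.pop calls above; values are illustrative, and SearchPipe is assumed
# to be the class these methods belong to:
otu_table = SearchPipe().run_to_otu_table(
    sequences=['sample1.fastq.gz'],
    threads=4,
    known_otu_tables=None,
    assignment_method=DIAMOND_ASSIGNMENT_METHOD,
    output_jplace=None,
    evalue='1e-5',
    min_orf_length=96,
    restrict_read_length=None,
    filter_minimum_protein=28,
    filter_minimum_nucleotide=95,
    include_inserts=False,
    singlem_packages=['example.spkg'],
    assign_taxonomy=True,
    known_sequence_taxonomy=None,
    working_directory=None,
    force=False)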
def print_appraisal(self, appraisal,
                    doing_binning,
                    output_io=sys.stdout,
                    doing_assembly=False,
                    binned_otu_table_io=None,
                    unbinned_otu_table_io=None,
                    assembled_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Print an overview of the Appraisal object to output_io.'''
    headers = ['sample']
    if doing_binning:
        headers.append('num_binned')
    if doing_assembly:
        headers.append('num_assembled')
    headers.append('num_not_found')
    if doing_binning:
        headers.append('percent_binned')
    if doing_assembly:
        headers.append('percent_assembled')
    output_io.write("\t".join(headers) + "\n")

    binned = []
    assembled = []
    assembled_not_binned = []
    not_founds = []

    def print_sample(num_binned, num_assembled, num_assembled_not_binned,
                     num_not_found, sample,
                     mypercent_binned=None, mypercent_assembled=None):
        if mypercent_binned is not None or mypercent_assembled is not None:
            if doing_binning:
                percent_binned = mypercent_binned
            if doing_assembly:
                percent_assembled = mypercent_assembled
        else:
            total = num_not_found
            if doing_binning:
                total += num_binned
            if doing_assembly:
                total += num_assembled_not_binned
            if total == 0:
                if doing_binning:
                    percent_binned = 0.0
                if doing_assembly:
                    percent_assembled = 0.0
            else:
                if doing_binning:
                    percent_binned = float(num_binned) / total * 100
                if doing_assembly:
                    percent_assembled = float(num_assembled) / total * 100
        to_write = [sample]
        if doing_binning:
            to_write.append(str(num_binned))
        if doing_assembly:
            to_write.append(str(num_assembled))
        to_write.append(str(num_not_found))
        if doing_binning:
            to_write.append("%2.1f" % percent_binned)
        if doing_assembly:
            to_write.append("%2.1f" % percent_assembled)
        output_io.write("\t".join(to_write) + "\n")

    def mean(l):
        return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

    if binned_otu_table_io:
        binned_table = OtuTable()
    if unbinned_otu_table_io:
        unbinned_table = OtuTable()
    if assembled_otu_table_io:
        assembled_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    for appraisal_result in appraisal.appraisal_results:
        if doing_assembly:
            num_assembled_not_binned = appraisal_result.num_assembled_not_binned()
        print_sample(
            appraisal_result.num_binned if doing_binning else None,
            appraisal_result.num_assembled if doing_assembly else None,
            num_assembled_not_binned if doing_assembly else None,
            appraisal_result.num_not_found,
            appraisal_result.metagenome_sample_name)
        if doing_binning:
            binned.append(appraisal_result.num_binned)
        if doing_assembly:
            assembled.append(appraisal_result.num_assembled)
            assembled_not_binned.append(num_assembled_not_binned)
        not_founds.append(appraisal_result.num_not_found)
        if binned_otu_table_io:
            binned_table.add(appraisal_result.binned_otus)
        if unbinned_otu_table_io:
            unbinned_table.add(appraisal_result.assembled_not_binned_otus())
        if assembled_otu_table_io:
            assembled_table.add(appraisal_result.assembled_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(
        sum(binned) if doing_binning else None,
        sum(assembled) if doing_assembly else None,
        sum(assembled_not_binned) if doing_assembly else None,
        sum(not_founds), 'total')

    binned_means = []
    assembled_means = []
    if doing_binning:
        to_enumerate = binned
    else:
        to_enumerate = assembled
    for i, _ in enumerate(to_enumerate):
        num_binned = binned[i] if doing_binning else 0
        num_assembled = assembled[i] if doing_assembly else 0
        num_assembled_not_binned = assembled_not_binned[i] if doing_assembly else 0
        num_not_found = not_founds[i]
        total = num_assembled_not_binned + num_not_found
        if doing_binning:
            total += num_binned
            binned_means.append(float(num_binned) / total)
        if doing_assembly:
            assembled_means.append(float(num_assembled) / total)
    print_sample("%2.1f" % mean(binned) if doing_binning else None,
                 "%2.1f" % mean(assembled) if doing_assembly else None,
                 None,
                 "%2.1f" % mean(not_founds), 'average',
                 mypercent_binned=mean(binned_means) * 100 if doing_binning else None,
                 mypercent_assembled=(mean(assembled_means) * 100
                                      if doing_assembly else None))

    if binned_otu_table_io:
        binned_table.write_to(binned_otu_table_io)
    if unbinned_otu_table_io:
        unbinned_table.write_to(unbinned_otu_table_io)
    if assembled_otu_table_io:
        assembled_table.write_to(assembled_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
def print_appraisal(self, appraisal,
                    output_io=sys.stdout,
                    accounted_for_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Print an overview of the Appraisal object to output_io.'''
    output_io.write("\t".join(
        ['sample', 'num_found', 'num_not_found', 'percent_found']) + "\n")
    founds = []
    not_founds = []

    def print_sample(num_found, num_not_found, sample, mypercent=None):
        if mypercent:
            percent = mypercent
        elif num_found + num_not_found == 0:
            percent = 0.0
        else:
            percent = float(num_found) / (num_found + num_not_found) * 100
        output_io.write("\t".join(
            [sample, str(num_found), str(num_not_found),
             "%2.1f" % percent]) + "\n")

    def mean(l):
        return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

    if accounted_for_otu_table_io:
        accounted_for_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    for appraisal_result in appraisal.appraisal_results:
        print_sample(appraisal_result.num_found,
                     appraisal_result.num_not_found,
                     appraisal_result.metagenome_sample_name)
        founds.append(appraisal_result.num_found)
        not_founds.append(appraisal_result.num_not_found)
        if accounted_for_otu_table_io:
            accounted_for_table.add(appraisal_result.found_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(sum(founds), sum(not_founds), 'total')

    means = []
    for i, num_found in enumerate(founds):
        num_not_found = not_founds[i]
        means.append(float(num_found) / (num_found + num_not_found))
    print_sample("%2.1f" % mean(founds), "%2.1f" % mean(not_founds),
                 'average', mypercent=mean(means) * 100)

    if accounted_for_otu_table_io:
        accounted_for_table.write_to(accounted_for_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
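# Worked example of the percent_found column: a sample with num_found=75 and
# num_not_found=25 prints 75.0:
print("%2.1f" % (float(75) / (75 + 25) * 100))  # => 75.0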
def appraise(self, **kwargs):
    '''Given a collection of OTU tables derived from samples, and OTU table(s)
    corresponding to a collection of recovered genomes, how much of the
    community has been recovered in those genomes?

    Parameters
    ----------
    kwargs:
        sequence_identity: float for 'near enough', None when an exact match
            is required.

    Returns
    -------
    An Appraisal object containing appraisals for each metagenome
    '''
    genome_otu_table_collection = kwargs.pop('genome_otu_table_collection')
    metagenome_otu_table_collection = kwargs.pop('metagenome_otu_table_collection')
    sequence_identity = kwargs.pop('sequence_identity', None)
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Read in %i markers from the different genomes" %
                 len(genome_otu_table_collection))
    filtered_genome_otus = \
        list(genome_otu_table_collection.excluded_duplicate_distinct_genes())
    logging.info("After excluding duplicate markers that may indicate "
                 "contamination, found %i markers" % len(filtered_genome_otus))

    if sequence_identity is None:
        genome_otu_sequences = set()
        genome_names = set()
        for otu in filtered_genome_otus:
            genome_otu_sequences.add(otu.sequence)
            genome_names.add(otu.sample_name)
        logging.info("Read in %i unique sequences from the %i reference genomes" %
                     (len(genome_otu_sequences), len(genome_names)))

        # read in metagenome OTU sequences
        sample_name_to_appraisal = {}
        for otu in metagenome_otu_table_collection:
            try:
                appraisal = sample_name_to_appraisal[otu.sample_name]
            except KeyError:
                appraisal = AppraisalResult()
                appraisal.metagenome_sample_name = otu.sample_name
                sample_name_to_appraisal[otu.sample_name] = appraisal

            count = otu.count
            if otu.sequence in genome_otu_sequences:
                appraisal.num_found += count
                appraisal.found_otus.append(otu)
            else:
                appraisal.num_not_found += count
                appraisal.not_found_otus.append(otu)

        app = Appraisal()
        app.appraisal_results = sample_name_to_appraisal.values()
        return app

    else:
        sample_name_to_appraisal = {}
        seen_otus = set()
        genome_otu_table = OtuTable()
        genome_otu_table.add(filtered_genome_otus)
        filtered_collection = OtuTableCollection()
        filtered_collection.otu_table_objects = [genome_otu_table]
        for uc in SequenceSearcher().global_search(
                metagenome_otu_table_collection, filtered_collection,
                sequence_identity):
            q = uc.query
            key = str([q.sample_name, q.sequence])
            if key in seen_otus:
                logging.warn("Double-saw an OTU..")
                continue
            else:
                seen_otus.add(key)
            if q.sample_name not in sample_name_to_appraisal:
                res = AppraisalResult()
                res.metagenome_sample_name = q.sample_name
                sample_name_to_appraisal[q.sample_name] = res

            appraisal = sample_name_to_appraisal[q.sample_name]
            if uc.target is None:
                appraisal.num_not_found += q.count
                appraisal.not_found_otus.append(q)
            else:
                appraisal.num_found += q.count
                appraisal.found_otus.append(q)
        app = Appraisal()
        app.appraisal_results = sample_name_to_appraisal.values()
        return app
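# Usage sketch (Appraiser is assumed to be the class holding this method;
# omitting sequence_identity requests exact sequence matching per the
# docstring; the two collections are pre-loaded OtuTableCollections):
appraisal = Appraiser().appraise(
    genome_otu_table_collection=genome_collection,
    metagenome_otu_table_collection=metagenome_collection)
for result in appraisal.appraisal_results:
    print(result.metagenome_sample_name, result.num_found, result.num_not_found)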
def run(self, **kwargs):
    forward_read_files = kwargs.pop('sequences')
    output_otu_table = kwargs.pop('otu_table', None)
    archive_otu_table = kwargs.pop('archive_otu_table', None)
    num_threads = kwargs.pop('threads')
    known_otu_tables = kwargs.pop('known_otu_tables')
    singlem_assignment_method = kwargs.pop('assignment_method')
    output_jplace = kwargs.pop('output_jplace')
    output_extras = kwargs.pop('output_extras')
    evalue = kwargs.pop('evalue')
    min_orf_length = kwargs.pop('min_orf_length')
    restrict_read_length = kwargs.pop('restrict_read_length')
    filter_minimum_protein = kwargs.pop('filter_minimum_protein')
    filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
    include_inserts = kwargs.pop('include_inserts')
    singlem_packages = kwargs.pop('singlem_packages')
    window_size = kwargs.pop('window_size')
    assign_taxonomy = kwargs.pop('assign_taxonomy')
    known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')
    working_directory = kwargs.pop('working_directory')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    self._num_threads = num_threads
    self._evalue = evalue
    self._min_orf_length = min_orf_length
    self._restrict_read_length = restrict_read_length
    self._filter_minimum_protein = filter_minimum_protein
    self._filter_minimum_nucleotide = filter_minimum_nucleotide

    hmms = HmmDatabase(singlem_packages)
    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
        graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
    else:
        graftm_assignment_method = singlem_assignment_method

    if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
        self._graftm_verbosity = '5'
    else:
        self._graftm_verbosity = '2'

    using_temporary_working_directory = working_directory is None
    if using_temporary_working_directory:
        shared_mem_directory = '/dev/shm'
        if os.path.exists(shared_mem_directory):
            logging.debug("Using shared memory as a base directory")
            tmp = tempdir.TempDir(basedir=shared_mem_directory)
            tempfiles_path = os.path.join(tmp.name, 'tempfiles')
            os.mkdir(tempfiles_path)
            os.environ['TEMP'] = tempfiles_path
        else:
            logging.debug("Shared memory directory not detected, "
                          "using default temporary directory instead")
            tmp = tempdir.TempDir()
        working_directory = tmp.name
    else:
        if os.path.exists(working_directory):
            if force:
                logging.info("Overwriting directory %s" % working_directory)
                shutil.rmtree(working_directory)
                os.mkdir(working_directory)
            else:
                raise Exception(
                    "Working directory '%s' already exists, not continuing" %
                    working_directory)
        else:
            os.mkdir(working_directory)
    logging.debug("Using working directory %s" % working_directory)
    self._working_directory = working_directory
    extracted_reads = None

    def return_cleanly():
        if extracted_reads:
            extracted_reads.cleanup()
        if using_temporary_working_directory:
            tmp.dissolve()
        logging.info("Finished")

    #### Search
    self._singlem_package_database = hmms
    search_result = self._search(hmms, forward_read_files)
    sample_names = search_result.samples_with_hits()
    if len(sample_names) == 0:
        logging.info("No reads identified in any samples, stopping")
        return_cleanly()
        return
    logging.debug("Recovered %i samples with at least one hit e.g. '%s'" %
                  (len(sample_names), sample_names[0]))

    #### Alignment
    align_result = self._align(search_result)

    ### Extract reads that have already known taxonomy
    if known_otu_tables:
        logging.info("Parsing known taxonomy OTU tables")
        known_taxes = KnownOtuTable()
        known_taxes.parse_otu_tables(known_otu_tables)
        logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
    else:
        known_taxes = []

    if known_sequence_taxonomy:
        logging.debug("Parsing sequence-wise taxonomy..")
        tax1 = GreenGenesTaxonomy.read(open(known_sequence_taxonomy)).taxonomy
        known_sequence_tax = {}
        for seq_id, tax in tax1.items():
            known_sequence_tax[seq_id] = '; '.join(tax)
        logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" %
                     len(known_sequence_tax))

    ### Extract other reads which do not have known taxonomy
    extracted_reads = self._extract_relevant_reads(
        align_result, include_inserts, known_taxes)
    logging.info("Finished extracting aligned sequences")

    #### Taxonomic assignment
    if assign_taxonomy:
        logging.info("Running taxonomic assignment with graftm..")
        assignment_result = self._assign_taxonomy(
            extracted_reads, graftm_assignment_method)

    #### Process taxonomically assigned reads
    # get the sequences out for each of them
    otu_table_object = OtuTable()
    regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
    otu_table_object.fields = regular_output_fields + \
        split('read_names nucleotides_aligned taxonomy_by_known?')

    for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences \
            in extracted_reads:

        def add_info(infos, otu_table_object, known_tax):
            for info in infos:
                to_print = [
                    singlem_package.graftm_package_basename(),
                    sample_name,
                    info.seq,
                    info.count,
                    info.coverage,
                    info.taxonomy,
                    info.names,
                    info.aligned_lengths,
                    known_tax]
                otu_table_object.data.append(to_print)

        known_infos = self._seqs_to_counts_and_taxonomy(
            known_sequences, known_taxes, False, True)
        add_info(known_infos, otu_table_object, True)

        if tmp_graft:  # if any sequences were aligned (not just already known)
            tmpbase = os.path.basename(tmp_graft.name[:-6])  # remove .fasta
            if assign_taxonomy:
                is_known_taxonomy = False
                aligned_seqs = self._get_windowed_sequences(
                    assignment_result.prealigned_sequence_file(
                        sample_name, singlem_package, tmpbase),
                    assignment_result.nucleotide_hits_file(
                        sample_name, singlem_package, tmpbase),
                    singlem_package, include_inserts)

                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.diamond_assignment_file(
                        sample_name, singlem_package, tmpbase)
                else:
                    tax_file = assignment_result.read_tax_file(
                        sample_name, singlem_package, tmpbase)
                logging.debug("Reading taxonomy from %s" % tax_file)

                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    taxonomies = DiamondResultParser(tax_file)
                    use_first = True
                else:
                    if not os.path.isfile(tax_file):
                        logging.warn(
                            "Unable to find tax file for gene %s from sample %s "
                            "(likely due to min length filtering), skipping" %
                            (os.path.basename(singlem_package.base_directory()),
                             sample_name))
                        taxonomies = {}
                    else:
                        taxonomies = TaxonomyFile(tax_file)
                    use_first = False
            else:  # Taxonomy has not been assigned.
                aligned_seqs = unknown_sequences
                if known_sequence_taxonomy:
                    taxonomies = known_sequence_tax
                else:
                    taxonomies = {}
                use_first = False  # irrelevant
                is_known_taxonomy = True

            new_infos = list(self._seqs_to_counts_and_taxonomy(
                aligned_seqs, taxonomies, use_first, False))
            add_info(new_infos, otu_table_object, is_known_taxonomy)

            if output_jplace:
                base_dir = assignment_result._base_dir(
                    sample_name, singlem_package, tmpbase)
                input_jplace_file = os.path.join(base_dir, "placements.jplace")
                output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                    output_jplace, sample_name,
                    singlem_package.graftm_package_basename()))
                logging.debug("Converting jplace file %s to singlem jplace file %s" %
                              (input_jplace_file, output_jplace_file))
                with open(output_jplace_file, 'w') as output_jplace_io:
                    self._write_jplace_from_infos(
                        open(input_jplace_file), new_infos, output_jplace_io)

    if output_otu_table:
        with open(output_otu_table, 'w') as f:
            if output_extras:
                otu_table_object.write_to(f, otu_table_object.fields)
            else:
                otu_table_object.write_to(f, regular_output_fields)
    if archive_otu_table:
        with open(archive_otu_table, 'w') as f:
            otu_table_object.archive(hmms.singlem_packages).write_to(f)

    return_cleanly()
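# Usage sketch for run: unlike run_to_otu_table above, results are written to
# files rather than returned. Keywords mirror the pops at the top of the
# method; SearchPipe is an assumed class name and all values are illustrative:
SearchPipe().run(
    sequences=['sample1.fastq.gz'],
    otu_table='otu_table.tsv',
    archive_otu_table='otu_table.archive.json',
    threads=4,
    known_otu_tables=None,
    assignment_method=DIAMOND_ASSIGNMENT_METHOD,
    output_jplace=None,
    output_extras=False,
    evalue='1e-5',
    min_orf_length=96,
    restrict_read_length=None,
    filter_minimum_protein=28,
    filter_minimum_nucleotide=95,
    include_inserts=False,
    singlem_packages=['example.spkg'],
    window_size=60,
    assign_taxonomy=True,
    known_sequence_taxonomy=None,
    working_directory=None,
    force=False)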