def print_appraisal(self, appraisal, output_io=sys.stdout,
                    accounted_for_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Print the Appraisal object overview as a TSV table.

    Writes one row per sample (num_found, num_not_found, percent_found),
    then a 'total' row and an 'average' row.

    Parameters
    ----------
    appraisal:
        object exposing an appraisal_results list; each result has
        num_found, num_not_found, metagenome_sample_name and (when the
        OTU streams below are given) found_otus / not_found_otus.
    output_io: IO
        stream the summary table is written to (default STDOUT).
    accounted_for_otu_table_io: IO or None
        if given, OTUs that were found are written here as an OTU table.
    unaccounted_for_otu_table_io: IO or None
        if given, OTUs that were not found are written here.
    '''
    output_io.write("\t".join(
        ['sample', 'num_found', 'num_not_found', 'percent_found']) + "\n")
    founds = []
    not_founds = []

    def print_sample(num_found, num_not_found, sample, mypercent=None):
        # 'is not None' (not truthiness) so an explicit 0.0 percentage,
        # as can happen for the 'average' row, is still honoured.
        if mypercent is not None:
            percent = mypercent
        elif num_found + num_not_found == 0:
            percent = 0.0
        else:
            percent = float(num_found) / (num_found + num_not_found) * 100
        output_io.write("\t".join(
            [sample, str(num_found), str(num_not_found),
             "%2.1f" % percent]) + "\n")

    def mean(l):
        # NaN for an empty list rather than raising.
        return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

    if accounted_for_otu_table_io:
        accounted_for_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    for appraisal_result in appraisal.appraisal_results:
        print_sample(appraisal_result.num_found,
                     appraisal_result.num_not_found,
                     appraisal_result.metagenome_sample_name)
        founds.append(appraisal_result.num_found)
        not_founds.append(appraisal_result.num_not_found)
        if accounted_for_otu_table_io:
            accounted_for_table.add(appraisal_result.found_otus)
        # Bug fix: this was previously guarded by accounted_for_otu_table_io,
        # which raised NameError when only the accounted-for stream was given
        # and silently skipped output when only the unaccounted stream was.
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)
    print_sample(sum(founds), sum(not_founds), 'total')

    means = []
    for num_found, num_not_found in zip(founds, not_founds):
        total = num_found + num_not_found
        # Bug fix: guard samples with no OTUs at all (previously raised
        # ZeroDivisionError); count them as 0, consistent with the 0.0%
        # shown for such samples in the per-sample rows above.
        means.append(float(num_found) / total if total > 0 else 0.0)
    print_sample("%2.1f" % mean(founds), "%2.1f" % mean(not_founds),
                 'average', mypercent=mean(means) * 100)

    if accounted_for_otu_table_io:
        accounted_for_table.write_to(accounted_for_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
def print_samples(self, **kwargs):
    """Query the OTU database by sample name or by taxonomy substring and
    stream the matching entries to output_io as an OTU table.

    Expected kwargs: db, sample_names, taxonomy, output_io. Exactly one of
    sample_names / taxonomy drives the query; any other key raises.
    """
    db = SequenceDatabase.acquire(kwargs.pop('db'))
    sample_names = kwargs.pop('sample_names')
    taxonomy = kwargs.pop('taxonomy')
    output_io = kwargs.pop('output_io')
    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    dbm = self._connect_to_sqlite(db)
    # sqlite refuses statements with more than 999 bound '?' parameters,
    # so sample names are queried in batches of that size.
    max_set_size = 999

    queries = set(sample_names) if sample_names else [taxonomy]

    table = OtuTable()
    num_written = 0
    for batch in SequenceDatabase.grouper(queries, max_set_size):
        if sample_names:
            rows = dbm.table('otus').where_in(
                'sample_name',
                [name for name in batch if name is not None]).get()
        elif taxonomy:
            rows = dbm.table('otus').where(
                'taxonomy', 'like', "%%%s%%" % taxonomy).get()
        else:
            raise Exception("Programming error")

        for row in rows:
            entry = OtuTableEntry()
            entry.marker = row.marker
            entry.sample_name = row.sample_name
            entry.sequence = row.sequence
            entry.count = row.num_hits
            entry.coverage = row.coverage
            entry.taxonomy = row.taxonomy
            table.add([entry])
            num_written += 1
    table.write_to(output_io)
    logging.info("Printed %i OTU table entries" % num_written)
def print_samples(self, **kwargs):
    """Dump OTU table rows matching either a set of sample names or a
    taxonomy 'like' substring from the given sequence database.

    Expected kwargs: db, sample_names, taxonomy, output_io.
    """
    db = SequenceDatabase.acquire(kwargs.pop('db'))
    sample_names = kwargs.pop('sample_names')
    taxonomy = kwargs.pop('taxonomy')
    output_io = kwargs.pop('output_io')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    dbm = self._connect_to_sqlite(db)
    # Batch the sample-name queries: sqlite rejects statements carrying
    # more than 999 '?' placeholders.
    batch_limit = 999

    def rows_for(chunk):
        # Build and execute the appropriate query for one batch.
        if sample_names:
            wanted = [s for s in chunk if s is not None]
            return dbm.table('otus').where_in('sample_name', wanted).get()
        if taxonomy:
            return dbm.table('otus').where(
                'taxonomy', 'like', "%%%s%%" % taxonomy).get()
        raise Exception("Programming error")

    targets = set(sample_names) if sample_names else [taxonomy]
    collected = OtuTable()
    count = 0
    for chunk in SequenceDatabase.grouper(targets, batch_limit):
        for record in rows_for(chunk):
            e = OtuTableEntry()
            e.marker = record.marker
            e.sample_name = record.sample_name
            e.sequence = record.sequence
            e.count = record.num_hits
            e.coverage = record.coverage
            e.taxonomy = record.taxonomy
            collected.add([e])
            count += 1
    collected.write_to(output_io)
    logging.info("Printed %i OTU table entries" % count)
def run(self, **kwargs):
    '''Run the pipeline: search reads for marker-gene hits, align them,
    optionally assign taxonomy with GraftM, and write the resulting OTU
    table(s) to the requested output files.

    All arguments arrive by keyword; any unexpected key raises Exception.
    Side effects: creates (and usually removes) a working directory, sets
    several self._* attributes read by the helper methods, and writes the
    OTU table / archive table files when those paths are given.
    '''
    # --- Unpack every expected keyword argument with pop() so that any
    # leftover (unexpected) keys can be detected below.
    forward_read_files = kwargs.pop('sequences')
    output_otu_table = kwargs.pop('otu_table', None)
    archive_otu_table = kwargs.pop('archive_otu_table', None)
    num_threads = kwargs.pop('threads')
    known_otu_tables = kwargs.pop('known_otu_tables')
    singlem_assignment_method = kwargs.pop('assignment_method')
    output_jplace = kwargs.pop('output_jplace')
    output_extras = kwargs.pop('output_extras')
    evalue = kwargs.pop('evalue')
    min_orf_length = kwargs.pop('min_orf_length')
    restrict_read_length = kwargs.pop('restrict_read_length')
    filter_minimum_protein = kwargs.pop('filter_minimum_protein')
    filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
    include_inserts = kwargs.pop('include_inserts')
    singlem_packages = kwargs.pop('singlem_packages')
    window_size = kwargs.pop('window_size')  # NOTE(review): popped but never used in this method
    assign_taxonomy = kwargs.pop('assign_taxonomy')
    known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')
    working_directory = kwargs.pop('working_directory')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    # Stash settings consumed by the helper methods (_search, _align, ...).
    self._num_threads = num_threads
    self._evalue = evalue
    self._min_orf_length = min_orf_length
    self._restrict_read_length = restrict_read_length
    self._filter_minimum_protein = filter_minimum_protein
    self._filter_minimum_nucleotide = filter_minimum_nucleotide

    hmms = HmmDatabase(singlem_packages)
    # The "diamond example best hit" method is implemented on top of plain
    # diamond assignment; GraftM itself is run with the diamond method and
    # the results are interpreted specially further below.
    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
        graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
    else:
        graftm_assignment_method = singlem_assignment_method

    # Mirror our own log level in the verbosity handed to GraftM.
    if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
        self._graftm_verbosity = '5'
    else:
        self._graftm_verbosity = '2'

    using_temporary_working_directory = working_directory is None
    if using_temporary_working_directory:
        # Prefer a RAM-backed directory (/dev/shm) for speed when available.
        shared_mem_directory = '/dev/shm'
        if os.path.exists(shared_mem_directory):
            logging.debug("Using shared memory as a base directory")
            tmp = tempdir.TempDir(basedir=shared_mem_directory)
            tempfiles_path = os.path.join(tmp.name, 'tempfiles')
            os.mkdir(tempfiles_path)
            # Point child processes' temp files at shared memory as well.
            os.environ['TEMP'] = tempfiles_path
        else:
            logging.debug("Shared memory directory not detected, using default temporary directory instead")
            tmp = tempdir.TempDir()
        working_directory = tmp.name
    else:
        working_directory = working_directory  # user-supplied path (redundant self-assignment kept as-is)
        if os.path.exists(working_directory):
            if force:
                logging.info("Overwriting directory %s" % working_directory)
                shutil.rmtree(working_directory)
                os.mkdir(working_directory)
            else:
                raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
        else:
            os.mkdir(working_directory)
    logging.debug("Using working directory %s" % working_directory)
    self._working_directory = working_directory

    extracted_reads = None

    def return_cleanly():
        # Remove intermediate files and (if we created one) the temporary
        # working directory before returning.
        if extracted_reads: extracted_reads.cleanup()
        if using_temporary_working_directory: tmp.dissolve()
        logging.info("Finished")

    #### Search: find reads hitting each package's HMMs.
    self._singlem_package_database = hmms
    search_result = self._search(hmms, forward_read_files)
    sample_names = search_result.samples_with_hits()
    if len(sample_names) == 0:
        logging.info("No reads identified in any samples, stopping")
        return_cleanly()
        return
    logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                  % (len(sample_names), sample_names[0]))

    #### Alignment
    align_result = self._align(search_result)

    ### Extract reads that have already known taxonomy
    if known_otu_tables:
        logging.info("Parsing known taxonomy OTU tables")
        known_taxes = KnownOtuTable()
        known_taxes.parse_otu_tables(known_otu_tables)
        logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
    else:
        known_taxes = []

    if known_sequence_taxonomy:
        logging.debug("Parsing sequence-wise taxonomy..")
        tax1 = GreenGenesTaxonomy.read(open(known_sequence_taxonomy)).taxonomy
        known_sequence_tax = {}
        for seq_id, tax in tax1.items():
            # Store one semicolon-joined taxonomy string per sequence id.
            known_sequence_tax[seq_id] = '; '.join(tax)
        logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" % len(known_sequence_tax))

    ### Extract other reads which do not have known taxonomy
    extracted_reads = self._extract_relevant_reads(
        align_result, include_inserts, known_taxes)
    logging.info("Finished extracting aligned sequences")

    #### Taxonomic assignment
    if assign_taxonomy:
        logging.info("Running taxonomic assignment with graftm..")
        assignment_result = self._assign_taxonomy(
            extracted_reads, graftm_assignment_method)

    #### Process taxonomically assigned reads
    # get the sequences out for each of them
    otu_table_object = OtuTable()
    regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
    otu_table_object.fields = regular_output_fields + \
        split('read_names nucleotides_aligned taxonomy_by_known?')

    # extracted_reads yields one tuple per (sample, package) pair; tmp_graft
    # is falsy when no sequences beyond the already-known ones were aligned.
    for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
        def add_info(infos, otu_table_object, known_tax):
            # Append one OTU table row per windowed-sequence info object.
            for info in infos:
                to_print = [
                    singlem_package.graftm_package_basename(),
                    sample_name,
                    info.seq,
                    info.count,
                    info.coverage,
                    info.taxonomy,
                    info.names,
                    info.aligned_lengths,
                    known_tax]
                otu_table_object.data.append(to_print)

        # Rows whose taxonomy was already known before this run.
        known_infos = self._seqs_to_counts_and_taxonomy(
            known_sequences, known_taxes, False, True)
        add_info(known_infos, otu_table_object, True)

        if tmp_graft: # if any sequences were aligned (not just already known)
            tmpbase = os.path.basename(tmp_graft.name[:-6]) # remove .fasta
            if assign_taxonomy:
                is_known_taxonomy = False
                aligned_seqs = self._get_windowed_sequences(
                    assignment_result.prealigned_sequence_file(
                        sample_name, singlem_package, tmpbase),
                    assignment_result.nucleotide_hits_file(
                        sample_name, singlem_package, tmpbase),
                    singlem_package, include_inserts)
                # Pick the taxonomy file matching the assignment method.
                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    tax_file = assignment_result.diamond_assignment_file(
                        sample_name, singlem_package, tmpbase)
                else:
                    tax_file = assignment_result.read_tax_file(
                        sample_name, singlem_package, tmpbase)
                logging.debug("Reading taxonomy from %s" % tax_file)
                if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                    taxonomies = DiamondResultParser(tax_file)
                    use_first = True
                else:
                    if not os.path.isfile(tax_file):
                        # NOTE(review): "do to" in this message is a typo for
                        # "due to"; left unchanged here (runtime string).
                        logging.warn("Unable to find tax file for gene %s from sample %s "
                                     "(likely do to min length filtering), skipping" % (
                                         os.path.basename(singlem_package.base_directory()),
                                         sample_name))
                        taxonomies = {}
                    else:
                        taxonomies = TaxonomyFile(tax_file)
                    use_first = False
            else: # Taxonomy has not been assigned.
                aligned_seqs = unknown_sequences
                if known_sequence_taxonomy:
                    taxonomies = known_sequence_tax
                else:
                    taxonomies = {}
                use_first = False # irrelevant
                is_known_taxonomy = True

            new_infos = list(self._seqs_to_counts_and_taxonomy(
                aligned_seqs, taxonomies, use_first, False))
            add_info(new_infos, otu_table_object, is_known_taxonomy)

            if output_jplace:
                # Rewrite GraftM's placements.jplace into a per-sample,
                # per-package singlem jplace file next to the original.
                base_dir = assignment_result._base_dir(
                    sample_name, singlem_package, tmpbase)
                input_jplace_file = os.path.join(base_dir, "placements.jplace")
                output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                    output_jplace, sample_name, singlem_package.graftm_package_basename()))
                logging.debug("Converting jplace file %s to singlem jplace file %s" % (
                    input_jplace_file, output_jplace_file))
                with open(output_jplace_file, 'w') as output_jplace_io:
                    self._write_jplace_from_infos(
                        open(input_jplace_file), new_infos, output_jplace_io)

    if output_otu_table:
        with open(output_otu_table, 'w') as f:
            if output_extras:
                # Include the extra columns (read names etc.) as well.
                otu_table_object.write_to(f, otu_table_object.fields)
            else:
                otu_table_object.write_to(f, regular_output_fields)
    if archive_otu_table:
        with open(archive_otu_table, 'w') as f:
            otu_table_object.archive(hmms.singlem_packages).write_to(f)

    return_cleanly()
def print_appraisal(self, appraisal,
                    doing_binning,
                    output_io=sys.stdout,
                    doing_assembly=False,
                    binned_otu_table_io=None,
                    unbinned_otu_table_io=None,
                    assembled_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Print the Appraisal object overview to output_io as a TSV table.

    One row per sample, then a 'total' row and an 'average' row. Columns
    are chosen by the doing_binning / doing_assembly flags.

    Parameters
    ----------
    appraisal:
        object exposing an appraisal_results list.
    doing_binning: bool
        include num_binned / percent_binned columns.
    output_io: IO
        stream for the summary table (default STDOUT).
    doing_assembly: bool
        include num_assembled / percent_assembled columns.
    binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
    unaccounted_for_otu_table_io: IO or None
        when given, the corresponding OTU tables are also written to
        these streams.
    '''
    headers = ['sample']
    if doing_binning:
        headers.append('num_binned')
    if doing_assembly:
        headers.append('num_assembled')
    headers.append('num_not_found')
    if doing_binning:
        headers.append('percent_binned')
    if doing_assembly:
        headers.append('percent_assembled')
    output_io.write("\t".join(headers) + "\n")

    binned = []
    assembled = []
    assembled_not_binned = []
    not_founds = []

    def print_sample(num_binned, num_assembled, num_assembled_not_binned,
                     num_not_found, sample,
                     mypercent_binned=None, mypercent_assembled=None):
        # Caller-supplied percentages (the 'average' row) take precedence;
        # otherwise percentages are derived from the counts. The total is
        # binned + assembled-but-not-binned + not-found so categories are
        # not double counted.
        if mypercent_binned is not None or mypercent_assembled is not None:
            if doing_binning:
                percent_binned = mypercent_binned
            if doing_assembly:
                percent_assembled = mypercent_assembled
        else:
            total = num_not_found
            if doing_binning:
                total += num_binned
            if doing_assembly:
                total += num_assembled_not_binned
            if total == 0:
                if doing_binning:
                    percent_binned = 0.0
                if doing_assembly:
                    percent_assembled = 0.0
            else:
                if doing_binning:
                    percent_binned = float(num_binned) / total * 100
                if doing_assembly:
                    percent_assembled = float(num_assembled) / total * 100
        to_write = [sample]
        if doing_binning:
            to_write.append(str(num_binned))
        if doing_assembly:
            to_write.append(str(num_assembled))
        to_write.append(str(num_not_found))
        if doing_binning:
            to_write.append("%2.1f" % percent_binned)
        if doing_assembly:
            to_write.append("%2.1f" % percent_assembled)
        output_io.write("\t".join(to_write) + "\n")

    def mean(l):
        # NaN for an empty list rather than raising.
        return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

    if binned_otu_table_io:
        binned_table = OtuTable()
    if unbinned_otu_table_io:
        unbinned_table = OtuTable()
    if assembled_otu_table_io:
        assembled_table = OtuTable()
    if unaccounted_for_otu_table_io:
        unaccounted_for_table = OtuTable()

    for appraisal_result in appraisal.appraisal_results:
        if doing_assembly:
            num_assembled_not_binned = appraisal_result.num_assembled_not_binned()
        print_sample(
            appraisal_result.num_binned if doing_binning else None,
            appraisal_result.num_assembled if doing_assembly else None,
            num_assembled_not_binned if doing_assembly else None,
            appraisal_result.num_not_found,
            appraisal_result.metagenome_sample_name)
        if doing_binning:
            binned.append(appraisal_result.num_binned)
        if doing_assembly:
            assembled.append(appraisal_result.num_assembled)
            assembled_not_binned.append(num_assembled_not_binned)
        not_founds.append(appraisal_result.num_not_found)
        if binned_otu_table_io:
            binned_table.add(appraisal_result.binned_otus)
        if unbinned_otu_table_io:
            unbinned_table.add(appraisal_result.assembled_not_binned_otus())
        if assembled_otu_table_io:
            assembled_table.add(appraisal_result.assembled_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(
        sum(binned) if doing_binning else None,
        sum(assembled) if doing_assembly else None,
        sum(assembled_not_binned) if doing_assembly else None,
        sum(not_founds), 'total')

    binned_means = []
    assembled_means = []
    to_enumerate = binned if doing_binning else assembled
    for i, _ in enumerate(to_enumerate):
        num_binned = binned[i] if doing_binning else 0
        num_assembled = assembled[i] if doing_assembly else 0
        num_assembled_not_binned = assembled_not_binned[
            i] if doing_assembly else 0
        num_not_found = not_founds[i]
        total = num_assembled_not_binned + num_not_found
        if doing_binning:
            total += num_binned
        if total == 0:
            # Bug fix: a sample with no OTUs at all previously raised
            # ZeroDivisionError here; count it as 0, consistent with the
            # 0.0% shown for such samples in the per-sample rows above.
            if doing_binning:
                binned_means.append(0.0)
            if doing_assembly:
                assembled_means.append(0.0)
            continue
        if doing_binning:
            binned_means.append(float(num_binned) / total)
        if doing_assembly:
            assembled_means.append(float(num_assembled) / total)

    print_sample("%2.1f" % mean(binned) if doing_binning else None,
                 "%2.1f" % mean(assembled) if doing_assembly else None,
                 None,
                 "%2.1f" % mean(not_founds), 'average',
                 mypercent_binned=mean(binned_means) * 100 if doing_binning else None,
                 mypercent_assembled=(mean(assembled_means) * 100
                                      if doing_assembly else None))

    if binned_otu_table_io:
        binned_table.write_to(binned_otu_table_io)
    if unbinned_otu_table_io:
        unbinned_table.write_to(unbinned_otu_table_io)
    if assembled_otu_table_io:
        assembled_table.write_to(assembled_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
def print_appraisal(self, appraisal,
                    doing_binning,
                    output_io=sys.stdout,
                    doing_assembly=False,
                    binned_otu_table_io=None,
                    unbinned_otu_table_io=None,
                    assembled_otu_table_io=None,
                    unaccounted_for_otu_table_io=None):
    '''Print the Appraisal object overview to output_io as a TSV table:
    one row per sample plus 'total' and 'average' rows, with columns
    selected by doing_binning / doing_assembly. When any of the four
    *_otu_table_io streams is given, the matching OTU table is written
    to it as well.
    '''
    headers = ['sample']
    if doing_binning: headers.append('num_binned')
    if doing_assembly: headers.append('num_assembled')
    headers.append('num_not_found')
    if doing_binning: headers.append('percent_binned')
    if doing_assembly: headers.append('percent_assembled')
    output_io.write("\t".join(headers)+"\n")

    binned = []
    assembled = []
    assembled_not_binned = []
    not_founds = []

    def print_sample(num_binned, num_assembled, num_assembled_not_binned, num_not_found,
                     sample, mypercent_binned=None, mypercent_assembled=None):
        # Explicit percentages (used for the 'average' row) win; otherwise
        # derive them from counts. Denominator is binned + assembled-not-
        # binned + not-found so overlapping categories are not double counted.
        if mypercent_binned is not None or mypercent_assembled is not None:
            if doing_binning:
                percent_binned = mypercent_binned
            if doing_assembly:
                percent_assembled = mypercent_assembled
        else:
            total = num_not_found
            if doing_binning:
                total += num_binned
            if doing_assembly:
                total += num_assembled_not_binned
            if total == 0:
                if doing_binning:
                    percent_binned = 0.0
                if doing_assembly:
                    percent_assembled = 0.0
            else:
                if doing_binning:
                    percent_binned = float(num_binned)/total * 100
                if doing_assembly:
                    percent_assembled = float(num_assembled)/total * 100
        to_write = [sample]
        if doing_binning: to_write.append(str(num_binned))
        if doing_assembly: to_write.append(str(num_assembled))
        to_write.append(str(num_not_found))
        if doing_binning: to_write.append("%2.1f" % percent_binned)
        if doing_assembly: to_write.append("%2.1f" % percent_assembled)
        output_io.write("\t".join(to_write)+"\n")

    def mean(l):
        # NaN rather than raising on an empty list.
        return float(sum(l))/len(l) if len(l) > 0 else float('nan')

    if binned_otu_table_io: binned_table = OtuTable()
    if unbinned_otu_table_io: unbinned_table = OtuTable()
    if assembled_otu_table_io: assembled_table = OtuTable()
    if unaccounted_for_otu_table_io: unaccounted_for_table = OtuTable()

    for appraisal_result in appraisal.appraisal_results:
        if doing_assembly:
            num_assembled_not_binned = appraisal_result.num_assembled_not_binned()
        print_sample(appraisal_result.num_binned if doing_binning else None,
                     appraisal_result.num_assembled if doing_assembly else None,
                     num_assembled_not_binned if doing_assembly else None,
                     appraisal_result.num_not_found,
                     appraisal_result.metagenome_sample_name)
        if doing_binning:
            binned.append(appraisal_result.num_binned)
        if doing_assembly:
            assembled.append(appraisal_result.num_assembled)
            assembled_not_binned.append(num_assembled_not_binned)
        not_founds.append(appraisal_result.num_not_found)
        if binned_otu_table_io:
            binned_table.add(appraisal_result.binned_otus)
        if unbinned_otu_table_io:
            unbinned_table.add(appraisal_result.assembled_not_binned_otus())
        if assembled_otu_table_io:
            assembled_table.add(appraisal_result.assembled_otus)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.add(appraisal_result.not_found_otus)

    print_sample(sum(binned) if doing_binning else None,
                 sum(assembled) if doing_assembly else None,
                 sum(assembled_not_binned) if doing_assembly else None,
                 sum(not_founds), 'total')

    binned_means = []
    assembled_means = []
    if doing_binning:
        to_enumerate = binned
    else:
        to_enumerate = assembled
    for i, _ in enumerate(to_enumerate):
        num_binned = binned[i] if doing_binning else 0
        num_assembled = assembled[i] if doing_assembly else 0
        num_assembled_not_binned = assembled_not_binned[i] if doing_assembly else 0
        num_not_found = not_founds[i]
        total = num_assembled_not_binned+num_not_found
        if doing_binning:
            total += num_binned
        if total == 0:
            # Bug fix: an all-zero sample previously raised ZeroDivisionError
            # here; treat it as 0, matching the 0.0% per-sample row above.
            if doing_binning: binned_means.append(0.0)
            if doing_assembly: assembled_means.append(0.0)
            continue
        if doing_binning:
            binned_means.append(float(num_binned)/total)
        if doing_assembly:
            assembled_means.append(float(num_assembled)/total)

    print_sample("%2.1f" % mean(binned) if doing_binning else None,
                 "%2.1f" % mean(assembled) if doing_assembly else None,
                 None,
                 "%2.1f" % mean(not_founds), 'average',
                 mypercent_binned=mean(binned_means)*100 if doing_binning else None,
                 mypercent_assembled=(mean(assembled_means)*100 if doing_assembly else None))

    if binned_otu_table_io:
        binned_table.write_to(binned_otu_table_io)
    if unbinned_otu_table_io:
        unbinned_table.write_to(unbinned_otu_table_io)
    if assembled_otu_table_io:
        assembled_table.write_to(assembled_otu_table_io)
    if unaccounted_for_otu_table_io:
        unaccounted_for_table.write_to(unaccounted_for_otu_table_io)