예제 #1
0
 def print_appraisal(self, appraisal,
                     output_io=sys.stdout,
                     accounted_for_otu_table_io=None,
                     unaccounted_for_otu_table_io=None):
     '''Write a tab-separated overview of an Appraisal object.

     One row is written for each entry of appraisal.appraisal_results,
     followed by a 'total' row (sums over samples) and an 'average' row
     (mean counts, and the mean of the per-sample found fractions).

     Parameters
     ----------
     appraisal:
         object with an appraisal_results iterable; each result provides
         num_found, num_not_found and metagenome_sample_name, plus
         found_otus / not_found_otus when the matching table is requested.
     output_io:
         writable stream for the overview (default: sys.stdout).
     accounted_for_otu_table_io:
         if given, the found OTUs of every sample are collected into an
         OtuTable and written to this stream.
     unaccounted_for_otu_table_io:
         if given, the not-found OTUs of every sample are collected into
         an OtuTable and written to this stream.
     '''

     output_io.write("\t".join(['sample','num_found','num_not_found','percent_found'])+"\n")
     founds = []
     not_founds = []

     def fraction_found(num_found, num_not_found):
         # A sample without any OTUs is reported as 0% found instead of
         # dividing by zero.
         total = num_found + num_not_found
         if total == 0:
             return 0.0
         return float(num_found)/total

     def print_sample(num_found, num_not_found, sample, mypercent=None):
         # Compare against None explicitly: a pre-computed percentage of
         # 0.0 is a valid value and must not trigger recalculation (the
         # counts of the 'average' row are already formatted strings).
         if mypercent is not None:
             percent = mypercent
         else:
             percent = fraction_found(num_found, num_not_found) * 100
         output_io.write("\t".join([sample, str(num_found), str(num_not_found), "%2.1f" % percent])+"\n")

     def mean(l):
         return float(sum(l))/len(l) if len(l) > 0 else float('nan')

     if accounted_for_otu_table_io:
         accounted_for_table = OtuTable()
     if unaccounted_for_otu_table_io:
         unaccounted_for_table = OtuTable()

     for appraisal_result in appraisal.appraisal_results:
         print_sample(appraisal_result.num_found,
                      appraisal_result.num_not_found,
                      appraisal_result.metagenome_sample_name)
         founds.append(appraisal_result.num_found)
         not_founds.append(appraisal_result.num_not_found)
         if accounted_for_otu_table_io:
             accounted_for_table.add(appraisal_result.found_otus)
         # Guard on the unaccounted-for stream: that is the condition under
         # which unaccounted_for_table was created above.
         if unaccounted_for_otu_table_io:
             unaccounted_for_table.add(appraisal_result.not_found_otus)

     print_sample(sum(founds), sum(not_founds), 'total')

     # The 'average' percentage is the mean of the per-sample fractions,
     # not the fraction of the summed counts.
     means = [fraction_found(num_found, num_not_found)
              for num_found, num_not_found in zip(founds, not_founds)]
     print_sample("%2.1f" % mean(founds), "%2.1f" % mean(not_founds), 'average',
                  mypercent=mean(means)*100)

     if accounted_for_otu_table_io:
         accounted_for_table.write_to(accounted_for_otu_table_io)
     if unaccounted_for_otu_table_io:
         unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
예제 #2
0
    def print_samples(self, **kwargs):
        '''Look up OTUs in a sequence database and write them as an OTU table.

        Keyword arguments (all required):
            db: passed to SequenceDatabase.acquire()
            sample_names: sample names to select; when truthy, rows are
                matched on the 'sample_name' column
            taxonomy: used when sample_names is falsy; rows whose
                'taxonomy' column contains this string are selected
            output_io: stream the resulting OtuTable is written to

        Raises an Exception for any other keyword, or when neither
        sample_names nor taxonomy is given.
        '''
        db = SequenceDatabase.acquire(kwargs.pop('db'))
        sample_names = kwargs.pop('sample_names')
        taxonomy = kwargs.pop('taxonomy')
        output_io = kwargs.pop('output_io')
        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        dbm = self._connect_to_sqlite(db)

        # Cannot query sqlite with > 999 '?' entries, so query in batches.
        batch_limit = 999
        to_query = set(sample_names) if sample_names else [taxonomy]

        collected = OtuTable()
        num_entries = 0
        for batch in SequenceDatabase.grouper(to_query, batch_limit):
            if sample_names:
                # None entries in the batch are dropped before querying.
                wanted = [name for name in batch if name is not None]
                rows = dbm.table('otus').where_in('sample_name', wanted).get()
            elif taxonomy:
                like_pattern = "%%%s%%" % taxonomy
                rows = dbm.table('otus').where(
                    'taxonomy', 'like', like_pattern).get()
            else:
                raise Exception("Programming error")

            for row in rows:
                record = OtuTableEntry()
                record.marker = row.marker
                record.sample_name = row.sample_name
                record.sequence = row.sequence
                record.count = row.num_hits
                record.coverage = row.coverage
                record.taxonomy = row.taxonomy
                collected.add([record])
                num_entries += 1

        collected.write_to(output_io)
        logging.info("Printed %i OTU table entries" % num_entries)
예제 #3
0
파일: querier.py 프로젝트: wwood/singlem
    def print_samples(self, **kwargs):
        '''Write the OTUs of selected samples (or of a taxonomy match) to a
        stream in OTU table form.

        Required keywords: db (handed to SequenceDatabase.acquire),
        sample_names, taxonomy and output_io. When sample_names is truthy
        the selection is by sample name; otherwise rows whose taxonomy
        contains the given string are selected. Any extra keyword, or
        supplying neither selector, raises an Exception.
        '''
        db = SequenceDatabase.acquire(kwargs.pop('db'))
        sample_names = kwargs.pop('sample_names')
        taxonomy = kwargs.pop('taxonomy')
        output_io = kwargs.pop('output_io')
        if len(kwargs) != 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        dbm = self._connect_to_sqlite(db)

        def to_otu(db_row):
            # Copy one database row into a fresh OtuTableEntry.
            e = OtuTableEntry()
            e.marker = db_row.marker
            e.sample_name = db_row.sample_name
            e.sequence = db_row.sequence
            e.count = db_row.num_hits
            e.coverage = db_row.coverage
            e.taxonomy = db_row.taxonomy
            return e

        # sqlite rejects queries with more than 999 '?' placeholders, so
        # the selection is issued in groups of at most that size.
        max_set_size = 999
        if sample_names:
            query_chunks = set(sample_names)
        else:
            query_chunks = [taxonomy]

        otus = OtuTable()
        total_printed = 0
        for chunk in SequenceDatabase.grouper(query_chunks, max_set_size):
            otu_rows = dbm.table('otus')
            if sample_names:
                present = [sample for sample in chunk if sample is not None]
                found = otu_rows.where_in('sample_name', present).get()
            elif taxonomy:
                found = otu_rows.where(
                    'taxonomy', 'like', "%%%s%%" % taxonomy).get()
            else:
                raise Exception("Programming error")

            for db_row in found:
                otus.add([to_otu(db_row)])
                total_printed += 1
        otus.write_to(output_io)
        logging.info("Printed %i OTU table entries" % total_printed)
예제 #4
0
    def run(self, **kwargs):
        '''Search, align and extract reads for each sample / package, then
        optionally assign taxonomy and write the resulting OTU table(s).

        All arguments are keywords. Required (popped without a default, so
        a missing one raises KeyError): sequences, threads,
        known_otu_tables, assignment_method, output_jplace, output_extras,
        evalue, min_orf_length, restrict_read_length,
        filter_minimum_protein, filter_minimum_nucleotide, include_inserts,
        singlem_packages, window_size, assign_taxonomy,
        known_sequence_taxonomy, working_directory, force.
        Optional (default None): otu_table, archive_otu_table.

        Raises an Exception when an unknown keyword is supplied, or when
        working_directory already exists and force is false.

        Returns None. When no sample has any hit the method cleans up and
        returns before any output file is written.
        '''
        forward_read_files = kwargs.pop('sequences')
        output_otu_table = kwargs.pop('otu_table', None)
        archive_otu_table = kwargs.pop('archive_otu_table', None)
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        output_extras = kwargs.pop('output_extras')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        # window_size is part of the accepted interface but is not used in
        # this method; pop it so it is not reported as unexpected.
        kwargs.pop('window_size')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        # The "diamond example best hit" method is run through graftm as
        # plain diamond assignment; the difference is in which result file
        # is parsed further below.
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                # Process-wide side effect: TEMP now points inside the
                # temporary working directory.
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug("Shared memory directory not detected, using default temporary directory instead")
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" % working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        extracted_reads = None
        def return_cleanly():
            # extracted_reads is looked up when this is called, so it sees
            # the value assigned after the extraction step below.
            if extracted_reads: extracted_reads.cleanup()
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file as soon as it has been parsed.
            with open(known_sequence_taxonomy) as known_sequence_taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(known_sequence_taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(
            align_result, include_inserts, known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with graftm..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
        otu_table_object.fields = regular_output_fields + \
                                  split('read_names nucleotides_aligned taxonomy_by_known?')

        for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
            def add_info(infos, otu_table_object, known_tax):
                # Append one OTU table row per info, in the column order of
                # otu_table_object.fields set above.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(),
                        sample_name,
                        info.seq,
                        info.count,
                        info.coverage,
                        info.taxonomy,
                        info.names,
                        info.aligned_lengths,
                        known_tax]
                    otu_table_object.data.append(to_print)
            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences,
                known_taxes,
                False,
                True)
            add_info(known_infos, otu_table_object, True)

            if tmp_graft: # if any sequences were aligned (not just already known)
                tmpbase = os.path.basename(tmp_graft.name[:-6])#remove .fasta

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = self._get_windowed_sequences(
                        assignment_result.prealigned_sequence_file(
                            sample_name, singlem_package, tmpbase),
                        assignment_result.nucleotide_hits_file(
                            sample_name, singlem_package, tmpbase),
                        singlem_package,
                        include_inserts)
                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                    else:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                    logging.debug("Reading taxonomy from %s" % tax_file)

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        taxonomies = DiamondResultParser(tax_file)
                        use_first = True
                    else:
                        if not os.path.isfile(tax_file):
                            # logging.warn is a deprecated alias that newer
                            # Python versions no longer provide.
                            logging.warning("Unable to find tax file for gene %s from sample %s "
                                            "(likely do to min length filtering), skipping" % (
                                                os.path.basename(singlem_package.base_directory()),
                                                sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)
                        use_first = False

                else: # Taxonomy has not been assigned.
                    aligned_seqs = unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    use_first = False # irrelevant
                    is_known_taxonomy = True

                new_infos = list(self._seqs_to_counts_and_taxonomy(
                    aligned_seqs, taxonomies, use_first, False))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    # NOTE(review): assignment_result is only bound when
                    # assign_taxonomy is true; requesting output_jplace
                    # without assign_taxonomy raises NameError here -
                    # confirm callers never combine the two.
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir, "placements.jplace")
                    output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                        output_jplace, sample_name, singlem_package.graftm_package_basename()))
                    logging.debug("Converting jplace file %s to singlem jplace file %s" % (
                        input_jplace_file, output_jplace_file))
                    # Open the input first (as before), and close both
                    # handles once the conversion has finished.
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        with open(input_jplace_file) as input_jplace_io:
                            self._write_jplace_from_infos(
                                input_jplace_io, new_infos, output_jplace_io)


        if output_otu_table:
            with open(output_otu_table, 'w') as f:
                if output_extras:
                    otu_table_object.write_to(f, otu_table_object.fields)
                else:
                    otu_table_object.write_to(f, regular_output_fields)
        if archive_otu_table:
            with open(archive_otu_table, 'w') as f:
                otu_table_object.archive(hmms.singlem_packages).write_to(f)
        return_cleanly()
예제 #5
0
    def print_appraisal(self,
                        appraisal,
                        doing_binning,
                        output_io=sys.stdout,
                        doing_assembly=False,
                        binned_otu_table_io=None,
                        unbinned_otu_table_io=None,
                        assembled_otu_table_io=None,
                        unaccounted_for_otu_table_io=None):
        '''Write a tab-separated overview of an Appraisal object.

        One row is written for each entry of appraisal.appraisal_results,
        followed by a 'total' row (sums over samples) and an 'average' row
        (mean counts, and the mean of the per-sample fractions).

        Parameters
        ----------
        appraisal:
            object with an appraisal_results iterable; each result provides
            num_not_found and metagenome_sample_name, num_binned when
            doing_binning, and num_assembled / num_assembled_not_binned()
            when doing_assembly.
        doing_binning:
            include the binned count and percentage columns.
        output_io:
            writable stream for the overview (default: sys.stdout).
        doing_assembly:
            include the assembled count and percentage columns.
        binned_otu_table_io, unbinned_otu_table_io, assembled_otu_table_io,
        unaccounted_for_otu_table_io:
            optional streams; for each one given, the corresponding OTUs of
            every sample are collected into an OtuTable and written to it.
        '''

        headers = ['sample']
        if doing_binning: headers.append('num_binned')
        if doing_assembly: headers.append('num_assembled')
        headers.append('num_not_found')
        if doing_binning: headers.append('percent_binned')
        if doing_assembly: headers.append('percent_assembled')
        output_io.write("\t".join(headers) + "\n")

        binned = []
        assembled = []
        assembled_not_binned = []
        not_founds = []

        def print_sample(num_binned,
                         num_assembled,
                         num_assembled_not_binned,
                         num_not_found,
                         sample,
                         mypercent_binned=None,
                         mypercent_assembled=None):
            if mypercent_binned is not None or mypercent_assembled is not None:
                # Pre-computed percentages were supplied (the 'average'
                # row), so the counts are not used for the calculation.
                if doing_binning:
                    percent_binned = mypercent_binned
                if doing_assembly:
                    percent_assembled = mypercent_assembled
            else:
                total = num_not_found
                if doing_binning: total += num_binned
                if doing_assembly: total += num_assembled_not_binned
                if total == 0:
                    if doing_binning: percent_binned = 0.0
                    if doing_assembly: percent_assembled = 0.0
                else:
                    if doing_binning:
                        percent_binned = float(num_binned) / total * 100
                    if doing_assembly:
                        percent_assembled = float(num_assembled) / total * 100
            to_write = [sample]
            if doing_binning: to_write.append(str(num_binned))
            if doing_assembly: to_write.append(str(num_assembled))
            to_write.append(str(num_not_found))
            if doing_binning:
                to_write.append("%2.1f" % percent_binned)
            if doing_assembly:
                to_write.append("%2.1f" % percent_assembled)
            output_io.write("\t".join(to_write) + "\n")

        def mean(l):
            return float(sum(l)) / len(l) if len(l) > 0 else float('nan')

        def fraction(count, total):
            # A sample without any OTUs contributes 0.0, matching the 0.0%
            # that print_sample reports for it, instead of dividing by zero.
            return float(count) / total if total != 0 else 0.0

        if binned_otu_table_io:
            binned_table = OtuTable()
        if unbinned_otu_table_io:
            unbinned_table = OtuTable()
        if assembled_otu_table_io:
            assembled_table = OtuTable()
        if unaccounted_for_otu_table_io:
            unaccounted_for_table = OtuTable()

        for appraisal_result in appraisal.appraisal_results:
            if doing_assembly:
                num_assembled_not_binned = appraisal_result.num_assembled_not_binned(
                )
            print_sample(
                appraisal_result.num_binned if doing_binning else None,
                appraisal_result.num_assembled if doing_assembly else None,
                num_assembled_not_binned if doing_assembly else None,
                appraisal_result.num_not_found,
                appraisal_result.metagenome_sample_name)
            if doing_binning:
                binned.append(appraisal_result.num_binned)
            if doing_assembly:
                assembled.append(appraisal_result.num_assembled)
                assembled_not_binned.append(num_assembled_not_binned)
            not_founds.append(appraisal_result.num_not_found)
            if binned_otu_table_io:
                binned_table.add(appraisal_result.binned_otus)
            if unbinned_otu_table_io:
                unbinned_table.add(
                    appraisal_result.assembled_not_binned_otus())
            if assembled_otu_table_io:
                assembled_table.add(appraisal_result.assembled_otus)
            if unaccounted_for_otu_table_io:
                unaccounted_for_table.add(appraisal_result.not_found_otus)

        print_sample(
            sum(binned) if doing_binning else None,
            sum(assembled) if doing_assembly else None,
            sum(assembled_not_binned) if doing_assembly else None,
            sum(not_founds), 'total')

        # Per-sample fractions; the 'average' percentages are their means.
        # not_founds has one entry per sample regardless of the flags.
        binned_means = []
        assembled_means = []
        for i, num_not_found in enumerate(not_founds):
            num_binned = binned[i] if doing_binning else 0
            num_assembled = assembled[i] if doing_assembly else 0
            num_assembled_not_binned = assembled_not_binned[
                i] if doing_assembly else 0
            total = num_assembled_not_binned + num_not_found
            if doing_binning:
                total += num_binned
                binned_means.append(fraction(num_binned, total))
            if doing_assembly:
                assembled_means.append(fraction(num_assembled, total))
        print_sample("%2.1f" % mean(binned) if doing_binning else None,
                     "%2.1f" % mean(assembled) if doing_assembly else None,
                     None,
                     "%2.1f" % mean(not_founds),
                     'average',
                     mypercent_binned=mean(binned_means) *
                     100 if doing_binning else None,
                     mypercent_assembled=(mean(assembled_means) *
                                          100 if doing_assembly else None))

        if binned_otu_table_io:
            binned_table.write_to(binned_otu_table_io)
        if unbinned_otu_table_io:
            unbinned_table.write_to(unbinned_otu_table_io)
        if assembled_otu_table_io:
            assembled_table.write_to(assembled_otu_table_io)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.write_to(unaccounted_for_otu_table_io)
예제 #6
0
파일: appraiser.py 프로젝트: wwood/singlem
    def print_appraisal(self, appraisal,
                        doing_binning,
                        output_io=sys.stdout,
                        doing_assembly=False,
                        binned_otu_table_io=None,
                        unbinned_otu_table_io=None,
                        assembled_otu_table_io=None,
                        unaccounted_for_otu_table_io=None):
        '''Write a tab-separated overview of an Appraisal object.

        A header row is written first, then one row per entry of
        appraisal.appraisal_results, then a 'total' row and an 'average'
        row. The binned columns appear only when doing_binning is true and
        the assembled columns only when doing_assembly is true.

        appraisal -- object with an appraisal_results iterable. Each result
            provides num_not_found and metagenome_sample_name, num_binned
            (when doing_binning), and num_assembled plus the
            num_assembled_not_binned() method (when doing_assembly).
        output_io -- writable stream for the overview (default sys.stdout).
        *_otu_table_io -- optional streams; for each one given, the
            matching OTUs of all samples are gathered in an OtuTable which
            is written to that stream at the end.
        '''

        headers = ['sample']
        if doing_binning: headers.append('num_binned')
        if doing_assembly: headers.append('num_assembled')
        headers.append('num_not_found')
        if doing_binning: headers.append('percent_binned')
        if doing_assembly: headers.append('percent_assembled')
        output_io.write("\t".join(headers)+"\n")

        binned = []
        assembled = []
        assembled_not_binned = []
        not_founds = []

        def print_sample(num_binned, num_assembled, num_assembled_not_binned, num_not_found, sample,
                         mypercent_binned=None, mypercent_assembled=None):
            if mypercent_binned is not None or mypercent_assembled is not None:
                # The caller already worked out the percentages (used for
                # the 'average' row, whose counts are formatted strings).
                if doing_binning:
                    percent_binned = mypercent_binned
                if doing_assembly:
                    percent_assembled = mypercent_assembled
            else:
                total = num_not_found
                if doing_binning: total += num_binned
                if doing_assembly: total += num_assembled_not_binned
                if total == 0:
                    if doing_binning: percent_binned = 0.0
                    if doing_assembly: percent_assembled = 0.0
                else:
                    if doing_binning:
                        percent_binned = float(num_binned)/total * 100
                    if doing_assembly:
                        percent_assembled = float(num_assembled)/total * 100
            to_write = [sample]
            if doing_binning: to_write.append(str(num_binned))
            if doing_assembly: to_write.append(str(num_assembled))
            to_write.append(str(num_not_found))
            if doing_binning:
                to_write.append("%2.1f" % percent_binned)
            if doing_assembly:
                to_write.append("%2.1f" % percent_assembled)
            output_io.write("\t".join(to_write)+"\n")

        def mean(l):
            return float(sum(l))/len(l) if len(l) > 0 else float('nan')

        if binned_otu_table_io:
            binned_table = OtuTable()
        if unbinned_otu_table_io:
            unbinned_table = OtuTable()
        if assembled_otu_table_io:
            assembled_table = OtuTable()
        if unaccounted_for_otu_table_io:
            unaccounted_for_table = OtuTable()

        for appraisal_result in appraisal.appraisal_results:
            if doing_assembly:
                num_assembled_not_binned = appraisal_result.num_assembled_not_binned()
            print_sample(appraisal_result.num_binned if doing_binning else None,
                         appraisal_result.num_assembled if doing_assembly else None,
                         num_assembled_not_binned if doing_assembly else None,
                         appraisal_result.num_not_found,
                         appraisal_result.metagenome_sample_name)
            if doing_binning:
                binned.append(appraisal_result.num_binned)
            if doing_assembly:
                assembled.append(appraisal_result.num_assembled)
                assembled_not_binned.append(num_assembled_not_binned)
            not_founds.append(appraisal_result.num_not_found)
            if binned_otu_table_io:
                binned_table.add(appraisal_result.binned_otus)
            if unbinned_otu_table_io:
                unbinned_table.add(appraisal_result.assembled_not_binned_otus())
            if assembled_otu_table_io:
                assembled_table.add(appraisal_result.assembled_otus)
            if unaccounted_for_otu_table_io:
                unaccounted_for_table.add(appraisal_result.not_found_otus)

        print_sample(sum(binned) if doing_binning else None,
                     sum(assembled) if doing_assembly else None,
                     sum(assembled_not_binned) if doing_assembly else None,
                     sum(not_founds),
                     'total')

        # Collect the per-sample fractions whose means form the 'average'
        # percentages. not_founds holds one entry per sample whichever
        # flags are set, so it drives the loop.
        binned_means = []
        assembled_means = []
        for i, num_not_found in enumerate(not_founds):
            num_binned = binned[i] if doing_binning else 0
            num_assembled = assembled[i] if doing_assembly else 0
            num_assembled_not_binned = assembled_not_binned[i] if doing_assembly else 0
            total = num_assembled_not_binned+num_not_found
            if doing_binning:
                total += num_binned
            # A sample with no OTUs at all counts as 0.0 (the same value
            # print_sample reports for it) rather than dividing by zero.
            empty_sample = total == 0
            if doing_binning:
                binned_means.append(
                    0.0 if empty_sample else float(num_binned)/total)
            if doing_assembly:
                assembled_means.append(
                    0.0 if empty_sample else float(num_assembled)/total)
        print_sample("%2.1f" % mean(binned) if doing_binning else None,
                     "%2.1f" % mean(assembled) if doing_assembly else None,
                     None,
                     "%2.1f" % mean(not_founds),
                     'average',
                     mypercent_binned=mean(binned_means)*100 if doing_binning else None,
                     mypercent_assembled=(mean(assembled_means)*100 if doing_assembly else None))

        if binned_otu_table_io:
            binned_table.write_to(binned_otu_table_io)
        if unbinned_otu_table_io:
            unbinned_table.write_to(unbinned_otu_table_io)
        if assembled_otu_table_io:
            assembled_table.write_to(assembled_otu_table_io)
        if unaccounted_for_otu_table_io:
            unaccounted_for_table.write_to(unaccounted_for_otu_table_io)