Пример #1
0
    def generate_variabile_nts_table(self):
        """Annotate every column profile with gene context and store it in the variability table.

        Does nothing when `self.skip_SNV_profiling` is set. Otherwise, for each
        column profile of each split of each contig, the gene-related fields
        are filled in, the profile is appended to a `TableForVariability`, the
        table is stored, and the number of stored entries is recorded under
        `num_SNVs_reported` in the layer additional data.
        """
        if self.skip_SNV_profiling:
            return

        snv_table = TableForVariability(self.profile_db_path, progress=self.progress)

        for contig in self.contigs:
            for split in contig.splits:
                # iterate over a snapshot of the profiles of this split
                for profile in list(split.column_profiles.values()):
                    position = profile['pos_in_contig']

                    # learn more about this particular variable position
                    in_partial, in_complete, base_pos = self.get_nt_position_info(contig.name, position)
                    profile['in_partial_gene_call'] = in_partial
                    profile['in_complete_gene_call'] = in_complete
                    profile['base_pos_in_codon'] = base_pos

                    profile['sample_id'] = self.sample_id

                    # -1 stands for "no matching gene call" (gene caller ids
                    # start from 0, so 0 can't be used as the placeholder).
                    # both fields may be overwritten right below.
                    profile['corresponding_gene_call'] = -1
                    profile['codon_order_in_gene'] = -1

                    # only positions within a COMPLETE gene call are resolved
                    # to a gene caller id
                    if profile['in_complete_gene_call']:
                        candidate_ids = self.get_corresponding_gene_caller_ids_for_base_position(contig.name, position)

                        # more than one match usually indicates an assembly
                        # error; in that case no gene call is reported for
                        # this position
                        if len(candidate_ids) == 1:
                            gene_id = candidate_ids[0]
                            profile['corresponding_gene_call'] = gene_id

                            codon_order = self.get_corresponding_codon_order_in_gene(gene_id, contig.name, position)
                            profile['codon_order_in_gene'] = codon_order

                            # remember the (gene, codon) pair for later use
                            self.codons_in_genes_to_profile_SCVs.add((gene_id, profile['codon_order_in_gene']))

                    snv_table.append(profile)

        snv_table.store()

        self.layer_additional_data['num_SNVs_reported'] = snv_table.num_entries
        self.layer_additional_keys.append('num_SNVs_reported')
Пример #2
0
    def generate_variabile_nts_table(self):
        """Store the column profiles of all splits of all contigs in the variability table.

        Returns immediately when `self.skip_SNV_profiling` is set.
        """
        if self.skip_SNV_profiling:
            return

        snv_table = TableForVariability(self.profile_db_path, progress=null_progress)

        # walk the splits contig by contig, lazily
        all_splits = (split for contig in self.contigs for split in contig.splits)

        for split in all_splits:
            # take a snapshot of the profiles before handing them over
            profiles = list(split.column_profiles.values())
            for profile in profiles:
                snv_table.append(profile)

        snv_table.store()
Пример #3
0
    def merge_variable_nts_tables(self):
        """Copy the variable nucleotide tables of all input profiles into the merged profile.

        For every path in `self.profile_dbs_info_dict`, the variable nts table
        is read as a list of tuples. The first column of each row is replaced
        with a new id issued by the merged table (`next_id`), the remaining
        columns are kept as they are, and the resulting entries are stored in
        the database at `self.merged_profile_db_path`.
        """
        variable_nts_table = TableForVariability(self.merged_profile_db_path, progress=self.progress)

        for input_profile_db_path in self.profile_dbs_info_dict:
            sample_profile_db = dbops.ProfileDatabase(input_profile_db_path, quiet=True)
            try:
                sample_variable_nts_table = sample_profile_db.db.get_table_as_list_of_tuples(
                    tables.variable_nts_table_name, tables.variable_nts_table_structure)
            finally:
                # disconnect even when reading the table fails, so the handle
                # to the sample database is not left open
                sample_profile_db.disconnect()

            for tpl in sample_variable_nts_table:
                # drop the per-sample entry id (first column) and issue a new
                # one that is unique in the merged table
                entry = (variable_nts_table.next_id(tables.variable_nts_table_name),) + tuple(tpl[1:])
                variable_nts_table.db_entries.append(entry)

        variable_nts_table.store()
Пример #4
0
    def profile(self):
        """Profile all contigs with worker processes and store the results.

        Contig indices are put into a shared queue, `self.num_threads` worker
        processes running `BAMProfiler.profile_contig_worker` are started, and
        results are read from an output queue until one result per contig has
        arrived or the user interrupts with SIGINT. Contigs are buffered in
        `self.contigs` and flushed with `self.store_contigs_buffer()`. Once
        the workers are stopped, abundances in the profile database are
        divided by the overall mean coverage, the number of reported SNVs is
        recorded (unless SNV profiling is skipped), and `self.check_contigs`
        is called with the number of contigs that were kept.
        """
        manager = multiprocessing.Manager()
        available_index_queue = manager.Queue()
        output_queue = manager.Queue(self.queue_size)

        # put contig indices into the queue to be read from within
        # the worker
        for i in range(0, self.num_contigs):
            available_index_queue.put(i)

        processes = [
            multiprocessing.Process(
                target=BAMProfiler.profile_contig_worker,
                args=(self, available_index_queue, output_queue))
            for _ in range(0, self.num_threads)
        ]

        for proc in processes:
            proc.start()

        received_contigs = 0
        discarded_contigs = 0
        memory_usage = None

        self.progress.new('Profiling w/' + str(self.num_threads) +
                          ' thread%s' % ('s' if self.num_threads > 1 else ''),
                          progress_total_items=self.num_contigs)
        self.progress.update('initializing threads ...')
        # FIXME: memory usage should be generalized.
        last_memory_update = int(time.time())

        self.progress.update('contigs are being processed ...')
        self.progress.increment(received_contigs)
        while received_contigs < self.num_contigs:
            try:
                contig = output_queue.get()

                # if we have a contig back, it means we are good to go with it,
                # otherwise it is garbage.
                if contig:
                    self.contigs.append(contig)
                else:
                    discarded_contigs += 1

                received_contigs += 1

                # refresh the memory usage info at most once every 5 seconds
                if (int(time.time()) - last_memory_update) > 5:
                    memory_usage = utils.get_total_memory_usage()
                    last_memory_update = int(time.time())

                self.progress.update('%d of %d contigs ⚙  / MEM ☠️  %s' % \
                            (received_contigs, self.num_contigs, memory_usage or '??'))

                # here you're about to witness the poor side of Python (or our use of it).
                # the problem we run into here was the lack of action from the garbage
                # collector on the processed objects. although we couldn't find any refs to
                # these objects, garbage collector kept them in the memory, and `del` statement
                # on the `split` object did not yield any improvement either. so here we are
                # accessing to the atomic data structures in our split objects to try to relieve
                # the memory by encouraging the garbage collector to realize what's up
                # explicitly.
                #
                # NOTE(review): the condition below is also true when `self.contigs` is
                # empty (0 % n == 0), e.g. when a discarded contig arrives right after a
                # flush -- presumably harmless, but worth confirming.
                if self.write_buffer_size > 0 and len(
                        self.contigs) % self.write_buffer_size == 0:
                    self.store_contigs_buffer()
                    for c in self.contigs:
                        for split in c.splits:
                            del split.coverage
                            del split.auxiliary
                            del split
                        del c.splits[:]
                        del c.coverage
                        del c
                    del self.contigs[:]
            except KeyboardInterrupt:
                self.run.info_single(
                    "Anvi'o profiler recieved SIGINT, terminating all processes...",
                    nl_before=2)
                break

        for proc in processes:
            proc.terminate()

        # reap the terminated workers; a process that has finished but was
        # never joined lingers as a zombie on POSIX systems
        for proc in processes:
            proc.join()

        # flush whatever is left in the buffer
        self.store_contigs_buffer()
        self.auxiliary_db.close()

        self.progress.end()

        # FIXME: this needs to be checked:
        if discarded_contigs > 0:
            self.run.info('contigs_after_C',
                          pp(received_contigs - discarded_contigs))

        overall_mean_coverage = 1
        if self.total_length_of_all_contigs != 0:
            overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

        # FIXME: We know this is ugly. You can keep your opinion to yourself.
        if overall_mean_coverage > 0.0:
            # avoid dividing by zero
            dbops.ProfileDatabase(self.profile_db_path).db._exec(
                "UPDATE atomic_data_splits SET abundance = abundance / " +
                str(overall_mean_coverage) + " * 1.0;")
            dbops.ProfileDatabase(self.profile_db_path).db._exec(
                "UPDATE atomic_data_contigs SET abundance = abundance / " +
                str(overall_mean_coverage) + " * 1.0;")

        if not self.skip_SNV_profiling:
            self.layer_additional_data[
                'num_SNVs_reported'] = TableForVariability(
                    self.profile_db_path, progress=null_progress).num_entries
            self.layer_additional_keys.append('num_SNVs_reported')

        self.check_contigs(num_contigs=received_contigs - discarded_contigs)