Example #1
    def do_auxiliary_profile_data(self):
        self.progress.new('Splitting "%s"' % self.bin_id)
        self.progress.update('Subsetting the auxiliary data (for profile db)')

        new_auxiliary_profile_data_path = dbops.get_auxiliary_data_path_for_profile_db(
            self.bin_profile_db_path)
        parent_auxiliary_profile_data_path = self.summary.auxiliary_data_path

        bin_profile_auxiliary = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            new_auxiliary_profile_data_path,
            self.contigs_db_hash,
            create_new=True)

        parent_profile_auxiliary = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            parent_auxiliary_profile_data_path,
            self.summary.a_meta['contigs_db_hash'])

        for split_name in self.split_names:
            sample_coverages = parent_profile_auxiliary.get(split_name)
            for sample_name in sample_coverages:
                bin_profile_auxiliary.append(split_name, sample_name,
                                             sample_coverages[sample_name])

        bin_profile_auxiliary.store()
        bin_profile_auxiliary.close()
        parent_profile_auxiliary.close()

        if self.compress_auxiliary_data:
            self.progress.update(
                'Compressing the profile db auxiliary data file ...')
            utils.gzip_compress_file(new_auxiliary_profile_data_path)

        self.progress.end()
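
The pattern above is essentially the whole API surface these examples rely on: open a parent database read-only, create a new one with create_new=True, then get() per-sample coverage values split by split and append() them into the new database. Below is a minimal, self-contained sketch of that lifecycle; the `anvio.auxiliarydataops` import path follows the module name used throughout these examples, while the file paths, hash, and split names are hypothetical placeholders:

    import anvio.auxiliarydataops as auxiliarydataops

    contigs_db_hash = 'hash1337'                  # hypothetical contigs db hash
    split_names_of_interest = ['split_00001']     # hypothetical split names

    # open the parent database read-only; the hash is checked against the one
    # the database was created with
    parent = auxiliarydataops.AuxiliaryDataForSplitCoverages(
        'PARENT-AUXILIARY-DATA.db', contigs_db_hash)

    # create a fresh database for the subset
    subset = auxiliarydataops.AuxiliaryDataForSplitCoverages(
        'SUBSET-AUXILIARY-DATA.db', contigs_db_hash, create_new=True)

    for split_name in split_names_of_interest:
        # get() returns a dict of sample_name -> per-nucleotide coverage values
        coverages = parent.get(split_name)
        for sample_name in coverages:
            subset.append(split_name, sample_name, coverages[sample_name])

    subset.store()    # the .db-flavored examples call store() before close()
    subset.close()
    parent.close()
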
Example #2
    def merge_split_coverage_data(self):
        output_file_path = os.path.join(self.output_directory,
                                        'AUXILIARY-DATA.h5')
        merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            output_file_path, self.contigs_db_hash, create_new=True)

        self.progress.new('Merging split coverage data')

        # fill coverages in from all samples
        for input_profile_db_path in self.profile_dbs_info_dict:
            self.progress.update(input_profile_db_path)
            input_file_path = os.path.join(
                os.path.dirname(input_profile_db_path), 'AUXILIARY-DATA.h5')
            sample_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                input_file_path, self.contigs_db_hash)

            for split_name in self.split_names:
                coverages_dict = sample_split_coverage_values.get(split_name)
                for sample_name in coverages_dict:
                    merged_split_coverage_values.append(
                        split_name, sample_name, coverages_dict[sample_name])

            sample_split_coverage_values.close()

        merged_split_coverage_values.close()

        self.progress.end()
Example #3
    def merge_split_coverage_data(self):
        output_file_path = os.path.join(self.output_directory, 'AUXILIARY-DATA.db')
        merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(output_file_path, self.contigs_db_hash, create_new=True)

        AUX = lambda x: os.path.join(os.path.dirname(x), 'AUXILIARY-DATA.db')

        if False in [filesnpaths.is_file_exists(AUX(p), dont_raise=True) for p in self.profile_dbs_info_dict]:
            self.run.warning("Some of your single profile databases to be merged are missing auxiliary data files associated with them. Did you\
                              download them from somewhere and forgot to download the AUXILIARY-DATA.db files? Well. That's fine. Anvi'o will\
                              continue merging your profiles without split coverages (which means you will not be able to inspect nucleotide\
                              level coverage values and some other bells and whistles). If you want, you can kill this process now with CTRL+C\
                              and redo it with all database files in proper places.")

            return None

        self.progress.new('Merging split coverage data')

        # fill coverages in from all samples
        for input_profile_db_path in self.profile_dbs_info_dict:
            self.progress.update(input_profile_db_path)
            sample_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(AUX(input_profile_db_path), self.contigs_db_hash)

            for split_name in self.split_names:
                coverages_dict = sample_split_coverage_values.get(split_name)
                for sample_name in coverages_dict:
                    merged_split_coverage_values.append(split_name, sample_name, coverages_dict[sample_name])

            sample_split_coverage_values.close()

        merged_split_coverage_values.store()
        merged_split_coverage_values.close()

        self.progress.end()
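
Compared to Example #2, this version targets the SQLite-backed AUXILIARY-DATA.db (hence the store() call before close()) and guards against missing auxiliary files up front, warning and returning instead of failing mid-merge. The `False in [...]` membership test works, but the same guard reads more naturally with all(); here is a standalone sketch under the assumption that each AUXILIARY-DATA.db sits next to its PROFILE.db (the variable `profile_db_paths` is hypothetical):

    import os

    def aux_path(profile_db_path):
        # the auxiliary file is expected right next to the profile database
        return os.path.join(os.path.dirname(profile_db_path), 'AUXILIARY-DATA.db')

    if not all(os.path.exists(aux_path(p)) for p in profile_db_paths):
        print("some auxiliary data files are missing; merging will proceed without split coverages")
    else:
        pass  # proceed with the merge as in the example above
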
Example #4
    def store_split_coverages(self):
        output_file = self.generate_output_destination('AUXILIARY-DATA.h5')
        split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            output_file, self.a_meta['contigs_db_hash'], create_new=True)

        self.progress.new('Storing split coverages')

        contigs_counter = 1
        for contig_name in self.contigs:
            self.progress.update('working on contig %s of %s' %
                                 (pp(contigs_counter), pp(len(self.contigs))))

            for split in self.contigs[contig_name].splits:
                split_coverage_values.append(split.name, self.sample_id,
                                             split.coverage.c)

            contigs_counter += 1

        self.progress.end()

        split_coverage_values.close()

        self.run.info('split_coverage_values',
                      'stored in %s' % output_file,
                      display_only=True)
        self.run.info('split_coverage_values', True, quiet=True)
Example #5
    def merge_split_coverage_data(self):
        AUX = lambda x: os.path.join(os.path.dirname(x), 'AUXILIARY-DATA.db')

        if False in [
                filesnpaths.is_file_exists(AUX(p), dont_raise=True)
                for p in self.profile_dbs_info_dict
        ]:
            self.run.warning(
                "Some of your single profile databases to be merged are missing auxiliary data files associated with them. Did you "
                "download them from somewhere and forget to download the AUXILIARY-DATA.db files? Well. That's fine. Anvi'o will "
                "continue merging your profiles without split coverages (which means you will not be able to inspect nucleotide "
                "level coverage values and some other bells and whistles). If you want, you can kill this process now with CTRL+C "
                "and redo it with all database files in proper places.")

            return None

        output_file_path = os.path.join(self.output_directory,
                                        'AUXILIARY-DATA.db')
        merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            output_file_path, self.contigs_db_hash, create_new=True)

        self._concatenate_single_profile_tables(
            merged_split_coverage_values.db,
            table_name='split_coverages',
            is_auxiliary=True)

        self.progress.new('Creating index for `split_coverages` table')
        self.progress.update('...')
        merged_split_coverage_values.close()
        self.progress.end()
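
This variant skips the per-split get()/append() loop entirely and concatenates the underlying `split_coverages` tables directly through the merged database handle, which avoids deserializing and re-serializing every coverage array. The anvi'o helper `_concatenate_single_profile_tables` is not shown here, but the general SQLite technique it suggests (bulk-copying a table between databases via ATTACH) looks like this hedged sketch; the paths are hypothetical, and it assumes the destination already carries a matching `split_coverages` schema:

    import sqlite3

    # assumes the merged database was already created with the same schema
    # (e.g., via AuxiliaryDataForSplitCoverages(..., create_new=True))
    dst = sqlite3.connect('MERGED-AUXILIARY-DATA.db')

    for path in ['SAMPLE-01/AUXILIARY-DATA.db', 'SAMPLE-02/AUXILIARY-DATA.db']:
        dst.execute("ATTACH DATABASE ? AS src", (path,))   # bolt the source db on
        dst.execute("INSERT INTO split_coverages SELECT * FROM src.split_coverages")
        dst.commit()
        dst.execute("DETACH DATABASE src")

    dst.close()
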
Example #6
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # the legacy 'rU' open mode is gone in modern Python; a context
            # manager also ensures the file handle is closed
            with open(os.path.abspath(self.description_file_path)) as f:
                self.description = f.read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
Example #7
    def merge_split_coverage_data(self):
        self.is_all_samples_have_it('split_coverage_values')

        output_file_path = os.path.join(self.output_directory,
                                        'AUXILIARY-DATA.h5')
        merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            output_file_path, self.contigs_db_hash, create_new=True)

        # fill coverages in from all samples
        for runinfo in self.input_runinfo_dicts.values():
            input_file_path = os.path.join(
                os.path.dirname(runinfo['profile_db']), 'AUXILIARY-DATA.h5')
            sample_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                input_file_path, self.contigs_db_hash)

            for split_name in self.split_names:
                coverages_dict = sample_split_coverage_values.get(split_name)
                for sample_name in coverages_dict:
                    merged_split_coverage_values.append(
                        split_name, sample_name, coverages_dict[sample_name])

            sample_split_coverage_values.close()

        merged_split_coverage_values.close()
Example #8
    def store_split_coverages(self):
        output_file = self.generate_output_destination('AUXILIARY-DATA.h5')
        split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            output_file,
            self.a_meta['contigs_db_hash'],
            create_new=True,
            open_in_append_mode=True)

        contigs_counter = 1
        for contig in self.contigs:
            for split in contig.splits:
                split_coverage_values.append(split.name, self.sample_id,
                                             split.coverage.c)

            contigs_counter += 1

        split_coverage_values.close()
Example #9
    def process_db(self, entry_name, profile_db_path, bam_file_path):
        """Function that does everything.

        `entry_name` is the entry name in bams and profiles file.
        """

        self.progress.new(f"Processing '{entry_name}'")

        ################################################################################
        self.progress.update("Recovering the coverage data")
        ################################################################################

        profile_db = dbops.ProfileSuperclass(argparse.Namespace(
            profile_db=profile_db_path, contigs_db=self.contigs_db_path),
                                             r=run_quiet,
                                             p=progress_quiet)
        sample_id = profile_db.p_meta['sample_id']

        # here we open our bam file with an inversions fetch filter.
        # we will access it later when it is time to get the FWD/FWD and
        # REV/REV reads.
        bam_file = bamops.BAMFileObject(bam_file_path, 'rb')

        if self.process_only_inverted_reads:
            bam_file.fetch_filter = 'inversions'
        else:
            bam_file.fetch_filter = None

        ################################################################################
        self.progress.update("Computing coverage stretches")
        ################################################################################
        # populate coverage stretches in contigs based on coverage data in this
        # particular profile_db. we will then go through each stretch to find
        # those that include palindromic sequences
        contig_coverages = {}
        coverage_stretches_in_contigs = {}

        # open the auxiliary data once instead of re-opening it for every
        # single split (the handle is closed right after the loop below)
        split_coverages_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            profile_db.auxiliary_data_path,
            profile_db.p_meta['contigs_db_hash'])

        for contig_name in self.contig_names:
            contig_coverage = np.array([])

            for split_name in self.contig_name_to_split_names[contig_name]:
                split_coverages = split_coverages_db.get(split_name)
                contig_coverage = np.concatenate(
                    (contig_coverage, split_coverages[sample_id]), axis=None)

            # now we know the `contig_coverage`. it is time to break it into stretches
            # of 'high coverage' regions (as in coverage >= `self.min_coverage_to_define_stretches`),
            # and store that information in the dictionary `coverage_stretches_in_contigs`
            coverage_stretches_in_contigs[contig_name] = []

            # we also know the contig length here, so let's keep that in mind:
            contig_length = len(contig_coverage)

            # to find regions of high coverage, we first need to 'pad' our array to ensure it always
            # starts and ends with 'low coverage'.
            regions_of_contig_covered_enough = np.hstack(
                [[False],
                 contig_coverage >= self.min_coverage_to_define_stretches,
                 [False]])

            regions_of_contig_covered_enough_diff = np.diff(
                regions_of_contig_covered_enough.astype(int))
            cov_stretch_start_positions = np.where(
                regions_of_contig_covered_enough_diff == 1)[0]
            cov_stretch_end_positions = np.where(
                regions_of_contig_covered_enough_diff == -1)[0]

            # at this stage, `cov_stretch_start_positions` and `cov_stretch_end_positions` contain pairs of
            # positions that correspond to the beginning and end of stretches. we will remove those that
            # are too short to be considered, and store the start/end positions for the remaining stretches
            # of high coverage in the dictionary `coverage_stretches_in_contigs`
            for cov_stretch_start, cov_stretch_end in zip(
                    cov_stretch_start_positions, cov_stretch_end_positions):
                if (cov_stretch_end - cov_stretch_start) >= self.min_stretch_length:
                    coverage_stretches_in_contigs[contig_name].append(
                        (cov_stretch_start, cov_stretch_end))

            # now it is time to merge those stretches of coverage if they are close to one another to avoid
            # over-splitting areas of coverage due to short regions with low-coverage in the middle like this,
            # where we wish to identify A and B together in a single stretch:
            #
            #                A         B
            #
            #                -         -
            #               ---        --
            #              -----      -----
            #             --------   --------
            #           -----------------------
            # -----------------------------------------------
            coverage_stretches_in_contigs[contig_name] = utils.merge_stretches(
                coverage_stretches_in_contigs[contig_name],
                min_distance_between_independent_stretches=self.min_distance_between_independent_stretches)

            # extend start and stop positions of merged stretches to ENSURE we are not
            # missing important information because bioinformatics.
            coverage_stretches_in_contigs[contig_name] = \
                [(max(0, start - self.num_nts_to_pad_a_stretch),
                  min(contig_length, stop + self.num_nts_to_pad_a_stretch))
                     for start, stop in coverage_stretches_in_contigs[contig_name]]

            contig_coverages[contig_name] = contig_coverage

        split_coverages_db.close()

        ################################################################################
        self.progress.update("Getting ready to process stretches")
        ################################################################################
        # time to go through each stretch and look for palindromes
        # first, we will set up the Palindromes class
        _args = argparse.Namespace(
            min_palindrome_length=self.min_palindrome_length,
            max_num_mismatches=self.max_num_mismatches,
            min_distance=self.min_distance_palindrome)

        P = Palindromes(_args, run=run_quiet, progress=progress_quiet)
        P.verbose = False

        # now we can go through all the stretches to look for palindromes. this is a LOOOOOONG loop.
        # down below, we will go through each contig name, find stretches of good coverage of FWD/FWD
        # and REV/REV reads (since their coverage values are stored in the profile db of 'inversions'
        # type), find palindromes in those sequences that match to those coverage stretches, build some
        # constructs, and then go through every FWD/FWD and REV/REV read from the BAM file to see if
        # our constructs occur in any of them, which is the only 100% proof of an active inversion.
        for contig_name in coverage_stretches_in_contigs:
            contig_sequence = self.contig_sequences[contig_name]['sequence']
            for start, stop in coverage_stretches_in_contigs[contig_name]:
                stretch_sequence_coverage = contig_coverages[contig_name][
                    start:stop]
                stretch_sequence = contig_sequence[start:stop]
                sequence_name = f"{contig_name}_{start}_{stop}"

                # if the user wants to learn about only a single sequence, we only
                # focus on that one and prematurely go to the next stretch unless
                # there is a match
                if self.only_report_from and sequence_name != self.only_report_from:
                    continue

                # before we go any further, let's print out the sequence in consideration
                # for the user if they used `--verbose`
                if anvio.DEBUG or self.verbose:
                    self.progress.reset()
                    self.run.warning(None,
                                     header=f"Palindromes in {sequence_name}",
                                     lc='yellow',
                                     nl_before=3)
                    self.run.info_single(f"Sequence {stretch_sequence}",
                                         cut_after=0)
                    self.run.info_single("Coverage:", nl_before=1, nl_after=1)
                    self.plot_coverage(f"{sequence_name}",
                                       stretch_sequence_coverage)

                ################################################################################
                self.progress.update(f"{contig_name}: looking for palindromes")
                ################################################################################
                P.find(stretch_sequence,
                       sequence_name=sequence_name,
                       display_palindromes=False)

                if not len(P.palindromes[sequence_name]):
                    # there is no palindrome in this one
                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        self.run.info_single("No palindromes in this one :/",
                                             mc="red")
                    continue
                else:
                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        self.run.info_single(
                            f"The sequence has {PL('palindrome', len(P.palindromes[sequence_name]))}:",
                            mc="green")

                ################################################################################
                self.progress.update(f"{contig_name}: building constructs")
                ################################################################################
                # this is important. here we are getting ready to test each of our inversion
                # candidates by reconstructing Florian's imaginary sequences. in the next step we
                # will see if any of these sequences occur in any of the FWD/FWD or REV/REV reads
                inversion_candidates = []
                for inversion_candidate in P.palindromes[sequence_name]:
                    region_A_start = inversion_candidate.first_start - 6
                    region_A_end = inversion_candidate.first_start
                    region_A = stretch_sequence[region_A_start:region_A_end]

                    region_B_start = inversion_candidate.first_end
                    region_B_end = inversion_candidate.first_end + 6
                    region_B = stretch_sequence[region_B_start:region_B_end]

                    region_C_start = inversion_candidate.second_start - 6
                    region_C_end = inversion_candidate.second_start
                    region_C = stretch_sequence[region_C_start:region_C_end]

                    region_D_start = inversion_candidate.second_end
                    region_D_end = inversion_candidate.second_end + 6
                    region_D = stretch_sequence[region_D_start:region_D_end]

                    construct_v1_left = region_A + inversion_candidate.first_sequence + utils.rev_comp(
                        region_C)
                    construct_v1_right = utils.rev_comp(
                        region_B) + utils.rev_comp(
                            inversion_candidate.second_sequence) + region_D

                    construct_v2_left = region_A + inversion_candidate.second_sequence + utils.rev_comp(
                        region_C)
                    construct_v2_right = utils.rev_comp(
                        region_B) + utils.rev_comp(
                            inversion_candidate.first_sequence) + region_D

                    # update the palindrome instance with its constructs
                    inversion_candidate.v1_left = construct_v1_left
                    inversion_candidate.v1_right = construct_v1_right
                    inversion_candidate.v2_left = construct_v2_left
                    inversion_candidate.v2_right = construct_v2_right

                    if anvio.DEBUG or self.verbose:
                        self.progress.reset()
                        inversion_candidate.display()
                        self.run.info("Construct v1 left",
                                      construct_v1_left,
                                      mc="cyan")
                        self.run.info("Construct v1 right",
                                      construct_v1_right,
                                      mc="cyan")
                        self.run.info("Construct v2 left",
                                      construct_v2_left,
                                      mc="cyan")
                        self.run.info("Construct v2 right",
                                      construct_v2_right,
                                      mc="cyan")

                    inversion_candidates.append(inversion_candidate)

                # at this point we have, for a given `contig_name` and the `start`/`stop`
                # positions of a stretch in it, our inversion candidates in hand.
                ################################################################################
                self.progress.update(
                    f"{contig_name}[{start}:{stop}]: testing constructs")
                ################################################################################
                true_inversion = None

                for read in bam_file.fetch_only(contig_name,
                                                start=start,
                                                end=stop):
                    for inversion_candidate in inversion_candidates:
                        # a read containing any of the four constructs verbatim is
                        # direct evidence for the inversion
                        if any(construct in read.query_sequence
                               for construct in (inversion_candidate.v1_left,
                                                 inversion_candidate.v1_right,
                                                 inversion_candidate.v2_left,
                                                 inversion_candidate.v2_right)):
                            true_inversion = inversion_candidate
                            break

                if anvio.DEBUG or self.verbose:
                    if true_inversion:
                        self.progress.reset()
                        self.run.info_single(
                            f"Of the {PL('inversion candidate', len(inversion_candidates))} above, "
                            "the one below had at least one perfect match to its constructs in REV/REV or "
                            "FWD/FWD reads from the BAM file:",
                            mc="green",
                            nl_before=1)

                        true_inversion.display()
                    else:
                        self.progress.reset()
                        self.run.info_single(
                            f"No true inversions in this one: none of the REV/REV or FWD/FWD reads "
                            f"had any of the constructs in {PL('inversion candidate', len(inversion_candidates))}.",
                            mc="red",
                            nl_before=1)

        self.progress.end()
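
The stretch detection in the middle of this function is worth isolating: padding the boolean coverage mask with False on both ends guarantees every stretch produces exactly one rising and one falling edge in np.diff(), so starts and ends always pair up. A self-contained demonstration with made-up coverage values:

    import numpy as np

    coverage = np.array([0, 0, 9, 9, 9, 1, 0, 8, 8, 0])
    min_cov = 5

    # pad with False so every stretch has a well-defined start AND end,
    # even when the contig begins or ends above the threshold
    mask = np.hstack([[False], coverage >= min_cov, [False]])
    edges = np.diff(mask.astype(int))

    starts = np.where(edges == 1)[0]    # 0 -> 1 transitions: a stretch begins
    ends = np.where(edges == -1)[0]     # 1 -> 0 transitions: a stretch ends (exclusive)

    print([(int(s), int(e)) for s, e in zip(starts, ends)])   # [(2, 5), (7, 9)]
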
Example #10
    def init_commons(self):
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_exists(self.samples_of_interest_path)
            self.samples_of_interest = set([
                s.strip()
                for s in open(self.samples_of_interest_path).readlines()
            ])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError("When you specify a coverage value through --min-coverage-in-each-sample, you must also\
                                use the --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among others. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn the --quince-mode\
                                flag on automatically for you, but it is much better if you have full control and understanding\
                                of what is going on.")

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            auxiliary_data_file_path = os.path.join(
                os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                raise ConfigError("Anvi'o needs the auxiliary data file to run this program with the '--quince-mode' flag.\
                                    However, it wasn't found at '%s' :/" % auxiliary_data_file_path)
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                auxiliary_data_file_path, None, ignore_hash=True)

        self.progress.update(
            'Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired).')

            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError("You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also specified information for split names?\
                                    This is confusing. You should choose one way or another :/")

            self.splits_of_interest = ccollections.GetSplitNamesInBins(
                self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go oldschool. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You either should give me\
                                        a file with split names you are interested in, or a collection id and\
                                        bin name so I can learn split names from the profile database.')

                filesnpaths.is_file_exists(self.splits_of_interest_path)
                self.splits_of_interest = set([
                    c.strip().replace('\r', '')
                    for c in open(self.splits_of_interest_path).readlines()
                ])

        self.input_file_path = os.path.dirname(os.path.abspath(self.profile_db_path))

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError("Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!")

        if self.engine == 'NT':
            self.data = profile_db.db.get_table_as_dict(
                t.variable_nts_table_name)
        elif self.engine == 'AA':
            # AA specific stuff. first check whether things were profiled
            if not profile_db.meta['AA_frequencies_profiled']:
                raise ConfigError("It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!")

            # get the data.
            self.data = profile_db.db.get_table_as_dict(
                t.variable_aas_table_name)

            # append split_name information
            for e in self.data.values():
                e['split_name'] = self.gene_callers_id_to_split_name_dict[
                    e['corresponding_gene_call']]
        else:
            raise ConfigError, "VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine

        profile_db.disconnect()

        self.progress.end()
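
One detail unique to this example: the quince-mode branch opens the auxiliary data with None for the hash and ignore_hash=True, skipping the usual check that the file was generated against the same contigs database. Presumably this is for situations where the hash is not known at call time; a one-line hedged sketch with a hypothetical path:

    # no contigs db hash available; skip the consistency check on open
    aux = auxiliarydataops.AuxiliaryDataForSplitCoverages('AUXILIARY-DATA.h5', None, ignore_hash=True)
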
Example #11
    def charts(self, order_name, item_name):
        if self.interactive.mode == 'gene':
            split_name = self.interactive.gene_callers_id_to_split_name_dict[
                int(item_name)]
        else:
            split_name = item_name

        data = {
            'layers': [],
            'title': split_name,
            'index': None,
            'total': None,
            'coverage': [],
            'variability': [],
            'competing_nucleotides': [],
            'previous_contig_name': None,
            'next_contig_name': None,
            'genes': [],
            'outlier_SNVs_shown': not self.args.hide_outlier_SNVs
        }

        if split_name not in self.interactive.split_names:
            return data

        if not self.interactive.auxiliary_profile_data_available:
            return data

        data['index'], data['total'], data['previous_contig_name'], data['next_contig_name'] = \
            self.get_index_total_previous_and_next_items(order_name, item_name)

        layers = sorted(self.interactive.p_meta['samples'])

        auxiliary_coverages_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(
            self.interactive.auxiliary_data_path,
            self.interactive.p_meta['contigs_db_hash'])
        coverages = auxiliary_coverages_db.get(split_name)
        auxiliary_coverages_db.close()

        data['coverage'] = [coverages[layer].tolist() for layer in layers]

        ## get the variability information dict for split:
        progress.new('Variability')
        progress.update('Collecting info for "%s"' % split_name)
        split_variability_info_dict = self.interactive.get_variability_information_for_split(
            split_name, skip_outlier_SNVs=self.args.hide_outlier_SNVs)

        for layer in layers:
            progress.update('Formatting variability data: "%s"' % layer)
            data['layers'].append(layer)
            data['competing_nucleotides'].append(
                split_variability_info_dict[layer]['competing_nucleotides'])
            data['variability'].append(
                split_variability_info_dict[layer]['variability'])

        levels_occupied = {1: []}
        for entry_id in self.interactive.split_name_to_genes_in_splits_entry_ids[
                split_name]:
            gene_callers_id = self.interactive.genes_in_splits[entry_id][
                'gene_callers_id']
            p = self.interactive.genes_in_splits[entry_id]
            # p looks like this at this point:
            #
            # {'percentage_in_split': 100,
            #  'start_in_split'     : 16049,
            #  'stop_in_split'      : 16633,
            #  'prot'               : u'prot2_03215',
            #  'split'              : u'D23-1contig18_split_00036'}
            #
            # we will add a bit more attributes:
            p['source'] = self.interactive.genes_in_contigs_dict[
                gene_callers_id]['source']
            p['direction'] = self.interactive.genes_in_contigs_dict[
                gene_callers_id]['direction']
            p['start_in_contig'] = self.interactive.genes_in_contigs_dict[
                gene_callers_id]['start']
            p['stop_in_contig'] = self.interactive.genes_in_contigs_dict[
                gene_callers_id]['stop']
            p['complete_gene_call'] = 'No' if self.interactive.genes_in_contigs_dict[
                gene_callers_id]['partial'] else 'Yes'
            p['length'] = p['stop_in_contig'] - p['start_in_contig']
            p['functions'] = self.interactive.gene_function_calls_dict[
                gene_callers_id] if gene_callers_id in self.interactive.gene_function_calls_dict else None

            for level in levels_occupied:
                level_ok = True
                for gene_tuple in levels_occupied[level]:
                    if (p['start_in_split'] >= gene_tuple[0] - 100 and p['start_in_split'] <= gene_tuple[1] + 100) or\
                                (p['stop_in_split'] >= gene_tuple[0] - 100 and p['stop_in_split'] <= gene_tuple[1] + 100) or \
                                (p['start_in_split'] <= gene_tuple[0] - 100 and p['stop_in_split'] >= gene_tuple[1] + 100):
                        level_ok = False
                        break
                if level_ok:
                    levels_occupied[level].append(
                        (p['start_in_split'], p['stop_in_split']))
                    p['level'] = level
                    break
            if not level_ok:
                levels_occupied[level + 1] = [
                    (p['start_in_split'], p['stop_in_split']),
                ]
                p['level'] = level + 1

            data['genes'].append(p)

        progress.end()

        return json.dumps(data)
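
The level-assignment loop near the end implements a small interval-stacking scheme for drawing gene arrows: a gene lands on the first level where it stays more than 100 nt away from every gene already placed there, and a new level is opened otherwise. The same logic in isolation, with hypothetical gene coordinates:

    # stack genes into non-overlapping drawing levels with a 100 nt margin
    genes = [(0, 500), (450, 900), (1200, 1500)]
    levels = {1: []}

    for start, stop in genes:
        placed = False
        for level in sorted(levels):
            # the gene fits if it clears every occupied interval by more than 100 nt
            if all(stop < s - 100 or start > e + 100 for s, e in levels[level]):
                levels[level].append((start, stop))
                placed = True
                break
        if not placed:
            levels[max(levels) + 1] = [(start, stop)]

    print(levels)   # {1: [(0, 500), (1200, 1500)], 2: [(450, 900)]}
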