Exemplo n.º 1
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Remove genes from and/or add genes to an existing structure database.

        The whole update is driven from the constructor: parameters are read from
        `args`, the gene selections are validated, and then removals (if requested)
        are carried out before additions (if requested).

        Parameters
        ==========
        args : object
            Namespace-like object. The attributes looked up in its `__dict__` are
            `contigs_db`, `structure_db`, `genes_to_remove`, `genes_to_remove_file`,
            `genes_to_add`, `genes_to_add_file`, `dump_dir` and `modeller_executable`.
            An attribute that is absent is treated as None.
        run : terminal.Run
            Reporter used for the "Removing genes..." / "Adding genes..." messages.
        progress : terminal.Progress
            Stored on the instance as `self.progress`.

        Raises
        ======
        ConfigError
            If no genes were requested for addition or removal, or if a gene list
            and a gene file were both given for the same operation.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.arg parameters: A(name, cast) returns cast(args.<name>),
        # or None when `args` has no such attribute.
        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.contigs_db_path = A('contigs_db', null)
        self.structure_db_path = A('structure_db', null)
        self.genes_to_remove = A('genes_to_remove', null)
        self.genes_to_remove_path = A('genes_to_remove_file', null)
        self.genes_to_add = A('genes_to_add', null)
        self.genes_to_add_path = A('genes_to_add_file', null)
        self.full_modeller_output = A('dump_dir', null)
        self.modeller_executable = A('modeller_executable', null)
        self.DSSP_executable = None

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)

        wants_removal = self.genes_to_remove or self.genes_to_remove_path
        wants_addition = self.genes_to_add or self.genes_to_add_path

        if not (wants_removal or wants_addition):
            raise ConfigError(
                "Please specify some genes to add or remove to your database.")

        # The file-based parameters are read from `genes_to_remove_file` and
        # `genes_to_add_file`, so the messages name the matching `-file` flags.
        if self.genes_to_remove and self.genes_to_remove_path:
            raise ConfigError(
                "Provide either --genes-to-remove or --genes-to-remove-file. You provided both."
            )

        if self.genes_to_add and self.genes_to_add_path:
            raise ConfigError(
                "Provide either --genes-to-add or --genes-to-add-file. You provided both."
            )

        if wants_removal:
            self.run.warning("Removing genes...",
                             header="Updating %s" % self.structure_db_path,
                             lc='green')
            self.load_structure_db()
            remove = self.parse_genes(self.genes_to_remove,
                                      self.genes_to_remove_path)
            self.remove_genes(remove)
            # the database is disconnected here and loaded again below if genes
            # are also being added
            self.structure_db.disconnect()

        if wants_addition:
            self.run.warning("Adding genes...",
                             header="Updating %s" % self.structure_db_path,
                             lc='green')
            self.load_structure_db()
            self.add_genes()
Exemplo n.º 2
0
    def populate_search_tables(self, sources={}):
        """Run HMM searches for `sources` and store the hits via `self.append`.

        For every distinct target among the sources, the matching sequences are
        exported to a FASTA file in a temporary directory. Each source is then
        searched with HMMER, the output table is parsed, and the results are
        appended. For sources that operate in the CONTIG context the hits are
        pruned and registered as new gene calls first.

        Parameters
        ==========
        sources : dict
            Maps a source name to a dict from which the keys `target`, `kind`,
            `domain`, `genes`, `ref` and `noise_cutoff_terms` are read. When empty,
            `anvio.data.hmm.sources` is used instead. The default dict is never
            mutated here.

        Raises
        ======
        ConfigError
            If a source needs gene calls but `self.genes_are_called` is false, if
            amino acid sequences are requested in the CONTIG context, if domain
            table output is requested for a DNA/RNA source, or if the HMMER output
            cannot be parsed.
        """
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # no sources were given: fall back to the ones defined in anvio.data.hmm
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        self.run.info("Contigs DB", self.db_path)
        self.run.info("HMM sources", ', '.join(sources.keys()))

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        have_hmm_sources_with_non_RNA_contig_context = False
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile PROFILE_NAME_HERE')." %
                    (context, alphabet))

            self.run.info('Alphabet/context target found',
                          '%s:%s' % (alphabet, context))

            if context == 'CONTIG' and alphabet != 'RNA':
                have_hmm_sources_with_non_RNA_contig_context = True

            # minimal stand-in for an argparse namespace: only `contigs_db` is set
            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.get_sequences_for_gene_callers_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=(alphabet == 'RNA'),
                    report_aa_sequences=(alphabet == 'AA'))
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=(alphabet == 'RNA'))

        if have_hmm_sources_with_non_RNA_contig_context:
            # in that case, we should remind people what's up.
            self.run.warning(
                "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                "contigs database for these hits, it will NOT report amino acid sequences for the "
                "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                "influence your pangenomic analyses or other things you thought you would be doing with the "
                "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                "you can move on yet remember to remember this if things look somewhat weird later on.",
                header="THE MORE YOU KNOW 🌈",
                lc="green")

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            if alphabet in ['DNA', 'RNA'
                            ] and 'domtable' in self.hmmer_desired_output:
                raise ConfigError(
                    "Domain table output was requested (probably with the --get-domtable-output flag, "
                    "does that look familiar?) but unfortunately this option is incompatible with the "
                    f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                    "alphabet.")

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmmer_output = commander.run_hmmer(
                source,
                alphabet,
                context,
                kind_of_search,
                domain,
                len(all_genes_searched_against),
                hmm_model,
                reference,
                noise_cutoff_terms,
                desired_output=self.hmmer_desired_output,
                hmmer_output_dir=self.hmmer_output_dir)

            if self.hmmer_output_dir:
                self.run.info("HMMER output directory", self.hmmer_output_dir)

            # run_hmmer's result is handled either as a single hits table or as
            # a (hits table, domain table) tuple
            if not isinstance(hmmer_output, tuple):
                hmm_scan_hits_txt = hmmer_output
            else:
                hmm_scan_hits_txt, domain_hits_txt = hmmer_output
                self.run.info("Domain table output", domain_hits_txt)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                try:
                    parser = parser_modules['search']['hmmer_table_output'](
                        hmm_scan_hits_txt,
                        alphabet=alphabet,
                        context=context,
                        program=self.hmm_program)
                except StupidHMMError as e:
                    # chain the original error so it stays visible in tracebacks
                    raise ConfigError(
                        f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                        f"This error is typically due to contig names that are long and variable in length, which that "
                        f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                        f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                        f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                        f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                        f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                        f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                    ) from e

                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                # report through the instance's reporter, like every other
                # message in this method, rather than a module-level `run`
                self.run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.
                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # The `import anvio.data.hmm` statement near the top makes `anvio` a local
        # name for this whole function. When that branch is skipped (the caller
        # passed sources) the local is unbound, so `anvio` has to be imported here
        # before `anvio.DEBUG` can be read.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in target_files_dict.values():
                os.remove(v)

            shutil.rmtree(tmp_directory_path)
Exemplo n.º 3
0
    get_taxo_line = "anvi-estimate-scg-taxonomy --quiet -T {threads} -c {temp_folder}/clean_bins/{bin_id}/{bin_id}.db -o {tempfile}"
    head = ["d__", "p__", "c__", "o__", "f__", "g__", "s__"]
    for bin_id in tqdm(os.listdir(pjoin(temp_folder, "clean_bins"))):
        if not os.path.exists(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db")):
            call(f"anvi-gen-contigs-database --ignore-internal-stop-codons --quiet -n {binset_name} -f {temp_folder}/clean_bins/{bin_id}/{bin_id}.fna -o {temp_folder}/clean_bins/{bin_id/{bin_id}.db -T {threads} --skip-gene-calling")
        if os.path.isdir(pjoin(temp_folder, "clean_bins", bin_id)) and bin_id not in stats:
            tt = ContigSummarizer(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db")).get_contigs_db_info_dict(gene_caller_to_use="Prodigal")
            t_file = NamedTemporaryFile()
            formating_dat['bin_id'] = bin_id
            formating_dat['tempfile'] = t_file.name
            call(get_taxo_line.format(**formating_dat), shell = True)
            with open(t_file.name) as handle:
                handle.readline().split()
                scg_taxo = handle.readline().strip().split("\t")
            params.__dict__['contigs_db'] = pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db")
            c = ContigsSuperclass(params)
            calls = c.get_sequences_for_gene_callers_ids(simple_headers=False)[1]
            with open(pjoin(temp_folder, binset_name + ".faa"), "a") as handle:
                seqs = []
                for k,v in calls.items():
                    ss = Seq(v['sequence']).translate()
                    seqs.append(SeqRecord(ss, id = bin_id + ";" + str(k), description = ""))
                SeqIO.write(seqs, handle, "fasta")

            export_sequences_from_contigs_db(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db"), t_file.name)
            call("cat {tempfile} >> {temp_folder}/{binset_name}.fna".format(**formating_dat), shell = True)
            t_file.close()
            est_coding = tt['avg_gene_length']*tt['num_genes']/tt['total_length']
            tt = {k : v for k,v in tt.items() if k in fields}
            stats[bin_id] = tt
            if scg_taxo[0] != '':
Exemplo n.º 4
0
    def populate_search_tables(self, sources={}):
        """Run hmmscan for `sources` and store the hits via `self.append`.

        For every distinct target among the sources, the matching sequences are
        exported to a FASTA file in a temporary directory. Each source is then
        searched with `HMMer.run_hmmscan`, the output is parsed, and the results
        are appended. For sources that operate in the CONTIG context the hits are
        pruned and registered as new gene calls first.

        Parameters
        ==========
        sources : dict
            Maps a source name to a dict from which the keys `target`, `kind`,
            `domain`, `genes`, `model`, `ref` and `noise_cutoff_terms` are read.
            When empty, `anvio.data.hmm.sources` is used instead. The default dict
            is never mutated here.

        Raises
        ======
        ConfigError
            If a source needs gene calls but `self.genes_are_called` is false, or
            if amino acid sequences are requested in the CONTIG context.
        """
        # no sources were given: fall back to the ones defined in anvio.data.hmm
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an\
                                   HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run\
                                   HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal\
                                   RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter\
                                   '--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            # minimal stand-in for an argparse namespace: only `contigs_db` is set
            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=(alphabet == 'RNA'),
                                                                           report_aa_sequences=(alphabet == 'AA'))
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=(alphabet == 'RNA'))

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                # report through the instance's reporter, like every other
                # message in this method, rather than a module-level `run`
                self.run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                      operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                      hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                      But because we are in the contigs realm rater than genes realm, it is likely that\
                                      resulting hits will not correspond to open reading frames that are supposed to be\
                                      translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                      instad of defining CONTIGS as your context in that HMM profile you just used unless you\
                                      not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                      new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                      judge of whether this will influence your pangenomic analyses or other things you thought\
                                      you would be doing with the result of this HMM search downstream. If you do not feel like\
                                      being the judge of anything today you can move on yet remember to remember this if things\
                                      look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # The `import anvio.data.hmm` statement near the top makes `anvio` a local
        # name for this whole function. When that branch is skipped (the caller
        # passed sources) the local is unbound, so `anvio` has to be imported here
        # before `anvio.DEBUG` can be read.
        import anvio
        import shutil
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in target_files_dict.values():
                os.remove(v)

            # the exported FASTA files lived in a temporary directory created
            # above; remove the directory as well so it is not left behind
            shutil.rmtree(tmp_directory_path)
Exemplo n.º 5
0
    def populate_search_tables(self, sources={}):
        """Run hmmscan for `sources` and store the hits via `self.append`.

        For every distinct target among the sources, the matching sequences are
        exported to a FASTA file in a temporary directory. Each source is then
        searched with `HMMer.run_hmmscan`, the output is parsed, and the results
        are appended. For sources that operate in the CONTIG context the hits are
        pruned and registered as new gene calls first.

        Parameters
        ==========
        sources : dict
            Maps a source name to a dict from which the keys `target`, `kind`,
            `domain`, `genes`, `model`, `ref` and `noise_cutoff_terms` are read.
            When empty, `anvio.data.hmm.sources` is used instead. The default dict
            is never mutated here.

        Raises
        ======
        ConfigError
            If amino acid sequences are requested in the CONTIG context.
        """
        # no sources were given: fall back to the ones defined in anvio.data.hmm
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        for target in targets:

            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            # minimal stand-in for an argparse namespace: only `contigs_db` is set
            class Args:
                pass

            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args)

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=(alphabet == 'RNA'),
                    report_aa_sequences=(alphabet == 'AA'))
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                       to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                       you think this is dumb, please let us know."
                    )
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                        tmp_directory_path,
                        '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(
                        self.db_path,
                        target_files_dict['%s:CONTIG' % alphabet],
                        rna_alphabet=(alphabet == 'RNA'))

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            # NOTE(review): the gene collection itself (not its length) is passed as
            # the sixth argument -- confirm against HMMer.run_hmmscan's signature.
            hmm_scan_hits_txt = commander.run_hmmscan(
                source, alphabet, context, kind_of_search, domain,
                all_genes_searched_against, hmm_model, reference,
                noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt,
                                                             alphabet=alphabet,
                                                             context=context)
                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                # report through the instance's reporter, like every other
                # message in this method, rather than a module-level `run`
                self.run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names that contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene callers id in it. so there are two things we need
                # to do. one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                self.run.warning(
                    "Alright! You just called an HMM profile that runs on contigs. Because it is not\
                                 working with anvi'o gene calls directly, the resulting hits will need to be added\
                                 as 'new gene calls' into the contigs database. This is a new feature, and if it\
                                 starts screwing things up for you please let us know. Other than that you're pretty\
                                 much golden. Carry on.",
                    header="Psst. Your fancy HMM profile '%s' speaking" %
                    source,
                    lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search, search_results_dict)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # The `import anvio.data.hmm` statement near the top makes `anvio` a local
        # name for this whole function. When that branch is skipped (the caller
        # passed sources) the local is unbound, so `anvio` has to be imported here
        # before `anvio.DEBUG` can be read.
        import anvio
        import shutil
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in target_files_dict.values():
                os.remove(v)

            # the exported FASTA files lived in a temporary directory created
            # above; remove the directory as well so it is not left behind
            shutil.rmtree(tmp_directory_path)
Exemplo n.º 6
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Set up everything needed to create a new structure database.

        Reads the run parameters from `args`, validates the output locations,
        works out which genes are of interest, runs `self.sanity_check()`, and
        creates a new `StructureDatabase` at the output path.

        Parameters
        ==========
        args : object
            Namespace-like object whose `__dict__` is consulted for each parameter.
            An attribute that is absent yields None.
        run : terminal.Run
            Stored on the instance as `self.run`.
        progress : terminal.Progress
            Stored on the instance as `self.progress`.

        Raises
        ======
        ConfigError
            If the output database path does not end with '.db'.
        """
        self.args = args
        self.run = run
        self.progress = progress

        def fetch(key, cast=None):
            # value of args.<key>, optionally passed through `cast`; None when
            # `args` has no such attribute
            known = self.args.__dict__
            if key not in known:
                return None
            value = known[key]
            return value if cast is None else cast(value)

        # user-facing parameters
        self.contigs_db_path = fetch('contigs_db')
        self.genes_of_interest_path = fetch('genes_of_interest')
        self.splits_of_interest_path = fetch('splits_of_interest')
        self.bin_id = fetch('bin_id')
        self.collection_name = fetch('collection_name')
        self.gene_caller_ids = fetch('gene_caller_ids')
        self.output_db_path = fetch('output_db_path')
        self.full_modeller_output = fetch('dump_dir')
        self.skip_DSSP = fetch('skip_DSSP', bool)
        self.modeller_executable = fetch('modeller_executable')
        self.DSSP_executable = None

        # the contigs database and its hash
        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # parameters forwarded to MODELLER
        self.modeller_database = fetch('modeller_database')
        self.scoring_method = fetch('scoring_method')
        self.max_number_templates = fetch('max_number_templates')
        self.percent_identical_cutoff = fetch('percent_identical_cutoff')
        self.num_models = fetch('num_models')
        self.deviation = fetch('deviation')
        self.very_fast = fetch('very_fast', bool)

        # output database: fall back to a default name, insist on the extension
        self.output_db_path = self.output_db_path or "STRUCTURE.db"
        has_db_extension = self.output_db_path.endswith('.db')
        if not has_db_extension:
            raise ConfigError(
                "The structure database output file (`-o / --output`) must end with '.db'"
            )
        filesnpaths.is_output_file_writable(self.output_db_path)

        # optional directory for the MODELLER output
        if self.full_modeller_output:
            checked_dir = filesnpaths.check_output_directory(
                self.full_modeller_output, ok_if_exists=False)
            self.full_modeller_output = checked_dir

        # which genes should have their structures modelled
        self.genes_of_interest = self.get_genes_of_interest(
            self.genes_of_interest_path, self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.annotation_sources_info = self.get_annotation_sources_info()
        table_structure, table_types = self.get_residue_info_table_structure()
        self.residue_info_table_structure = table_structure
        self.residue_info_table_types = table_types
        self.res_annotation_df = pd.DataFrame({})

        # the structure database itself, created fresh
        self.structure_db = StructureDatabase(
            self.output_db_path,
            self.contigs_db_hash,
            residue_info_structure_extras=self.residue_info_table_structure,
            residue_info_types_extras=self.residue_info_table_types,
            create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)
Exemplo n.º 7
0
def get_contigs_db_info_dict(contigs_db_path,
                             run=run,
                             progress=progress,
                             include_AA_counts=False,
                             split_names=None):
    """Returns an info dict for a given contigs db.

    The dict starts with 'path' plus every key found in the database's
    `a_meta`, and is then extended with sequence and gene statistics
    ('gc_content', 'num_genes', 'gene_caller_ids', 'avg_gene_length',
    'num_genes_per_kb'), completion estimates when they are available, and
    optionally amino acid counts.

    Parameters
    ----------
    contigs_db_path : path of the contigs database to summarize.
    run, progress : terminal objects handed to `ContigsSuperclass`. NOTE: their
        `verbose` attribute is set to False on the objects themselves, so the
        caller's (or the module-level default) objects stay silenced after
        this function returns.
    include_AA_counts : when true, 'AA_counts' and 'total_AAs' are added from
        `ContigsSuperclass.get_AA_counts_dict`.
    split_names : optional iterable of split names. When non-empty, statistics
        are restricted to those splits and 'num_splits' is reported as well;
        otherwise the whole database is summarized.

    Returns
    -------
    dict
        The info dict described above.
    """
    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

    # Tested again after the conversion on purpose: a truthy iterable (e.g. an
    # iterator) may still produce an empty set, in which case we fall back to
    # summarizing the whole database.
    if split_names:
        c.init_split_sequences()
        seq = ''.join(
            [c.split_sequences[split_name] for split_name in split_names])
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = set([
            e['gene_callers_id'] for e in c.genes_in_splits.values()
            if e['split'] in split_names
        ])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        # NOTE: every gene in the selected splits contributes to the average
        # here, whereas the whole-database branch below skips partial genes.
        info_dict['avg_gene_length'] = numpy.mean([
            (c.genes_in_contigs_dict[gene_caller_id]['stop'] -
             c.genes_in_contigs_dict[gene_caller_id]['start'])
            for gene_caller_id in info_dict['gene_caller_ids']
        ])
        info_dict['num_genes_per_kb'] = info_dict[
            'num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        # This branch never assigned 'total_length' itself and relied on the
        # key having been copied from `a_meta`. Keep that value when present,
        # otherwise fall back to the concatenated sequence length instead of
        # failing with a KeyError further down.
        info_dict.setdefault('total_length', len(seq))
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        info_dict['avg_gene_length'] = numpy.mean([
            (gene['stop'] - gene['start'])
            for gene in c.genes_in_contigs_dict.values() if not gene['partial']
        ])
        info_dict['num_genes_per_kb'] = info_dict[
            'num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    comp = completeness.Completeness(contigs_db_path).get_info_for_splits(
        split_names if split_names else set(c.splits_basic_info.keys()))

    # `dict.has_key` does not exist on Python 3; the `in` operator works on
    # both Python 2 and Python 3.
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al'][
            'percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al'][
            'percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
Exemplo n.º 8
0
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db.

    The dict starts with 'path' plus every key found in the database's
    `a_meta`, and is then extended with sequence statistics ('gc_content',
    'total_length'), gene statistics ('gene_caller_ids', 'excluded_gene_ids',
    'num_genes', 'gene_lengths', 'avg_gene_length', 'num_genes_per_kb'),
    completion estimates ('percent_complete', 'percent_redundancy',
    'scg_domain', 'scg_domain_confidence') and optionally amino acid counts.

    Parameters
    ----------
    contigs_db_path : path of the contigs database to summarize.
    run, progress : terminal objects handed to `ContigsSuperclass`. NOTE: their
        `verbose` attribute is set to False on the objects themselves, so the
        caller's (or the module-level default) objects stay silenced after
        this function returns.
    include_AA_counts : when true, 'AA_counts' and 'total_AAs' are added from
        `ContigsSuperclass.get_AA_counts_dict`.
    split_names : optional iterable of split names. When given and non-empty,
        statistics are restricted to those splits; otherwise the whole
        database is summarized.
    exclude_partial_gene_calls : when true, gene calls flagged 'partial' are
        reported under 'excluded_gene_ids' and left out of the gene statistics.

    Returns
    -------
    dict
        The info dict described above.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    # Partition the candidates into the genes we report on and the partial
    # gene calls we were asked to leave out.
    gene_caller_ids = set()
    excluded_gene_ids = set()
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    gene_lengths = {}
    for gene_caller_id in gene_caller_ids:
        gene_call = c.genes_in_contigs_dict[gene_caller_id]
        gene_lengths[gene_caller_id] = gene_call['stop'] - gene_call['start']

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = gene_lengths
    # On Python 3 `.values()` is a view object that numpy does not treat as a
    # sequence of numbers (numpy.mean raises TypeError on it), so hand numpy a
    # real list. This is equally valid on Python 2.
    info_dict['avg_gene_length'] = numpy.mean(list(gene_lengths.values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    p_completion, p_redundancy, domain, domain_confidence, _ = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names if split_names else set(c.splits_basic_info.keys()))

    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict