Example #1
    def run_kb_dram_annotate(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_dram_annotate
        # validate inputs
        if not isinstance(params['assembly_input_ref'], str) or not len(
                params['assembly_input_ref']):
            raise ValueError('Pass in a valid assembly reference string')
        if not isinstance(params['desc'], str) or not len(params['desc']):
            raise ValueError('Pass in a valid genomeSet description')
        if not isinstance(params['output_name'], str) or not len(
                params['output_name']):
            raise ValueError('Pass in a valid genomeSet output name')
        if not isinstance(params['min_contig_size'],
                          int) or (params['min_contig_size'] < 0):
            raise ValueError('Min contig size must be a non-negative integer')

        # setup params
        with open("/kb/module/kbase.yml", 'r') as stream:
            data_loaded = yaml.safe_load(stream)
        version = str(data_loaded['module-version'])

        is_metagenome = params['is_metagenome']
        min_contig_size = params['min_contig_size']
        trans_table = str(params['trans_table'])
        bitscore = params['bitscore']
        rbh_bitscore = params['rbh_bitscore']
        output_dir = os.path.join(self.shared_folder, 'DRAM_annos')
        output_objects = []

        # create Util objects
        wsClient = workspaceService(self.workspaceURL, token=ctx['token'])
        assembly_util = AssemblyUtil(self.callback_url)
        genome_util = GenomeFileUtil(self.callback_url)

        # set DRAM database locations
        print('DRAM version: %s' % dram_version)
        import_config('/data/DRAM_databases/CONFIG')
        # This is a hack to get around a bug in my database setup
        set_database_paths(
            description_db_loc='/data/DRAM_databases/description_db.sqlite')
        print_database_locations()

        # get files
        assemblies = assembly_util.get_fastas(
            {'ref_lst': [params['assembly_input_ref']]})
        # would paths ever have more than one thing?
        fasta_locs = [
            assembly_data['paths'][0]
            for assembly_ref, assembly_data in assemblies.items()
        ]
        # get assembly refs from dram assigned genome names
        assembly_ref_dict = {
            os.path.splitext(
                os.path.basename(
                    remove_suffix(assembly_data['paths'][0], '.gz')))[0]:
            assembly_ref
            for assembly_ref, assembly_data in assemblies.items()
        }
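        # For illustration (hypothetical values): a single input FASTA at
        # '/some/dir/my_assembly.fa.gz' with reference '12345/6/7' would yield
        # assembly_ref_dict == {'my_assembly': '12345/6/7'}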

        # annotate and distill with DRAM
        annotate_bins(fasta_locs,
                      output_dir,
                      min_contig_size,
                      trans_table=trans_table,
                      bit_score_threshold=bitscore,
                      rbh_bit_score_threshold=rbh_bitscore,
                      low_mem_mode=True,
                      rename_bins=False,
                      keep_tmp_dir=False,
                      threads=THREADS,
                      verbose=False)
        output_files = get_annotation_files(output_dir)
        distill_output_dir = os.path.join(output_dir, 'distilled')
        summarize_genomes(output_files['annotations']['path'],
                          output_files['trnas']['path'],
                          output_files['rrnas']['path'],
                          output_dir=distill_output_dir,
                          groupby_column='fasta')
        output_files = get_distill_files(distill_output_dir, output_files)

        if is_metagenome:
            pass  # TODO: make annotated metagenome object
        else:
            # generate genome files
            annotations = pd.read_csv(output_files['annotations']['path'],
                                      sep='\t',
                                      index_col=0)
            genome_objects = generate_genomes(
                annotations, output_files['genes_fna']['path'],
                output_files['genes_faa']['path'], assembly_ref_dict,
                assemblies, params["workspace_name"], ctx.provenance())

            genome_ref_dict = dict()
            genome_set_elements = dict()
            for genome_object in genome_objects:
                info = genome_util.save_one_genome(genome_object)["info"]
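                # build a workspace reference of the form wsid/objid/version from the saved object's info tuple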
                genome_ref = '%s/%s/%s' % (info[6], info[0], info[4])
                genome_set_elements[genome_object["name"]] = {
                    'ref': genome_ref
                }
                output_objects.append({
                    "ref": genome_ref,
                    "description": 'Annotated Genome'
                })
                genome_ref_dict[genome_object["name"]] = genome_ref

            # add ontology terms
            anno_api = annotation_ontology_api(service_ver="beta")
            ontology_events = add_ontology_terms(annotations, params['desc'],
                                                 version,
                                                 params['workspace_name'],
                                                 self.workspaceURL,
                                                 genome_ref_dict)
            for ontology_event in ontology_events:
                anno_api.add_annotation_ontology_events(ontology_event)

            # make genome set
            # TODO: only make genome set if there is more than one genome
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            else:
                provenance = [{}]
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = list(genome_ref_dict.values())
            provenance[0]['service'] = 'kb_SetUtilities'
            provenance[0]['method'] = 'KButil_Batch_Create_GenomeSet'
            output_genomeSet_obj = {
                'description': params['desc'],
                'elements': genome_set_elements
            }
            output_genomeSet_name = params['output_name']
            new_obj_info = wsClient.save_objects({
                'workspace': params['workspace_name'],
                'objects': [{
                    'type': 'KBaseSearch.GenomeSet',
                    'data': output_genomeSet_obj,
                    'name': output_genomeSet_name,
                    'meta': {},
                    'provenance': provenance
                }]
            })[0]
            genome_set_ref = '%s/%s/%s' % (new_obj_info[6], new_obj_info[0],
                                           new_obj_info[4])
            output_objects.append({
                "ref": genome_set_ref,
                "description": params['desc']
            })

        # generate report
        product_html_loc = os.path.join(distill_output_dir, 'product.html')
        report = generate_product_report(self.callback_url,
                                         params['workspace_name'], output_dir,
                                         product_html_loc, output_files,
                                         output_objects)
        output = {
            'report_name': report['name'],
            'report_ref': report['ref'],
        }
        #END run_kb_dram_annotate

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_dram_annotate return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
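For orientation, a minimal sketch of the params mapping this method reads; the object reference, workspace name, and threshold values below are hypothetical placeholders, not values taken from the source:

params = {
    'assembly_input_ref': '12345/6/7',   # hypothetical Assembly/AssemblySet reference
    'desc': 'DRAM annotated genomes',
    'output_name': 'DRAM_genome_set',
    'min_contig_size': 2500,
    'is_metagenome': 0,
    'trans_table': 11,
    'bitscore': 60,
    'rbh_bitscore': 350,
    'workspace_name': 'my_workspace',    # hypothetical workspace name
}
# assuming `impl` is an instance of this Impl class and `ctx` is a KBase call context
output = impl.run_kb_dram_annotate(ctx, params)[0]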
Example #2
    def test_what_is_fastas(self):
        # exploratory test: fetch the FASTA for a hardcoded assembly reference
        # and inspect what get_fastas returns
        assembly_util = AssemblyUtil(self.callback_url)
        fastas = assembly_util.get_fastas({'ref_lst': ['41343/11/3']})
        print(fastas)
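Judging by how Example #1 consumes the same call (assembly_data['paths'][0]), the printed value should be a mapping from each assembly reference to a dict that includes a 'paths' list of local FASTA file locations, roughly of this shape (the file path here is hypothetical):

{'41343/11/3': {'paths': ['/kb/module/work/tmp/some_assembly.fa']}}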
Example #3
    def run_kb_dramv_annotate(self, ctx, params):
        """
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_dramv_annotate
        warnings.filterwarnings("ignore")

        # setup
        affi_contigs_shock_ids = params['affi_contigs_shock_id']
        min_contig_size = params['min_contig_size']
        trans_table = str(params['trans_table'])
        bitscore = params['bitscore']
        rbh_bitscore = params['rbh_bitscore']

        assembly_util = AssemblyUtil(self.callback_url)
        datafile_util = DataFileUtil(self.callback_url)

        # get contigs and merge
        assemblies = assembly_util.get_fastas(
            {'ref_lst': [params['assembly_input_ref']]})
        fasta = os.path.join(self.shared_folder, 'merged_contigs.fasta')
        with open(fasta, 'w') as f:
            for assembly_ref, assembly_data in assemblies.items():
                fasta_path = assembly_data['paths'][0]
                with open(fasta_path) as infile:
                    for line in infile:
                        f.write(line)

        # get affi contigs, read all and merge
        affi_contigs_path = os.path.join(self.shared_folder,
                                         'VIRSorter_affi-contigs.tab')
        with open(affi_contigs_path, 'w') as f:
            for affi_contigs_shock_id in affi_contigs_shock_ids:
                temp_affi_contigs_path = os.path.join(
                    self.shared_folder, 'temp_VIRSorter_affi-contigs.tab')
                temp_affi_contigs = datafile_util.shock_to_file({
                    'shock_id': affi_contigs_shock_id,
                    'file_path': temp_affi_contigs_path,
                    'unpack': 'unpack'
                })['file_path']
                with open(temp_affi_contigs) as infile:
                    for line in infile:
                        f.write(line)
                os.remove(temp_affi_contigs)

        # set DRAM database locations
        print('DRAM version: %s' % dram_version)
        import_config('/data/DRAM_databases/CONFIG')
        # This is a hack to get around a bug in my database setup
        set_database_paths(
            description_db_loc='/data/DRAM_databases/description_db.sqlite')
        print_database_locations()

        # clean bad characters out of the fasta and affi contigs files
        cleaned_fasta = os.path.join(
            self.shared_folder, '%s.cleaned.fasta' % os.path.basename(fasta))
        remove_bad_chars(input_fasta=fasta, output=cleaned_fasta)
        cleaned_affi_contigs = os.path.join(
            self.shared_folder, 'VIRSorter_affi-contigs.cleaned.tab')
        remove_bad_chars(input_virsorter_affi_contigs=affi_contigs_path,
                         output=cleaned_affi_contigs)

        # annotate and distill
        output_dir = os.path.join(self.shared_folder, 'DRAM_annos')
        annotate_vgfs(cleaned_fasta,
                      cleaned_affi_contigs,
                      output_dir,
                      min_contig_size,
                      trans_table=trans_table,
                      bit_score_threshold=bitscore,
                      rbh_bit_score_threshold=rbh_bitscore,
                      low_mem_mode=True,
                      keep_tmp_dir=False,
                      threads=THREADS,
                      verbose=False)
        output_files = get_annotation_files(output_dir)
        distill_output_dir = os.path.join(output_dir, 'distilled')
        summarize_vgfs(output_files['annotations']['path'],
                       distill_output_dir,
                       groupby_column='scaffold')
        output_files = get_viral_distill_files(distill_output_dir,
                                               output_files)

        # generate report
        product_html_loc = os.path.join(distill_output_dir, 'product.html')
        report = generate_product_report(self.callback_url,
                                         params['workspace_name'], output_dir,
                                         product_html_loc, output_files)
        output = {
            'report_name': report['name'],
            'report_ref': report['ref'],
        }
        #END run_kb_dramv_annotate

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_dramv_annotate return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
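As with Example #1, a minimal sketch of the params mapping this method reads; the reference, Shock ID, workspace name, and threshold values are hypothetical placeholders:

params = {
    'assembly_input_ref': '12345/8/1',                    # hypothetical viral-contigs assembly reference
    'affi_contigs_shock_id': ['hypothetical-shock-id'],   # Shock IDs of VIRSorter_affi-contigs.tab files
    'min_contig_size': 2500,
    'trans_table': 11,
    'bitscore': 60,
    'rbh_bitscore': 350,
    'workspace_name': 'my_workspace',
}
# assuming `impl` is an instance of this Impl class and `ctx` is a KBase call context
output = impl.run_kb_dramv_annotate(ctx, params)[0]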