def prepareTestData(cls):
        """This function creates an assembly object for testing"""
        fasta_content = '>seq1 something soemthing asdf\n' \
                        'agcttttcat\n' \
                        '>seq2\n' \
                        'agctt\n' \
                        '>seq3\n' \
                        'agcttttcatgg'

        filename = os.path.join(cls.scratch, 'test1.fasta')
        with open(filename, 'w') as f:
            f.write(fasta_content)
        assemblyUtil = AssemblyUtil(cls.callback_url)
        cls.assembly_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filename},
            'workspace_name': cls.wsName,
            'assembly_name': 'TestAssembly'
        })
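
A minimal usage sketch (a hypothetical test method; assumes prepareTestData ran in setUpClass and that the saved reference round-trips through AssemblyUtil):

    def test_assembly_roundtrip(self):
        # Hypothetical test: download the assembly saved by prepareTestData
        # and confirm the three test sequences survived the round trip.
        au = AssemblyUtil(self.callback_url)
        fasta = au.get_assembly_as_fasta({'ref': self.assembly_ref})
        with open(fasta['path']) as f:
            headers = [line for line in f if line.startswith('>')]
        self.assertEqual(len(headers), 3)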
Example #2
    def stage_assembly_files(self, object_list):
        """
        _stage_assembly_files: download the fasta files to the scratch area
        return list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                filename = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
            except ServerError as assembly_error:
                print(str(assembly_error))
                raise

            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
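
Hypothetical call (the UPA strings and resulting paths are placeholders):

    staged = self.stage_assembly_files(['12345/6/1', '12345/7/1'])
    # staged is a list of local FASTA paths under the scratch area,
    # e.g. ['/kb/module/work/tmp/assembly_a.fa', ...]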
Example #3
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_staging_exporter'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_staging_exporter',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_staging_exporter(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.ru = ReadsUtils(cls.callback_url)
        cls.au = AssemblyUtil(cls.callback_url)
        cls.gfu = GenomeFileUtil(cls.callback_url, service_ver='dev')
        cls.rau = ReadsAlignmentUtils(cls.callback_url)
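
This setUpClass has no matching teardown in the excerpt; a common companion is the sketch below, assuming a test workspace gets created and stored as cls.wsName elsewhere in the suite:

    @classmethod
    def tearDownClass(cls):
        # Delete the test workspace, if one was created.
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')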
Example #4
 def setUpClass(cls):
     token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('MetagenomeUtils'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token': token,
         'user_id': user_id,
         'provenance': [{
             'service': 'MetagenomeUtils',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated': 1
     })
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL)
     cls.serviceImpl = MetagenomeUtils(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     suffix = int(time.time() * 1000)
     wsName = "test_kb_maxbin_" + str(suffix)
     cls.ws_info = cls.wsClient.create_workspace({'workspace': wsName})
     cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
     cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=token)
Example #5
    def getAssemblyInfo(self, ass_name):
        if hasattr(self.__class__, 'assemblyInfo'):
            if self.__class__.assemblyInfo.get(ass_name):
                return self.__class__.assemblyInfo[ass_name]

        # copy the local test file to the shared scratch space so that the AssemblyUtil
        # container can see it.
        test_fasta_file_local = os.path.join('data', 'assemblies', ass_name)
        test_fasta_file_scratch = os.path.join(self.scratch, os.path.basename(test_fasta_file_local))
        shutil.copy(test_fasta_file_local, test_fasta_file_scratch)

        # call the AssemblyUtil library to upload the test data to KBase
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        ass_ref = au.save_assembly_from_fasta({'file': {'path': test_fasta_file_scratch},
                                               'workspace_name': self.getWsName(),
                                               'assembly_name': ass_name})

        # get the object metadata for the new test dataset
        new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': ass_ref}]})
        if not hasattr(self.__class__, 'assemblyInfo'):
            self.__class__.assemblyInfo = dict()
        self.__class__.assemblyInfo[ass_name] = new_obj_info[0]
        return new_obj_info[0]
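
Hypothetical usage inside a test (the filename is a placeholder for a FASTA fixture under data/assemblies/); the first call uploads and caches, repeat calls return the cached object info tuple:

    info = self.getAssemblyInfo('test_assembly.fa')
    assembly_ref = '{}/{}/{}'.format(info[6], info[0], info[4])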
Example #6
 def setUpClass(cls):
     config_file = environ.get("KB_DEPLOYMENT_CONFIG", None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items("ProkkaAnnotation"):
         cls.cfg[nameval[0]] = nameval[1]
     # Token validation
     token = environ.get("KB_AUTH_TOKEN", None)
     authServiceUrl = cls.cfg.get(
         "auth-service-url",
         "https://kbase.us/services/authorization/Sessions/Login")
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don"t call any logging methods on the context object,
     # it"ll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         "token": token,
         "user_id": user_id,
         "provenance": [{
             "service": "ProkkaAnnotation",
             "method": "please_never_use_it_in_production",
             "method_params": []
         }],
         "authenticated": 1
     })
     cls.wsURL = cls.cfg["workspace-url"]
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = ProkkaAnnotation(cls.cfg)
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.gfu = GenomeFileUtil(cls.callback_url)
     cls.au = AssemblyUtil(cls.callback_url)
     cls.scratch = cls.cfg['scratch']
Example #7
    def load_genome_direct(cls, filename, assembly_filename, obj_name):
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_path = os.path.join(cls.cfg['scratch'], os.path.basename(assembly_filename))
        shutil.copy(assembly_filename, assembly_path)
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': obj_name + '.assembly',
            'file': {'path': assembly_path}
        })

        with open(filename) as f:
            data = json.load(f)
        data['assembly_ref'] = assembly_ref
        save_info = {
            'workspace': cls.wsName,
            'objects': [{
                'data': data,
                'name': obj_name + '.genome',
                'type': 'KBaseGenomes.Genome',
            }],
        }
        info = cls.wsClient.save_objects(save_info)[0]['info']
        ref = f"{info[6]}/{info[0]}/{info[4]}"
        print('created test genome: ' + ref + ' from file ' + filename)
        return ref
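
Hypothetical call (the fixture paths and object name are placeholders):

    genome_ref = cls.load_genome_direct('data/my_genome.json',
                                        'data/my_genome_assembly.fa',
                                        'my_genome')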
Example #8
 def __init__(self):
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.au = AssemblyUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
Example #9
 def __init__(self, config):
     self.scratch = os.path.abspath(config['scratch'])
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.mgu = MetagenomeUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.ws = Workspace(config['workspace-url'], token=config['token'])
Example #10
    def finish_run(self, params):
        """
        Finish up the run by uploading output and
        creating the report
        """
        console = []
        self.log(console, 'Running post')

        # run hipmer, capture output as it happens
        self.log(console, 'running hipmer:')

        # grab path of output contigs
        output_contigs = ''
        for root, subdirs, files in os.walk(self.scratch):
            for f in files:
                if f == 'final_assembly.fa':
                    output_contigs = os.path.join(root, f)
                    print("found OUTPUT CONTIGS {}".format(output_contigs))
                    continue

        output_name = params['output_contigset_name']
        slurm_out = os.path.join(self.scratch, 'slurm.out')

        if not os.path.exists(output_contigs):
            self.log(console, "It looks like HipMER failed. Could not find the output contigs.")
            self.log(console, "Show errors in log file")
            with open(slurm_out, 'r') as f:
                for line in f:
                    if line.lower().find('error') >= 0:
                        self.log(console, line)
            raise RuntimeError("Error in HipMER execution")

        wsname = params['workspace_name']

        self.log(console, 'Filtering short length contigs from HipMer assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL, token=self.token)

        assembly_size_filter = params['assembly_size_filter']

        filtered_fasta_file_path = self.filter_contigs_by_length(output_contigs, assembly_size_filter)

        if os.stat(filtered_fasta_file_path).st_size == 0:
            raise ValueError("Error: Using input parameters, you have filtered all contigs from the HipMer \
                             assembly. Decrease the minimum contig size and try again.")
        else:
            output_contigs = filtered_fasta_file_path

        self.log(console, 'Uploading FASTA file to Assembly')

        save_input = {'file': {'path': output_contigs},
                      'workspace_name': wsname,
                      'assembly_name': output_name
                      }

        output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)

        # create a Report
        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/'
        report += params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   %d\t--\t%d' % (counts[c], edges[c])
            report += ' to %d bp\n' % (edges[c + 1])

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except Exception as e:
            # not really any way to test this, all inputs have been checked
            # earlier and should be ok
            print('Logging exception from running QUAST')
            print((str(e)))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref,
                                      'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_hipmer_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except Exception as e:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print((str(e)))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref']
                  }
        return output
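
finish_run calls self.filter_contigs_by_length, which is not shown in this excerpt. A minimal sketch of what such a helper presumably does (a length filter via Biopython that writes the result back to scratch; the output filename is an assumption):

    def filter_contigs_by_length(self, fasta_path, min_length):
        # Assumed helper: keep contigs >= min_length and write them
        # to a new FASTA file in the scratch area.
        filtered_path = os.path.join(self.scratch, 'filtered_contigs.fa')
        kept = (rec for rec in SeqIO.parse(fasta_path, 'fasta')
                if len(rec.seq) >= int(min_length))
        SeqIO.write(kept, filtered_path, 'fasta')
        return filtered_path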
Example #11
    def test_fractionate_contigs_ASSEMBLY_GENOMESET_06(self):
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_GENOMESET_06'

        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            gfuClient = GenomeFileUtil(self.callback_url,
                                       token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate gfuClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))

        base_1 = 'assembly_1plus2'
        base_2a = 'assembly_2a'
        base_2b = 'assembly_2b'
        type_1 = 'Assembly'
        type_2a = 'Genome'
        type_2b = 'Genome'
        ass_file_1_fa = base_1 + '.fa.gz'
        ass_file_2a_fa = base_2a + '.fa.gz'
        ass_file_2b_fa = base_2b + '.fa.gz'
        ass_file_2a_gff = base_2a + '.gff'
        ass_file_2b_gff = base_2b + '.gff'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        ass_path_2a_fa = os.path.join(self.scratch, ass_file_2a_fa)
        ass_path_2b_fa = os.path.join(self.scratch, ass_file_2b_fa)
        ass_path_2a_gff = os.path.join(self.scratch, ass_file_2a_gff)
        ass_path_2b_gff = os.path.join(self.scratch, ass_file_2b_gff)
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copy(os.path.join("data", ass_file_2a_fa), ass_path_2a_fa)
        shutil.copy(os.path.join("data", ass_file_2b_fa), ass_path_2b_fa)
        shutil.copy(os.path.join("data", ass_file_2a_gff), ass_path_2a_gff)
        shutil.copy(os.path.join("data", ass_file_2b_gff), ass_path_2b_gff)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_1_fa},
            'workspace_name': self.getWsName(),
            'assembly_name': base_1 + '.' + type_1
        })
        ass_ref_2a = gfuClient.fasta_gff_to_genome({
            'fasta_file': {'path': ass_path_2a_fa},
            'gff_file': {'path': ass_path_2a_gff},
            'generate_missing_genes': 1,
            'source': 'GFF',
            'scientific_name': base_2a,
            'workspace_name': self.getWsName(),
            'genome_name': base_2a + '.' + type_2a
        }).get('genome_ref')
        ass_ref_2b = gfuClient.fasta_gff_to_genome({
            'fasta_file': {'path': ass_path_2b_fa},
            'gff_file': {'path': ass_path_2b_gff},
            'generate_missing_genes': 1,
            'source': 'GFF',
            'scientific_name': base_2b,
            'workspace_name': self.getWsName(),
            'genome_name': base_2b + '.' + type_2b
        }).get('genome_ref')

        # GenomeSet
        genomeSet_obj = {
            'description': 'test genomeSet',
            'elements': {
                'genome_0': {
                    'ref': ass_ref_2a
                },
                'genome_1': {
                    'ref': ass_ref_2b
                }
            }
        }
        provenance = [{}]
        genomeSet_info = self.getWsClient().save_objects({
            'workspace': self.getWsName(),
            'objects': [{
                'type': 'KBaseSearch.GenomeSet',
                'data': genomeSet_obj,
                'name': 'test_genomeSet_2a2b',
                'meta': {},
                'provenance': provenance
            }]
        })[0]
        genomeSet_ref = str(genomeSet_info[WSID_I]) + '/' + \
                        str(genomeSet_info[OBJID_I]) + '/' + \
                        str(genomeSet_info[VERSION_I])

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'neg'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_ref': ass_ref_1,
            'input_pos_filter_obj_refs': [genomeSet_ref],
            'fractionate_mode': fractionate_mode,
            'output_name': 'test_fractionated' + '-' + base_1 + '.' + type_1 +
                           '-' + 'genomeset_2a2b' + '-' + fractionate_mode
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
Example #12
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('SetAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        authServiceUrl = cls.cfg.get(
            'auth-service-url',
            "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'SetAPI',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = SetAPI(cls.cfg)

        # setup data at the class level for now (so that the code is run
        # once for all tests, not before each test case.  Not sure how to
        # do that outside this function..)
        suffix = int(time.time() * 1000)
        wsName = "test_SetAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': wsName})
        #        wsName = 'pranjan77:1477441032423'
        cls.wsName = wsName
        # copy test file to scratch area
        fna_filename = "seq.fna"
        fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
        shutil.copy(os.path.join("data", fna_filename), fna_path)

        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.assembly1ref = au.save_assembly_from_fasta({
            'file': {'path': fna_path},
            'workspace_name': wsName,
            'assembly_name': 'assembly_obj_1'
        })
        cls.assembly2ref = au.save_assembly_from_fasta({
            'file': {'path': fna_path},
            'workspace_name': wsName,
            'assembly_name': 'assembly_obj_2'
        })
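
With the two assemblies saved, a SetAPI test would typically bundle them into an AssemblySet. A sketch, assuming the standard save_assembly_set_v1 method, item structure, and 'set_ref' return key:

        set_data = {'description': 'test assembly set',
                    'items': [{'ref': cls.assembly1ref, 'label': 'assembly_1'},
                              {'ref': cls.assembly2ref, 'label': 'assembly_2'}]}
        res = cls.serviceImpl.save_assembly_set_v1(cls.ctx, {
            'workspace': wsName,
            'output_object_name': 'test_assembly_set',
            'data': set_data})[0]
        cls.assembly_set_ref = res['set_ref']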
Example #13
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           k_min - minimum kmer size (<= 255), must be odd number, defaults
           to 21 k_max - maximum kmer size (<= 255), must be odd number,
           defaults to 141 k_step - increment of kmer size of each iteration
           (<= 28), must be even number, defaults to 10 k_list - list of kmer
           sizes (all must be odd, in the range 15-255, increment <= 28);
           override using `--k-min', `--k-max' and `--k-step'
           min_contig_length - minimum length of contigs to output, default
           is 2000 max_mem_percent - maximum memory to make available to
           MEGAHIT, as a percentage of available system memory (optional,
           default = 0.9 or 90%) @optional megahit_parameter_preset @optional
           min_count @optional k_min @optional k_max @optional k_step
           @optional k_list @optional min_contig_length @optional
           max_mem_percent) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long, parameter "max_mem_percent"
           of Double
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if params.get('megahit_parameter_preset'):
            megahit_cmd.append('--presets')
            megahit_cmd.append(params['megahit_parameter_preset'])

        if params.get('min_count'):
            megahit_cmd.append('--min-count')
            megahit_cmd.append(str(params['min_count']))
        if params.get('k_min'):
            megahit_cmd.append('--k-min')
            megahit_cmd.append(str(params['k_min']))
        if params.get('k_max'):
            megahit_cmd.append('--k-max')
            megahit_cmd.append(str(params['k_max']))
        if params.get('k_step'):
            megahit_cmd.append('--k-step')
            megahit_cmd.append(str(params['k_step']))
        if params.get('k_list'):
            megahit_cmd.append('--k-list')
            megahit_cmd.append(','.join(str(k) for k in params['k_list']))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if params.get('min_contig_length'):
            if str(params['min_contig_length']).isdigit():
                min_contig_length = params['min_contig_length']
            else:
                raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # Set the number of CPUs to the number of cores minus 1
        megahit_cmd.append('--num-cpu-threads')
        megahit_cmd.append(str(max([(multiprocessing.cpu_count() - 1), 1])))

        # set mem usage
        # Note: this just sets the default value - 90% of available system memory allocated
        # to the container. Exposing it here as a place to later expose as a parameter.
        max_mem_percent = params.get('max_mem_percent', 0.9)
        megahit_cmd.append('-m')
        megahit_cmd.append(str(max_mem_percent))

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            error_str = report_megahit_error(output_dir, retcode)
            raise RuntimeError(error_str)

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except ServerError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except ServerError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
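
run_megahit calls report_megahit_error (not shown in this excerpt). A sketch of what such a helper might do, assuming MEGAHIT writes its log to <output_dir>/log:

def report_megahit_error(output_dir, retcode):
    # Assumed helper: pull the tail of the MEGAHIT log, if present,
    # into the error message so failures are visible in the app log.
    error_str = 'Error running MEGAHIT, return code: ' + str(retcode)
    log_path = os.path.join(output_dir, 'log')
    if os.path.exists(log_path):
        with open(log_path) as log_file:
            tail = log_file.readlines()[-20:]
        error_str += '\nLast lines of the MEGAHIT log:\n' + ''.join(tail)
    return error_str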
Example #14
    def run_kraken2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kraken2

        # Download input data as FASTA or FASTQ
        logging.info('Calling run_kraken2')
        logging.info(f'params {params}')
        # Check for presence of input file types in params
        input_genomes = ('input_genomes' in params
                         and len(params['input_genomes']) > 0
                         and None not in params['input_genomes'])
        input_refs = ('input_refs' in params
                      and len(params['input_refs']) > 0
                      and None not in params['input_refs'])
        input_paired_refs = ('input_paired_refs' in params
                             and len(params['input_paired_refs']) > 0
                             and None not in params['input_paired_refs'])
        for name in ['workspace_name', 'db_type']:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not input_genomes and not input_refs and not input_paired_refs:
            raise ValueError(
                'You must enter either an input genome or input reads')

        if input_refs and input_paired_refs:
            raise ValueError(
                'You must enter either single-end or paired-end reads, '
                'but not both')

        if input_genomes and (input_refs or input_paired_refs):
            raise ValueError(
                'You must enter either an input genome or input reads, '
                'but not both')

        if input_genomes and (not isinstance(params['input_genomes'][0], str)):
            raise ValueError('Pass in a valid input genome string')

        if input_refs and (not isinstance(params['input_refs'], list)):
            raise ValueError('Pass in a list of input references')

        if input_paired_refs and (not isinstance(params['input_paired_refs'],
                                                 list)):
            raise ValueError('Pass in a list of input references')

        logging.info(params['db_type'])
        logging.info(
            f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}'
        )
        input_string = []
        if input_genomes:
            assembly_util = AssemblyUtil(self.callback_url)
            fasta_file_obj = assembly_util.get_assembly_as_fasta(
                {'ref': params['input_genomes'][0]})
            logging.info(fasta_file_obj)
            fasta_file = fasta_file_obj['path']
            input_string.append(fasta_file)

        if input_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_refs']})
            print(f"Input parameters {params['input_refs']}, {params['db_type']}, "
                  f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            logging.info(f"fastq files {fastq_files}")
            input_string.append(' '.join(fastq_files))

        if input_paired_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_paired_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_paired_refs']})
            print(f"Input parameters {params['input_paired_refs']}, {params['db_type']}, "
                  f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            # input_string.append('--paired')
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            # if len(fastq_files) % 2 != 0:
            #     raise ValueError('There must be an even number of Paired-end reads files')
            logging.info(f"fastq files {fastq_files}")
            input_string.extend(fastq_files)

        logging.info(f'input_string {input_string}')

        output_dir = os.path.join(self.shared_folder, 'kraken2_output')
        report_file_name = 'report.txt'
        report_file = os.path.join(output_dir, report_file_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        outprefix = "kraken2"

        cmd = [
            '/kb/module/lib/kraken2/src/kraken2.sh', '-d',
            '/data/kraken2/' + params['db_type'], '-o', output_dir, '-p',
            outprefix, '-t', '1', '-i'
        ]
        cmd.extend(input_string)

        # cmd = ['kraken2', '--db', '/data/kraken2/' + params['db_type'],
        #        '--output', output_dir, '--report', report_file,
        #        '--threads', '1']
        # cmd.extend(['--confidence', str(params['confidence'])]) if 'confidence' in params else cmd

        logging.info(f'cmd {cmd}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        summary_file = os.path.join(output_dir, outprefix + '.report.csv')
        report_dir = os.path.join(output_dir, 'html_report')
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html')
        self._generate_DataTable(summary_file, summary_file_dt)
        shutil.copy2('/kb/module/lib/kraken2/src/index.html',
                     os.path.join(report_dir, 'index.html'))
        shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'),
                     os.path.join(report_dir, 'kraken2.krona.html'))
        shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'),
                    os.path.join(report_dir, 'kraken2.tree.svg'))
        html_zipped = self.package_folder(report_dir, 'index.html',
                                          'index.html')

        # columns = [
        #     'Percentage of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments assigned directly to this taxon', 'rank code',
        #     'taxid', 'name']
        # report_df = pd.read_csv(report_file, sep='\t',
        #                         header=None, names=columns)
        # code_dict = {'U': 'Unclassified', 'R': 'Root', 'D': 'Domain',
        #              'K': 'Kingdom', 'P': 'Phylum', 'C': 'Class', 'O': 'Order',
        #              'F': 'Family', 'G': 'Genus', 'S': 'Species'}
        # report_df['rank code'] = report_df['rank code'].apply(
        #     lambda x: code_dict[x[0]] + x[1] if len(x) > 1 else code_dict[x])

        # self._generate_report_table(report_df, report_html_file, output_dir)
        # report_df.to_html(report_html_file, classes='Kraken2_report', index=False)
        # html_zipped = self.package_folder(output_dir, 'report.html',
        #                                   'report')
        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            output_path = os.path.join(output_dir, output)
            if not os.path.isdir(output_path):
                output_files_list.append({
                    'path': output_path,
                    'name': output
                })
        message = f"Kraken2 run finished on {input_string} against {params['db_type']}."
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }

        # STEP 6: construct the output to send back
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
            'report_params': report_output['report_params']
        }
        #END run_kraken2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kraken2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
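
run_kraken2 relies on self.package_folder (not shown) to zip the HTML report directory for KBaseReport. A sketch, assuming the usual DataFileUtil file_to_shock pattern with pack='zip':

    def package_folder(self, folder_path, zip_file_name, zip_file_description):
        # Assumed helper: push a zipped folder to Shock and return the
        # html_links entry that KBaseReport.create_extended_report expects.
        dfu = DataFileUtil(self.callback_url)
        upload = dfu.file_to_shock({'file_path': folder_path,
                                    'make_handle': 0,
                                    'pack': 'zip'})
        return {'shock_id': upload['shock_id'],
                'name': zip_file_name,
                'description': zip_file_description}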
Example #15
    def run_rmrContigFilter(self, ctx, params):
        """
        Example app which filters contigs in an assembly using a minimum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_rmrContigFilter

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_rmrContigFilter function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 4b - Build html report
        # create html string
        # write string to file to self.shared_folder
        # upload to shock
        # send to report

        html_header = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>title</title></head><body><table>"
        html_footer = "</table></body></html>"

        tableentries = "<tr><th>ID</th><th>A %</th><th>C %</th><th>T %</th><th>G %</th></tr>"
        for contig in good_contigs:
            Acount = contig.seq.upper().count('A')
            Ccount = contig.seq.upper().count('C')
            Tcount = contig.seq.upper().count('T')
            Gcount = contig.seq.upper().count('G')
            total = Acount + Ccount + Tcount + Gcount

            Aper = 100 * (Acount / total)
            Cper = 100 * (Ccount / total)
            Gper = 100 * (Gcount / total)
            Tper = 100 * (Tcount / total)

            tmprow = "<tr><td>" + contig.id + "</td><td>" + str(round(
                Aper,
                2)) + "</td><td>" + str(round(Cper, 2)) + "</td><td>" + str(
                    round(Tper, 2)) + "</td><td>" + str(round(
                        Gper, 2)) + "</td></tr>"

            tableentries += tmprow

        # Create the html string
        html_str = html_header + tableentries + html_footer

        # Write the html string to a file in the shared folder
        html_file_dir = os.path.join(self.shared_folder, 'html')
        if not os.path.isdir(html_file_dir):
            os.mkdir(html_file_dir)
        html_file_path = os.path.join(html_file_dir, 'output_table.html')
        with open(html_file_path, "w") as html_file:
            html_file.write(html_str)
        """
        Will try to not use shock first
        # Upload the html file to shock
        dfu = DataFileUtil(self.callback_url)

        try:
            shock_html_upload = dfu.file_to_shock({'file_path': html_file_dir, 'make_handle': 0, 'pack':'zip'})
        except:
            raise ValueError('Unable to upload html file to shock with DataFileUtil')
        """

        # Step 5 - Build a Report and return
        """
        Old Report .create method:
        https://github.com/kbaseapps/KBaseReportPy/blob/master/lib/KBaseReportPy/KBaseReportPyImpl.py

        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        """

        # New report .create_extended_report
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'message': 'Filtered Assembly to ' + str(n_remaining) +
                       ' contigs out of ' + str(n_total),
            'direct_html': None,
            'direct_html_link_index': 0,
            'file_links': [],
            #'html_links': [{'shock-id': shock_html_upload['shock_id'], 'name': 'output-table.html', 'label': 'contig table'}],
            'html_links': [{
                'path': html_file_dir,
                'name': 'output_table.html',
                'description': 'HTML report for contig filtering'
            }],
            'workspace_name': params['workspace_name'],
        }

        report = KBaseReport(self.callback_url)
        report_info = report.create_extended_report(reportObj)

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))

        #END run_rmrContigFilter

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_rmrContigFilter return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
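
A hypothetical unit-test invocation of this method through the service implementation (attribute names assumed to follow the test harness shown in the setUpClass examples above):

        ret = self.serviceImpl.run_rmrContigFilter(self.ctx, {
            'workspace_name': self.wsName,
            'assembly_input_ref': self.assembly_ref,
            'min_length': 1000})[0]
        self.assertEqual(ret['n_initial_contigs'],
                         ret['n_contigs_removed'] + ret['n_contigs_remaining'])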
Example #16
    def run_rmrContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of type "rmrContigFiltermaxinput" ->
           structure: parameter "output_workspace" of String, parameter
           "assembly_input_ref" of type "data_obj_ref", parameter
           "output_assembly_name" of String, parameter "min_length" of Long,
           parameter "max_length" of Long, parameter "report_ref" of String,
           parameter "report_name" of String
        :returns: instance of type "ReportResultsmax" -> structure: parameter
           "objNameOrId" of type "assembly_ref", parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_rmrContigFilter_max

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_rmrContigFilter_max function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'output_workspace' not in params:
            raise ValueError(
                'Parameter output_workspace is not set in input arguments')
        workspace_name = params['output_workspace']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')
        if 'max_length' not in params:
            raise ValueError(
                'Parameter max_length is not set in input arguments')
        max_length_orig = params['max_length']
        max_length = None
        try:
            max_length = int(max_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from max_length parameter (' +
                str(max_length_orig) + ')')
        if max_length < 0:
            raise ValueError('max_length parameter cannot be negative (' +
                             str(max_length) + ')')
        if min_length >= max_length:
            raise ValueError(
                'max_length cannot be less than or equal to min_length')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length and len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            #'assembly_name': fasta_file['assembly_name']
            'assembly_name': params['output_assembly_name']
        })

        # Step 5 - Build a Report and return
        report = KBaseReport(self.callback_url)

        # This is the old plain text report given in the SDK tutorial
        #reportObj = {
        #    'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        #    'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        #}

        # This is the old plain text report, we need report.create_extended_report for our new output
        # report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})

        # STEP 6: construct the output to send back

        # We want to output the new assembly in an assembly viewer, to show the dynamic table
        # associated with the new assembly. We also want to keep our report text.

        report_info = report.create_extended_report({
            "message": 'Filtered Assembly to ' + str(n_remaining) +
                       ' contigs out of ' + str(n_total),
            "objects_created": [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            #"workspace_id": params['workspace_id'],
            "workspace_name": params["output_workspace"]
        })

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'objNameOrId': params["output_assembly_name"],
            #'n_initial_contigs': n_total,
            #'n_contigs_removed': n_total - n_remaining,
            #'n_contigs_remaining': n_remaining,
            'wsNameOrId': params['output_workspace'],
            #'workspace_id': report_info['ws_id']
        }

        logging.info('returning:' + pformat(output))

        # This will print the ref # to the new assembly created from the filter
        # print("\n\nNEW ASSEMBLY: "+new_assembly+"\n\n")

        #END run_rmrContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_rmrContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
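
A hedged usage sketch for the method above, not taken from the source module: serviceImpl, ctx, and the assembly UPA are placeholders for whatever a test's setUpClass provides; the parameter names mirror the ones the method reads (assembly_input_ref, min_length, max_length, output_assembly_name, output_workspace).

params = {
    'workspace_name': 'my_workspace',             # placeholder
    'output_workspace': 'my_workspace',           # used for the extended report
    'assembly_input_ref': '123/4/5',              # placeholder Assembly UPA
    'min_length': 500,
    'max_length': 100000,
    'output_assembly_name': 'filtered.Assembly'
}
output = serviceImpl.run_rmrContigFilter_max(ctx, params)[0]
print(output['report_name'], output['report_ref'])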
Example #17
    @classmethod
    def setUpClass(cls):
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = configparser.ConfigParser()
        config.read(config_file)
        cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
        authServiceUrl = cls.cfg.get(
            'auth-service-url',
            "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'GenomeAnnotationAPI',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })

        cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
        cls.impl = GenomeAnnotationAPI(cls.cfg)
        test_gbk_file = "/kb/module/test/data/kb_g.399.c.1.gbk"
        temp_gbk_file = "/kb/module/work/tmp/kb_g.399.c.1.gbk"
        shutil.copy(test_gbk_file, temp_gbk_file)
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        data = json.load(open('data/rhodobacter_contigs.json'))
        # save to ws
        save_info = {
            'workspace':
            wsName,
            'objects': [{
                'type': 'KBaseGenomes.ContigSet',
                'data': data,
                'name': 'rhodo_contigs'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        contigset_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        data = json.load(open('data/rhodobacter.json'))
        data['contigset_ref'] = contigset_ref
        # save to ws
        info = cls.impl.save_one_genome_v1(cls.ctx, {
            'workspace': wsName,
            'name': "rhodobacter",
            'data': data,
        })[0]['info']
        cls.old_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
            info[4])
        print('created old test genome')

        assembly_file_path = os.path.join(cls.cfg['scratch'],
                                          'e_coli_assembly.fasta')
        shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {
                'path': assembly_file_path
            }
        })
        data = json.load(open('data/new_ecoli_genome.json'))
        data['assembly_ref'] = assembly_ref
        # save to ws
        save_info = {
            'workspace':
            wsName,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': 'new_ecoli'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
            info[4])
        print('created new test genome')
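
One recurring idiom in the setup code above is rebuilding an object reference from the tuple returned by save_objects: str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]). A small hypothetical helper makes the intent explicit; positions 6, 0, and 4 of the standard workspace object_info tuple hold the workspace id, object id, and version.

def make_obj_ref(info):
    """Build a 'wsid/objid/version' reference from a workspace object_info tuple."""
    return '{}/{}/{}'.format(info[6], info[0], info[4])

# e.g.: cls.new_genome_ref = make_obj_ref(cls.ws.save_objects(save_info)[0])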
Example #18
    def run_unicycler(self, ctx, params):
        """
        Run Unicycler
        :param params: instance of type "UnicyclerParams" (To run Unicycler,
           you need at least one short read paired end library, and optionally
           unpaired reads (divided into short and long).  All reads of the
           same type must be combined into a single file. workspace_name -
           the name of the workspace from which to take input and store
           output. output_contigset_name - the name of the output contigset.
           short_paired_libraries - a list of short, paired end reads
           libraries. short_unpaired_libraries - a list of short, unpaired
           reads libraries. long_reads_library - a long reads library.
           @optional min_contig_length @optional num_linear_seqs @optional
           bridging_mode) -> structure: parameter "workspace_name" of String,
           parameter "output_contigset_name" of String, parameter
           "short_paired_libraries" of list of type "paired_lib" (The
           workspace object name of a PairedEndLibrary file, whether of the
           KBaseAssembly or KBaseFile type.), parameter
           "short_unpaired_libraries" of list of type "unpaired_lib" (The
           workspace object name of a SingleEndLibrary file, whether of the
           KBaseAssembly or KBaseFile type.), parameter "long_reads_library"
           of String, parameter "min_contig_length" of Long, parameter
           "num_linear_seqs" of Long, parameter "bridging_mode" of String
        :returns: instance of type "UnicyclerOutput" (Output parameters for
           Unicycler run. report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_unicycler
        console = []
        warnings = []
        self.log(
            console, 'Running run_unicycler with params:\n{}'.format(
                json.dumps(params, indent=1)))
        token = self.cfg['KB_AUTH_TOKEN']

        # param checks
        required_params = [
            'workspace_name', 'output_contigset_name', 'min_contig_length',
            'num_linear_seqs', 'bridging_mode'
        ]
        for required_param in required_params:
            if required_param not in params or params[required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        # needs either short paired or long
        if ('short_paired_libraries' not in params
                or params['short_paired_libraries'] is None
                or len(params['short_paired_libraries'])
                == 0) and ('long_reads_library' not in params
                           or params['long_reads_library'] is None):
            raise ValueError(
                "Must define either short_paired_libraries or long_reads_library"
            )

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        if 'input_ws_objects' not in provenance[0]:
            provenance[0]['input_ws_objects'] = []

        if 'short_paired_libraries' in params and params[
                'short_paired_libraries'] is not None and len(
                    params['short_paired_libraries']) > 0:
            provenance[0]['input_ws_objects'].extend(
                params['short_paired_libraries'])
        if 'short_unpaired_libraries' in params and params[
                'short_unpaired_libraries'] is not None and len(
                    params['short_unpaired_libraries']) > 0:
            provenance[0]['input_ws_objects'].extend(
                params['short_unpaired_libraries'])
        if 'long_reads_library' in params and params[
                'long_reads_library'] is not None:
            provenance[0]['input_ws_objects'].append(
                params['long_reads_library'])

        # build command line
        cmd = 'unicycler'

        # download, split, and recombine short paired libraries
        if 'short_paired_libraries' in params and params[
                'short_paired_libraries'] is not None and len(
                    params['short_paired_libraries']) > 0:
            short1, short2 = self.download_short_paired(
                console, token, params['workspace_name'],
                params['short_paired_libraries'])
            cmd += ' -1 ' + short1 + ' -2 ' + short2

        # download and combine short unpaired libraries
        if 'short_unpaired_libraries' in params and params[
                'short_unpaired_libraries'] is not None and len(
                    params['short_unpaired_libraries']) > 0:
            unpaired = self.download_short_unpaired(
                console, token, params['workspace_name'],
                params['short_unpaired_libraries'])
            cmd += ' -s ' + unpaired

        # download long library
        if 'long_reads_library' in params and params[
                'long_reads_library'] is not None:
            longLib = self.download_long(console, warnings, token,
                                         params['workspace_name'],
                                         params['long_reads_library'],
                                         params['min_long_read_length'])
            cmd += ' -l ' + longLib

        # other params
        cmd += ' --min_fasta_length ' + str(params['min_contig_length'])
        cmd += ' --linear_seqs ' + str(params['num_linear_seqs'])
        cmd += ' --mode ' + str(params['bridging_mode'])
        cmd += ' --keep 0'

        if ('no_correct' in params and (params['no_correct'] == 1)):
            cmd += ' --no_correct'

        # output directory
        outputDir = os.path.join(self.scratch,
                                 "unicycler_" + str(uuid.uuid4()))
        self.mkdir_p(outputDir)
        cmd += ' -o ' + outputDir

        # run it
        self.log(console, "command: " + cmd)
        cmdProcess = subprocess.Popen(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.STDOUT,
                                      shell=True)
        for line in cmdProcess.stdout:
            self.log(console, line.decode("utf-8").rstrip())
        cmdProcess.wait()
        if cmdProcess.returncode != 0:
            raise ValueError('Error running ' + cmd)

        # save assembly
        try:
            contigsPath = os.path.join(outputDir, 'assembly.fasta')
            auClient = AssemblyUtil(url=self.callbackURL,
                                    token=token,
                                    service_ver='release')
            auClient.save_assembly_from_fasta({
                'file': {
                    'path': contigsPath
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                params['output_contigset_name']
            })
        except Exception as e:
            raise ValueError('Error saving assembly\n' + str(e))

        # make report
        report_name, report_ref = self.generate_report(
            console, warnings, contigsPath, params, outputDir,
            params['workspace_name'])
        output = {'report_name': report_name, 'report_ref': report_ref}

        #END run_unicycler

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_unicycler return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
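
For reference, a hedged sketch of a params dict that would pass the validation in run_unicycler above. The required keys mirror required_params; min_long_read_length is included because download_long reads it whenever a long reads library is supplied. Every value here is a placeholder, not a recommendation.

params = {
    'workspace_name': 'my_workspace',
    'output_contigset_name': 'unicycler.contigs',
    'short_paired_libraries': ['my_paired_lib'],   # or set long_reads_library instead
    'short_unpaired_libraries': [],
    'long_reads_library': None,
    'min_long_read_length': 1000,
    'min_contig_length': 100,
    'num_linear_seqs': 0,
    'bridging_mode': 'normal'
}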
Example #19
    def download_long(self, console, warnings, token, wsname, lib,
                      min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects':
                                                         [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                  lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console,
                         "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                contig_file = auClient.get_assembly_as_fasta({
                    'ref': lib_ref
                }).get('path')
                long_reads_path = dfuClient.unpack_file(
                    {'file_path': contig_file})['file_path']
                self.log(
                    warnings,
                    "Warning:  Long reads are in FASTA format, so short read check was not performed."
                )

            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console,
                         "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short
                 ] = self.filter_short_fastq(console, long_reads_path,
                                             min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings, "Warning:  Of " + str(n_reads) +
                        " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads."
                    )

        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
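
The filter_short_fastq helper called above is not included in this snippet. A plausible reconstruction, assuming Biopython, that matches the [n_reads, n_reads_short] return shape used above; the real helper may differ.

from Bio import SeqIO

# Hypothetical method; would live on the same Impl class as download_long.
def filter_short_fastq(self, console, fastq_path, min_length):
    """Count reads in a FASTQ file and how many are shorter than min_length."""
    n_reads = 0
    n_reads_short = 0
    for record in SeqIO.parse(fastq_path, 'fastq'):
        n_reads += 1
        if len(record.seq) < min_length:
            n_reads_short += 1
    return [n_reads, n_reads_short]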
Example #20
    def test_fractionate_contigs_ASSEMBLY_BINNEDCONTIGS_08(self):
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_BINNEDCONTIGS_08'

        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            mguClient = MetagenomeUtils(self.callback_url,
                                        token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        base_1 = 'assembly_1plus2'
        base_2 = 'assembly'
        dir_2 = 'binned_contigs'
        type_1 = 'Assembly'
        type_2 = 'BinnedContigs'

        ass_file_1_fa = base_1 + '.fa.gz'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        dir_2_path = os.path.join(self.scratch, dir_2)
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copytree(os.path.join("data", dir_2), dir_2_path)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_1_fa
            },
            'workspace_name':
            self.getWsName(),
            'assembly_name':
            base_1 + '.' + type_1
        })
        binned_contigs_ref_2 = mguClient.file_to_binned_contigs({
            'file_directory':
            dir_2_path,
            'workspace_name':
            self.getWsName(),
            'assembly_ref':
            ass_ref_1,
            'binned_contig_name':
            base_2 + '.' + type_2
        })['binned_contig_obj_ref']

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'neg'
        params = {
            'workspace_name':
            self.getWsName(),
            'input_assembly_ref':
            ass_ref_1,
            'input_pos_filter_obj_refs': [binned_contigs_ref_2],
            'fractionate_mode':
            fractionate_mode,
            'output_name':
            'test_fractionated' + '-' + base_1 + '.' + type_1 + '-' +
            'binned_contigs_2a2b' + '-' + fractionate_mode
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass
Example #21
    def test_fractionate_contigs_ASSEMBLY_ASSEMBLYSET_07(self):
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_ASSEMBLYSET_07'

        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            setAPI_Client = SetAPI(self.serviceWizardURL,
                                   token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))
        base_1 = 'assembly_1plus2'
        base_2a = 'assembly_2a'
        base_2b = 'assembly_2b'
        type_1 = 'Assembly'
        type_2a = 'Assembly'
        type_2b = 'Assembly'
        ass_file_1_fa = base_1 + '.fa.gz'
        ass_file_2a_fa = base_2a + '.fa.gz'
        ass_file_2b_fa = base_2b + '.fa.gz'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        ass_path_2a_fa = os.path.join(self.scratch, ass_file_2a_fa)
        ass_path_2b_fa = os.path.join(self.scratch, ass_file_2b_fa)
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copy(os.path.join("data", ass_file_2a_fa), ass_path_2a_fa)
        shutil.copy(os.path.join("data", ass_file_2b_fa), ass_path_2b_fa)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_1_fa
            },
            'workspace_name':
            self.getWsName(),
            'assembly_name':
            base_1 + '.' + type_1
        })
        ass_ref_2a = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_2a_fa
            },
            'workspace_name':
            self.getWsName(),
            'assembly_name':
            base_2a + '.' + type_2a
        })
        ass_ref_2b = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_2b_fa
            },
            'workspace_name':
            self.getWsName(),
            'assembly_name':
            base_2b + '.' + type_2b
        })

        # AssemblySet
        assemblySet_items = [{
            'ref': ass_ref_2a,
            'label': 'assembly_2a'
        }, {
            'ref': ass_ref_2b,
            'label': 'assembly_2b'
        }]
        assemblySet_obj = {
            'description': 'test assemblySet',
            'items': assemblySet_items
        }
        assemblySet_ref = setAPI_Client.save_assembly_set_v1({
            'workspace_name':
            self.getWsName(),
            'output_object_name':
            'assembly_2a2b.AssemblySet',
            'data':
            assemblySet_obj
        })['set_ref']

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'neg'
        params = {
            'workspace_name':
            self.getWsName(),
            'input_assembly_ref':
            ass_ref_1,
            'input_pos_filter_obj_refs': [assemblySet_ref],
            'fractionate_mode':
            fractionate_mode,
            'output_name':
            'test_fractionated' + '-' + base_1 + '.' + type_1 + '-' +
            'assemblyset_2a2b' + '-' + fractionate_mode
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass
Example #22
    def run_ContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_ContigFilter_max
        # Check that the parameters are valid
        for name in [
                'min_length', 'max_length', 'assembly_ref', 'workspace_name'
        ]:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not isinstance(params['min_length'],
                          int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['max_length'],
                          int) or (params['max_length'] < 0):
            raise ValueError('Max length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(
                params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')

        print(params['min_length'], params['max_length'],
              params['assembly_ref'])
        output = {}

        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': params['assembly_ref']})
        print(fasta_file)

        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
        min_length = params['min_length']
        max_length = params['max_length']

        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length and len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1
        # Create a file to hold the filtered data
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {
                'path': filtered_path
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            fasta_file['assembly_name']
        })
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining), ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [{
                'ref': new_ref,
                'description': 'Filtered contigs'
            }],
            'text_message':
            text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })
        # Return the report reference and name in our results
        output = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END run_ContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_ContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
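    # A hedged usage sketch (not part of the original module): calling the
    # method above through the SDK test harness. serviceImpl and ctx are
    # assumed to come from a setUpClass like the ones elsewhere on this page,
    # and the assembly_ref value is a placeholder UPA.
    #
    #     params = {
    #         'workspace_name': wsName,
    #         'assembly_ref': '123/4/5',
    #         'min_length': 100,
    #         'max_length': 1000000
    #     }
    #     output = serviceImpl.run_ContigFilter_max(ctx, params)[0]
    #     print(output['n_total'], output['n_remaining'])
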
    def run_cnelsonAppDemo(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_cnelsonAppDemo

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_cnelsonAppDemo function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a FASTA file.
        # We can use the AssemblyUtil module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': filtered_fasta_file
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message':
            'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
            str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # Step 6 - construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))

        #END run_cnelsonAppDemo

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_cnelsonAppDemo return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
        dfu = DataFileUtil(self.callback_url)

        bu = BackgroundUtils()
        TU = TestUtils()
        if params['TESTFLAG'] and params['background']:
            targetpath = '/kb/module/work/tmp/testgenome.fa'
            TU.GetGenome(targetpath)
            bu.BuildBackground(targetpath)
        elif params['background']:

            ws = Workspace('https://appdev.kbase.us/services/ws')
            subset = ws.get_object_subset([{
                'included': [
                    '/features/[*]/location', '/features/[*]/id',
                    '/assembly_ref'
                ],
                'ref':
                params['genome_ref']
            }])
            aref = subset[0]['data']['assembly_ref']
            assembly_ref = {'ref': aref}
            print('Downloading Assembly data as a Fasta file.')
            assemblyUtil = AssemblyUtil(self.callback_url)
            fasta_file = assemblyUtil.get_assembly_as_fasta(
                assembly_ref)['path']
            bu.BuildBackground(fasta_file)

        get_objects_params = {'object_refs': [params['SequenceSetRef']]}

        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')

        fu = FastaUtils()
        if params['mask_repeats']:
            fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

        output = {'fasta_outpath': params['fasta_outpath']}
        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
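
A design note on the FASTA writer in BuildFastaFromSequenceSet above: it emits each sequence as a single unwrapped line, which most parsers accept but which can produce very long lines. A hedged alternative sketch using Biopython, which wraps sequence lines at 60 characters:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def write_sequence_set_fasta(seq_set, fasta_outpath):
    """Write {'sequences': [{'sequence_id': ..., 'sequence': ...}, ...]} as wrapped FASTA."""
    records = [SeqRecord(Seq(s['sequence']), id=s['sequence_id'], description='')
               for s in seq_set['sequences']]
    SeqIO.write(records, fasta_outpath, 'fasta')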
Example #25
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)
        pass
Example #26
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
        ws = Workspace(self.ws_url)

        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)


        # 2) based on type, download the files
        obj_name = self.get_data_obj_name (input_ref)
        type_name = self.get_data_obj_type (input_ref)

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))


        # Standard Single Assembly
        #
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            # create file data
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1})
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # normalize bin file extensions to the requested fasta extension
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            # make sure each fasta file isn't empty
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join (input_dir,fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                        raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                    else:
                        genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i,this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception:
                    raise ValueError ("unable to fetch genome: "+this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError (msg)
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
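
fasta_seq_len_at_least is used several times in stage_input above but is not part of this snippet. A plausible sketch, assuming Biopython, that returns True as soon as the summed sequence length reaches the minimum:

from Bio import SeqIO

# Hypothetical method; would live on the same class as stage_input.
def fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
    """True if the FASTA file holds at least min_fasta_len bases in total."""
    seq_len = 0
    for record in SeqIO.parse(fasta_path, 'fasta'):
        seq_len += len(record.seq)
        if seq_len >= min_fasta_len:
            return True
    return False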
Example #27
    @classmethod
    def setUpClass(cls):
        print('Setting up class')
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = configparser.ConfigParser()
        config.read(config_file)
        cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
        authServiceUrl = cls.cfg.get('auth-service-url',
                "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'GenomeAnnotationAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
        cls.impl = GenomeAnnotationAPI(cls.cfg)

        # Second user
        test_cfg_file = '/kb/module/work/test.cfg'
        test_cfg_text = "[test]\n"
        with open(test_cfg_file, "r") as f:
            test_cfg_text += f.read()
        config = configparser.ConfigParser()
        config.read_file(io.StringIO(test_cfg_text))
        test_cfg_dict = dict(config.items("test"))
        if ('test_token2' not in test_cfg_dict):
            raise ValueError("Configuration in <module>/test_local/test.cfg file should " +
                             "include second user credentials ('test_token2')")
        token2 = test_cfg_dict['test_token2']
        user2 = auth_client.get_user(token2)
        cls.ctx2 = MethodContext(None)
        cls.ctx2.update({'token': token2,
                         'user_id': user2,
                         'provenance': [
                            {'service': 'NarrativeService',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                         'authenticated': 1})
        
        # create one WS for all tests
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        ret = cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        # preload with reference data
        with open('data/rhodobacter.json', 'r') as file:
            data_str = file.read()
        data = json.loads(data_str)
        # save old genome
        info = cls.impl.save_one_genome_v1(cls.ctx, {
               'workspace': wsName,
               'name': "rhodobacter",
               'data': data,
           })[0]['info']
        cls.rhodobacter_ref = str(info[6]) +'/' + str(info[0]) + '/' + str(info[4])
        print('created rhodobacter test genome: ' + cls.rhodobacter_ref)

        assembly_file_path = os.path.join(cls.cfg['scratch'],
                                          'e_coli_assembly.fasta')
        shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {'path': assembly_file_path}
        })
        data = json.load(open('data/new_ecoli_genome.json'))
        data['assembly_ref'] = assembly_ref
        # save new genome
        save_info = {
            'workspace': wsName,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': 'new_ecoli'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
            info[4])
        print('created new test genome')
Example #28
    def test_what_is_fastas(self):
        assembly_util = AssemblyUtil(self.callback_url)
        fastas = assembly_util.get_fastas({'ref_lst': ['41343/11/3']})
        print(fastas)
    def test_annotate_contigs(self):

        assembly_file_name = "small.fna"  # "AP009048.fna"
        assembly_test_file = os.path.join("/kb/module/test/data/", assembly_file_name)
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        shutil.copy(assembly_test_file, assembly_temp_file)
        assembly_name = "Assembly.1"
        au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"])
        assembly_ref = au.save_assembly_from_fasta({"file": {"path": assembly_temp_file},
                                                    "workspace_name": self.getWsName(),
                                                    "assembly_name": assembly_name})
        # Add a genome to the WS to test ref_paths
        genome_name = "Genome.1"
        genome = {"id": "Unknown", "features": [],
                  "scientific_name": "",
                  "domain": "", "genetic_code": 0,
                  "assembly_ref": assembly_ref,
                  "cdss": [], "mrnas": [],
                  "source": "Magic!",
                  "gc_content": 0, "dna_size": 0,
                  "reference_annotation": 0}
        prov = self.getContext().provenance()
        gfu = GenomeFileUtil(os.environ["SDK_CALLBACK_URL"])
        info = gfu.save_one_genome(
            {"workspace": self.getWsName(), "name": genome_name,
             "data": genome, "provenance": prov})["info"]
        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        result = self.getImpl().annotate(self.getContext(),
                                         {"object_ref": "{};{}".format(genome_ref, assembly_ref),
                                          "output_workspace": self.getWsName(),
                                          "output_genome_name": genome_name,
                                          "evalue": None,
                                          "fast": 0,
                                          "gcode": 0,
                                          "genus": "genus",
                                          "kingdom": "Bacteria",
                                          "metagenome": 0,
                                          "mincontiglen": 1,
                                          "norrna": 0,
                                          "notrna": 0,
                                          "rawproduct": 0,
                                          "rfam": 1,
                                          "scientific_name": "Super : diper - name;"
                                          })[0]
        rep = self.getWsClient().get_objects([{"ref": result["report_ref"]}])[0]["data"]
        self.assertTrue("text_message" in rep)
        print("Report:\n" + str(rep["text_message"]))
        genome_ref = self.getWsName() + "/" + genome_name
        genome = self.getWsClient().get_objects([{"ref": genome_ref}])[0]["data"]
        features_to_work = {}
        for feature in genome["features"]:
            features_to_work[feature["id"]] = feature["location"]
        aseq = AssemblySequenceAPI(os.environ["SDK_CALLBACK_URL"], token=self.getContext()["token"])
        dna_sequences = aseq.get_dna_sequences({"requested_features": features_to_work,
                                                "assembly_ref": genome["assembly_ref"]})[
            "dna_sequences"]
        bad_dnas = 0
        for feature in genome["features"]:
            if feature["dna_sequence"] != dna_sequences[feature["id"]]:
                bad_dnas += 1
        self.assertEqual(bad_dnas, 0)
Example #30
    def test_fractionate_contigs_ASSEMBLY_GENOMELIST_05(self):
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_GENOMELIST_05'

        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            gfuClient = GenomeFileUtil(self.callback_url,
                                       token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate gfuClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))

        base_1 = 'assembly_1plus2'
        base_2a = 'assembly_2a'
        base_2b = 'assembly_2b'
        type_1 = 'Assembly'
        type_2a = 'Genome'
        type_2b = 'Genome'
        ass_file_1_fa = base_1 + '.fa.gz'
        ass_file_2a_fa = base_2a + '.fa.gz'
        ass_file_2b_fa = base_2b + '.fa.gz'
        ass_file_2a_gff = base_2a + '.gff'
        ass_file_2b_gff = base_2b + '.gff'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        ass_path_2a_fa = os.path.join(self.scratch, ass_file_2a_fa)
        ass_path_2b_fa = os.path.join(self.scratch, ass_file_2b_fa)
        ass_path_2a_gff = os.path.join(self.scratch, ass_file_2a_gff)
        ass_path_2b_gff = os.path.join(self.scratch, ass_file_2b_gff)
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copy(os.path.join("data", ass_file_2a_fa), ass_path_2a_fa)
        shutil.copy(os.path.join("data", ass_file_2b_fa), ass_path_2b_fa)
        shutil.copy(os.path.join("data", ass_file_2a_gff), ass_path_2a_gff)
        shutil.copy(os.path.join("data", ass_file_2b_gff), ass_path_2b_gff)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {
                'path': ass_path_1_fa
            },
            'workspace_name':
            self.getWsName(),
            'assembly_name':
            base_1 + '.' + type_1
        })
        ass_ref_2a = gfuClient.fasta_gff_to_genome({
            'fasta_file': {
                'path': ass_path_2a_fa
            },
            'gff_file': {
                'path': ass_path_2a_gff
            },
            'generate_missing_genes':
            1,
            'source':
            'GFF',
            'scientific_name':
            base_2a,
            'workspace_name':
            self.getWsName(),
            'genome_name':
            base_2a + '.' + type_2a
        }).get('genome_ref')
        ass_ref_2b = gfuClient.fasta_gff_to_genome({
            'fasta_file': {
                'path': ass_path_2b_fa
            },
            'gff_file': {
                'path': ass_path_2b_gff
            },
            'generate_missing_genes':
            1,
            'source':
            'GFF',
            'scientific_name':
            base_2b,
            'workspace_name':
            self.getWsName(),
            'genome_name':
            base_2b + '.' + type_2b
        }).get('genome_ref')

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'pos'
        params = {
            'workspace_name':
            self.getWsName(),
            'input_assembly_ref':
            ass_ref_1,
            'input_pos_filter_obj_refs': [ass_ref_2a, ass_ref_2b],
            'fractionate_mode':
            fractionate_mode,
            'output_name':
            'test_fractionated' + '-' + base_1 + '.' + type_1 + '-' + base_2a +
            '.' + type_2a + '-' + base_2b + '.' + type_2b + '-' +
            fractionate_mode
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass