def prepareTestData(cls):
    """Write a tiny three-record FASTA to scratch and save it as a test Assembly.

    Stores the resulting object reference on ``cls.assembly_ref``.
    """
    fasta_lines = [
        '>seq1 something soemthing asdf',
        'agcttttcat',
        '>seq2',
        'agctt',
        '>seq3',
        'agcttttcatgg',
    ]
    fasta_path = os.path.join(cls.scratch, 'test1.fasta')
    with open(fasta_path, 'w') as fasta_out:
        # join (no trailing newline) matches the original literal exactly
        fasta_out.write('\n'.join(fasta_lines))
    assembly_util = AssemblyUtil(cls.callback_url)
    cls.assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': fasta_path},
        'workspace_name': cls.wsName,
        'assembly_name': 'TestAssembly',
    })
def stage_assembly_files(self, object_list):
    """
    _stage_assembly_files: download the fasta files to the scratch area
    return list of file names
    """
    log('Processing assembly object list: {}'.format(object_list))
    assembly_util = AssemblyUtil(self.callbackURL)
    staged_paths = []
    for upa in object_list:
        try:
            fasta_path = assembly_util.get_assembly_as_fasta({'ref': upa})['path']
        except ServerError as assembly_error:
            # surface the service-side error before re-raising
            print(str(assembly_error))
            raise
        staged_paths.append(fasta_path)
    log('Created file list: {}'.format(staged_paths))
    return staged_paths
def setUpClass(cls):
    """Build the test context and service clients for kb_staging_exporter tests."""
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    config = ConfigParser()
    config.read(config_file)
    cls.cfg = dict(config.items('kb_staging_exporter'))
    # Getting username from Auth profile for token
    auth_client = _KBaseAuth(cls.cfg['auth-service-url'])
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{'service': 'kb_staging_exporter',
                        'method': 'please_never_use_it_in_production',
                        'method_params': []}],
        'authenticated': 1,
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_staging_exporter(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.ru = ReadsUtils(cls.callback_url)
    cls.au = AssemblyUtil(cls.callback_url)
    cls.gfu = GenomeFileUtil(cls.callback_url, service_ver='dev')
    cls.rau = ReadsAlignmentUtils(cls.callback_url)
def setUpClass(cls):
    """Create a fresh workspace and service clients for the MetagenomeUtils tests."""
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    config = ConfigParser()
    config.read(config_file)
    cls.cfg = dict(config.items('MetagenomeUtils'))
    # Getting username from Auth profile for token
    auth_client = _KBaseAuth(cls.cfg['auth-service-url'])
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{'service': 'MetagenomeUtils',
                        'method': 'please_never_use_it_in_production',
                        'method_params': []}],
        'authenticated': 1,
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = MetagenomeUtils(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    # millisecond suffix keeps workspace names unique across runs
    ws_name = "test_kb_maxbin_" + str(int(time.time() * 1000))
    cls.ws_info = cls.wsClient.create_workspace({'workspace': ws_name})
    cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=token)
def getAssemblyInfo(self, ass_name):
    """Upload a named test assembly (once) and return its workspace object info.

    Results are cached on the class so repeated calls for the same name do
    not re-upload the data.
    """
    cache = getattr(self.__class__, 'assemblyInfo', None)
    if cache is not None and cache.get(ass_name):
        return cache[ass_name]
    # copy the local test file to the shared scratch space so that the AssemblyUtil
    # container can see it.
    local_fasta = os.path.join('data', 'assemblies', ass_name)
    scratch_fasta = os.path.join(self.scratch, os.path.basename(local_fasta))
    shutil.copy(local_fasta, scratch_fasta)
    # call the AssemblyUtil libary to upload the test data to KBase
    assembly_util = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    ass_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': scratch_fasta},
        'workspace_name': self.getWsName(),
        'assembly_name': ass_name,
    })
    # get the object metadata for the new test dataset
    new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': ass_ref}]})
    if cache is None:
        self.__class__.assemblyInfo = dict()
    self.__class__.assemblyInfo[ass_name] = new_obj_info[0]
    return new_obj_info[0]
def setUpClass(cls):
    """Build the test context and service clients for the ProkkaAnnotation tests."""
    config_file = environ.get("KB_DEPLOYMENT_CONFIG", None)
    config = ConfigParser()
    config.read(config_file)
    cls.cfg = dict(config.items("ProkkaAnnotation"))
    # Token validation
    token = environ.get("KB_AUTH_TOKEN", None)
    auth_service_url = cls.cfg.get(
        "auth-service-url",
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(auth_service_url)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        "token": token,
        "user_id": user_id,
        "provenance": [{"service": "ProkkaAnnotation",
                        "method": "please_never_use_it_in_production",
                        "method_params": []}],
        "authenticated": 1,
    })
    cls.wsURL = cls.cfg["workspace-url"]
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = ProkkaAnnotation(cls.cfg)
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.gfu = GenomeFileUtil(cls.callback_url)
    cls.au = AssemblyUtil(cls.callback_url)
    cls.scratch = cls.cfg['scratch']
def load_genome_direct(cls, filename, assembly_filename, obj_name):
    """Save an Assembly from FASTA and a Genome object from a JSON template.

    :param filename: path to a JSON file holding the Genome object data
    :param assembly_filename: path to the assembly FASTA to upload
    :param obj_name: base name for the saved objects ('.assembly' / '.genome'
        suffixes are appended)
    :returns: the workspace reference string ("wsid/objid/version") of the
        saved genome
    """
    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    # stage the FASTA in scratch so the AssemblyUtil container can read it
    assembly_path = os.path.join(cls.cfg['scratch'],
                                 os.path.basename(assembly_filename))
    shutil.copy(assembly_filename, assembly_path)
    assembly_ref = au.save_assembly_from_fasta({
        'workspace_name': cls.wsName,
        'assembly_name': obj_name + '.assembly',
        'file': {'path': assembly_path},
    })
    # Fixed: json.load(open(...)) leaked the file handle; use a context manager.
    with open(filename) as genome_json:
        data = json.load(genome_json)
    data['assembly_ref'] = assembly_ref
    save_info = {
        'workspace': cls.wsName,
        'objects': [{
            'data': data,
            'name': obj_name + '.genome',
            'type': 'KBaseGenomes.Genome',
        }],
    }
    info = cls.wsClient.save_objects(save_info)[0]['info']
    # info tuple layout: [0]=objid, [4]=version, [6]=wsid
    ref = f"{info[6]}/{info[0]}/{info[4]}"
    print('created test genome: ' + ref + ' from file ' + filename)
    return ref
def __init__(self):
    """Initialize SDK utility clients from the callback URL environment variable."""
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.au = AssemblyUtil(self.callback_url)
    self.gfu = GenomeFileUtil(self.callback_url)
def __init__(self, config):
    """Cache the scratch path and build the service clients this utility uses.

    :param config: dict with at least 'scratch', 'workspace-url' and 'token'
    """
    self.scratch = os.path.abspath(config['scratch'])
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.mgu = MetagenomeUtils(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.ws = Workspace(config['workspace-url'], token=config['token'])
def finish_run(self, params):
    """
    Finish up a HipMer run: locate the output contigs, filter short contigs,
    save the Assembly object, and create a report with a QUAST summary.

    :param params: dict with 'output_contigset_name', 'workspace_name' and
        'assembly_size_filter' (minimum contig length to keep)
    :returns: dict with 'report_name' and 'report_ref'
    :raises RuntimeError: if the HipMer output contigs cannot be found
    :raises ValueError: if the size filter removes every contig
    """
    console = []
    self.log(console, 'Running post')
    self.log(console, 'running hipmer:')

    # grab path of output contigs by scanning scratch; if several matches
    # exist, the last one walked wins (original behavior preserved)
    output_contigs = ''
    for root, subdirs, files in os.walk(self.scratch):
        for f in files:
            if f == 'final_assembly.fa':
                output_contigs = os.path.join(root, f)
                print("found OUTPUT CONTIGS {}".format(output_contigs))
                continue

    output_name = params['output_contigset_name']
    slurm_out = os.path.join(self.scratch, 'slurm.out')
    if not os.path.exists(output_contigs):
        self.log(console, "It looks like HipMER failed. Could not find the output contigs.")
        self.log(console, "Show errors in log file")
        with open(slurm_out, 'r') as f:
            for line in f:
                if line.lower().find('error') >= 0:
                    self.log(console, line)
        raise RuntimeError("Error in HipMER execution")

    wsname = params['workspace_name']
    self.log(console, 'Filtering short length contigs from HipMer assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL, token=self.token)
    assembly_size_filter = params['assembly_size_filter']
    filtered_fasta_file_path = self.filter_contigs_by_length(output_contigs,
                                                             assembly_size_filter)
    if os.stat(filtered_fasta_file_path).st_size == 0:
        # Fixed: the message previously contained a raw embedded line break.
        raise ValueError("Error: Using input parameters, you have filtered all "
                         "contigs from the HipMer assembly. Decrease the minimum "
                         "contig size and try again.")
    output_contigs = filtered_fasta_file_path

    self.log(console, 'Uploading FASTA file to Assembly')
    save_input = {'file': {'path': output_contigs},
                  'workspace_name': wsname,
                  'assembly_name': output_name}
    output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)

    # create a Report
    # compute a simple contig length distribution for the report
    lengths = [len(seq_record.seq) for seq_record in SeqIO.parse(output_contigs, 'fasta')]
    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/'
    report += params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'
    bins = 10
    counts, edges = np.histogram(lengths, bins)
    report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
    for c in range(bins):
        # Fixed: format string was ' \%d\t--\t%d', which printed a literal
        # backslash before every count in the report.
        report += ' %d\t--\t%d' % (counts[c], edges[c])
        report += ' to %d bp\n' % (edges[c + 1])

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                             'label': params['output_contigset_name']}]})
    except Exception as e:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from running QUAST')
        print((str(e)))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref,
                                  'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}],
             # NOTE(review): name says "megahit" in HipMer code -- looks like
             # copy-paste; kept unchanged for backward compatibility.
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']})
    except Exception as e:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from creating report object')
        print((str(e)))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']}
    return output
def test_fractiontate_contigs_ASSEMBLY_GENOMESET_06(self):
    """Fractionate an Assembly against a GenomeSet filter in 'neg' mode."""
    method = 'fractionate_contigs_pos_filter_ASSEMBLY_GENOMESET_06'
    print("\n\nRUNNING: test_" + method + "()")
    print("==========================================================\n\n")

    # upload test data
    try:
        auClient = AssemblyUtil(self.callback_url, token=self.getContext()['token'])
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: ' +
                         self.callback_url + ' ERROR: ' + str(e))
    try:
        gfuClient = GenomeFileUtil(self.callback_url, token=self.getContext()['token'])
    except Exception as e:
        raise ValueError('Unable to instantiate gfuClient with callbackURL: ' +
                         self.callback_url + ' ERROR: ' + str(e))

    base_1 = 'assembly_1plus2'
    base_2a = 'assembly_2a'
    base_2b = 'assembly_2b'
    type_1 = 'Assembly'
    type_2a = 'Genome'
    type_2b = 'Genome'

    # stage input FASTA / GFF files in scratch so the SDK containers see them
    ass_file_1_fa = base_1 + '.fa.gz'
    ass_file_2a_fa = base_2a + '.fa.gz'
    ass_file_2b_fa = base_2b + '.fa.gz'
    ass_file_2a_gff = base_2a + '.gff'
    ass_file_2b_gff = base_2b + '.gff'
    ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
    ass_path_2a_fa = os.path.join(self.scratch, ass_file_2a_fa)
    ass_path_2b_fa = os.path.join(self.scratch, ass_file_2b_fa)
    ass_path_2a_gff = os.path.join(self.scratch, ass_file_2a_gff)
    ass_path_2b_gff = os.path.join(self.scratch, ass_file_2b_gff)
    shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
    shutil.copy(os.path.join("data", ass_file_2a_fa), ass_path_2a_fa)
    shutil.copy(os.path.join("data", ass_file_2b_fa), ass_path_2b_fa)
    shutil.copy(os.path.join("data", ass_file_2a_gff), ass_path_2a_gff)
    shutil.copy(os.path.join("data", ass_file_2b_gff), ass_path_2b_gff)

    ass_ref_1 = auClient.save_assembly_from_fasta({
        'file': {'path': ass_path_1_fa},
        'workspace_name': self.getWsName(),
        'assembly_name': base_1 + '.' + type_1,
    })
    ass_ref_2a = gfuClient.fasta_gff_to_genome({
        'fasta_file': {'path': ass_path_2a_fa},
        'gff_file': {'path': ass_path_2a_gff},
        'generate_missing_genes': 1,
        'source': 'GFF',
        'scientific_name': base_2a,
        'workspace_name': self.getWsName(),
        'genome_name': base_2a + '.' + type_2a,
    }).get('genome_ref')
    ass_ref_2b = gfuClient.fasta_gff_to_genome({
        'fasta_file': {'path': ass_path_2b_fa},
        'gff_file': {'path': ass_path_2b_gff},
        'generate_missing_genes': 1,
        'source': 'GFF',
        'scientific_name': base_2b,
        'workspace_name': self.getWsName(),
        'genome_name': base_2b + '.' + type_2b,
    }).get('genome_ref')

    # GenomeSet
    genomeSet_obj = {
        'description': 'test genomeSet',
        'elements': {
            'genome_0': {'ref': ass_ref_2a},
            'genome_1': {'ref': ass_ref_2b},
        },
    }
    provenance = [{}]
    genomeSet_info = self.getWsClient().save_objects({
        'workspace': self.getWsName(),
        'objects': [{
            'type': 'KBaseSearch.GenomeSet',
            'data': genomeSet_obj,
            'name': 'test_genomeSet_2a2b',
            'meta': {},
            'provenance': provenance,
        }],
    })[0]
    genomeSet_ref = '/'.join([str(genomeSet_info[WSID_I]),
                              str(genomeSet_info[OBJID_I]),
                              str(genomeSet_info[VERSION_I])])

    # run method
    base_output_name = method + '_output'
    fractionate_mode = 'neg'
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_ref': ass_ref_1,
        'input_pos_filter_obj_refs': [genomeSet_ref],
        'fractionate_mode': fractionate_mode,
        'output_name': 'test_fractionated' + '-' + base_1 + '.' + type_1 +
                       '-' + 'genomeset_2a2b' + '-' + fractionate_mode,
    }
    result = self.getImpl().run_fractionate_contigs(self.getContext(), params)
    print('RESULT:')
    pprint(result)
def setUpClass(cls):
    """Create a workspace and two shared test Assembly objects for SetAPI tests."""
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    config = ConfigParser()
    config.read(config_file)
    cls.cfg = dict(config.items('SetAPI'))
    auth_service_url = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(auth_service_url)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{'service': 'SetAPI',
                        'method': 'please_never_use_it_in_production',
                        'method_params': []}],
        'authenticated': 1,
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = SetAPI(cls.cfg)
    # setup data at the class level for now (so that the code is run
    # once for all tests, not before each test case. Not sure how to
    # do that outside this function..)
    cls.wsName = "test_SetAPI_" + str(int(time.time() * 1000))
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    # copy test file to scratch area
    fna_filename = "seq.fna"
    fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
    shutil.copy(os.path.join("data", fna_filename), fna_path)
    assembly_util = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    cls.assembly1ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': fna_path},
        'workspace_name': cls.wsName,
        'assembly_name': 'assembly_obj_1',
    })
    cls.assembly2ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': fna_path},
        'workspace_name': cls.wsName,
        'assembly_name': 'assembly_obj_2',
    })
def run_megahit(self, ctx, params):
    """
    Run MEGAHIT on a paired-end read library and save the resulting assembly.

    Required params: workspace_name, read_library_ref (PE library),
    output_contigset_name.
    Optional tuning params: megahit_parameter_preset, min_count, k_min,
    k_max, k_step, k_list, min_contig_length, max_mem_percent.

    :returns: list with one dict: {'report_name': ..., 'report_ref': ...}
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_megahit
    print('Running run_megahit with params=')
    pprint(params)

    # STEP 1: basic parameter checks + parsing
    for required in ('workspace_name', 'read_library_ref', 'output_contigset_name'):
        if required not in params:
            raise ValueError(required + ' parameter is required')

    # STEP 2: get the read library as deinterleaved fastq files
    input_ref = params['read_library_ref']
    ru = ReadsUtils(self.callbackURL)
    reads = ru.download_reads({'read_libraries': [input_ref],
                               'interleaved': 'false',
                               'gzipped': None})['files']
    print('Input reads files:')
    fwd = reads[input_ref]['files']['fwd']
    rev = reads[input_ref]['files']['rev']
    pprint('forward: ' + fwd)
    pprint('reverse: ' + rev)

    # STEP 3: build the megahit command (PE reads only)
    megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

    # if a preset is defined, use that:
    if params.get('megahit_parameter_preset'):
        megahit_cmd += ['--presets', params['megahit_parameter_preset']]
    if params.get('min_count'):
        megahit_cmd += ['--min-count', str(params['min_count'])]
    if params.get('k_min'):
        megahit_cmd += ['--k-min', str(params['k_min'])]
    if params.get('k_max'):
        megahit_cmd += ['--k-max', str(params['k_max'])]
    if params.get('k_step'):
        megahit_cmd += ['--k-step', str(params['k_step'])]
    if params.get('k_list'):
        megahit_cmd += ['--k-list', ','.join(str(k_val) for k_val in params['k_list'])]

    min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
    if params.get('min_contig_length'):
        if str(params['min_contig_length']).isdigit():
            min_contig_length = params['min_contig_length']
        else:
            raise ValueError('min_contig_length parameter must be a non-negative integer')
    megahit_cmd += ['--min-contig-len', str(min_contig_length)]

    # Set the number of CPUs to the number of cores minus 1
    megahit_cmd += ['--num-cpu-threads',
                    str(max(multiprocessing.cpu_count() - 1, 1))]

    # set mem usage
    # Note: this just sets the default value - 90% of available system memory
    # allocated to the container. Exposing it here as a place to later expose
    # as a parameter.
    megahit_cmd += ['-m', str(params.get('max_mem_percent', 0.9))]

    # set the output location
    timestamp = int((datetime.utcnow() -
                     datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    megahit_cmd += ['-o', output_dir]

    # run megahit
    print('running megahit:')
    print(' ' + ' '.join(megahit_cmd))
    p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
    retcode = p.wait()
    print('Return code: ' + str(retcode))
    if p.returncode != 0:
        raise RuntimeError(report_megahit_error(output_dir, retcode))

    output_contigs = os.path.join(output_dir, 'final.contigs.fa')

    # on macs, we cannot run megahit in the shared host scratch space,
    # so we need to move the file there
    if self.mac_mode:
        shutil.move(output_contigs,
                    os.path.join(self.host_scratch, 'final.contigs.fa'))
        output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

    # STEP 4: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL)
    output_data_ref = assemblyUtil.save_assembly_from_fasta(
        {'file': {'path': output_contigs},
         'workspace_name': params['workspace_name'],
         'assembly_name': params['output_contigset_name']})

    # STEP 5: generate and save the report
    # compute a simple contig length distribution for the report
    lengths = [len(seq_record.seq)
               for seq_record in SeqIO.parse(output_contigs, 'fasta')]

    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'
    bins = 10
    counts, edges = np.histogram(lengths, bins)
    report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
    for c in range(bins):
        report += ' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST(
            {'files': [{'path': output_contigs,
                        'label': params['output_contigset_name']}]})
    except ServerError as quast_err:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from running QUAST')
        print(str(quast_err))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref,
                                  'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}],
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']})
    except ServerError as report_err:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from creating report object')
        print(str(report_err))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']}
    #END run_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_kraken2(self, ctx, params):
    """
    Run Kraken2 taxonomic classification on an assembly or on reads and
    build a KBaseReport with the Kraken2/Krona outputs.

    :param params: instance of mapping from String to unspecified object;
        requires 'workspace_name' and 'db_type', plus exactly one of
        'input_genomes' (assembly ref), 'input_refs' (single-end reads) or
        'input_paired_refs' (paired-end reads)
    :returns: instance of type "ReportResults" -> structure: parameter
        "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kraken2
    logging.info('Calling run_kraken2')
    logging.info(f'params {params}')

    # Check for presence of input file types in params
    input_genomes = ('input_genomes' in params
                     and len(params['input_genomes']) > 0
                     and None not in params['input_genomes'])
    input_refs = ('input_refs' in params
                  and len(params['input_refs']) > 0
                  and None not in params['input_refs'])
    input_paired_refs = ('input_paired_refs' in params
                         and len(params['input_paired_refs']) > 0
                         and None not in params['input_paired_refs'])

    for name in ['workspace_name', 'db_type']:
        if name not in params:
            raise ValueError('Parameter "' + name + '" is required but missing')
    if not input_genomes and not input_refs and not input_paired_refs:
        raise ValueError('You must enter either an input genome or input reads')
    if input_refs and input_paired_refs:
        raise ValueError('You must enter either single-end or paired-end reads, '
                         'but not both')
    if input_genomes and (input_refs or input_paired_refs):
        raise ValueError('You must enter either an input genome or input reads, '
                         'but not both')
    if input_genomes and (not isinstance(params['input_genomes'][0], str)):
        raise ValueError('Pass in a valid input genome string')
    if input_refs and (not isinstance(params['input_refs'], list)):
        raise ValueError('Pass in a list of input references')
    if input_paired_refs and (not isinstance(params['input_paired_refs'], list)):
        raise ValueError('Pass in a list of input references')

    logging.info(params['db_type'])
    logging.info(
        f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}'
    )

    # Download input data as FASTA or FASTQ and collect the file paths that
    # will be handed to the kraken2 wrapper script.
    input_string = []
    if input_genomes:
        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file_obj = assembly_util.get_assembly_as_fasta(
            {'ref': params['input_genomes'][0]})
        logging.info(fasta_file_obj)
        input_string.append(fasta_file_obj['path'])
    if input_refs:
        logging.info('Downloading Reads data as a Fastq file.')
        logging.info(f"input_refs {params['input_refs']}")
        readsUtil = ReadsUtils(self.callback_url)
        download_reads_output = readsUtil.download_reads(
            {'read_libraries': params['input_refs']})
        print(f"Input parameters {params['input_refs']}, {params['db_type']}"
              f"download_reads_output {download_reads_output}")
        fastq_files = []
        for val in download_reads_output['files'].values():
            if val['files'].get('fwd'):
                fastq_files.append(val['files']['fwd'])
            if val['files'].get('rev'):
                fastq_files.append(val['files']['rev'])
        logging.info(f"fastq files {fastq_files}")
        # single-end reads go in as one space-separated argument
        input_string.append(' '.join(fastq_files))
    if input_paired_refs:
        logging.info('Downloading Reads data as a Fastq file.')
        logging.info(f"input_refs {params['input_paired_refs']}")
        readsUtil = ReadsUtils(self.callback_url)
        download_reads_output = readsUtil.download_reads(
            {'read_libraries': params['input_paired_refs']})
        print(f"Input parameters {params['input_paired_refs']}, {params['db_type']}"
              f"download_reads_output {download_reads_output}")
        fastq_files = []
        for val in download_reads_output['files'].values():
            if val['files'].get('fwd'):
                fastq_files.append(val['files']['fwd'])
            if val['files'].get('rev'):
                fastq_files.append(val['files']['rev'])
        logging.info(f"fastq files {fastq_files}")
        # paired-end reads are passed as separate arguments
        input_string.extend(fastq_files)

    logging.info(f'input_string {input_string}')

    # Run the kraken2 wrapper script against the selected reference DB
    output_dir = os.path.join(self.shared_folder, 'kraken2_output')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    outprefix = "kraken2"
    cmd = ['/kb/module/lib/kraken2/src/kraken2.sh',
           '-d', '/data/kraken2/' + params['db_type'],
           '-o', output_dir,
           '-p', outprefix,
           '-t', '1',
           '-i']
    cmd.extend(input_string)
    logging.info(f'cmd {cmd}')
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    logging.info(f'subprocess {p.communicate()}')

    # Assemble the HTML report directory from the wrapper's outputs
    summary_file = os.path.join(output_dir, outprefix + '.report.csv')
    report_dir = os.path.join(output_dir, 'html_report')
    if not os.path.exists(report_dir):
        os.makedirs(report_dir)
    summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html')
    self._generate_DataTable(summary_file, summary_file_dt)
    shutil.copy2('/kb/module/lib/kraken2/src/index.html',
                 os.path.join(report_dir, 'index.html'))
    shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'),
                 os.path.join(report_dir, 'kraken2.krona.html'))
    shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'),
                os.path.join(report_dir, 'kraken2.tree.svg'))
    html_zipped = self.package_folder(report_dir, 'index.html', 'index.html')

    # Step 5 - Build a Report and return
    objects_created = []
    output_files_list = []
    for output in os.listdir(output_dir):
        # BUG FIX: the directory test must use the full path; the old code
        # checked the bare filename against the CWD, so subdirectories
        # (e.g. html_report) were wrongly offered as downloadable files.
        full_path = os.path.join(output_dir, output)
        if not os.path.isdir(full_path):
            output_files_list.append({'path': full_path, 'name': output})

    message = f"Kraken2 run finished on {input_string} against {params['db_type']}."
    report_params = {'message': message,
                     'workspace_name': params.get('workspace_name'),
                     'objects_created': objects_created,
                     'file_links': output_files_list,
                     'html_links': [html_zipped],
                     'direct_html_link_index': 0,
                     'html_window_height': 460}

    # STEP 6: construct the output to send back
    kbase_report_client = KBaseReport(self.callback_url)
    report_output = kbase_report_client.create_extended_report(report_params)
    report_output['report_params'] = report_params
    logging.info(report_output)

    # Return references which will allow inline display of
    # the report in the Narrative
    output = {'report_name': report_output['name'],
              'report_ref': report_output['ref'],
              'report_params': report_output['report_params']}
    #END run_kraken2

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_kraken2 return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_rmrContigFilter(self, ctx, params):
    """
    Example app which filters contigs in an assembly using a minimum
    contig length, then builds an HTML report of base composition
    (A/C/G/T percentages) for the contigs that were kept.

    :param params: instance of mapping from String to unspecified object.
        Required keys: 'workspace_name', 'assembly_input_ref', 'min_length'.
    :returns: instance of type "ReportResults" -> structure:
        parameter "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing or min_length
        is not a non-negative integer.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_rmrContigFilter
    # Print statements to stdout/stderr are captured and available as the App log
    logging.info('Starting run_rmrContigFilter function. Params=' +
                 pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors.
    # Parameter values go through basic validation in the Narrative, but
    # advanced users or other SDK modules can call this method directly,
    # so validation is still important.
    logging.info('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError(
            'Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError(
            'Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError(
            'Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except ValueError:
        raise ValueError(
            'Cannot parse integer from min_length parameter (' +
            str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input Assembly object as a FASTA file.
    # AssemblyUtil returns the path of the file it created.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})

    # Step 3 - Filter: keep only contigs of at least min_length bases,
    # writing the survivors to a new FASTA file in the scratch area.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1
    logging.info('Filtered Assembly to ' + str(n_remaining) +
                 ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system, reusing the
    # original assembly's name.
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {
            'path': filtered_fasta_file
        },
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 4b - Build the HTML report: one table row per kept contig with
    # its A/C/G/T percentages.
    html_header = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>title</title></head><body><table>"
    html_footer = "</table></body></html>"
    tableentries = "<tr><th>ID</th><th>A %</th><th>C %</th><th>T %</th><th>G %</th></tr>"
    for contig in good_contigs:
        seq = contig.seq.upper()
        Acount = seq.count('A')
        Ccount = seq.count('C')
        Tcount = seq.count('T')
        Gcount = seq.count('G')
        total = Acount + Ccount + Tcount + Gcount
        if total == 0:
            # FIX: a contig made entirely of non-ACGT characters (e.g. all
            # N's) previously raised ZeroDivisionError; report 0% instead.
            Aper = Cper = Gper = Tper = 0.0
        else:
            Aper = 100 * (Acount / total)
            Cper = 100 * (Ccount / total)
            Gper = 100 * (Gcount / total)
            Tper = 100 * (Tcount / total)
        tmprow = "<tr><td>" + contig.id + "</td><td>" + str(round(
            Aper, 2)) + "</td><td>" + str(round(Cper, 2)) + "</td><td>" + str(
                round(Tper, 2)) + "</td><td>" + str(round(
                    Gper, 2)) + "</td></tr>"
        tableentries += tmprow

    # Assemble the full HTML document and write it to the shared folder so
    # the report service can pick it up.
    html_str = html_header + tableentries + html_footer
    html_file_dir = os.path.join(self.shared_folder, 'html')
    if not os.path.isdir(html_file_dir):
        os.mkdir(html_file_dir)
    html_file_path = os.path.join(html_file_dir, 'output_table.html')
    # FIX: use a context manager so the file handle is always closed.
    with open(html_file_path, "w") as html_file:
        html_file.write(html_str)

    # Step 5 - Build an extended Report (with the HTML link) and return.
    reportObj = {
        'objects_created': [{
            'ref': new_assembly,
            'description': 'Filtered contigs'
        }],
        'message':
        'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
        str(n_total),
        'direct_html': None,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{
            'path': html_file_dir,
            'name': 'output_table.html',
            'description': 'HTML report for contig filtering'
        }],
        'workspace_name': params['workspace_name'],
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create_extended_report(reportObj)

    # STEP 6: construct the output to send back
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'assembly_output': new_assembly,
        'n_initial_contigs': n_total,
        'n_contigs_removed': n_total - n_remaining,
        'n_contigs_remaining': n_remaining
    }
    logging.info('returning:' + pformat(output))
    #END run_rmrContigFilter

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_rmrContigFilter return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_rmrContigFilter_max(self, ctx, params):
    """
    New app which filters contigs in an assembly using both a minimum
    and a maximum contig length, saving the result under a user-chosen
    assembly name.

    :param params: instance of type "rmrContigFiltermaxinput" ->
        structure: parameter "output_workspace" of String, parameter
        "assembly_input_ref" of type "data_obj_ref", parameter
        "output_assembly_name" of String, parameter "min_length" of Long,
        parameter "max_length" of Long, parameter "report_ref" of String,
        parameter "report_name" of String
    :returns: instance of type "ReportResultsmax" -> structure:
        parameter "objNameOrId" of type "assembly_ref", parameter
        "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing, a length is
        not a non-negative integer, or min_length >= max_length.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_rmrContigFilter_max
    # Print statements to stdout/stderr are captured and available as the App log
    logging.info('Starting run_rmrContigFilter_max function. Params=' +
                 pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors.
    logging.info('Validating parameters.')
    if 'output_workspace' not in params:
        raise ValueError(
            'Parameter output_workspace is not set in input arguments')
    workspace_name = params['output_workspace']
    if 'assembly_input_ref' not in params:
        raise ValueError(
            'Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError(
            'Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except ValueError:
        raise ValueError(
            'Cannot parse integer from min_length parameter (' +
            str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')
    if 'max_length' not in params:
        raise ValueError(
            'Parameter max_length is not set in input arguments')
    max_length_orig = params['max_length']
    max_length = None
    try:
        max_length = int(max_length_orig)
    except ValueError:
        raise ValueError(
            'Cannot parse integer from max_length parameter (' +
            str(max_length_orig) + ')')
    if max_length < 0:
        raise ValueError('max_length parameter cannot be negative (' +
                         str(max_length) + ')')
    # an empty range would silently filter out every contig
    if min_length >= max_length:
        raise ValueError(
            'max_length cannot be less than or equal to min_length')

    # Step 2 - Download the input Assembly object as a FASTA file.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})

    # Step 3 - Keep only contigs whose length is within [min, max],
    # writing the survivors to a new FASTA file in the scratch area.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length and len(record.seq) <= max_length:
            good_contigs.append(record)
            n_remaining += 1
    logging.info('Filtered Assembly to ' + str(n_remaining) +
                 ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system under the
    # user-supplied output name.
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {
            'path': filtered_fasta_file
        },
        'workspace_name': workspace_name,
        'assembly_name': params['output_assembly_name']
    })

    # Step 5 - Build an extended Report so the new assembly shows up in
    # an assembly viewer with its dynamic table, and return.
    report = KBaseReport(self.callback_url)
    report_info = report.create_extended_report({
        "message":
        'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
        str(n_total),
        "objects_created": [{
            'ref': new_assembly,
            'description': 'Filtered contigs'
        }],
        "workspace_name": params["output_workspace"]
    })

    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'objNameOrId': params["output_assembly_name"],
        'wsNameOrId': params['output_workspace'],
    }
    logging.info('returning:' + pformat(output))
    #END run_rmrContigFilter_max

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_rmrContigFilter_max return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def setUpClass(cls):
    """Set up shared test fixtures for the GenomeAnnotationAPI tests.

    Reads the deployment config, builds an authenticated MethodContext,
    creates a throwaway workspace, and saves two test genomes into it:
    an "old" ContigSet-based genome and a "new" Assembly-based genome.
    """
    token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    config = configparser.ConfigParser()
    config.read(config_file)
    cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{
            'service': 'GenomeAnnotationAPI',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
    cls.impl = GenomeAnnotationAPI(cls.cfg)
    test_gbk_file = "/kb/module/test/data/kb_g.399.c.1.gbk"
    temp_gbk_file = "/kb/module/work/tmp/kb_g.399.c.1.gbk"
    shutil.copy(test_gbk_file, temp_gbk_file)
    # timestamp suffix keeps workspace names unique across runs
    suffix = int(time.time() * 1000)
    wsName = "test_GenomeAnnotationAPI_" + str(suffix)
    cls.ws.create_workspace({'workspace': wsName})
    cls.wsName = wsName
    # FIX: use context managers so the JSON fixture file handles are
    # closed (json.load(open(...)) leaked them).
    with open('data/rhodobacter_contigs.json') as f:
        data = json.load(f)
    # save to ws
    save_info = {
        'workspace': wsName,
        'objects': [{
            'type': 'KBaseGenomes.ContigSet',
            'data': data,
            'name': 'rhodo_contigs'
        }]
    }
    info = cls.ws.save_objects(save_info)[0]
    contigset_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    with open('data/rhodobacter.json') as f:
        data = json.load(f)
    data['contigset_ref'] = contigset_ref
    # save to ws
    info = cls.impl.save_one_genome_v1(cls.ctx, {
        'workspace': wsName,
        'name': "rhodobacter",
        'data': data,
    })[0]['info']
    cls.old_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
        info[4])
    print('created old test genome')
    assembly_file_path = os.path.join(cls.cfg['scratch'],
                                      'e_coli_assembly.fasta')
    shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_ref = au.save_assembly_from_fasta({
        'workspace_name': cls.wsName,
        'assembly_name': 'ecoli.assembly',
        'file': {
            'path': assembly_file_path
        }
    })
    with open('data/new_ecoli_genome.json') as f:
        data = json.load(f)
    data['assembly_ref'] = assembly_ref
    # save to ws
    save_info = {
        'workspace': wsName,
        'objects': [{
            'type': 'KBaseGenomes.Genome',
            'data': data,
            'name': 'new_ecoli'
        }]
    }
    info = cls.ws.save_objects(save_info)[0]
    cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
        info[4])
    print('created new test genome')
def run_unicycler(self, ctx, params):
    """
    Run the Unicycler assembler.

    Requires at least one short paired-end reads library OR one long
    reads library. All reads of the same type must be combined into a
    single file.

    :param params: instance of type "UnicyclerParams" -> structure:
        parameter "workspace_name" of String (workspace for input and
        output), parameter "output_contigset_name" of String, parameter
        "short_paired_libraries" of list of type "paired_lib", parameter
        "short_unpaired_libraries" of list of type "unpaired_lib",
        parameter "long_reads_library" of String, parameter
        "min_contig_length" of Long, parameter "num_linear_seqs" of Long,
        parameter "bridging_mode" of String
    :returns: instance of type "UnicyclerOutput" -> structure: parameter
        "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_unicycler
    # console/warnings accumulate log lines for the final report
    console = []
    warnings = []
    self.log(
        console, 'Running run_unicycler with params:\n{}'.format(
            json.dumps(params, indent=1)))
    token = self.cfg['KB_AUTH_TOKEN']

    # param checks
    # NOTE(review): 'min_long_read_length' is read unconditionally below
    # when a long reads library is given, but it is not in this required
    # list — a missing key would raise KeyError, not a friendly error.
    required_params = [
        'workspace_name', 'output_contigset_name', 'min_contig_length',
        'num_linear_seqs', 'bridging_mode'
    ]
    for required_param in required_params:
        if required_param not in params or params[required_param] is None:
            raise ValueError("Must define required param: '" +
                             required_param + "'")

    # needs either short paired or long
    if ('short_paired_libraries' not in params
            or params['short_paired_libraries'] is None
            or len(params['short_paired_libraries'])
            == 0) and ('long_reads_library' not in params
                       or params['long_reads_library'] is None):
        raise ValueError(
            "Must define either short_paired_libraries or long_reads_library"
        )

    # load provenance: record all input workspace objects so the run is
    # reproducible/traceable
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    if 'input_ws_objects' not in provenance[0]:
        provenance[0]['input_ws_objects'] = []
    if 'short_paired_libraries' in params and params[
            'short_paired_libraries'] is not None and len(
                params['short_paired_libraries']) > 0:
        provenance[0]['input_ws_objects'].extend(
            params['short_paired_libraries'])
    if 'short_unpaired_libraries' in params and params[
            'short_unpaired_libraries'] is not None and len(
                params['short_unpaired_libraries']) > 0:
        provenance[0]['input_ws_objects'].extend(
            params['short_unpaired_libraries'])
    if 'long_reads_library' in params and params[
            'long_reads_library'] is not None:
        provenance[0]['input_ws_objects'].append(
            params['long_reads_library'])

    # build command line
    # NOTE(review): the command is assembled as a single string and run
    # with shell=True below; paths containing spaces or shell
    # metacharacters would break quoting. Consider an argument list with
    # shell=False — confirm no caller relies on shell expansion.
    cmd = 'unicycler'

    # download, split, and recombine short paired libraries
    if 'short_paired_libraries' in params and params[
            'short_paired_libraries'] is not None and len(
                params['short_paired_libraries']) > 0:
        short1, short2 = self.download_short_paired(
            console, token, params['workspace_name'],
            params['short_paired_libraries'])
        cmd += ' -1 ' + short1 + ' -2 ' + short2

    # download and combine short unpaired libraries
    if 'short_unpaired_libraries' in params and params[
            'short_unpaired_libraries'] is not None and len(
                params['short_unpaired_libraries']) > 0:
        unpaired = self.download_short_unpaired(
            console, token, params['workspace_name'],
            params['short_unpaired_libraries'])
        cmd += ' -s ' + unpaired

    # download long library
    if 'long_reads_library' in params and params[
            'long_reads_library'] is not None:
        longLib = self.download_long(console, warnings, token,
                                     params['workspace_name'],
                                     params['long_reads_library'],
                                     params['min_long_read_length'])
        cmd += ' -l ' + longLib

    # other params
    cmd += ' --min_fasta_length ' + str(params['min_contig_length'])
    cmd += ' --linear_seqs ' + str(params['num_linear_seqs'])
    cmd += ' --mode ' + str(params['bridging_mode'])
    # --keep 0 tells Unicycler to keep only the final output files
    cmd += ' --keep 0'
    if ('no_correct' in params and (params['no_correct'] == 1)):
        cmd += ' --no_correct'

    # output directory (unique per run so concurrent jobs don't collide)
    outputDir = os.path.join(self.scratch,
                             "unicycler_" + str(uuid.uuid4()))
    self.mkdir_p(outputDir)
    cmd += ' -o ' + outputDir

    # run it, streaming assembler output into the console log
    self.log(console, "command: " + cmd)
    cmdProcess = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  shell=True)
    for line in cmdProcess.stdout:
        self.log(console, line.decode("utf-8").rstrip())
    cmdProcess.wait()
    if cmdProcess.returncode != 0:
        raise ValueError('Error running ' + cmd)

    # save assembly produced by Unicycler back into the workspace
    try:
        contigsPath = os.path.join(outputDir, 'assembly.fasta')
        auClient = AssemblyUtil(url=self.callbackURL,
                                token=token,
                                service_ver='release')
        auClient.save_assembly_from_fasta({
            'file': {
                'path': contigsPath
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })
    except Exception as e:
        raise ValueError('Error saving assembly\n' + str(e))

    # make report
    report_name, report_ref = self.generate_report(
        console, warnings, contigsPath, params, outputDir,
        params['workspace_name'])
    output = {'report_name': report_name, 'report_ref': report_ref}
    #END run_unicycler

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_unicycler return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def download_long(self, console, warnings, token, wsname, lib,
                  min_long_read_length):
    """Download a long-reads input to the local filesystem.

    The input may be an Assembly/ContigSet object (downloaded as FASTA
    via AssemblyUtil, with a warning that the short-read length check is
    skipped) or a reads library (downloaded via ReadsUtils, after which
    reads shorter than min_long_read_length are counted and a warning is
    logged if any are found).

    :param console: list accumulating console log lines (via self.log)
    :param warnings: list accumulating warning lines (via self.log)
    :param token: auth token for service clients
    :param wsname: workspace name used when `lib` is not a full ref
    :param lib: workspace object name or reference of the reads input
    :param min_long_read_length: threshold for the short-read warning
    :returns: path to the downloaded reads file
    :raises ValueError: wrapping any error encountered during download
    """
    try:
        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        # indices into the workspace object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
        lib_obj_info = wsClient.get_object_info_new({'objects':
                                                     [obj_id]})[0]
        lib_obj_type = lib_obj_info[TYPE_I]
        # remove trailing version (raw string so \. is a literal dot)
        lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
        lib_ref = str(lib_obj_info[WSID_I]) + '/' + \
            str(lib_obj_info[OBJID_I]) + '/' + str(lib_obj_info[VERSION_I])

        if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
            # download using assembly util / data file util
            self.log(console, "Getting long reads (from contigs object).\n")
            auClient = AssemblyUtil(url=self.callbackURL, token=token)
            dfuClient = DataFileUtil(url=self.callbackURL, token=token)
            contigFile = auClient.get_assembly_as_fasta({
                'ref': lib_ref
            }).get('path')
            # FIX: was `contig_file` (undefined name), which made this
            # branch always fail with a NameError wrapped in ValueError.
            long_reads_path = dfuClient.unpack_file(
                {'file_path': contigFile})['file_path']
            self.log(
                warnings,
                "Warning: Long reads are in FASTA format, so short read check was not performed."
            )
        else:
            ruClient = ReadsUtils(url=self.callbackURL, token=token)
            self.log(console,
                     "Getting long reads (from reads library object).\n")
            result = ruClient.download_reads({
                'read_libraries': [lib_ref],
                'interleaved': 'false'
            })
            long_reads_path = result['files'][lib_ref]['files']['fwd']
            [n_reads, n_reads_short
             ] = self.filter_short_fastq(console, long_reads_path,
                                         min_long_read_length)
            if (n_reads_short > 0):
                self.log(
                    warnings, "Warning: Of " + str(n_reads) +
                    " long reads, " + str(n_reads_short) +
                    " are shorter than " + str(min_long_read_length) +
                    "; consider using the filtlong app to filter out shorter reads."
                )
    except Exception as e:
        raise ValueError('Unable to download long reads\n' + str(e))
    return long_reads_path
def test_fractiontate_contigs_ASSEMBLY_BINNEDCONTIGS_08(self):
    """Fractionate an Assembly against a BinnedContigs positive filter
    in 'neg' mode: upload a source assembly and a BinnedContigs object
    built on it, then run run_fractionate_contigs and print the result.
    """
    method = 'fractionate_contigs_pos_filter_ASSEMBLY_BINNEDCONTIGS_08'
    print("\n\nRUNNING: test_" + method + "()")
    print("==========================================================\n\n")

    # upload test data
    try:
        auClient = AssemblyUtil(self.callback_url,
                                token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))
    try:
        mguClient = MetagenomeUtils(self.callback_url,
                                    token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate mguClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))

    base_1 = 'assembly_1plus2'
    base_2 = 'assembly'
    dir_2 = 'binned_contigs'
    type_1 = 'Assembly'
    type_2 = 'BinnedContigs'
    ass_file_1_fa = base_1 + '.fa.gz'
    ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
    dir_2_path = os.path.join(self.scratch, dir_2)
    shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
    shutil.copytree(os.path.join("data", dir_2), dir_2_path)
    ass_ref_1 = auClient.save_assembly_from_fasta({
        'file': {
            'path': ass_path_1_fa
        },
        'workspace_name': self.getWsName(),
        'assembly_name': base_1 + '.' + type_1
    })
    binned_contigs_ref_2 = mguClient.file_to_binned_contigs({
        'file_directory': dir_2_path,
        'workspace_name': self.getWsName(),
        'assembly_ref': ass_ref_1,
        'binned_contig_name': base_2 + '.' + type_2
    })['binned_contig_obj_ref']

    # run method
    # (removed unused local `base_output_name` and redundant trailing `pass`)
    fractionate_mode = 'neg'
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_ref': ass_ref_1,
        'input_pos_filter_obj_refs': [binned_contigs_ref_2],
        'fractionate_mode': fractionate_mode,
        'output_name':
        'test_fractionated' + '-' + base_1 + '.' + type_1 + '-' +
        'binned_contigs_2a2b' + '-' + fractionate_mode
    }
    result = self.getImpl().run_fractionate_contigs(
        self.getContext(), params)
    print('RESULT:')
    pprint(result)
def test_fractiontate_contigs_ASSEMBLY_ASSEMBLYSET_07(self):
    """Fractionate an Assembly against an AssemblySet positive filter in
    'neg' mode: upload a source assembly plus two filter assemblies,
    bundle the filters into an AssemblySet via the SetAPI, then run
    run_fractionate_contigs and print the result.
    """
    method = 'fractionate_contigs_pos_filter_ASSEMBLY_ASSEMBLYSET_07'
    print("\n\nRUNNING: test_" + method + "()")
    print("==========================================================\n\n")

    # upload test data
    try:
        auClient = AssemblyUtil(self.callback_url,
                                token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))
    try:
        setAPI_Client = SetAPI(self.serviceWizardURL,
                               token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
            self.serviceWizardURL + ' ERROR: ' + str(e))

    base_1 = 'assembly_1plus2'
    base_2a = 'assembly_2a'
    base_2b = 'assembly_2b'
    type_1 = 'Assembly'
    type_2a = 'Assembly'
    type_2b = 'Assembly'
    ass_file_1_fa = base_1 + '.fa.gz'
    ass_file_2a_fa = base_2a + '.fa.gz'
    ass_file_2b_fa = base_2b + '.fa.gz'
    ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
    ass_path_2a_fa = os.path.join(self.scratch, ass_file_2a_fa)
    ass_path_2b_fa = os.path.join(self.scratch, ass_file_2b_fa)
    shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
    shutil.copy(os.path.join("data", ass_file_2a_fa), ass_path_2a_fa)
    shutil.copy(os.path.join("data", ass_file_2b_fa), ass_path_2b_fa)
    ass_ref_1 = auClient.save_assembly_from_fasta({
        'file': {
            'path': ass_path_1_fa
        },
        'workspace_name': self.getWsName(),
        'assembly_name': base_1 + '.' + type_1
    })
    ass_ref_2a = auClient.save_assembly_from_fasta({
        'file': {
            'path': ass_path_2a_fa
        },
        'workspace_name': self.getWsName(),
        'assembly_name': base_2a + '.' + type_2a
    })
    ass_ref_2b = auClient.save_assembly_from_fasta({
        'file': {
            'path': ass_path_2b_fa
        },
        'workspace_name': self.getWsName(),
        'assembly_name': base_2b + '.' + type_2b
    })

    # AssemblySet bundling the two filter assemblies
    assemblySet_items = [{
        'ref': ass_ref_2a,
        'label': 'assembly_2a'
    }, {
        'ref': ass_ref_2b,
        'label': 'assembly_2b'
    }]
    assemblySet_obj = {
        'description': 'test assemblySet',
        'items': assemblySet_items
    }
    assemblySet_ref = setAPI_Client.save_assembly_set_v1({
        'workspace_name': self.getWsName(),
        'output_object_name': 'assembly_2a2b.AssemblySet',
        'data': assemblySet_obj
    })['set_ref']

    # run method
    # (removed unused local `base_output_name` and redundant trailing `pass`)
    fractionate_mode = 'neg'
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_ref': ass_ref_1,
        'input_pos_filter_obj_refs': [assemblySet_ref],
        'fractionate_mode': fractionate_mode,
        'output_name':
        'test_fractionated' + '-' + base_1 + '.' + type_1 + '-' +
        'assemblyset_2a2b' + '-' + fractionate_mode
    }
    result = self.getImpl().run_fractionate_contigs(
        self.getContext(), params)
    print('RESULT:')
    pprint(result)
def run_ContigFilter_max(self, ctx, params):
    """
    New app which filters contigs in an assembly using both a minimum
    and a maximum contig length.

    :param params: instance of mapping from String to unspecified object.
        Required keys: 'min_length', 'max_length', 'assembly_ref',
        'workspace_name'.
    :returns: instance of type "ReportResults" -> structure: parameter
        "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing or invalid.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_ContigFilter_max
    # Check that the parameters are valid
    for name in [
            'min_length', 'max_length', 'assembly_ref', 'workspace_name'
    ]:
        if name not in params:
            raise ValueError('Parameter "' + name +
                             '" is required but missing')
    if not isinstance(params['min_length'],
                      int) or (params['min_length'] < 0):
        raise ValueError('Min length must be a non-negative integer')
    if not isinstance(params['max_length'],
                      int) or (params['max_length'] < 0):
        raise ValueError('Max length must be a non-negative integer')
    if not isinstance(params['assembly_ref'], str) or not len(
            params['assembly_ref']):
        raise ValueError('Pass in a valid assembly reference string')
    # FIX: an inverted range previously produced an empty assembly
    # silently; reject it with an explicit error instead.
    if params['min_length'] > params['max_length']:
        raise ValueError('Min length must not exceed max length')
    print(params['min_length'], params['max_length'],
          params['assembly_ref'])
    output = {}

    # Download the Assembly object as a FASTA file
    assembly_util = AssemblyUtil(self.callback_url)
    fasta_file = assembly_util.get_assembly_as_fasta(
        {'ref': params['assembly_ref']})
    print(fasta_file)

    # Parse the downloaded file in FASTA format
    parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
    min_length = params['min_length']
    max_length = params['max_length']

    # Keep a list of contigs whose length is within [min, max]
    good_contigs = []
    # total contigs regardless of length
    n_total = 0
    # total contigs within the length bounds
    n_remaining = 0
    for record in parsed_assembly:
        n_total += 1
        if min_length <= len(record.seq) <= max_length:
            good_contigs.append(record)
            n_remaining += 1

    # Create a file to hold the filtered data
    workspace_name = params['workspace_name']
    filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')

    # Upload the filtered data to the workspace
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {
            'path': filtered_path
        },
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Create an output summary message for the report
    text_message = "".join([
        'Filtered assembly to ',
        str(n_remaining), ' contigs out of ',
        str(n_total)
    ])
    # Data for creating the report, referencing the assembly we uploaded
    report_data = {
        'objects_created': [{
            'ref': new_ref,
            'description': 'Filtered contigs'
        }],
        'text_message': text_message
    }
    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    report = kbase_report.create({
        'report': report_data,
        'workspace_name': workspace_name
    })
    # Return the report reference and name in our results
    output = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }
    #END run_ContigFilter_max

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_ContigFilter_max return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_cnelsonAppDemo(self, ctx, params):
    """
    Filter contigs in an Assembly by a minimum contig length and return a KBaseReport.

    :param ctx: KBase context object (unused beyond the SDK convention)
    :param params: dict with required keys:
        - workspace_name (str): workspace to save results in
        - assembly_input_ref (str): workspace reference of the input Assembly
        - min_length: minimum contig length to keep; must parse as a
          non-negative integer
    :returns: [output] where output holds report_name, report_ref,
        assembly_output and contig counts
    :raises ValueError: on missing or invalid parameters
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_cnelsonAppDemo

    # Print statements to stdout/stderr are captured and available as the App log
    logging.info('Starting run_cnelsonAppDemo function. Params=' + pformat(params))

    # Step 1 - Parse/examine the parameters and catch any errors
    # It is important to check that parameters exist and are defined, and that nice error
    # messages are returned to users.  Parameter values go through basic validation when
    # defined in a Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    logging.info('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError('Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError('Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError('Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except (ValueError, TypeError) as err:
        # TypeError covers non-numeric inputs such as None or a list, which the
        # original ValueError-only handler let escape as a raw TypeError.
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError('Cannot parse integer from min_length parameter (' +
                         str(min_length_orig) + ')') from err
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta and
    # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
    # The return object gives us the path to the file that was created.
    logging.info('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
    # We can use BioPython to parse the Fasta file and build and save the output to a file.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1
    logging.info('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    logging.info('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 5 - Build a Report and return
    reportObj = {
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                        ' contigs out of ' + str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({'report': reportObj,
                                 'workspace_name': params['workspace_name']})

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref'],
              'assembly_output': new_assembly,
              'n_initial_contigs': n_total,
              'n_contigs_removed': n_total - n_remaining,
              'n_contigs_remaining': n_remaining
              }
    logging.info('returning:' + pformat(output))
    #END run_cnelsonAppDemo

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_cnelsonAppDemo return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    Write the sequences of a SequenceSet object to a FASTA file, optionally
    building a background model first and masking repeats afterwards.

    :param ctx: KBase context object (unused beyond the SDK convention)
    :param params: dict with keys:
        - SequenceSetRef (str): workspace reference of the SequenceSet
        - fasta_outpath (str): path the FASTA output is written to
        - TESTFLAG / background: control whether a background model is built
          (test genome vs. assembly downloaded from the genome_ref)
        - genome_ref (str): required when background is set without TESTFLAG
        - mask_repeats: when truthy, repeats are removed from the output file
    :returns: [output] where output is {'fasta_outpath': <path written>}
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)

    bu = BackgroundUtils()
    TU = TestUtils()
    if params['TESTFLAG'] and params['background']:
        targetpath = '/kb/module/work/tmp/testgenome.fa'
        TU.GetGenome(targetpath)
        bu.BuildBackground(targetpath)
    elif params['background']:
        # NOTE(review): workspace URL is hard-coded to appdev — confirm this is
        # intentional before running this branch in other environments.
        ws = Workspace('https://appdev.kbase.us/services/ws')
        subset = ws.get_object_subset([{
            'included': ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
            'ref': params['genome_ref']
        }])
        aref = subset[0]['data']['assembly_ref']
        assembly_ref = {'ref': aref}
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
        bu.BuildBackground(fasta_file)

    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

    # Use a context manager so the output file is closed even if a write fails
    # (the original opened/closed the file manually and leaked on error).
    with open(params['fasta_outpath'], 'w') as out_file:
        for s in SeqSet['sequences']:
            out_file.write('>' + s['sequence_id'] + '\n')
            out_file.write(s['sequence'] + '\n')

    fu = FastaUtils()
    if params['mask_repeats']:
        # in-place masking: input and output are the same path
        fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def __init__(self):
    """Initialize AssemblyUtil and VariationUtil clients from the SDK callback URL.

    Reads the callback URL from the SDK_CALLBACK_URL environment variable;
    raises KeyError if it is not set.
    """
    self.callbackURL = os.environ['SDK_CALLBACK_URL']
    self.au = AssemblyUtil(self.callbackURL)
    self.vu = VariationUtil(self.callbackURL)
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM

    input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

    This method creates a directory in the scratch area with the set of Fasta files,
    names will have the fasta_file_extension parameter tacked on.

        ex:

        staged_input = stage_input('124/15/1', 'fna')

        staged_input
        {"input_dir": '...'}
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'
    # indices into the workspace object_info tuple
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
     WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
    ws = Workspace(self.ws_url)

    # 1) generate a folder in scratch to hold the input
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # 2) based on type, download the files
    obj_name = self.get_data_obj_name(input_ref)
    type_name = self.get_data_obj_type(input_ref)

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client
    try:
        #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: ' + self.serviceWizardURL + ' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate mguClient with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e))

    # Standard Single Assembly
    #
    if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
        # create file data
        filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
        # make sure fasta file isn't empty
        min_fasta_len = 1
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # AssemblySet
    #
    elif type_name == 'KBaseSets.AssemblySet':

        # read assemblySet
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1({'ref': input_ref, 'include_item_info': 1})
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e))
        assembly_refs = []
        assembly_names = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            # assembly obj info
            try:
                this_assembly_info = ws.get_object_info_new({'objects': [{'ref': this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + this_assembly_ref + '): ' + str(e))
            assembly_refs.append(this_assembly_ref)
            assembly_names.append(this_assembly_name)

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(assembly_refs):
            this_name = assembly_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Binned Contigs
    #
    elif type_name == 'KBaseMetagenomes.BinnedContigs':

        # download the bins as fasta and set the input folder name
        bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        # make sure fasta file isn't empty
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(input_dir, fasta_file)
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError('Binned Assembly is empty for fasta_path: ' + str(fasta_path))
            break  # only the top-level directory is checked

    # Genome and GenomeSet
    #
    elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
        genome_obj_names = []
        genome_sci_names = []
        genome_assembly_refs = []

        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:  # get genomeSet_refs from GenomeSet object
            genomeSet_refs = []
            try:
                genomeSet_object = ws.get_objects2({'objects': [{'ref': input_ref}]})['data'][0]['data']
            except Exception as e:
                raise ValueError('Unable to fetch ' + str(input_ref) + ' object from workspace: ' + str(e))
                #to get the full stack trace: traceback.format_exc()

            # iterate through genomeSet members
            for genome_id in genomeSet_object['elements'].keys():
                if 'ref' not in genomeSet_object['elements'][genome_id] or \
                        genomeSet_object['elements'][genome_id]['ref'] == None or \
                        genomeSet_object['elements'][genome_id]['ref'] == '':
                    raise ValueError('genome_ref not found for genome_id: ' + str(genome_id) + ' in genomeSet: ' + str(input_ref))
                else:
                    genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

        # genome obj data
        for i, this_input_ref in enumerate(genomeSet_refs):
            try:
                objects = ws.get_objects2({'objects': [{'ref': this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_info = objects[0]['info']
                genome_obj_names.append(genome_obj_info[NAME_I])
                genome_sci_names.append(genome_obj['scientific_name'])
            except Exception:
                # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
                # are not swallowed
                raise ValueError("unable to fetch genome: " + this_input_ref)

            # Get genome_assembly_ref
            if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                    and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting."
                # (an unreachable `continue` that followed this raise was removed)
                raise ValueError(msg)
            elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + " USING assembly_ref: " + str(genome_obj['assembly_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['assembly_ref'])
            elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + genome_sci_names[i] + " USING contigset_ref: " + str(genome_obj['contigset_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['contigset_ref'])

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(genome_assembly_refs):
            this_name = genome_obj_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Unknown type slipped through
    #
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
def setUpClass(cls):
    """Build shared fixtures: config, two auth contexts, one workspace, and two test genomes."""
    print('Setting up class')
    token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    config = configparser.ConfigParser()
    config.read(config_file)
    cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
    authServiceUrl = cls.cfg.get('auth-service-url',
                                 "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'GenomeAnnotationAPI',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
    cls.impl = GenomeAnnotationAPI(cls.cfg)

    # Second user: credentials come from the local test config, prefixed with a
    # [test] section header so configparser can read the raw key=value file
    test_cfg_file = '/kb/module/work/test.cfg'
    test_cfg_text = "[test]\n"
    with open(test_cfg_file, "r") as f:
        test_cfg_text += f.read()
    config = configparser.ConfigParser()
    config.read_file(io.StringIO(test_cfg_text))
    test_cfg_dict = dict(config.items("test"))
    if ('test_token2' not in test_cfg_dict):
        raise ValueError("Configuration in <module>/test_local/test.cfg file should " +
                         "include second user credentials ('test_token2')")
    token2 = test_cfg_dict['test_token2']
    user2 = auth_client.get_user(token2)
    cls.ctx2 = MethodContext(None)
    cls.ctx2.update({'token': token2,
                     'user_id': user2,
                     'provenance': [
                         {'service': 'NarrativeService',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})

    # create one WS for all tests; millisecond timestamp keeps the name unique
    suffix = int(time.time() * 1000)
    wsName = "test_GenomeAnnotationAPI_" + str(suffix)
    ret = cls.ws.create_workspace({'workspace': wsName})
    cls.wsName = wsName

    # preload with reference data
    with open ('data/rhodobacter.json', 'r') as file:
        data_str = file.read()
    data = json.loads(data_str)
    # save old genome
    info = cls.impl.save_one_genome_v1(cls.ctx, {
        'workspace': wsName,
        'name': "rhodobacter",
        'data': data,
    })[0]['info']
    # ref format: wsid/objid/version
    cls.rhodobacter_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    print('created rhodobacter test genome: ' + cls.rhodobacter_ref)

    # stage the e. coli assembly in scratch and upload it via AssemblyUtil
    assembly_file_path = os.path.join(cls.cfg['scratch'], 'e_coli_assembly.fasta')
    shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_ref = au.save_assembly_from_fasta({
        'workspace_name': cls.wsName,
        'assembly_name': 'ecoli.assembly',
        'file': {'path': assembly_file_path}
    })
    # NOTE(review): this file handle is never closed — consider a with-block
    data = json.load(open('data/new_ecoli_genome.json'))
    data['assembly_ref'] = assembly_ref
    # save new genome
    save_info = {
        'workspace': wsName,
        'objects': [{
            'type': 'KBaseGenomes.Genome',
            'data': data,
            'name': 'new_ecoli'
        }]
    }
    info = cls.ws.save_objects(save_info)[0]
    cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    print('created new test genome')
def test_what_is_fastas(self):
    """Smoke-test AssemblyUtil.get_fastas against a known assembly reference."""
    au_client = AssemblyUtil(self.callback_url)
    result = au_client.get_fastas({'ref_lst': ['41343/11/3']})
    print(result)
def test_annotate_contigs(self):
    """Annotate a small assembly through a genome;assembly ref path and verify feature DNA."""
    assembly_file_name = "small.fna"  # "AP009048.fna"
    assembly_test_file = os.path.join("/kb/module/test/data/", assembly_file_name)
    # copy the fixture into the job's shared tmp dir before uploading
    assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
    shutil.copy(assembly_test_file, assembly_temp_file)
    assembly_name = "Assembly.1"
    au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"])
    assembly_ref = au.save_assembly_from_fasta({"file": {"path": assembly_temp_file},
                                                "workspace_name": self.getWsName(),
                                                "assembly_name": assembly_name})
    # Add a genome to the WS to test ref_paths
    genome_name = "Genome.1"
    # minimal genome skeleton pointing at the assembly saved above
    genome = {"id": "Unknown", "features": [],
              "scientific_name": "",
              "domain": "", "genetic_code": 0,
              "assembly_ref": assembly_ref,
              "cdss": [], "mrnas": [],
              "source": "Magic!",
              "gc_content": 0, "dna_size": 0,
              "reference_annotation": 0}
    prov = self.getContext().provenance()
    gfu = GenomeFileUtil(os.environ["SDK_CALLBACK_URL"])
    info = gfu.save_one_genome(
        {"workspace": self.getWsName(), "name": genome_name, "data": genome,
         "provenance": prov})["info"]
    # ref format: wsid/objid/version
    genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
    # object_ref is a "genome;assembly" reference path
    result = self.getImpl().annotate(self.getContext(),
                                     {"object_ref": "{};{}".format(genome_ref, assembly_ref),
                                      "output_workspace": self.getWsName(),
                                      "output_genome_name": genome_name,
                                      "evalue": None,
                                      "fast": 0,
                                      "gcode": 0,
                                      "genus": "genus",
                                      "kingdom": "Bacteria",
                                      "metagenome": 0,
                                      "mincontiglen": 1,
                                      "norrna": 0,
                                      "notrna": 0,
                                      "rawproduct": 0,
                                      "rfam": 1,
                                      "scientific_name": "Super : diper - name;"
                                      })[0]
    # the produced report must carry a text message
    rep = self.getWsClient().get_objects([{"ref": result["report_ref"]}])[0]["data"]
    self.assertTrue("text_message" in rep)
    print("Report:\n" + str(rep["text_message"]))
    # re-fetch the annotated genome by workspace/name
    genome_ref = self.getWsName() + "/" + genome_name
    genome = self.getWsClient().get_objects([{"ref": genome_ref}])[0]["data"]
    # map feature id -> location for the sequence lookup below
    features_to_work = {}
    for feature in genome["features"]:
        features_to_work[feature["id"]] = feature["location"]
    aseq = AssemblySequenceAPI(os.environ["SDK_CALLBACK_URL"],
                               token=self.getContext()["token"])
    dna_sequences = aseq.get_dna_sequences({"requested_features": features_to_work,
                                            "assembly_ref": genome["assembly_ref"]})[
        "dna_sequences"]
    # every annotated feature's stored DNA must match the sequence
    # extracted from the assembly at the feature's location
    bad_dnas = 0
    for feature in genome["features"]:
        if feature["dna_sequence"] != dna_sequences[feature["id"]]:
            bad_dnas += 1
    self.assertEqual(bad_dnas, 0)
def test_fractiontate_contigs_ASSEMBLY_GENOMELIST_05(self):
    """Run fractionate_contigs in 'pos' mode with one Assembly and two Genome filters."""
    method = 'fractionate_contigs_pos_filter_ASSEMBLY_GENOMELIST_05'

    print("\n\nRUNNING: test_" + method + "()")
    print("==========================================================\n\n")

    # upload test data
    try:
        assembly_util = AssemblyUtil(self.callback_url,
                                     token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))
    try:
        genome_util = GenomeFileUtil(self.callback_url,
                                     token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate gfuClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))

    base_1, base_2a, base_2b = 'assembly_1plus2', 'assembly_2a', 'assembly_2b'
    type_1, type_2a, type_2b = 'Assembly', 'Genome', 'Genome'

    # stage every fixture file into scratch, remembering where each landed
    staged = {}
    for fixture in (base_1 + '.fa.gz',
                    base_2a + '.fa.gz', base_2b + '.fa.gz',
                    base_2a + '.gff', base_2b + '.gff'):
        destination = os.path.join(self.scratch, fixture)
        shutil.copy(os.path.join("data", fixture), destination)
        staged[fixture] = destination

    # the positive-filter target: a plain Assembly
    ass_ref_1 = assembly_util.save_assembly_from_fasta({
        'file': {'path': staged[base_1 + '.fa.gz']},
        'workspace_name': self.getWsName(),
        'assembly_name': base_1 + '.' + type_1
    })

    def _upload_genome(base, genome_type):
        # fasta+gff genome upload; parameters identical to the original inline calls
        return genome_util.fasta_gff_to_genome({
            'fasta_file': {'path': staged[base + '.fa.gz']},
            'gff_file': {'path': staged[base + '.gff']},
            'generate_missing_genes': 1,
            'source': 'GFF',
            'scientific_name': base,
            'workspace_name': self.getWsName(),
            'genome_name': base + '.' + genome_type
        }).get('genome_ref')

    ass_ref_2a = _upload_genome(base_2a, type_2a)
    ass_ref_2b = _upload_genome(base_2b, type_2b)

    # run method
    base_output_name = method + '_output'  # kept from the original (unused here)
    fractionate_mode = 'pos'
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_ref': ass_ref_1,
        'input_pos_filter_obj_refs': [ass_ref_2a, ass_ref_2b],
        'fractionate_mode': fractionate_mode,
        'output_name': 'test_fractionated' + '-' + base_1 + '.' + type_1 +
                       '-' + base_2a + '.' + type_2a +
                       '-' + base_2b + '.' + type_2b +
                       '-' + fractionate_mode
    }
    result = self.getImpl().run_fractionate_contigs(self.getContext(), params)
    print('RESULT:')
    pprint(result)
    pass