def load_test_genome_with_cache(self, filename, gbff_cache_filename):
    """ cache filename needs to be in scratch space """
    with open(filename, 'r') as file:
        data_str = file.read()
    data = json.loads(data_str)
    # save to ws
    save_info = {
        'workspace': self.getWsName(),
        'objects': [{
            'type': 'KBaseGenomes.Genome',
            'data': data,
            'name': 'e_coli'
        }]
    }
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    shock_file = dfu.file_to_shock({
        'file_path': gbff_cache_filename,
        'make_handle': 1
    })
    data['genbank_handle_ref'] = shock_file['handle']['hid']
    # save to ws
    save_info['objects'][0]['name'] = 'e_coli_with_genbank'
    result = self.ws.save_objects(save_info)
    info = result[0]
    ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    print('created test genome with gbff cache: ' + ref + ' from file ' + filename)
    return ref
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    self.ru = ReadsUtils(self.callback_url)
def UploadFromMEME(self, ctx, params):
    """
    :param params: instance of type "UploadGibbsInParams" -> structure:
        parameter "path" of String, parameter "ws_name" of String,
        parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure:
        parameter "obj_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN UploadFromMEME
    print('Extracting motifs')
    motifList = MU.parse_meme_output(params['path'])
    print(motifList)

    MSO = {}
    MSO['Condition'] = 'Temp'
    MSO['SequenceSet_ref'] = '123'
    MSO['Motifs'] = []
    MSO['Alphabet'] = ['A', 'C', 'G', 'T']
    MSO['Background'] = {}
    for letter in MSO['Alphabet']:
        MSO['Background'][letter] = 0.0

    MSU.parseMotifList(motifList, MSO)
    MSU.CheckLength(MSO, params['min_len'], params['max_len'])

    if 'absolute_locations' in params:
        for motif in MSO['Motifs']:
            for loc in motif['Motif_Locations']:
                if loc['sequence_id'] in params['absolute_locations']:
                    loc['sequence_id'] = params['contig']
                    absStart = int(params['start'])
                    loc['start'] = absStart
                    loc['end'] = absStart + loc['end']

    dfu = DataFileUtil(self.callback_url)
    save_objects_params = {}
    save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
    save_objects_params['objects'] = [{
        'type': 'KBaseGeneRegulation.MotifSet',
        'data': MSO,
        'name': params['obj_name']
    }]

    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)
    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFromMEME

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFromMEME return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def make_fake_expression(callback_url, dummy_file, name, genome_ref, annotation_ref,
                         alignment_ref, ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqExpression object and returns a ref to it.
    genome_ref: reference to a genome object
    annotation_ref: reference to a KBaseRNASeq.GFFAnnotation
    alignment_ref: reference to a KBaseRNASeq.RNASeqAlignment
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    exp = {
        "id": "fake",
        "type": "fake",
        "numerical_interpretation": "fake",
        "expression_levels": {
            "feature_1": 0,
            "feature_2": 1,
            "feature_3": 2
        },
        "genome_id": genome_ref,
        "annotation_id": annotation_ref,
        "mapped_rnaseq_alignment": {
            "id1": alignment_ref
        },
        "condition": "",
        "tool_used": "none",
        "tool_version": "0.0.0",
        "file": dummy_shock_info['handle']
    }
    return make_fake_object(exp, "KBaseRNASeq.RNASeqExpression", name, ws_name, ws_client)
def download_genome_to_json_files(token, genome_ref, target_dir):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    file_name_to_data_map = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token, service_ver='dev')
    genome_data = dfu.get_objects({'object_refs': [genome_ref]})['data'][0]
    genome_obj = genome_data['data']
    genome_meta = genome_data['info'][10]
    file_name_to_data_map["genome.json"] = genome_obj
    file_name_to_data_map["genome.meta.json"] = genome_meta
    if 'genbank_handle_ref' in genome_obj:
        gbk_file_name = "genome.gbk"
        dfu.shock_to_file({
            'handle_id': genome_obj['genbank_handle_ref'],
            'file_path': os.path.join(target_dir, gbk_file_name)
        })
        genome_obj['genbank_handle_ref'] = gbk_file_name
    if 'contigset_ref' in genome_obj:
        contigset_data = dfu.get_objects(
            {'object_refs': [genome_obj['contigset_ref']]})['data'][0]
        contigset_obj = contigset_data['data']
        contigset_meta = contigset_data['info'][10]
        file_name_to_data_map["contigset.json"] = contigset_obj
        file_name_to_data_map["contigset.meta.json"] = contigset_meta
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_data = dfu.get_objects(
            {'object_refs': [genome_obj['assembly_ref']]})['data'][0]
        assembly_obj = assembly_data['data']
        assembly_meta = assembly_data['info'][10]
        file_name_to_data_map["assembly.json"] = assembly_obj
        file_name_to_data_map["assembly.meta.json"] = assembly_meta
        genome_obj['assembly_ref'] = "assembly.json"
        fasta_handle_ref = assembly_obj['fasta_handle_ref']
        fasta_file_name = "assembly.fa"
        dfu.shock_to_file({
            'handle_id': fasta_handle_ref,
            'file_path': os.path.join(target_dir, fasta_file_name)
        })
        assembly_obj['fasta_handle_ref'] = fasta_file_name
        assembly_obj['external_source_id'] = fasta_file_name
        if 'taxon_ref' in assembly_obj:
            taxon_obj = dfu.get_objects(
                {'object_refs': [assembly_obj['taxon_ref']]})['data'][0]['data']
            file_name_to_data_map["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            taxon_obj['parent_taxon_ref'] = ""
    for target_file_name in file_name_to_data_map:
        with open(os.path.join(target_dir, target_file_name), 'w') as f:
            json.dump(file_name_to_data_map[target_file_name], f, sort_keys=True, indent=4)
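# A minimal usage sketch for the helper above, assuming it runs inside an SDK job
# (so SDK_CALLBACK_URL and a token are available). The genome ref and output path
# below are hypothetical placeholders, not values taken from the source.
token = os.environ.get('KB_AUTH_TOKEN')
genome_ref = '12345/6/7'  # placeholder workspace/object/version reference
download_genome_to_json_files(token, genome_ref, '/kb/module/work/tmp/genome_dump')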
def test_shock_handle_ws(self):
    test_phrase = "Hi there!"
    path_to_temp_file = "/kb/module/work/tmp/temp_" + str(time.time()) + ".fq"
    self.textToFile(test_phrase, path_to_temp_file)
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=self.ctx['token'])
    uploaded = dfu.file_to_shock({
        'file_path': path_to_temp_file,
        'make_handle': 1
    })
    fhandle = uploaded['handle']
    self.assertTrue('hid' in fhandle, "Handle: " + str(fhandle))
    data = {'hid': fhandle['hid']}
    obj_name = 'TestObject.1'
    info = self.getWsClient().save_objects({
        'workspace': self.getWsName(),
        'objects': [{
            'type': 'Empty.AHandle',
            'data': data,
            'name': obj_name
        }]
    })[0]
    self.assertEqual(info[1], obj_name)
    ref = self.getWsName() + '/' + obj_name
    handle_data = self.getWsClient().get_objects([{'ref': ref}])[0]['data']
    self.assertTrue('hid' in handle_data, "Data: " + str(handle_data))
    hid = handle_data['hid']
    path_to_temp_file2 = "/kb/module/work/tmp/temp2_" + str(time.time()) + ".fq"
    dfu.shock_to_file({'handle_id': hid, 'file_path': path_to_temp_file2})
    self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.config = config
    self.__LOGGER = logging.getLogger('KBaseRNASeq')
    if 'log_level' in config:
        self.__LOGGER.setLevel(config['log_level'])
    else:
        self.__LOGGER.setLevel(logging.INFO)
    streamHandler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s")
    formatter.converter = time.gmtime
    streamHandler.setFormatter(formatter)
    self.__LOGGER.addHandler(streamHandler)
    self.__LOGGER.info("Logger was set")
    script_utils.check_sys_stat(self.__LOGGER)
    self.scratch = config['scratch']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.ws_url = config['workspace-url']
    self.dfu = DataFileUtil(self.callback_url)
    self.samtools = SamTools(config)
    #END_CONSTRUCTOR
    pass
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    :param params: instance of type "BuildSeqIn" -> structure: parameter
        "workspace_name" of String, parameter "SequenceSetRef" of String,
        parameter "fasta_outpath" of String
    :returns: instance of type "BuildSeqOut" -> structure: parameter
        "fasta_outpath" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)
    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

    outFile = open(params['fasta_outpath'], 'w')
    for s in SeqSet['sequences']:
        sname = '>' + s['sequence_id'] + '\n'
        outFile.write(sname)
        sseq = s['sequence'] + '\n'
        outFile.write(sseq)
    outFile.close()

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def make_fake_alignment(callback_url, dummy_file, name, reads_ref, genome_ref,
                        ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqAlignment object and returns a ref to it.
    callback_url: needed for DataFileUtil
    dummy_file: path to some dummy "alignment" file (make it small - needs to be
        uploaded to shock)
    name: the name of the object
    reads_ref: a reference to a valid (probably fake) reads library
    genome_ref: a reference to a valid (also probably fake) genome
    ws_name: the name of the workspace to save this object
    ws_client: a Workspace client tuned to the server of your choice
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    fake_alignment = {
        "file": dummy_shock_info['handle'],
        "library_type": "fake",
        "read_sample_id": reads_ref,
        "condition": "fake",
        "genome_id": genome_ref
    }
    return make_fake_object(fake_alignment, "KBaseRNASeq.RNASeqAlignment", name,
                            ws_name, ws_client)
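# A hedged sketch of chaining the two fake-object helpers above in a test setup.
# All refs, paths, and names here are placeholders; make_fake_object is assumed to
# be defined elsewhere in the same test utilities module.
alignment_ref = make_fake_alignment(callback_url, 'data/dummy.bam', 'fake_alignment',
                                    reads_ref, genome_ref, ws_name, ws_client)
expression_ref = make_fake_expression(callback_url, 'data/dummy.exp', 'fake_expression',
                                      genome_ref, annotation_ref, alignment_ref,
                                      ws_name, ws_client)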
def run_skip(self, reads_file):
    """
    Doesn't run RQCFilter; this is a dummy skip version. It returns the same result
    structure, so it doesn't derail the other pipeline steps. However, the
    "filtered_fastq_file" is the unchanged fastq file, other than gzipping it.
    run_log is just an empty (but existing!) file.
    """
    print("NOT running RQCFilter, just putting together some results.")
    # make the dummy output dir
    outdir = os.path.join(
        self.scratch_dir, "dummy_rqcfilter_output_{}".format(int(time() * 1000)))
    mkdir(outdir)
    # mock up a log file
    dummy_log = os.path.join(outdir, "dummy_rqcfilter_log.txt")
    open(dummy_log, 'w').close()
    # just compress the reads and move them into that output dir (probably don't need to
    # move them, but let's be consistent)
    dfu = DataFileUtil(self.callback_url)
    compressed_reads = dfu.pack_file({
        "file_path": reads_file,
        "pack": "gzip"
    })["file_path"]
    base_name = os.path.basename(compressed_reads)
    not_filtered_reads = os.path.join(outdir, base_name)
    os.rename(compressed_reads, not_filtered_reads)
    return {
        "output_directory": outdir,
        "filtered_fastq_file": not_filtered_reads,
        "run_log": dummy_log,
        "command": "BBTools.run_RQCFilter_local -- skipped. No command run.",
        "version_string": "KBase BBTools module"
    }
def _proc_ws_obj_params(self, ctx, params):
    """
    Check the validity of workspace and object params and return them
    """
    dst_ref = params.get(self.PARAM_IN_DST_REF)
    ws_name_id, obj_name_id = os.path.split(dst_ref)
    if not bool(ws_name_id.strip()) or ws_name_id == '/':
        raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF)
    if not bool(obj_name_id.strip()):
        raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF)
    dfu = DataFileUtil(self.callback_url)
    if not isinstance(ws_name_id, int):
        try:
            ws_name_id = dfu.ws_name_to_id(ws_name_id)
        except DFUError as se:
            prefix = se.message.split('.')[0]
            raise ValueError(prefix)
    self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))
    return ws_name_id, obj_name_id
def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass
def __init__(self, callback_url, scratch):
    self.scratch = scratch
    self.dfu = DataFileUtil(callback_url)
    # Note added X due to kb|g.1886.fasta
    self.valid_chars = "-ACGTUWSMKRYBDHVNX"
    self.amino_acid_specific_characters = "PLIFQE"
def _upload_report(self, report_dir, file_links, workspace_name, saved_objects):
    dfu = DataFileUtil(self.callback_url)
    upload_info = dfu.file_to_shock({
        'file_path': report_dir,
        'pack': 'zip'
    })
    shock_id = upload_info['shock_id']
    report_params = {
        'message': 'JGI metagenome assembly report',
        'direct_html_link_index': 0,
        'html_links': [{
            'shock_id': shock_id,
            'name': 'index.html',
            'description': 'assembly report'
        }],
        'file_links': file_links,
        'report_object_name': 'JGI_assembly_pipeline.' + str(uuid.uuid4()),
        'workspace_name': workspace_name,
        'objects_created': saved_objects
    }
    report_client = KBaseReport(self.callback_url)
    report = report_client.create_extended_report(report_params)
    return {'report_ref': report['ref'], 'report_name': report['name']}
def test_simple_upload(self):
    genomeFileUtil = self.getImpl()

    ### Test for a Local Function Call - file needs to be just on disk
    tmp_dir = self.__class__.cfg['scratch']
    #file_name = "GCF_000005845.2_ASM584v2_genomic.gbff.gz"
    #shutil.copy(os.path.join("data", file_name), tmp_dir)
    gbk_path = self.getTempGenbank()  # os.path.join(tmp_dir, file_name)
    print('attempting upload via local function directly')
    ws_obj_name = 'MyGenome'
    result = genomeFileUtil.genbank_to_genome_annotation(self.getContext(), {
        'file_path': gbk_path,
        'workspace_name': self.getWsName(),
        'genome_name': ws_obj_name
    })
    pprint(result)
    # todo: add test that result is correct

    ### Test for upload from SHOCK - upload the file to shock first
    print('attempting upload through shock')
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=self.__class__.ctx['token'],
                                 service_ver='dev')
    shock_id = data_file_cli.file_to_shock({'file_path': gbk_path})['shock_id']
    ws_obj_name2 = 'MyGenome.2'
    result2 = genomeFileUtil.genbank_to_genome_annotation(self.getContext(), {
        'shock_id': shock_id,
        'workspace_name': self.getWsName(),
        'genome_name': ws_obj_name2,
        'convert_to_legacy': 1
    })
    pprint(result2)
def test_reannotate_RICKETS(self):
    # alternate test genome refs (only the last assignment is used)
    # genome_ref = '31932/5/1'
    # genome_ref = '32038/3/2'
    genome_ref = '32132/5/1'
    genome_name = 'Aceti'
    self.callback_url = os.environ["SDK_CALLBACK_URL"]
    self.dfu = DataFileUtil(self.callback_url)
    result = self.getImpl().annotate(
        self.getContext(), {
            "object_ref": genome_ref,
            "output_workspace": self.getWsName(),
            "output_genome_name": genome_name,
            "evalue": None,
            "fast": 0,
            "gcode": 0,
            "genus": "genus",
            "kingdom": "Bacteria",
            "metagenome": 0,
            "mincontiglen": 1,
            "norrna": 0,
            "notrna": 0,
            "rawproduct": 0,
            "rfam": 1,
            "scientific_name": "RhodoBacter"
        })[0]
    genome_data = self.dfu.get_objects(
        {"object_refs": [result['output_genome_ref']]})["data"][0]['data']
    scratch = "/kb/module/work/tmp/"
    with open(scratch + 'OUTPUT_GENOME.txt', 'w+') as outfile:
        json.dump(genome_data, outfile)
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.shock_url = config['shock-url']
    self.dfu = DataFileUtil(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.setapi = SetAPI(self.callback_url)
    self.wss = workspaceService(config['workspace-url'])
def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
    self.scratch_dir = scratch_dir
    self.rau = ReadsAlignmentUtils(callback_url)
    self.kbr = KBaseReport(callback_url)
    self.dfu = DataFileUtil(callback_url)
    self.set_api = SetAPI(srv_wiz_url)
    self.ws = Workspace(workspace_url)
    self.valid_commands = ['bamqc', 'multi-bamqc']
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.dfu = DataFileUtil(self.callback_url)
    self.ws = Workspace(self.ws_url, token=self.token)
    self.scratch = config['scratch']
def __init__(self, config):
    log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' % config)
    self.scratch = config['scratch']
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url, token=self.token)
    self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.workspaceURL = config['workspace-url']
    self.scratch = os.path.abspath(config['scratch'])
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.dfu = DataFileUtil(self.callback_url)
    #END_CONSTRUCTOR
    pass
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = os.path.join(config['scratch'],
                                'import_GenBank_' + str(uuid.uuid4()))
    handler_utils._mkdir_p(self.scratch)
    self.dfu = DataFileUtil(self.callback_url)
    self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
    self.uploader_utils = UploaderUtil(config)
def Xtest_modify_old_genome(self):
    self.callback_url = os.environ["SDK_CALLBACK_URL"]
    self.gfu = GenomeFileUtil(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url)
    old_genome = "30045/15/1"
    new_genome = "30045/14/1"
    genome_name = 'OldRhodo'
    genome_data_old = self.dfu.get_objects({"object_refs": [old_genome]})["data"][0]
    genome_data_new = self.dfu.get_objects({"object_refs": [new_genome]})["data"][0]
    sso_1 = {
        "id": "1",
        "evidence": [],
        "term_name": "1",
        "ontology_ref": "1",
        "term_lineage": []
    }
    sso_2 = {
        "id": "2",
        "evidence": [],
        "term_name": "2",
        "ontology_ref": "2",
        "term_lineage": []
    }
    sso_terms = {'SSO1': sso_1, 'SSO2': sso_2}
    print("ABOUT TO MODIFY OLD GENOME")
    for i, item in enumerate(genome_data_old['data']['features']):
        genome_data_old['data']['features'][i]['ontology_terms'] = {
            "SSO": sso_terms
        }
    print("ABOUT TO MODIFY NEW GENOME")
    for i, item in enumerate(genome_data_new['data']['features']):
        genome_data_new['data']['features'][i]['ontology_terms'] = {
            "SSO": sso_terms
        }
    print("ABOUT TO SAVE OLD GENOME")
    info = self.gfu.save_one_genome({
        "workspace": self.getWsName(),
        "name": genome_name,
        "data": genome_data_old["data"],
        "provenance": self.ctx.provenance()
    })["info"]
    print("ABOUT TO SAVE NEW GENOME")
    info = self.gfu.save_one_genome({
        "workspace": self.getWsName(),
        "name": genome_name,
        "data": genome_data_new["data"],
        "provenance": self.ctx.provenance()
    })["info"]
def __init__(self, config, logger=None):
    self.config = config
    self.logger = logger
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4()))
    self.ws_url = config['workspace-url']
    self.ws_client = Workspace(self.ws_url)
    self.dfu = DataFileUtil(self.callback_url)
    pass
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.utils = Utils(config)
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    self.ws = Workspace(config['workspace-url'])
    logging.basicConfig(level=logging.INFO)
    #END_CONSTRUCTOR
    pass
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = os.path.join(config['scratch'],
                                'import_assembly_' + str(uuid.uuid4()))
    handler_utils._mkdir_p(self.scratch)
    self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.uploader_utils = UploaderUtil(config)
def test_simple_upload(self):
    # fetch the test files and set things up
    genomeFileUtil = self.getImpl()
    gbk_path = "data/GCF_000005845.2_ASM584v2_genomic.gbff"

    ### Test for a Local Function Call
    print('attempting upload via local function directly')
    ws_obj_name = 'MyGenome'
    result = genomeFileUtil.genbank_to_genome(self.getContext(), {
        'file': {'path': gbk_path},
        'workspace_name': self.getWsName(),
        'genome_name': ws_obj_name
    })[0]
    pprint(result)
    self.assertIsNotNone(result['genome_ref'])
    target_dir = os.path.join("/kb/module/work/tmp", "GCF_000005845")
    download_genome_to_json_files(self.getContext()['token'], result['genome_ref'],
                                  target_dir)
    #self.assertEqual(0, len(compare_genome_json_files(target_dir,
    #                        os.path.join("/kb/module/test/data", "GCF_000005845"))))
    # todo: add test that result is correct

    ### Test for upload from SHOCK - upload the file to shock first
    print('attempting upload through shock')
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=self.__class__.ctx['token'],
                                 service_ver='dev')
    shutil.copy(gbk_path, self.__class__.cfg['scratch'])
    shock_id = data_file_cli.file_to_shock({
        'file_path': os.path.join(self.__class__.cfg['scratch'],
                                  gbk_path.split("/")[-1])
    })['shock_id']
    ws_obj_name2 = 'MyGenome.2'
    result2 = genomeFileUtil.genbank_to_genome(self.getContext(), {
        'file': {'shock_id': shock_id},
        'workspace_name': self.getWsName(),
        'genome_name': ws_obj_name2,
    })[0]
    pprint(result2)
    self.assertIsNotNone(result2['genome_ref'])
    # todo: add test that result is correct

    ### Test for upload via FTP - use something from genbank
    print('attempting upload through ftp url')
    ws_obj_name3 = 'MyGenome.3'
    result3 = genomeFileUtil.genbank_to_genome(self.getContext(), {
        'file': {'ftp_url': self.__class__.TEST_ECOLI_FILE_FTP},
        'workspace_name': self.getWsName(),
        'genome_name': ws_obj_name3,
    })[0]
    pprint(result3)
    self.assertIsNotNone(result3['genome_ref'])
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.config = config
    self.scratch = config['scratch']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.ws_url = config['workspace-url']
    self.ws_client = Workspace(self.ws_url)
    self.dfu = DataFileUtil(self.callback_url)
    #END_CONSTRUCTOR
    pass
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.config = config
    self.scratch = os.path.abspath(config['scratch'])
    self.callbackURL = os.environ['SDK_CALLBACK_URL']
    # self.shared_folder = os.path.abspath(config['scratch'])
    self.dfu = DataFileUtil(self.callbackURL)
    #END_CONSTRUCTOR
    pass
def __init__(self, config):
    self.cfg = config
    self.scratch = config['scratch']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.dfu = DataFileUtil(self.callback_url)
    self.kbse = KBaseSearchEngine(config['search-url'])
    self.gen_api = GenericsAPI(self.callback_url)
    self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
    self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
    self.DEFAULT_UNIT_ID = "Custom:Unit"
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.dfu = DataFileUtil(self.callback_url)
    self.sj = sleep_job(self.callback_url)
    self.sac = simpleapp_client(self.callback_url)
    self.aj = alans_job(self.callback_url, service_ver='dev')
    #END_CONSTRUCTOR
    pass
def export_genome_annotation_as_genbank(self, ctx, params):
    """
    A method designed especially for download, this calls 'genome_annotation_to_genbank'
    to do the work, but then packages the output with WS provenance and object info into
    a zip file and saves to shock.
    :param params: instance of type "ExportParams" -> structure: parameter
        "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
        "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_annotation_as_genbank

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError('Cannot export GenomeAnnotation - no input_ref field defined.')

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': params['input_ref']}],
                                   'includeMetadata': 0, 'ignoreErrors': 0})[0]

    # export to a file
    file = self.genome_annotation_to_genbank(ctx, {
        'genome_ref': params['input_ref'],
        'new_genbank_file_name': info[1] + '.gbk'})[0]

    # create the output directory and move the file there
    export_package_dir = os.path.join(self.sharedFolder, info[1])
    os.makedirs(export_package_dir)
    shutil.move(file['path'],
                os.path.join(export_package_dir, os.path.basename(file['path'])))

    # package it up and be done
    dfUtil = DataFileUtil(self.callback_url)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}
    #END export_genome_annotation_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_annotation_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def upload_file_to_shock(logger, filePath, make_handle=True,
                         shock_service_url=None,
                         #attributes='{}',
                         ssl_verify=True, token=None):
    """
    Use HTTP multi-part POST to save a file to a SHOCK instance.
    """
    # shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    #return dfu.file_to_shock({"file_path": filePath, "attributes": json.dumps(attributes),
    #                          "make_handle": make_handle})
    return dfu.file_to_shock({"file_path": filePath, "make_handle": make_handle})
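# A short usage sketch for the wrapper above, assuming it runs inside an SDK job so
# SDK_CALLBACK_URL is set; the logger and file path are illustrative placeholders.
import logging

logger = logging.getLogger(__name__)
# The returned dict carries the shock node id and, since make_handle=True, a handle
# whose 'hid' can be stored on a workspace object.
shock_info = upload_file_to_shock(logger, '/kb/module/work/tmp/reads.fastq.gz',
                                  make_handle=True)
print(shock_info['shock_id'], shock_info['handle']['hid'])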
def test_basic_upload_and_download(self):
    assemblyUtil = self.getImpl()

    tmp_dir = self.__class__.cfg['scratch']
    file_name = "test.fna"
    shutil.copy(os.path.join("data", file_name), tmp_dir)
    fasta_path = os.path.join(tmp_dir, file_name)
    print('attempting upload')
    ws_obj_name = 'MyNewAssembly'
    result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                   {'file': {'path': fasta_path},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': ws_obj_name
                                                    })
    pprint(result)
    self.check_fasta_file(ws_obj_name, fasta_path)

    print('attempting upload through shock')
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
    ws_obj_name2 = 'MyNewAssembly.2'
    result2 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                    {'shock_id': shock_id,
                                                     'workspace_name': self.getWsName(),
                                                     'assembly_name': ws_obj_name2
                                                     })
    pprint(result2)
    self.check_fasta_file(ws_obj_name2, fasta_path)

    print('attempting upload via ftp url')
    ftp_url = ('ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/'
               'bacteria_8_collection/acaryochloris_marina_mbic11017/dna/'
               'Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz')
    ws_obj_name3 = 'MyNewAssembly.3'
    result3 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                    {'ftp_url': ftp_url,
                                                     'workspace_name': self.getWsName(),
                                                     'assembly_name': ws_obj_name3
                                                     })
    pprint(result3)
    # todo: add checks here on ws object

    ws_obj_name3 = 'MyNewAssembly.3'
    result4 = assemblyUtil.export_assembly_as_fasta(
        self.getContext(),
        {'input_ref': self.getWsName() + '/' + ws_obj_name3})
    pprint(result4)
def test_filtered_everything(self):
    assemblyUtil = self.getImpl()

    tmp_dir = self.__class__.cfg['scratch']
    file_name = "legacy_test.fna"
    shutil.copy(os.path.join("data", file_name), tmp_dir)
    fasta_path = os.path.join(tmp_dir, file_name)
    print('attempting upload')
    ws_obj_name = 'FilteredAssembly'
    result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                   {'file': {'path': fasta_path},
                                                    'workspace_name': self.getWsName(),
                                                    'assembly_name': ws_obj_name,
                                                    'min_contig_length': 500
                                                    })
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']
    self.assertEqual(assembly['dna_size'], 0)
    self.assertEqual(assembly['gc_content'], None)
    self.assertEqual(assembly['num_contigs'], 0)
def test_load_with_filter_and_options(self):
    assemblyUtil = self.getImpl()

    tmp_dir = self.__class__.cfg['scratch']
    file_name = "legacy_test.fna"
    shutil.copy(os.path.join("data", file_name), tmp_dir)
    fasta_path = os.path.join(tmp_dir, file_name)
    print('attempting upload')
    ws_obj_name = 'FilteredAssembly'
    result = assemblyUtil.save_assembly_from_fasta(
        self.getContext(),
        {'file': {'path': fasta_path},
         'workspace_name': self.getWsName(),
         'assembly_name': ws_obj_name,
         'min_contig_length': 9,
         'external_source': 'someplace',
         'external_source_id': 'id',
         'external_source_origination_date': 'sunday',
         'type': 'metagenome',
         'contig_info': {'s3': {'is_circ': 0, 'description': 'somethin'}}
         })

    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']

    self.assertEqual(len(assembly['contigs']), 1)
    self.assertEqual(assembly['contigs']['s3']['md5'],
                     '4f339bd56e5f43ecb52e8682a790a111')
    self.assertEqual(assembly['contigs']['s3']['contig_id'], 's3')
    self.assertEqual(assembly['contigs']['s3']['length'], 18)
    self.assertEqual(assembly['contigs']['s3']['is_circ'], 0)
    self.assertEqual(assembly['contigs']['s3']['description'], 'somethin')
    self.assertEqual(assembly['dna_size'], 18)
    self.assertEqual(assembly['gc_content'], 0.44444)
    self.assertEqual(assembly['md5'], 'eba4d1771060e19671a56832d159526e')
    self.assertEqual(assembly['num_contigs'], 1)
    self.assertEqual(assembly['type'], 'metagenome')
    self.assertEqual(assembly['external_source'], 'someplace')
    self.assertEqual(assembly['external_source_id'], 'id')
    self.assertEqual(assembly['external_source_origination_date'], 'sunday')
def download_file_from_shock(logger, shock_service_url=None, shock_id=None,
                             filename=None, directory=None, filesize=None,
                             token=None):
    """
    Given a SHOCK instance URL and a SHOCK node id, download the contents of that node
    to a file on disk.
    """
    if filename is not None:
        shockFileName = filename
    if directory is not None:
        filePath = os.path.join(directory, shockFileName)
    else:
        filePath = shockFileName
    # shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    return dfu.shock_to_file({"shock_id": shock_id, "file_path": filePath,
                              "unpack": None})
def ztest_aaa_upload_to_shock(self):
    print("upload ref data to shock staging")
    self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    #file_path = self.write_file('Phage_gene_catalog.tar.gz', 'Test')

    input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'  # 'Phage_gene_catalog.tar.gz'  # 'PFAM_27.tar.gz'
    source_file_path = "/kb/module/work/" + input_file_name  # os.path.join(tmp_dir, input_file_name)
    tmp_dir = self.cfg['scratch']
    target_file_path = os.path.join(tmp_dir, input_file_name)
    print("file_path " + source_file_path + "\t" + target_file_path)

    orig_size = os.path.getsize(source_file_path)
    shutil.copy(source_file_path, target_file_path)

    print("Testing " + target_file_path)
    print(os.path.isfile(target_file_path))

    ret1 = self.dfUtil.file_to_shock({'file_path': target_file_path})
    print(str(ret1))
    shock_id = ret1['shock_id']
    print("shock_id " + shock_id)

    file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')
    #ret2 = self.dfUtil.shock_to_file(
    #    {'shock_id': shock_id, 'file_path': file_path2})[0]
    ret2 = self.dfUtil.shock_to_file(
        {'shock_id': shock_id, 'file_path': file_path2})
    print(ret2)

    file_name = ret2['node_file_name']
    attribs = ret2['attributes']
    self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
    self.assertEqual(ret2['file_path'], file_path2)
    self.assertEqual(ret2['size'], orig_size)
    self.assertIsNone(attribs)
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info, params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data

    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 'appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) +
              ' contigs that were shorter than ' + str(min_contig_length) + 'bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file '''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')

    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or 'ftp_url' is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')

        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
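# A hedged sketch of driving the class above directly, assuming it runs inside an SDK
# job. The workspace name, FASTA path, and assembly name are placeholders; ctx is
# unused by import_fasta, so None is passed here for illustration.
fta = FastaToAssembly(os.environ['SDK_CALLBACK_URL'], '/kb/module/work/tmp')
assembly_info = fta.import_fasta(None, {
    'file': {'path': '/kb/module/work/tmp/contigs.fa'},
    'workspace_name': 'my_test_workspace',
    'assembly_name': 'MyAssembly',
    'min_contig_length': 500
})
print(assembly_info)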
def __init__(self, config):
    self.cfg = config
    self.scratch = config['scratch']
    self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
    self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    self.ws = Workspace(config["workspace-url"])
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        writer = csv.DictWriter(open(files['file_path'], 'w'), header, delimiter='\t',
                                lineterminator='\n')
        writer.writeheader()
        for feat in fs_dicts:
            writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
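# A sketch of how the downloader class above might be exercised, assuming a configured
# 'config' dict and an existing FeatureSet; the ref and workspace name are placeholders.
fsd = FeatureSetDownload(config)
params = {'featureset_ref': '12345/8/1', 'workspace_name': 'my_test_workspace'}
fs_name, files = fsd.to_tsv(params)
result = fsd.export([files['file_path']], fs_name, params)
print(result['shock_id'])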
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')
        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print(e)
        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print(e)

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx

    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path

    def delete_shock_node(self, node_id):
        header = {'Authorization': 'Oauth {0}'.format(cls.token)}
        requests.delete(cls.shockURL + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):
        print("upload ref data to shock staging")
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path = self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'  # 'Phage_gene_catalog.tar.gz'  # 'PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/" + input_file_name  # os.path.join(tmp_dir, input_file_name)
        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)
        print("file_path " + source_file_path + "\t" + target_file_path)

        orig_size = os.path.getsize(source_file_path)
        shutil.copy(source_file_path, target_file_path)

        print("Testing " + target_file_path)
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock({'file_path': target_file_path})
        print(str(ret1))
        shock_id = ret1['shock_id']
        print("shock_id " + shock_id)

        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')
        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})
        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)
        #self.delete_shock_node(shock_id)

    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits)
            for _ in range(N))

    def test_virsorter_ok(self):
        self.upload_assembly()

        if not self.testwsname:
            self.testwsname.append(self.create_random_string())
        print("upload_reads self.testwsname[0] " + self.testwsname[0])
        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print("ERROR")
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass
        print("self.testwsname " + str(self.testwsname))
        params = {}
        params['assembly_ref'] = str(self.testobjref[0])  # str(self.testwsname[0])+"/"+  # "16589/2/1"  # '16589/2/1'  # self.testobjref
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        #testresult = [
        #    {'blah': 'blah', 'bleh': 'bleh'}]
        testresult = [{'report_ref': result[0]['report_ref'],
                       'report_name': result[0]['report_name']}]

        self.assertEqual(sorted(result), sorted(testresult))

    def upload_assembly(self):
        if not self.testobjref:
            print("upload_assembly start")
            indata = 'U00096.2.fa'  # _first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)  # self.scratch, indata
            print("ftarget " + ftarget)
            ret = shutil.copy('../test_data/' + indata, ftarget)

            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
            print("upload_assembly self.testwsname[0] " + self.testwsname[0])

            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
            except Exception as e:
                #print("ERROR")
                #print(type(e))
                #print(e.args)
                print(e)
                pass

            try:
                print("attempt upload")
                print("ftarget " + ftarget)
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
                print("upload_assembly")
                print(ref)

                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
                ##print(self.testobjdata[0])
            except Exception as e:
                print(e)
                pass

            print("self.testobjref[0]")
            print(self.testobjref)
            print(self.testobjref[0])
def genome_annotation_to_genbank(self, ctx, params):
    """
    :param params: instance of type "GenomeAnnotationToGenbankParams" (genome_ref --
        Reference to the GenomeAnnotation or Genome object in KBase in any ws
        supported format OR genome_name + workspace_name -- specify the genome
        name and workspace name of what you want. If genome_ref is defined, these
        args are ignored. new_genbank_file_name -- specify the output name of the
        genbank file, optional save_to_shock -- set to 1 or 0, if 1 then output is
        saved to shock. default is zero) -> structure: parameter "genome_ref" of
        String, parameter "genome_name" of String, parameter "workspace_name" of
        String, parameter "new_genbank_file_name" of String, parameter
        "save_to_shock" of type "boolean" (A boolean - 0 for false, 1 for true.
        @range (0, 1))
    :returns: instance of type "GenbankFile" -> structure: parameter "path" of
        String, parameter "shock_id" of String
    """
    # ctx is the context object
    # return variables are: file
    #BEGIN genome_annotation_to_genbank

    print('genome_annotation_to_genbank -- parameters = ')
    pprint(params)

    service_endpoints = {
        "workspace_service_url": self.workspaceURL,
        "shock_service_url": self.shockURL,
        "handle_service_url": self.handleURL
    }

    # parse/validate parameters. could do a better job here.
    genome_ref = None
    if 'genome_ref' in params and params['genome_ref'] is not None:
        genome_ref = params['genome_ref']
    else:
        if 'genome_name' not in params:
            raise ValueError('genome_ref and genome_name are not defined. One of those is required.')
        if 'workspace_name' not in params:
            raise ValueError('workspace_name is not defined. This is required if genome_name is specified' +
                             ' without a genome_ref')
        genome_ref = params['workspace_name'] + '/' + params['genome_name']

    # do a quick lookup of object info - could use this to do some validation. Here we need it to provide
    # a nice output file name if it is not set... We should probably catch errors here and print out a nice
    # message - usually this would mean the ref was bad.
    ws = Workspace(url=self.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': genome_ref}],
                                   'includeMetadata': 0, 'ignoreErrors': 0})[0]
    print('resolved object to:')
    pprint(info)

    if 'new_genbank_file_name' not in params or params['new_genbank_file_name'] is None:
        new_genbank_file_name = info[1] + ".gbk"
    else:
        new_genbank_file_name = params['new_genbank_file_name']

    # construct a working directory to hand off to the data_api
    working_directory = os.path.join(self.sharedFolder, 'genome-download-' + str(uuid.uuid4()))
    os.makedirs(working_directory)
    output_file_destination = os.path.join(working_directory, new_genbank_file_name)

    # do it
    print('calling: doekbase.data_api.downloaders.GenomeAnnotation.downloadAsGBK')
    GenomeAnnotation.downloadAsGBK(
        genome_ref,
        service_endpoints,
        ctx['token'],
        output_file_destination,
        working_directory)

    # if we need to upload to shock, well then do that too.
    file = {}
    if 'save_to_shock' in params and params['save_to_shock'] == 1:
        dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
        file['shock_id'] = dfUtil.file_to_shock({
            'file_path': output_file_destination,
            'gzip': 0,
            'make_handle': 0
            #'attributes': {}  # we can set shock attributes if we want
        })['shock_id']
    else:
        file['path'] = output_file_destination

    #END genome_annotation_to_genbank

    # At some point might do deeper type checking...
    if not isinstance(file, dict):
        raise ValueError('Method genome_annotation_to_genbank return value ' +
                         'file is not type dict as required.')
    # return the results
    return [file]
def genbank_to_genome_annotation(self, ctx, params):
    """
    :param params: instance of type "GenbankToGenomeAnnotationParams" (file_path or
        shock_id -- Local path or shock_id of the uploaded file with genome sequence
        in GenBank format or zip-file with GenBank files. genome_name -- The name you
        would like to use to reference this GenomeAnnotation. If not supplied, will
        use the Taxon Id and the data source to determine the name. taxon_wsname -
        name of the workspace containing the Taxonomy data, defaults to
        'ReferenceTaxons') -> structure: parameter "file_path" of String, parameter
        "shock_id" of String, parameter "ftp_url" of String, parameter "genome_name"
        of String, parameter "workspace_name" of String, parameter "source" of
        String, parameter "taxon_wsname" of String, parameter "convert_to_legacy" of
        type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
    :returns: instance of type "GenomeAnnotationDetails" -> structure: parameter
        "genome_annotation_ref" of String
    """
    # ctx is the context object
    # return variables are: details
    #BEGIN genbank_to_genome_annotation

    print('genbank_to_genome_annotation -- parameters = ')
    pprint(params)

    # validate input and set defaults. Note that because we don't call the uploader method
    # as a stand alone script, we do the validation here.
    if 'workspace_name' not in params:
        raise ValueError('workspace_name field was not defined')
    workspace_name = params['workspace_name']

    if 'genome_name' not in params:
        raise ValueError('genome_name field was not defined')
    genome_name = params['genome_name']

    source = 'Genbank'
    if 'source' in params:
        source = params['source']

    taxon_wsname = 'ReferenceTaxons'
    if 'taxon_wsname' in params:
        taxon_wsname = params['taxon_wsname']

    # other options to handle
    # release
    # taxon_reference
    # exclude_feature_types
    # type

    # construct the input directory where we stage files
    input_directory = os.path.join(self.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4()))
    os.makedirs(input_directory)

    # determine how to get the file: if it is from shock, download it. If it
    # is just sitting there, then use it. Move the file to the staging input directory
    genbank_file_path = None
    if 'file_path' not in params:
        if 'shock_id' not in params:
            if 'ftp_url' not in params:
                raise ValueError('No input file (either file_path, shock_id, or ftp_url) provided')
            else:
                # TODO handle ftp - this creates a directory for us, so update the input directory
                print('calling Transform download utility: script_utils.download')
                print('URL provided = ' + params['ftp_url'])
                script_utils.download_from_urls(
                    working_directory=input_directory,
                    token=ctx['token'],  # not sure why this requires a token to download from a url...
                    urls={'ftpfiles': params['ftp_url']})
                input_directory = os.path.join(input_directory, 'ftpfiles')

                # unpack everything in input directory
                dir_contents = os.listdir(input_directory)
                print('downloaded directory listing:')
                pprint(dir_contents)
                dir_files = []
                for f in dir_contents:
                    if os.path.isfile(os.path.join(input_directory, f)):
                        dir_files.append(f)

                print('processing files in directory...')
                for f in dir_files:
                    # unpack if needed using the standard transform utility
                    print('unpacking ' + f)
                    script_utils.extract_data(filePath=os.path.join(input_directory, f))
        else:
            # handle shock file
            dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
            file_name = dfUtil.shock_to_file({
                'file_path': input_directory,
                'shock_id': params['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)
    else:
        # copy the local file to the input staging directory
        # (NOTE: could just move it, but then this method would have the side effect of moving your
        # file which another SDK module might have an open handle on)
        local_file_path = params['file_path']
        genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
        shutil.copy2(local_file_path, genbank_file_path)

    if genbank_file_path is not None:
        print("input genbank file = " + genbank_file_path)
        # unpack if needed using the standard transform utility
        script_utils.extract_data(filePath=genbank_file_path)

    # do the upload (doesn't seem to return any information)
    uploader.upload_genome(
        logger=None,
        shock_service_url=self.shockURL,
        handle_service_url=self.handleURL,
        workspace_service_url=self.workspaceURL,
        input_directory=input_directory,
        workspace_name=workspace_name,
        core_genome_name=genome_name,
        source=source,
        taxon_wsname=taxon_wsname
    )

    #### Code to convert to legacy type if requested
    if 'convert_to_legacy' in params and params['convert_to_legacy'] == 1:
        from doekbase.data_api.converters import genome as cvt
        print('Converting to legacy type, object={}'.format(genome_name))
        cvt.convert_genome(shock_url=self.shockURL,
                           handle_url=self.handleURL,
                           ws_url=self.workspaceURL,
                           obj_name=genome_name,
                           ws_name=workspace_name)

    # clear the temp directory
    shutil.rmtree(input_directory)

    # get WS metadata to return the reference to the object (could be returned by the uploader method...)
    ws = Workspace(url=self.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': workspace_name + '/' + genome_name}],
                                   'includeMetadata': 0, 'ignoreErrors': 0})[0]

    details = {
        'genome_annotation_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    }

    #END genbank_to_genome_annotation

    # At some point might do deeper type checking...
    if not isinstance(details, dict):
        raise ValueError('Method genbank_to_genome_annotation return value ' +
                         'details is not type dict as required.')
    # return the results
    return [details]