def test_handles(self):
    wsName = self.generatePesudoRandomWorkspaceName()
    self.ws.set_permissions({'workspace': wsName, 'new_permission': 'w',
                             'users': [self.ctx2['user_id']]})
    temp_shock_file = "/kb/module/work/tmp/shock1.txt"
    with open(temp_shock_file, "w") as f1:
        f1.write("Test Shock Handle")
    token1 = self.ctx['token']
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token1)
    handle1 = dfu.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})['handle']
    hid1 = handle1['hid']
    genome_name = "Genome.1"
    self.impl.save_one_genome_v1(self.ctx, {
        'workspace': wsName,
        'name': genome_name,
        'data': {
            'id': "qwerty",
            'scientific_name': "Qwerty",
            'domain': "Bacteria",
            'genetic_code': 11,
            'genbank_handle_ref': hid1}
    })
    genome = self.impl.get_genome_v1(self.ctx2, {
        'genomes': [{'ref': wsName + '/' + genome_name}]})[0]['genomes'][0]['data']
    # Re-saving under the second user should re-own the handle, so the saved
    # genome ends up with a different handle id than the original.
    self.impl.save_one_genome_v1(self.ctx2, {'workspace': wsName,
                                             'name': genome_name,
                                             'data': genome})
    genome = self.impl.get_genome_v1(self.ctx2, {
        'genomes': [{'ref': wsName + '/' + genome_name}]})[0]['genomes'][0]['data']
    self.assertTrue('genbank_handle_ref' in genome)
    hid2 = genome['genbank_handle_ref']
    self.assertNotEqual(hid1, hid2)
@classmethod
def setUpClass(cls):
    cls.token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenericsAPI'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(cls.token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': cls.token,
        'user_id': user_id,
        'provenance': [{
            'service': 'GenericsAPI',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = GenericsAPI(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.shockURL = cls.cfg['shock-url']
    cls.dfu = DataFileUtil(cls.callback_url)
    cls.sample_uploader = sample_uploader(cls.callback_url, service_ver="dev")
    cls.sample_url = cls.cfg.get('kbase-endpoint') + '/sampleservice'
    cls.sample_ser = SampleService(cls.sample_url)
    cls.hs = HandleService(url=cls.cfg['handle-service-url'], token=cls.token)
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenericsAPI_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.wsId = ret[0]
    small_file = os.path.join(cls.scratch, 'test.txt')
    with open(small_file, "w") as f:
        f.write("empty content")
    cls.test_shock = cls.dfu.file_to_shock({
        'file_path': small_file,
        'make_handle': True
    })
    cls.handles_to_delete = []
    cls.nodes_to_delete = []
    cls.handles_to_delete.append(cls.test_shock['handle']['hid'])
    cls.nodes_to_delete.append(cls.test_shock['shock_id'])
    cls.prepare_data()
@classmethod
def setUpClass(cls):
    cls.token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)  # type: ignore
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    authServiceUrl = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(authServiceUrl)
    cls.user_id = auth_client.get_user(cls.token)
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': cls.token,
        'user_id': cls.user_id,
        'provenance': [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
    cls.scratch = cls.cfg['scratch']
    cls.shockURL = cls.cfg['shock-url']
    cls.gfu_cfg = SDKConfig(cls.cfg)
    cls.prepare_data()
def __init__(self, config):
    self.scratch = config["scratch"]
    self.ctx = config['ctx']
    self.callback_url = config["SDK_CALLBACK_URL"]
    self.ws_client = workspaceService(config["workspace-url"])
    self.gfu = GenomeFileUtil(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.kbr = KBaseReport(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url)
    self.genome_api = GenomeAnnotationAPI(self.callback_url)
    self.sso_ref = None
    self.sso_event = None
    self.ec_to_sso = {}
    self.output_workspace = None
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    self.ru = ReadsUtils(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.gfu = GenomeFileUtil(self.callback_url)
    self.rau = ReadsAlignmentUtils(self.callback_url)
    self.sp_uploader = sample_uploader(self.callback_url, service_ver='beta')
    self.dotfu = KBaseDataObjectToFileUtils(self.callback_url, token=self.token,
                                            service_ver='beta')
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    self.working_dir = self.scratch
    self.data_util = DataUtil(config)
    self.output_dir = os.path.join(self.working_dir, self.MDS_OUT_DIR)
    self._mkdir_p(self.output_dir)
    # If input is from files, the pd.DataFrame needs to be transposed in the
    # run_metaMDS_with_file method
    self.need_to_transpose = True
class geneminerutils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref):
        """Return the ordered gene ids stored in a FeatureSet object."""
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        return geneset['element_ordering']

    def generate_query(self, genomenetmine_dyn_url, genelistref, species, pheno):
        genes = self.download_genelist(genelistref)
        gsp = genescoreparser()
        return gsp.summary(genomenetmine_dyn_url, genes, species, pheno)

    def get_evidence(self, genomenetmine_dyn_url, genelistref, species, pheno):
        genes = self.download_genelist(genelistref)
        ep = evidenceparser()
        return ep.summary(genomenetmine_dyn_url, genes, species, pheno)
@classmethod
def setUpClass(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_orthofinder'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'kb_orthofinder',
                         'method': 'annotate_plant_transcripts',
                         'method_params': []}],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_orthofinder(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.test_data = cls.cfg['test_data']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.gfu = GenomeFileUtil(cls.callback_url)
    cls.dfu = DataFileUtil(cls.callback_url)
    cls.genome = "Test_Genome"
    cls.prepare_data()
@classmethod
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('ProteinStructureUtils'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'ProteinStructureUtils',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []}],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = Workspace(cls.wsURL)
    cls.serviceImpl = ProteinStructureUtils(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.dfu = DataFileUtil(cls.callback_url)
    suffix = int(time.time() * 1000)
    cls.wsName = "test_ProteinStructureUtils_" + str(suffix)
    cls.ws_id = cls.wsClient.create_workspace({'workspace': cls.wsName})[0]
    cls.prepareData()
@classmethod
def setUpClass(cls):
    cls.token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_uploadmethods'):
        cls.cfg[nameval[0]] = nameval[1]
    authServiceUrl = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(authServiceUrl)
    cls.user_id = auth_client.get_user(cls.token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': cls.token,
        'user_id': cls.user_id,
        'provenance': [{
            'service': 'kb_uploadmethods',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
    cls.serviceImpl = kb_uploadmethods(cls.cfg)
    cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
    cls.scratch = cls.cfg['scratch']
    cls.shockURL = cls.cfg['shock-url']
def test_basic_upload_and_download(self):
    assemblyUtil = self.getImpl()
    tmp_dir = self.__class__.cfg['scratch']
    file_name = "test.fna"
    shutil.copy(os.path.join("data", file_name), tmp_dir)
    fasta_path = os.path.join(tmp_dir, file_name)
    print('attempting upload')
    ws_obj_name = 'MyNewAssembly'
    result = assemblyUtil.save_assembly_from_fasta(
        self.getContext(),
        {'file': {'path': fasta_path},
         'workspace_name': self.getWsName(),
         'assembly_name': ws_obj_name,
         'taxon_ref': 'ReferenceTaxons/unknown_taxon',
         })
    pprint(result)
    self.check_fasta_file(ws_obj_name, fasta_path)

    print('attempting upload through shock')
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
    ws_obj_name2 = 'MyNewAssembly.2'
    result2 = assemblyUtil.save_assembly_from_fasta(
        self.getContext(),
        {'shock_id': shock_id,
         'workspace_name': self.getWsName(),
         'assembly_name': ws_obj_name2
         })
    pprint(result2)
    self.check_fasta_file(ws_obj_name2, fasta_path)

    print('attempting upload via ftp url')
    ftp_url = ('ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/'
               'bacteria_8_collection/acaryochloris_marina_mbic11017/dna/'
               'Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz')
    ws_obj_name3 = 'MyNewAssembly.3'
    result3 = assemblyUtil.save_assembly_from_fasta(
        self.getContext(),
        {'ftp_url': ftp_url,
         'workspace_name': self.getWsName(),
         'assembly_name': ws_obj_name3
         })
    pprint(result3)
    # todo: add checks here on ws object

    result4 = assemblyUtil.export_assembly_as_fasta(
        self.getContext(),
        {'input_ref': self.getWsName() + '/' + ws_obj_name3})
    pprint(result4)
def create_html_report(self, callback_url, output_dir, workspace_name):
    """
    Create the HTML report for the enrichment results.
    :param callback_url: SDK callback URL used to build service clients
    :param output_dir: directory containing the snpEff output to package
    :param workspace_name: workspace the report object is saved to
    :return: dict with 'report_name' and 'report_ref'
    """
    dfu = DataFileUtil(callback_url)
    report_name = 'kb_variant_report_' + str(uuid.uuid4())
    report = KBaseReport(callback_url)
    index_file_path = output_dir + "/index.html"
    htmlstring = self.create_enrichment_report("snpEff_genes.txt", output_dir)
    try:
        with open(index_file_path, "w") as html_file:
            html_file.write(htmlstring + "\n")
    except IOError:
        print("Unable to write " + index_file_path + " file on disk.")
    report_shock_id = dfu.file_to_shock({
        'file_path': output_dir,
        'pack': 'zip'
    })['shock_id']
    html_file = {
        'shock_id': report_shock_id,
        'name': 'index.html',
        'label': 'index.html',
        'description': 'HTML report for GSEA'
    }
    report_info = report.create_extended_report({
        'direct_html_link_index': 0,
        'html_links': [html_file],
        'report_object_name': report_name,
        'workspace_name': workspace_name
    })
    return {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
def test_AssemblySet_input(self):
    # Initiate empty data dictionaries and get data_util
    dfu = DataFileUtil(self.callback_url)
    assembly_dict = dict()
    assembly_set_dict = dict()
    dfu_dict = dict()
    dfu_dict_2 = dict()
    # Get workspace id and name
    wsName = self.getWsName()
    ws_id = dfu.ws_name_to_id(wsName)
    # FASTA to assembly object
    Fasta_assembly_dict = {"path": "/kb/module/work/tmp/NC_021490.fasta",
                           "assembly_name": "test_assembly"}
    params = {"file": Fasta_assembly_dict,
              "workspace_name": wsName,
              "assembly_name": "test_assembly"}
    ref = self.getImpl().save_assembly_from_fasta(self.ctx, params)
    # Create assembly data dictionaries
    assembly_dict.update({"label": "assemblySetTest", "ref": ref[0]})
    assembly_set_dict.update({"description": " ", "items": [assembly_dict]})
    # Create DataFileUtil dictionaries
    dfu_dict.update({"type": "KBaseSets.AssemblySet",
                     "data": assembly_set_dict,
                     "name": "Assembly_Test"})
    dfu_dict_2.update({'id': ws_id, 'objects': [dfu_dict]})
    # Create assembly set object
    assembly_set_obj = dfu.save_objects(dfu_dict_2)
    assembly_set_ref = [str(assembly_set_obj[0][6]) + '/' +
                        str(assembly_set_obj[0][0]) + '/' +
                        str(assembly_set_obj[0][4])]
    # Get FASTA
    ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
@classmethod
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'provenance': [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
    ws_obj_name = 'ecoli_genome'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.genbank_to_genome(
        cls.ctx, {
            'file': {'path': gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'source': "RefSeq Reference"
        })[0]
    # print("HERE IS THE RESULT:")
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.ctx['token'],
                                 service_ver='dev')
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    cls.assembly_ref = genome["assembly_ref"]
@classmethod
def setUpClass(cls):
    cls.maxDiff = 70000
    cls.token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenericsAPI'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(cls.token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': cls.token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'GenericsAPI',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []}],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = GenericsAPI(cls.cfg)
    cls.serviceUtils = AttributesUtil(cls.cfg)
    cls.shockURL = cls.cfg['shock-url']
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.dfu = DataFileUtil(cls.callback_url)
    cls.hs = HandleService(url=cls.cfg['handle-service-url'], token=cls.token)
    suffix = int(time.time() * 1000)
    cls.wsName = "test_CompoundSetUtils_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.wsId = ret[0]
    cls.attribute_mapping = json.load(open('data/AM1.json'))
    info = cls.dfu.save_objects({
        "id": cls.wsId,
        "objects": [{
            "type": "KBaseExperiments.AttributeMapping",
            "data": cls.attribute_mapping,
            "name": "test_cond_set"
        }]
    })[0]
    cls.attribute_mapping_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    cls.attribute_mapping_2 = json.load(open('data/AM2.json'))
    small_file = os.path.join(cls.scratch, 'test.txt')
    with open(small_file, "w") as f:
        f.write("empty content")
    cls.test_shock = cls.dfu.file_to_shock({'file_path': small_file,
                                            'make_handle': True})
    cls.handles_to_delete = []
    cls.nodes_to_delete = []
    cls.handles_to_delete.append(cls.test_shock['handle']['hid'])
    cls.nodes_to_delete.append(cls.test_shock['shock_id'])
class SampleServiceUtil:

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.srv_wiz_url = config['srv-wiz-url']
        self.sample_url = config.get('kbase-endpoint') + '/sampleservice'
        self.dfu = DataFileUtil(self.callback_url)
        self.sample_ser = SampleService(self.sample_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def get_sample_service_url(self):
        return self.sample_url

    def get_sample(self, sample_id, version=None):
        sample_url = self.get_sample_service_url()
        headers = {"Authorization": self.token}
        params = {"id": sample_id, "version": version}
        payload = {
            "method": "SampleService.get_sample",
            "id": str(uuid.uuid4()),
            "params": [params],
            "version": "1.1"
        }
        resp = requests.post(url=sample_url, headers=headers, data=json.dumps(payload))
        resp_json = resp.json()
        if resp_json.get('error'):
            raise RuntimeError(f"Error from SampleService - {resp_json['error']}")
        sample = resp_json['result'][0]
        # sample = self.sample_ser.get_sample(params)[0]
        return sample

    def get_ids_from_samples(self, sample_set_ref):
        logging.info('start retrieving sample ids from sample set')

        sample_set = self.dfu.get_objects({"object_refs": [sample_set_ref]})['data'][0]['data']
        samples = sample_set['samples']

        data_ids = []
        for sample in samples:
            sample_id = sample.get('id')
            version = sample.get('version')
            sample_data = self.get_sample(sample_id, version=version)
            data_id = sample_data['name']
            data_ids.append(data_id)

        return data_ids
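# For reference, the JSON-RPC 1.1 call that get_sample() above performs can be
# driven directly with requests. This is a minimal sketch, not part of the
# class; the endpoint, token, and sample id below are placeholders.
import json
import uuid

import requests

sample_url = "https://<kbase-endpoint>/sampleservice"   # assumed endpoint
headers = {"Authorization": "<KB_AUTH_TOKEN>"}          # assumed auth token
payload = {
    "method": "SampleService.get_sample",
    "id": str(uuid.uuid4()),
    "params": [{"id": "<sample_id>", "version": None}],
    "version": "1.1",
}
resp = requests.post(url=sample_url, headers=headers, data=json.dumps(payload))
resp_json = resp.json()
if resp_json.get("error"):
    raise RuntimeError(f"Error from SampleService - {resp_json['error']}")
print(resp_json["result"][0]["name"])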
def __init__(self, config, ctx):
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.re_api = RE_API(config['re-url'], ctx['token'])
    self.dfu = DataFileUtil(self.callback_url)
    self.kbse = KBaseSearchEngine(config['search-url'])
    self.kbr = KBaseReport(self.callback_url)
    self.object_categories = ['Narrative', 'Genome', 'FBAModel', 'Tree']
def test_gff_and_metagenome_to_metagenome(self):
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    result = self.serviceImpl.ws_obj_gff_to_metagenome(self.ctx, {
        'workspace_name': self.wsName,
        'genome_name': 'MyGenome',
        'gff_file': {'path': self.gff_path},
        'ws_ref': self.metagenome_ref,
        'source': 'GFF',
        'type': 'Reference',
        'genome_type': 'Metagenome',
        'is_metagenome': True,
        'generate_missing_genes': True,
        'taxon_id': '3702',
    })[0]
    metagenome = dfu.get_objects({'object_refs': [result['metagenome_ref']]})['data'][0]['data']
    # make sure it's the same as the original
    self._compare_features(self.genome_orig, metagenome)
def update_clients():
    callback_url = os.environ['SDK_CALLBACK_URL']
    Var.update(
        dfu=DataFileUtil(callback_url),
        kbr=KBaseReport(callback_url),
        fpu=FunctionalProfileUtil(callback_url, service_ver='dev'),
        gapi=GenericsAPI(callback_url, service_ver='dev'),
    )
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url)
    self.report_util = kb_GenericsReport(self.callback_url)
    self.data_util = DataUtil(config)
    self.sampleservice_util = SampleServiceUtil(config)
    self.attr_util = AttributesUtil(config)
    self.matrix_util = MatrixUtil(config)
    self.matrix_types = [x.split(".")[1].split('-')[0]
                         for x in self.data_util.list_generic_types()]
    self.taxon_wsname = config['taxon-workspace-name']
    self.kbse = KBaseSearchEngine(config['search-url'])
    self.taxon_cache = dict()
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.shock_url = config['shock-url']
    self.dfu = DataFileUtil(self.callback_url)
    self.ru = ReadsUtils(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.mgu = MetagenomeUtils(self.callback_url)
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.ws = Workspace(self.ws_url, token=self.token)
    self.dfu = DataFileUtil(self.callback_url)
    self.scratch = config['scratch']
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    # self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url)
    self.report_util = kb_GenericsReport(self.callback_url)

    logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                        level=logging.INFO)
def create_html_report(self, callback_url, output_dir, workspace_name, objects_created):
    """
    Create the HTML report for VariationAnnotation. Assumes output_dir already
    contains an index.html produced by the annotation step.
    """
    dfu = DataFileUtil(callback_url)
    report = KBaseReport(callback_url)
    report_name = 'VariationAnnotationReport' + str(uuid.uuid4())
    report_shock_id = dfu.file_to_shock({'file_path': output_dir,
                                         'pack': 'zip'})['shock_id']
    html_file = {
        'shock_id': report_shock_id,
        'name': "index.html",
        'label': 'index.html',
        'description': 'HTML report for VariationAnnotation'
    }
    report_info = report.create_extended_report({
        'objects_created': objects_created,
        'direct_html_link_index': 0,
        'html_links': [html_file],
        'report_object_name': report_name,
        'workspace_name': workspace_name
    })
    return {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        """Write the ordered gene ids of a FeatureSet to a file, one per line."""
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        """Concatenate an iterable of strings into a single string."""
        str1 = ""
        for ele in s:
            str1 += ele
        return str1

    def unique(self, list1):
        """Print the unique values of a list."""
        unique_list = list(set(list1))
        for x in unique_list:
            print(x)

    def get_genomeid_from_featuresetid(self, genelistref):
        """Return the single genome ref that every feature in the set points to."""
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        for k, v in geneset['elements'].items():
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            exit("source of genome is not unique\n")
        else:
            # the dict keys hold the genome refs; the values are just flags
            return list(genome.keys())[0]
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url)
    self.data_util = DataUtil(config)
    self.matrix_types = [x.split(".")[1].split('-')[0]
                         for x in self.data_util.list_generic_types()]
def UploadFromMdscan(self, callback_url, params):
    """
    :param params: instance of type "UploadmfmdInParams" -> structure:
        parameter "path" of String, parameter "ws_name" of String,
        parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure:
        parameter "obj_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN UploadFromMdscan
    print('Extracting motifs')
    motifList = self.parse_mdscan_output(params['path'])
    print(motifList)

    MSO = motifList
    dfu = DataFileUtil(callback_url)
    save_objects_params = {}
    save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
    save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet',
                                       'data': MSO,
                                       'name': params['obj_name']}]
    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)
    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFromMdscan

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFromMdscan return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
@classmethod
def setUpClass(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_deseq'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{
            'service': 'kb_deseq',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = Workspace(cls.wsURL)
    cls.ws = Workspace(cls.wsURL, token=token)
    cls.serviceImpl = kb_deseq(cls.cfg)
    cls.serviceImpl.status(cls.ctx)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.gfu = GenomeFileUtil(cls.callback_url, service_ver='dev')
    cls.dfu = DataFileUtil(cls.callback_url)
    cls.ru = ReadsUtils(cls.callback_url)
    cls.rau = ReadsAlignmentUtils(cls.callback_url)
    cls.stringtie = kb_stringtie(cls.callback_url)
    cls.eu = ExpressionUtils(cls.callback_url)
    cls.deseq_runner = DESeqUtil(cls.cfg)

    suffix = int(time.time() * 1000)
    cls.wsName = "test_kb_stringtie_" + str(suffix)
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.dfu.ws_name_to_id(cls.wsName)

    # public on CI
    cls.expressionset_ref = '30957/52/41'
    cls.condition_1 = 'Ecoli_WT'
    cls.condition_2 = 'Ecoli_ydcR'
    # public on Appdev (these assignments override the CI refs above)
    cls.expressionset_ref = '60454/19'
    cls.condition_1 = 'WT'
    cls.condition_2 = 'Hy5'
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    # set up directory for files folder
    self.output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
    os.mkdir(self.output_dir)
    self.files_folder = os.path.join(self.output_dir, 'files')
    os.mkdir(self.files_folder)
    self.file_paths = []
    self.html_paths = []
    self.GenAPI = GenericsAPI(self.callback_url)
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.ws = Workspace(self.ws_url, token=self.token)
    self.dfu = DataFileUtil(self.callback_url)
    self.gsu = GenomeSearchUtil(self.callback_url)
    self.scratch = config['scratch']
def __init__(self, config):
    #BEGIN_CONSTRUCTOR
    self.workspaceURL = config['workspace-url']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.dfu = DataFileUtil(self.callback_url)
    self.scratch = os.path.abspath(config['scratch'])
    self.config = config
    #END_CONSTRUCTOR
    pass
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name, pangenome_name):
    """
    params:
        cb_url         : callback url
        scratch        : folder path for scratch (working) files
        Pangenome      : KBaseGenomes.Pangenome-like object
        workspace_name : workspace name (or numeric workspace id)
        pangenome_name : Pangenome display name
    Returns:
        pangenome_ref: Pangenome workspace reference
        pangenome_info: info on pangenome object
    """
    dfu = DataFileUtil(cb_url)
    meta = {}
    hidden = 0

    # dump pangenome to scratch for upload
    # data_path = os.path.join(scratch, pangenome_name + '.json')
    # json.dump(pangenome, open(data_path, 'w'))

    if isinstance(workspace_name, int) or workspace_name.isdigit():
        workspace_id = workspace_name
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': meta,
            'hidden': hidden
        }]
    }
    info = dfu.save_objects(save_params)[0]
    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))
    return {'pangenome_ref': ref, 'pangenome_info': info}
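# A minimal usage sketch for upload_pangenome(), assuming the SDK callback URL
# from the environment and an existing workspace. The Pangenome dict here is a
# bare-bones illustration only; consult the KBaseGenomes.Pangenome type spec
# for the full set of required fields.
import os

pangenome_stub = {
    'id': 'pg.1',                           # illustrative values only
    'name': 'example_pangenome',
    'genome_refs': ['123/4/5', '123/6/1'],  # hypothetical workspace refs
    'orthologs': [],
}
result = upload_pangenome(cb_url=os.environ['SDK_CALLBACK_URL'],
                          scratch='/kb/module/work/tmp',
                          Pangenome=pangenome_stub,
                          workspace_name='my_workspace',
                          pangenome_name='example_pangenome')
print(result['pangenome_ref'])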
def run_Gblocks(self, ctx, params):
    """
    Method for trimming MSAs of either DNA or PROTEIN sequences
    **
    **    input_type: MSA
    **    output_type: MSA
    :param params: instance of type "Gblocks_Params" (Gblocks Input Params)
       -> structure: parameter "workspace_name" of type "workspace_name"
       (** The workspace object refs are of form:
        ** objects = ws.get_objects([{'ref': params['workspace_id']+'/'+params['obj_name']}])
        ** "ref" means the entire name combining the workspace id and the object name
        ** "id" is a numerical identifier of the workspace or object, and should just be used for workspace
        ** "name" is a string identifier of a workspace or object.  This is received from Narrative.),
       parameter "desc" of String, parameter "input_ref" of type "data_obj_ref",
       parameter "output_name" of type "data_obj_name", parameter "trim_level" of Long,
       parameter "min_seqs_for_conserved" of Long, parameter "min_seqs_for_flank" of Long,
       parameter "max_pos_contig_nonconserved" of Long, parameter "min_block_len" of Long,
       parameter "remove_mask_positions_flag" of Long
    :returns: instance of type "Gblocks_Output" (Gblocks Output) -> structure:
       parameter "report_name" of type "data_obj_name",
       parameter "report_ref" of type "data_obj_ref"
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN run_Gblocks
    console = []
    invalid_msgs = []
    self.log(console, 'Running run_Gblocks with params=')
    self.log(console, "\n" + pformat(params))
    report = ''

    #### do some basic checks
    #
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'input_ref' not in params:
        raise ValueError('input_ref parameter is required')
    if 'output_name' not in params:
        raise ValueError('output_name parameter is required')

    #### Get the input_ref MSA object
    ##
    try:
        ws = workspaceService(self.workspaceURL, token=ctx['token'])
        objects = ws.get_objects([{'ref': params['input_ref']}])
        data = objects[0]['data']
        info = objects[0]['info']
        input_name = info[1]
        input_type_name = info[2].split('.')[1].split('-')[0]
    except Exception as e:
        # to get the full stack trace: traceback.format_exc()
        raise ValueError('Unable to fetch input_ref object from workspace: ' + str(e))

    if input_type_name == 'MSA':
        MSA_in = data
        if 'row_order' in MSA_in.keys():
            row_order = MSA_in['row_order']
        else:
            row_order = sorted(MSA_in['alignment'].keys())
        if 'default_row_labels' in MSA_in.keys():
            default_row_labels = MSA_in['default_row_labels']
        else:
            default_row_labels = dict()
            for row_id in row_order:
                default_row_labels[row_id] = row_id
        if len(row_order) < 2:
            self.log(invalid_msgs, "must have multiple records in MSA: " + params['input_ref'])

        # export features to FASTA file
        input_MSA_file_path = os.path.join(self.scratch, input_name + ".fasta")
        self.log(console, 'writing fasta file: ' + input_MSA_file_path)
        records = []
        for row_id in row_order:
            # using SeqIO makes multiline sequences.  (Gblocks doesn't care, but
            # FastTree doesn't like multiline, and I don't care enough to change code)
            #record = SeqRecord(Seq(MSA_in['alignment'][row_id]), id=row_id,
            #                   description=default_row_labels[row_id])
            #records.append(record)
            #SeqIO.write(records, input_MSA_file_path, "fasta")
            records.extend(['>' + row_id, MSA_in['alignment'][row_id]])
        with open(input_MSA_file_path, 'w') as input_MSA_file_handle:
            input_MSA_file_handle.write("\n".join(records) + "\n")

        # Determine whether nuc or protein sequences
        #
        NUC_MSA_pattern = re.compile(r"^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$")
        all_seqs_nuc = True
        for row_id in row_order:
            if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) is None:
                all_seqs_nuc = False
                break
    # Missing proper input_type
    #
    else:
        raise ValueError('Cannot yet handle input_ref type of: ' + input_type_name)

    # validate input data
    #
    N_seqs = 0
    L_first_seq = 0
    with open(input_MSA_file_path, 'r') as input_MSA_file_handle:
        for line in input_MSA_file_handle:
            if line.startswith('>'):
                N_seqs += 1
                continue
            if L_first_seq == 0:
                for c in line:
                    if c != '-' and c != ' ' and c != "\n":
                        L_first_seq += 1
    # min_seqs_for_conserved
    if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] is not None and int(params['min_seqs_for_conserved']) != 0:
        if int(params['min_seqs_for_conserved']) < int(0.5 * N_seqs) + 1:
            self.log(invalid_msgs, "Min Seqs for Conserved Pos (" + str(params['min_seqs_for_conserved']) + ") must be >= N/2+1 (N=" + str(N_seqs) + ", N/2+1=" + str(int(0.5 * N_seqs) + 1) + ")\n")
        if int(params['min_seqs_for_conserved']) > int(params['min_seqs_for_flank']):
            self.log(invalid_msgs, "Min Seqs for Conserved Pos (" + str(params['min_seqs_for_conserved']) + ") must be <= Min Seqs for Flank Pos (" + str(params['min_seqs_for_flank']) + ")\n")
    # min_seqs_for_flank
    if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] is not None and int(params['min_seqs_for_flank']) != 0:
        if int(params['min_seqs_for_flank']) > N_seqs:
            self.log(invalid_msgs, "Min Seqs for Flank Pos (" + str(params['min_seqs_for_flank']) + ") must be <= N (N=" + str(N_seqs) + ")\n")
    # max_pos_contig_nonconserved
    if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] is not None and int(params['max_pos_contig_nonconserved']) != 0:
        if int(params['max_pos_contig_nonconserved']) < 0:
            self.log(invalid_msgs, "Max Num Non-Conserved Pos (" + str(params['max_pos_contig_nonconserved']) + ") must be >= 0" + "\n")
        if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(params['max_pos_contig_nonconserved']) >= 32000:
            self.log(invalid_msgs, "Max Num Non-Conserved Pos (" + str(params['max_pos_contig_nonconserved']) + ") must be <= L first seq (" + str(L_first_seq) + ") and < 32000\n")
    # min_block_len
    if 'min_block_len' in params and params['min_block_len'] is not None and int(params['min_block_len']) != 0:
        if int(params['min_block_len']) < 2:
            self.log(invalid_msgs, "Min Block Len (" + str(params['min_block_len']) + ") must be >= 2" + "\n")
        if int(params['min_block_len']) > L_first_seq or int(params['min_block_len']) >= 32000:
            self.log(invalid_msgs, "Min Block Len (" + str(params['min_block_len']) + ") must be <= L first seq (" + str(L_first_seq) + ") and < 32000\n")
    # trim_level
    if 'trim_level' in params and params['trim_level'] is not None and int(params['trim_level']) != 0:
        if int(params['trim_level']) < 0 or int(params['trim_level']) > 2:
            self.log(invalid_msgs, "Trim Level (" + str(params['trim_level']) + ") must be >= 0 and <= 2" + "\n")

    if len(invalid_msgs) > 0:
        # load the method provenance from the context object
        self.log(console, "SETTING PROVENANCE")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = []
        provenance[0]['input_ws_objects'].append(params['input_ref'])
        provenance[0]['service'] = 'kb_gblocks'
        provenance[0]['method'] = 'run_Gblocks'

        # report
        report += "FAILURE\n\n" + "\n".join(invalid_msgs) + "\n"
        reportObj = {
            'objects_created': [],
            'text_message': report
        }
        reportName = 'gblocks_report_' + str(uuid.uuid4())
        report_obj_info = ws.save_objects({
            #'id': info[6],
            'workspace': params['workspace_name'],
            'objects': [
                {
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,
                    'provenance': provenance
                }
            ]
        })[0]

        self.log(console, "BUILDING RETURN OBJECT")
        returnVal = {
            'report_name': reportName,
            'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])
            #'output_ref': None
        }
        self.log(console, "run_Gblocks DONE")
        return [returnVal]

    ### Construct the command
    #
    #  e.g.
    #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
    #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks
    #
    gblocks_cmd = [self.GBLOCKS_bin]

    # check for necessary files
    if not os.path.isfile(self.GBLOCKS_bin):
        raise ValueError("no such file '" + self.GBLOCKS_bin + "'")
    if not os.path.isfile(input_MSA_file_path):
        raise ValueError("no such file '" + input_MSA_file_path + "'")
    if not os.path.getsize(input_MSA_file_path) > 0:
        raise ValueError("empty file '" + input_MSA_file_path + "'")

    # set the output path
    timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Gblocks names output blocks MSA by appending "-gb" to input file
    #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb')
    output_GBLOCKS_file_path = input_MSA_file_path + '-gb'
    output_aln_file_path = output_GBLOCKS_file_path

    # Gblocks is interactive and only accepts args from pipe input
    #if 'arg' in params and params['arg'] != None and params['arg'] != 0:
    #    fasttree_cmd.append('-arg')
    #    fasttree_cmd.append(val)

    # Run GBLOCKS, capture output as it happens
    #
    self.log(console, 'RUNNING GBLOCKS:')
    self.log(console, '    ' + ' '.join(gblocks_cmd))
    #report += "\n"+'running GBLOCKS:'+"\n"
    #report += '    '+' '.join(gblocks_cmd)+"\n"

    # FastTree requires shell=True in order to see input data
    env = os.environ.copy()
    #joined_fasttree_cmd = ' '.join(fasttree_cmd)  # redirect out doesn't work with subprocess unless you join command first
    #p = subprocess.Popen([joined_fasttree_cmd], ...
    p = subprocess.Popen(gblocks_cmd,
                         cwd=self.scratch,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=True,
                         universal_newlines=True,
                         env=env)
    #                    executable='/bin/bash')

    # write commands to process
    #
    # for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
    # for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks

    p.stdin.write("o" + "\n")  # open MSA file
    p.stdin.write(input_MSA_file_path + "\n")

    if 'trim_level' in params and params['trim_level'] is not None and int(params['trim_level']) != 0:
        p.stdin.write("b" + "\n")
        if int(params['trim_level']) >= 1:
            self.log(console, "changing trim level")
            p.stdin.write("5" + "\n")  # set to "half"
            if int(params['trim_level']) == 2:
                self.log(console, "changing trim level")
                p.stdin.write("5" + "\n")  # set to "all"
            elif int(params['trim_level']) > 2:
                raise ValueError("trim_level (" + str(params['trim_level']) + ") was not between 0-2")
        p.stdin.write("m" + "\n")

    # flank must precede conserved because it acts as upper bound for acceptable conserved values
    if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] is not None and int(params['min_seqs_for_flank']) != 0:
        self.log(console, "changing min_seqs_for_flank")
        p.stdin.write("b" + "\n")
        p.stdin.write("2" + "\n")
        p.stdin.write(str(params['min_seqs_for_flank']) + "\n")
        p.stdin.write("m" + "\n")

    if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] is not None and int(params['min_seqs_for_conserved']) != 0:
        self.log(console, "changing min_seqs_for_conserved")
        p.stdin.write("b" + "\n")
        p.stdin.write("1" + "\n")
        p.stdin.write(str(params['min_seqs_for_conserved']) + "\n")
        p.stdin.write("m" + "\n")

    if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] is not None and int(params['max_pos_contig_nonconserved']) > -1:
        self.log(console, "changing max_pos_contig_nonconserved")
        p.stdin.write("b" + "\n")
        p.stdin.write("3" + "\n")
        p.stdin.write(str(params['max_pos_contig_nonconserved']) + "\n")
        p.stdin.write("m" + "\n")

    if 'min_block_len' in params and params['min_block_len'] is not None and params['min_block_len'] != 0:
        self.log(console, "changing min_block_len")
        p.stdin.write("b" + "\n")
        p.stdin.write("4" + "\n")
        p.stdin.write(str(params['min_block_len']) + "\n")
        p.stdin.write("m" + "\n")

    p.stdin.write("g" + "\n")  # get blocks
    p.stdin.write("q" + "\n")  # quit
    p.stdin.close()
    p.wait()

    # Read output
    #
    while True:
        line = p.stdout.readline()
        #line = p.stderr.readline()
        if not line:
            break
        self.log(console, line.replace('\n', ''))

    p.stdout.close()
    #p.stderr.close()
    p.wait()
    self.log(console, 'return code: ' + str(p.returncode))
    #if p.returncode != 0:
    if p.returncode != 1:
        raise ValueError('Error running GBLOCKS, return code: ' + str(p.returncode) +
                         '\n\n' + '\n'.join(console))

    # Check that GBLOCKS produced output
    #
    if not os.path.isfile(output_GBLOCKS_file_path):
        raise ValueError("failed to create GBLOCKS output: " + output_GBLOCKS_file_path)
    elif not os.path.getsize(output_GBLOCKS_file_path) > 0:
        raise ValueError("created empty file for GBLOCKS output: " + output_GBLOCKS_file_path)

    # load the method provenance from the context object
    #
    self.log(console, "SETTING PROVENANCE")  # DEBUG
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]['input_ws_objects'] = []
    provenance[0]['input_ws_objects'].append(params['input_ref'])
    provenance[0]['service'] = 'kb_gblocks'
    provenance[0]['method'] = 'run_Gblocks'

    # reformat output to single-line FASTA MSA and check that output not empty
    # (often happens when param combinations don't produce viable blocks)
    #
    output_fasta_buf = []
    id_order = []
    this_id = None
    ids = dict()
    alignment = dict()
    L_alignment = 0
    L_alignment_set = False
    with open(output_GBLOCKS_file_path, 'r') as output_GBLOCKS_file_handle:
        for line in output_GBLOCKS_file_handle:
            line = line.rstrip()
            if line.startswith('>'):
                this_id = line[1:]
                output_fasta_buf.append('>' + re.sub(r'\s', '_', default_row_labels[this_id]))
                id_order.append(this_id)
                alignment[this_id] = ''
                if L_alignment != 0 and not L_alignment_set:
                    L_alignment_set = True
                continue
            output_fasta_buf.append(line)
            for c in line:
                if c != ' ' and c != "\n":
                    alignment[this_id] += c
                    if not L_alignment_set:
                        L_alignment += 1
    if L_alignment == 0:
        self.log(invalid_msgs, "params produced no blocks.  Consider changing to less stringent values")
    else:
        if 'remove_mask_positions_flag' in params and params['remove_mask_positions_flag'] is not None and params['remove_mask_positions_flag'] != '' and params['remove_mask_positions_flag'] == 1:
            self.log(console, "removing mask positions")
            # mark each column as kept ('+') or masked ('-') based on the first row
            mask = []
            new_alignment = dict()
            for i in range(0, L_alignment):
                mask_char = '+'
                if alignment[id_order[0]][i] == '-' \
                        or alignment[id_order[0]][i] == 'X' \
                        or alignment[id_order[0]][i] == 'x':
                    mask_char = '-'
                mask.append(mask_char)
            for row_id in id_order:
                new_alignment[row_id] = ''
                for i, c in enumerate(alignment[row_id]):
                    if mask[i] == '+':
                        new_alignment[row_id] += c
            alignment = new_alignment
            L_alignment = len(alignment[id_order[0]])

    # write fasta with tidied ids
    output_MSA_file_path = os.path.join(output_dir, params['output_name'] + '.fasta')
    with open(output_MSA_file_path, 'w') as output_MSA_file_handle:
        output_MSA_file_handle.write("\n".join(output_fasta_buf) + "\n")

    # Upload results
    #
    if len(invalid_msgs) == 0:
        self.log(console, "UPLOADING RESULTS")  # DEBUG

        # Build output_MSA structure
        #   first extract old info from MSA (labels, ws_refs, etc.)
        #
        MSA_out = dict()
        for key in MSA_in.keys():
            MSA_out[key] = MSA_in[key]
        # then replace with new info
        #
        MSA_out['alignment'] = alignment
        MSA_out['name'] = params['output_name']
        MSA_out['alignment_length'] = alignment_length = L_alignment
        MSA_name = params['output_name']
        MSA_description = ''
        if 'desc' in params and params['desc'] is not None and params['desc'] != '':
            MSA_out['desc'] = MSA_description = params['desc']

        # Store MSA_out
        #
        new_obj_info = ws.save_objects({
            'workspace': params['workspace_name'],
            'objects': [{
                'type': 'KBaseTrees.MSA',
                'data': MSA_out,
                'name': params['output_name'],
                'meta': {},
                'provenance': provenance
            }]
        })[0]

        # create CLW formatted output file
        max_row_width = 60
        id_aln_gap_width = 1
        gap_chars = ''
        for sp_i in range(id_aln_gap_width):
            gap_chars += ' '
        # DNA
        if all_seqs_nuc:
            strong_groups = {'AG': True,
                             'CTU': True}
            weak_groups = None
        # PROTEINS
        else:
            strong_groups = {'AST': True,
                             'EKNQ': True,
                             'HKNQ': True,
                             'DENQ': True,
                             'HKQR': True,
                             'ILMV': True,
                             'FILM': True,
                             'HY': True,
                             'FWY': True}
            weak_groups = {'ACS': True,
                           'ATV': True,
                           'AGS': True,
                           'KNST': True,
                           'APST': True,
                           'DGNS': True,
                           'DEKNQS': True,
                           'DEHKNQ': True,
                           'EHKNQR': True,
                           'FILMV': True,
                           'FHY': True}

        clw_buf = []
        clw_buf.append('CLUSTALW format of GBLOCKS trimmed MSA ' + MSA_name + ': ' + MSA_description)
        clw_buf.append('')
        long_id_len = 0
        aln_pos_by_id = dict()
        for row_id in row_order:
            aln_pos_by_id[row_id] = 0
            row_id_disp = default_row_labels[row_id]
            if long_id_len < len(row_id_disp):
                long_id_len = len(row_id_disp)
        full_row_cnt = alignment_length // max_row_width
        if alignment_length % max_row_width == 0:
            full_row_cnt -= 1
        for chunk_i in range(full_row_cnt + 1):
            for row_id in row_order:
                row_id_disp = re.sub(r'\s', '_', default_row_labels[row_id])
                for sp_i in range(long_id_len - len(row_id_disp)):
                    row_id_disp += ' '
                aln_chunk_upper_bound = (chunk_i + 1) * max_row_width
                if aln_chunk_upper_bound > alignment_length:
                    aln_chunk_upper_bound = alignment_length
                aln_chunk = alignment[row_id][chunk_i * max_row_width:aln_chunk_upper_bound]
                for c in aln_chunk:
                    if c != '-':
                        aln_pos_by_id[row_id] += 1
                clw_buf.append(row_id_disp + gap_chars + aln_chunk + ' ' + str(aln_pos_by_id[row_id]))
            # conservation line
            cons_line = ''
            for pos_i in range(chunk_i * max_row_width, aln_chunk_upper_bound):
                col_chars = dict()
                seq_cnt = 0
                for row_id in row_order:
                    char = alignment[row_id][pos_i]
                    if char != '-':
                        seq_cnt += 1
                        col_chars[char] = True
                if seq_cnt <= 1:
                    cons_char = ' '
                elif len(col_chars.keys()) == 1:
                    cons_char = '*'
                else:
                    strong = False
                    for strong_group in strong_groups.keys():
                        this_strong_group = True
                        for seen_char in col_chars.keys():
                            if seen_char not in strong_group:
                                this_strong_group = False
                                break
                        if this_strong_group:
                            strong = True
                            break
                    if not strong:
                        weak = False
                        if weak_groups is not None:
                            for weak_group in weak_groups.keys():
                                this_weak_group = True
                                for seen_char in col_chars.keys():
                                    if seen_char not in weak_group:
                                        this_weak_group = False
                                        break
                                if this_weak_group:
                                    weak = True
                    if strong:
                        cons_char = ':'
                    elif weak:
                        cons_char = '.'
                    else:
                        cons_char = ' '
                cons_line += cons_char
            lead_space = ''
            for sp_i in range(long_id_len):
                lead_space += ' '
            lead_space += gap_chars
            clw_buf.append(lead_space + cons_line)
            clw_buf.append('')

        # write clw to file
        clw_buf_str = "\n".join(clw_buf) + "\n"
        output_clw_file_path = os.path.join(output_dir, input_name + '-MSA.clw')
        with open(output_clw_file_path, "w") as output_clw_file_handle:
            output_clw_file_handle.write(clw_buf_str)

        # upload GBLOCKS FASTA output to SHOCK for file_links
        dfu = DFUClient(self.callbackURL)
        try:
            output_upload_ret = dfu.file_to_shock({'file_path': output_aln_file_path,
                                                   'make_handle': 0})
        except Exception:
            raise ValueError('error loading aln_out file to shock')

        # upload GBLOCKS CLW output to SHOCK for file_links
        try:
            output_clw_upload_ret = dfu.file_to_shock({'file_path': output_clw_file_path,
                                                       'make_handle': 0})
        except Exception:
            raise ValueError('error loading clw_out file to shock')

        # make HTML reports
        #
        # HERE

        # build output report object
        #
        self.log(console, "BUILDING REPORT")  # DEBUG
        reportName = 'gblocks_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [{'ref': params['workspace_name'] + '/' + params['output_name'],
                                 'description': 'GBLOCKS MSA'}],
            #'message': '',
            'message': clw_buf_str,
            'direct_html': '',
            #'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }
        reportObj['file_links'] = [{'shock_id': output_upload_ret['shock_id'],
                                    'name': params['output_name'] + '-GBLOCKS.FASTA',
                                    'label': 'GBLOCKS-trimmed MSA FASTA'},
                                   {'shock_id': output_clw_upload_ret['shock_id'],
                                    'name': params['output_name'] + '-GBLOCKS.CLW',
                                    'label': 'GBLOCKS-trimmed MSA CLUSTALW'}]

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
        #report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

    else:  # len(invalid_msgs) > 0
        reportName = 'gblocks_report_' + str(uuid.uuid4())
        report += "FAILURE:\n\n" + "\n".join(invalid_msgs) + "\n"
        reportObj = {
            'objects_created': [],
            'text_message': report
        }
        ws = workspaceService(self.workspaceURL, token=ctx['token'])
        report_obj_info = ws.save_objects({
            #'id': info[6],
            'workspace': params['workspace_name'],
            'objects': [
                {
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,
                    'provenance': provenance
                }
            ]
        })[0]
        report_info = dict()
        report_info['name'] = report_obj_info[1]
        report_info['ref'] = str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])

    # done
    returnVal = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    self.log(console, "run_Gblocks DONE")
    #END run_Gblocks

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method run_Gblocks return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
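# Gblocks exposes its options only through an interactive menu, so run_Gblocks()
# above drives it by writing keystrokes to stdin. A stripped-down sketch of that
# pattern (binary path and input file are placeholders matching the flow above):
import subprocess

p = subprocess.Popen(['<path/to/Gblocks>'],
                     stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE, universal_newlines=True)
menu_keys = [
    'o', '<path/to/input.fasta>',  # open the MSA file
    'b', '5',                      # block parameters -> allowed gap positions: "half"
    'm',                           # back to the main menu
    'g',                           # get blocks (runs the trim)
    'q',                           # quit
]
stdout, stderr = p.communicate('\n'.join(menu_keys) + '\n')
print(stdout)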
class FeatureSetBuilder: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_upload_featureset_from_diff_expr_params(self, params): """ _validate_upload_featureset_from_diff_expr_params: validates params passed to upload_featureset_from_diff_expr method """ log('start validating upload_featureset_from_diff_expr params') # check for required parameters for p in ['diff_expression_ref', 'workspace_name', 'p_cutoff', 'q_cutoff', 'fold_change_cutoff']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) p = params.get('fold_scale_type') if p and p != 'logarithm': raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used') @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError("Required keys {} not in supplied parameters" .format(", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning("Unexpected parameter {} supplied".format(param)) def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, workspace_name): """ _generate_report: generate summary report """ log('start creating report') output_html_files = self._generate_html_report(up_feature_set_ref_list, down_feature_set_ref_list) objects_created = list() for up_feature_set_ref in up_feature_set_ref_list: objects_created += [{'ref': up_feature_set_ref, 'description': 'Upper FeatureSet Object'}] for down_feature_set_ref in down_feature_set_ref_list: objects_created += [{'ref': down_feature_set_ref, 'description': 'Lower FeatureSet Object'}] for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list: objects_created += [{'ref': filtered_expression_matrix_ref, 'description': 'Filtered ExpressionMatrix Object'}] report_params = {'message': '', 'workspace_name': workspace_name, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') uppper_feature_content = '' for up_feature_set_ref in up_feature_set_ref_list: feature_set_obj = self.ws.get_objects2({'objects': [{'ref': up_feature_set_ref}]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) uppper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name, len(feature_ids)) lower_feature_content = '' for 
down_feature_set_ref in down_feature_set_ref_list: feature_set_obj = self.ws.get_objects2({'objects': [{'ref': down_feature_set_ref}]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name, len(feature_ids)) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>', uppper_feature_content) report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>', lower_feature_content) result_file.write(report_template) html_report.append({'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report'}) return html_report def _process_diff_expression(self, diff_expression_set_ref, result_directory, condition_label_pair): """ _process_diff_expression: process differential expression object info """ log('start processing differential expression object') diff_expr_set_data = self.ws.get_objects2({'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data'] set_items = diff_expr_set_data['items'] diff_expr_matrix_file_name = 'gene_results.csv' diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name) with open(diff_expr_matrix_file, 'w') as csvfile: fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for set_item in set_items: diff_expression_ref = set_item['ref'] diff_expression_data = self.ws.get_objects2({'objects': [{'ref': diff_expression_ref}]})['data'][0]['data'] label_string = set_item['label'] label_list = [x.strip() for x in label_string.split(',')] condition_1 = label_list[0] condition_2 = label_list[1] if condition_1 in condition_label_pair and condition_2 in condition_label_pair: genome_id = diff_expression_data['genome_ref'] matrix_data = diff_expression_data['data'] selected_diff_expression_ref = diff_expression_ref with open(diff_expr_matrix_file, 'a') as csvfile: row_ids = matrix_data.get('row_ids') row_values = matrix_data.get('values') writer = csv.DictWriter(csvfile, fieldnames=fieldnames) for pos, row_id in enumerate(row_ids): row_value = row_values[pos] writer.writerow({'gene_id': row_id, 'log2_fold_change': row_value[0], 'p_value': row_value[1], 'q_value': row_value[2]}) return diff_expr_matrix_file, genome_id, selected_diff_expression_ref def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name): """ _generate_feature_set: generate FeatureSet object KBaseCollections.FeatureSet type: typedef structure { string description; list<feature_id> element_ordering; mapping<feature_id, list<genome_ref>> elements; } FeatureSet; """ log('start saving KBaseCollections.FeatureSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) elements = {feature_id: [genome_id] for feature_id in feature_ids} feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression', 'element_ordering': feature_ids, 'elements': elements} object_type = 
    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """
        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition
                                            and (float(row_fold_change_cutoff)
                                                 >= comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition
                                              and (float(row_fold_change_cutoff)
                                                   <= -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generate filtered expression matrix
        """
        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in an EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref
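    # Filtering sketch (hypothetical rows): with feature_ids = {'geneB'} and
    #     data = {'row_ids': ['geneA', 'geneB'], 'values': [[1.0], [2.0]], ...}
    # the saved matrix keeps only
    #     {'row_ids': ['geneB'], 'values': [[2.0]], ...}
    # while all other matrix metadata is carried over from the source object.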
    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:
            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """
        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [
                            x for x in genome_refs
                            if x not in new_feature_set['elements'][element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)
        if params.get('description'):
            new_feature_set['description'] = params['description']
        return new_feature_set
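    # Merge sketch (hypothetical inputs): if a base FeatureSet maps
    # 'geneA' -> ['1/2/3'] and params supply feature_ids=['geneA', 'geneB'] for
    # genome '4/5/6' (and both ids exist in that genome), the result maps
    # 'geneA' -> ['1/2/3', '4/5/6'] and 'geneB' -> ['4/5/6'], with
    # element_ordering preserving first appearance.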
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: must be "logarithm", if used
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up FeatureSet object references
        down_feature_set_ref_list: list of generated down FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix
                                             object references
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref,
                result_directory,
                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file,
                params.get('p_cutoff'),
                params.get('q_cutoff'),
                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + \
                params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'),
                    up_feature_ids + down_feature_ids,
                    params.get('workspace_name'),
                    "",
                    diff_expr_matrix_ref,
                    filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name,
                _sanitize_name(condition_string),
                feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name,
                _sanitize_name(condition_string),
                feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list,
                                              down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal
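    # Call sketch (hypothetical refs and cutoffs):
    #     builder.upload_featureset_from_diff_expr({
    #         'diff_expression_ref': '1/2/3',
    #         'workspace_name': 'my_ws',
    #         'p_cutoff': 0.05,
    #         'q_cutoff': 0.05,
    #         'fold_change_cutoff': 1,
    #         'run_all_combinations': 1})
    # produces one up and one down FeatureSet per condition pair found in the set.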
Don't do both or neither" raise ValueError(error_msg) if run_all_combinations: condition_label_pairs = available_condition_label_pairs else: if self._check_input_labels(condition_pairs, available_condition_labels): condition_label_pairs = list() for condition_pair in condition_pairs: label_string = condition_pair['label_string'][0].strip() condition_labels = [x.strip() for x in label_string.split(',')] condition_label_pairs.append(condition_labels) up_feature_set_ref_list = list() down_feature_set_ref_list = list() filtered_expression_matrix_ref_list = list() for condition_label_pair in condition_label_pairs: condition_string = '-'.join(reversed(condition_label_pair)) diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression( diff_expression_set_ref, result_directory, condition_label_pair) up_feature_ids, down_feature_ids = self._process_matrix_file( diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'), params.get('fold_change_cutoff')) filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix') if params.get('expression_matrix_ref'): filtered_expression_matrix_ref = self._filter_expression_matrix( params.get('expression_matrix_ref'), up_feature_ids + down_feature_ids, params.get('workspace_name'), "", diff_expr_matrix_ref, filtered_em_name) filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref) feature_set_suffix = params.get('feature_set_suffix', "") up_feature_set_name = "{}_{}_up{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) up_feature_set_ref = self._generate_feature_set(up_feature_ids, genome_id, params.get('workspace_name'), up_feature_set_name) up_feature_set_ref_list.append(up_feature_set_ref) down_feature_set_name = "{}_{}_down{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) down_feature_set_ref = self._generate_feature_set(down_feature_ids, genome_id, params.get('workspace_name'), down_feature_set_name) down_feature_set_ref_list.append(down_feature_set_ref) returnVal = {'result_directory': result_directory, 'up_feature_set_ref_list': up_feature_set_ref_list, 'down_feature_set_ref_list': down_feature_set_ref_list, 'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list} report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, params.get('workspace_name')) returnVal.update(report_output) return returnVal def filter_matrix_with_fs(self, params): self.validate_params(params, ('feature_set_ref', 'workspace_name', 'expression_matrix_ref', 'filtered_expression_matrix_suffix')) ret = self.dfu.get_objects( {'object_refs': [params['feature_set_ref']]} )['data'][0] feature_set = ret['data'] feature_set_name = ret['info'][1] feature_ids = set(feature_set['elements'].keys()) filtered_matrix_ref = self._filter_expression_matrix( params['expression_matrix_ref'], feature_ids, params['workspace_name'], params['filtered_expression_matrix_suffix']) objects_created = [{'ref': filtered_matrix_ref, 'description': 'Filtered ExpressionMatrix Object'}] message = "Filtered Expression Matrix based of the {} feature ids present in {}"\ .format(len(feature_ids), feature_set_name) report_params = {'message': message, 'workspace_name': params['workspace_name'], 'objects_created': objects_created, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = 
    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name'},
                             {'genome', 'feature_ids', 'feature_ids_custom',
                              'base_feature_sets', 'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'],
                'report_ref': output['ref']}
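# A minimal usage sketch for FeatureSetBuilder.build_feature_set. The config keys
# mirror the ones read in FeatureSetBuilder.__init__ above; the function name,
# workspace name, genome reference and feature ids below are hypothetical
# placeholders, not values from this module.
def _example_build_feature_set(config):
    builder = FeatureSetBuilder(config)
    return builder.build_feature_set({
        'workspace_name': 'my_ws',            # hypothetical workspace
        'output_feature_set': 'my_set',       # name for the new FeatureSet
        'genome': '1/2/3',                    # hypothetical genome reference
        'feature_ids': ['geneA', 'geneB'],    # ids must exist in that genome
    })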
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # use a context manager so the TSV handle is closed before packaging
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
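# A minimal usage sketch for FeatureSetDownload: dump a FeatureSet to TSV and
# package it for download. The function name and the object reference are
# hypothetical placeholders; config must provide 'scratch' and 'workspace-url'
# as read in FeatureSetDownload.__init__ above.
def _example_download_featureset(config):
    downloader = FeatureSetDownload(config)
    params = {'featureset_ref': '1/2/3'}      # hypothetical FeatureSet reference
    fs_name, files = downloader.to_tsv(params)
    return downloader.export([files['file_path']], fs_name, params)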