class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        str1 = ""
        for ele in s:
            str1 += ele
        return str1

    def get_genomeid_from_featuresetid(self, genelistref):
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        for k, v in geneset['elements'].items():
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            exit("source of genome is not unique\n")
        else:
            return list(genome.keys())[0]
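# Usage sketch (illustrative, not part of the original module): exercising
# genelistutil inside a KBase job container, where SDK_CALLBACK_URL is set.
# The FeatureSet reference '123/4/5' and the output path are placeholders.
util = genelistutil()
util.download_genelist('123/4/5', '/kb/module/work/tmp/genes.txt')
print(util.get_genomeid_from_featuresetid('123/4/5'))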
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'provenance': [
                        {'service': 'GenomeFileUtil',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gff_path = "data/fasta_gff/RefSeq/Bacterial_Data/NC_021490.gff.gz"
    fasta_path = "data/fasta_gff/RefSeq/Bacterial_Data/NC_021490.fasta.gz"
    ws_obj_name = 'fungal_model'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    print('Uploading GFF file')
    result = cls.serviceImpl.fasta_gff_to_genome(
        cls.ctx, {
            'workspace_name': cls.wsName,
            'genome_name': 'MyGenome',
            'fasta_file': {'path': fasta_path},
            'gff_file': {'path': gff_path},
            'source': 'GFF',
            'type': 'Reference'
        })[0]
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    cls.genome_orig = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    print('testing GFF download by building the file')
    down_result = cls.serviceImpl.genome_to_gff(
        cls.ctx, {'genome_ref': result['genome_ref']})[0]
    print('Reuploading GFF file')
    new_result = cls.serviceImpl.fasta_gff_to_genome(
        cls.ctx, {
            'workspace_name': cls.wsName,
            'genome_name': 'MyGenome',
            'fasta_file': {'path': fasta_path},
            'gff_file': {'path': down_result['file_path']},
            'source': 'GFF',
            'type': 'Reference'
        })[0]
    cls.genome_new = data_file_cli.get_objects(
        {'object_refs': [new_result['genome_ref']]})['data'][0]['data']
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'provenance': [
                        {'service': 'GenomeFileUtil',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    # gbk_path = "data/Cyanidioschyzon/Cyanidioschyzon_merolae.ASM9120v1.30.gbff"
    gbk_path = "data/Cyanidioschyzon/Cyanidioschyzon_merolae_one_locus.gbff"
    ws_obj_name = 'Cyanidioschyzon_merolae_duplicate_test_orig'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.genbank_to_genome(cls.ctx, {
        'file': {'path': gbk_path},
        'workspace_name': cls.wsName,
        'genome_name': ws_obj_name,
        'generate_ids_if_needed': 1,
        'taxon_id': '511145',
        'source': "Ensembl user"
    })[0]
    data_file_cli = DataFileUtil(
        os.environ['SDK_CALLBACK_URL'],
        token=cls.ctx['token'],
        service_ver='dev'
    )
    cls.genome_orig = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    print('testing Genbank download by building the file')
    cls.serviceImpl.export_genome_as_genbank(cls.ctx, {
        'input_ref': result['genome_ref']})
    new_gbk_path = ("/kb/module/work/tmp/Cyanidioschyzon_merolae_duplicate_test_orig/"
                    "KBase_derived_Cyanidioschyzon_merolae_duplicate_test_orig.gbff")
    new_ws_obj_name = 'Cyanidioschyzon_merolae_duplicate_test_new'
    new_result = cls.serviceImpl.genbank_to_genome(cls.ctx, {
        'file': {'path': new_gbk_path},
        'workspace_name': cls.wsName,
        'genome_name': new_ws_obj_name,
        'generate_ids_if_needed': 1,
        'taxon_id': '511145',
        'source': "Ensembl user"
    })[0]
    cls.genome_new = data_file_cli.get_objects(
        {'object_refs': [new_result['genome_ref']]})['data'][0]['data']
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns a list of (fasta_path, upa) tuples for the given object reference.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use AssemblyUtil to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type')

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))
    return fasta_paths
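# Usage sketch (illustrative): calling load_fastas inside a KBase job, where
# SDK_CALLBACK_URL is set; the workspace URL and UPA '123/4/5' are placeholders.
config = {
    'callback_url': os.environ['SDK_CALLBACK_URL'],
    'workspace-url': 'https://kbase.us/services/ws',
}
for fasta_path, assembly_upa in load_fastas(config, '/kb/module/work/tmp', '123/4/5'):
    print(fasta_path, assembly_upa)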
def mock_dfu_get_objects(params):
    logging.info('Mocking `dfu.get_objects(%s)`' % params)
    upa = ref_leaf(params['object_refs'][0])
    fp = _glob_upa(GET_OBJECTS_DIR, upa)
    # Download and cache
    if fp is None:
        logging.info('Calling in cache mode `dfu.get_objects`')
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        obj = dfu.get_objects(params)
        fp = os.path.join(
            mkcache(GET_OBJECTS_DIR),
            file_safe_ref(upa) + TRANSFORM_NAME_SEP + obj['data'][0]['info'][1] + '.json')
        with open(fp, 'w') as fh:
            json.dump(obj, fh)
        return obj
    # Pull from cache
    else:
        with open(fp) as fh:
            obj = json.load(fh)
        return obj
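# Sketch of wiring the cache-backed mock above into a test (assumption: the
# code under test constructs DataFileUtil itself, so the client class is
# patched; the patch target is illustrative and depends on the module's imports).
from unittest.mock import MagicMock, patch

mock_dfu = MagicMock()
mock_dfu.get_objects.side_effect = mock_dfu_get_objects
with patch('MyModule.MyModuleImpl.DataFileUtil', return_value=mock_dfu):
    pass  # run the code under test here; repeated runs hit the JSON cache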
class geneminerutils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        # self.hr = htmlreportutils()
        # self.config = config
        # self.params = params

    def download_genelist(self, genelistref):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        # geneset_query = ",".join(geneset)
        return geneset['element_ordering']
        # with open(genesetfile, 'w') as filehandle:
        #     for item in geneset['element_ordering']:
        #         filehandle.write('%s\n' % item)
        # return genesetfile

    def generate_query(self, genomenetmine_dyn_url, genelistref, species, pheno):
        # Example inputs used during development:
        # pheno = ["disease"]
        # species = "potatoknet"
        # genes = ["PGSC0003DMG400006345", "PGSC0003DMG400012792",
        #          "PGSC0003DMG400033029", "PGSC0003DMG400016390",
        #          "PGSC0003DMG400039594", "PGSC0003DMG400028153"]
        # genomenetmine_dyn_url = 'http://ec2-18-236-212-118.us-west-2.compute.amazonaws.com:5000/networkquery/api'
        genes = self.download_genelist(genelistref)
        gsp = genescoreparser()
        x = gsp.summary(genomenetmine_dyn_url, genes, species, pheno)
        return x

    def get_evidence(self, genomenetmine_dyn_url, genelistref, species, pheno):
        genes = self.download_genelist(genelistref)
        ep = evidenceparser()
        x = ep.summary(genomenetmine_dyn_url, genes, species, pheno)
        return x
def test_gff_and_metagenome_to_metagenome(self):
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    result = self.serviceImpl.ws_obj_gff_to_metagenome(
        self.ctx, {
            'workspace_name': self.wsName,
            'genome_name': 'MyGenome',
            'gff_file': {'path': self.gff_path},
            'ws_ref': self.metagenome_ref,
            'source': 'GFF',
            'type': 'Reference',
            'genome_type': 'Metagenome',
            'generate_missing_genes': True,
        })[0]
    self.assertTrue('metagenome_ref' in result)
    ret = dfu.get_objects({'object_refs': [result['metagenome_ref']]})['data'][0]
    metagenome = ret['data']
    info = ret['info']
    # type checks
    self.assertTrue('KBaseMetagenomes.AnnotatedMetagenomeAssembly' in info[2])
    # make sure it's the same as the original
    self._compare_features(self.genome_orig, metagenome)
def check_CDS_warnings(self, result, test_name):
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    print("IN TEST NAME : " + str(test_name))
    cds_warning_count = 0
    cds_with_warning_count = 0
    if 'cdss' in genome:
        total_cds_count = len(genome['cdss'])
        for feature in genome['cdss']:
            if 'warnings' in feature:
                if test_name == "test_jgi_bacterial_fasta_gff2_to_genome":
                    print(str(feature['id']) + " warnings:" + str(feature['warnings']))
                    print("Location: " + str(feature['location']))
                    print("Translation: " + feature['protein_translation'])
                    print("DNA Sequence: " + feature["dna_sequence"])
                cds_with_warning_count = cds_with_warning_count + 1
                cds_warning_count = cds_warning_count + len(feature['warnings'])
        print("Total CDS: " + str(total_cds_count))
        print("CDS Warning Count: " + str(cds_warning_count))
        print("CDSs with a warning Count: " + str(cds_with_warning_count))
        print("Percent CDS with warning: " +
              str((cds_with_warning_count / float(total_cds_count)) * 100))
def print_genome_warnings(self, result):
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    if 'warnings' in genome:
        print("Genome warnings: " + str(genome['warnings']))
def load_fastas(config, scratch, upa):
    '''
    Returns a list of (fasta_path, upa) tuples for the given object reference.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use AssemblyUtil to get the fasta file
        file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]

    fasta_paths = []
    for genome_upa in upas:
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2(
            {'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref": assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))
    return fasta_paths
def test_no_sequence_kept(self):
    # features, cdss, and non_coding_features should not have sequences in them.
    print("test_no_sequence_kept")
    gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
    ws_obj_name = 'no_sequence'
    result = self.getImpl().genbank_to_genome(
        self.getContext(), {
            'file': {'path': gbk_path},
            'workspace_name': self.getWsName(),
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1
        })[0]
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=self.__class__.token,
                                 service_ver='dev')
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    count_features_with_dna_sequence = 0
    for feature in genome['features']:
        if "dna_sequence" in feature:
            count_features_with_dna_sequence += 1
    count_non_coding_features_with_sequence = 0
    for feature in genome['non_coding_features']:
        if "dna_sequence" in feature:
            count_non_coding_features_with_sequence += 1
    count_cdss_with_sequence = 0
    for feature in genome['cdss']:
        if "dna_sequence" in feature:
            count_cdss_with_sequence += 1
    self.assertTrue(count_features_with_dna_sequence == 0,
                    "No features should have DNA sequences.")
    self.assertTrue(count_non_coding_features_with_sequence == 0,
                    "No non_coding_features should have DNA sequences.")
    self.assertTrue(count_cdss_with_sequence == 0,
                    "No CDSs should have DNA sequences.")
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    :param params: instance of type "BuildSeqIn" -> structure: parameter
       "workspace_name" of String, parameter "SequenceSetRef" of String,
       parameter "fasta_outpath" of String
    :returns: instance of type "BuildSeqOut" -> structure: parameter
       "fasta_outpath" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)
    bu = BackgroundUtils()
    TU = TestUtils()
    if params['TESTFLAG'] and params['background']:
        targetpath = '/kb/module/work/tmp/testgenome.fa'
        TU.GetGenome(targetpath)
        bu.BuildBackground(targetpath)
    elif params['background']:
        ws = Workspace('https://appdev.kbase.us/services/ws')
        subset = ws.get_object_subset([{
            'included': ['/features/[*]/location',
                         '/features/[*]/id', '/assembly_ref'],
            'ref': params['genome_ref']}])
        aref = subset[0]['data']['assembly_ref']
        assembly_ref = {'ref': aref}
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
        bu.BuildBackground(fasta_file)

    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
    outFile = open(params['fasta_outpath'], 'w')
    for s in SeqSet['sequences']:
        sname = '>' + s['sequence_id'] + '\n'
        outFile.write(sname)
        sseq = s['sequence'] + '\n'
        outFile.write(sseq)
    outFile.close()

    fu = FastaUtils()
    if params['mask_repeats']:
        fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def mock_dfu_get_objects(params):
    logging.info('Mocking dfu.get_objects(%s)' % params)
    upa = ref_leaf(params['object_refs'][0])
    fp = _glob_upa(GET_OBJECTS_DIR, upa)
    # Download and cache
    if fp is None:
        logging.info('Calling in cache mode `dfu.get_objects`')
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        obj = dfu.get_objects(params)
        fp = os.path.join(
            mkcache(GET_OBJECTS_DIR),
            file_safe_ref(upa) + '__' + obj['data'][0]['info'][1] + '.json')
        with open(fp, 'w') as fh:
            json.dump(obj, fh)
    # Pull from cache
    else:
        with open(fp) as fh:
            obj = json.load(fh)
    # swap in arg obj (replace_obj is captured from the enclosing scope)
    if replace_obj is not None:
        ref = params['object_refs'][0]
        if ref in replace_obj:
            obj['data'][0]['data'] = replace_obj[ref]
    return obj
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    :param params: instance of type "BuildSeqIn" -> structure: parameter
       "workspace_name" of String, parameter "SequenceSetRef" of String,
       parameter "fasta_outpath" of String
    :returns: instance of type "BuildSeqOut" -> structure: parameter
       "fasta_outpath" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)

    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
    outFile = open(params['fasta_outpath'], 'w')
    for s in SeqSet['sequences']:
        sname = '>' + s['sequence_id'] + '\n'
        outFile.write(sname)
        sseq = s['sequence'] + '\n'
        outFile.write(sseq)
    outFile.close()

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
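# Usage sketch (illustrative): `impl` and `ctx` come from the SDK runtime;
# the SequenceSet ref and output path below are placeholders.
output = impl.BuildFastaFromSequenceSet(ctx, {
    'SequenceSetRef': '123/4/5',
    'fasta_outpath': '/kb/module/work/tmp/seqs.fa',
})[0]
print(output['fasta_outpath'])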
def _get_assembly(self, genome):
    if 'assembly_ref' in genome:
        assembly_ref = genome['assembly_ref']
    else:
        assembly_ref = genome['contigset_ref']
    log('Assembly reference = ' + assembly_ref)
    log('Downloading assembly')
    dfu = DataFileUtil(self.cfg.callbackURL)
    log('object_refs:' + self.genome_ref + ";" + assembly_ref)
    assembly_data = dfu.get_objects(
        {'object_refs': [self.genome_ref + ";" + assembly_ref]}
    )['data'][0]['data']
    if isinstance(assembly_data['contigs'], dict):
        # is an assembly
        circular_contigs = set([x['contig_id']
                                for x in list(assembly_data['contigs'].values())
                                if x.get('is_circ')])
    else:
        # is a contig set
        circular_contigs = set([x['id'] for x in assembly_data['contigs']
                                if x.get('replicon_geometry') == 'circular'])
    au = AssemblyUtil(self.cfg.callbackURL)
    assembly_file_path = au.get_assembly_as_fasta(
        {'ref': self.genome_ref + ";" + assembly_ref})['path']
    return assembly_file_path, circular_contigs
class VariationToVCF:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    @staticmethod
    def is_gz_file(filepath):
        with open(filepath, 'rb') as test_f:
            return binascii.hexlify(test_f.read(2)) == b'1f8b'

    def export_as_vcf(self, params):
        if 'input_var_ref' not in params:
            raise ValueError('Cannot export Variation- no input_var_ref field defined.')

        file = self.variation_to_vcf({'variation_ref': params['input_var_ref']})

        export_dir = os.path.join(self.scratch, file['variation_name'])
        os.makedirs(export_dir)
        try:
            shutil.move(file['path'],
                        os.path.join(export_dir, os.path.basename(file['path'])))
        except shutil.Error as e:
            exit(e)

        dfupkg = self.dfu.package_for_download({
            'file_path': export_dir,
            'ws_refs': [params['input_var_ref']]
        })
        return {'shock_id': dfupkg['shock_id']}

    def variation_to_vcf(self, params):
        self.validate_params(params)
        print('downloading ws object data: ' + params["variation_ref"])

        variation_obj = self.dfu.get_objects(
            {'object_refs': [params['variation_ref']]})['data'][0]
        ws_type = variation_obj['info'][2]
        obj_name = variation_obj['info'][1]

        if 'KBaseGwasData.Variations' in ws_type:
            dl_path = self.process_vcf(self.scratch, variation_obj['data'])
        else:
            raise ValueError('Cannot write data to VCF; invalid WS type (' + ws_type +
                             '). Supported type is KBaseGwasData.Variations')

        return {'path': dl_path, 'variation_name': obj_name}

    def process_vcf(self, output_vcf_file_path, data):
        obj = self.dfu.shock_to_file({
            'handle_id': data['vcf_handle_ref'],
            'file_path': output_vcf_file_path,
        })
        return obj['file_path']

    def validate_params(self, params):
        for key in ['variation_ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
class SampleServiceUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.srv_wiz_url = config['srv-wiz-url']
        self.sample_url = config.get('kbase-endpoint') + '/sampleservice'
        self.dfu = DataFileUtil(self.callback_url)
        self.sample_ser = SampleService(self.sample_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def get_sample_service_url(self):
        return self.sample_url

    def get_sample(self, sample_id, version=None):
        sample_url = self.get_sample_service_url()
        headers = {"Authorization": self.token}
        params = {"id": sample_id, "version": version}
        payload = {
            "method": "SampleService.get_sample",
            "id": str(uuid.uuid4()),
            "params": [params],
            "version": "1.1"
        }
        resp = requests.post(url=sample_url, headers=headers,
                             data=json.dumps(payload))
        resp_json = resp.json()
        if resp_json.get('error'):
            raise RuntimeError(f"Error from SampleService - {resp_json['error']}")
        sample = resp_json['result'][0]
        # sample = self.sample_ser.get_sample(params)[0]
        return sample

    def get_ids_from_samples(self, sample_set_ref):
        logging.info('start retrieving sample ids from sample set')
        sample_set = self.dfu.get_objects(
            {"object_refs": [sample_set_ref]})['data'][0]['data']
        samples = sample_set['samples']
        data_ids = []
        for sample in samples:
            sample_id = sample.get('id')
            version = sample.get('version')
            sample_data = self.get_sample(sample_id, version=version)
            data_id = sample_data['name']
            data_ids.append(data_id)
        return data_ids
def DownloadMotifSet(self, refList, callback):
    MotifSetDict = {}
    dfu = DataFileUtil(callback)
    for ref in refList:
        get_objects_params = {'object_refs': [ref]}
        MotifSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        MotifSetDict[ref] = deepcopy(MotifSet)
    return MotifSetDict
class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        # concatenate the elements of s into a single string
        str1 = ""
        for ele in s:
            str1 += ele
        return str1

    def unique(self, list1):
        # print the unique values of list1
        list_set = set(list1)
        unique_list = list(list_set)
        for x in unique_list:
            print(x)

    def get_genomeid_from_featuresetid(self, genelistref):
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        print(type(geneset['elements'].values()))
        for k, v in geneset['elements'].items():
            print(self.listToString(v))
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            exit("source of genome is not unique\n")
        else:
            # return the genome reference (the dict key, not the placeholder value)
            return list(genome.keys())[0]
def DownloadMotifSet(refList, callback):
    MotifSetDict = {}
    # init DFU
    dfu = DataFileUtil(callback)
    for ref in refList:
        get_objects_params = {'object_refs': [ref]}
        # get_ss_params = {'object_refs': [params['SS_ref']]}
        MotifSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        MotifSetDict[ref] = deepcopy(MotifSet)
    return MotifSetDict
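# Usage sketch for the standalone variant above (placeholder refs; assumes a
# KBase job environment where SDK_CALLBACK_URL is set):
motif_sets = DownloadMotifSet(['1/2/3', '4/5/6'], os.environ['SDK_CALLBACK_URL'])
for ref, motif_set in motif_sets.items():
    print(ref, list(motif_set.keys()))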
def _fetch_df_from_refs(self, object_refs):
    dfu = DataFileUtil(self.callback_url)
    tables = dfu.get_objects({'object_refs': object_refs})['data']
    lambda_df = self._fetch_df_from_json(tables[0])
    stoich_df = self._fetch_df_from_json(tables[1])
    df = stoich_df.merge(lambda_df["lambda_O2"],
                         left_index=True, right_index=True)
    return df
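# Minimal pandas sketch (synthetic data, not from the original module) showing
# what the merge above does: join the single lambda_O2 column onto the
# stoichiometry table by shared row index.
import pandas as pd

stoich_df = pd.DataFrame({'C': [1, 2]}, index=['cpd1', 'cpd2'])
lambda_df = pd.DataFrame({'lambda_O2': [0.5, 0.7]}, index=['cpd1', 'cpd2'])
df = stoich_df.merge(lambda_df['lambda_O2'], left_index=True, right_index=True)
print(df)  # columns: C, lambda_O2; rows: cpd1, cpd2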
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'provenance': [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gbk_path = "data/Arabidopsis_gbff/A_thaliana_Ensembl_TAIR10_38_chr4_minus_xref.gbff"
    ws_obj_name = 'Yeast_chromosome1'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.genbank_to_genome(
        cls.ctx, {
            'file': {'path': gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'source': "Ensembl"
        })[0]
    # print("HERE IS THE RESULT:")
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.ctx['token'],
                                 service_ver='dev')
    cls.genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    json.dump(cls.genome,
              open(cls.cfg['scratch'] + "/relationship_test_genome.json", 'w'))
    cls.gene_ids = set((x['id'] for x in cls.genome['features']))
    cls.nc_feat_ids = set((x['id'] for x in cls.genome['non_coding_features']))
    cls.mrna_ids = set((x['id'] for x in cls.genome['mrnas']))
    cls.cds_ids = set((x['id'] for x in cls.genome['cdss']))
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'provenance': [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    config_file = os.environ['KB_DEPLOYMENT_CONFIG']
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gff_path = "data/e_coli/NC_000913.3.gff3"
    fna_path = "data/e_coli/NC_000913.3.fasta"
    # fna_path = "data/e_coli/GCF_000005845.2_ASM584v2.fasta"
    ws_obj_name = 'ecoli_contigs'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.fasta_gff_to_genome(
        cls.ctx, {
            'gff_file': {'path': gff_path},
            'fasta_file': {'path': fna_path},
            'taxon_id': 511145,
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_missing_genes': 1,
            'generate_ids_if_needed': 1
        })[0]
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.ctx['token'],
                                 service_ver='dev')
    dfu_result = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})
    cls.genome = dfu_result['data'][0]['data']
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'provenance': [{
            'service': 'GenomeFileUtil',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
    ws_obj_name = 'ecoli_genome'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.genbank_to_genome(
        cls.ctx, {
            'file': {'path': gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'source': "RefSeq Reference"
        })[0]
    # print("HERE IS THE RESULT:")
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.ctx['token'],
                                 service_ver='dev')
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    cls.assembly_ref = genome["assembly_ref"]
def test_gff_and_metagenome_to_metagenome(self):
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    result = self.serviceImpl.ws_obj_gff_to_metagenome(self.ctx, {
        'workspace_name': self.wsName,
        'genome_name': 'MyGenome',
        'gff_file': {'path': self.gff_path},
        'ws_ref': self.metagenome_ref,
        'source': 'GFF',
        'type': 'Reference',
        'genome_type': 'Metagenome',
        'is_metagenome': True,
        'generate_missing_genes': True,
        'taxon_id': '3702',
    })[0]
    metagenome = dfu.get_objects(
        {'object_refs': [result['metagenome_ref']]})['data'][0]['data']
    # make sure it's the same as the original
    self._compare_features(self.genome_orig, metagenome)
def check_CDS_warnings(self, result, test_name):
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    print("IN TEST NAME : " + str(test_name))
    cds_warning_count = 0
    cds_with_warning_count = 0
    if 'cdss' in genome:
        total_cds_count = len(genome['cdss'])
        for feature in genome['cdss']:
            if 'warnings' in feature:
                cds_with_warning_count = cds_with_warning_count + 1
                cds_warning_count = cds_warning_count + len(feature['warnings'])
        print("Total CDS: " + str(total_cds_count))
        print("CDS Warning Count: " + str(cds_warning_count))
        print("CDSs with a warning Count: " + str(cds_with_warning_count))
        print("Percent CDS with warning: " +
              str((cds_with_warning_count / float(total_cds_count)) * 100))
def get_upa_name(ws_url, cb_url, upa, is_test):
    '''
    Look up the object name for a workspace UPA, trying the Workspace service
    first and falling back to DataFileUtil.
    '''
    if is_test:
        return "test_object"

    ws = Workspace(ws_url)
    objs = ws.get_object_info3({'objects': [{'ref': upa}]})
    upa_names = [info[1] for info in objs['infos']]
    if len(upa_names) > 0:
        return upa_names[0]

    dfu = DataFileUtil(cb_url)
    objs = dfu.get_objects({'object_refs': [upa]})['data']
    upa_names = [obj['info'][1] for obj in objs]
    if len(upa_names) > 0:
        return upa_names[0]
    else:
        raise ValueError("Could not find name of workspace object with id %s" % upa)
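# Usage sketch (placeholder URLs and UPA; is_test short-circuits the lookup
# so unit tests don't need live services):
name = get_upa_name('https://kbase.us/services/ws',
                    os.environ['SDK_CALLBACK_URL'],
                    '123/4/5', is_test=False)
print(name)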
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'provenance': [
                        {'service': 'GenomeFileUtil',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('GenomeFileUtil'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeFileUtil(cls.cfg)
    gbk_path = "data/Arabidopsis_gbff/Arab_Chloro_Modified.gbff"
    ws_obj_name = 'ArabidopsisChloro'
    suffix = int(time.time() * 1000)
    cls.wsName = "test_GenomeFileUtil_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    result = cls.serviceImpl.genbank_to_genome(
        cls.ctx, {
            'file': {'path': gbk_path},
            'workspace_name': cls.wsName,
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'source': "RefSeq Latest"
        })[0]
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.ctx['token'],
                                 service_ver='dev')
    cls.genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    json.dump(cls.genome, open(cls.cfg['scratch'] + "/test_genome.json", 'w'))
def test_genome_gff_to_genome(self):
    result = self.serviceImpl.ws_obj_gff_to_genome(
        self.ctx, {
            'workspace_name': self.wsName,
            'genome_name': 'MyGenome',
            'gff_file': {'path': self.gff_path},
            'ws_ref': self.genome_ref,
            'source': 'GFF',
            'type': 'Reference',
            'taxon_id': '243276'
        })[0]
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    genome = dfu.get_objects(
        {"object_refs": [result['genome_ref']]})['data'][0]['data']
    self.feature_list_comparison(self.genome_orig, genome, 'features')
    self.feature_list_comparison(self.genome_orig, genome, 'cdss')
    self.feature_list_comparison(self.genome_orig, genome, 'mrnas')
    self.feature_list_comparison(self.genome_orig, genome, 'non_coding_features')
def test_same_genome(self):
    gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
    ws_obj_name = 'ecoli_genome'
    existing_assembly_ref = self.__class__.assembly_ref
    result = self.getImpl().genbank_to_genome(
        self.getContext(), {
            'file': {'path': gbk_path},
            'workspace_name': self.getWsName(),
            'genome_name': ws_obj_name,
            'generate_ids_if_needed': 1,
            'generate_missing_genes': 1,
            'source': 'refseq reference',
            'use_existing_assembly': existing_assembly_ref
        })[0]
    data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    genome = data_file_cli.get_objects(
        {'object_refs': [result['genome_ref']]})['data'][0]['data']
    self.assertTrue(genome['assembly_ref'] == existing_assembly_ref,
                    "Same file did not keep the same assembly ref")
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        writer = csv.DictWriter(open(files['file_path'], 'w'), header,
                                delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for feat in fs_dicts:
            writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)
        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)})
            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']})
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))
        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })
        return {'shock_id': package_details['shock_id']}
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params: validates params passed
        to the upload_featureset_from_diff_expr method
        """
        log('start validating upload_featureset_from_diff_expr params')
        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present.
        Warns if unexpected parameters appear."""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """
        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]
        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']
            feature_set_name = feature_set_info[1]
            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())
            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']
            feature_set_name = feature_set_info[1]
            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())
            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<tr><td>Upper_FeatureSet</td></tr>', upper_feature_content)
                report_template = report_template.replace(
                    '<tr><td>Lower_FeatureSet</td></tr>', lower_feature_content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """
        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data']
        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']
            diff_expression_data = self.ws.get_objects2(
                {'objects': [{'ref': diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """
        log('start saving KBaseCollections.FeatureSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])
        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """
        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            (float(row_fold_change_cutoff) >=
                                             comp_fold_change_cutoff))
                    down_matches_condition = (p_value_condition and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generate filtered expression matrix
        """
        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in an EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])
        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:
            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')
        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))
        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """
        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': len(ids),
            'structured_query': {"$or": [{"feature_id": x} for x in ids]},
            'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))
        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']})['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [
                    x for x in base_set['element_ordering']
                    if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [
                            x for x in genome_refs
                            if x not in new_feature_set['elements'][element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)
        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from
        RNASeqDifferentialExpression based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated upper FeatureSet object references
        down_feature_set_ref_list: list of generated down FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix refs
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref, result_directory, condition_label_pair)

            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'),
                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + \
                params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'),
                    up_feature_ids + down_feature_ids,
                    params.get('workspace_name'), "",
                    diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(
                up_feature_ids, genome_id, params.get('workspace_name'), up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(
                down_feature_ids, genome_id, params.get('workspace_name'), down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list,
                                              down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref',
                                      'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]})['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids,
            params['workspace_name'], params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based on the {} feature ids present in {}".format(
            len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name'},
                             {'genome', 'feature_ids', 'feature_ids_custom',
                              'base_feature_sets', 'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
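# Usage sketch (illustrative): building a FeatureSet from explicit feature ids.
# The config values, workspace name, genome ref, and feature ids below are
# placeholders; in a real app they come from the SDK runtime and user input.
config = {
    'workspace-url': 'https://kbase.us/services/ws',
    'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
    'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
    'shock-url': 'https://kbase.us/services/shock-api',
    'scratch': '/kb/module/work/tmp',
}
builder = FeatureSetBuilder(config)
ret = builder.build_feature_set({
    'workspace_name': 'my_workspace',
    'output_feature_set': 'my_feature_set',
    'genome': '123/4/5',
    'feature_ids': ['geneA', 'geneB'],
})
print(ret['feature_set_ref'])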