class MotifSaver:
    def __init__(self, callback, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def saveMotifSet(self, motifset, params):
        if isinstance(motifset, list):
            logging.info('Saving multiple motifset objects...')
            # TODO: accept lists of constructed motif set object
            # TODO: check if list is a save_objects list or list of motifsets process accordingly
            # TODO: accept list of object names
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset[0],
                    'name': str(uuid.uuid4())
                }]
            })[0]
            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        elif isinstance(motifset, dict):
            logging.info('Saving a single motifset object...')
            # TODO: accept object name
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset,
                    'name': str(uuid.uuid4())
                }]
            })[0]
            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        else:
            raise ValueError(
                'Input to motif saver should be either: \n'
                '1. a list of constructed KBaseGeneRegulation.MotifSet objects (dictionary)\n'
                '2. a single KBaseGeneRegulation.MotifSet object (dictionary)')
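A minimal usage sketch for the class above, assuming it runs inside a KBase SDK job where SDK_CALLBACK_URL points at the callback server; the workspace name and the motif set dictionary are hypothetical placeholders, not a complete KBaseGeneRegulation.MotifSet.

import os

def example_save_motifset():
    # Hypothetical inputs for illustration only.
    saver = MotifSaver(os.environ['SDK_CALLBACK_URL'], '/kb/module/work/tmp')
    motifset = {'Condition': 'example', 'SequenceSet_ref': '1/2/3', 'Motifs': []}
    # Returns a "wsid/objid/version" reference string.
    return saver.saveMotifSet(motifset, {'ws_name': 'my_workspace'})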
def test_AssemblySet_input(self):
    # Initiate empty data dictionaries and get data_util
    dfu = DataFileUtil(self.callback_url)
    assembly_dict = dict()
    assembly_set_dict = dict()
    dfu_dict = dict()
    dfu_dict_2 = dict()

    # Get workspace id and name
    wsName = self.getWsName()
    ws_id = dfu.ws_name_to_id(wsName)

    # FASTA to assembly object
    Fasta_assembly_dict = {
        "path": "/kb/module/work/tmp/NC_021490.fasta",
        "assembly_name": "test_assembly"
    }
    params = {
        "file": Fasta_assembly_dict,
        "workspace_name": wsName,
        "assembly_name": "test_assembly"
    }
    ref = self.getImpl().save_assembly_from_fasta(self.ctx, params)

    # Create assembly data dictionaries
    assembly_dict.update({"label": "assemblySetTest", "ref": ref[0]})
    assembly_set_dict.update({
        "description": " ",
        "items": [assembly_dict]
    })

    # Create DataFileUtil dictionaries
    dfu_dict.update({
        "type": "KBaseSets.AssemblySet",
        "data": assembly_set_dict,
        "name": "Assembly_Test"
    })
    dfu_dict_2.update({'id': ws_id, 'objects': [dfu_dict]})

    # Create assembly set object
    assembly_set_obj = dfu.save_objects(dfu_dict_2)
    assembly_set_ref = [
        str(assembly_set_obj[0][6]) + '/' + str(assembly_set_obj[0][0]) + '/'
        + str(assembly_set_obj[0][4])
    ]

    # Get FASTA
    ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
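The str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]) expression above (and in several snippets below) rebuilds a workspace reference from the object_info tuple returned by DataFileUtil.save_objects, where indices 6, 0 and 4 hold the workspace id, object id and version. A small helper, sketched here purely for illustration, makes that intent explicit:

def obj_info_to_ref(info):
    """Build a 'wsid/objid/version' reference from a workspace object_info tuple."""
    return "{}/{}/{}".format(info[6], info[0], info[4])

# e.g. assembly_set_ref = [obj_info_to_ref(assembly_set_obj[0])]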
def UploadFromMdscan(self, callback_url, params):
    """
    :param params: instance of type "UploadmfmdInParams" -> structure:
       parameter "path" of String, parameter "ws_name" of String,
       parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure:
       parameter "obj_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN UploadFromMdscan
    print('Extracting motifs')
    motifList = self.parse_mdscan_output(params['path'])
    print(motifList)

    MSO = motifList

    dfu = DataFileUtil(callback_url)
    save_objects_params = {}
    save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
    save_objects_params['objects'] = [{
        'type': 'KBaseGeneRegulation.MotifSet',
        'data': MSO,
        'name': params['obj_name']
    }]

    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)
    output = {'obj_ref': motif_set_ref}
    print(output)
    #exit("test")
    #END UploadFromMdscan

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFromMdscan return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
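A hedged sketch of how the method above might be called from a test; the file path, workspace name and object name are hypothetical, and impl/callback_url stand in for the usual SDK test fixtures.

params = {
    'path': '/kb/module/work/tmp/mdscan_output.txt',  # hypothetical MDscan output file
    'ws_name': 'my_workspace',
    'obj_name': 'mdscan_motifs',
}
# output = impl.UploadFromMdscan(callback_url, params)[0]
# output['obj_ref'] holds the "wsid/objid/version" of the saved MotifSet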
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name, pangenome_name):
    """
    params:
        cb_url         : callback url
        scratch        : scratch folder path used for staging uploads
        Pangenome      : KBaseGenomes.Pangenome-like object (dict)
        workspace_name : workspace name (or numeric workspace id)
        pangenome_name : Pangenome display name
    Returns:
        pangenome_ref  : Pangenome workspace reference
        pangenome_info : info on pangenome object
    """
    dfu = DataFileUtil(cb_url)
    meta = {}
    hidden = 0

    # dump pangenome to scratch for upload
    # data_path = os.path.join(scratch, pangenome_name + '.json')
    # json.dump(pangenome, open(data_path, 'w'))

    if isinstance(workspace_name, int) or workspace_name.isdigit():
        workspace_id = workspace_name
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': meta,
            'hidden': hidden
        }]
    }

    info = dfu.save_objects(save_params)[0]

    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))

    return {'pangenome_ref': ref, 'pangenome_info': info}
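An illustrative call to the function above, assuming it runs inside a KBase SDK job; the workspace name, object name and Pangenome dictionary are placeholders.

import os

def example_upload(pangenome_dict):
    result = upload_pangenome(
        cb_url=os.environ['SDK_CALLBACK_URL'],
        scratch='/kb/module/work/tmp',
        Pangenome=pangenome_dict,           # KBaseGenomes.Pangenome-shaped dict
        workspace_name='my_workspace',      # a name or a numeric workspace id both work
        pangenome_name='example_pangenome',
    )
    return result['pangenome_ref']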
def test_metagenome_binned_input(self):
    # Setup
    path = "data/binnedContigs.json"
    ws_path = '/kb/module/work/tmp'
    assembly_path = "data/CCESR16_SPAdes.assembly.fa"
    shutil.copy2(path, ws_path)
    shutil.copy2(assembly_path, ws_path)
    dfu = DataFileUtil(self.callback_url)
    wsName = self.getWsName()
    ws_id = dfu.ws_name_to_id(wsName)

    # FASTA to assembly object
    Fasta_assembly_dict = {
        "path": '/kb/module/work/tmp/CCESR16_SPAdes.assembly.fa',
        "assembly_name": "meta_assembly"
    }
    assembly_params = {
        "file": Fasta_assembly_dict,
        "workspace_name": wsName,
        "assembly_name": "test_assembly"
    }
    meta_assembly_ref = self.getImpl().save_assembly_from_fasta(
        self.ctx, assembly_params)[0]

    # Load BinnedContigs data, point it at the new assembly, and build the object dictionary
    meta_data = json.load(open(path))
    meta_data['assembly_ref'] = meta_assembly_ref
    meta_dict = [{
        'name': 'Meta_test',
        'type': 'KBaseMetagenomes.BinnedContigs',
        'data': meta_data
    }]

    # Create BinnedContigs object in workspace with save_objects
    binned_obj = dfu.save_objects({'id': ws_id, 'objects': meta_dict})
    binned_obj_info = binned_obj[0]
    binned_obj_ref = str(binned_obj_info[6]) + '/' + str(
        binned_obj_info[0]) + '/' + str(binned_obj_info[4])

    # Get FASTA
    ret = self.getImpl().get_fastas(self.callback_url, [binned_obj_ref])
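Both tests above assemble the DataFileUtil.save_objects argument piecemeal with dict()/update() or inline concatenation; the equivalent single literal, sketched below with placeholder values and reusing the ws_id/meta_data names from the test above, is the shape every save_objects call in this collection uses.

save_objects_params = {
    'id': ws_id,                                   # numeric id from dfu.ws_name_to_id(...)
    'objects': [{
        'type': 'KBaseMetagenomes.BinnedContigs',  # any registered workspace type
        'data': meta_data,                         # dict matching that type's spec
        'name': 'Meta_test',
    }],
}
# binned_obj_info = dfu.save_objects(save_objects_params)[0]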
class GenomeInterface: def __init__(self, config): self.handle_url = config.handleURL self.shock_url = config.shockURL self.sw_url = config.srvWizURL self.token = config.token self.auth_service_url = config.authServiceUrl self.callback_url = config.callbackURL self.re_api_url = config.re_api_url self.auth_client = _KBaseAuth(self.auth_service_url) self.dfu = DataFileUtil(self.callback_url) self.taxon_wsname = config.raw['taxon-workspace-name'] self.scratch = config.raw['scratch'] self.ws_large_data = WsLargeDataIO(self.callback_url) @staticmethod def _validate_save_one_genome_params(params): """ _validate_save_one_genome_params: validates params passed to save_one_genome method """ logging.info('start validating save_one_genome params') # check for required parameters for p in ['workspace', 'name', 'data']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _check_shock_response(self, response, errtxt): """ _check_shock_response: check shock node response (Copied from DataFileUtil) """ logging.info('start checking shock response') if not response.ok: try: err = json.loads(response.content)['error'][0] except Exception: # this means shock is down or not responding. logging.error( "Couldn't parse response error content from Shock: " + response.content) response.raise_for_status() raise ValueError(errtxt + str(err)) def _own_handle(self, genome_data, handle_property): """ _own_handle: check that handle_property point to shock nodes owned by calling user """ logging.info( 'start checking handle {} ownership'.format(handle_property)) if handle_property in genome_data: handle_id = genome_data[handle_property] hs = HandleService(self.handle_url, token=self.token) handles = hs.hids_to_handles([handle_id]) shock_id = handles[0]['id'] # Copy from DataFileUtil.own_shock_node implementation: header = {'Authorization': 'Oauth {}'.format(self.token)} res = requests.get(self.shock_url + '/node/' + shock_id + '/acl/?verbosity=full', headers=header, allow_redirects=True) self._check_shock_response( res, 'Error getting ACLs for Shock node {}: '.format(shock_id)) owner = res.json()['data']['owner']['username'] user_id = self.auth_client.get_user(self.token) if owner != user_id: logging.info('start copying node to owner: {}'.format(user_id)) dfu_shock = self.dfu.copy_shock_node({ 'shock_id': shock_id, 'make_handle': True }) handle_id = dfu_shock['handle']['hid'] genome_data[handle_property] = handle_id def _check_dna_sequence_in_features(self, genome): """ _check_dna_sequence_in_features: check dna sequence in each feature """ logging.info('start checking dna sequence in each feature') if 'features' in genome: features_to_work = {} for feature in genome['features']: if not ('dna_sequence' in feature and feature['dna_sequence']): features_to_work[feature['id']] = feature['location'] if len(features_to_work) > 0: aseq = AssemblySequenceAPI(self.sw_url, token=self.token) get_dna_params = {'requested_features': features_to_work} if 'assembly_ref' in genome: get_dna_params['assembly_ref'] = genome['assembly_ref'] elif 'contigset_ref' in genome: get_dna_params['contigset_ref'] = genome['contigset_ref'] else: # Nothing to do (it may be test genome without contigs)... 
return dna_sequences = aseq.get_dna_sequences( get_dna_params)['dna_sequences'] for feature in genome['features']: if feature['id'] in dna_sequences: feature['dna_sequence'] = dna_sequences[feature['id']] feature['dna_sequence_length'] = len( feature['dna_sequence']) def get_one_genome(self, params): """Fetch a genome using WSLargeDataIO and return it as a python dict""" logging.info('fetching genome object') res = self.ws_large_data.get_objects(params)['data'][0] data = json.load(open(res['data_json_file'])) return data, res['info'] # return self.dfu.get_objects(params)['data'][0] def save_one_genome(self, params): logging.info('start saving genome object') self._validate_save_one_genome_params(params) workspace = params['workspace'] name = params['name'] data = params['data'] # XXX there is no `workspace_datatype` param in the spec ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome") # XXX there is no `meta` param in the spec meta = params.get('meta', {}) if "AnnotatedMetagenomeAssembly" in ws_datatype: if params.get('upgrade') or 'feature_counts' not in data: data = self._update_metagenome(data) else: if params.get('upgrade') or 'feature_counts' not in data: data = self._update_genome(data) # check all handles point to shock nodes owned by calling user self._own_handle(data, 'genbank_handle_ref') self._own_handle(data, 'gff_handle_ref') if "AnnotatedMetagenomeAssembly" not in ws_datatype: self._check_dna_sequence_in_features(data) data['warnings'] = self.validate_genome(data) # sort data data = GenomeUtils.sort_dict(data) # dump genome to scratch for upload data_path = os.path.join(self.scratch, name + ".json") json.dump(data, open(data_path, 'w')) if 'hidden' in params and str( params['hidden']).lower() in ('yes', 'true', 't', '1'): hidden = 1 else: hidden = 0 if isinstance(workspace, int) or workspace.isdigit(): workspace_id = workspace else: workspace_id = self.dfu.ws_name_to_id(workspace) save_params = { 'id': workspace_id, 'objects': [{ 'type': ws_datatype, 'data_json_file': data_path, 'name': name, 'meta': meta, 'hidden': hidden }] } dfu_oi = self.ws_large_data.save_objects(save_params)[0] returnVal = {'info': dfu_oi, 'warnings': data.get('warnings', [])} return returnVal @staticmethod def determine_tier(source): """ Given a user provided source parameter, assign a source and genome tier """ low_source = source.lower() if 'refseq' in low_source: if 'reference' in low_source: return "RefSeq", ['Reference', 'Representative', 'ExternalDB'] if 'representative' in low_source: return "RefSeq", ['Representative', 'ExternalDB'] if 'user' in low_source: return "RefSeq", ['ExternalDB', 'User'] return "RefSeq", ['ExternalDB'] if 'phytozome' in low_source: if 'flagship' in source: return "Phytosome", [ 'Reference', 'Representative', 'ExternalDB' ] return "Phytosome", ['Representative', 'ExternalDB'] if 'ensembl' in low_source: if 'user' in low_source: return "Ensembl", ['ExternalDB', 'User'] return "Ensembl", ['Representative', 'ExternalDB'] return source, ['User'] def _update_metagenome(self, genome): """Checks for missing required fields and fixes breaking changes""" if 'molecule_type' not in genome: genome['molecule_type'] = 'Unknown' def _update_genome(self, genome): """Checks for missing required fields and fixes breaking changes""" # do top level updates ontologies_present = defaultdict(dict) # type: dict ontologies_present.update(genome.get('ontologies_present', {})) ontology_events = genome.get('ontology_events', []) # NOTE: 'genome_tiers' not in Metagenome spec if 
'genome_tiers' not in genome: genome['source'], genome['genome_tiers'] = self.determine_tier( genome['source']) if 'molecule_type' not in genome: genome['molecule_type'] = 'Unknown' # If an NCBI taxonomy ID is provided, fetch additional data about the taxon # NOTE: Metagenome object does not have a 'taxon_assignments' field if 'taxon_assignments' in genome and genome['taxon_assignments'].get( 'ncbi'): tax_id = int(genome['taxon_assignments']['ncbi']) GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome) else: GenomeUtils.set_default_taxon_data(genome) if any([ x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs') ]): if 'assembly_ref' in genome: assembly_data = self.dfu.get_objects({ 'object_refs': [genome['assembly_ref']], 'ignore_errors': 0 })['data'][0]['data'] genome["gc_content"] = assembly_data['gc_content'] genome["dna_size"] = assembly_data['dna_size'] genome["md5"] = assembly_data['md5'] genome["num_contigs"] = assembly_data['num_contigs'] if assembly_data.get('type'): genome['genome_type'] = assembly_data['type'] elif 'contigset_ref' in genome: contig_data = self.dfu.get_objects({ 'object_refs': [genome['contigset_ref']], 'included': ['contigs/[*]/length', 'md5'], 'ignore_errors': 0 })['data'][0]['data'] genome["gc_content"] = None genome["dna_size"] = sum( (c['length'] for c in contig_data['contigs'])) genome["md5"] = contig_data['md5'] genome["num_contigs"] = len(contig_data['contigs']) # NOTE: metagenomes do not have the following fields if 'cdss' not in genome: genome['cdss'] = [] if 'mrnas' not in genome: genome['mrnas'] = [] if 'non_coding_features' not in genome: genome['non_coding_features'] = [] # do feature level updates retained_features = [] type_counts = defaultdict(int) for field in ('mrnas', 'cdss', 'features'): for i, feat in enumerate(genome.get(field, [])): if 'function' in feat and not isinstance(feat, list): feat['functions'] = feat['function'].split('; ') del feat['function'] if 'aliases' in feat: if not feat['aliases']: del feat['aliases'] elif not isinstance(feat['aliases'][0], (list, tuple)): feat['aliases'] = [['gene_synonym', x] for x in feat['aliases']] if 'type' in feat: type_counts[feat['type']] += 1 for ontology, terms in feat.get('ontology_terms', {}).items(): for term in terms.values(): if isinstance(term, list): continue ontologies_present[ontology][ term['id']] = term['term_name'] term_evidence = [] for ev in term['evidence']: ev['id'] = ontology if "ontology_ref" in term: ev['ontology_ref'] = term["ontology_ref"] if ev not in ontology_events: ontology_events.append(ev) term_evidence.append(ontology_events.index(ev)) feat['ontology_terms'][ontology][ term['id']] = term_evidence # remove deprecated fields feat.pop('protein_families', None) feat.pop('atomic_regulons', None) feat.pop('orthologs', None) feat.pop('coexpressed_fids', None) feat.pop('publications', None) feat.pop('regulon_data', None) feat.pop('subsystem_data', None) if 'dna_sequence_length' not in feat: feat['dna_sequence_length'] = sum( x[3] for x in feat['location']) if 'protein_translation' in feat and 'protein_md5' not in feat: feat['protein_md5'] = hashlib.md5( feat.get('protein_translation', '').encode('utf8')).hexdigest() # split all the stuff lumped together in old versions into the # right arrays if field == 'features': if feat.get('type', 'gene') == 'gene': if not feat.get('cdss', []): type_counts['non_coding_genes'] += 1 genome['non_coding_features'].append(feat) else: retained_features.append(feat) elif feat.get('type', 'gene') == 'CDS': if 
'parent_gene' not in feat: feat['parent_gene'] = '' genome['cdss'].append(feat) elif feat.get('type', 'gene') == 'mRNA': if 'parent_gene' not in feat: feat['parent_gene'] = '' genome['mrnas'].append(feat) genome['features'] = retained_features if ontology_events: genome['ontology_events'] = ontology_events if ontologies_present: genome['ontologies_present'] = ontologies_present type_counts['mRNA'] = len(genome.get('mrnas', [])) type_counts['CDS'] = len(genome.get('cdss', [])) type_counts['protein_encoding_gene'] = len(genome['features']) type_counts['non_coding_features'] = len( genome.get('non_coding_features', [])) genome['feature_counts'] = type_counts return genome @staticmethod def validate_genome(g): """ Run a series of checks on the genome object and return any warnings """ allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'} logging.info('Validating genome object contents') warnings = g.get('warnings', []) # TODO: Determine whether these checks make any sense for Metagenome # object. Looks like many don't. # Add validations for Metagenome object # this will fire for some annotation methods like PROKKA if g.get('domain') == "Bacteria" and len(g.get('cdss', [])) != len( g['features']): warnings.append( "For prokaryotes, CDS array should generally be the" " same length as the Features array.") if g.get('domain') == "Eukaryota" and len(g.get( 'features', [])) == len(g.get('cdss', [])): warnings.append( "For Eukaryotes, CDS array should not be the same " "length as the Features array due to RNA splicing.") if g.get('molecule_type') not in {"DNA", 'ds-DNA'}: if g.get('domain', '') not in {'Virus', 'Viroid'} and \ g['molecule_type'] not in {"DNA", 'ds-DNA'}: warnings.append("Genome molecule_type {} is not expected " "for domain {}.".format( g['molecule_type'], g.get('domain', ''))) if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers: warnings.append("Undefined terms in genome_tiers: " + ", ".join(set(g['genome_tiers']) - allowed_tiers)) assignments = g.get('taxon_assignments', {}) if 'ncbi' not in assignments or ('taxon_ref' in g and g['taxon_ref'] == "ReferenceTaxons/unknown_taxon"): warnings.append('Unable to determine organism taxonomy') GenomeInterface.handle_large_genomes(g) return warnings @staticmethod def handle_large_genomes(g): """Determines the size of various feature arrays and starts removing the dna_sequence if the genome is getting too big to store in the workspace""" def _get_size(obj): return sys.getsizeof(json.dumps(obj)) # seems pretty uneccessary... def sizeof_fmt(num): for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: if abs(num) < 1024.0: return "%3.1f %sB" % (num, unit) num /= 1024.0 return "%.1f %sB" % (num, 'Yi') feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss') master_key_sizes = dict() # Change want full breakdown to True if want to see break down of sizes. # By making this a changeable flag it will run faster for standard uploads. want_full_breakdown = False for x in feature_lists: if x in g: need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE if need_to_remove_dna_sequence or want_full_breakdown: feature_type_dict_keys = dict() for feature in g[x]: for feature_key in list(feature.keys()): if feature_key == "dna_sequence" and need_to_remove_dna_sequence: # NOTE: should this get stored somewhere? 
del (feature["dna_sequence"]) else: if feature_key not in feature_type_dict_keys: feature_type_dict_keys[feature_key] = 0 feature_type_dict_keys[ feature_key] += sys.getsizeof( feature[feature_key]) for feature_key in feature_type_dict_keys: feature_type_dict_keys[feature_key] = sizeof_fmt( feature_type_dict_keys[feature_key]) master_key_sizes[x] = feature_type_dict_keys print(f"{x}: {sizeof_fmt(_get_size(g[x]))}") total_size = _get_size(g) print(f"Total size {sizeof_fmt(total_size)} ") if want_full_breakdown: print( f"Here is the breakdown of the sizes of feature lists elements : " f"{str(master_key_sizes)}") if total_size > MAX_GENOME_SIZE: print( f"Here is the breakdown of the sizes of feature lists elements : " f"{str(master_key_sizes)}") raise ValueError( f"This genome size of {sizeof_fmt(total_size)} exceeds the maximum " f"permitted size of {sizeof_fmt(MAX_GENOME_SIZE)}.\n" f"Here is the breakdown for feature lists and their respective " f"sizes:\n{master_key_sizes}")
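A hedged sketch of the parameter dictionary GenomeInterface.save_one_genome expects, based on _validate_save_one_genome_params and the optional keys the method reads; the object and workspace names are placeholders, and genome_dict stands for an already-built KBaseGenomes.Genome dictionary.

save_params = {
    'workspace': 'my_workspace',    # required: workspace name or numeric id
    'name': 'my_genome',            # required: output object name
    'data': genome_dict,            # required: Genome (or AnnotatedMetagenomeAssembly) dict
    # optional keys read by save_one_genome (noted in the code as absent from the spec):
    'workspace_datatype': 'KBaseGenomes.Genome',
    'meta': {},
    'hidden': 'false',              # any of yes/true/t/1 marks the object hidden
    # 'upgrade': True,              # force the _update_genome()/_update_metagenome() pass
}
# result = genome_interface.save_one_genome(save_params)
# object_info, warnings = result['info'], result['warnings']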
class VariationUtil: ''' Module Name: VariationUtil Module Description: A KBase module: VariationUtil ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.4" GIT_URL = "" GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR # TODO: Make sure we need to define config just once # TODO: Change the code tp match this style self.config = config self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL'] self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN'] self.scratch = config['scratch'] self.config['ws_url'] = config['workspace-url'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.shared_folder = config['scratch'] self.hr = htmlreportutils() self.ws_url = config['workspace-url'] self.wsc = Workspace(self.ws_url) self.dfu = DataFileUtil(self.callback_url) self.shock_url = config['shock-url'] self.sw_url = config['srv-wiz-url'] pass #END_CONSTRUCTOR pass def save_variation_from_vcf(self, ctx, params): """ Save a variation (and trait?) object to Kbase given a reference genome, object output name, Variant Call Format (VCF) file, and sample attribute file. :param params: instance of type "save_variation_input" (## funcdef save_variation_from_vcf ## required input params: genome_or_assembly_ref: KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly object reference *** variation input data *** vcf_staging_file_path: path to location data associated with samples variation_object_name: output name for KBase variation object *** sample input data *** sample_attribute_ref: x/y/z reference to kbase sample attribute optional params: NA output report: report_name report_ref HTML visualization: Manhattan plot *** Visualization *** plot_maf: generate histogram of minor allele frequencies plot_hwe: generate histogram of Hardy-Weinberg Equilibrium p-values) -> structure: parameter "workspace_name" of String, parameter "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "vcf_staging_file_path" of type "filepath" (KBase file path to staging files), parameter "variation_object_name" of String, parameter "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "save_variation_output" -> structure: parameter "variation_ref" of String, parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: report #BEGIN save_variation_from_vcf # Get workspace id ws_id = self.dfu.ws_name_to_id(params['workspace_name']) genome_ref = None assembly_ref = None # 1) Find whether the input is a genome or assembly # and get genome_ref and assembly_ref genome_or_assembly_ref = params['genome_or_assembly_ref'] obj_type = self.wsc.get_object_info3( {'objects': [{ 'ref': genome_or_assembly_ref }]})['infos'][0][2] if ('KBaseGenomes.Genome' in obj_type): genome_ref = genome_or_assembly_ref subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': genome_ref }]) assembly_ref = subset[0]['data']['assembly_ref'] elif ('KBaseGenomeAnnotations.Assembly' in obj_type): 
assembly_ref = genome_or_assembly_ref else: raise ValueError(obj_type + ' is not the right input for this method. ' + 'Valid input include KBaseGenomes.Genome or ' + 'KBaseGenomeAnnotations.Assembly ') # 2) Validate VCF, compress, and build VCF index logging.info("Validating VCF, Compressing VCF and Indexing VCF") VCFUtilsConfig = {"scratch": self.scratch} VCFUtilsParams = { 'vcf_staging_file_path': params['vcf_staging_file_path'] } VCU = VCFUtils(VCFUtilsConfig) vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf( VCFUtilsParams) if vcf_index is not None: logging.info("vcf compressed :" + str(vcf_compressed)) logging.info("vcf index :" + str(vcf_index)) logging.info("vcf strain ids :" + str(vcf_strain_ids)) else: raise ValueError( "No result obtained after compression and indexing step") # Get strain info # TODO: Remove hard coded stuff StrainInfoConfig = self.config StrainInfoParams = { "ws_id": ws_id, "vcf_strain_ids": vcf_strain_ids, "sample_set_ref": params["sample_set_ref"], "sample_attribute_name": params["sample_attribute_name"] } si = StrainInfo(StrainInfoConfig) sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams) print(sample_attribute_ref) print(strains) # 3) Create json for variation object. In a following step genomic_indexes will be # added to this json before it is saved as Variation object VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch} VCFToVariationParams = { "vcf_compressed": vcf_compressed, "vcf_index": vcf_index, "assembly_ref": assembly_ref } if genome_ref is not None: VCFToVariationParams['genome_ref'] = genome_ref vtv = VCFToVariation(VCFToVariationConfig) variation_object_data = vtv.generate_variation_object_data( VCFToVariationParams) # Append sample information if sample_attribute_ref: variation_object_data[ 'sample_attribute_ref'] = sample_attribute_ref else: raise ValueError(f'sample attribute ref not found') if strains: variation_object_data['strains'] = strains else: raise ValueError(f'strains not found') if 'sample_set_ref' in params: variation_object_data['sample_set_ref'] = params['sample_set_ref'] else: raise ValueError(f'sample_set_ref not found in params') # 4) JbrowseConfig = { "ws_url": self.ws_url, "scratch": self.scratch, "sw_url": self.sw_url, "shock_url": self.shock_url } JbrowseParams = { "vcf_path": vcf_compressed, "assembly_ref": assembly_ref, "binsize": 10000, "vcf_shock_id": variation_object_data['vcf_handle']['id'], "vcf_index_shock_id": variation_object_data['vcf_index_handle']['id'] } if genome_ref is not None: JbrowseParams["genome_ref"] = genome_ref jb = JbrowseUtil(JbrowseConfig) jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams) # 5) Now we have the genomic indices and we have all the information needed to save # the variation object # TODO: Take out the genomic_indexes field from the object spec # TODO: Take out the vcf_handle stuff not needed variation_object_data['genomic_indexes'] = jbrowse_report[ 'genomic_indexes'] var_obj = self.dfu.save_objects({ 'id': self.dfu.ws_name_to_id(params['workspace_name']), 'objects': [{ 'type': 'KBaseGwasData.Variations', 'data': variation_object_data, 'name': params['variation_object_name'] }] })[0] var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str( var_obj[4]) print(var_obj_ref) # 5) Build Variation report # This is a simple report # workspace = params['workspace_name'] created_objects = [] created_objects.append({ "ref": var_obj_ref, "description": "Variation Object" }) ReportConfig = { "ws_url": 
self.ws_url, "scratch": self.scratch, } ReportParams = {"variation_ref": var_obj_ref} vr = VariationReport(ReportConfig) htmlreport_dir = vr.create_variation_report(ReportParams) report = self.hr.create_html_report(htmlreport_dir, workspace, created_objects) report['variation_ref'] = var_obj_ref print(report) #END save_variation_from_vcf # At some point might do deeper type checking... if not isinstance(report, dict): raise ValueError('Method save_variation_from_vcf return value ' + 'report is not type dict as required.') # return the results return [report] def export_variation_as_vcf(self, ctx, params): """ Export KBase variation object as Variant Call Format (VCF) file :param params: instance of type "export_variation_input" (## funcdef export_variation_as_vcf ## required input params: Variation object reference optional params: NA output report: Shock id pointing to exported vcf file) -> structure: parameter "input_var_ref" of type "obj_ref" (An X/Y/Z style reference) :returns: instance of type "export_variation_output" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_variation_as_vcf vtv = VariationToVCF(self.callback_url, self.shared_folder) output = vtv.export_as_vcf(params) #END export_variation_as_vcf # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_variation_as_vcf return value ' + 'output is not type dict as required.') # return the results return [output] def get_variation_as_vcf(self, ctx, params): """ Given a reference to a variation object, and output name: return a Variant Call Format (VCF) file path and name. :param params: instance of type "get_variation_input" (## funcdef get_variation_as_vcf ## required input params: Variation object reference output file name optional params: NA output report: path to returned vcf name of variation object) -> structure: parameter "variation_ref" of type "obj_ref" (An X/Y/Z style reference), parameter "filename" of String :returns: instance of type "get_variation_output" -> structure: parameter "path" of type "filepath" (KBase file path to staging files), parameter "variation_name" of String """ # ctx is the context object # return variables are: file #BEGIN get_variation_as_vcf vtv = VariationToVCF(self.callback_url, self.shared_folder) file = vtv.variation_to_vcf(params) #END get_variation_as_vcf # At some point might do deeper type checking... if not isinstance(file, dict): raise ValueError('Method get_variation_as_vcf return value ' + 'file is not type dict as required.') # return the results return [file] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
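A hedged sketch of the input save_variation_from_vcf actually reads; values are placeholders. Note that the implementation pulls sample_set_ref and sample_attribute_name from params even though the docstring only mentions sample_attribute_ref.

params = {
    'workspace_name': 'my_workspace',
    'genome_or_assembly_ref': '1/2/3',           # KBaseGenomes.Genome or ...Assembly ref
    'vcf_staging_file_path': 'variants.vcf.gz',
    'variation_object_name': 'my_variation',
    'sample_set_ref': '4/5/6',                   # read by the StrainInfo step
    'sample_attribute_name': 'my_sample_attrs',  # read by the StrainInfo step
}
# report = variation_util.save_variation_from_vcf(ctx, params)[0]
# report['variation_ref'] points at the saved KBaseGwasData.Variations object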
class VCFToVariation: def __init__(self, config, scratch, callback_url ): self.scratch = config['scratch'] self.ws_url = config['workspace-url'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.wsc = Workspace(self.ws_url) self.scratch = scratch self.callback_url = callback_url self.au = AssemblyUtil(self.callback_url) self.gapi = GenericsAPI(self.callback_url) def _parse_vcf_data(self, params): vcf_filepath = self._stage_input(params) # file is validated by this point, can assume vcf_filepath is valid reader = vcf.Reader(open(vcf_filepath, 'r')) version = float(reader.metadata['fileformat'][4:6]) genotypes = reader.samples chromosomes = [] contigs = {} totalvars = 0 for record in reader: totalvars += 1 if record.CHROM not in chromosomes: chromosomes.append(record.CHROM) if record.CHROM not in contigs.keys(): passvar = 1 if not record.FILTER else 0 contigs[record.CHROM] = { 'contig_id': record.CHROM, 'totalvariants': 1, 'passvariants': passvar, 'length': int(record.affected_end-record.affected_start), } else: contigs[record.CHROM]['totalvariants'] += 1 if not record.FILTER: contigs[record.CHROM]['passvariants'] += 1 vcf_info = { 'version': version, 'contigs': contigs, 'total_variants': totalvars, 'genotype_ids': genotypes, 'chromosome_ids': chromosomes, 'file_ref': vcf_filepath } return vcf_info def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids): genos_not_found = [] vgenotypes = [x.upper().strip() for x in vcf_genotypes] sids = [x.upper().strip() for x in sample_ids] for geno in vgenotypes: if geno not in sids: genos_not_found.append(geno) if not genos_not_found: return True else: return genos_not_found def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes): chromos_not_in_assembly = [] pp(assembly_chromosomes) for chromo in vcf_chromosomes: if chromo not in assembly_chromosomes: chromos_not_in_assembly.append(chromo) if not chromos_not_in_assembly: return True else: return chromos_not_in_assembly def _get_vcf_version(self, vcf_filepath): with(gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf: line = vcf.readline() tokens = line.split('=') if not (tokens[0].startswith('##fileformat')): log("Invalid VCF. ##fileformat line in meta is improperly formatted.") raise ValueError("Invalid VCF. ##fileformat line in meta is improperly formatted. 
" "Check VCF file specifications: https://samtools.github.io/hts-specs/") vcf_version = float(tokens[1][-4:].rstrip()) return vcf_version def validate_vcf(self, params): if 'genome_or_assembly_ref' not in params: raise ValueError('Genome or Assembly reference not in input parameters: \n\n'+params) if 'vcf_staging_file_path' not in params: raise ValueError('VCF staging file path not in input parameters: \n\n' + params) vcf_filepath = self._stage_input(params) vcf_version = self._get_vcf_version(vcf_filepath) # setup directorys for validation output validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4())) os.mkdir(validation_output_dir) # vcftools (vcf-validator) supports VCF v4.0-4.2 # https://github.com/vcftools/vcftools # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3 # https://github.com/EBIvariation/vcf-validator # vcftools is only to validate VCF v4.0 if vcf_version >= 4.1: print("Using vcf_validator_linux...") validator_cmd = ["vcf_validator_linux"] validator_cmd.append("-i") validator_cmd.append(vcf_filepath) validator_cmd.append("-l") validator_cmd.append('error') print("VCF version "+str(vcf_version)+".") elif vcf_version >= 4.0: print("Using vcftools to validate...") validator_cmd = ["vcf-validator"] validator_cmd.append(vcf_filepath) print("VCF version 4.0.") else: raise ValueError('VCF Version not in file, or fileformat line malformatted, or not version >=4.0. file format line must be the ' 'first line of vcf file and in appropriate syntax. Check VCF file specifications: ' 'https://samtools.github.io/hts-specs/') print("Validator command: {}".format(validator_cmd)) p = subprocess.Popen(validator_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) validator_output = [] while True: line = p.stdout.readline() if not line: break if line.decode("utf-8").strip().startswith('[info]'): validator_output.append(line.decode("utf-8")) out, err = p.communicate() validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt') file_output_chk = [] try: if validator_output[0][:6] == '[info]': # validation by vcf_validator_linux validation_output_filename = validator_output[1].split(' ')[6].strip('\n') vo = validator_output[2].split(' ') file_output_chk = ''.join(vo[9:]).strip('\n') if not os.path.exists(validation_output_filename): raise ValueError(validation_output_filename+' does not exist!') if not file_output_chk == 'isvalid': print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) #TODO: more detailed validation parsing for vcf_validator_linux else: if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0") f.close() # TODO: more detailed validation parsing for vcftools except IndexError: # if vcf file < v4.1, and valid it will produce index error on line 132 if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0") f.close() if not 
os.path.exists(validation_output_filename): print('Validator did not generate log file!') raise SystemError("Validator did not generate a log file.") log("Validator output filepath: {}".format(validation_output_filename)) log("Return code from validator {}".format(p.returncode)) return validation_output_filename def _stage_input(self, params): # extract file location from input ui parameters if params['vcf_staging_file_path'].startswith('/kb/module/test/'): # variation utils unit test vcf_local_file_path = params['vcf_staging_file_path'] if vcf_local_file_path.endswith('.gz'): with gzip.open(vcf_local_file_path, 'rb') as f_in: with open(vcf_local_file_path[:-3], 'wb') as f_out: shutil.copyfileobj(f_in, f_out) vcf_local_file_path = vcf_local_file_path[:-3] else: staging_dir = '/staging' vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path']) if not os.path.exists(vcf_local_file_path): raise OSError('VCF input path does not exist, or is not readable') orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path)) print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}') self.original_file = shutil.copy(vcf_local_file_path, orig_file_path) # TODO: use data file utils here, upload vcf to shock, use dfu. if is_gz_file(vcf_local_file_path): # /staging is read only, therefore have to copy before uncompressing if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']): copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path'])) unpack = self.dfu.unpack_file({'file_path': copy}) else: unpack = {} unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path']) params['vcf_local_file_path'] = unpack['file_path'] return unpack['file_path'] else: params['vcf_local_file_path'] = vcf_local_file_path return vcf_local_file_path def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file): """ function for creating sample attribute mapping file. 
""" try: with open (vcf_file, 'r') as vcf_handle: Lines = vcf_handle.readlines() for line in Lines: if(line.startswith("#CHROM")): header = line.lstrip().split("\t") try: with open (sample_attribute_mapping_file, 'w') as attribute_mapping_handle: attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID") for i in range(9,len(header)): attribute_mapping_handle.write("\t"+header[i]) #attribute_mapping_handle.write("\n") attribute_mapping_handle.write("label\t\t\t") for j in range(9,len(header)): attribute_mapping_handle.write("\t"+header[j]) #attribute_mapping_handle.write("\n") except IOError: print("Could not write to file:", sample_attribute_mapping_file) except IOError: print("Could not read file:", vcf_file) def _validate_assembly_ids(self, params): # All chromosome ids from the vcf should be in assembly # but not all assembly chromosome ids should be in vcf if ('genome_ref' in params): subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': params['genome_or_assembly_ref'] }]) self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref'] if ('assembly_ref' in params): self.vcf_info['assembly_ref'] = params['assembly_ref'] assembly_chromosome_ids_call = self.wsc.get_object_subset([{ 'included': ['/contigs'], 'ref': self.vcf_info['assembly_ref'] }]) assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys() vcf_chromosomes = self.vcf_info['chromosome_ids'] chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes) if isinstance(chk_assembly_ids, list): failed_ids = ' '.join(chk_assembly_ids) print(f'VCF contig ids: {failed_ids} are not present in assembly.') raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.') return assembly_chromosomes def _validate_sample_ids(self, params): # All samples within the VCF file need to be in sample attribute list vcf_genotypes = self.vcf_info['genotype_ids'] sample_ids_subset = self.wsc.get_object_subset([{ 'included': ['/instances'], 'ref': params['sample_attribute_ref'] }]) sample_ids = sample_ids_subset[0]['data']['instances'].keys() validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids) if isinstance(validate_genotypes, list): failed_genos = ' '.join(validate_genotypes) print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.') raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.') return sample_ids def _construct_contig_info(self, params): """ KBaseGwasData.Variations type spec /* Contig variation data contig_id - contig identifier totalvariants - total number of variants in each contig passvariants - total number of variants that pass quality variation filter in contig length - length of contig from assembly data */ typdef structure { string contig_id; int totalvariants; int passvariants; int length; // from assembly } contig_info; """ assembly_chromosome_dict = self.wsc.get_object_subset([{ 'included': ['/contigs'], 'ref': self.vcf_info['assembly_ref'] }])[0]['data']['contigs'] contigs = [] contig_infos = self.vcf_info['contigs'] for contig_id in contig_infos: length_contig = assembly_chromosome_dict[contig_id].get("length") contig_infos[contig_id]["length"] = length_contig contigs.append(contig_infos[contig_id]) return contigs def _bgzip_vcf(self, vcf_filepath): if not os.path.exists(vcf_filepath): print (vcf_filepath + " does not exist") zip_cmd = ["bgzip", vcf_filepath] p = subprocess.Popen(zip_cmd, cwd=self.scratch, 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() bgzip_file_path = vcf_filepath + ".gz" print (bgzip_file_path) return bgzip_file_path def _index_vcf(self, bgzip_file): output_dir = self.scratch bgzip_filepath = os.path.join(self.scratch, bgzip_file) if not os.path.exists(bgzip_filepath): print (bgzip_filepath + " does not exist") index_cmd = ["tabix", "-p", "vcf", bgzip_filepath] p = subprocess.Popen(index_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() index_file_path = bgzip_filepath + ".tbi" return index_file_path def _index_assembly(self, assembly_file): if not os.path.exists(assembly_file): print (assembly_file + " does not exist") logging.info("indexing assembly file") assembly_index_cmd = ["samtools", "faidx", assembly_file] print(assembly_index_cmd) p = subprocess.Popen(assembly_index_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() logging.info("indexing of assembly file done!") return assembly_file + ".fai" def _download_assembly(self, assembly_ref): file = self.au.get_assembly_as_fasta({ 'ref': assembly_ref }) return file def _construct_variation(self, params, contigs_info): """ KBaseGwasData.Variations type spec /* Variation object data structure num_genotypes - number of total genotypes within variant file num_variants - number of total variants within variant file contigs - list of contig ids and variant information attribute_ref - KBase reference to attribute mapping workspace object genome_ref - KBase reference to genome workspace object assembly_ref - KBase reference to assemebly workspace object vcf_handle_ref - VCF handle reference to VCF file @optional genome_ref */ typedef structure { int numgenotypes; int numvariants; list<contig_info> contigs; attribute_ref population; // KBaseExperiments.AttributeMapping genome_ref genome_ref; // KBaseGenomes.Genome assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly vcf_handle_ref vcf_handle_ref; } Variations; :param params: KBase ui input parameters :param population: previoiusly constructed sample population data :return: constructed variation object (dictionary) """ if not self.vcf_info['file_ref'].startswith(self.scratch): new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref'])) self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file) vcf_staged_file = self.original_file bgzip_file_path = self._bgzip_vcf(vcf_staged_file) vcf_shock_file_ref = self.dfu.file_to_shock( {'file_path': bgzip_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref) index_file_path = self._index_vcf(bgzip_file_path) vcf_index_shock_file_ref = self.dfu.file_to_shock( {'file_path': index_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref) assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path'] assembly_index_file_path = self._index_assembly(assembly_file_path) assembly_index_shock_file_ref = self.dfu.file_to_shock( {'file_path': assembly_index_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref) variation_obj = { 'numgenotypes': int(len(self.vcf_info['genotype_ids'])), 'numvariants': int(self.vcf_info['total_variants']), 'contigs': contigs_info, 'population': params['sample_attribute_ref'], # TYPE SPEC CHANGE: need to change type spec to 
assembly_ref instead of assemby_ref 'assemby_ref': self.vcf_info['assembly_ref'], 'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'], 'vcf_handle' : vcf_shock_file_ref['handle'], 'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'], 'vcf_index_handle': vcf_index_shock_file_ref['handle'], 'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'], 'assembly_index_handle': assembly_index_shock_file_ref['handle'] } if 'genome_ref' in params: variation_obj['genome_ref'] = params['genome_ref'] return variation_obj def _save_var_obj(self, params, var): """ :param params: :param var: :return: DataFileUtils object_info: objid - the numerical id of the object. name - the name of the object. type - the type of the object. save_date - the save date of the object. ver - the version of the object. saved_by - the user that saved or copied the object. wsid - the id of the workspace containing the object. workspace - the name of the workspace containing the object. chsum - the md5 checksum of the object. size - the size of the object in bytes. meta - arbitrary user-supplied metadata about the object. """ print('Saving Variation to workspace...\n') if var: if not 'variation_object_name' in params: var_obj_name = 'variation_'+str(uuid.uuid4()) else: var_obj_name = params['variation_object_name'] var_obj_info = self.dfu.save_objects({ 'id': self.dfu.ws_name_to_id(params['workspace_name']), 'objects': [{ 'type': 'KBaseGwasData.Variations', 'data': var, 'name': var_obj_name }] })[0] return var_obj_info else: raise ValueError('Variation object blank, cannot not save to workspace!') def _validate_sample_attribute_ref(self, params): #params["sample_attribute_ref"] = '' #just for testing if not params['sample_attribute_ref']: sample_attribute_mapping_file = os.path.join(self.scratch ,"sample_attribute.tsv") #hardcoded for testing self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file) logging.info("Uploading sample attribute file to ref") vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock( {'file_path': sample_attribute_mapping_file, 'make_handle': 1} ) shock_id = vcf_sample_attribute_shock_file_ref['shock_id'] ws_id = self.dfu.ws_name_to_id(params['workspace_name']) import_params = { 'input_shock_id' : shock_id, 'output_ws_id': ws_id, 'output_obj_name': 'Sample_attribute'} ret = self.gapi.file_to_attribute_mapping(import_params) params['sample_attribute_ref'] = ret['attribute_mapping_ref'] def import_vcf(self, params): # VCF validation # VCF file validation file_valid_result = self.validate_vcf(params) self._validate_sample_attribute_ref(params) # VCF file parsing self.vcf_info = self._parse_vcf_data(params) # Validate vcf chromosome ids against assembly chromosome ids self._validate_assembly_ids(params) # Validate vcf genotypes against sample meta data ids self._validate_sample_ids(params) # Variation object construction # construct contigs_info contigs_info = self._construct_contig_info(params) # construct variation var = self._construct_variation(params, contigs_info) # Save variation object to workspace var_wksp_obj = self._save_var_obj(params, var) return [var_wksp_obj, var]
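A tentative sketch of the parameters import_vcf appears to expect, pieced together from the keys the methods above read; values are placeholders and the exact key set may differ in the real app spec.

params = {
    'workspace_name': 'my_workspace',
    'vcf_staging_file_path': 'variants.vcf',   # staging path (or a /kb/module/test/ path in unit tests)
    'genome_or_assembly_ref': '1/2/3',
    'genome_ref': '1/2/3',                     # presence of this key makes _validate_assembly_ids
                                               # resolve assembly_ref via the genome
    # 'assembly_ref': '1/7/1',                 # ...or pass the assembly reference directly
    'sample_attribute_ref': '4/5/6',           # if empty, one is generated from the VCF header
    'variation_object_name': 'my_variation',   # optional; a uuid-based name is used if missing
}
# var_obj_info, variation_dict = vcf_to_variation.import_vcf(params)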
class FastaToAssembly: def __init__(self, callback_url, scratch, ws_url): self.scratch = scratch self.dfu = DataFileUtil(callback_url) self.ws = Workspace(ws_url) # Note added X due to kb|g.1886.fasta self.valid_chars = "-ACGTUWSMKRYBDHVNX" self.amino_acid_specific_characters = "PLIFQE" def import_fasta(self, ctx, params): print('validating parameters') self.validate_params(params) print('staging input files') fasta_file_path = self.stage_input(params) if 'min_contig_length' in params: min_contig_length = int(params['min_contig_length']) print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)') fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length) print(f'parsing FASTA file: {fasta_file_path}') assembly_data = self.parse_fasta(fasta_file_path, params) print(f' - parsed {assembly_data["num_contigs"]} contigs,{assembly_data["dna_size"]} bp') print('saving assembly to KBase') # save file to shock and build handle fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path) # construct the output object assembly_object_to_save = self.build_assembly_object(assembly_data, fasta_file_handle_info, params) json.dump(assembly_object_to_save, open(self.scratch+"/example.json", 'w')) # save to WS and return if 'workspace_id' in params: workspace_id = int(params['workspace_id']) else: workspace_id = self.dfu.ws_name_to_id(params['workspace_name']) assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'], assembly_object_to_save) return assembly_info def build_assembly_object(self, assembly_data, fasta_file_handle_info, params): """ construct the WS object data to save based on the parsed info and params """ assembly_data['assembly_id'] = params['assembly_name'] assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid'] fasta_file_handle_info['handle'] = fasta_file_handle_info['handle'] assembly_data['fasta_handle_info'] = fasta_file_handle_info assembly_data['type'] = 'Unknown' if 'type' in params: assembly_data['type'] = params['type'] if 'taxon_ref' in params: info = self.ws.get_object_info3({'objects':[{'ref': params['taxon_ref']}]})['infos'][0] assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}' if 'external_source' in params: assembly_data['external_source'] = params['external_source'] if 'external_source_id' in params: assembly_data['external_source_id'] = params['external_source_id'] if 'external_source_origination_date' in params: assembly_data['external_source_origination_date'] = params['external_source_origination_date'] return sort_dict(assembly_data) def parse_fasta(self, fasta_file_path, params): """ Do the actual work of inspecting each contig """ # variables to store running counts of things total_length = 0 base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0} md5_list = [] # map from contig_id to contig_info all_contig_data = {} extra_contig_info = {} if'contig_info' in params: extra_contig_info = params['contig_info'] for record in SeqIO.parse(fasta_file_path, "fasta"): # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()), # id='gi|113968346|ref|NC_008321.1|', # name='gi|113968346|ref|NC_008321.1|', # description='gi|113968346|ref|NC_008321.1| Shewanella sp. 
MR-4 chromosome, complete genome', # dbxrefs=[]) sequence = str(record.seq).upper() contig_info = { 'contig_id': record.id, 'name': record.id, 'description': record.description[len(record.id):].strip(), 'length': len(record.seq) } # 1) compute sequence character statistics running total total_length += contig_info['length'] sequence_count_table = dict(Counter(sequence)) for character in sequence_count_table: if character in base_counts: base_counts[character] = base_counts[character] + sequence_count_table[character] else: base_counts[character] = sequence_count_table[character] if character not in self.valid_chars: if character in self.amino_acid_specific_characters: raise ValueError('This FASTA file may have amino acids in it instead ' 'of the required nucleotides.') raise ValueError(f"This FASTA file has non nucleic acid characters: " f"{character}") # 2) record number of 'N' characters (only set if there are some) Ncount = 0 if 'N' in sequence_count_table: Ncount = sequence_count_table['N'] contig_info['Ncount'] = Ncount # 2b) record if the contig is circular if record.id in extra_contig_info: if 'is_circ' in extra_contig_info[record.id]: contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ']) if 'description' in extra_contig_info[record.id]: contig_info['description'] = str(extra_contig_info[record.id]['description']) # 3) record md5 checksum contig_md5 = md5(sequence.encode()).hexdigest() contig_info['md5'] = contig_md5 md5_list.append(contig_md5) # 4) record the all important GC to ~3 significant digits GC_count = 0 for base in ['G', 'C']: if base in sequence_count_table: GC_count += sequence_count_table[base] contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5) # 5) add to contig list if contig_info['contig_id'] in all_contig_data: raise ValueError('The FASTA header key ' + contig_info['contig_id'] + 'appears more than once in the file') all_contig_data[contig_info['contig_id']] = contig_info # Aggregate stats for the data total_gc_content = None if total_length > 0: total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5) assembly_data = { 'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(), 'base_counts': base_counts, 'dna_size': total_length, 'gc_content': total_gc_content, 'contigs': all_contig_data, 'num_contigs': len(all_contig_data) } return assembly_data @staticmethod def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length): """ generates SeqRecords iterator for writing from a legacy contigset object """ rows = 0 rows_added = 0 for record in fasta_record_iter: rows += 1 if len(record.seq) >= min_contig_length: rows_added += 1 yield record print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter ' f'than {(min_contig_length)} bp.') def filter_contigs_by_length(self, fasta_file_path, min_contig_length): """ removes all contigs less than the min_contig_length provided """ filtered_fasta_file_path = fasta_file_path + '.filtered.fa' fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta') SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length), filtered_fasta_file_path, 'fasta') return filtered_fasta_file_path def save_assembly_object(self, workspace_id, assembly_name, obj_data): print('Saving Assembly to Workspace') sys.stdout.flush() if len(obj_data["contigs"]) == 0: raise ValueError('There are no contigs to save, thus there is no valid assembly.') obj_info = self.dfu.save_objects({'id': workspace_id, 'objects': 
[{'type': 'KBaseGenomeAnnotations.Assembly', 'data': obj_data, 'name': assembly_name }] })[0] return obj_info def save_fasta_file_to_shock(self, fasta_file_path): """ Given the path to the file, upload to shock and return Handle information returns: typedef structure { string shock_id; Handle handle; string node_file_name; string size; } FileToShockOutput; """ print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK') sys.stdout.flush() return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1}) def stage_input(self, params): """ Setup the input_directory by fetching the files and returning the path to the file""" file_path = None if 'file' in params: if not os.path.isfile(params['file']['path']): raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application specified a file ('+params['file']['path']+') that is missing. Please check the application logs for details.') file_path = os.path.abspath(params['file']['path']) elif 'shock_id' in params: print(f'Downloading file from SHOCK node: {params["shock_id"]}') sys.stdout.flush() input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4())) os.makedirs(input_directory) file_name = self.dfu.shock_to_file({'file_path': input_directory, 'shock_id': params['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) elif 'ftp_url' in params: print(f'Downloading file from: {params["ftp_url"]}') sys.stdout.flush() file_path = self.dfu.download_web_file({'file_url': params['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] # extract the file if it is compressed if file_path is not None: unpacked_file = self.dfu.unpack_file({'file_path': file_path}) return unpacked_file['file_path'] raise ValueError('No valid FASTA could be extracted based on the input parameters') @staticmethod def validate_params(params): for key in ('workspace_name', 'assembly_name'): if key not in params: raise ValueError('required "' + key + '" field was not defined') # one and only one of either 'file', 'shock_id', or ftp_url is required input_count = 0 for key in ('file', 'shock_id', 'ftp_url'): if key in params and params[key] is not None: input_count = input_count + 1 if key == 'file': if not isinstance(params[key], dict) or 'path' not in params[key]: raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"') if input_count == 0: raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"') if input_count > 1: raise ValueError('required exactly one FASTA file as input source, you set more than one of ' + 'these fields: "file", "shock_id", or "ftp_url"')
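A hedged sketch of the import_fasta parameters, following validate_params above: workspace_name (or workspace_id) and assembly_name are required, and exactly one of file, shock_id or ftp_url supplies the FASTA; values are placeholders.

params = {
    'workspace_name': 'my_workspace',          # or 'workspace_id': <numeric id>
    'assembly_name': 'my_assembly',
    'file': {'path': '/kb/module/work/tmp/contigs.fa'},  # exactly one of file/shock_id/ftp_url
    # optional:
    'min_contig_length': 500,                  # drop contigs shorter than this
    'type': 'isolate',                         # stored as-is; defaults to 'Unknown'
    # 'contig_info': {'contig_1': {'is_circ': 1, 'description': 'plasmid'}},
}
# assembly_info = fasta_to_assembly.import_fasta(ctx, params)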
class MatrixUtil: def _validate_import_matrix_from_excel_params(self, params): """ _validate_import_matrix_from_excel_params: validates params passed to import_matrix_from_excel method """ logging.info('start validating import_matrix_from_excel params') # check for required parameters for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in self.matrix_types: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) scale = params.get('scale') if scale not in SCALE_TYPES: raise ValueError('Unknown scale type: {}'.format(scale)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) refs = {k: v for k, v in params.items() if "_ref" in k} return (obj_type, file_path, params.get('workspace_name'), params.get('matrix_name'), refs, scale) def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info('Start uploading file to shock: {}'.format(file_path)) file_to_shock_params = {'file_path': file_path, 'pack': 'zip'} shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id') return shock_id @staticmethod def _mkdir_p(path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise @staticmethod def _find_between(s, start, end): """ _find_between: find string in between start and end """ return re.search('{}(.*){}'.format(start, end), s).group(1) @staticmethod def _write_mapping_sheet(file_path, sheet_name, mapping, index): """ _write_mapping_sheet: write mapping to sheet """ df_dict = collections.OrderedDict() df_dict[index[0]] = [] df_dict[index[1]] = [] for key, value in mapping.items(): df_dict.get(index[0]).append(key) df_dict.get(index[1]).append(value) df = pd.DataFrame.from_dict(df_dict) with pd.ExcelWriter(file_path, engine='openpyxl') as writer: writer.book = load_workbook(file_path) df.to_excel(writer, sheet_name=sheet_name) def _generate_report(self, matrix_obj_ref, workspace_name): """ _generate_report: generate summary report """ report_params = { 'message': '', 'objects_created': [{ 'ref': matrix_obj_ref, 'description': 'Imported Matrix' }], 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_excel_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output @staticmethod def _process_mapping_sheet(file_path, sheet_name): """ _process_mapping: process mapping sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str') except XLRDError: return dict() else: mapping = {value[0]: value[1] for value in df.values.tolist()} return mapping def _process_attribute_mapping_sheet(self, file_path, 
sheet_name, matrix_name, workspace_id): """ _process_attribute_mapping_sheet: process attribute_mapping sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name) except XLRDError: return '' else: obj_name = f'{matrix_name}_{sheet_name}' result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name)) df.to_excel(file_path) import_attribute_mapping_params = { 'output_obj_name': obj_name, 'output_ws_id': workspace_id, 'input_file_path': file_path } ref = self.attr_util.file_to_attribute_mapping( import_attribute_mapping_params) return ref.get('attribute_mapping_ref') @staticmethod def _file_to_df(file_path): logging.info('start parsing file content to data frame') try: df = pd.read_excel(file_path, sheet_name='data', index_col=0) except XLRDError: try: df = pd.read_excel(file_path, index_col=0) logging.warning( 'WARNING: A sheet named "data" was not found in the attached file,' ' proceeding with the first sheet as the data sheet.') except XLRDError: try: reader = pd.read_csv(file_path, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(file_path, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide tsv, excel or csv file' ) df.index = df.index.astype('str') df.columns = df.columns.astype('str') # fill NA with "None" so that they are properly represented as nulls in the KBase Object df = df.where((pd.notnull(df)), None) return df def _file_to_data(self, file_path, refs, matrix_name, workspace_id): logging.info('Start reading and converting excel file data') data = refs df = self._file_to_df(file_path) matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist() } data.update({'data': matrix_data}) data.update( self._get_axis_attributes('col', matrix_data, refs, file_path, matrix_name, workspace_id)) data.update( self._get_axis_attributes('row', matrix_data, refs, file_path, matrix_name, workspace_id)) # processing metadata metadata = self._process_mapping_sheet(file_path, 'metadata') data['attributes'] = {} data['search_attributes'] = [] for k, v in metadata.items(): k = k.strip() v = v.strip() if k in TYPE_ATTRIBUTES: data[k] = v else: data['attributes'][k] = v data['search_attributes'].append(" | ".join((k, v))) return data def _get_axis_attributes(self, axis, matrix_data, refs, file_path, matrix_name, workspace_id): """Get the row/col_attributemapping and mapping of ids, validating as needed""" # Parameter specified mappings should take precedence over tabs in excel so only process # if attributemapping_ref is missing: attr_data = {} if refs.get(f'{axis}_attributemapping_ref'): attributemapping_ref = refs[f'{axis}_attributemapping_ref'] else: attributemapping_ref = self._process_attribute_mapping_sheet( file_path, f'{axis}_attribute_mapping', matrix_name, workspace_id) if attributemapping_ref: attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref # col/row_mappings may not be supplied id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping') if id_mapping: attr_data[f'{axis}_mapping'] = id_mapping # if no mapping, axis ids must match the attribute mapping elif attributemapping_ref: am_data = self.dfu.get_objects( {'object_refs': [attributemapping_ref]})['data'][0]['data'] axis_ids = matrix_data[f'{axis}_ids'] unmatched_ids = set(axis_ids) - set(am_data['instances'].keys()) if unmatched_ids: name = "Column" if 
axis == 'col' else "Row" raise ValueError( f"The following {name} IDs from the uploaded matrix do not match " f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}" f"\nPlease verify the input data or upload an excel file with a" f"{name} mapping tab.") else: # just gen the IDs in this matrix attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids} return attr_data @staticmethod def _build_header_str(attribute_names): #not going to be used header_str = '' width = 100.0 / len(attribute_names) header_str += '<tr class="header">' header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format( width) for attribute_name in attribute_names: header_str += '<th style="width:{0:.2f}%;"'.format(width) header_str += '>{}</th>'.format(attribute_name) header_str += '</tr>' return header_str def _build_html_str(self, row_mapping, attributemapping_data, row_ids): #not going to be used logging.info('Start building html replacement') attribute_names = [ attributes.get('attribute') for attributes in attributemapping_data.get('attributes') ] header_str = self._build_header_str(attribute_names) table_str = '' instances = attributemapping_data.get('instances') for feature_id, attribute_id in row_mapping.items(): if feature_id in row_ids: feature_instances = instances.get(attribute_id) table_str += '<tr>' table_str += '<td>{}</td>'.format(feature_id) for feature_instance in feature_instances: table_str += '<td>{}</td>'.format(feature_instance) table_str += '</tr>' return header_str, table_str def _generate_search_html_report(self, header_str, table_str): #generate search html report html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'search.html') shutil.copy2( os.path.join(os.path.dirname(__file__), 'templates', 'kbase_icon.png'), output_directory) shutil.copy2( os.path.join(os.path.dirname(__file__), 'templates', 'search_icon.png'), output_directory) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'templates', 'search_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '//HEADER_STR', header_str) report_template = report_template.replace( '//TABLE_STR', table_str) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Search Matrix App' }) return html_report def _generate_search_report(self, header_str, table_str, workspace_name): logging.info('Start creating report') output_html_files = self._generate_search_html_report( header_str, table_str) report_params = { 'message': '', 'workspace_name': workspace_name, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_matrix_filter_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output @staticmethod def _filter_value_data(value_data, remove_ids, dimension): """Filters a value matrix based on column or row ids""" def _norm_id(_id): return _id.replace(" ", "_") val_df = 
pd.DataFrame(value_data['values'], index=value_data['row_ids'], columns=value_data['col_ids'], dtype='object') if dimension == 'row': filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore') filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids], axis=0, errors='ignore') elif dimension == 'col': filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore') filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids], axis=1, errors='ignore') else: raise ValueError('Unexpected dimension: {}'.format(dimension)) filtered_value_data = { "values": filtered_df.values.tolist(), "col_ids": list(filtered_df.columns), "row_ids": list(filtered_df.index), } return filtered_value_data def _standardize_df(self, df, with_mean=True, with_std=True): logging.info("Standardizing matrix data") df.fillna(0, inplace=True) x_train = df.values scaler = preprocessing.StandardScaler(with_mean=with_mean, with_std=with_std).fit(x_train) standardized_values = scaler.transform(x_train) standardize_df = pd.DataFrame(index=df.index, columns=df.columns, data=standardized_values) return standardize_df def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.data_util = DataUtil(config) self.attr_util = AttributesUtil(config) self.matrix_types = [ x.split(".")[1].split('-')[0] for x in self.data_util.list_generic_types() ] def standardize_matrix(self, params): """ standardize a matrix """ input_matrix_ref = params.get('input_matrix_ref') workspace_name = params.get('workspace_name') new_matrix_name = params.get('new_matrix_name') with_mean = params.get('with_mean', 1) with_std = params.get('with_std', 1) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name input_matrix_obj = self.dfu.get_objects( {'object_refs': [input_matrix_ref]})['data'][0] input_matrix_info = input_matrix_obj['info'] input_matrix_name = input_matrix_info[1] input_matrix_data = input_matrix_obj['data'] if not new_matrix_name: current_time = time.localtime() new_matrix_name = input_matrix_name + time.strftime( '_%H_%M_%S_%Y_%m_%d', current_time) data_matrix = self.data_util.fetch_data({ 'obj_ref': input_matrix_ref }).get('data_matrix') df = pd.read_json(data_matrix) standardize_df = self._standardize_df(df, with_mean, with_std) new_matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': standardize_df.values.tolist() } input_matrix_data['data'] = new_matrix_data logging.info("Saving new standardized matrix object") info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "type": input_matrix_info[2], "data": input_matrix_data, "name": new_matrix_name }] })[0] new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) objects_created = [{ 'ref': new_matrix_obj_ref, 'description': 'Standardized Matrix' }] report_params = { 'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) return { 'new_matrix_obj_ref': new_matrix_obj_ref, 'report_name': output['name'], 'report_ref': output['ref'] } def filter_matrix(self, params): #not going to be used """ filter_matrix: create sub-matrix based on input feature_ids arguments: matrix_obj_ref: object reference of a 
matrix workspace_name: workspace name feature_ids: string of feature ids that result matrix contains filtered_matrix_name: name of newly created filtered matrix object """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') remove_ids = params.get('remove_ids') dimension = params.get('dimension') filtered_matrix_name = params.get('filtered_matrix_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_info = matrix_source.get('info') matrix_data = matrix_source.get('data') matrix_type = self._find_between(matrix_info[2], '\.', '\-') value_data = matrix_data.get('data') remove_ids = [x.strip() for x in remove_ids.split(',')] filtered_value_data = self._filter_value_data(value_data, remove_ids, dimension) # if the matrix has changed shape, update the mappings if len(filtered_value_data['row_ids']) < len( matrix_data['data']['row_ids']): if matrix_data.get('row_mapping'): matrix_data['row_mapping'] = { k: matrix_data['row_mapping'][k] for k in filtered_value_data['row_ids'] } if matrix_data.get('feature_mapping'): matrix_data['feature_mapping'] = { k: matrix_data['feature_mapping'][k] for k in filtered_value_data['row_ids'] } if len(filtered_value_data['col_ids']) < len( matrix_data['data']['col_ids']): if matrix_data.get('col_mapping'): matrix_data['col_mapping'] = { k: matrix_data['col_mapping'][k] for k in filtered_value_data['col_ids'] } matrix_data['data'] = filtered_value_data if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name filtered_matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(matrix_type), 'obj_name': filtered_matrix_name, 'data': matrix_data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]} report_output = self._generate_report(filtered_matrix_obj_ref, workspace_name) returnVal.update(report_output) return returnVal def search_matrix(self, params): #not going to be used """ search_matrix: generate a HTML report that allows users to select feature ids arguments: matrix_obj_ref: object reference of a matrix workspace_name: workspace name """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_data = matrix_source.get('data') row_mapping = matrix_data.get('row_mapping') row_attributemapping_ref = matrix_data.get('row_attributemapping_ref') row_ids = matrix_data['data']['row_ids'] if not (row_mapping and row_attributemapping_ref): raise ValueError( 'Matrix obejct is missing either row_mapping or row_attributemapping_ref' ) attributemapping_data = self.dfu.get_objects( {"object_refs": [row_attributemapping_ref]})['data'][0]['data'] header_str, table_str = self._build_html_str(row_mapping, attributemapping_data, row_ids) returnVal = self._generate_search_report(header_str, table_str, workspace_name) return returnVal def import_matrix_from_excel(self, params): """ import_matrix_from_excel: import matrix object from excel arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_attributemapping_ref: column AttributeMapping reference 
row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (obj_type, file_path, workspace_name, matrix_name, refs, scale) = self._validate_import_matrix_from_excel_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name data = self._file_to_data(file_path, refs, matrix_name, workspace_id) data['scale'] = scale if params.get('description'): data['description'] = params['description'] matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_ref': matrix_obj_ref} report_output = self._generate_report(matrix_obj_ref, workspace_name) returnVal.update(report_output) return returnVal def export_matrix(self, params): """ export_matrix: univeral downloader for matrix data object arguments: obj_ref: generics object reference optional arguments: generics_module: select the generics data to be retrieved from e.g. for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; and only data is needed generics_module should be {'data': 'FloatMatrix2D'} """ logging.info('Start exporting matrix') if 'input_ref' in params: params['obj_ref'] = params.pop('input_ref') obj_source = self.dfu.get_objects( {"object_refs": [params.get('obj_ref')]})['data'][0] obj_data = obj_source.get('data') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_source.get('info')[1])) data_matrix = self.data_util.fetch_data(params).get('data_matrix') df = pd.read_json(data_matrix) df.to_excel(file_path, sheet_name='data') if obj_data.get('col_mapping'): self._write_mapping_sheet(file_path, 'col_mapping', obj_data.get('col_mapping'), ['col_name', 'instance_name']) obj_data.pop('col_mapping') if obj_data.get('row_mapping'): self._write_mapping_sheet(file_path, 'row_mapping', obj_data.get('row_mapping'), ['row_name', 'instance_name']) obj_data.pop('row_mapping') try: obj_data.pop('data') except KeyError: logging.warning('Missing key [data]') obj_data.update(obj_data.get('attributes', {})) # flatten for printing self._write_mapping_sheet(file_path, 'metadata', obj_data, ['name', 'value']) shock_id = self._upload_to_shock(file_path) return {'shock_id': shock_id}
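# --- Hedged usage sketch (added for illustration; not part of the original module) ---
# Wires MatrixUtil together to document the call shape of import_matrix_from_excel().
# The config URLs/token, workspace name, scale value, and file path are placeholder
# assumptions; DataUtil/AttributesUtil built in __init__ need their own service
# endpoints as well, and the call only succeeds in a live KBase SDK environment.
# The required parameter keys mirror _validate_import_matrix_from_excel_params().
def example_import_matrix_from_excel():
    config = {
        'SDK_CALLBACK_URL': 'http://localhost:9999',   # placeholder callback URL
        'scratch': '/kb/module/work/tmp',              # placeholder scratch dir
        'KB_AUTH_TOKEN': '<token>',                    # placeholder token
        # ...plus whatever endpoints DataUtil/AttributesUtil require
        # (e.g. workspace-url, shock-url, srv-wiz-url, search-url) -- assumed.
    }
    util = MatrixUtil(config)
    params = {
        'obj_type': 'ExpressionMatrix',        # one of the types named in the docstring
        'matrix_name': 'my_expression_matrix',
        'workspace_name': 'my_workspace',
        'scale': 'log2',                       # must be a member of SCALE_TYPES (assumed value)
        'input_file_path': '/kb/module/work/tmp/matrix.xlsx',
        # optional: 'col_attributemapping_ref', 'row_attributemapping_ref', 'description'
    }
    return util.import_matrix_from_excel(params)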
class AttributesUtil: def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.srv_wiz_url = config['srv-wiz-url'] self.scratch = config['scratch'] self.dfu = DataFileUtil(self.callback_url) self.kbse = KBaseSearchEngine(config['search-url']) self.data_util = DataUtil(config) self.wsClient = workspaceService(self.ws_url, token=self.token) self.DEFAULT_ONTOLOGY_ID = "Custom:Term" self.DEFAULT_UNIT_ID = "Custom:Unit" self.ONT_LABEL_DEL = " - " self.ONT_TERM_DEL = ":" @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def file_to_attribute_mapping(self, params): """Convert a user supplied file to a compound set""" if 'input_file_path' in params: scratch_file_path = params['input_file_path'] elif 'input_shock_id' in params: scratch_file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') else: raise ValueError( "Must supply either a input_shock_id or input_file_path") attr_mapping = self._file_to_am_obj(scratch_file_path) info = self.dfu.save_objects({ "id": params['output_ws_id'], "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": attr_mapping, "name": params['output_obj_name'] }] })[0] return { "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4]) } def append_file_to_attribute_mapping(self, staging_file_subdir_path, old_am_ref, output_ws_id, new_am_name=None): """append an attribute mapping file to existing attribute mapping object """ download_staging_file_params = { 'staging_file_subdir_path': staging_file_subdir_path } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') append_am_data = self._file_to_am_obj(scratch_file_path) old_am_obj = self.dfu.get_objects({'object_refs': [old_am_ref]})['data'][0] old_am_info = old_am_obj['info'] old_am_name = old_am_info[1] old_am_data = old_am_obj['data'] new_am_data = self._check_and_append_am_data(old_am_data, append_am_data) if not new_am_name: current_time = time.localtime() new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d', current_time) info = self.dfu.save_objects({ "id": output_ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": new_am_data, "name": new_am_name }] })[0] return { "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4]) } def update_matrix_attribute_mapping(self, params): dimension = params.get('dimension') if dimension not in ['col', 'row']: raise ValueError('Please use "col" or "row" for input dimension') workspace_name = params.get('workspace_name') old_matrix_ref = params.get('input_matrix_ref') old_matrix_obj = self.dfu.get_objects( {'object_refs': [old_matrix_ref]})['data'][0] old_matrix_info = old_matrix_obj['info'] old_matrix_data = old_matrix_obj['data'] old_am_ref = old_matrix_data.get( '{}_attributemapping_ref'.format(dimension)) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = 
workspace_name if not old_am_ref: raise ValueError( 'Matrix object does not have {} attribute mapping'.format( dimension)) new_am_ref = self.append_file_to_attribute_mapping( params['staging_file_subdir_path'], old_am_ref, workspace_id, params['output_am_obj_name'])['attribute_mapping_ref'] old_matrix_data['{}_attributemapping_ref'.format( dimension)] = new_am_ref info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "type": old_matrix_info[2], "data": old_matrix_data, "name": params['output_matrix_obj_name'] }] })[0] new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) objects_created = [{ 'ref': new_am_ref, 'description': 'Updated Attribute Mapping' }, { 'ref': new_matrix_obj_ref, 'description': 'Updated Matrix' }] report_params = { 'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) return { 'new_matrix_obj_ref': new_matrix_obj_ref, 'new_attribute_mapping_ref': new_am_ref, 'report_name': output['name'], 'report_ref': output['ref'] } def _check_and_append_am_data(self, old_am_data, append_am_data): exclude_keys = {'attributes', 'instances'} new_am_data = { k: old_am_data[k] for k in set(list(old_am_data.keys())) - exclude_keys } old_attrs = old_am_data.get('attributes') old_insts = old_am_data.get('instances') append_attrs = append_am_data.get('attributes') append_insts = append_am_data.get('instances') # checking duplicate attributes old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs] append_attrs_names = [ append_attr.get('attribute') for append_attr in append_attrs ] duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names) if duplicate_attrs: error_msg = 'Duplicate attribute mappings: [{}]'.format( duplicate_attrs) raise ValueError(error_msg) # checking missing instances missing_inst = old_insts.keys() - append_insts.keys() if missing_inst: error_msg = 'Appended attribute mapping misses [{}] instances'.format( missing_inst) raise ValueError(error_msg) new_attrs = old_attrs + append_attrs new_am_data['attributes'] = new_attrs new_insts = deepcopy(old_insts) for inst_name, val in new_insts.items(): append_val = append_insts.get(inst_name) val.extend(append_val) new_am_data['instances'] = new_insts return new_am_data def _am_data_to_df(self, data): """ Converts a compound set object data to a dataframe """ attributes = pd.DataFrame(data['attributes']) attributes.rename(columns=lambda x: x.replace("ont", "ontology"). 
capitalize().replace("_", " ")) instances = pd.DataFrame(data['instances']) am_df = attributes.join(instances) return am_df def _clusterset_data_to_df(self, data): """ Converts a cluster set object data to a dataframe """ original_matrix_ref = data.get('original_data') data_matrix = self.data_util.fetch_data({ 'obj_ref': original_matrix_ref }).get('data_matrix') data_df = pd.read_json(data_matrix) clusters = data.get('clusters') id_name_list = [ list(cluster.get('id_to_data_position').keys()) for cluster in clusters ] id_names = [item for sublist in id_name_list for item in sublist] if set(data_df.columns.tolist()) == set( id_names): # cluster is based on columns data_df = data_df.T cluster_names = [None] * data_df.index.size cluster_id = 0 for cluster in clusters: item_ids = list(cluster.get('id_to_data_position').keys()) item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids] for idx in item_idx: cluster_names[idx] = cluster_id cluster_id += 1 data_df['cluster'] = cluster_names return data_df def _ws_obj_to_df(self, input_ref): """Converts workspace obj to a DataFrame""" res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0] name = res['info'][1] obj_type = res['info'][2] if "KBaseExperiments.AttributeMapping" in obj_type: cs_df = self._am_data_to_df(res['data']) elif "KBaseExperiments.ClusterSet" in obj_type: cs_df = self._clusterset_data_to_df(res['data']) else: err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type) err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet' raise ValueError("err_msg") return name, cs_df, obj_type def _file_to_am_obj(self, scratch_file_path): try: df = pd.read_excel(scratch_file_path, dtype='str') except XLRDError: df = pd.read_csv(scratch_file_path, sep=None, dtype='str') df = df.replace('nan', '') if df.columns[1].lower() == "attribute ontology id": am_obj = self._df_to_am_obj(df) else: am_obj = self._isa_df_to_am_object(df) return am_obj def _df_to_am_obj(self, am_df): """Converts a dataframe from a user file to a compound set object""" if not len(am_df): raise ValueError("No attributes in supplied files") attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute") instance_df = am_df.drop(attribute_df.columns, axis=1) if not len(instance_df.columns): raise ValueError( "Unable to find any instance columns in supplied file") attribute_df.rename( columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(), inplace=True) if "attribute" not in attribute_df.columns: raise ValueError( "Unable to find a 'attribute' column in supplied file") attribute_df['source'] = 'upload' attribute_fields = ('attribute', 'unit', 'attribute_ont_id', 'unit_ont_id', 'source') attributes = attribute_df.filter( items=attribute_fields).to_dict('records') print(attributes) self._validate_attribute_values( am_df.set_index(attribute_df.attribute).iterrows()) attribute_mapping = { 'ontology_mapping_method': "User Curation", 'attributes': [self._add_ontology_info(f) for f in attributes], 'instances': instance_df.to_dict('list') } return attribute_mapping def _isa_df_to_am_object(self, isa_df): skip_columns = { 'Raw Data File', 'Derived Data File', 'Array Data File', 'Image File' } if 'Sample Name' in isa_df.columns and not any( isa_df['Sample Name'].duplicated()): isa_df.set_index('Sample Name', inplace=True) elif 'Assay Name' in isa_df.columns and not any( isa_df['Assay Name'].duplicated()): isa_df.set_index('Assay Name', inplace=True) elif not any(isa_df[isa_df.columns[0]].duplicated()): logging.warning(f'Using 
{isa_df.columns[0]} as ID column') isa_df.set_index(isa_df.columns[0], inplace=True) else: raise ValueError( "Unable to detect an ID column that was unigue for each row. " f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}" ) self._validate_attribute_values(isa_df.iteritems()) attribute_mapping = { 'ontology_mapping_method': "User Curation - ISA format" } attribute_mapping[ 'attributes'], new_skip_cols = self._get_attributes_from_isa( isa_df, skip_columns) reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore') attribute_mapping['instances'] = reduced_isa.T.to_dict('list') return attribute_mapping def _validate_attribute_values(self, attribute_series): errors = {} for attr, vals in attribute_series: try: validator = getattr(AttributeValidation, attr) attr_errors = validator(vals) if attr_errors: errors[attr] = attr_errors except AttributeError: continue if errors: for attr, attr_errors in errors.items(): logging.error( f'Attribute {attr} had the following validation errors:\n' "\n".join(attr_errors) + '\n') raise ValueError( f'The following attributes failed validation: {", ".join(errors)}' f'\n See the log for details') def _get_attributes_from_isa(self, isa_df, skip_columns): attributes = [] # associate attribute columns with the other columns that relate to them for i, col in enumerate(isa_df.columns): if col.startswith('Term Source REF'): skip_columns.add(col) last_attr = attributes[-1] if '_unit' in last_attr: last_attr['_unit_ont'] = col else: last_attr['_val_ont'] = col elif col.startswith('Term Accession Number'): # If the term Accession is a web link only grab the last bit # Similarly, sometimes the number is prefixed with the term source e.x. UO_0000012 isa_df[col] = isa_df[col].map( lambda x: x.split("/")[-1].split("_")[-1]) skip_columns.add(col) last_attr = attributes[-1] if '_unit' in last_attr: last_attr['_unit_accession'] = col else: last_attr['_val_accession'] = col elif col.startswith('Unit'): skip_columns.add(col) last_attr = attributes[-1] if last_attr.get('unit'): raise ValueError( "More than one unit column is supplied for attribute {}" .format(last_attr['attribute'])) last_attr['_unit'] = col elif col not in skip_columns: split_col = col.split("|", 1) if len(split_col) > 1: attributes.append({ "attribute": split_col[0], "attribute_ont_id": split_col[1], "source": "upload" }) else: attributes.append({"attribute": col, "source": "upload"}) # handle the categories for each attribute for i, attribute in enumerate(attributes): if '_val_accession' in attribute: category_df = isa_df[[ attribute['attribute'], attribute.pop('_val_ont'), attribute.pop('_val_accession') ]].drop_duplicates() category_df[ 'attribute_ont_id'] = category_df.iloc[:, 1].str.cat( category_df.iloc[:, 2], ":") category_df['value'] = category_df[attribute['attribute']] cats = category_df.set_index(attribute['attribute'])[[ 'value', 'attribute_ont_id' ]].to_dict('index') attribute['categories'] = { k: self._add_ontology_info(v) for k, v in cats.items() } if '_unit' in attribute: units = isa_df[attribute.pop('_unit')].unique() if len(units) > 1: raise ValueError( "More than one unit type is supplied for attribute {}: {}" .format(attribute['attribute'], units)) attribute['unit'] = units[0] if '_unit_ont' in attribute: unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat( isa_df[attribute.pop('_unit_accession')], ":").unique() if len(units) > 1: raise ValueError( "More than one unit ontology is supplied for attribute " "{}: {}".format(attribute['attribute'], unit_ont)) 
attribute['unit_ont_id'] = unit_ont[0] attributes[i] = self._add_ontology_info(attribute) return attributes, skip_columns def _search_ontologies(self, term, closest=False): """ Match to an existing KBase ontology term :param term: Test to match :param closest: if false, term must exactly match an ontology ID :return: dict(ontology_ref, id) """ params = { "object_types": ["OntologyTerm"], "match_filter": { "lookup_in_keys": { "id": { "value": term } } }, "access_filter": { "with_private": 0, "with_public": 1 }, "pagination": { "count": 1 }, "post_processing": { "skip_data": 1 } } if closest: params['match_filter'] = {"full_text_in_all": term} res = self.kbse.search_objects(params) if not res['objects']: return None term = res['objects'][0] return { "ontology_ref": term['guid'].split(":")[1], "id": term['key_props']['id'] } def _add_ontology_info(self, attribute): """Searches KBASE ontologies for terms matching the user supplied attributes and units. Add the references if found""" optionals = { "unit", "unit_ont_id", "unit_ont_ref", } attribute = { k: v for k, v in attribute.items() if k not in optionals or v != "" } ont_info = self._search_ontologies( attribute.get('attribute_ont_id', "").replace("_", ":")) if ont_info: attribute['attribute_ont_ref'] = ont_info['ontology_ref'] attribute['attribute_ont_id'] = ont_info['id'] elif not attribute.get( 'attribute_ont_id') or attribute['attribute_ont_id'] == ":": attribute.pop('attribute_ont_id', None) if attribute.get('unit'): ont_info = self._search_ontologies( attribute.get('unit_ont_id', '').replace("_", ":")) if ont_info: attribute['unit_ont_ref'] = ont_info['ontology_ref'] attribute['unit_ont_id'] = ont_info['id'] elif not attribute.get( 'attribute_ont_id') or attribute['unit_ont_id'] == ":": attribute.pop('unit_ont_id', None) return attribute def to_tsv(self, params): """Convert an compound set to TSV file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".tsv") df.to_csv(files['file_path'], sep="\t", index=False) return _id, files def to_excel(self, params): """Convert an compound set to Excel file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".xlsx") writer = pd.ExcelWriter(files['file_path']) if "KBaseExperiments.AttributeMapping" in obj_type: df.to_excel(writer, "Attributes", index=False) elif "KBaseExperiments.ClusterSet" in obj_type: df.to_excel(writer, "ClusterSet", index=True) # else is checked in `_ws_obj_to_df` writer.save() return _id, files def export(self, file, name, input_ref): """Saves a set of files to SHOCK for export""" export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4())) os.makedirs(export_package_dir) shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [input_ref] }) return {'shock_id': package_details['shock_id']}
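# --- Hedged usage sketch (added for illustration; not part of the original module) ---
# Documents the call shape of AttributesUtil.file_to_attribute_mapping().  All config
# values, the workspace id, and the file path are placeholders (DataUtil constructed in
# __init__ may need further keys); the parameter keys ('input_file_path' or
# 'input_shock_id', 'output_ws_id', 'output_obj_name') come from the method itself.
# Running it requires a live KBase SDK environment.
def example_file_to_attribute_mapping():
    config = {
        'workspace-url': 'https://kbase.example/services/ws',            # placeholder
        'SDK_CALLBACK_URL': 'http://localhost:9999',                     # placeholder
        'KB_AUTH_TOKEN': '<token>',                                      # placeholder
        'shock-url': 'https://kbase.example/services/shock-api',         # placeholder
        'srv-wiz-url': 'https://kbase.example/services/service_wizard',  # placeholder
        'search-url': 'https://kbase.example/services/searchapi',        # placeholder
        'scratch': '/kb/module/work/tmp',                                # placeholder
    }
    util = AttributesUtil(config)
    return util.file_to_attribute_mapping({
        'input_file_path': '/kb/module/work/tmp/attributes.xlsx',  # Excel/TSV/CSV handled
        'output_ws_id': 12345,                                     # placeholder workspace id
        'output_obj_name': 'my_attribute_mapping',
    })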
class PDBUtil: def _validate_import_pdb_file_params(self, params): """ _validate_import_matrix_from_excel_params: validates params passed to import_matrix_from_excel method """ # check for required parameters for p in ['structure_name', 'workspace_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file( {'shock_id': params['input_shock_id'], 'file_path': self.scratch}).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file( {'staging_file_subdir_path': params.get('input_staging_file_path')} ).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) return file_path, params.get('workspace_name'), params.get('structure_name') def _file_to_data(self, file_path): """Do the PDB conversion""" pdb1 = file_path structure = parser.get_structure("test", pdb1) model = structure[0] chain_no = 0 res_no = 0 atom_no = 0 pp_list = [] pp_no = 0 for model in structure: for chain in model: chain_no += 1 for residue in model.get_residues(): if PDB.is_aa(residue): res_no += 1 for atom in residue.get_atoms(): atom_no += 1 for pp in ppb.build_peptides(structure): pp_no += 1 my_seq= pp.get_sequence() pp_list += str(my_seq) seq = ''.join(pp_list) return { 'name': os.path.basename(file_path), 'num_chains': chain_no, 'num_residues': res_no, 'num_atoms': atom_no, 'protein': { 'id': os.path.basename(file_path), 'sequence': seq, 'md5': hashlib.md5(seq.encode()).hexdigest() }, } def _get_pdb_shock_id(self, obj_ref): """Return the shock id for the PDB file""" obj_data = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]['data'] return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id'] def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info('Start uploading file to shock: {}'.format(file_path)) file_to_shock_params = { 'file_path': file_path, 'pack': 'gzip', 'make_handle': True, } shock_id = self.dfu.file_to_shock(file_to_shock_params)['handle']['hid'] return shock_id def _generate_html_report(self, header_str, table_str): #TODO: make this work with the PDB viewer html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'search.html') with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('//HEADER_STR', header_str) report_template = report_template.replace('//TABLE_STR', table_str) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({'file_path': output_directory, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Search Matrix App'}) return html_report def _generate_report(self, pdb_obj_ref, workspace_name): """ _generate_report: generate summary report """ # included as an example. 
        # Replace with your own implementation
        # output_html_files = self._generate_html_report(header_str, table_str)
        report_params = {'message': 'You uploaded a PDB file!',
                         # 'html_links': output_html_files,
                         # 'direct_html_link_index': 0,
                         'objects_created': [{'ref': pdb_obj_ref,
                                              'description': 'Imported PDB'}],
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_pdb_from_staging_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])

    def import_model_pdb_file(self, params):
        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path)
        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        logging.info(data)

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [
                {'type': 'KBaseStructure.ModelProteinStructure',
                 'name': pdb_name,
                 'data': data}]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'structure_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name)
        returnVal.update(report_output)

        return returnVal

    def export_pdb(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def structure_to_pdb_file(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}
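# --- Hedged usage sketch (added for illustration; not part of the original module) ---
# Documents the call shape of PDBUtil.import_model_pdb_file().  Config values and the
# file path are placeholders; the required keys ('structure_name', 'workspace_name',
# plus one of 'input_file_path' / 'input_shock_id' / 'input_staging_file_path') follow
# _validate_import_pdb_file_params().  Running it requires a live KBase SDK environment.
def example_import_model_pdb_file():
    config = {
        'SDK_CALLBACK_URL': 'http://localhost:9999',                    # placeholder
        'scratch': '/kb/module/work/tmp',                               # placeholder
        'KB_AUTH_TOKEN': '<token>',                                     # placeholder
        'handle-service-url': 'https://kbase.example/services/handle',  # placeholder
    }
    util = PDBUtil(config)
    return util.import_model_pdb_file({
        'structure_name': 'my_structure',
        'workspace_name': 'my_workspace',
        'input_file_path': '/kb/module/work/tmp/1fat.pdb',  # placeholder PDB file
        'description': 'example upload',                    # optional
    })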
class FeatureSetBuilder: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_upload_featureset_from_diff_expr_params(self, params): """ _validate_upload_featureset_from_diff_expr_params: validates params passed to upload_featureset_from_diff_expr method """ log('start validating upload_featureset_from_diff_expr params') # check for required parameters for p in [ 'diff_expression_ref', 'workspace_name', 'p_cutoff', 'q_cutoff', 'fold_change_cutoff' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) p = params.get('fold_scale_type') if p and p != 'logarithm': raise ValueError( '"fold_scale_type" parameter must be set to "logarithm", if used' ) @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, workspace_name): """ _generate_report: generate summary report """ log('start creating report') output_html_files = self._generate_html_report( up_feature_set_ref_list, down_feature_set_ref_list) objects_created = list() for up_feature_set_ref in up_feature_set_ref_list: objects_created += [{ 'ref': up_feature_set_ref, 'description': 'Upper FeatureSet Object' }] for down_feature_set_ref in down_feature_set_ref_list: objects_created += [{ 'ref': down_feature_set_ref, 'description': 'Lower FeatureSet Object' }] for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list: objects_created += [{ 'ref': filtered_expression_matrix_ref, 'description': 'Filtered ExpressionMatrix Object' }] report_params = { 'message': '', 'workspace_name': workspace_name, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') uppper_feature_content = '' for up_feature_set_ref in up_feature_set_ref_list: feature_set_obj = self.ws.get_objects2( {'objects': [{ 'ref': up_feature_set_ref }]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) uppper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format( feature_set_name, len(feature_ids)) lower_feature_content 
= '' for down_feature_set_ref in down_feature_set_ref_list: feature_set_obj = self.ws.get_objects2( {'objects': [{ 'ref': down_feature_set_ref }]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format( feature_set_name, len(feature_ids)) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<tr><td>Upper_FeatureSet</td></tr>', uppper_feature_content) report_template = report_template.replace( '<tr><td>Lower_FeatureSet</td></tr>', lower_feature_content) result_file.write(report_template) html_report.append({ 'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report' }) return html_report def _process_diff_expression(self, diff_expression_set_ref, result_directory, condition_label_pair): """ _process_diff_expression: process differential expression object info """ log('start processing differential expression object') diff_expr_set_data = self.ws.get_objects2( {'objects': [{ 'ref': diff_expression_set_ref }]})['data'][0]['data'] set_items = diff_expr_set_data['items'] diff_expr_matrix_file_name = 'gene_results.csv' diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name) with open(diff_expr_matrix_file, 'w') as csvfile: fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for set_item in set_items: diff_expression_ref = set_item['ref'] diff_expression_data = self.ws.get_objects2( {'objects': [{ 'ref': diff_expression_ref }]})['data'][0]['data'] label_string = set_item['label'] label_list = [x.strip() for x in label_string.split(',')] condition_1 = label_list[0] condition_2 = label_list[1] if condition_1 in condition_label_pair and condition_2 in condition_label_pair: genome_id = diff_expression_data['genome_ref'] matrix_data = diff_expression_data['data'] selected_diff_expression_ref = diff_expression_ref with open(diff_expr_matrix_file, 'a') as csvfile: row_ids = matrix_data.get('row_ids') row_values = matrix_data.get('values') writer = csv.DictWriter(csvfile, fieldnames=fieldnames) for pos, row_id in enumerate(row_ids): row_value = row_values[pos] writer.writerow({ 'gene_id': row_id, 'log2_fold_change': row_value[0], 'p_value': row_value[1], 'q_value': row_value[2] }) return diff_expr_matrix_file, genome_id, selected_diff_expression_ref def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name): """ _generate_feature_set: generate FeatureSet object KBaseCollections.FeatureSet type: typedef structure { string description; list<feature_id> element_ordering; mapping<feature_id, list<genome_ref>> elements; } FeatureSet; """ log('start saving KBaseCollections.FeatureSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) elements = {feature_id: [genome_id] for feature_id in feature_ids} feature_set_data = { 'description': 'Generated FeatureSet from DifferentialExpression', 'element_ordering': feature_ids, 'elements': elements } 
object_type = 'KBaseCollections.FeatureSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': feature_set_data, 'name': feature_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) return feature_set_obj_ref def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value, comp_fold_change_cutoff): """ _process_matrix_file: filter matrix file by given cutoffs """ log('start processing matrix file') up_feature_ids = [] down_feature_ids = [] if comp_fold_change_cutoff < 0: comp_fold_change_cutoff = -comp_fold_change_cutoff with open(diff_expr_matrix_file, 'r') as file: reader = csv.DictReader(file) for row in reader: feature_id = row['gene_id'] row_p_value = row['p_value'] row_q_value = row['q_value'] row_fold_change_cutoff = row['log2_fold_change'] null_value = {'NA', 'null', ''} col_value = {row_p_value, row_q_value, row_fold_change_cutoff} if not col_value.intersection(null_value): p_value_condition = float(row_p_value) <= comp_p_value q_value_condition = float(row_q_value) <= comp_q_value up_matches_condition = (p_value_condition and q_value_condition and (float(row_fold_change_cutoff) >= comp_fold_change_cutoff)) down_matches_condition = (p_value_condition and q_value_condition and (float(row_fold_change_cutoff) <= -comp_fold_change_cutoff)) if up_matches_condition: up_feature_ids.append(feature_id) elif down_matches_condition: down_feature_ids.append(feature_id) return list(set(up_feature_ids)), list(set(down_feature_ids)) def _filter_expression_matrix(self, expression_matrix_ref, feature_ids, workspace_name, filtered_expression_matrix_suffix="", diff_expression_matrix_ref=None, filtered_expression_matrix_name=None): """ _filter_expression_matrix: generated filtered expression matrix """ log('start saving ExpressionMatrix object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_matrix_obj = self.dfu.get_objects( {'object_refs': [expression_matrix_ref]})['data'][0] expression_matrix_info = expression_matrix_obj['info'] expression_matrix_data = expression_matrix_obj['data'] expression_matrix_name = expression_matrix_info[1] if not filtered_expression_matrix_name: if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name): filtered_expression_matrix_name = re.sub( '_*[Ee]xpression_*[Mm]atrix', filtered_expression_matrix_suffix, expression_matrix_name) else: filtered_expression_matrix_name = expression_matrix_name + \ filtered_expression_matrix_suffix filtered_expression_matrix_data = expression_matrix_data.copy() data = filtered_expression_matrix_data['data'] row_ids = data['row_ids'] values = data['values'] filtered_data = data.copy() filtered_row_ids = list() filtered_values = list() for pos, row_id in enumerate(row_ids): if row_id in feature_ids: filtered_row_ids.append(row_id) filtered_values.append(values[pos]) filtered_data['row_ids'] = filtered_row_ids filtered_data['values'] = filtered_values filtered_expression_matrix_data['data'] = filtered_data expression_obj = { 'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data, 'name': filtered_expression_matrix_name } # we now save the filtering DEM in a EM field added for this purpose if diff_expression_matrix_ref: expression_obj['data'][ 'diff_expr_matrix_ref'] = diff_expression_matrix_ref expression_obj['extra_provenance_input_refs'] = [ 
diff_expression_matrix_ref ] save_object_params = {'id': workspace_id, 'objects': [expression_obj]} dfu_oi = self.dfu.save_objects(save_object_params)[0] filtered_expression_matrix_ref = "{}/{}/{}".format( dfu_oi[6], dfu_oi[0], dfu_oi[4]) return filtered_expression_matrix_ref def _xor(self, a, b): return bool(a) != bool(b) def _check_input_labels(self, condition_pairs, available_condition_labels): """ _check_input_labels: check input condition pairs """ checked = True for condition_pair in condition_pairs: try: label_string = condition_pair['label_string'][0].strip() label_list = [x.strip() for x in label_string.split(',')] first_label = label_list[0] second_label = label_list[1] except IndexError: raise IndexError('No selected values for Partial Condition') if first_label not in available_condition_labels: error_msg = 'Condition: {} is not availalbe. '.format( first_label) error_msg += 'Available conditions: {}'.format( available_condition_labels) raise ValueError(error_msg) if second_label not in available_condition_labels: error_msg = 'Condition: {} is not availalbe. '.format( second_label) error_msg += 'Available conditions: {}'.format( available_condition_labels) raise ValueError(error_msg) if first_label == second_label: raise ValueError('Input conditions are the same') return checked def _get_condition_labels(self, diff_expression_set_ref): """ _get_condition_labels: get all possible condition label pairs """ log('getting all possible condition pairs') condition_label_pairs = list() available_condition_labels = set() diff_expression_set_obj = self.ws.get_objects2( {'objects': [{ 'ref': diff_expression_set_ref }]})['data'][0] diff_expression_set_data = diff_expression_set_obj['data'] items = diff_expression_set_data.get('items') for item in items: label_string = item['label'] label_list = [x.strip() for x in label_string.split(',')] condition_label_pairs.append(label_list) available_condition_labels |= set(label_list) log('all possible condition pairs:\n{}'.format(condition_label_pairs)) return condition_label_pairs, available_condition_labels def _get_feature_ids(self, genome_ref, ids): """ _get_feature_ids: get feature ids from genome """ genome_features = self.gsu.search({ 'ref': genome_ref, 'limit': len(ids), 'structured_query': { "$or": [{ "feature_id": x } for x in ids] }, 'sort_by': [['feature_id', True]] })['features'] features_ids = set( (feature.get('feature_id') for feature in genome_features)) return features_ids def _build_fs_obj(self, params): new_feature_set = { 'description': '', 'element_ordering': [], 'elements': {} } genome_ref = params['genome'] if params.get('base_feature_sets', []) and None not in params['base_feature_sets']: base_feature_sets = self.dfu.get_objects( {'object_refs': params['base_feature_sets']})['data'] for ret in base_feature_sets: base_set = ret['data'] base_set_name = ret['info'][1] new_feature_set['element_ordering'] += [ x for x in base_set['element_ordering'] if x not in new_feature_set['elements'] ] for element, genome_refs in base_set['elements'].items(): if element in new_feature_set['elements']: new_feature_set['elements'][element] += [ x for x in genome_refs if x not in new_feature_set['elements'][element] ] else: new_feature_set['elements'][element] = genome_refs new_feature_set[ 'description'] += 'From FeatureSet {}: {}\n'.format( base_set_name, base_set.get('description')) new_feature_ids = [] if params.get('feature_ids'): if isinstance(params['feature_ids'], str): new_feature_ids += params['feature_ids'].split(',') else: 
new_feature_ids += params['feature_ids'] if params.get('feature_ids_custom'): new_feature_ids += params['feature_ids_custom'].split(',') if new_feature_ids: genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids) for new_feature in new_feature_ids: if new_feature not in genome_feature_ids: raise ValueError( 'Feature ID {} does not exist in the supplied genome {}'. format(new_feature, genome_ref)) if new_feature in new_feature_set['elements']: if genome_ref not in new_feature_set['elements'][new_feature]: new_feature_set['elements'][new_feature].append(genome_ref) else: new_feature_set['elements'][new_feature] = [genome_ref] new_feature_set['element_ordering'].append(new_feature) if params.get('description'): new_feature_set['description'] = params['description'] return new_feature_set def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.ws = Workspace(self.ws_url, token=self.token) self.dfu = DataFileUtil(self.callback_url) self.gsu = GenomeSearchUtil(self.callback_url) self.scratch = config['scratch'] def upload_featureset_from_diff_expr(self, params): """ upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression based on given threshold cutoffs required params: diff_expression_ref: DifferetialExpressionMatrixSet object reference expression_matrix_ref: ExpressionMatrix object reference p_cutoff: p value cutoff q_cutoff: q value cutoff fold_scale_type: one of ["linear", "log2+1", "log10+1"] fold_change_cutoff: fold change cutoff feature_set_suffix: Result FeatureSet object name suffix filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix workspace_name: the name of the workspace it gets saved to return: result_directory: folder path that holds all files generated up_feature_set_ref_list: list of generated upper FeatureSet object reference down_feature_set_ref_list: list of generated down FeatureSet object reference filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ self._validate_upload_featureset_from_diff_expr_params(params) diff_expression_set_ref = params.get('diff_expression_ref') diff_expression_set_info = self.ws.get_object_info3( {"objects": [{ "ref": diff_expression_set_ref }]})['infos'][0] diff_expression_set_name = diff_expression_set_info[1] result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) (available_condition_label_pairs, available_condition_labels ) = self._get_condition_labels(diff_expression_set_ref) run_all_combinations = params.get('run_all_combinations') condition_pairs = params.get('condition_pairs') if not self._xor(run_all_combinations, condition_pairs): error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' " error_msg += "or provide partial condition pairs. 
Don't do both or neither" raise ValueError(error_msg) if run_all_combinations: condition_label_pairs = available_condition_label_pairs else: if self._check_input_labels(condition_pairs, available_condition_labels): condition_label_pairs = list() for condition_pair in condition_pairs: label_string = condition_pair['label_string'][0].strip() condition_labels = [ x.strip() for x in label_string.split(',') ] condition_label_pairs.append(condition_labels) up_feature_set_ref_list = list() down_feature_set_ref_list = list() filtered_expression_matrix_ref_list = list() for condition_label_pair in condition_label_pairs: condition_string = '-'.join(reversed(condition_label_pair)) diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression( diff_expression_set_ref, result_directory, condition_label_pair) up_feature_ids, down_feature_ids = self._process_matrix_file( diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'), params.get('fold_change_cutoff')) filtered_em_name = _sanitize_name(condition_string) + params.get( 'filtered_expression_matrix_suffix') if params.get('expression_matrix_ref'): filtered_expression_matrix_ref = self._filter_expression_matrix( params.get('expression_matrix_ref'), up_feature_ids + down_feature_ids, params.get('workspace_name'), "", diff_expr_matrix_ref, filtered_em_name) filtered_expression_matrix_ref_list.append( filtered_expression_matrix_ref) feature_set_suffix = params.get('feature_set_suffix', "") up_feature_set_name = "{}_{}_up{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) up_feature_set_ref = self._generate_feature_set( up_feature_ids, genome_id, params.get('workspace_name'), up_feature_set_name) up_feature_set_ref_list.append(up_feature_set_ref) down_feature_set_name = "{}_{}_down{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) down_feature_set_ref = self._generate_feature_set( down_feature_ids, genome_id, params.get('workspace_name'), down_feature_set_name) down_feature_set_ref_list.append(down_feature_set_ref) returnVal = { 'result_directory': result_directory, 'up_feature_set_ref_list': up_feature_set_ref_list, 'down_feature_set_ref_list': down_feature_set_ref_list, 'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list } report_output = self._generate_report( up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, params.get('workspace_name')) returnVal.update(report_output) return returnVal def filter_matrix_with_fs(self, params): self.validate_params( params, ('feature_set_ref', 'workspace_name', 'expression_matrix_ref', 'filtered_expression_matrix_suffix')) ret = self.dfu.get_objects( {'object_refs': [params['feature_set_ref']]})['data'][0] feature_set = ret['data'] feature_set_name = ret['info'][1] feature_ids = set(feature_set['elements'].keys()) filtered_matrix_ref = self._filter_expression_matrix( params['expression_matrix_ref'], feature_ids, params['workspace_name'], params['filtered_expression_matrix_suffix']) objects_created = [{ 'ref': filtered_matrix_ref, 'description': 'Filtered ExpressionMatrix Object' }] message = "Filtered Expression Matrix based of the {} feature ids present in {}" \ .format(len(feature_ids), feature_set_name) report_params = { 'message': message, 'workspace_name': params['workspace_name'], 'objects_created': objects_created, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4()) } kbase_report_client = 
KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) return { 'filtered_expression_matrix_ref': filtered_matrix_ref, 'report_name': output['name'], 'report_ref': output['ref'] } def build_feature_set(self, params): self.validate_params(params, { 'output_feature_set', 'workspace_name', }, { 'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets', 'description' }) feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets') if not any([params.get(x) for x in feature_sources]): raise ValueError( "You must supply at least one feature source: {}".format( ", ".join(feature_sources))) workspace_id = self.dfu.ws_name_to_id(params['workspace_name']) new_feature_set = self._build_fs_obj(params) save_object_params = { 'id': workspace_id, 'objects': [{ 'type': 'KBaseCollections.FeatureSet', 'data': new_feature_set, 'name': params['output_feature_set'] }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) objects_created = [{ 'ref': feature_set_obj_ref, 'description': 'Feature Set' }] message = 'A new feature set containing {} features was created.'.format( len(new_feature_set['elements'])) report_params = { 'message': message, 'workspace_name': params['workspace_name'], 'objects_created': objects_created, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) return { 'feature_set_ref': feature_set_obj_ref, 'report_name': output['name'], 'report_ref': output['ref'] }
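# A usage sketch for build_feature_set above. Every name and reference below is a
# hypothetical placeholder: 'output_feature_set' and 'workspace_name' are required,
# and at least one of feature_ids / feature_ids_custom / base_feature_sets must be
# supplied, mirroring the validation at the top of the method.
example_build_feature_set_params = {
    'workspace_name': 'my_workspace',        # hypothetical workspace name
    'output_feature_set': 'my_feature_set',  # name for the new FeatureSet object
    'genome': '123/4/5',                     # hypothetical Genome object reference
    'feature_ids': 'geneA,geneB',            # comma-separated string (a list also works)
    'description': 'Features of interest',
}
# The returned 'feature_set_ref' is "<workspace id>/<object id>/<version>", assembled
# from positions 6, 0 and 4 of the object_info tuple returned by DataFileUtil.save_objects.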
class GenomeInterface: def __init__(self, config): self.handle_url = config.handleURL self.shock_url = config.shockURL self.sw_url = config.srvWizURL self.token = config.token self.auth_service_url = config.authServiceUrl self.callback_url = config.callbackURL self.auth_client = _KBaseAuth(self.auth_service_url) self.dfu = DataFileUtil(self.callback_url) self.kbse = KBaseSearchEngine(config.raw['search-url']) self.taxon_wsname = config.raw['taxon-workspace-name'] self.scratch = config.raw['scratch'] self.ws_large_data = WsLargeDataIO(self.callback_url) @staticmethod def _validate_save_one_genome_params(params): """ _validate_save_one_genome_params: validates params passed to save_one_genome method """ log('start validating save_one_genome params') # check for required parameters for p in ['workspace', 'name', 'data']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _check_shock_response(self, response, errtxt): """ _check_shock_response: check shock node response (Copied from DataFileUtil) """ log('start checking shock response') if not response.ok: try: err = json.loads(response.content)['error'][0] except: # this means shock is down or not responding. self.log("Couldn't parse response error content from Shock: " + response.content) response.raise_for_status() raise ValueError(errtxt + str(err)) def _own_handle(self, genome_data, handle_property): """ _own_handle: check that handle_property point to shock nodes owned by calling user """ log('start checking handle {} ownership'.format(handle_property)) if handle_property in genome_data: handle_id = genome_data[handle_property] hs = HandleService(self.handle_url, token=self.token) handles = hs.hids_to_handles([handle_id]) shock_id = handles[0]['id'] # Copy from DataFileUtil.own_shock_node implementation: header = {'Authorization': 'Oauth {}'.format(self.token)} res = requests.get(self.shock_url + '/node/' + shock_id + '/acl/?verbosity=full', headers=header, allow_redirects=True) self._check_shock_response( res, 'Error getting ACLs for Shock node {}: '.format(shock_id)) owner = res.json()['data']['owner']['username'] user_id = self.auth_client.get_user(self.token) if owner != user_id: log('start copying node to owner: {}'.format(user_id)) dfu_shock = self.dfu.copy_shock_node({ 'shock_id': shock_id, 'make_handle': True }) handle_id = dfu_shock['handle']['hid'] genome_data[handle_property] = handle_id def _check_dna_sequence_in_features(self, genome): """ _check_dna_sequence_in_features: check dna sequence in each feature """ log('start checking dna sequence in each feature') if 'features' in genome: features_to_work = {} for feature in genome['features']: if not ('dna_sequence' in feature and feature['dna_sequence']): features_to_work[feature['id']] = feature['location'] if len(features_to_work) > 0: aseq = AssemblySequenceAPI(self.sw_url, token=self.token) get_dna_params = {'requested_features': features_to_work} if 'assembly_ref' in genome: get_dna_params['assembly_ref'] = genome['assembly_ref'] elif 'contigset_ref' in genome: get_dna_params['contigset_ref'] = genome['contigset_ref'] else: # Nothing to do (it may be test genome without contigs)... 
return dna_sequences = aseq.get_dna_sequences( get_dna_params)['dna_sequences'] for feature in genome['features']: if feature['id'] in dna_sequences: feature['dna_sequence'] = dna_sequences[feature['id']] feature['dna_sequence_length'] = len( feature['dna_sequence']) def get_one_genome(self, params): """Fetch a genome using WSLargeDataIO and return it as a python dict""" log('fetching genome object') res = self.ws_large_data.get_objects(params)['data'][0] data = json.load(open(res['data_json_file'])) return data, res['info'] #return self.dfu.get_objects(params)['data'][0] def save_one_genome(self, params): log('start saving genome object') self._validate_save_one_genome_params(params) workspace = params['workspace'] name = params['name'] data = params['data'] if 'meta' in params and params['meta']: meta = params['meta'] else: meta = {} if params.get('upgrade') or 'feature_counts' not in data: data = self._update_genome(data) # check all handles point to shock nodes owned by calling user self._own_handle(data, 'genbank_handle_ref') self._own_handle(data, 'gff_handle_ref') self._check_dna_sequence_in_features(data) data['warnings'] = self.validate_genome(data) # dump genome to scratch for upload data_path = os.path.join(self.scratch, name + ".json") json.dump(data, open(data_path, 'w')) if 'hidden' in params and str( params['hidden']).lower() in ('yes', 'true', 't', '1'): hidden = 1 else: hidden = 0 if isinstance(workspace, int) or workspace.isdigit(): workspace_id = workspace else: workspace_id = self.dfu.ws_name_to_id(workspace) save_params = { 'id': workspace_id, 'objects': [{ 'type': 'KBaseGenomes.Genome', 'data_json_file': data_path, 'name': name, 'meta': meta, 'hidden': hidden }] } dfu_oi = self.ws_large_data.save_objects(save_params)[0] returnVal = {'info': dfu_oi, 'warnings': data['warnings']} return returnVal def old_retrieve_taxon(self, taxon_wsname, scientific_name): """ old_retrieve_taxon: use SOLR to retrieve taxonomy and taxon_reference """ default = ('Unconfirmed Organism: ' + scientific_name, 'ReferenceTaxons/unknown_taxon', 'Unknown', 11) solr_url = 'http://kbase.us/internal/solr-ci/search/' solr_core = 'taxonomy_ci' query = '/select?q=scientific_name:"{}"&fl=scientific_name%2Cscientific_lineage%2Ctaxonomy_id%2Cdomain%2Cgenetic_code&rows=5&wt=json' match = re.match("\S+\s?\S*", scientific_name) if not match: return default res = requests.get(solr_url + solr_core + query.format(match.group(0))) results = res.json()['response']['docs'] if not results: return default taxonomy = results[0]['scientific_lineage'] taxon_reference = '{}/{}_taxon'.format(taxon_wsname, results[0]['taxonomy_id']) domain = results[0]['domain'] genetic_code = results[0]['genetic_code'] return taxonomy, taxon_reference, domain, genetic_code def retrieve_taxon(self, taxon_wsname, scientific_name): """ _retrieve_taxon: retrieve taxonomy and taxon_reference """ default = ('Unconfirmed Organism: ' + scientific_name, 'ReferenceTaxons/unknown_taxon', 'Unknown', 11) def extract_values(search_obj): return (search_obj['data']['scientific_lineage'], taxon_wsname + "/" + search_obj['object_name'], search_obj['data']['domain'], search_obj['data'].get('genetic_code', 11)) search_params = { "object_types": ["taxon"], "match_filter": { "lookup_in_keys": { "scientific_name": { "value": scientific_name } }, "exclude_subobjects": 1 }, "access_filter": { "with_private": 0, "with_public": 1 }, "sorting_rules": [{ "is_object_property": 0, "property": "timestamp", "ascending": 0 }] } objects = 
self.kbse.search_objects(search_params)['objects'] if len(objects): if len(objects) > 100000: raise RuntimeError( "Too many matching taxons returned for {}. " "Potential issue with searchAPI.".format(scientific_name)) return extract_values(objects[0]) search_params['match_filter']['lookup_in_keys'] = { "aliases": { "value": scientific_name } } objects = self.kbse.search_objects(search_params)['objects'] if len(objects): return extract_values(objects[0]) return default @staticmethod def determine_tier(source): """ Given a user provided source parameter, assign a source and genome tier """ low_source = source.lower() if 'refseq' in low_source: if 'reference' in low_source: return "RefSeq", ['Reference', 'Representative', 'ExternalDB'] if 'representative' in low_source: return "RefSeq", ['Representative', 'ExternalDB'] if 'user' in low_source: return "RefSeq", ['ExternalDB', 'User'] return "RefSeq", ['ExternalDB'] if 'phytozome' in low_source: if 'flagship' in source: return "Phytosome", [ 'Reference', 'Representative', 'ExternalDB' ] return "Phytosome", ['Representative', 'ExternalDB'] if 'ensembl' in low_source: if 'user' in low_source: return "Ensembl", ['ExternalDB', 'User'] return "Ensembl", ['Representative', 'ExternalDB'] return source, ['User'] def _update_genome(self, genome): """Checks for missing required fields and fixes breaking changes""" # do top level updates ontologies_present = defaultdict(dict) ontologies_present.update(genome.get('ontologies_present', {})) ontology_events = genome.get('ontology_events', []) if 'genome_tier' not in genome: genome['source'], genome['genome_tiers'] = self.determine_tier( genome['source']) if 'molecule_type' not in genome: genome['molecule_type'] = 'Unknown' if 'taxon_ref' not in genome: genome['taxonomy'], genome['taxon_ref'], genome['domain'], \ genome['genetic_code'] = self.retrieve_taxon( self.taxon_wsname, genome['scientific_name']) if any([ x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs') ]): if 'assembly_ref' in genome: assembly_data = self.dfu.get_objects({ 'object_refs': [genome['assembly_ref']], 'ignore_errors': 0 })['data'][0]['data'] genome["gc_content"] = assembly_data['gc_content'] genome["dna_size"] = assembly_data['dna_size'] genome["md5"] = assembly_data['md5'] genome["num_contigs"] = assembly_data['num_contigs'] elif 'contigset_ref' in genome: contig_data = self.dfu.get_objects({ 'object_refs': [genome['contigset_ref']], 'included': ['contigs/[*]/length', 'md5'], 'ignore_errors': 0 })['data'][0]['data'] genome["gc_content"] = None genome["dna_size"] = sum( (c['length'] for c in contig_data['contigs'])) genome["md5"] = contig_data['md5'] genome["num_contigs"] = len(contig_data['contigs']) if 'cdss' not in genome: genome['cdss'] = [] if 'mrnas' not in genome: genome['mrnas'] = [] if 'non_coding_features' not in genome: genome['non_coding_features'] = [] # do feature level updates retained_features = [] type_counts = defaultdict(int) for field in ('mrnas', 'cdss', 'features'): for i, feat in enumerate(genome.get(field, [])): if 'function' in feat and not isinstance(feat, list): feat['functions'] = feat['function'].split('; ') del feat['function'] if 'aliases' in feat: if not feat['aliases']: del feat['aliases'] elif not isinstance(feat['aliases'][0], (list, tuple)): feat['aliases'] = [['gene_synonym', x] for x in feat['aliases']] if 'type' in feat: type_counts[feat['type']] += 1 for ontology, terms in feat.get('ontology_terms', {}).items(): for term in terms.values(): if isinstance(term, list): continue 
ontologies_present[ontology][ term['id']] = term['term_name'] term_evidence = [] for ev in term['evidence']: ev['id'] = ontology ev['ontology_ref'] = term["ontology_ref"] if ev not in ontology_events: ontology_events.append(ev) term_evidence.append(ontology_events.index(ev)) feat['ontology_terms'][ontology][ term['id']] = term_evidence # remove deprecated fields feat.pop('protein_families', None) feat.pop('atomic_regulons', None) feat.pop('orthologs', None) feat.pop('coexpressed_fids', None) feat.pop('publications', None) feat.pop('regulon_data', None) feat.pop('subsystem_data', None) if 'dna_sequence_length' not in feat: feat['dna_sequence_length'] = sum( x[3] for x in feat['location']) if 'protein_translation' in feat and 'protein_md5' not in feat: feat['protein_md5'] = hashlib.md5( feat.get('protein_translation', '').encode('utf8')).hexdigest() # split all the stuff lumped together in old versions into the # right arrays if field == 'features': if feat.get('type', 'gene') == 'gene': if not feat.get('cdss', []): genome['non_coding_features'].append(feat) else: retained_features.append(feat) elif feat.get('type', 'gene') == 'CDS': if 'parent_gene' not in feat: feat['parent_gene'] = '' genome['cdss'].append(feat) elif feat.get('type', 'gene') == 'mRNA': if 'parent_gene' not in feat: feat['parent_gene'] = '' genome['mrnas'].append(feat) genome['features'] = retained_features if ontology_events: genome['ontology_events'] = ontology_events if ontologies_present: genome['ontologies_present'] = ontologies_present type_counts['mRNA'] = len(genome.get('mrnas', [])) type_counts['CDS'] = len(genome.get('cdss', [])) type_counts['protein_encoding_gene'] = len(genome['features']) type_counts['non_coding_features'] = len( genome.get('non_coding_features', [])) genome['feature_counts'] = type_counts return genome @staticmethod def validate_genome(g): """ Run a series of checks on the genome object and return any warnings """ def _get_size(obj): return sys.getsizeof(json.dumps(obj)) def sizeof_fmt(num): for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: if abs(num) < 1024.0: return "%3.1f %sB" % (num, unit) num /= 1024.0 return "%.1f %sB" % (num, 'Yi') allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'} log('Validating genome object contents') warnings = g.get('warnings', []) # this will fire for some annotation methods like PROKKA if g['domain'] == "Bacteria" and len(g.get('cdss', [])) != len( g['features']): warnings.append( "For prokaryotes, CDS array should generally be the" " same length as the Features array.") if g['domain'] == "Eukaryota" and len(g.get('features', [])) == len( g.get('cdss', [])): warnings.append( "For Eukaryotes, CDS array should not be the same " "length as the Features array due to RNA splicing.") if "molecule_type" in g and g['molecule_type'] not in { "DNA", 'ds-DNA' }: if g.get('domain', '') not in {'Virus', 'Viroid'} and \ g['molecule_type'] not in {"DNA", 'ds-DNA'}: warnings.append("Genome molecule_type {} is not expected " "for domain {}.".format( g['molecule_type'], g.get('domain', ''))) if "genome_tiers" in g and set(g['genome_tiers']) - allowed_tiers: warnings.append("Undefined terms in genome_tiers: " + ", ".join(set(g['genome_tiers']) - allowed_tiers)) if g['taxon_ref'] == "ReferenceTaxons/unknown_taxon": warnings.append('Unable to determine organism taxonomy') #MAX_GENOME_SIZE = 1 #300000000 # UNCOMMENT TO TEST FAILURE MODE. 
        # Set it to whatever size is needed for the test.
        feature_lists = ('mrnas', 'features', 'non_coding_features', 'cdss')
        master_key_sizes = dict()
        # Set want_full_breakdown to True to see a per-key breakdown of feature sizes.
        # Keeping this flag off makes standard uploads run faster.
        want_full_breakdown = False
        for x in feature_lists:
            if x in g:
                need_to_remove_dna_sequence = _get_size(g) > MAX_GENOME_SIZE
                if need_to_remove_dna_sequence or want_full_breakdown:
                    feature_type_dict_keys = dict()
                    for feature in g[x]:
                        for feature_key in list(feature.keys()):
                            if feature_key == "dna_sequence" and need_to_remove_dna_sequence:
                                del feature["dna_sequence"]
                            else:
                                if feature_key not in feature_type_dict_keys:
                                    feature_type_dict_keys[feature_key] = 0
                                feature_type_dict_keys[feature_key] += sys.getsizeof(
                                    feature[feature_key])
                    for feature_key in feature_type_dict_keys:
                        feature_type_dict_keys[feature_key] = sizeof_fmt(
                            feature_type_dict_keys[feature_key])
                    master_key_sizes[x] = feature_type_dict_keys
                print("{}: {}".format(x, sizeof_fmt(_get_size(g[x]))))
        total_size = _get_size(g)
        print("Total size {}".format(sizeof_fmt(total_size)))
        if want_full_breakdown:
            print("Here is the breakdown of the sizes of the feature list elements: {}"
                  .format(str(master_key_sizes)))
        if total_size > MAX_GENOME_SIZE:
            print("Here is the breakdown of the sizes of the feature list elements: {}"
                  .format(str(master_key_sizes)))
            raise ValueError(
                "This genome size of {} exceeds the maximum permitted size of {}.\nHere "
                "is the breakdown for feature lists and their respective sizes:\n{}"
                .format(sizeof_fmt(total_size), sizeof_fmt(MAX_GENOME_SIZE),
                        str(master_key_sizes)))
        return warnings
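# Illustrative only: how GenomeInterface.determine_tier (above) maps a free-text
# "source" string to a canonical source name and genome tier list. The inputs are
# hypothetical examples, not values taken from real uploads.
def _determine_tier_examples():
    assert GenomeInterface.determine_tier("RefSeq Reference") == \
        ("RefSeq", ['Reference', 'Representative', 'ExternalDB'])
    assert GenomeInterface.determine_tier("refseq user") == \
        ("RefSeq", ['ExternalDB', 'User'])
    # Anything unrecognized falls through to a user-supplied genome.
    assert GenomeInterface.determine_tier("my lab assembly") == \
        ("my lab assembly", ['User'])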
def UploadFrommfmd(callback_url, params):
    """
    :param params: instance of type "UploadmfmdInParams" -> structure:
        parameter "path" of String, parameter "ws_name" of String,
        parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure:
        parameter "obj_ref" of String
    """
    # return variables are: output
    #BEGIN UploadFrommfmd
    print('Extracting motifs')
    motifList = parse_mfmd_output(params['path'])
    print(motifList)
    MSO = motifList

    dfu = DataFileUtil(callback_url)
    save_objects_params = {}
    save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
    save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet',
                                       'data': MSO,
                                       'name': params['obj_name']}]
    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)
    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFrommfmd

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFrommfmd return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
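# A usage sketch for UploadFrommfmd above; the path, workspace name and object name
# are hypothetical placeholders, and callback_url would normally come from the SDK
# environment (SDK_CALLBACK_URL).
#
#   out = UploadFrommfmd(
#       callback_url=os.environ['SDK_CALLBACK_URL'],
#       params={'path': '/kb/module/work/tmp/mfmd_output.txt',   # hypothetical mfmd output file
#               'ws_name': 'my_workspace',
#               'obj_name': 'my_motif_set'})
#   out[0]['obj_ref']  # saved MotifSet reference, "<workspace id>/<object id>/<version>"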
class IntegrateAppImpl: @staticmethod def _validate_params(params, required, optional=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" required = set(required) optional = set(optional) pkeys = set(params) if required - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(required - pkeys))) defined_param = required | optional for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def _build_figure(self, file_path, figure_matrix): # Make figure matrix html file and embed file_name = 'integrated_scatterplot_output.html' figure_html_path = os.path.join(file_path, file_name) output_file(figure_html_path) save(grid(figure_matrix)) return file_name def _build_table(self, table_dict, stats_df): html_lines = list() html_lines.append('<table class="table table-bordered table-striped">') header_list = [ "Enzymes", "Compartments", "Reactions", "EC numbers", "Subsystems" ] + self.conditions_ids + ["Mahalanobis distance", "p-value"] html_lines.append('<thead>') internal_header_line = "</td><td>".join(header_list) html_lines.append('<tr><td>' + internal_header_line + '</td></tr>') html_lines.append('</thead>') html_lines.append("<tbody>") print_row = True for complex_row in sorted(table_dict.keys()): print_row = True cpts = ", ".join(sorted(list(table_dict[complex_row]))) ecs = [] subsystems = [] reactions = [] conditions = [] mahal_list = [] pvalue_list = [] mahalanobis_dist = "0.00" pvalue = "0.00" for cpt in table_dict[complex_row]: for rxn in table_dict[complex_row][cpt]: if (rxn not in reactions): reactions.append(rxn) if (len(conditions) == 0): conditions = table_dict[complex_row][cpt][rxn] if (rxn in self.reactions_data): for ss in self.reactions_data[rxn]['subsystems']: ss = ss.replace("_", " ") ss = ss.replace(" in plants", "") if (ss not in subsystems): subsystems.append(ss) for ec in self.reactions_data[rxn]['ecs']: if (ec not in ecs): ecs.append(ec) str_md = "0.00" str_pv = "0.00" if (rxn + '_' + cpt not in stats_df.index): print("MISSING REACTION: ", complex_row, rxn + "_" + cpt) print_row = False else: str_md = "{0:.2f}".format( stats_df.loc[rxn + '_' + cpt]['mahalanobis']) str_pv = "{0:.2f}".format(stats_df.loc[rxn + '_' + cpt]['pvalue']) if (str_pv == "0.00"): str_pv = "{0:.2e}".format( stats_df.loc[rxn + '_' + cpt]['pvalue']) if (mahalanobis_dist != "0.00" and str_md != mahalanobis_dist): print( "WARNING: CHANGING STATS FOR SAME PROTEIN COMPLEXES\n" ) print( "===================================================\n\n" ) print(complex_row, cpts, rxn, conditions, stats_df.loc[rxn + '_' + cpt]['mahalanobis'], mahalanobis_dist, "\n") print( "===================================================\n\n" ) mahalanobis_dist = str_md pvalue = str_pv reactions = ", ".join(sorted(reactions)) subsystems = ", ".join(sorted(subsystems)) ecs = ", ".join(sorted(ecs)) conditions_strings = list() for i in range(len(conditions)): conditions[i][0] = "{0:.2f}".format(conditions[i][0]) conditions_strings.append(" | ".join(conditions[i])) # some complexes may have zero features predicted if (print_row is True): html_lines.append("<tr>") internal_row_line = "</td><td>".join( [complex_row, cpts, reactions, ecs, subsystems] + conditions_strings + [mahalanobis_dist, pvalue]) html_lines.append("<td>" + internal_row_line + "</td>") html_lines.append("</tr>") html_lines.append("</tbody>") html_lines.append("</table>") return "\n".join(html_lines) def _build_report(self, 
figure_matrix, table_dict, stats_df, saved_object_list, workspace_name): """ _generate_report: generate summary report """ # Make report directory and copy over files report_file_path = os.path.join(self.scratch, self.report_uuid) os.mkdir(report_file_path) table_html_string = self._build_table(table_dict, stats_df) if (len(self.conditions_ids) > 1): figure_html_file = self._build_figure(report_file_path, figure_matrix) output_html_files = self._generate_report_html( report_file_path, figure_html_file=figure_html_file, table_string=table_html_string) else: output_html_files = self._generate_report_html( report_file_path, table_string=table_html_string) report_params = { 'direct_html_link_index': 0, #Use to refer to index of 'html_links' 'workspace_name': workspace_name, 'report_object_name': 'plant_fba_' + self.report_uuid, 'objects_created': saved_object_list, 'html_links': output_html_files } output = self.kbr.create_extended_report(report_params) return {'report_name': output['name'], 'report_ref': output['ref']} def _generate_report_html(self, file_path, figure_html_file=None, table_string=None): """ _generate_report: generates the HTML for the upload report """ html_report_list = list() ############################################################## # Write table html file ############################################################## # Read in template html with open( os.path.join( '/kb/module/data', 'app_report_templates', 'integrate_abundances_report_tables_template.html') ) as report_template_file: report_template_string = report_template_file.read() # Generate and Insert html title title_string = "-".join( [self.input_params['input_expression_matrix']] + self.conditions_ids) report_template_string = report_template_string.replace( '*TITLE*', title_string) # Insert html table table_report_string = report_template_string.replace( '*TABLES*', table_string) # Write html file table_html_file = "integrated_table_output.html" with open(os.path.join(file_path, table_html_file), 'w') as table_file: table_file.write(table_report_string) ############################################################## # Write summary index.html file ############################################################## # Begin composing html html_lines = list() html_lines.append( '<h3 style="text-align: center">Integrate Abundances with Metabolism Report</h3>' ) html_lines.append( "<p>The \"Integrate Abundances with Metabolism\" app has finished running.</br>" ) html_lines.append("The app integrated the values from the <b>" + self.input_params['input_expression_matrix'] + "</b> ExpressionMatrix") html_lines.append(" with the <b>" + self.input_params['input_fbamodel'] + "</b> FBAModel</br>") html_lines.append( "Specifically, the app integrated the values from these chosen conditions in the ExpressionMatrix: <b>" + "</b>, <b>".join(self.conditions_ids) + "</b></br>") html_lines.append( "The results of the integration are stored in the <b>" + self.input_params['output_reaction_matrix'] + "</b> ReactionMatrix.</p><br/>") html_lines.append( 'The results of the integration are also tabulated in this <a href="' + table_html_file + '" target="_blank">Table</a></br>') if (len(self.conditions_ids) > 1): html_lines.append( 'The results of the integration can be also be visualized in these <a href="' + figure_html_file + '" target="_blank">Scatterplots</a>') # Read in template html with open( os.path.join('/kb/module/data', 'app_report_templates', 'integrate_abundances_report_template.html') ) as report_template_file: 
report_template_string = report_template_file.read() # Insert html summary_report_string = report_template_string.replace( '*TEXT*', "\n".join(html_lines)) summary_html_file = "index.html" with open(os.path.join(file_path, summary_html_file), 'w') as index_file: index_file.write(summary_report_string) ############################################################## # Upload files and compose html report object ############################################################## # Cache it in shock as an archive upload_info = self.dfu.file_to_shock({ 'file_path': file_path, 'pack': 'zip' }) # HTML Link objects html_link = dict() # Index # html_link = {'shock_id' : upload_info['shock_id'], # 'name' : summary_html_file, # 'label' : 'HTML report for integrate_abundances_with_metabolism app', # 'description' : 'HTML report for integrate_abundances_with_metabolism app'} # html_report_list.append(html_link) if (len(self.conditions_ids) > 1): # Figures html_link = { 'shock_id': upload_info['shock_id'], 'name': figure_html_file, 'label': 'Scatterplot figures generated by Integrate Abundances with Metabolism app', 'description': 'Scatterplot figures generated by Integrate Abundances with Metabolism app' } html_report_list.append(html_link) # Table html_link = { 'shock_id': upload_info['shock_id'], 'name': table_html_file, 'label': 'HTML table generated by Integrate Abundances with Metabolism app', 'description': 'HTML table generated by Integrate Abundances with Metabolism app' } html_report_list.append(html_link) return html_report_list def _load_fbamodel(self, model_ref): model_obj = self.dfu.get_objects({'object_refs': [model_ref]})['data'][0] print("Number of reactions: " + str(len(model_obj['data']['modelreactions']))) model_reaction_lookup_dict = dict() for index in range(len(model_obj['data']['modelreactions'])): model_reaction_lookup_dict[model_obj['data']['modelreactions'] [index]['id']] = index return [model_obj, model_reaction_lookup_dict] def _load_expression_matrix(self, expdata_ref): expdata_obj = self.dfu.get_objects({'object_refs': [expdata_ref]})['data'][0] conditions_ids = expdata_obj['data']['data']['col_ids'] features_ids = expdata_obj['data']['data']['row_ids'] feature_lookup_dict = dict() for index in range(len(features_ids)): feature_lookup_dict[features_ids[index]] = index condition_lookup_dict = dict() for index in range(len(conditions_ids)): condition_lookup_dict[conditions_ids[index]] = index if (len(self.conditions_ids) == 0): self.conditions_ids = conditions_ids return [ expdata_obj, features_ids, feature_lookup_dict, condition_lookup_dict ] def _compile_genome_scores(self, data, conditions_indices): Feature_Comparison_Dict = dict() for feature_index in range(len(data)): scores_dict = dict() for condition in self.conditions_ids: condition_index = conditions_indices[condition] #Retrieve value from 2D matrix score = data[feature_index][condition_index] #Force into string for easier comparison str_score = "{0:.2f}".format(score) if (str_score == "0.00"): continue scores_dict[condition] = score #Here we skip features where there aren't enough scores (should be same number of conditions) if (len(scores_dict) < len(self.conditions_ids)): continue for condition in scores_dict: if (condition not in Feature_Comparison_Dict): Feature_Comparison_Dict[condition] = list() Feature_Comparison_Dict[condition].append( scores_dict[condition]) return Feature_Comparison_Dict def _compile_model_scores_percentiles(self, data): # I want to compute percentile rank for each feature under each condition 
# The Conditions_Score_Dicts variable is used to "bin" identical scores # (to two decimal points, can be changed) # First, we iterate through the conditions for computing percentile rank # for each condition model_conditions_score_lists = dict() model_conditions_score_pct_dicts = dict() for condition_index in range(len(self.conditions_ids)): condition = self.conditions_ids[condition_index] # For each condition, we "bin" the scores score_reaction_dict = dict() score_reaction_list = list() # The counting of features is done independently because we skip scores of zero # (which this affect how percentile rank distributes) n_ftrs = 0 for reaction_index in range(len(data)): # Retrieve value from 2D matrix score = data[reaction_index][condition_index] # Many reactions are not assigned a score, and instead have a default tiny score if (score == float(-sys.maxsize - 1)): continue # Force into string for easier comparison str_score = "{0:.2f}".format(score) # I skip the relatively large number of reactions that have a value of zero # to prevent the computation of the percentile rank skewing towards zero if (str_score == "0.00"): continue n_ftrs += 1 if (str_score not in score_reaction_dict): score_reaction_dict[str_score] = list() score_reaction_dict[str_score].append(reaction_index) score_reaction_list.append(float(str_score)) model_conditions_score_lists[condition] = score_reaction_list # Then for each condition, we use the binned scores to compute # percentile rank if (condition not in model_conditions_score_pct_dicts): model_conditions_score_pct_dicts[condition] = dict() sorted_scores = sorted(score_reaction_dict.keys(), key=float) less_than_score_ftrs_count = 0 for score_index in range(len(sorted_scores)): n_score_ftrs = len( score_reaction_dict[sorted_scores[score_index]]) half_n_score_ftrs = float(n_score_ftrs) * 0.5 cumulative_n_score_ftrs = float( less_than_score_ftrs_count) + half_n_score_ftrs percentile_rank = cumulative_n_score_ftrs / float(n_ftrs) less_than_score_ftrs_count += len( score_reaction_dict[sorted_scores[score_index]]) model_conditions_score_pct_dicts[condition][ sorted_scores[score_index]] = percentile_rank # This next part of the code is to re-iterate through the data and to compose the dicts # that become ColumnDataStores, and also with default values # The reaction_percentile_comparison_dict is for the reaction percentile plot reaction_percentile_comparison_dict = dict() if ('All' not in reaction_percentile_comparison_dict): reaction_percentile_comparison_dict['All'] = dict() # The reaction_score_comparison_dict works for the genome features plot reaction_score_comparison_dict = dict() for reaction_index in range(len(data)): scores_dict = dict() for condition_index in range(len(self.conditions_ids)): condition = self.conditions_ids[condition_index] #Retrieve value from 2D matrix score = data[reaction_index][condition_index] #Many reactions are not assigned a score, and instead a default tiny score if (score == float(-sys.maxsize - 1)): continue scores_dict[condition] = score #Here we skip reactions where there aren't enough scores (should be same number of conditions) if (len(scores_dict) < len(self.conditions_ids)): continue for condition in scores_dict: # Collect reaction scores if (condition not in reaction_score_comparison_dict): reaction_score_comparison_dict[condition] = list() reaction_score_comparison_dict[condition].append( scores_dict[condition]) # Collect reaction percentiles if (condition not in reaction_percentile_comparison_dict['All']): 
reaction_percentile_comparison_dict['All'][ condition] = list() #Force into string for easier comparison str_score = "{0:.2f}".format(scores_dict[condition]) #We skip zero scores when computing the percentiles #So we have to check for them here condition_pct = 0.00 if (str_score != '0.00'): condition_pct = model_conditions_score_pct_dicts[ condition][str_score] reaction_percentile_comparison_dict['All'][condition].append( condition_pct) if ('reactions' not in reaction_percentile_comparison_dict['All']): reaction_percentile_comparison_dict['All'][ 'reactions'] = list() if(self.reactions_ids[reaction_index] not in \ reaction_percentile_comparison_dict['All']['reactions']): reaction_percentile_comparison_dict['All'][ 'reactions'].append(self.reactions_ids[reaction_index]) base_rxn = self.reactions_ids[reaction_index].split('_')[0] for ss in self.reactions_data[base_rxn]['subsystems']: if (ss not in reaction_percentile_comparison_dict): reaction_percentile_comparison_dict[ss] = dict() if (condition not in reaction_percentile_comparison_dict[ss]): reaction_percentile_comparison_dict[ss][ condition] = list() reaction_percentile_comparison_dict[ss][condition].append( condition_pct) if ('reactions' not in reaction_percentile_comparison_dict[ss]): reaction_percentile_comparison_dict[ss][ 'reactions'] = list() if(self.reactions_ids[reaction_index] not in \ reaction_percentile_comparison_dict[ss]['reactions']): reaction_percentile_comparison_dict[ss][ 'reactions'].append( self.reactions_ids[reaction_index]) self.mh_reactions_ids.append(self.reactions_ids[reaction_index]) # We set the default values here at the end of the loop because we don't know # how many reactions there will be for each category for category in reaction_percentile_comparison_dict: for key in ['color', 'size', 'tooltip', 'fill_alpha']: reaction_percentile_comparison_dict[category][key] = list() for index in range( len(reaction_percentile_comparison_dict[category][ self.conditions_ids[0]])): reaction_percentile_comparison_dict[category][ 'fill_alpha'].append(1.0) # format string of subsystems for tooltip rxn = reaction_percentile_comparison_dict[category][ 'reactions'][index] base_rxn = rxn.split('_')[0] ss_string = ", ".join( self.reactions_data[base_rxn]['subsystems']) reaction_percentile_comparison_dict[category][ 'tooltip'].append(rxn + ", " + ss_string) if (category == 'All'): reaction_percentile_comparison_dict[category][ 'color'].append('black') reaction_percentile_comparison_dict[category][ 'size'].append(6) else: reaction_percentile_comparison_dict[category][ 'color'].append('red') reaction_percentile_comparison_dict[category][ 'size'].append(8) return [ reaction_score_comparison_dict, reaction_percentile_comparison_dict ] def _compile_mahalanobis_dist_pvalue(self, data, threshold): data_df = pd.DataFrame(data, columns=self.conditions_ids, index=self.mh_reactions_ids) # I don't know the math well enough to follow what's going on, but I used # the recipe described here: # https://www.machinelearningplus.com/statistics/mahalanobis-distance/ # Covariance matrix via numpy cov_mat = np.cov(data_df.values.T) # Inverse covariance matrix via scipy.linalg # It won't accept a 1x1 matrix hence the if/else if (len(self.conditions_ids) > 1): inv_cov_mat = sp.linalg.inv(cov_mat) else: inv_cov_mat = 1 / cov_mat # two terms required, second using dot product data_minus_mean = data_df - np.mean(data_df) left_term = np.dot(data_minus_mean, inv_cov_mat) # dot product mahalanobis = np.dot(left_term, data_minus_mean.T) data_df['mahalanobis'] = 
mahalanobis.diagonal() # chi-squared p-values with one degree of freedom (two sets of variables) data_df['pvalue'] = 1 - sp.stats.chi2.cdf(data_df['mahalanobis'], 1) # find the outliers below a given threshold, i.e. p < 0.01 outliers = data_df.loc[data_df.pvalue < threshold] # this is used when you want to just plot the p-values alone data_df.index.name = 'reactions' outliers.index.name = 'reactions' #Need to return the mapping between reactions and the p-values return [data_df, outliers] def _integrate_abundances(self, model_obj, feature_lookup_dict, expdata_obj, condition_indices): reaction_values_matrix = list() reactions_ids = list() minmax_expscore_dict = dict() model_complexes_dict = dict() fh = open(self.scratch + '/output.txt', 'w') fh2 = open(self.scratch + '/rxn01486.txt', 'w') print_data = False for mdlrxn in range(len(model_obj['data']['modelreactions'])): mdlrxn_obj = model_obj['data']['modelreactions'][mdlrxn] reactions_ids.append(mdlrxn_obj['id']) [base_rxn, cpt_id] = mdlrxn_obj['id'].split('_') # if(base_rxn == 'rxn01486' or base_rxn == 'rxn37610'): # print_data=True rxndata_row = list() for condition in self.conditions_ids: if (condition not in minmax_expscore_dict): minmax_expscore_dict[condition] = { 'max': -sys.maxsize - 1, 'min': sys.maxsize } condition_index = condition_indices[condition] # Maximal gene expression for a reaction reaction_score = ['nan', ""] prots_str_list = list() for prt in mdlrxn_obj['modelReactionProteins']: # Minimal gene expression for a complex complex_score = ['nan', ""] subs_str_list = list() for sbnt in prt['modelReactionProteinSubunits']: # Maximal gene expression for a subunit subunit_score = ['nan', ""] ftrs_str_list = list() for feature in sbnt['feature_refs']: feature = feature.split('/')[-1] ftrs_str_list.append(feature) feature_index = feature_lookup_dict[feature] ftr_score = expdata_obj['data']['data']['values'][ feature_index][condition_index] if (print_data is True): fh2.write(mdlrxn_obj['id'] + ':' + feature + ':' + str(ftr_score) + '\n') if (ftr_score < minmax_expscore_dict[condition]['min']): minmax_expscore_dict[condition][ 'min'] = ftr_score if (ftr_score > minmax_expscore_dict[condition]['max']): minmax_expscore_dict[condition][ 'max'] = ftr_score # Maximal gene expression for a subunit if (subunit_score[0] == 'nan' or subunit_score[0] < ftr_score): subunit_score = [ftr_score, feature] if (print_data is True): fh2.write(subunit_score, '\n') ftr_str = "(" + ", ".join(ftrs_str_list) + ")" subs_str_list.append(ftr_str) # Minimal gene expression for a complex if (subunit_score[0] != 'nan'): if (complex_score[0] == 'nan' or complex_score[0] > subunit_score[0]): complex_score[0] = subunit_score[0] complex_score[1] = subunit_score[1] if (print_data is True): fh2.write(complex_score, '\n') sub_str = "[" + ", ".join(subs_str_list) + "]" prots_str_list.append(sub_str) # Maximal gene expression for a reaction if (complex_score[0] != 'nan'): if (reaction_score[0] == 'nan' or reaction_score[0] < complex_score[0]): reaction_score[0] = complex_score[0] reaction_score[1] = complex_score[1] if (reaction_score[0] == 'nan'): reaction_score[0] = float(-sys.maxsize - 1) if (print_data is True): fh2.write(condition + ':' + str(reaction_score[0]) + '(' + reaction_score[1] + ')\n') #Putting together dict for table proteins_string = ', '.join(prots_str_list) if (len(prots_str_list) > 0 and proteins_string != "[]" and proteins_string != "[()]"): if (proteins_string not in model_complexes_dict): model_complexes_dict[proteins_string] = dict() if (cpt_id 
not in model_complexes_dict[proteins_string]): model_complexes_dict[proteins_string][cpt_id] = dict() if (base_rxn not in model_complexes_dict[proteins_string] [cpt_id]): model_complexes_dict[proteins_string][cpt_id][ base_rxn] = list() fh.write('\t'.join([ condition, proteins_string, cpt_id, base_rxn, str(reaction_score[0]), reaction_score[1], '\n' ])) model_complexes_dict[proteins_string][cpt_id][ base_rxn].append(reaction_score) rxndata_row.append(reaction_score[0]) print_data = False reaction_values_matrix.append(rxndata_row) fh.close() self.reactions_ids = reactions_ids return (reaction_values_matrix, model_complexes_dict) def __init__(self, config, ctx, input_params): self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url) self.scratch = config['scratch'] self.report_uuid = str(uuid.uuid4()) # There is a bug in the UI that won't let me collect a # a clean list of conditions, so I have to parse them # from a comma-separated string if ("input_columns" in input_params and input_params["input_columns"] != ""): conditions = list() for condition in input_params["input_columns"].split(','): conditions.append(condition) input_params["input_columns"] = conditions self.input_params = input_params # set in _load_expression_matrix() self.conditions_ids = list() # this is an optional parameter, but restricts the # number of chosen columns in the matrix if ('input_columns' in input_params and len(input_params['input_columns']) > 0): self.conditions_ids = input_params['input_columns'] # set in _integrate_abundances() self.reactions_ids = list() # set in _compile_model_scores_percentiles self.mh_reactions_ids = list() with open( os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3", "PlantSEED_Roles.json")) as plsd_fh: PS_Roles = json.load(plsd_fh) plantseed = FetchPlantSEEDImpl() self.reactions_data = plantseed.fetch_reactions(PS_Roles) def integrate_abundances_with_metabolism(self): self._validate_params( self.input_params, { 'input_ws', 'input_fbamodel', 'input_expression_matrix', 'output_reaction_matrix' }, {'input_columns'}) ############################################################## # Load model and expression objects ############################################################## model_ref = self.input_params['input_ws'] + '/' + self.input_params[ 'input_fbamodel'] [model_obj, reaction_index] = self._load_fbamodel(model_ref) # The columns / conditions_ids are set in this function if not set via user parameter expression_ref = self.input_params[ 'input_ws'] + '/' + self.input_params['input_expression_matrix'] [expdata_obj, features_ids, feature_index, condition_index] = self._load_expression_matrix(expression_ref) ############################################################## # Extract expression abundances for use in first scatter plot ############################################################## feature_comparison_dict = self._compile_genome_scores( expdata_obj['data']['data']['values'], condition_index) #################################################################### # Actually integrate abundances and build new ReactionMatrix object #################################################################### (reaction_values_matrix, model_complexes_dict) = self._integrate_abundances( model_obj, feature_index, expdata_obj, condition_index) rxndata_obj = { 'row_ids': self.reactions_ids, 'col_ids': self.conditions_ids, 'values': reaction_values_matrix } 
########################################################################################## # Extract / organize reaction expression scores for use in first and second scatter plot ########################################################################################## [reaction_scores_dict, reaction_percentiles_dict ] = self._compile_model_scores_percentiles(reaction_values_matrix) ############################################################################################################# # Multi-variate mahalanobis distances computed along with outliers depending on chi-squared p-value of 0.01 ############################################################################################################# [mahal_dist_df, outliers] = self._compile_mahalanobis_dist_pvalue( reaction_percentiles_dict['All'], 0.01) ############################################################## # Figure generator ############################################################## subsystem_select_list = ["None"] for category in sorted(list(reaction_percentiles_dict.keys())): if (category == 'All'): continue subsystem_select_list.append(category) for rxn_idx in range( len(reaction_percentiles_dict[category]['reactions'])): rxn = reaction_percentiles_dict[category]['reactions'][rxn_idx] pval = mahal_dist_df.loc[rxn]['pvalue'] # reaction_percentiles_dict[category]['fill_alpha'][rxn_idx] = 1-pval figure_generator = GenerateFigureImpl() figure_grid = figure_generator.generate_figure( self.conditions_ids, category_select=subsystem_select_list, genome_features=feature_comparison_dict, reaction_scores=reaction_scores_dict, reaction_percentiles=reaction_percentiles_dict) ############################################################## # Finishing and Saving ReactionMatrix ############################################################## ReactionMatrix_obj = { 'type': 'KBaseMatrices.ReactionMatrix', 'name': self.input_params['output_reaction_matrix'], 'data': { 'scale': 'raw', 'description': 'reaction expression score', 'fbamodel_ref': model_ref, 'expression_ref': expression_ref, 'data': rxndata_obj } } ws_id = self.dfu.ws_name_to_id(self.input_params['input_ws']) saved_matrix_dict = self.dfu.save_objects({ 'id': ws_id, 'objects': [ReactionMatrix_obj] })[0] saved_matrix_ref = "{}/{}/{}".format(saved_matrix_dict[6], saved_matrix_dict[0], saved_matrix_dict[4]) saved_matrix_desc = "Reaction matrix: " + self.input_params[ 'output_reaction_matrix'] ##################################################################### # Building the report with figures, tables, and saved_objects (to be improved) # We pass in a dict where each key is a row for the table ##################################################################### output_object_files = list() output_object_files.append({ 'ref': saved_matrix_ref, 'description': saved_matrix_desc }) return self._build_report(figure_grid, model_complexes_dict, mahal_dist_df, output_object_files, self.input_params['input_ws'])
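# A minimal, self-contained sketch of the Mahalanobis-distance / chi-squared p-value
# recipe used by IntegrateAppImpl._compile_mahalanobis_dist_pvalue above: covariance of
# the condition columns, its inverse, squared Mahalanobis distance per row, then a
# chi-squared p-value with one degree of freedom. The reaction IDs and percentile
# values below are fabricated for illustration only.
import numpy as np
import pandas as pd
from scipy import linalg, stats


def mahalanobis_outliers_sketch(threshold=0.01):
    data_df = pd.DataFrame(
        {'cond_A': [0.10, 0.25, 0.40, 0.90, 0.55],
         'cond_B': [0.12, 0.30, 0.35, 0.10, 0.60]},
        index=['rxn1_c0', 'rxn2_c0', 'rxn3_c0', 'rxn4_c0', 'rxn5_c0'])

    # Covariance of the columns and its inverse.
    cov_mat = np.cov(data_df.values.T)
    inv_cov_mat = linalg.inv(cov_mat)

    # Squared Mahalanobis distance of each row from the column means.
    centered = data_df - np.mean(data_df)
    left_term = np.dot(centered, inv_cov_mat)
    data_df['mahalanobis'] = np.dot(left_term, centered.T).diagonal()

    # Chi-squared p-values, one degree of freedom, as in the original code.
    data_df['pvalue'] = 1 - stats.chi2.cdf(data_df['mahalanobis'], 1)

    # Rows below the threshold would be flagged as outliers.
    return data_df.loc[data_df.pvalue < threshold]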
def MotifEnsemble(self, ctx, params): """ :param params: instance of type "EnsembleParams" (Internal workflow: 1. Input - list of motifsets , workspace, threshold consensus 2. Download MotifSets -> Utils function 3. Assign motif ids by position in list Use refs to identify MSOs internally! Dictionary of motifsets key: ref, val set list of match sets: each item in the set is a tuple of (ref,index) for each motifset: <- enumerate to avoid duplicate for each motif in motifset for each other motifset: <- enumerate to avoid duplicate for each motif in other: compare(motif1,motif2): if motifs same: search list of sets for motif1: if found add motif2 if not in if not found search list of sets for motif2: if found add motif1 else add a new set with motif1 + motif2) -> structure: parameter "motifset_refs" of list of String, parameter "workspace_name" of String, parameter "threshold" of Double, parameter "proportion" of Double :returns: instance of type "Ensemble_out" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: out #BEGIN MotifEnsemble #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.) dms = DownloadMotifSets() MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'], self.callback_url) matchSets = [] threshold = float(params['threshold']) fmu = FastaUtils() for i, MSR1 in enumerate(MotifSetDict.keys()): for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']): for k, MSR2 in enumerate(MotifSetDict.keys()): if k > i: for l, motif2 in enumerate( MotifSetDict[MSR2]['Motifs']): print(motif1) print(motif2) print(threshold) if fmu.CompareMotifsBP(motif1, motif2, threshold): found1 = False found2 = False index1 = -1 index2 = -1 for m, mset in enumerate(matchSets): if (MSR1, j) in mset: found1 = True index1 = m if (MSR2, l) in mset: found2 = True index2 = m if not found1 and found2: matchSets[index2].add((MSR1, j)) elif not found2 and found1: matchSets[index1].add((MSR2, l)) elif found1 and found2: if index1 != index2: matchSets[index1].union( matchSets[index2]) matchSets.pop(index2) else: matchSets.append( set([(MSR1, j), (MSR2, l)])) numMotifSets = len(params['motifset_refs']) threshold = float(params['proportion']) KeepSets = [] print('NUM MATCHSETS********') print(len(matchSets)) for i, mset in enumerate(matchSets): uniqueRefs = {} for tuple in mset: if tuple[0] not in uniqueRefs: uniqueRefs[tuple[0]] = tuple[0] if float(len(uniqueRefs.keys())) / numMotifSets >= threshold: KeepSets.append(i) print(len(KeepSets)) ESO = {} for ref in MotifSetDict: ESO['Condition'] = MotifSetDict[ref]['Condition'] ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref'] ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet']) ESO['Background'] = deepcopy(MotifSetDict[ref]['Background']) break ESO['Motifs'] = [] #Add motifs for keep in KeepSets: motif = fmu.merge(matchSets[keep], MotifSetDict) ESO['Motifs'].append(deepcopy(motif)) #upload new MSO dfu = DataFileUtil(self.callback_url) save_objects_params = {} save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) save_objects_params['objects'] = [{ 'type': 'KBaseGeneRegulation.MotifSet', 'data': ESO, 'name': 'EnsembleMotifSet' }] info = dfu.save_objects(save_objects_params)[0] obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) htmlDir = self.shared_folder + '/ensemble_html' os.mkdir(htmlDir) mr = MakeNewReport() mr.MakeReport(htmlDir, ESO) try: html_upload_ret = dfu.file_to_shock({ 'file_path': htmlDir, 'make_handle': 0, 'pack': 
'zip' }) except: raise ValueError('error uploading HTML file to shock') reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4()) reportObj = { 'objects_created': [{ 'ref': obj_ref, 'description': 'Motif Set generated by MEME' }], 'message': '', 'direct_html': None, 'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # attach to report obj reportObj['direct_html'] = '' reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{ 'shock_id': html_upload_ret['shock_id'], 'name': 'index.html', 'label': 'Save promoter_download.zip' }] report = KBaseReport(self.callback_url, token=ctx['token']) report_info = report.create_extended_report(reportObj) out = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END MotifEnsemble # At some point might do deeper type checking... if not isinstance(out, dict): raise ValueError('Method MotifEnsemble return value ' + 'out is not type dict as required.') # return the results return [out]
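# A minimal sketch of the match-set bookkeeping described in the MotifEnsemble docstring:
# motifs are identified as (motifset_ref, index) tuples, and each pair judged similar
# (e.g. by FastaUtils.CompareMotifsBP) either joins an existing set or starts a new one;
# when a pair bridges two existing sets, the sets are merged in place. The example pairs
# in the comment at the end are fabricated for illustration.
def merge_match_sets_sketch(similar_pairs):
    """Group (ref, index) motif identifiers into sets of mutually similar motifs."""
    match_sets = []
    for id1, id2 in similar_pairs:
        idx1 = next((i for i, s in enumerate(match_sets) if id1 in s), None)
        idx2 = next((i for i, s in enumerate(match_sets) if id2 in s), None)
        if idx1 is None and idx2 is None:
            match_sets.append({id1, id2})            # start a new match set
        elif idx2 is None:
            match_sets[idx1].add(id2)                # extend the set containing id1
        elif idx1 is None:
            match_sets[idx2].add(id1)                # extend the set containing id2
        elif idx1 != idx2:
            match_sets[idx1] |= match_sets[idx2]     # merge the two sets in place
            match_sets.pop(idx2)
    return match_sets

# merge_match_sets_sketch([(('A', 0), ('B', 2)), (('A', 1), ('B', 2))])
# -> [{('A', 0), ('B', 2), ('A', 1)}]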
class plant_fba: ''' Module Name: plant_fba Module Description: A KBase module: plant_fba ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "1.1.1" GIT_URL = "[email protected]:kbaseapps/plant_fba.git" GIT_COMMIT_HASH = "6f0b5af5a458c5158b9f0007399653a256edcd14" #BEGIN_CLASS_HEADER def convert_search_role(self, role): searchrole = role #Remove spaces searchrole = searchrole.strip() searchrole = searchrole.replace(' ', '') #Make all lowercase searchrole = searchrole.lower() #Remove EC and parentheses searchrole = re.sub(r'\(ec[\d-]+\.[\d-]\.[\d-]\.[\d-]\)', '', searchrole) return searchrole #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.token = os.environ['KB_AUTH_TOKEN'] self.shared_folder = config['scratch'] self.config = config self.dfu = DataFileUtil(self.callback_url) #END_CONSTRUCTOR pass def integrate_abundances_with_metabolism(self, ctx, input_params): """ :param input_params: instance of type "IntegrateAbundancesParams" (@optional input_columns) -> structure: parameter "input_ws" of String, parameter "input_expression_matrix" of String, parameter "input_fbamodel" of String, parameter "input_columns" of String, parameter "output_reaction_matrix" of String :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output_report #BEGIN integrate_abundances_with_metabolism app = IntegrateAppImpl(self.config, ctx, input_params) output_report = app.integrate_abundances_with_metabolism() #END integrate_abundances_with_metabolism # At some point might do deeper type checking... 
if not isinstance(output_report, dict): raise ValueError( 'Method integrate_abundances_with_metabolism return value ' + 'output_report is not type dict as required.') # return the results return [output_report] def reconstruct_plant_metabolism(self, ctx, input_params): """ :param input_params: instance of type "ReconstructMetabolismParams" -> structure: parameter "input_ws" of String, parameter "input_genome" of String, parameter "output_ws" of String, parameter "output_fbamodel" of String, parameter "template" of String, parameter "template_ws" of String :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output_report #BEGIN reconstruct_plant_metabolism #Compile biochemistry information abbrev_cpt_dict = dict() cpt_name_dict = dict() with open('/kb/module/data/compartments.txt') as fh: for line in fh.readlines(): line = line.strip('\r\n') array = line.split('\t') abbrev_cpt_dict[array[3]] = array[0] cpt_name_dict[array[0]] = array[2] # Fetch and parse biochemistry data with open( os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry", "reactions.json")) as msd_rxn_fh: MSD_reactions = json.load(msd_rxn_fh) MSD_reactions_dict = dict() for entry in MSD_reactions: MSD_reactions_dict[entry['id']] = entry with open( os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry", "compounds.json")) as msd_rxn_fh: MSD_compounds = json.load(msd_rxn_fh) MSD_compounds_dict = dict() for entry in MSD_compounds: MSD_compounds_dict[entry['id']] = entry # Retrieve Template, and compile indexes of roles and complexes if ('template_ws' not in input_params or input_params['template_ws'] == ''): input_params['template_ws'] = 'NewKBaseModelTemplates' if ('template' not in input_params or input_params['template'] == ''): input_params['template'] = 'PlantModelTemplate' template_ref = input_params['template_ws'] + '/' + input_params[ 'template'] template_obj = self.dfu.get_objects({'object_refs': [template_ref]})['data'][0] searchroles_dict = dict() roles_dict = dict() for role in template_obj['data']['roles']: searchrole = self.convert_search_role(role['name']) searchroles_dict[searchrole] = role['id'] roles_dict[role['id']] = role complex_dict = dict() for cpx in template_obj['data']['complexes']: complex_dict[cpx['id']] = cpx #Retrieve Genome annotation as dict role_cpt_ftr_dict = dict() genome_ref = input_params['input_ws'] + '/' + input_params[ 'input_genome'] genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})['data'][0] for feature in genome_obj['data']['features']: if ('functions' in feature and len(feature['functions']) > 0): for function_comment in feature['functions']: #Split for comments and retrieve compartments function_cpt_list = function_comment.split("#") for i in range(len(function_cpt_list)): function_cpt_list[i] = function_cpt_list[i].strip() function = function_cpt_list.pop(0) roles = re.split("\s*;\s+|\s+[\@\/]\s+", function) for role in roles: searchrole = self.convert_search_role(role) if (searchrole not in searchroles_dict): continue role_id = searchroles_dict[searchrole] if (role_id not in role_cpt_ftr_dict): role_cpt_ftr_dict[role_id] = dict() #Defaults to cytosol if (len(function_cpt_list) == 0): function_cpt_list.append('cytosol') for cpt in function_cpt_list: abbrev_cpt = cpt if (cpt not in abbrev_cpt_dict): print( "No compartmental abbreviation found for " + cpt) else: abbrev_cpt = abbrev_cpt_dict[cpt] if (abbrev_cpt not in 
role_cpt_ftr_dict[role_id]): role_cpt_ftr_dict[role_id][abbrev_cpt] = dict() role_cpt_ftr_dict[role_id][abbrev_cpt][ feature['id']] = 1 #Default dictionaries for objects needed for a model reaction default_mdlcpt_dict = { 'id': 'u0', 'label': 'unknown', 'pH': 7, 'potential': 0, 'compartmentIndex': 0, 'compartment_ref': '~//' } default_mdlcpd_dict = { 'id': '', 'charge': 0, 'formula': '', 'name': '', 'compound_ref': '', 'modelcompartment_ref': '~/modelcompartments/id/u0' } default_mdlrxn_dict = { 'id': '', 'direction': '', 'protons': 0, 'name': '', 'reaction_ref': '', 'probability': 0, 'modelcompartment_ref': '', 'modelReactionReagents': [], 'modelReactionProteins': [] } #Lookup dictionaries for compartments and compounds, to avoid duplicating them mdlcpts_dict = dict() mdlcpds_dict = dict() #Reaction complexes for the generated table rxncplxs_dict = dict() #Create New, but Empty Plant Reconstruction new_model_obj = { 'id': input_params['output_fbamodel'], 'type': "GenomeScale", 'source': "KBase", 'source_id': "PlantSEED_v2", 'template_ref': template_ref, 'genome_ref': genome_ref, 'name': input_params['output_fbamodel'], 'modelreactions': [], 'modelcompounds': [], 'modelcompartments': [], 'biomasses': [], 'gapgens': [], 'gapfillings': [] } for template_rxn in template_obj['data']['reactions']: if (template_rxn['type'] == 'gapfilling'): continue template_rxn_cpt = template_rxn['templatecompartment_ref'].split( '/')[-1] proteins_list = list() prots_str_list = list() #complex_ref and source are optional fields default_protein_dict = { 'note': template_rxn['type'], 'complex_ref': '', 'modelReactionProteinSubunits': [] } for cpx_ref in template_rxn['templatecomplex_refs']: cpx_id = cpx_ref.split('/')[-1] model_complex_ref = "~/template/complexes/id/" + cpx_id new_protein_dict = copy.deepcopy(default_protein_dict) new_protein_dict['complex_ref'] = model_complex_ref complex_present = False subunits_list = list() default_subunit_dict = { 'role': '', 'triggering': 0, 'optionalSubunit': 0, 'note': '', 'feature_refs': [] } matched_role_dict = dict() for cpxrole in complex_dict[cpx_id]['complexroles']: role_id = cpxrole['templaterole_ref'].split('/')[-1] if (role_id in role_cpt_ftr_dict): for role_cpt in role_cpt_ftr_dict[role_id]: role_cpt_present = False if (template_rxn_cpt == role_cpt and cpxrole['triggering'] == 1): complex_present = True role_cpt_present = True if (role_cpt_present == True): new_subunit_dict = copy.deepcopy( default_subunit_dict) new_subunit_dict['triggering'] = cpxrole[ 'triggering'] new_subunit_dict['optionalSubunit'] = cpxrole[ 'optional_role'] new_subunit_dict['role'] = roles_dict[role_id][ 'name'] if (len(roles_dict[role_id]['features']) > 0): new_subunit_dict[ 'note'] = 'Features characterized and annotated' else: #This never happens as of Fall 2019 print("Warning: " + roles_dict[role_id]['name'] + " is apparently uncharacterized!") new_subunit_dict[ 'note'] = 'Features uncharacterized but annotated' pass for ftr in role_cpt_ftr_dict[role_id][ role_cpt]: feature_ref = "~/genome/features/id/" + ftr new_subunit_dict['feature_refs'].append( feature_ref) matched_role_dict[role_id] = 1 subunits_list.append(new_subunit_dict) if (role_id not in role_cpt_ftr_dict and template_rxn['type'] == 'universal'): #This should still be added, with zero features to indicate the universality of the role in plant primary metabolism new_subunit_dict = copy.deepcopy(default_subunit_dict) new_subunit_dict['triggering'] = cpxrole['triggering'] new_subunit_dict['optionalSubunit'] = cpxrole[ 
'optional_role'] new_subunit_dict['role'] = roles_dict[role_id]['name'] #Un-necessary, but explicitly stated new_subunit_dict['feature_refs'] = [] if (len(roles_dict[role_id]['features']) == 0): new_subunit_dict[ 'note'] = 'Features uncharacterized and unannotated' else: #As of Fall 2019, this includes two reactions new_subunit_dict[ 'note'] = "Features characterized but unannotated" print("Missing annotation: ", cpx_id, role_id, roles_dict[role_id]) matched_role_dict[role_id] = 1 subunits_list.append(new_subunit_dict) if (complex_present == True): #Check to see if members of a detected protein complex are missing #and add them if so, to round off the complex #This will only happen to a complex that is conditional (see above) for cpxrole in complex_dict[cpx_id]['complexroles']: role_id = cpxrole['templaterole_ref'].split('/')[-1] if (role_id not in matched_role_dict): print("Gapfilling complex: ", cpx_id, roles_dict[role_id]) new_subunit_dict = copy.deepcopy( default_subunit_dict) new_subunit_dict['triggering'] = cpxrole[ 'triggering'] new_subunit_dict['optionalSubunit'] = cpxrole[ 'optional_role'] new_subunit_dict[ 'note'] = "Complex-based-gapfilling" subunits_list.append(new_subunit_dict) if (len(subunits_list) > 0): new_protein_dict[ 'modelReactionProteinSubunits'] = subunits_list #Store features and subunits as complex string for table subs_str_list = list() for subunit in subunits_list: ftrs_str_list = list() for ftr_ref in subunit['feature_refs']: ftr = ftr_ref.split('/')[-1] ftrs_str_list.append(ftr) ftr_str = "(" + ", ".join(ftrs_str_list) + ")" subs_str_list.append(ftr_str) sub_str = "[" + ", ".join(subs_str_list) + "]" prots_str_list.append(sub_str) proteins_list.append(new_protein_dict) prot_str = ", ".join(prots_str_list) #This is important, we need to use role-based annotation to determine whether #a reaction should even be added to the model if (template_rxn['type'] == 'conditional' and len(proteins_list) == 0): continue #If the check passes, then, here, we instantiate the actual reaction that goes into the model new_mdlrxn_id = template_rxn['id'] + '0' new_mdlcpt_id = template_rxn_cpt + '0' base_rxn_id = template_rxn['id'].split('_')[0] #For table rxncplxs_dict[new_mdlrxn_id] = prot_str new_mdlrxn_dict = copy.deepcopy(default_mdlrxn_dict) new_mdlrxn_dict['id'] = new_mdlrxn_id new_mdlrxn_dict['name'] = MSD_reactions_dict[base_rxn_id][ 'abbreviation'] if (MSD_reactions_dict[base_rxn_id]['abbreviation'] == ""): new_mdlrxn_dict['name'] = base_rxn_id new_mdlrxn_dict['direction'] = template_rxn['direction'] new_mdlrxn_dict[ 'reaction_ref'] = '~/template/reactions/id/' + template_rxn[ 'id'] new_mdlrxn_dict[ 'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id #Here we check and instantiate a new modelcompartment if (new_mdlcpt_id not in mdlcpts_dict): new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict) new_mdlcpt_dict['id'] = new_mdlcpt_id new_mdlcpt_dict['label'] = cpt_name_dict[template_rxn_cpt] new_mdlcpt_dict[ 'compartment_ref'] = '~/template/compartments/id/' + template_rxn_cpt mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict #Add Proteins as previously determined new_mdlrxn_dict['modelReactionProteins'] = proteins_list #Add Reagents for template_rgt in template_rxn['templateReactionReagents']: template_rgt_cpd_cpt_id = template_rgt[ 'templatecompcompound_ref'].split('/')[-1] (template_rgt_cpd, template_rgt_cpt) = template_rgt_cpd_cpt_id.split('_') #Check and add new model compartment new_mdlcpt_id = template_rgt_cpt + '0' if (new_mdlcpt_id not in mdlcpts_dict): 
new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict) new_mdlcpt_dict['id'] = new_mdlcpt_id new_mdlcpt_dict['label'] = cpt_name_dict[template_rgt_cpt] new_mdlcpt_dict[ 'compartment_ref'] = '~/template/compartments/id/' + template_rgt_cpt mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict #Add new model compounds new_mdlcpd_id = template_rgt_cpd_cpt_id + '0' base_cpd_id = template_rgt_cpd_cpt_id.split('_')[0] if (new_mdlcpd_id not in mdlcpds_dict): new_mdlcpd_dict = copy.deepcopy(default_mdlcpd_dict) new_mdlcpd_dict['id'] = new_mdlcpd_id new_mdlcpd_dict['name'] = MSD_compounds_dict[base_cpd_id][ 'name'] new_mdlcpd_dict['charge'] = float( MSD_compounds_dict[base_cpd_id]['charge']) new_mdlcpd_dict['formula'] = MSD_compounds_dict[ base_cpd_id]['formula'] if(MSD_compounds_dict[base_cpd_id]['formula'] == "" or \ MSD_compounds_dict[base_cpd_id]['formula'] is None): print("Formula: ", base_cpd_id, MSD_compounds_dict[base_cpd_id]) new_mdlcpd_dict['formula'] = "" new_mdlcpd_dict[ 'compound_ref'] = '~/template/compounds/id/' + template_rgt_cpd new_mdlcpd_dict[ 'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id mdlcpds_dict[new_mdlcpd_id] = new_mdlcpd_dict new_rgt_dict = { 'coefficient': template_rgt['coefficient'], 'modelcompound_ref': '~/modelcompounds/id/' + new_mdlcpd_id } new_mdlrxn_dict['modelReactionReagents'].append(new_rgt_dict) new_model_obj['modelreactions'].append(new_mdlrxn_dict) #Having populated with list of reactions and biomass (to come), then add all compartments and compounds for cpt_id in mdlcpts_dict: new_model_obj['modelcompartments'].append(mdlcpts_dict[cpt_id]) #Last, but key modelcompound is the biomass, need to add it explicitly biocpd_id = "cpd11416" mdlbiocpd_dict = copy.deepcopy(default_mdlcpd_dict) mdlbiocpd_dict['id'] = biocpd_id + '_c0' mdlbiocpd_dict['name'] = 'Biomass' mdlbiocpd_dict['compound_ref'] = "~/template/compounds/id/" + biocpd_id mdlbiocpd_dict['modelcompartment_ref'] = "~/modelcompartments/id/c0" mdlcpds_dict[mdlbiocpd_dict['id']] = mdlbiocpd_dict for cpd_id in mdlcpds_dict: new_model_obj['modelcompounds'].append(mdlcpds_dict[cpd_id]) default_biomass_dict = { 'id': 'bio1', 'name': 'Plant leaf biomass', 'other': 1, 'dna': 0, 'rna': 0, 'protein': 0, 'cellwall': 0, 'lipid': 0, 'cofactor': 0, 'energy': 0, 'biomasscompounds': [] } default_biocpd_dict = {'modelcompound_ref': '', 'coefficient': 0} for template_biomass in template_obj['data']['biomasses']: new_template_biomass = copy.deepcopy(default_biomass_dict) new_template_biomass['id'] = template_biomass['id'] new_template_biomass['name'] = template_biomass['name'] for entry in [ 'dna', 'rna', 'protein', 'cellwall', 'lipid', 'cofactor', 'energy', 'other' ]: new_template_biomass[entry] = template_biomass[entry] for template_cpd in template_biomass['templateBiomassComponents']: new_biocpd_dict = copy.deepcopy(default_biocpd_dict) mdlcpd_id = template_cpd['templatecompcompound_ref'].split( '/')[-1] + '0' if (mdlcpd_id not in mdlcpds_dict): print("Missing: ", template_cpd) continue new_biocpd_dict[ 'modelcompound_ref'] = '~/modelcompounds/id/' + mdlcpd_id new_biocpd_dict['coefficient'] = template_cpd['coefficient'] new_template_biomass['biomasscompounds'].append( new_biocpd_dict) new_model_obj['biomasses'].append(new_template_biomass) print("Saving metabolic reconstruction") model_ws_object = { 'type': 'KBaseFBA.FBAModel', 'name': input_params['output_fbamodel'], 'data': new_model_obj } if ('output_ws' not in input_params or input_params['output_ws'] == ''): input_params['output_ws'] = 
input_params['input_ws']

        ws_id = self.dfu.ws_name_to_id(input_params['output_ws'])
        saved_model_list = self.dfu.save_objects({
            'id': ws_id,
            'objects': [model_ws_object]
        })[0]

        # Compose report string
        html_string = "<html><head><title>Reconstruct Plant Metabolism Report</title></head><body>"
        html_string += "<h2>Reconstruct Plant Metabolism Report</h2>"
        html_string += "<p>The \"Reconstruct Plant Metabolism\" app has finished running, "
        html_string += "reconstructing the primary metabolism from the "
        html_string += "enzymatic annotations in " + input_params['input_genome'] + ".</p>"
        html_string += "<p>Below we present the table of compartmentalized reactions in the metabolic reconstruction. "
        html_string += "It is similar to what you can see in the FBAModel viewer widget that appears "
        html_string += "below the report, but it has some additional information. Each row in the table corresponds "
        html_string += "to a unique combination of reaction and compartment.</p>"
        html_string += "<p><ul>"
        html_string += "<li><b>Subsystems and Classes:</b> The table contains the metabolic subsystems and "
        html_string += "the general class of metabolism they fall into.</li>"
        html_string += "<li><b>Metabolic functions and EC numbers:</b> The table contains the original enzymatic "
        html_string += "annotations ('Roles') and the EC numbers that were associated with each biochemical reaction.</li>"
        html_string += "<li><b>Complexes:</b> The table contains the genes that were annotated with the metabolic functions. "
        html_string += "The genes associated with each reaction can be seen in the FBAModel viewer widget, but here "
        html_string += "one can see how they may be organized into protein complexes. Each set of parentheses '()' "
        html_string += "represents a single protein subunit (which may be the entire enzyme, or part of a large enzymatic "
        html_string += "complex). Each set of square brackets '[]' represents an entire enzyme, regardless of how many "
        html_string += "subunits it consists of. Each reaction may be catalyzed by different enzymes, each in turn composed "
        html_string += "of different subunits. The complexes reflect how the enzymes were curated in <i>Arabidopsis thaliana</i>, "
        html_string += "so if any complex is shown to be empty, this means that the enzymatic annotation was not propagated "
        html_string += "from the original Arabidopsis gene. The original Arabidopsis curation also included protein localization, "
        html_string += "so if a reaction has empty complexes in some compartments as opposed to others, this is an indication "
        html_string += "that annotation was only propagated for some localized Arabidopsis enzymes, and not others."
html_string += "</ul></p>" # Fetch PlantSEED Data with open( os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3", "PlantSEED_Roles.json")) as plsd_fh: PS_Roles = json.load(plsd_fh) plantseed = FetchPlantSEEDImpl() reactions_data = plantseed.fetch_reactions(PS_Roles) table = GenerateTableImpl() table_html_string = table.generate_table(reactions_data, complexes=rxncplxs_dict) with open( os.path.join( '/kb/module/data', 'app_report_templates', 'integrate_abundances_report_tables_template.html') ) as report_template_file: report_template_string = report_template_file.read() # Generate and insert html Title report_template_string = report_template_string.replace( '*TITLE*', input_params['output_fbamodel']) # Insert html table table_report_string = report_template_string.replace( '*TABLES*', html_string + table_html_string) #Make folder for report files uuid_string = str(uuid.uuid4()) report_file_path = os.path.join(self.shared_folder, uuid_string) os.mkdir(report_file_path) #Write html files with open(os.path.join(report_file_path, "index.html"), 'w') as index_file: index_file.write(table_report_string) #Cache it in shock as an archive upload_info = self.dfu.file_to_shock({ 'file_path': report_file_path, 'pack': 'zip' }) #Prepare report parameters report_params = { 'direct_html_link_index': 0, #Use to refer to index of 'html_links' 'workspace_name': input_params['input_ws'], 'report_object_name': 'plant_fba_' + uuid_string, 'objects_created': [], 'html_links': [] } #Html Link object html_link = { 'shock_id': upload_info['shock_id'], 'name': 'index.html', 'label': 'html files', 'description': 'HTML files' } report_params['html_links'].append(html_link) #Objects created object saved_model_ref = "{}/{}/{}".format(saved_model_list[6], saved_model_list[0], saved_model_list[4]) saved_model_desc = "FBAModel: " + input_params['output_fbamodel'] report_params['objects_created'].append({ 'ref': saved_model_ref, 'description': saved_model_desc }) kbase_report_client = KBaseReport(self.callback_url, token=self.token) report_client_output = kbase_report_client.create_extended_report( report_params) output_report = dict() output_report['report_name'] = report_client_output['name'] output_report['report_ref'] = report_client_output['ref'] #END reconstruct_plant_metabolism # At some point might do deeper type checking... if not isinstance(output_report, dict): raise ValueError( 'Method reconstruct_plant_metabolism return value ' + 'output_report is not type dict as required.') # return the results return [output_report] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
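
# --- Illustrative sketch (not part of the app) -------------------------------
# reconstruct_plant_metabolism() above maps annotated roles onto template roles
# by normalizing both sides with convert_search_role() and by reading any
# "#"-separated compartment hints off the end of the function string, defaulting
# to the cytosol when none are given.  The standalone helper below sketches that
# parsing step under those assumptions; the role and compartment names in the
# example are made up for illustration.

import re

def parse_function_comment(function_comment):
    """Split 'Role A; Role B # chloroplast # mitochondrion' into
    (normalized_roles, compartments)."""
    parts = [p.strip() for p in function_comment.split('#')]
    function = parts.pop(0)
    compartments = parts or ['cytosol']          # default compartment
    roles = re.split(r'\s*;\s+|\s+[@/]\s+', function)
    normalized = []
    for role in roles:
        searchrole = role.strip().replace(' ', '').lower()
        # strip a trailing EC number such as "(ec2.7.1.40)"
        searchrole = re.sub(r'\(ec[\d-]+\.[\d-]+\.[\d-]+\.[\d-]+\)', '', searchrole)
        normalized.append(searchrole)
    return normalized, compartments

# parse_function_comment('Pyruvate kinase (EC 2.7.1.40) # chloroplast')
# -> (['pyruvatekinase'], ['chloroplast'])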
def run_FamaGenomeProfiling(self, ctx, params): """ Run genome functional profiling module of Fama. :param params: instance of type "FamaGenomeProfilingParams" (Parameters for genome functional profiling. workspace_name - the name of the workspace for input/output genome_refs - references to a genome object ref_dataset - the name of Fama reference dataset output_result_name - the name of the output DomainAnnotation) -> structure: parameter "workspace_name" of String, parameter "genome_ref" of list of String, parameter "ref_dataset" of String, parameter "output_feature_set_name" of String, parameter "output_annotation_name" of String :returns: instance of type "ReportResults" (Output report parameters report_name - the name of the report object report_ref - the reference to the report object) -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_FamaGenomeProfiling # Import protein sequences from input genome_ref ws_client = Workspace(self.ws_url) input_genome_refs = params['genome_ref'] fama_reference = params['ref_dataset'] input_proteins = {} name2ref = {} for input_genome_ref in input_genome_refs: ret = ws_client.get_objects2( {'objects': [{ 'ref': input_genome_ref }]})['data'][0] obj_data = ret['data'] obj_name = ret['info'][1] obj_type = ret['info'][2].split('.')[1].split('-')[0] if obj_type == 'GenomeSet': print('GenomeSet data', obj_data) genome_refs = [] if 'elements' in obj_data: genome_refs = [ item['ref'] for item in obj_data['elements'].values() ] elif 'items' in obj_data: genome_refs = [item['ref'] for item in obj_data['items']] for sub_obj_ref in genome_refs: ret = ws_client.get_objects2( {'objects': [{ 'ref': sub_obj_ref }]})['data'][0] genome_data = ret['data'] genome_name = ret['info'][1] if genome_name in name2ref: raise ServerError( 'All input genome names must be unique. 
Check ' + genome_name) name2ref[genome_name] = sub_obj_ref proteins = genome_proteins_to_fasta( genome_data, self.shared_folder) input_proteins[genome_name] = {} input_proteins[genome_name]['fwd'] = proteins elif obj_type == 'Genome': if obj_name in name2ref: raise ServerError('All input genome names must be unique') name2ref[obj_name] = input_genome_ref proteins = genome_proteins_to_fasta(obj_data, self.shared_folder) input_proteins[obj_name] = {} input_proteins[obj_name]['fwd'] = proteins else: raise ServerError('Incompatible object: ' + input_genome_ref + ' (' + obj_name + ')') self.log('Input sequence files:', str(input_proteins)) self.log('reference: ', fama_reference) # Run Fama fama_params = { 'input_proteins': input_proteins, 'work_dir': self.shared_folder, 'reference': fama_reference, 'ws_name': params['workspace_name'], 'ws_client': ws_client, 'featureset_name': params['output_feature_set_name'], 'annotation_prefix': params['output_annotation_name'], 'name2ref': name2ref } fama_output = protein_functional_profiling_pipeline(fama_params) objects_created = fama_output['objects_created'] dfu = DataFileUtil(self.callback_url) workspace_id = dfu.ws_name_to_id(params['workspace_name']) object_type = 'KBaseCollections.FeatureSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': fama_output['feature_set_data'], 'name': params['output_feature_set_name'] }] } try: dfu_oi = dfu.save_objects(save_object_params)[0] except ServerError as dfue: # not really any way to test this block self.log('Logging exception saving feature set') self.log(str(dfue)) raise feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) objects_created.append({ 'ref': feature_set_obj_ref, 'description': 'Filtered genome features' }) self.log('FeatureSet saved to ' + feature_set_obj_ref) # Write HTML output to workspace message = 'Fama protein functional profiling finished successfully' try: dfu_output = dfu.file_to_shock( {'file_path': fama_output['html_report']}) except ServerError as dfue: # not really any way to test this block self.log('Logging exception loading results to shock') self.log(str(dfue)) raise self.log('HTML report saved: ' + str(dfu_output)) html_links = [{ 'shock_id': dfu_output['shock_id'], 'description': 'HTML report for Fama App', 'name': 'fama_report.html', 'label': 'Fama_report' }] for krona_file in fama_output['krona_charts']: try: dfu_output = dfu.file_to_shock({'file_path': krona_file}) html_links.append({ 'shock_id': dfu_output['shock_id'], 'description': 'Krona chart for function taxonomy profile', 'name': fama_output['krona_charts'][krona_file][0], 'label': fama_output['krona_charts'][krona_file][1] }) except ServerError as dfue: # not really any way to test this block self.log('Logging exception loading results to shock') self.log(str(dfue)) raise self.log('Krona chart saved: ' + str(dfu_output)) # Save report report_params = { 'message': message, 'objects_created': objects_created, 'direct_html_link_index': 0, 'html_links': html_links, 'file_links': fama_output['report_files'], 'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()), 'workspace_name': params['workspace_name'], 'html_window_height': 460 } try: self.log('Call KBaseReport at ' + str(self.callback_url)) report = KBaseReport(self.callback_url) self.log('Ready to save KBase report: ' + str(report_params)) report_info = report.create_extended_report(report_params) except ServerError as kre: # not really any way to test this block self.log('Logging exception saving 
report') self.log(str(kre)) raise report_info['report_params'] = report_params self.log('KBase report saved: ' + str(report_info)) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_FamaGenomeProfiling # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_FamaGenomeProfiling return value ' + 'output is not type dict as required.') # return the results return [output]
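
# --- Illustrative sketch (not part of the app) -------------------------------
# run_FamaGenomeProfiling() above accepts both Genome and GenomeSet references
# and flattens them into a {name: ref} map, coping with the two GenomeSet
# layouts ('elements' keyed by label vs. a list of 'items').  The helper below
# isolates that dispatch; get_object is a hypothetical stand-in for
# Workspace.get_objects2 and is assumed to return the (data, name, type) triple
# that the app extracts from ret['data'] and ret['info'].

def collect_genome_refs(input_refs, get_object):
    """Return {genome_name: genome_ref}, expanding any GenomeSet inputs."""
    name2ref = {}
    for ref in input_refs:
        data, name, obj_type = get_object(ref)
        if obj_type == 'GenomeSet':
            if 'elements' in data:
                refs = [item['ref'] for item in data['elements'].values()]
            else:
                refs = [item['ref'] for item in data.get('items', [])]
            for sub_ref in refs:
                sub_data, sub_name, _ = get_object(sub_ref)
                if sub_name in name2ref:
                    raise ValueError('All input genome names must be unique: ' + sub_name)
                name2ref[sub_name] = sub_ref
        elif obj_type == 'Genome':
            if name in name2ref:
                raise ValueError('All input genome names must be unique: ' + name)
            name2ref[name] = ref
        else:
            raise ValueError('Incompatible object: {} ({})'.format(ref, name))
    return name2ref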
class BiomUtil: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _process_params(self, params): logging.info('start validating import_matrix_from_biom params') # check for required parameters for p in [ 'obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in self.matrix_types: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) scale = params.get('scale') if scale not in SCALE_TYPES: raise ValueError('Unknown scale type: {}'.format(scale)) biom_file = None tsv_file = None fasta_file = None metadata_keys = DEFAULT_META_KEYS if params.get('biom_tsv'): biom_tsv = params.get('biom_tsv') biom_file = biom_tsv.get('biom_file_biom_tsv') tsv_file = biom_tsv.get('tsv_file_biom_tsv') if not (biom_file and tsv_file): raise ValueError('missing BIOM or TSV file') biom_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': biom_file }).get('copy_file_path') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') mode = 'biom_tsv' elif params.get('biom_fasta'): biom_fasta = params.get('biom_fasta') biom_file = biom_fasta.get('biom_file_biom_fasta') fasta_file = biom_fasta.get('fasta_file_biom_fasta') if not (biom_file and fasta_file): raise ValueError('missing BIOM or FASTA file') biom_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': biom_file }).get('copy_file_path') fasta_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': fasta_file }).get('copy_file_path') mode = 'biom_fasta' elif params.get('tsv_fasta'): tsv_fasta = params.get('tsv_fasta') tsv_file = tsv_fasta.get('tsv_file_tsv_fasta') fasta_file = tsv_fasta.get('fasta_file_tsv_fasta') if not (tsv_file and fasta_file): raise ValueError('missing TSV or FASTA file') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') fasta_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': fasta_file }).get('copy_file_path') metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta') if metadata_keys_str: metadata_keys += [ x.strip() for x in metadata_keys_str.split(',') ] mode = 'tsv_fasta' elif params.get('tsv'): tsv = params.get('tsv') tsv_file = tsv.get('tsv_file_tsv') if not tsv_file: raise ValueError('missing TSV file') tsv_file = self.dfu.download_staging_file({ 'staging_file_subdir_path': tsv_file }).get('copy_file_path') metadata_keys_str = tsv.get('metadata_keys_tsv') if metadata_keys_str: metadata_keys += [ x.strip() for x in metadata_keys_str.split(',') ] mode = 'tsv' else: raise ValueError('missing valide file group type in parameters') return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys))) def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False): if key in biom_metadata_dict: return {k.lower(): v for k, v in biom_metadata_dict.items()}.get(key) elif key in tsv_metadata_df: return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key) elif required: raise ValueError('missing necessary [{}] from file'.format(key)) else: return None def _search_taxon(self, scientific_name): """ logic borrowed from: GFU.GenomeInterface 
https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216 """ taxon_id = None search_params = { "object_types": ["taxon"], "match_filter": { "lookup_in_keys": { "scientific_name": { "value": scientific_name } }, "exclude_subobjects": 1 }, "access_filter": { "with_private": 0, "with_public": 1 }, "sorting_rules": [{ "is_object_property": 0, "property": "timestamp", "ascending": 0 }] } objects = self.kbse.search_objects(search_params)['objects'] if not objects: search_params['match_filter']['lookup_in_keys'] = { "aliases": { "value": scientific_name } } objects = self.kbse.search_objects(search_params)['objects'] if objects: taxon_id = objects[0].get('object_name') return taxon_id def _fetch_taxon_level(self, taxon_char): taxon_level_mapping = { 'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum', 'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus', 's': 'Species' } return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown') def _fetch_taxonomy(self, datarow): lineage = self._retrieve_value([], datarow, 'taxonomy') if isinstance(lineage, str): delimiter = csv.Sniffer().sniff(lineage).delimiter lineage = [x.strip() for x in lineage.split(delimiter)] taxonomy = {'lineage': lineage} for key in ['score', 'taxonomy_source', 'species_name']: val = self._retrieve_value([], datarow, key) if val: taxonomy[key] = val for item in lineage[::-1]: scientific_name = item.split('_')[-1] taxon_level_char = item.split('_')[0] if scientific_name: taxon_id = self._search_taxon(scientific_name) if taxon_id: taxon_ref = f"{self.taxon_wsname}/{taxon_id}" taxon_level = self._fetch_taxon_level(taxon_level_char) taxonomy.update({ 'taxon_ref': taxon_ref, 'taxon_id': taxon_id, 'scientific_name': scientific_name, 'taxon_level': taxon_level }) break return taxonomy def _retrieve_tsv_amplicon_set_data(self, tsv_file): amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide TSV file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start processing each row in TSV') for observation_id in df.index: taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished parsing TSV file') return amplicons def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file): amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") except Exception: raise ValueError( 'Cannot parse file. Please provide valide FASTA file') try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. 
Please provide valide TSV file') logging.info('start processing files') for observation_id in df.index: if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file): amplicons = dict() try: logging.info('start parsing FASTA file') fastq_dict = SeqIO.index(fasta_file, "fasta") except Exception: raise ValueError( 'Cannot parse file. Please provide valide FASTA file') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in fastq_dict: raise ValueError('FASTA file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(observation_metadata[index]) amplicon = { 'consensus_sequence': str(fastq_dict.get(observation_id).seq), 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file): amplicons = dict() try: logging.info('start parsing TSV file') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide tsv file') if 'consensus_sequence' not in df.columns.tolist(): raise ValueError('TSV file does not include consensus_sequence') logging.info('start parsing BIOM file') table = biom.load_table(biom_file) observation_ids = table._observation_ids.tolist() observation_metadata = table._observation_metadata logging.info('start processing files') for index, observation_id in enumerate(observation_ids): if observation_id not in df.index: raise ValueError('TSV file does not have [{}] OTU id'.format( observation_id)) taxonomy = self._fetch_taxonomy(df.loc[observation_id]) amplicon = { 'consensus_sequence': df.loc[observation_id, 'consensus_sequence'], 'taxonomy': taxonomy } amplicons.update({observation_id: amplicon}) logging.info('finished processing files') return amplicons def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref): logging.info('start parsing amplicon_set_data') amplicon_set_data = dict() if mode == 'biom_tsv': amplicons = self._retrieve_biom_tsv_amplicon_set_data( biom_file, tsv_file) elif mode == 'biom_fasta': amplicons = self._retrieve_biom_fasta_amplicon_set_data( biom_file, fasta_file) elif mode == 'tsv_fasta': amplicons = self._retrieve_tsv_fasta_amplicon_set_data( tsv_file, fasta_file) elif mode == 'tsv': amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file) else: raise ValueError( 'error parsing _file_to_amplicon_set_data, mode: {}'.format( mode)) amplicon_set_data.update({'amplicons': amplicons}) if 'reads_set_ref' in refs: amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref') if description: amplicon_set_data['description'] = description matrix_obj_ref_array = matrix_obj_ref.split('/') amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format( 
matrix_obj_ref_array[0], matrix_obj_ref_array[1]) return amplicon_set_data def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys=None): amplicon_data = refs if mode.startswith('biom'): logging.info('start parsing BIOM file for matrix data') table = biom.load_table(biom_file) observation_metadata = table._observation_metadata sample_metadata = table._sample_metadata matrix_data = { 'row_ids': table._observation_ids.tolist(), 'col_ids': table._sample_ids.tolist(), 'values': table.matrix_data.toarray().tolist() } logging.info('start building attribute mapping object') amplicon_data.update( self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data.update( self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} for k in ('create_date', 'generated_by'): val = getattr(table, k) if not val: continue if isinstance(val, bytes): amplicon_data['attributes'][k] = val.decode('utf-8') else: amplicon_data['attributes'][k] = str(val) elif mode.startswith('tsv'): observation_metadata = None sample_metadata = None try: logging.info('start parsing TSV file for matrix data') reader = pd.read_csv(tsv_file, sep=None, iterator=True) inferred_sep = reader._engine.data.dialect.delimiter df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0) except Exception: raise ValueError( 'Cannot parse file. Please provide valide tsv file') else: metadata_df = None if metadata_keys: shared_metadata_keys = list( set(metadata_keys) & set(df.columns)) if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys: raise ValueError( 'TSV file does not include consensus_sequence') if shared_metadata_keys: metadata_df = df[shared_metadata_keys] df.drop(columns=shared_metadata_keys, inplace=True) try: df = df.astype(float) except ValueError: err_msg = 'Found some non-float values. 
Matrix contains only numeric values\n' err_msg += 'Please list any non-numeric column names in Metadata Keys field' raise ValueError(err_msg) df.fillna(0, inplace=True) matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist() } logging.info('start building attribute mapping object') amplicon_data.update( self.get_attribute_mapping("row", observation_metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df)) amplicon_data.update( self.get_attribute_mapping("col", sample_metadata, matrix_data, matrix_name, refs, workspace_id)) amplicon_data['attributes'] = {} else: raise ValueError( 'error parsing _file_to_amplicon_data, mode: {}'.format(mode)) amplicon_data.update({'data': matrix_data}) amplicon_data['search_attributes'] = [ f'{k}|{v}' for k, v in amplicon_data['attributes'].items() ] amplicon_data['scale'] = scale if description: amplicon_data['description'] = description return amplicon_data def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs, workspace_id, metadata_df=None): mapping_data = {} axis_ids = matrix_data[f'{axis}_ids'] if refs.get(f'{axis}_attributemapping_ref'): am_data = self.dfu.get_objects( {'object_refs': [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data'] unmatched_ids = set(axis_ids) - set(am_data['instances'].keys()) if unmatched_ids: name = "Column" if axis == 'col' else "Row" raise ValueError( f"The following {name} IDs from the uploaded matrix do not match " f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}" f"\nPlease verify the input data or upload an excel file with a" f"{name} mapping tab.") else: mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata: name = matrix_name + "_{}_attributes".format(axis) mapping_data[ f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping( axis_ids, metadata, name, workspace_id) # if coming from biom file, metadata and axis IDs are guaranteed to match mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} elif metadata_df is not None: name = matrix_name + "_{}_attributes".format(axis) mapping_data[ f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping( axis_ids, metadata_df, name, workspace_id) mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids} return mapping_data def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id): data = {'ontology_mapping_method': "TSV file", 'instances': {}} attribute_keys = metadata_df.columns.tolist() data['attributes'] = [{ 'attribute': key, 'source': 'upload' } for key in attribute_keys] for axis_id in axis_ids: data['instances'][axis_id] = metadata_df.loc[axis_id].tolist() logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, "objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id): data = {'ontology_mapping_method': "BIOM file", 'instances': {}} sample_set = metadata[0:min(len(metadata), 25)] metadata_keys = sorted( set((k for m_dict in sample_set for k in m_dict))) data['attributes'] = [{ 'attribute': key, 'source': 'upload' } for key in metadata_keys] for inst, meta in zip(instances, metadata): data['instances'][inst] = [ str(meta[attr]) for attr in metadata_keys ] logging.info( 'start saving AttributeMapping object: {}'.format(obj_name)) info = self.dfu.save_objects({ "id": ws_id, 
"objects": [{ "type": "KBaseExperiments.AttributeMapping", "data": data, "name": obj_name }] })[0] return f'{info[6]}/{info[0]}/{info[4]}' def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name): """ _generate_report: generate summary report """ objects_created = [{ 'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix' }, { 'ref': amplicon_set_obj_ref, 'description': 'Imported Amplicon Set' }] if new_row_attr_ref: objects_created.append({ 'ref': new_row_attr_ref, 'description': 'Imported Amplicons(Row) Attribute Mapping' }) if new_col_attr_ref: objects_created.append({ 'ref': new_col_attr_ref, 'description': 'Imported Samples(Column) Attribute Mapping' }) report_params = { 'message': '', 'objects_created': objects_created, 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref): logging.info('writting amplicon set data frame to tsv file') amplicon_set_obj = self.dfu.get_objects( {'object_refs': [amplicon_set_ref]})['data'][0] amplicon_set_info = amplicon_set_obj['info'] amplicon_set_name = amplicon_set_info[1] file_path = os.path.join(result_dir, amplicon_set_name + ".tsv") amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True) return file_path def _amplicon_set_to_df(self, amplicon_set_ref): logging.info('converting amplicon set to data frame') am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref] })['data'][0]['data'] amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref') matrix_data = self.dfu.get_objects( {'object_refs': [amplicon_matrix_ref]})['data'][0]['data'] matrix_value_data = matrix_data.get('data') index = matrix_value_data.get('row_ids') columns = matrix_value_data.get('col_ids') values = matrix_value_data.get('values') df = pd.DataFrame(values, index=index, columns=columns) amplicons = am_set_data.get('amplicons') meta_index = list() meta_columns = [ 'taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score', 'taxonomy_source', 'species_name', 'consensus_sequence' ] meta_values = list() for otu_id, amplicon in amplicons.items(): meta_index.append(otu_id) taxonomy_data = amplicon.get('taxonomy') taxonomy = taxonomy_data.get('lineage') taxon_id = taxonomy_data.get('taxon_id') taxon_ref = taxonomy_data.get('taxon_ref') taxon_level = taxonomy_data.get('taxon_level') score = taxonomy_data.get('score') taxonomy_source = taxonomy_data.get('taxonomy_source') species_name = taxonomy_data.get('species_name') consensus_sequence = amplicon.get('consensus_sequence') meta_values.append([ taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source, species_name, consensus_sequence ]) meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns) merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left', validate='one_to_one') return merged_df def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.data_util = DataUtil(config) self.attr_util = AttributesUtil(config) self.matrix_util = MatrixUtil(config) self.matrix_types = [ x.split(".")[1].split('-')[0] for x in 
self.data_util.list_generic_types() ] self.taxon_wsname = config['taxon-workspace-name'] self.kbse = KBaseSearchEngine(config['search-url']) def import_matrix_from_biom(self, params): """ arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_attributemapping_ref: column AttributeMapping reference row_attributemapping_ref: row AttributeMapping reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params) workspace_name = params.get('workspace_name') matrix_name = params.get('matrix_name') amplicon_set_name = params.get('amplicon_set_name') obj_type = params.get('obj_type') scale = params.get('scale') description = params.get('description') refs = {k: v for k, v in params.items() if "_ref" in k} if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys) new_row_attr_ref = None if not params.get('row_attributemapping_ref'): new_row_attr_ref = amplicon_data.get('row_attributemapping_ref') new_col_attr_ref = None if not params.get('col_attributemapping_ref'): new_col_attr_ref = amplicon_data.get('col_attributemapping_ref') logging.info('start saving Matrix object: {}'.format(matrix_name)) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_name': workspace_id })['obj_ref'] amplicon_set_data = self._file_to_amplicon_set_data( biom_file, tsv_file, fasta_file, mode, refs, description, matrix_obj_ref) logging.info( 'start saving AmpliconSet object: {}'.format(amplicon_set_name)) amplicon_set_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseExperiments.AmpliconSet', 'obj_name': amplicon_set_name, 'data': amplicon_set_data, 'workspace_name': workspace_id })['obj_ref'] logging.info( 'start resaving Matrix object with amplicon set: {}'.format( matrix_name)) amplicon_data['amplicon_set_ref'] = '{}/{}'.format( workspace_id, amplicon_set_name) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_name': workspace_id })['obj_ref'] returnVal = { 'matrix_obj_ref': matrix_obj_ref, 'amplicon_set_obj_ref': amplicon_set_obj_ref } report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_name) returnVal.update(report_output) return returnVal def export_amplicon_set_tsv(self, params): """ export AmpliconSet as TSV """ logging.info('start exporting amplicon set object') amplicon_set_ref = params.get('input_ref') amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref) result_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_dir) self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref) package_details = self.dfu.package_for_download({ 'file_path': result_dir, 'ws_refs': [amplicon_set_ref] }) return {'shock_id': package_details['shock_id']}
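
# --- Illustrative sketch (not part of the app) -------------------------------
# BiomUtil above infers the TSV delimiter by creating an iterator reader and
# then reaching into the private pandas attribute reader._engine.data.dialect.
# The standard-library csv.Sniffer (already used in _fetch_taxonomy for lineage
# strings) gives the same answer without relying on pandas internals.  A minimal
# sketch of that alternative, assuming a small delimited text file:

import csv
import pandas as pd

def read_delimited_table(file_path, sample_bytes=4096):
    """Load a delimited file into a DataFrame, sniffing the separator."""
    with open(file_path, 'r') as fh:
        dialect = csv.Sniffer().sniff(fh.read(sample_bytes))
    return pd.read_csv(file_path, sep=dialect.delimiter, index_col=0)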
class DataUtil: @staticmethod def _find_between(s, start, end): """ _find_between: find string in between start and end """ return re.search('{}(.*){}'.format(start, end), s).group(1) def _find_constraints(self, obj_type): """ _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required) """ type_info = self.wsClient.get_type_info(obj_type) type_desc = type_info.get('description') constraints = {} for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'): constraints[tag] = [line.strip().split()[1:] for line in type_desc.split("\n") if line.startswith(f'@{tag}')] return constraints def _filter_constraints(self, constraints, data): """filters out constraints with missing keys""" contains_constraints = constraints.get('contains') filtered_constraints = [] for contains_constraint in contains_constraints: in_values = contains_constraint[1:] missing_key = True for in_value in in_values: if in_value.startswith('values'): search_value = re.search('{}(.*){}'.format('\(', '\)'), in_value).group(1) unique_list = search_value.split('.') key = unique_list[0] elif ':' in in_value: key = in_value.split(':')[0] else: unique_list = in_value.split('.') key = unique_list[0] if key in data: missing_key = False break if missing_key: filtered_constraints.append(contains_constraint) for x in filtered_constraints: contains_constraints.remove(x) return constraints def _retrieve_value(self, data, value): """Parse the provided 'data' object to retrieve the item in 'value'.""" logging.info('Getting value for {}'.format(value)) retrieve_data = [] m_data = DotMap(data) if value.startswith('set('): retrieve_data = value[4:-1].split(",") elif value.startswith('values('): # TODO: nested values e.g. values(values(ids)) search_value = re.search('{}(.*){}'.format('\(', '\)'), value).group(1) unique_list = search_value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp.values()) elif ':' in value: obj_ref = getattr(m_data, value.split(':')[0]) if obj_ref: included = value.split(':')[1] included = '/' + included.replace('.', '/') ref_data = self.wsClient.get_objects2({'objects': [{'ref': obj_ref, 'included': [included]}]})['data'][0]['data'] m_ref_data = DotMap(ref_data) if ref_data: if '*' not in included: for key in included.split('/')[1:]: m_ref_data = getattr(m_ref_data, key) else: keys = included.split('/')[1:] m_ref_data = [x.get(keys[2]) for x in ref_data.get(keys[0])] # TODO: only works for 2 level nested data like '/features/[*]/id' retrieve_data = list(m_ref_data) else: unique_list = value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp) logging.info('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20])) return retrieve_data def _validate(self, constraints, data): """ _validate: validate data """ validated = True failed_constraints = defaultdict(list) unique_constraints = constraints.get('unique') for unique_constraint in unique_constraints: retrieved_value = self._retrieve_value(data, unique_constraint[0]) if len(set(retrieved_value)) != len(retrieved_value): validated = False failed_constraints['unique'].append(unique_constraint[0]) contains_constraints = constraints.get('contains') for contains_constraint in contains_constraints: value = contains_constraint[0] in_values = contains_constraint[1:] retrieved_in_values = [] for in_value in in_values: retrieved_in_values += self._retrieve_value(data, in_value) 
if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)): validated = False failed_constraints['contains'].append(" ".join(contains_constraint)) conditional_constraints = constraints.get('conditionally_required') for conditional_constraint in conditional_constraints: trigger = conditional_constraint[0] required_keys = conditional_constraint[1:] if trigger in data: missing_keys = [key for key in required_keys if key not in data] if missing_keys: validated = False failed_constraints['conditionally_required'].append( (trigger, required_keys, missing_keys)) return validated, failed_constraints @staticmethod def _mkdir_p(path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise @staticmethod def _raise_validation_error(params, validate): """Raise a meaningful error message for failed validation""" logging.error('Data failed type checking') failed_constraints = validate.get('failed_constraints') error_msg = ['Object {} failed type checking:'.format(params.get('obj_name'))] if failed_constraints.get('unique'): unique_values = failed_constraints.get('unique') error_msg.append('Object should have unique field: {}'.format(unique_values)) if failed_constraints.get('contains'): contained_values = failed_constraints.get('contains') for contained_value in contained_values: subset_value = contained_value.split(' ')[0] super_value = ' '.join(contained_value.split(' ')[1:]) if 'col_mapping' in super_value: error_msg.append('Column attribute mapping instances should contain all ' 'column index from original data') if 'row_mapping' in super_value: error_msg.append('Row attribute mapping instances should contain all row ' 'index from original data') error_msg.append('Object field [{}] should contain field [{}]'.format( super_value, subset_value)) for failure in failed_constraints.get('conditionally_required', []): error_msg.append('If object field "{}" is present than object field(s) {} should ' 'also be present. Object is missing {}'.format(*failure)) raise ValueError('\n'.join(error_msg)) def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.scratch = config['scratch'] self.serviceWizardURL = config['srv-wiz-url'] self.wsClient = workspaceService(self.ws_url, token=self.token) self.dfu = DataFileUtil(self.callback_url) self.generics_service = GenericsService(self.serviceWizardURL) self.ws_large_data = WsLargeDataIO(self.callback_url) def list_generic_types(self, params=None): """ *Not yet exposed in spec* list_generic_types: lists the current valid generics types arguments: none return: A list of generic types in the current environment """ returnVal = [x['type_def'] for module in GENERICS_MODULES for x in self.wsClient.get_all_type_info(module)] return returnVal def fetch_data(self, params): """ fetch_data: fetch generics data as pandas dataframe for a generics data object arguments: obj_ref: generics object reference optional arguments: generics_module: the generics data module to be retrieved from e.g. 
for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; generics_module should be {'data': 'FloatMatrix2D', 'condition_set_ref': 'condition_set_ref'} return: data_matrix: a pandas dataframe in json format """ for p in ['obj_ref']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) return self.generics_service.fetch_data(params) def validate_data(self, params): """ validate_data: validate data arguments: obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1' data: obj data to be validated return: validated: True or False """ constraints = self._find_constraints(params.get('obj_type')) data = params.get('data') constraints = self._filter_constraints(constraints, data) validated, failed_constraints = self._validate(constraints, data) return {'validated': validated, 'failed_constraints': failed_constraints} def save_object(self, params): """ save_object: validate data constraints and save matrix object arguments: obj_type: saving object data type obj_name: saving object name data: data to be saved workspace_name: workspace name matrix object to be saved to return: obj_ref: object reference """ logging.info('Starting validating and saving object data') obj_type = params.get('obj_type').split('-')[0] module_name = obj_type.split('.')[0] type_name = obj_type.split('.')[1] types = self.wsClient.get_module_info({'mod': module_name}).get('types') for module_type in types: if self._find_between(module_type, '\.', '\-') == type_name: obj_type = module_type break data = dict((k, v) for k, v in params.get('data').items() if v) validate = self.validate_data({'obj_type': obj_type, 'data': data}) if not validate.get('validated'): self._raise_validation_error(params, validate) # make sure users with shared object have access to the handle file upon saving handle = data.get('sequencing_file_handle') if handle: output_directory = os.path.join(self.scratch, str(uuid.uuid4())) logging.info('Downloading consensus sequence file in {}'.format(output_directory)) self._mkdir_p(output_directory) matrix_fasta_file = self.dfu.shock_to_file({ 'handle_id': handle, 'file_path': self.scratch}).get('file_path') logging.info('Saving consensus sequence file to shock: {}'.format(matrix_fasta_file)) handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file, 'make_handle': True})['handle']['hid'] data['sequencing_file_handle'] = handle_id # cast data int_data_names = ['sequencing_quality_filter_cutoff', 'read_length_cutoff'] for data_name in int_data_names: if data_name in data: try: logging.info('Casting {} to int'.format(data_name)) data[data_name] = int(data[data_name]) except Exception as e: err_msg = 'Unexpected data type {}. '.format(data_name) err_msg += 'Data type {} requests {} to be an integer value. '.format( obj_type, data_name) err_msg += 'Provided [{}] {} instead'.format( type(data[data_name]), data[data_name]) raise ValueError(err_msg) from e float_data_names = ['barcode_error_rate', 'sequence_error_cutoff', 'clustering_cutoff'] for data_name in float_data_names: if data_name in data: try: logging.info('Casting {} to float'.format(data_name)) data[data_name] = float(data[data_name]) except Exception as e: err_msg = 'Unexpected data type {}. '.format(data_name) err_msg += 'Data type {} requests {} to be a float value. 
'.format( obj_type, data_name) err_msg += 'Provided [{}] {} instead'.format( type(data[data_name]), data[data_name]) raise ValueError(err_msg) from e ws_name_id = params.get('workspace_id') workspace_name = params.get('workspace_name') if not ws_name_id: if not isinstance(workspace_name, int): ws_name_id = self.dfu.ws_name_to_id(workspace_name) else: ws_name_id = workspace_name try: logging.info('Starting saving object via DataFileUtil') info = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data": data, "name": params.get('obj_name') }] })[0] except Exception: logging.info('Saving object via DataFileUtil failed') logging.info('Starting saving object via WsLargeDataIO') data_path = os.path.join(self.scratch, params.get('obj_name') + "_" + str(uuid.uuid4()) + ".json") json.dump(data, open(data_path, 'w')) info = self.ws_large_data.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data_json_file": data_path, "name": params.get('obj_name') }] })[0] return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
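# Illustration only (not part of the module above): save_object first tries
# DataFileUtil.save_objects and, if that call fails (e.g. for very large objects),
# writes the data to a JSON file and retries through WsLargeDataIO. Below is a
# minimal, service-free sketch of that fallback pattern; save_small and save_large
# are hypothetical stand-ins for dfu.save_objects and ws_large_data.save_objects.

import json
import os
import uuid


def save_with_large_data_fallback(data, obj_type, obj_name, ws_id,
                                  save_small, save_large, scratch='/tmp'):
    """Try the in-memory save first; fall back to a file-based save on any failure."""
    try:
        info = save_small({
            'id': ws_id,
            'objects': [{'type': obj_type, 'data': data, 'name': obj_name}]
        })[0]
    except Exception:
        # Dump the payload to scratch and hand the file path to the large-data saver,
        # mirroring the WsLargeDataIO branch in save_object above.
        data_path = os.path.join(scratch, '{}_{}.json'.format(obj_name, uuid.uuid4()))
        with open(data_path, 'w') as fh:
            json.dump(data, fh)
        info = save_large({
            'id': ws_id,
            'objects': [{'type': obj_type, 'data_json_file': data_path, 'name': obj_name}]
        })[0]
    # Object references are assembled as workspace_id/object_id/version, as elsewhere.
    return '%s/%s/%s' % (info[6], info[0], info[4])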
class PDBUtil: # “Expect Value” threshold to restrict which alignments will be significant E_VALUE_THRESH = 1e-20 # BLAST sequence identity threshold to determine which pdb structures will be # matched to a KBase genome/feature B_IDENTITY_THRESH = 0.6 def _validate_import_pdb_file_params(self, params): """ _validate_import_pdb_file_params: validates input params to import_model_pdb_file and import_experiment_pdb_file """ # check for required parameters for p in ['structure_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) return file_path, params.get('workspace_name'), params.get( 'structure_name') def _model_file_to_data(self, file_path, params): """ _model_file_to_data: Do the PDB conversion--parse the model pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.PDBParser(PERMISSIVE=1) pdb1 = file_path pp_no = 0 data = {} try: structure = parser.get_structure("test", pdb1) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'PDBParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 # logging.info(f'Getting pdb structure data for {structure}!') (compound, source) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) (num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) model = structure[0] protein_data = self._get_proteins_by_structure( structure, model.get_id(), file_path) (protein_data, params) = self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): data = { 'name': structure.header.get('name', ''), 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'compound': compound, 'source': source, 'proteins': protein_data } else: logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' 
) data = {} finally: return data, pp_no, params def _exp_file_to_data(self, file_path, params): """ _exp_file_to_data: Do the PDB conversion--parse the experiment pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.MMCIFParser() cif = file_path pp_no = 0 mmcif_data = None try: structure = parser.get_structure("PHA-L", cif) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'MMCIFParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 struc_name = structure.header.get('name', '') hd = self._upload_to_shock(file_path) # logging.info(f'Getting pdb structure data for {structure}!') (cpd, src) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) (num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) protein_data = self._get_proteins_by_structure( structure, model_ids[0], file_path) (protein_data, params) = self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): mmcif_data = { 'name': struc_name, 'head': structure.header.get('head', ''), 'rcsb_id': structure.header.get('rcsb_id', ''), 'deposition_date': structure.header.get('deposition_date', ''), 'release_date': structure.header.get('release_date', ''), 'structure_method': structure.header.get('structure_method', ''), 'resolution': structure.header.get('resolution', 0.0), 'structure_reference': structure.header.get('structure_reference', []), 'keywords': structure.header.get('keywords', ''), 'author': structure.header.get('author', ''), 'compound': cpd, 'source': src, 'num_models': num_models, 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'num_het_atoms': structure.header.get('num_het_atoms', 0), 'num_water_atoms': structure.header.get('num_water_atoms', 0), 'num_disordered_atoms': structure.header.get('num_disordered_atoms', 0), 'num_disordered_residues': structure.header.get('num_disordered_residues', 0), 'pdb_handle': hd, 'mmcif_handle': hd, 'xml_handle': hd, 'proteins': protein_data } else: mmcif_data = {} logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' ) finally: return mmcif_data, pp_no, params def _match_features(self, params, protein_data): """ _match_features: match the protein_translation in feature_id with chain sequences in protein_data and compute the seq_identity and determine the exact_match example (in appdev): genome_obj = '57196/6/1', genome_name = 'Synthetic_bacterium_JCVI_Syn3.0_genome' feature_id = 'JCVISYN3_0004_CDS_1', feature_type = 'CDS' OR feature_id = 'JCVISYN3_0004', feature_type = 'gene' """ pdb_info = params.get('pdb_info', None) if pdb_info: kb_feature_type = '' kb_feature_seq = '' genome_name = pdb_info['genome_name'] narr_id = pdb_info['narrative_id'] feature_id = pdb_info['feature_id'] logging.info( f"Looking up for feature {feature_id} in genome {genome_name}'s features" ) # 1. Get the genome's features and reference (gn_ref, kb_genome_features) = self._get_genome_ref_features( narr_id, genome_name) if not gn_ref: logging.info( f"Given genome {genome_name} does not exist in workspace {narr_id}!" ) return protein_data, params pdb_info['genome_ref'] = gn_ref # 2. 
Match the genome features with the specified feature_id to obtain feature sequence for feat in kb_genome_features: if feat['id'] == feature_id: logging.info( f'Found genome feature match for {feature_id}') kb_feature_type = self._get_feature_type(feat) kb_feature_seq = feat.get('protein_translation', '') break pdb_info['feature_type'] = kb_feature_type # 3. Call self._compute_sequence_identity with the feature sequence and the the pdb # proteins' translations to to get the seq_identity and exact_match if kb_feature_seq: logging.info( f"Finding seq_identity and exact_match for feature {feature_id}" f" in genome {genome_name}'s features...") pdb_chain_ids = [] pdb_model_ids = [] pdb_seq_idens = [] pdb_exact_matches = [] for prot in protein_data: seq_idens, seq_mats = self._compute_sequence_identity( kb_feature_seq, prot.get('sequence', '')) if seq_idens: seq_idens.sort() max_iden = seq_idens.pop() if max_iden >= self.B_IDENTITY_THRESH: # get the good matches prot['seq_identity'] = max_iden prot['exact_match'] = 1 if max_iden > 0.99 else 0 prot['genome_ref'] = gn_ref prot['feature_id'] = feature_id prot['feature_type'] = kb_feature_type pdb_chain_ids.append(prot['chain_id']) pdb_model_ids.append(str(prot['model_id'])) pdb_seq_idens.append(str(prot['seq_identity'])) pdb_exact_matches.append(str(prot['exact_match'])) if pdb_seq_idens: pdb_info['sequence_identities'] = ','.join(pdb_seq_idens) if pdb_chain_ids: pdb_info['chain_ids'] = ','.join(pdb_chain_ids) if pdb_model_ids: pdb_info['model_ids'] = ','.join(pdb_model_ids) if pdb_exact_matches: pdb_info['exact_matches'] = ','.join(pdb_exact_matches) else: logging.info( f'Found NO feature in genome that matches with {feature_id}' ) else: logging.info( 'NO KBase genome/feature object info were given for uploading') return protein_data, params def _compute_sequence_identity(self, seq1, seq2): """ _compute_sequence_identity: Given two input sequences, do a blast identity check and then compute and return the matching percentage. 
""" # Create two sequence files Seq1 = SeqRecord(Seq(seq1), id="query_seq") Seq2 = SeqRecord(Seq(seq2), id="subject_seq") blast_dir = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(blast_dir) query_seq = os.path.join(blast_dir, 'seq_qry.fasta') subject_seq = os.path.join(blast_dir, 'seq_sbj.fasta') SeqIO.write(Seq1, query_seq, "fasta") SeqIO.write(Seq2, subject_seq, "fasta") # on my laptop: blastp_path = '/Users/qzhang/miniconda3/bin/blastp' blastp_path = 'blastp' output_file_path = os.path.join(blast_dir, 'blast_output.xml') # Build the BLASTp command blastp_cmd = [blastp_path] blastp_cmd.append('-out') blastp_cmd.append(output_file_path) blastp_cmd.append('-outfmt') blastp_cmd.append('5') blastp_cmd.append('-query') blastp_cmd.append(query_seq) blastp_cmd.append('-subject') blastp_cmd.append(subject_seq) # Run BLASTp and parse the output as XML and then parse the xml file for identity matches exact_matches = [] idens = [] try: p = subprocess.Popen(blastp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) output, errors = p.communicate() if not output: logging.info(f'BLASTp returned: {p.returncode}') logging.info(f'OK> output: {output}') if errors: e = subprocess.CalledProcessError(p.returncode, blastp_cmd, output=output) raise e except OSError as e: logging.info(f'OSError > {e.errno}') logging.info(f'OSError > {e.strerror}') logging.info(f'OSError > {e.filename}') except subprocess.CalledProcessError as e: logging.info(f'CalledError > {e.returncode}') logging.info(f'CalledError > {e.output}') except: logging.info(f'Unexpected error > {sys.exc_info()[0]}') else: with open(output_file_path) as blast_fhd: blast_record = NCBIXML.read(blast_fhd) if blast_record: logging.info(f'query: {blast_record.query[:100]}') for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect < self.E_VALUE_THRESH: logging.info('****Alignment****') logging.info(f'sequence: {alignment.title}') logging.info(f'length: {alignment.length}') logging.info(f'e value: {hsp.expect}') logging.info(f'hsp query: {hsp.query}') logging.info(f'hsp match: {hsp.match}') logging.info(f'hsp subject: {hsp.sbjct}') logging.info( f'hsp identities: {hsp.identities}') logging.info(f'hsp positives: {hsp.positives}') iden = round(hsp.identities / hsp.positives, 6) logging.info(f'identity={iden}') idens.append(iden) if hsp.positives == hsp.identities: exact_matches.append(alignment.title[:100]) return idens, exact_matches def _get_genome_ref_features(self, narr_id, genome_name): """ _get_genome_ref_features: Get the genome reference and features for genome_name """ genome_ref = '' genome_features = [] (genome_info, genome_data) = self._get_object_info_data(narr_id, genome_name) if genome_info and genome_data: genome_ref = '/'.join( [str(narr_id), str(genome_info[0]), str(genome_info[4])]) genome_features = genome_data['features'] return (genome_ref, genome_features) def _get_feature_type(self, feature_obj): """ _get_feature_type: Get the type for the feature object of given feature_obj """ feat_type = feature_obj.get('type', '') if not feat_type: if feature_obj.get('protein_translation'): feat_type = 'gene' else: feat_type = 'other' return feat_type def _get_object_info_data(self, narr_id, obj_name): """ _get_object_info_data: Get the object info/data with given obj_name in narrative narr_id """ obj_info = None obj_data = None if narr_id and obj_name: try: obj_data_res = self.ws_client.get_objects2( {'objects': [{ 'wsid': narr_id, 'name': obj_name }]})['data'][0] obj_info = 
obj_data_res['info'] obj_data = obj_data_res['data'] except: logging.info( f'No object with name {obj_name} exists in workspace {narr_id}' ) logging.info( f'Unexpected error occurred while getting object for {obj_name}' ) pass return (obj_info, obj_data) def _get_atoms_from_structure(self, pdb_structure): """ _get_atoms_from_structure: Given a pdb_structure object, parse atoms into a list of atoms and return it """ atom_ids = [] num_atoms = 0 my_residues = pdb_structure.get_residues() for r_ele in my_residues: for a_ele in r_ele.get_atoms(): num_atoms += 1 atom_ids.append(a_ele.get_id()) return (num_atoms, atom_ids) def _get_residues_from_structure(self, pdb_structure): """ _get_residues_from_structure: Given a pdb_structure object, parse residues into a list and return it """ res_ids = [] num_res = 0 my_res = pdb_structure.get_residues() for r_ele in my_res: if PDB.is_aa(r_ele): num_res += 1 res_ids.append(r_ele.get_id()) return (num_res, res_ids) def _get_chains_from_structure(self, pdb_structure): """ _get_chains: Given a pdb_structure object, parse chain ids into a list and return it """ chain_ids = [] num_chains = 0 my_chains = pdb_structure.get_chains() for c_ele in my_chains: if (c_ele): num_chains += 1 chain_ids.append(c_ele.get_id()) return (num_chains, chain_ids) def _get_models_from_structure(self, pdb_structure): """ _get_models_from_structure: Given a pdb_structure object, parse model ids into a list and return it """ model_ids = [] num_models = 0 my_models = pdb_structure.get_models() for m_ele in my_models: if (m_ele): num_models += 1 model_ids.append(m_ele.get_id()) return (num_models, model_ids) def _get_compound_source(self, structure): """ _get_compound_source: Parse data from given structure for compound and source """ cpd_dict = dict() cpd = structure.header.get('compound', {}) # logging.info(f'Compound:\n {cpd}') if cpd and cpd.get('1'): cpd_dict = cpd.get('1') src_dict = dict() src = structure.header.get('source', {}) # logging.info(f'Source:\n {src}') if src and src.get('1'): src_dict = src.get('1') return (cpd_dict, src_dict) def _get_proteins_by_structure(self, pdb_structure, model, file_path): """ _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data """ ppb = PPBuilder() protein_data = [] # Parse for the chain_id and chain sequence for c_ele in pdb_structure.get_chains(): if (c_ele): c_ppd_list = [] for c_ppd in ppb.build_peptides(c_ele): c_pp_seq = str(c_ppd.get_sequence()) c_ppd_list.append(c_pp_seq) c_seq = ''.join(c_ppd_list) protein_data.append({ 'id': os.path.basename(file_path), 'model_id': model, 'chain_id': c_ele.get_id(), 'sequence': c_seq, 'md5': hashlib.md5(c_seq.encode()).hexdigest() }) return protein_data def _validate_file(self, file_path): """ _validate_file: Check if file_path is accessable, if yes, return the handle """ try: fh = open(file_path, 'r') except IOError as e: if e.errno == errno.ENOENT: # No such file or directory raise ValueError(f'"{file_path}" does not exist!') elif e.errno == errno.EACCES: # Permission denied raise ValueError(f'"{file_path}" cannot be read!') else: raise ValueError(f'"{e.strerror}" error occurred') else: fh.close() return True def _dfu_get_objects(self, obj_ref): """ _dfu_get_objects: call dfu.get_objects to return object data and info """ obj = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0] return obj['data'], obj['info'] def _get_pdb_shock_id(self, obj_ref): """ _get_pdb_shock_id: Return the shock id for the PDB file """ obj_data, obj_info = 
self._dfu_get_objects(obj_ref) return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id'] def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info(f'Start uploading file to shock: {file_path}') file_to_shock_params = { 'file_path': file_path, 'pack': 'gzip', 'make_handle': True, } shock_id = self.dfu.file_to_shock( file_to_shock_params)['handle']['hid'] return shock_id def _generate_report_html(self, pdb_name, pdb_path): """ _generate_report_html: generates the HTML for the upload report """ html_report = list() # Make report directory and copy over files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) result_file_path = os.path.join(output_directory, 'viewer.html') new_pdb_path = os.path.join(output_directory, os.path.basename(pdb_path)) shutil.copy(pdb_path, new_pdb_path) # Fill in template HTML with open( os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html')) as report_template_file: report_template = report_template_file.read()\ .replace('*PDB_NAME*', pdb_name)\ .replace('*PDB_PATH*', os.path.basename(pdb_path)) with open(result_file_path, 'w') as result_file: result_file.write(report_template) html_report.append({ 'path': output_directory, 'name': os.path.basename(result_file_path), 'description': 'HTML report for PDB upload' }) return html_report def _generate_report(self, method_name, pdb_obj_ref, workspace_name, n_poly_pep, pdb_name, pdb_path): """ _generate_report: generate summary report for upload """ output_html_files = self._generate_report_html(pdb_name, pdb_path) report_params = { 'message': f'You uploaded a PDB file. {n_poly_pep} polypeptides detected.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': pdb_obj_ref, 'description': 'Imported PDB' }], 'workspace_name': workspace_name, 'report_object_name': method_name + '_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _validate_batch_import_pdbs_params(self, params): """ _validate_batch_import_pdbs_params: validates params passed to batch_import_pdbs method """ # check for required parameters for p in [ 'structures_name', 'workspace_name', 'metadata_staging_file_path' ]: if p not in params: raise ValueError(f'"{p}" parameter is required, but missing') # metadata_staging_file_path must be from the staging area--must have the staging dir prefix if params.get('metadata_staging_file_path', None): staging_file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('metadata_staging_file_path') }).get('copy_file_path') return (staging_file_path, params['workspace_name'], params['structures_name']) else: error_msg = "Must supply a 'metadata_staging_file_path'" raise ValueError(error_msg) def _read_file_by_type(self, file_path): """ _read_file_by_type: read the file given by file_path depending on its type, return a DataFrame object """ logging.info(f'Reading input from file: {file_path}...') if not self._validate_file(file_path): raise ValueError('Input file is invalid or not found!') df = None file_ext = pathlib.Path(file_path).suffix try: # read the data from file_path depending on its extension if 'csv' in file_ext: df = pd.read_csv(file_path) elif 'tsv' in file_ext: df = pd.read_csv(file_path, '\t') elif 
'xls' in file_ext or 'od' in file_ext: # handle xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions df = pd.read_excel(file_path, index_col=None, engine='openpyxl') else: # invalid file type error_msg = "Invalid input file type, only 'csv/tsv/xlsx' are accepted!" raise ValueError(error_msg) # strip off the leading and trailing whitespaces of the column names df.columns = df.columns.str.strip() except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: logging.info( f'Reading file {file_path} errored with message: {e.message} and data: {e.data}' ) raise return df def _parse_metadata_file(self, metadata_file_path, ws_id): """ _parse_metadata_file: From metadata_file_path, a spreadsheet file, sort out the model_pdb_file_paths, exp_pdb_file_paths and the kbase_meta_data return: lists model_pdb_file_paths, exp_pdb_file_paths and dict kbase_meta_data """ logging.info( f'parsing metadata from input file {metadata_file_path}...') required_columns = [ 'Narrative ID', 'Object name (Genome AMA feature set)', 'Feature ID', 'PDB filename', 'Is model', 'From RCSB' ] pdb_file_paths = list() narrative_ids = list() genome_names = list() feature_ids = list() # df_meta_data is a Panda DataFrame object df_meta_data = self._read_file_by_type(metadata_file_path) df_col_list = df_meta_data.columns.values.tolist() # check if required columns are read in correctly for col in required_columns: if col not in df_col_list: missing_required = f"Required column '{col}' is missing!" raise ValueError(missing_required) df_indexes = df_meta_data.columns for i in range(len(df_meta_data[df_indexes[0]])): narr_id = int(df_meta_data[df_indexes[0]][i]) if not pd.isna(narr_id): narrative_ids.append(narr_id) else: missing_narr_id = "Please fill all the rows in column 'Narrative ID'!" raise ValueError(missing_narr_id) obj_name = df_meta_data[df_indexes[1]][i] if not pd.isna(obj_name): genome_names.append(obj_name) else: missing_obj_name = "Please fill all the rows in column 'Object name'!" raise ValueError(missing_obj_name) feat_id = df_meta_data[df_indexes[2]][i] if not pd.isna(feat_id): feature_ids.append(feat_id) else: missing_feature_id = f"Please fill all the rows in column '{required_columns[2]}'!" raise ValueError(missing_feature_id) pdb_fn = df_meta_data[df_indexes[3]][ i] # pdb_fn does not have staging dir prefix if pd.isna(pdb_fn): missing_pdb_file = f"Please fill all the rows in column '{required_columns[3]}'!" raise ValueError(missing_pdb_file) (struct_name, ext) = os.path.splitext(os.path.basename(pdb_fn)) from_rcsb = df_meta_data[df_indexes[5]][ i] # pdb file source, default to 'yes' if pd.isna(from_rcsb): from_rcsb = 'yes' is_model = df_meta_data[df_indexes[4]][i] if not pd.isna(is_model): pdb_file_paths.append({ 'file_path': pdb_fn, 'structure_name': struct_name, 'narrative_id': narr_id, 'genome_name': obj_name, 'feature_id': feat_id, 'is_model': 'y' in is_model or 'Y' in is_model, 'from_rcsb': 'y' in from_rcsb or 'Y' in from_rcsb }) else: missing_pdb_md = f"Please fill all the rows in columns '{required_columns[4]}'!" raise ValueError(missing_pdb_md) if not pdb_file_paths: error_msg = "No PDB file info is provided!" 
raise ValueError(error_msg) return (pdb_file_paths, narrative_ids, genome_names, feature_ids) def _generate_batch_report(self, workspace_name, structs_ref, structs_name, pdb_infos, failed_pdbs): """ _generate_batch_report: generate summary report for upload """ output_html_files = self._generate_batch_report_html( structs_name, pdb_infos) description = ( f'Imported PDBs into a ProteinStructures object "{structs_ref}", ' f'named "{structs_name}".') if failed_pdbs: failed_files = ','.join(failed_pdbs) description += f' These files "{failed_files}" failed to load.' report_params = { 'message': f'You have uploaded a batch of PDB files into {structs_name}.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': structs_ref, 'description': description }], 'workspace_name': workspace_name, 'report_object_name': 'batch_import_pdb_files_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _write_pdb_htmls(self, output_dir, succ_pdb_infos): """ _write_pdb_htmls: write the batch pdb info as a jQuery DataTable into HTML files """ pdb_html = '' srv_domain = urlparse( self.shock_url).netloc # parse url to get the domain portion srv_base_url = f'https://{srv_domain}' logging.info(f'Get the url for building the anchors: {srv_base_url}') dir_name = os.path.dirname(__file__) molstar_html_file = os.path.join(dir_name, 'templates', 'molstar_viewer.html') molstar_js_file = os.path.join(dir_name, 'templates', 'molstar.js') molstar_css_file = os.path.join(dir_name, 'templates', 'molstar.css') shutil.copy(molstar_html_file, os.path.join(output_dir, 'molstar_viewer.html')) shutil.copy(molstar_js_file, os.path.join(output_dir, 'molstar.js')) shutil.copy(molstar_css_file, os.path.join(output_dir, 'molstar.css')) for succ_pdb in succ_pdb_infos: row_html = '<tr>' file_path = succ_pdb['file_path'] pdb_file_path = succ_pdb[ 'scratch_path'] # This is the scratch path for this pdb file new_pdb_path = os.path.join(output_dir, os.path.basename(file_path)) shutil.copy(pdb_file_path, new_pdb_path) struct_nm = succ_pdb['structure_name'].upper() genome_name = succ_pdb['genome_name'] genome_ref = succ_pdb['genome_ref'] feat_id = succ_pdb['feature_id'] feat_type = succ_pdb['feature_type'] src_rcsb = succ_pdb['from_rcsb'] pdb_chains = [] pdb_models = [] seq_idens = [] if succ_pdb.get('chain_ids', None): pdb_chains = succ_pdb['chain_ids'].split() if succ_pdb.get('model_ids', None): pdb_models = succ_pdb['model_ids'].split() if succ_pdb.get('sequence_identities', None): seq_idens = succ_pdb['sequence_identities'].split() if src_rcsb: row_html += ( f'<td>{struct_nm}<a href="https://www.rcsb.org/3d-view/{struct_nm}"' f' target="_blank"> RCSB Structure</a></td>') else: row_html += (f'<td>{struct_nm}<a href="./molstar_viewer.html"' f' or <a href="molstar_viewer.html"' f' target="_blank"> MolStar Viewer</a></td>') row_html += (f'<td><a href="{srv_base_url}/#dataview/{genome_ref}"' f' target="_blank">{genome_name}</a></td>' f'<td>{feat_id}</td><td>{feat_type}</td>') row_html += f'<td>{pdb_models}</td>' row_html += f'<td>{pdb_chains}</td>' row_html += f'<td>{seq_idens}</td>' row_html += '</tr>' pdb_html += row_html return pdb_html def _generate_batch_report_html(self, prot_structs_name, succ_pdb_infos): """ _generate_batch_report_html: generates the HTML for the upload report """ 
html_report = list() # Make report directory and copy over uploaded pdb files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) # Create the template html file for reporting batch-uploaded pdb files batch_html_report_path = os.path.join(output_directory, 'batch_pdb_viewer.html') pdb_html = self._write_pdb_htmls(output_directory, succ_pdb_infos) # Fetch & fill in detailed info into template HTML with open( os.path.join( os.path.dirname(__file__), 'templates', 'batch_pdb_template.html')) as batch_template_html: batch_html_report = batch_template_html.read()\ .replace('<!--replace this content-->', pdb_html) with open(batch_html_report_path, 'w') as html_report_file: html_report_file.write(batch_html_report) print( f'Full batch_html_report has been written to {batch_html_report_path}' ) html_report.append({ 'path': output_directory, 'name': os.path.basename(batch_html_report_path), 'description': 'HTML report for PDB upload' }) return html_report def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.user_id = config['USER_ID'] self.dfu = DataFileUtil(self.callback_url) self.hs = AbstractHandle(config['handle-service-url']) self.ws_client = Workspace(config['workspace-url']) self.shock_url = config['shock-url'] def import_model_pdb_file(self, params, create_report=True): """ import_model_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ModelProteinStructure object """ logging.info( f'import_model_pdb_file to a pdb data structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params( params) (data, n_polypeptides, params) = self._model_file_to_data(file_path, params) if not data: logging.info( f'PDB file {file_path} import with "Import ModelProteinStructure" failed!' ) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(f'Model structure data:{data}') return data, pdb_info def import_experiment_pdb_file(self, params, create_report=True): """ import_experiment_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ExperimentalProteinStructure object """ logging.info( f'import_experiment_pdb_file to a pdb structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, mmcif_name = self._validate_import_pdb_file_params( params) # Parse the experimental pdb file for an experimental data structure (data, n_polypeptides, params) = self._exp_file_to_data(file_path, params) if not data: logging.info( f'Import {file_path} with "Import ExperimentalProteinStructure" failed!' 
) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(data) return data, pdb_info def _export_pdb(self, params): """ _export_pdb: return the shock_id of the uploaded pdb object """ if "input_ref" not in params: raise ValueError("'input_ref' not in supplied params") return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def _structure_to_pdb_file(self, params): """ _structure_to_pdb_file: get the file path for the given pdb object """ if "input_ref" not in params: raise ValueError("input_ref not in supplied params") if "destination_dir" not in params: raise ValueError("destination_dir not in supplied params") shock_id = self._get_pdb_shock_id(params['input_ref']) file_path = self.dfu.shock_to_file({ 'shock_id': shock_id, 'file_path': params['destination_dir'], 'unpack': 'uncompress' })['file_path'] return {'file_path': file_path} def export_pdb_structures(self, params): """ export_pdb_structures: return the shock_ids of the ProteinStructures object """ if 'input_ref' not in params: raise ValueError("'input_ref' not in supplied params") model_pdbs = [] exp_pdbs = [] # shock_ids = [] for m_pdb in model_pdbs: pass for e_pdb in exp_pdbs: pass return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def batch_import_pdbs(self, params): """ batch_import_pdbs: upload two sets of pdb files and create a KBaseStructure.ProteinStructures object required params: metadata_staging_file_path: a metafile from the user's staging area that must be a subdirectory file path in staging area, e.g., /data/bulk/user_name/metadata_staging_file_path staging_file_subdir_path is metadata_staging_file_path structures_name: name of the ProteinStructures object to be generated workspace_name: workspace name that the protein structure(s) will be saved return: structures_ref: return ProteinStructures object reference report_name: name of generated report (if any) report_ref: report reference (if any) 1. call _validate_batch_import_pdbs_params to validate input params 2. call _parse_metadata to parse for model_pdb_files, exp_pdb_files and kbase_meta_data 3. call import_model_pdb_file on each entry in model_pdb_paths, and call import_experiment_pdb_file on each entry in exp_pdb_paths 4. assemble the data for a ProteinStructures and save the data object 5. 
call _generate_batch_report to generate a report for batch_import_pdbs' result """ (metadata_file_path, workspace_name, structures_name) = self._validate_batch_import_pdbs_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name params['workspace_id'] = workspace_id (pdb_file_paths, narrative_ids, genome_names, feature_ids) = self._parse_metadata_file(metadata_file_path, workspace_id) model_pdb_objects = list() exp_pdb_objects = list() pdb_infos = list() successful_files = list() failed_files = list() protein_structures = dict() total_structures = 0 pdb_params = {} # loop through the list of pdb_file_paths for pdb in pdb_file_paths: pdb_params['pdb_info'] = pdb pdb_params['input_staging_file_path'] = pdb['file_path'] pdb_params['input_file_path'] = None pdb_params['input_shock_id'] = None pdb_params['workspace_name'] = workspace_name pdb_params['structure_name'] = pdb['structure_name'] if pdb['is_model']: model_pdb_data, pdb_info = self.import_model_pdb_file( pdb_params, False) if model_pdb_data: model_pdb_objects.append(model_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) else: exp_pdb_data, pdb_info = self.import_experiment_pdb_file( pdb_params, False) if exp_pdb_data: exp_pdb_objects.append(exp_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) if not model_pdb_objects: logging.info("No model pdb structure was created/saved!") return {} protein_structures['model_structures'] = model_pdb_objects protein_structures['experimental_structures'] = exp_pdb_objects protein_structures['total_structures'] = total_structures protein_structures['description'] = ( f'Created {total_structures} ' f'structures in {structures_name}') logging.info( f'ProteinStructures data structure to be saved:\n{protein_structures}' ) returnVal = {} try: info = self.dfu.save_objects({ 'id': workspace_id, 'objects': [{ 'type': 'KBaseStructure.ProteinStructures', 'name': structures_name, 'data': protein_structures }] })[0] except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: err_msg = f'DFU.save_objects errored with message: {e.message} and data: {e.data}' logging.info(err_msg) raise ValueError(err_msg) else: structs_ref = f"{info[6]}/{info[0]}/{info[4]}" returnVal = {'structures_ref': structs_ref} report_output = self._generate_batch_report( workspace_name, structs_ref, structures_name, pdb_infos, failed_files) returnVal.update(report_output) finally: return returnVal
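# Illustration only: a minimal sketch of the metadata spreadsheet that
# batch_import_pdbs expects (parsed by _read_file_by_type and _parse_metadata_file).
# The column names come from required_columns above; the row values are hypothetical
# examples, and the genome/feature names reuse the ones quoted in the _match_features
# docstring.

import pandas as pd

example_pdb_metadata = pd.DataFrame([{
    'Narrative ID': 57196,                                    # workspace/narrative id
    'Object name (Genome AMA feature set)': 'Synthetic_bacterium_JCVI_Syn3.0_genome',
    'Feature ID': 'JCVISYN3_0004',
    'PDB filename': 'JCVISYN3_0004.pdb',                      # path inside the staging area
    'Is model': 'yes',                                        # any value containing 'y'/'Y' -> model PDB
    'From RCSB': 'no',                                        # left blank, it defaults to 'yes'
}])
# example_pdb_metadata.to_csv('pdb_metadata.csv', index=False)
#
# A hypothetical call (requires a KBase environment and a config dict as in __init__):
# pdb_util = PDBUtil(config)
# result = pdb_util.batch_import_pdbs({
#     'metadata_staging_file_path': 'pdb_metadata.csv',
#     'structures_name': 'my_protein_structures',
#     'workspace_name': 'my_workspace',
# })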
def test_genome_set_input(self):
    # Setup: copy data file to workspace and get workspace id
    path = "data/TestGenome.json"
    ws_path = '/kb/module/work/tmp'
    shutil.copy2(path, ws_path)
    dfu = DataFileUtil(self.callback_url)
    wsName = self.getWsName()
    ws_id = dfu.ws_name_to_id(wsName)

    # Initiate Dictionaries
    genome_dict, genome_set_dict, dfu_genomeset_dict, dfu_genomeset_dict_2, \
        dfu_genome_search_dict, dfu_genome_search_dict_2 = {}, {}, {}, {}, {}, {}

    # Upload genome & genome data dictionary input
    data = json.load(open(path))
    objs1 = [{
        'name': 'genome_test',
        'type': 'KBaseGenomes.Genome',
        'data': data
    }]

    # Create .Genome object in workspace with save_objects
    genome_obj = dfu.save_objects({'id': ws_id, 'objects': objs1})

    # Get .Genome object reference
    genome_info = genome_obj[0]
    genome_ref = str(genome_info[6]) + '/' + str(genome_info[0]) + '/' + str(genome_info[4])

    # Create genome object info dictionary
    genome_dict.update({"label": "GenomeSetTest", "ref": genome_ref})

    # Create genome set object dictionary
    genome_set_dict.update({"description": " ", "items": [genome_dict]})

    # Create DataFileUtil dictionaries for genome set data
    dfu_genomeset_dict.update({
        "type": "KBaseSets.GenomeSet",
        "data": genome_set_dict,
        "name": "Genome_Set_Test"
    })
    dfu_genomeset_dict_2.update({'id': ws_id, 'objects': [dfu_genomeset_dict]})

    # Lastly, create .GenomeSet object with save_objects and get GenomeSet object reference
    genome_set_obj = dfu.save_objects(dfu_genomeset_dict_2)
    genome_set_info = genome_set_obj[0]
    genome_set_ref = str(genome_set_info[6]) + '/' + str(genome_set_info[0]) + '/' + str(genome_set_info[4])

    # Test KBaseSearch.GenomeSet
    genome_set_dict.pop('items', None)
    genome_set_dict['elements'] = {"Set1": genome_dict}

    # Create DataFileUtil dictionaries for KBaseSearch.GenomeSet data
    dfu_genome_search_dict.update({
        "type": "KBaseSearch.GenomeSet",
        "data": genome_set_dict,
        "name": "Genome_Set_Test_2"
    })
    dfu_genome_search_dict_2.update({'id': ws_id, 'objects': [dfu_genome_search_dict]})

    # Lastly, create .GenomeSet object with save_objects and get GenomeSet object reference
    search_genome_obj = dfu.save_objects(dfu_genome_search_dict_2)
    search_genome_info = search_genome_obj[0]
    search_set_ref = str(search_genome_info[6]) + '/' + str(search_genome_info[0]) + '/' + str(search_genome_info[4])

    # Get FASTAS
    ret = self.getImpl().get_fastas(self.callback_url, [genome_set_ref, search_set_ref])
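# Illustration only: the two GenomeSet shapes exercised by the test above, written
# out as plain dictionaries. The reference string is a hypothetical placeholder for
# a real workspace_id/object_id/version triple.

example_genome_ref = '12345/6/7'
genome_item = {'label': 'GenomeSetTest', 'ref': example_genome_ref}

# KBaseSets.GenomeSet keeps an ordered list of items:
kbasesets_genome_set = {'description': ' ', 'items': [genome_item]}

# KBaseSearch.GenomeSet keys the same entries by an element id instead:
kbasesearch_genome_set = {'description': ' ', 'elements': {'Set1': genome_item}}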
def run_MotifSuite(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "motifsuite_seq_input" -> structure: parameter "workspace_name" of String, parameter "genome_ref" of String, parameter "SS_ref" of String, parameter "promoter_length" of Long, parameter "motif_min_length" of Long, parameter "motif_max_length" of Long, parameter "obj_name" of String, parameter "prb" of Double, parameter "motif_length" of Long, parameter "background" of Long, parameter "mask_repeats" of Long, parameter "background_group" of mapping from String to String, parameter "threshold" of Double, parameter "proportion" of Double :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_MotifSuite report = KBaseReport(self.callback_url) mfmd_obj = MotifFindermfmd(self.callback_url) homer_obj = MotifFinderHomer(self.callback_url) meme_obj = MotifFinderMEME(self.callback_url) gibbs_obj = MotifFinderGibbs(self.callback_url) ensemble_obj = MotifEnsemble(self.callback_url) mdscan_obj = MotifFinderMdscan(self.callback_url) sampler_obj = MotifFinderSampler(self.callback_url) p1 = Process(target=homer_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p1.start() p1.join() p2 = Process(target=mfmd_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p2.start() p2.join() p3 = Process(target=meme_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p3.start() p3.join() p4 = Process(target=gibbs_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p4.start() p4.join() p5 = Process(target=mdscan_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p5.start() p5.join() p6 = Process(target=sampler_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p6.start() p6.join() MSU=MotifSuiteUtil() params['motifset_refs']= MSU.get_obj_refs() #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133','29716/72/134','29716/72/135','29716/72/136'] #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133'] print(params['motifset_refs']) #result = ensemble_obj.MotifEnsemble(params) #print('Ensemble RESULT:') #print(result) dms=DownloadMotifSets() MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'],self.callback_url) matchSets = [] threshold = float(params['threshold']) fmu=FastaUtils() for i,MSR1 in enumerate(MotifSetDict.keys()): for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']): for k,MSR2 in enumerate(MotifSetDict.keys()): if k > i: for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']): if fmu.CompareMotifsBP(motif1,motif2,threshold): found1 = False found2 = False index1 = -1 index2 = -1 for m,mset in enumerate(matchSets): if (MSR1,j) in mset: found1 = True index1 = m if(MSR2,l) in mset: found2 = True index2 = m if not found1 and found2: matchSets[index2].add((MSR1,j)) elif not found2 and found1: matchSets[index1].add((MSR2,l)) elif found1 and found2: if index1 != index2: matchSets[index1].union(matchSets[index2]) matchSets.pop(index2) else: matchSets.append(set([(MSR1,j),(MSR2,l)])) numMotifSets = len(params['motifset_refs']) threshold = float(params['proportion']) KeepSets = [] print('NUM MATCHSETS********') print(len(matchSets)) for i,mset in enumerate(matchSets): uniqueRefs = {} for tuple in mset: if tuple[0] not in uniqueRefs: uniqueRefs[tuple[0]] = tuple[0] if float(len(uniqueRefs.keys()))/numMotifSets >= threshold: KeepSets.append(i) print(len(KeepSets)) ESO = {} for 
ref in MotifSetDict: ESO['Condition'] = MotifSetDict[ref]['Condition'] ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref'] ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet']) ESO['Background'] = deepcopy(MotifSetDict[ref]['Background']) break ESO['Motifs'] = [] #Add motifs for keep in KeepSets: motif = fmu.merge(matchSets[keep],MotifSetDict) ESO['Motifs'].append(deepcopy(motif)) #upload new MSO dfu = DataFileUtil(self.callback_url) save_objects_params = {} save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}] info = dfu.save_objects(save_objects_params)[0] obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) htmlDir = self.shared_folder + '/ensemble_html' os.mkdir(htmlDir) mr=MakeNewReport() mr.MakeReport(htmlDir,ESO) try: html_upload_ret = dfu.file_to_shock({'file_path': htmlDir ,'make_handle': 0, 'pack': 'zip'}) except: raise ValueError ('error uploading HTML file to shock') reportName = 'MEMEMotifFinder_report_'+str(uuid.uuid4()) reportObj = {'objects_created': [{'ref' : obj_ref, 'description' : 'Motif Set generated by MEME'}], 'message': '', 'direct_html': None, 'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # attach to report obj reportObj['direct_html'] = '' reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'], 'name': 'index.html', 'label': 'Save promoter_download.zip' } ] report = KBaseReport(self.callback_url, token=ctx['token']) report_info = report.create_extended_report(reportObj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_MotifSuite # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_MotifSuite return value ' + 'output is not type dict as required.') # return the results return [output]
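# Illustration only: run_MotifSuite above launches each motif finder in its own
# multiprocessing.Process and joins it immediately, so the finders run one at a
# time in isolated processes. A standalone sketch of that pattern (the workers are
# hypothetical stand-ins for the DiscoverMotifsFromSequenceSet clients):

from multiprocessing import Process


def run_finders_sequentially(workers, params):
    """Run each worker(params) in its own process, one after another."""
    for worker in workers:
        p = Process(target=worker, args=(params,))
        p.start()
        p.join()  # wait for this finder to finish before launching the next


def run_finders_concurrently(workers, params):
    """Variant (not what run_MotifSuite does): start every process, then join them all."""
    procs = [Process(target=worker, args=(params,)) for worker in workers]
    for p in procs:
        p.start()
    for p in procs:
        p.join()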
def MotifEnsemble(self, ctx, params): """ :param params: instance of type "EnsembleParams" (Internal workflow: 1. Input - list of motifsets , workspace, threshold consensus 2. Download MotifSets -> Utils function 3. Assign motif ids by position in list Use refs to identify MSOs internally! Dictionary of motifsets key: ref, val set list of match sets: each item in the set is a tuple of (ref,index) for each motifset: <- enumerate to avoid duplicate for each motif in motifset for each other motifset: <- enumerate to avoid duplicate for each motif in other: compare(motif1,motif2): if motifs same: search list of sets for motif1: if found add motif2 if not in if not found search list of sets for motif2: if found add motif1 else add a new set with motif1 + motif2) -> structure: parameter "motifset_refs" of list of String, parameter "workspace_name" of String, parameter "threshold" of Double :returns: instance of type "Ensemble_out" -> structure: parameter "motifset_ref" of String """ # ctx is the context object # return variables are: out #BEGIN MotifEnsemble #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.) MotifSetDict = DownloadMotifSet(params['motifset_refs'],self.callback_url) matchSets = [] threshold = float(params['threshold']) for i,MSR1 in enumerate(MotifSetDict.keys()): for j,motif1 in enumerate(MotifSetDict[MSR1]['Motifs']): for k,MSR2 in enumerate(MotifSetDict.keys()): if k > i: for l,motif2 in enumerate(MotifSetDict[MSR2]['Motifs']): if CompareMotifsBP(motif1,motif2,threshold): found1 = False found2 = False index1 = -1 index2 = -1 for m,mset in enumerate(matchSets): if (MSR1,j) in mset: found1 = True index1 = m if(MSR2,l) in mset: found2 = True index2 = m if not found1 and found2: matchSets[index2].add((MSR1,j)) elif not found2 and found1: matchSets[index1].add((MSR2,l)) elif found1 and found2: if index1 != index2: matchSets[index1].union(matchSets[index2]) matchSets.pop(index2) else: matchSets.append(set([(MSR1,j),(MSR2,l)])) numMotifSets = len(params['motifset_refs']) threshold = float(params['proportion']) KeepSets = [] print('NUM MATCHSETS********') print(len(matchSets)) for i,mset in enumerate(matchSets): uniqueRefs = {} for tuple in mset: if tuple[0] not in uniqueRefs: uniqueRefs[tuple[0]] = tuple[0] if float(len(uniqueRefs.keys()))/numMotifSets >= threshold: KeepSets.append(i) print(len(KeepSets)) #handle duplicates... #for i,tuple1 in enumerate(matchSets): # for j,tuple2 in enumerate(matchSets): # if j > i: # if tuple1[0] == tuple2[0]: #handle this.... #how...? 
#merge locations if theyre different #pick one motif by default(p-val) #run motif compare to ensure theyre actually similar enough # print('duplicate') #create new MSO ESO = {} for ref in MotifSetDict: ESO['Condition'] = MotifSetDict[ref]['Condition'] ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref'] ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet']) ESO['Background'] = deepcopy(MotifSetDict[ref]['Background']) break ESO['Motifs'] = [] #Add motifs for keep in KeepSets: motif = merge(matchSets[keep],MotifSetDict) ESO['Motifs'].append(deepcopy(motif)) #upload new MSO dfu = DataFileUtil(self.callback_url) save_objects_params = {} save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) #save_objects_params['id'] = params['workspace_name'] save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}] info = dfu.save_objects(save_objects_params)[0] obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) #create report htmlDir = self.shared_folder + '/ensemble_html' os.mkdir(htmlDir) MakeReport(htmlDir,ESO) try: html_upload_ret = dfu.file_to_shock({'file_path': htmlDir ,'make_handle': 0, 'pack': 'zip'}) except: raise ValueError ('error uploading HTML file to shock') #Create motif set object from MotifList #TODO set parameters correctly #add narrative support to set #MSO = {} #MSO['Condition'] = 'Temp' #MSO['FeatureSet_ref'] = '123' #MSO['Motifs'] = [] #MSO['Alphabet'] = ['A','C','G','T'] #MSO['Background'] = {} #for letter in MSO['Alphabet']: # MSO['Background'][letter] = 0.0 #MSU.parseMotifList(fullMotifList,MSO) #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)) #Pass motif set into this #save_objects_params = {} #save_objects_params['id'] = self.ws_info[0] #save_objects_params['id'] = long(params['workspace_name'].split('_')[1]) #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}] #info = dfu.save_objects(save_objects_params)[0] #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4]) #object_upload_ret = dfu.file_to_shock() reportName = 'MEMEMotifFinder_report_'+str(uuid.uuid4()) reportObj = {'objects_created': [{'ref' : obj_ref, 'description' : 'Motif Set generated by MEME'}], 'message': '', 'direct_html': None, 'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # attach to report obj #reportObj['direct_html'] = None reportObj['direct_html'] = '' reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'], #'name': 'promoter_download.zip', 'name': 'index.html', 'label': 'Save promoter_download.zip' } ] report = KBaseReport(self.callback_url, token=ctx['token']) #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']}) report_info = report.create_extended_report(reportObj) out = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END MotifEnsemble # At some point might do deeper type checking... if not isinstance(out, dict): raise ValueError('Method MotifEnsemble return value ' + 'out is not type dict as required.') # return the results return [out]
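# Illustration only: a standalone sketch of the motif-grouping logic shared by
# MotifEnsemble and run_MotifSuite above. Motifs are identified by
# (motifset_ref, index) tuples, and any two motifs judged similar by `compare`
# (CompareMotifsBP in the code above) end up in the same set. Note this sketch
# merges two existing groups in place with |=; set.union on its own returns a new
# set without modifying either operand.

def group_matching_motifs(motif_sets, compare):
    """motif_sets: {ref: [motif, ...]}; compare(m1, m2) -> bool; returns a list of sets."""
    match_sets = []
    refs = list(motif_sets)
    for i, ref1 in enumerate(refs):
        for j, motif1 in enumerate(motif_sets[ref1]):
            for ref2 in refs[i + 1:]:
                for l, motif2 in enumerate(motif_sets[ref2]):
                    if not compare(motif1, motif2):
                        continue
                    idx1 = next((m for m, s in enumerate(match_sets) if (ref1, j) in s), None)
                    idx2 = next((m for m, s in enumerate(match_sets) if (ref2, l) in s), None)
                    if idx1 is None and idx2 is None:
                        match_sets.append({(ref1, j), (ref2, l)})
                    elif idx1 is None:
                        match_sets[idx2].add((ref1, j))
                    elif idx2 is None:
                        match_sets[idx1].add((ref2, l))
                    elif idx1 != idx2:
                        match_sets[idx1] |= match_sets[idx2]
                        match_sets.pop(idx2)
    return match_sets

# The ensemble step then keeps only the groups whose motifs come from at least
# `proportion` of the input motif sets, e.g.:
# kept = [s for s in match_sets
#         if len({ref for ref, _ in s}) / float(num_motif_sets) >= proportion]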
class FeatureSetBuilder: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_upload_featureset_from_diff_expr_params(self, params): """ _validate_upload_featureset_from_diff_expr_params: validates params passed to upload_featureset_from_diff_expr method """ log('start validating upload_featureset_from_diff_expr params') # check for required parameters for p in ['diff_expression_ref', 'workspace_name', 'p_cutoff', 'q_cutoff', 'fold_change_cutoff']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) p = params.get('fold_scale_type') if p and p != 'logarithm': raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used') @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError("Required keys {} not in supplied parameters" .format(", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning("Unexpected parameter {} supplied".format(param)) def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, workspace_name): """ _generate_report: generate summary report """ log('start creating report') output_html_files = self._generate_html_report(up_feature_set_ref_list, down_feature_set_ref_list) objects_created = list() for up_feature_set_ref in up_feature_set_ref_list: objects_created += [{'ref': up_feature_set_ref, 'description': 'Upper FeatureSet Object'}] for down_feature_set_ref in down_feature_set_ref_list: objects_created += [{'ref': down_feature_set_ref, 'description': 'Lower FeatureSet Object'}] for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list: objects_created += [{'ref': filtered_expression_matrix_ref, 'description': 'Filtered ExpressionMatrix Object'}] report_params = {'message': '', 'workspace_name': workspace_name, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') uppper_feature_content = '' for up_feature_set_ref in up_feature_set_ref_list: feature_set_obj = self.ws.get_objects2({'objects': [{'ref': up_feature_set_ref}]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) uppper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name, len(feature_ids)) lower_feature_content = '' for 
down_feature_set_ref in down_feature_set_ref_list: feature_set_obj = self.ws.get_objects2({'objects': [{'ref': down_feature_set_ref}]})['data'][0] feature_set_data = feature_set_obj['data'] feature_set_info = feature_set_obj['info'] feature_set_name = feature_set_info[1] elements = feature_set_data.get('elements') feature_ids = list(elements.keys()) lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name, len(feature_ids)) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>', uppper_feature_content) report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>', lower_feature_content) result_file.write(report_template) html_report.append({'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report'}) return html_report def _process_diff_expression(self, diff_expression_set_ref, result_directory, condition_label_pair): """ _process_diff_expression: process differential expression object info """ log('start processing differential expression object') diff_expr_set_data = self.ws.get_objects2({'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data'] set_items = diff_expr_set_data['items'] diff_expr_matrix_file_name = 'gene_results.csv' diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name) with open(diff_expr_matrix_file, 'w') as csvfile: fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for set_item in set_items: diff_expression_ref = set_item['ref'] diff_expression_data = self.ws.get_objects2({'objects': [{'ref': diff_expression_ref}]})['data'][0]['data'] label_string = set_item['label'] label_list = [x.strip() for x in label_string.split(',')] condition_1 = label_list[0] condition_2 = label_list[1] if condition_1 in condition_label_pair and condition_2 in condition_label_pair: genome_id = diff_expression_data['genome_ref'] matrix_data = diff_expression_data['data'] selected_diff_expression_ref = diff_expression_ref with open(diff_expr_matrix_file, 'a') as csvfile: row_ids = matrix_data.get('row_ids') row_values = matrix_data.get('values') writer = csv.DictWriter(csvfile, fieldnames=fieldnames) for pos, row_id in enumerate(row_ids): row_value = row_values[pos] writer.writerow({'gene_id': row_id, 'log2_fold_change': row_value[0], 'p_value': row_value[1], 'q_value': row_value[2]}) return diff_expr_matrix_file, genome_id, selected_diff_expression_ref def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name): """ _generate_feature_set: generate FeatureSet object KBaseCollections.FeatureSet type: typedef structure { string description; list<feature_id> element_ordering; mapping<feature_id, list<genome_ref>> elements; } FeatureSet; """ log('start saving KBaseCollections.FeatureSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) elements = {feature_id: [genome_id] for feature_id in feature_ids} feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression', 'element_ordering': feature_ids, 'elements': elements} object_type = 
'KBaseCollections.FeatureSet' save_object_params = { 'id': workspace_id, 'objects': [{'type': object_type, 'data': feature_set_data, 'name': feature_set_name}]} dfu_oi = self.dfu.save_objects(save_object_params)[0] feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) return feature_set_obj_ref def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value, comp_fold_change_cutoff): """ _process_matrix_file: filter matrix file by given cutoffs """ log('start processing matrix file') up_feature_ids = [] down_feature_ids = [] if comp_fold_change_cutoff < 0: comp_fold_change_cutoff = -comp_fold_change_cutoff with open(diff_expr_matrix_file, 'r') as file: reader = csv.DictReader(file) for row in reader: feature_id = row['gene_id'] row_p_value = row['p_value'] row_q_value = row['q_value'] row_fold_change_cutoff = row['log2_fold_change'] null_value = {'NA', 'null', ''} col_value = {row_p_value, row_q_value, row_fold_change_cutoff} if not col_value.intersection(null_value): p_value_condition = float(row_p_value) <= comp_p_value q_value_condition = float(row_q_value) <= comp_q_value up_matches_condition = (p_value_condition and q_value_condition and (float(row_fold_change_cutoff) >= comp_fold_change_cutoff)) down_matches_condition = (p_value_condition and q_value_condition and (float(row_fold_change_cutoff) <= -comp_fold_change_cutoff)) if up_matches_condition: up_feature_ids.append(feature_id) elif down_matches_condition: down_feature_ids.append(feature_id) return list(set(up_feature_ids)), list(set(down_feature_ids)) def _filter_expression_matrix(self, expression_matrix_ref, feature_ids, workspace_name, filtered_expression_matrix_suffix="", diff_expression_matrix_ref=None, filtered_expression_matrix_name=None): """ _filter_expression_matrix: generated filtered expression matrix """ log('start saving ExpressionMatrix object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_matrix_obj = self.dfu.get_objects({'object_refs': [expression_matrix_ref]})['data'][0] expression_matrix_info = expression_matrix_obj['info'] expression_matrix_data = expression_matrix_obj['data'] expression_matrix_name = expression_matrix_info[1] if not filtered_expression_matrix_name: if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name): filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix', filtered_expression_matrix_suffix, expression_matrix_name) else: filtered_expression_matrix_name = expression_matrix_name + \ filtered_expression_matrix_suffix filtered_expression_matrix_data = expression_matrix_data.copy() data = filtered_expression_matrix_data['data'] row_ids = data['row_ids'] values = data['values'] filtered_data = data.copy() filtered_row_ids = list() filtered_values = list() for pos, row_id in enumerate(row_ids): if row_id in feature_ids: filtered_row_ids.append(row_id) filtered_values.append(values[pos]) filtered_data['row_ids'] = filtered_row_ids filtered_data['values'] = filtered_values filtered_expression_matrix_data['data'] = filtered_data expression_obj = {'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data, 'name': filtered_expression_matrix_name} # we now save the filtering DEM in a EM field added for this purpose if diff_expression_matrix_ref: expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref] 
save_object_params = { 'id': workspace_id, 'objects': [expression_obj]} dfu_oi = self.dfu.save_objects(save_object_params)[0] filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) return filtered_expression_matrix_ref def _xor(self, a, b): return bool(a) != bool(b) def _check_input_labels(self, condition_pairs, available_condition_labels): """ _check_input_labels: check input condition pairs """ checked = True for condition_pair in condition_pairs: label_string = condition_pair['label_string'][0].strip() label_list = [x.strip() for x in label_string.split(',')] first_label = label_list[0] second_label = label_list[1] if first_label not in available_condition_labels: error_msg = 'Condition: {} is not availalbe. '.format(first_label) error_msg += 'Available conditions: {}'.format(available_condition_labels) raise ValueError(error_msg) if second_label not in available_condition_labels: error_msg = 'Condition: {} is not availalbe. '.format(second_label) error_msg += 'Available conditions: {}'.format(available_condition_labels) raise ValueError(error_msg) if first_label == second_label: raise ValueError('Input conditions are the same') return checked def _get_condition_labels(self, diff_expression_set_ref): """ _get_condition_labels: get all possible condition label pairs """ log('getting all possible condition pairs') condition_label_pairs = list() available_condition_labels = set() diff_expression_set_obj = self.ws.get_objects2({'objects': [{'ref': diff_expression_set_ref}] })['data'][0] diff_expression_set_data = diff_expression_set_obj['data'] items = diff_expression_set_data.get('items') for item in items: label_string = item['label'] label_list = [x.strip() for x in label_string.split(',')] condition_label_pairs.append(label_list) available_condition_labels |= set(label_list) log('all possible condition pairs:\n{}'.format(condition_label_pairs)) return condition_label_pairs, available_condition_labels def _get_feature_ids(self, genome_ref, ids): """ _get_feature_ids: get feature ids from genome """ genome_features = self.gsu.search({'ref': genome_ref, 'limit': len(ids), 'structured_query': {"$or": [{"feature_id": x} for x in ids]}, 'sort_by': [['feature_id', True]]})['features'] features_ids = set((feature.get('feature_id') for feature in genome_features)) return features_ids def _build_fs_obj(self, params): new_feature_set = { 'description': '', 'element_ordering': [], 'elements': {} } genome_ref = params['genome'] if params.get('base_feature_sets', []) and None not in params['base_feature_sets']: base_feature_sets = self.dfu.get_objects( {'object_refs': params['base_feature_sets']} )['data'] for ret in base_feature_sets: base_set = ret['data'] base_set_name = ret['info'][1] new_feature_set['element_ordering'] += [x for x in base_set['element_ordering'] if x not in new_feature_set['elements']] for element, genome_refs in base_set['elements'].items(): if element in new_feature_set['elements']: new_feature_set['elements'][element] += [x for x in genome_refs if x not in new_feature_set['elements'][ element]] else: new_feature_set['elements'][element] = genome_refs new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format( base_set_name, base_set.get('description')) new_feature_ids = [] if params.get('feature_ids'): if isinstance(params['feature_ids'], str): new_feature_ids += params['feature_ids'].split(',') else: new_feature_ids += params['feature_ids'] if params.get('feature_ids_custom'): new_feature_ids += params['feature_ids_custom'].split(',') if 
new_feature_ids: genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids) for new_feature in new_feature_ids: if new_feature not in genome_feature_ids: raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format( new_feature, genome_ref)) if new_feature in new_feature_set['elements']: if genome_ref not in new_feature_set['elements'][new_feature]: new_feature_set['elements'][new_feature].append(genome_ref) else: new_feature_set['elements'][new_feature] = [genome_ref] new_feature_set['element_ordering'].append(new_feature) if params.get('description'): new_feature_set['description'] = params['description'] return new_feature_set def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.ws = Workspace(self.ws_url, token=self.token) self.dfu = DataFileUtil(self.callback_url) self.gsu = GenomeSearchUtil(self.callback_url) self.scratch = config['scratch'] def upload_featureset_from_diff_expr(self, params): """ upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression based on given threshold cutoffs required params: diff_expression_ref: DifferetialExpressionMatrixSet object reference expression_matrix_ref: ExpressionMatrix object reference p_cutoff: p value cutoff q_cutoff: q value cutoff fold_scale_type: one of ["linear", "log2+1", "log10+1"] fold_change_cutoff: fold change cutoff feature_set_suffix: Result FeatureSet object name suffix filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix workspace_name: the name of the workspace it gets saved to return: result_directory: folder path that holds all files generated up_feature_set_ref_list: list of generated upper FeatureSet object reference down_feature_set_ref_list: list of generated down FeatureSet object reference filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ self._validate_upload_featureset_from_diff_expr_params(params) diff_expression_set_ref = params.get('diff_expression_ref') diff_expression_set_info = self.ws.get_object_info3({"objects": [{"ref": diff_expression_set_ref}]} )['infos'][0] diff_expression_set_name = diff_expression_set_info[1] result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) (available_condition_label_pairs, available_condition_labels) = self._get_condition_labels(diff_expression_set_ref) run_all_combinations = params.get('run_all_combinations') condition_pairs = params.get('condition_pairs') if not self._xor(run_all_combinations, condition_pairs): error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' " error_msg += "or provide partial condition pairs. 
Don't do both or neither" raise ValueError(error_msg) if run_all_combinations: condition_label_pairs = available_condition_label_pairs else: if self._check_input_labels(condition_pairs, available_condition_labels): condition_label_pairs = list() for condition_pair in condition_pairs: label_string = condition_pair['label_string'][0].strip() condition_labels = [x.strip() for x in label_string.split(',')] condition_label_pairs.append(condition_labels) up_feature_set_ref_list = list() down_feature_set_ref_list = list() filtered_expression_matrix_ref_list = list() for condition_label_pair in condition_label_pairs: condition_string = '-'.join(reversed(condition_label_pair)) diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression( diff_expression_set_ref, result_directory, condition_label_pair) up_feature_ids, down_feature_ids = self._process_matrix_file( diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'), params.get('fold_change_cutoff')) filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix') if params.get('expression_matrix_ref'): filtered_expression_matrix_ref = self._filter_expression_matrix( params.get('expression_matrix_ref'), up_feature_ids + down_feature_ids, params.get('workspace_name'), "", diff_expr_matrix_ref, filtered_em_name) filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref) feature_set_suffix = params.get('feature_set_suffix', "") up_feature_set_name = "{}_{}_up{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) up_feature_set_ref = self._generate_feature_set(up_feature_ids, genome_id, params.get('workspace_name'), up_feature_set_name) up_feature_set_ref_list.append(up_feature_set_ref) down_feature_set_name = "{}_{}_down{}".format( diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix) down_feature_set_ref = self._generate_feature_set(down_feature_ids, genome_id, params.get('workspace_name'), down_feature_set_name) down_feature_set_ref_list.append(down_feature_set_ref) returnVal = {'result_directory': result_directory, 'up_feature_set_ref_list': up_feature_set_ref_list, 'down_feature_set_ref_list': down_feature_set_ref_list, 'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list} report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list, filtered_expression_matrix_ref_list, params.get('workspace_name')) returnVal.update(report_output) return returnVal def filter_matrix_with_fs(self, params): self.validate_params(params, ('feature_set_ref', 'workspace_name', 'expression_matrix_ref', 'filtered_expression_matrix_suffix')) ret = self.dfu.get_objects( {'object_refs': [params['feature_set_ref']]} )['data'][0] feature_set = ret['data'] feature_set_name = ret['info'][1] feature_ids = set(feature_set['elements'].keys()) filtered_matrix_ref = self._filter_expression_matrix( params['expression_matrix_ref'], feature_ids, params['workspace_name'], params['filtered_expression_matrix_suffix']) objects_created = [{'ref': filtered_matrix_ref, 'description': 'Filtered ExpressionMatrix Object'}] message = "Filtered Expression Matrix based of the {} feature ids present in {}"\ .format(len(feature_ids), feature_set_name) report_params = {'message': message, 'workspace_name': params['workspace_name'], 'objects_created': objects_created, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = 
kbase_report_client.create_extended_report(report_params) return {'filtered_expression_matrix_ref': filtered_matrix_ref, 'report_name': output['name'], 'report_ref': output['ref']} def build_feature_set(self, params): self.validate_params(params, {'output_feature_set', 'workspace_name', }, {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets', 'description'}) feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets') if not any([params.get(x) for x in feature_sources]): raise ValueError("You must supply at least one feature source: {}".format( ", ".join(feature_sources))) workspace_id = self.dfu.ws_name_to_id(params['workspace_name']) new_feature_set = self._build_fs_obj(params) save_object_params = { 'id': workspace_id, 'objects': [{'type': 'KBaseCollections.FeatureSet', 'data': new_feature_set, 'name': params['output_feature_set']}]} dfu_oi = self.dfu.save_objects(save_object_params)[0] feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) objects_created = [{'ref': feature_set_obj_ref, 'description': 'Feature Set'}] message = 'A new feature set containing {} features was created.'.format( len(new_feature_set['elements'])) report_params = {'message': message, 'workspace_name': params['workspace_name'], 'objects_created': objects_created, 'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) return {'feature_set_ref': feature_set_obj_ref, 'report_name': output['name'], 'report_ref': output['ref']}
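# A minimal sketch of the object-reference pattern used throughout the methods above:
# DataFileUtil.save_objects returns a standard object_info tuple, and the code builds a
# 'wsid/objid/version' string from indices 6, 0 and 4 of that tuple. The helper name
# below is hypothetical and only illustrates that convention.
def obj_ref_from_info(info):
    """Assemble a 'wsid/objid/version' reference from a workspace object_info tuple."""
    return '{}/{}/{}'.format(info[6], info[0], info[4])

# e.g.  dfu_oi = self.dfu.save_objects(save_object_params)[0]
#       feature_set_obj_ref = obj_ref_from_info(dfu_oi)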
def test_update_taxon_assignments_valid(self):
    """
    Test a valid call to the update_taxon_assignments method.
    """
    taxon_key = str(uuid4())
    taxon_val = str(uuid4())
    taxon_val_new = str(uuid4())
    # Copy the object to the test workspace
    dfu = DataFileUtil(self.callbackURL)
    obj_ref = f"{_WORKSPACE_NAME}/{_OBJECT_NAME}"
    result = dfu.get_objects({'object_refs': [obj_ref]})['data'][0]
    obj_data = result['data']
    # Create a user-owned handle in the object and update it
    hs = HandleService(self.handleURL)
    prev_handle_id = obj_data['genbank_handle_ref']
    prev_shock_id = hs.hids_to_handles([prev_handle_id])[0]['id']
    new_handle_id = dfu.own_shock_node({
        'shock_id': prev_shock_id,
        'make_handle': 1
    })['handle']['hid']
    obj_data['genbank_handle_ref'] = new_handle_id
    # Save the new object in the test workspace
    obj_info = result['info']
    new_obj = {
        'type': obj_info[2],
        'data': obj_data,
        'name': 'GCF_002287175.1'
    }
    test_ws_id = dfu.ws_name_to_id(self.wsName)
    infos = dfu.save_objects({'id': test_ws_id, 'objects': [new_obj]})
    obj_ref = f"{infos[0][6]}/{infos[0][0]}/{infos[0][4]}"
    new_ws_id = infos[0][6]
    new_obj_id = infos[0][0]
    get_obj_params = {
        'wsid': new_ws_id,
        'objid': new_obj_id,
        'included': ['/taxon_assignments']
    }
    # Add a new assignment
    self.serviceImpl.update_taxon_assignments(
        self.ctx, {
            'workspace_id': new_ws_id,
            'object_id': new_obj_id,
            'taxon_assignments': {
                taxon_key: taxon_val
            }
        })
    # Fetch the object and check the mapping
    obj = self.wsClient.get_objects2({'objects': [get_obj_params]})['data'][0]['data']
    self.assertTrue(taxon_key in obj['taxon_assignments'])
    self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val)
    # Update the assignment we just added
    self.serviceImpl.update_taxon_assignments(
        self.ctx, {
            'workspace_id': new_ws_id,
            'object_id': new_obj_id,
            'taxon_assignments': {
                taxon_key: taxon_val_new
            }
        })
    # Fetch the object and check the mapping
    obj = self.wsClient.get_objects2({'objects': [get_obj_params]})['data'][0]['data']
    self.assertTrue(taxon_key in obj['taxon_assignments'])
    self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val_new)
    # Remove the assignment we just added
    self.serviceImpl.update_taxon_assignments(
        self.ctx, {
            'workspace_id': new_ws_id,
            'object_id': new_obj_id,
            'remove_assignments': [taxon_key]
        })
    # Fetch the object and check the mapping
    obj = self.wsClient.get_objects2({'objects': [get_obj_params]})['data'][0]['data']
    self.assertTrue(taxon_key not in obj['taxon_assignments'])
    self.assertEqual(obj['taxon_assignments'].get(taxon_key), None)
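# Hedged sketch of the two parameter shapes the test above exercises against
# update_taxon_assignments; the workspace/object ids and the taxon key/value are
# placeholders (the test itself uses random UUID strings).
add_or_update_params = {
    'workspace_id': 12345,                      # placeholder workspace id
    'object_id': 67,                            # placeholder object id
    'taxon_assignments': {'some_key': 'some_value'},  # keys added or overwritten
}
remove_params = {
    'workspace_id': 12345,
    'object_id': 67,
    'remove_assignments': ['some_key'],         # keys deleted from the mapping
}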
class DataUtil: @staticmethod def _find_between(s, start, end): """ _find_between: find string in between start and end """ return re.search('{}(.*){}'.format(start, end), s).group(1) def _find_constraints(self, obj_type): """ _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required) """ type_info = self.wsClient.get_type_info(obj_type) type_desc = type_info.get('description') constraints = {} for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'): constraints[tag] = [ line.strip().split()[1:] for line in type_desc.split("\n") if line.startswith(f'@{tag}') ] return constraints def _filter_constraints(self, constraints, data): """filters out constraints with missing keys""" contains_constraints = constraints.get('contains') # exit(contains_constraints) [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'], ['values(row_mapping)', 'row_attributemapping_ref:instances'], ['values(col_mapping)', 'col_attributemapping_ref:instances']] filtered_constraints = [] for contains_constraint in contains_constraints: in_values = contains_constraint[1:] #exit(in_values) ['row_mapping'] missing_key = True for in_value in in_values: # exit(in_value) row_mapping if in_value.startswith('values'): search_value = re.search('{}(.*){}'.format('\(', '\)'), in_value).group(1) unique_list = search_value.split('.') key = unique_list[0] elif ':' in in_value: key = in_value.split(':')[0] else: unique_list = in_value.split('.') key = unique_list[0] if key in data: missing_key = False break if missing_key: filtered_constraints.append(contains_constraint) for x in filtered_constraints: contains_constraints.remove(x) #exit(constraints) ''' {'contains': [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'], ['values(row_mapping)', 'row_attributemapping_ref:instances'], ['values(col_mapping)', 'col_attributemapping_ref:instances']], 'rowsum': [], 'unique': [['data.row_ids'], ['data.col_ids']], 'conditionally_required': [['row_attributemapping_ref', 'row_mapping'], ['col_attributemapping_ref', 'col_mapping']]} ''' return constraints def _retrieve_value(self, data, value): """Parse the provided 'data' object to retrieve the item in 'value'.""" logging.info('Getting value for {}'.format(value)) retrieve_data = [] #exit(data) ''' {'row_attributemapping_ref': '44071/19/157', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5'}, 'col_attributemapping_ref': '44071/20/79', 'col_mapping': {'Sample1': 'Sample1', 'Sample2': 'Sample2', 'Sample3': 'Sample3', 'Sample4': 'Sample4', 'Sample5': 'Sample5', 'Sample6': 'Sample6'}, 'attributes': {'generated_by': 'QIIME revision XYZ'}, 'data': {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}, 'search_attributes': ['generated_by|QIIME revision XYZ'], 'scale': 'raw', 'description': 'OTU data'} ''' m_data = DotMap(data) #exit(m_data) ''' DotMap(row_attributemapping_ref='44071/19/158', row_mapping=DotMap(GG_OTU_1='GG_OTU_1', GG_OTU_2='GG_OTU_2', GG_OTU_3='GG_OTU_3', GG_OTU_4='GG_OTU_4', GG_OTU_5='GG_OTU_5'), col_attributemapping_ref='44071/20/80', col_mapping=DotMap(Sample1='Sample1', Sample2='Sample2', Sample3='Sample3', Sample4='Sample4', Sample5='Sample5', 
Sample6='Sample6'), attributes=DotMap(generated_by='QIIME revision XYZ'), data=DotMap(row_ids=['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], col_ids=['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], values=[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]), search_attributes=['generated_by|QIIME revision XYZ'], scale='raw', description='OTU data') ''' #exit(value) data.row_ids if value.startswith('set('): retrieve_data = value[4:-1].split(",") elif value.startswith( 'values('): # TODO: nested values e.g. values(values(ids)) search_value = re.search('{}(.*){}'.format('\(', '\)'), value).group(1) unique_list = search_value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp.values()) elif ':' in value: obj_ref = getattr(m_data, value.split(':')[0]) if obj_ref: included = value.split(':')[1] included = '/' + included.replace('.', '/') ref_data = self.wsClient.get_objects2( {'objects': [{ 'ref': obj_ref, 'included': [included] }]})['data'][0]['data'] m_ref_data = DotMap(ref_data) if ref_data: if '*' not in included: for key in included.split('/')[1:]: m_ref_data = getattr(m_ref_data, key) else: keys = included.split('/')[1:] m_ref_data = [ x.get(keys[2]) for x in ref_data.get(keys[0]) ] # TODO: only works for 2 level nested data like '/features/[*]/id' retrieve_data = list(m_ref_data) else: unique_list = value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp) logging.info('Retrieved value (first 20):\n{}\n'.format( retrieve_data[:20])) #exit(retrieve_data) ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'] return retrieve_data def _validate(self, constraints, data): """ _validate: validate data """ #exit(constraints) ''' {'contains': [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'], ['values(row_mapping)', 'row_attributemapping_ref:instances'], ['values(col_mapping)', 'col_attributemapping_ref:instances']], 'rowsum': [], 'unique': [['data.row_ids'], ['data.col_ids']], 'conditionally_required': [['row_attributemapping_ref', 'row_mapping'], ['col_attributemapping_ref', 'col_mapping']]} ''' validated = True failed_constraints = defaultdict(list) unique_constraints = constraints.get('unique') #exit(unique_constraints) [['data.row_ids'], ['data.col_ids']] for unique_constraint in unique_constraints: retrieved_value = self._retrieve_value(data, unique_constraint[0]) #exit(retrieved_value) ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'] if len(set(retrieved_value)) != len(retrieved_value): validated = False failed_constraints['unique'].append(unique_constraint[0]) contains_constraints = constraints.get('contains') #exit(contains_constraints) [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'], ['values(row_mapping)', 'row_attributemapping_ref:instances'], ['values(col_mapping)', 'col_attributemapping_ref:instances']] for contains_constraint in contains_constraints: value = contains_constraint[0] in_values = contains_constraint[1:] retrieved_in_values = [] for in_value in in_values: retrieved_in_values += self._retrieve_value(data, in_value) if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)): validated = False failed_constraints['contains'].append( " ".join(contains_constraint)) conditional_constraints = 
constraints.get('conditionally_required') #exit(conditional_constraints) [['row_attributemapping_ref', 'row_mapping'], ['col_attributemapping_ref', 'col_mapping']] for conditional_constraint in conditional_constraints: trigger = conditional_constraint[0] required_keys = conditional_constraint[1:] if trigger in data: missing_keys = [ key for key in required_keys if key not in data ] if missing_keys: validated = False failed_constraints['conditionally_required'].append( (trigger, required_keys, missing_keys)) return validated, failed_constraints @staticmethod def _raise_validation_error(params, validate): """Raise a meaningful error message for failed validation""" logging.error('Data failed type checking') failed_constraints = validate.get('failed_constraints') error_msg = [ 'Object {} failed type checking:'.format(params.get('obj_name')) ] if failed_constraints.get('unique'): unique_values = failed_constraints.get('unique') error_msg.append( 'Object should have unique field: {}'.format(unique_values)) if failed_constraints.get('contains'): contained_values = failed_constraints.get('contains') for contained_value in contained_values: subset_value = contained_value.split(' ')[0] super_value = ' '.join(contained_value.split(' ')[1:]) if 'col_mapping' in super_value: error_msg.append( 'Column attribute mapping instances should contain all ' 'column index from original data') if 'row_mapping' in super_value: error_msg.append( 'Row attribute mapping instances should contain all row ' 'index from original data') error_msg.append( 'Object field [{}] should contain field [{}]'.format( super_value, subset_value)) for failure in failed_constraints.get('conditionally_required', []): error_msg.append( 'If object field "{}" is present than object field(s) {} should ' 'also be present. Object is missing {}'.format(*failure)) raise ValueError('\n'.join(error_msg)) def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.scratch = config['scratch'] self.serviceWizardURL = config['srv-wiz-url'] self.wsClient = workspaceService(self.ws_url, token=self.token) self.dfu = DataFileUtil(self.callback_url) self.generics_service = GenericsService(self.serviceWizardURL) def list_generic_types(self, params=None): """ *Not yet exposed in spec* list_generic_types: lists the current valid generics types arguments: none return: A list of generic types in the current environment """ returnVal = [ x['type_def'] for module in GENERICS_MODULES for x in self.wsClient.get_all_type_info(module) ] return returnVal def fetch_data(self, params): #exit(params) {'obj_ref': '44071/21/241'} """ fetch_data: fetch generics data as pandas dataframe for a generics data object arguments: obj_ref: generics object reference optional arguments: generics_module: the generics data module to be retrieved from e.g. 
for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; generics_module should be {'data': 'FloatMatrix2D', 'condition_set_ref': 'condition_set_ref'} return: data_matrix: a pandas dataframe in json format """ for p in ['obj_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) #exit(self.generics_service.fetch_data(params)) {'data_matrix': '{"Sample1":{"GG_OTU_1":0.0,"GG_OTU_2":5.0,"GG_OTU_3":0.0,"GG_OTU_4":2.0,"GG_OTU_5":0.0},"Sample2":{"GG_OTU_1":0.0,"GG_OTU_2":1.0,"GG_OTU_3":0.0,"GG_OTU_4":1.0,"GG_OTU_5":1.0},"Sample3":{"GG_OTU_1":1.0,"GG_OTU_2":0.0,"GG_OTU_3":1.0,"GG_OTU_4":1.0,"GG_OTU_5":1.0},"Sample4":{"GG_OTU_1":0.0,"GG_OTU_2":2.0,"GG_OTU_3":4.0,"GG_OTU_4":0.0,"GG_OTU_5":0.0},"Sample5":{"GG_OTU_1":0.0,"GG_OTU_2":3.0,"GG_OTU_3":2.0,"GG_OTU_4":0.0,"GG_OTU_5":0.0},"Sample6":{"GG_OTU_1":0.0,"GG_OTU_2":1.0,"GG_OTU_3":0.0,"GG_OTU_4":1.0,"GG_OTU_5":0.0}}'} return self.generics_service.fetch_data(params) def validate_data(self, params): """ validate_data: validate data arguments: obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1' data: obj data to be validated return: validated: True or False """ constraints = self._find_constraints(params.get('obj_type')) data = params.get('data') constraints = self._filter_constraints(constraints, data) validated, failed_constraints = self._validate(constraints, data) return { 'validated': validated, 'failed_constraints': failed_constraints } def save_object(self, params): """ save_object: validate data constraints and save matrix object arguments: obj_type: saving object data type obj_name: saving object name data: data to be saved workspace_name: workspace name matrix object to be saved to return: obj_ref: object reference """ logging.info('Starting saving object') obj_type = params.get('obj_type') module_name = obj_type.split('.')[0] type_name = obj_type.split('.')[1] types = self.wsClient.get_module_info({ 'mod': module_name }).get('types') for module_type in types: if self._find_between(module_type, '\.', '\-') == type_name: obj_type = module_type break data = dict((k, v) for k, v in params.get('data').items() if v) validate = self.validate_data({'obj_type': obj_type, 'data': data}) if not validate.get('validated'): self._raise_validation_error(params, validate) workspace_name = params.get('workspace_name') if not isinstance(workspace_name, int): ws_name_id = self.dfu.ws_name_to_id(workspace_name) else: ws_name_id = workspace_name info = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data": data, "name": params.get('obj_name') }] })[0] return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
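# Illustrative sketch of how _find_constraints above turns annotation lines in a
# workspace type description into constraint lists. The description text below is
# made up; the parsing loop mirrors the one in _find_constraints.
example_desc = """
@unique data.row_ids
@unique data.col_ids
@contains data.row_ids row_mapping
"""
constraints = {}
for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
    constraints[tag] = [
        line.strip().split()[1:]
        for line in example_desc.split("\n")
        if line.startswith(f'@{tag}')
    ]
# constraints['unique']   -> [['data.row_ids'], ['data.col_ids']]
# constraints['contains'] -> [['data.row_ids', 'row_mapping']]
# constraints['rowsum'] and constraints['conditionally_required'] stay empty here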
class FileUtil: def _validate_import_file_params(self, params): """ _validate_import_matrix_from_excel_params: validates params passed to import_matrix_from_excel method """ # check for required parameters for p in ['msa_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) return file_path, params['workspace_name'], params['msa_name'] def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info('Start uploading file to shock: {}'.format(file_path)) file_to_shock_params = { 'file_path': file_path, 'pack': 'gzip', 'make_handle': True, } shock_id = self.dfu.file_to_shock(file_to_shock_params)['shock_id'] return shock_id @staticmethod def _infer_seq_type(msa): dna_set = {"A", "C", "G", "T", "-"} seq_chars = {char for record in msa for char in record.seq} if seq_chars - dna_set: return "protein" else: return "dna" def _file_to_data(self, file_path, format='fasta'): """Do the file conversion""" data = { 'alignment': {}, 'default_row_labels': {}, 'row_order': [], } msa = AlignIO.read(file_path, format) data['alignment_length'] = msa.get_alignment_length() data['sequence_type'] = self._infer_seq_type(msa) for record in msa: data['row_order'].append(record.id) data['default_row_labels'][record.id] = record.description data['alignment'][record.id] = str(record.seq) message = f'A Multiple Sequence Alignment with {len(data["alignment"])} sequences and ' \ f'an alignment length of {data["alignment_length"]} was produced' return data, message def _generate_report(self, msa_ref, workspace_name, message): """ _generate_report: generate summary report for upload """ report_params = { 'message': message, 'objects_created': [{ 'ref': msa_ref, 'description': 'Imported MSA' }], 'workspace_name': workspace_name, 'report_object_name': f'import_msa_file_{uuid.uuid4()}' } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _get_object(self, params): ret = self.dfu.get_objects({'object_refs': [params['input_ref']]})['data'][0] obj_name = ret['info'][1] obj_data = ret['data'] return obj_name, obj_data def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) def import_fasta_file(self, params): file_path, workspace_name, msa_name = self._validate_import_file_params( params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name data, message = self._file_to_data(file_path, params.get('file_format', 'fasta')) data['description'] = params.get('description', '') info = self.dfu.save_objects({ 'id': workspace_id, 'objects': [{ 'type': 'KBaseTrees.MSA', 'name': msa_name, 'data': 
data
            }]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'msa_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name, message)
        returnVal.update(report_output)

        return returnVal

    def msa_to_file(self, params, file_type='fasta'):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        obj_name, obj_data = self._get_object(params)
        # fall back to the alignment keys when no explicit row order is stored
        keys = obj_data.get('row_order', obj_data['alignment'].keys())
        row_labels = obj_data.get('default_row_labels', {})
        file_path = os.path.join(self.scratch, f'{obj_name}.{file_type}')

        seq_type = generic_protein if obj_data.get(
            'sequence_type') == "protein" else generic_dna

        msa = MultipleSeqAlignment([
            SeqRecord(Seq(obj_data['alignment'][key], seq_type),
                      id=key,
                      description=row_labels.get(key, ''))
            for key in keys
        ])

        AlignIO.write(msa, file_path, file_type)

        return {'file_path': file_path}

    def msa_to_clustal_file(self, params):
        raise NotImplementedError

    def export_file(self, params, file_type='fasta'):
        params['destination_dir'] = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(params['destination_dir'])

        file_path = self.msa_to_file(params, file_type)['file_path']

        return {'shock_id': self._upload_to_shock(file_path)}
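# Hedged usage sketch for FileUtil.import_fasta_file. The file path, workspace name
# and object names are placeholders; parameter names follow _validate_import_file_params
# above (one of input_file_path / input_shock_id / input_staging_file_path is required,
# and file_format defaults to 'fasta').
params = {
    'input_file_path': '/path/to/alignment.fasta',
    'workspace_name': 'my_workspace',
    'msa_name': 'my_msa',
    'description': 'example alignment',
}
# file_util = FileUtil(config)          # config with SDK_CALLBACK_URL, scratch, KB_AUTH_TOKEN
# result = file_util.import_fasta_file(params)
# result['msa_obj_ref'], result['report_name'], result['report_ref']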
class GFFUtils2:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        #self.shared_folder = "/kb/module/work"
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        outfile = os.path.join(self.genome_dir, 'out.gff')
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'
        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            out, err = p.communicate()
            o.close()
        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        out2, err2 = bgzip.communicate()
        outfile += '.gz'
        return outfile

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start'])+int(feature['location'][0]['length'])

                    metainfo = "ID="+feature['feature_id']
                    if feature['function']:
                        metainfo += ';FUNCTION='+feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr'+str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0'+str(contig_id)]) + start
                                except KeyError:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise

                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('
                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'
                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                        'KBase\tgene\t' + \
                        str(feature['location'][0]['start']) + '\t' + \
                        str(end) + '\t.\t' + \
                        str(feature['location'][0]['strand']) + '\t' + \
                        str(global_pos) + '\t' + \
                        str(metainfo) + '\n'
                    f.write(constructed_gff_line)
            f.close()
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension

    def find_gene_info(self, row):
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500
                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)
                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'],
                                         index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]], index=['GENEID', 'NEIGHBORGENE',
'FUNCTION']) else: q3 = self._process_tabix_results(tbresult3) return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION']) else: q2 = self._process_tabix_results(tbresult2) return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION']) else: q = self._process_tabix_results(tbresult) return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION']) def get_gwas_result_file(self, association_ref, association_name, p_value): #association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]['data']['data'] association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0] association_results = association_obj['data']["association_details"][0]["association_results"] result = "CHR\tSNP\tPOS\tP\tBP\n" for variation in association_results: if (float(variation[3]) > float(p_value)): continue result += str(variation[0]) + "\t" result += str(variation[1]) + "\t" result += str(variation[2]) + "\t" result += str(variation[3]) + "\t" result += str(variation[2]) + "\n" filepath = os.path.join(self.genome_dir, association_name) with open(filepath, "w") as file1: file1.write(result) return (filepath) def build_featureset(self, filepath, genome_ref, description, workspace_name, association_name, prefix): gene_ids = dict() element_ordering = list() elements = dict() skip_words = ["GENEID", "NEIGHBORGENE", "NA"] with open(filepath, 'r') as reader: for line in reader: fields = line.split("\t") condition1 = fields[5] not in skip_words condition2 = fields[5] not in elements condition3 = fields[6] not in skip_words condition4 = fields[6] not in elements if condition1 and condition2: element_ordering.append(fields[5]) elements[fields[5]] = [genome_ref] if condition3 and condition4: element_ordering.append(fields[6]) elements[fields[6]] = [genome_ref] featureset = dict() featureset['description'] = description featureset['element_ordering'] = element_ordering featureset['elements'] = elements ws_id = self.dfu.ws_name_to_id(workspace_name) featureset_obj_name = prefix + str(association_name) save_info = self.dfu.save_objects( { 'id': ws_id, 'objects': [ {'type': 'KBaseCollections.FeatureSet', 'data': featureset, 'name': featureset_obj_name}]})[0] obj_ref = "{0}/{1}/{2}".format( save_info[6], save_info[0], save_info[4] ) return obj_ref def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name, prefix, p_value): #TODO: Send outfile to prep gff function inseted of hardcord #TODO: Removed hard coded stuff and create new directory for each test function self.genome_dir_name = "_".join(genome_ref.split("/")) self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name) if not os.path.isdir(self.genome_dir): os.mkdir(self.genome_dir) sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz') self.sorted_gff = sorted_gff_path if not os.path.exists(sorted_gff_path): feature_num = self.gsu.search({'ref': genome_ref})['num_found'] # get genome features for gff construction genome_features = self.gsu.search({ 'ref': genome_ref, 'limit': feature_num, #'sort_by': [['feature_id', True]] })['features'] assembly_ref = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': genome_ref }])[0]['data']['assembly_ref'] # get assembly contigs for base length calculations assembly_contigs = self.wsc.get_object_subset([{ 'included': ['/contigs'], 'ref': assembly_ref }])[0]['data']['contigs'] contig_ids = list(assembly_contigs.keys()) contig_ids.sort() contig_base_lengths = {} prev_length = 0 for contig in contig_ids: contig_base_lengths[contig] = 
prev_length prev_length += assembly_contigs[contig]['length'] gff_file = os.path.join(self.genome_dir, 'constructed.gff') constructed_gff = self._construct_gff_from_json(genome_features, gff_file, contig_base_lengths) self.sorted_gff = self._prep_gff(constructed_gff) tabix_index(self.sorted_gff) obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]}) association_name =obj_info["infos"][0][1] gwas_results_file = self.get_gwas_result_file(association_ref, association_name, p_value) gwas_results = pd.read_csv(gwas_results_file, sep='\t') gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \ gwas_results.apply(self.find_gene_info, axis=1) new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..')) fname = 'final_' + association_name new_results_path = os.path.join(new_results_path, fname ) gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False) description = "Genelist for GWAS results of trait " + association_name featureset_obj = self.build_featureset( new_results_path, genome_ref, description, workspace_name, association_name, prefix) return featureset_obj
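# Hedged sketch of one line written by _construct_gff_from_json above: tab-separated
# contig id, source ('KBase'), type ('gene'), start, end, '.', strand, the cumulative
# global position and an ID/FUNCTION attribute string. Column 9 (index 8) is what
# _process_tabix_results later splits on ';' to strip the 'ID=' and 'FUNCTION=' prefixes.
# The values below are illustrative only.
example_gff_line = '\t'.join([
    'Chr1',                                       # contig id
    'KBase', 'gene',
    '1000',                                       # feature start
    '2200',                                       # start + length
    '.',
    '+',                                          # strand
    '1001000',                                    # contig base offset + start
    'ID=gene_1;FUNCTION=hypothetical protein',    # metainfo attributes
])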
class ImportAttributeMappingUtil: def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.dfu = DataFileUtil(self.callback_url) self.genapi = GenericsAPI(self.callback_url) self.uploader_utils = UploaderUtil(config) def import_attribute_mapping_from_staging(self, params): """ import_attribute_mapping_from_staging: wrapper method for fba_tools.tsv_file_to_attribute_mapping required params: staging_file_subdir_path - subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name attribute_mapping_name: output conditionSet object name workspace_name: workspace name/ID of the object return: obj_ref: return object reference """ log('--->\nrunning ImportConditionSetUtil.import_attribute_mapping_from_staging\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_attribute_mapping_from_staging_params(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') ws_id = self.dfu.ws_name_to_id(params['workspace_name']) import_attribute_mapping_params = { 'output_obj_name': params['attribute_mapping_name'], 'output_ws_id': ws_id, 'input_file_path': scratch_file_path } ref = self.genapi.file_to_attribute_mapping( import_attribute_mapping_params) # Update the workspace object related meta-data for staged file self.uploader_utils.update_staging_service( params.get('staging_file_subdir_path'), ref.get('attribute_mapping_ref')) returnVal = {'obj_ref': ref.get('attribute_mapping_ref')} return returnVal @staticmethod def validate_import_attribute_mapping_from_staging_params(params): """ validate_import_attribute_mapping_from_staging_params: validates params passed to import_attribute_mapping_from_staging method """ # check for required parameters for p in [ 'staging_file_subdir_path', 'workspace_name', 'attribute_mapping_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def generate_report(self, obj_ref, params): """ generate_report: generate summary report obj_ref: generated workspace object references. (return of import_attribute_mapping_from_staging) params: staging_file_subdir_path: subdirectory file path e.g. 
for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name workspace_name: workspace name/ID that reads will be stored to """ uuid_string = str(uuid.uuid4()) upload_message = 'Import Finished\n' get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False} object_data = self.dfu.get_objects(get_objects_params) upload_message += "Attribute Mapping Name: " upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n' upload_message += 'Imported File: {}\n'.format( params.get('staging_file_subdir_path')) report_params = { 'message': upload_message, 'objects_created': [{ 'ref': obj_ref, 'description': 'Imported Attribute Mapping' }], 'workspace_name': params['workspace_name'], 'report_object_name': 'kb_upload_methods_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output
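# Hedged usage sketch for import_attribute_mapping_from_staging; the staging path,
# workspace name and object name are placeholders. Parameter names follow
# validate_import_attribute_mapping_from_staging_params above.
params = {
    'staging_file_subdir_path': 'subdir_1/subdir_2/conditions.tsv',
    'workspace_name': 'my_workspace',
    'attribute_mapping_name': 'my_attribute_mapping',
}
# importer = ImportAttributeMappingUtil(config)
# result = importer.import_attribute_mapping_from_staging(params)   # {'obj_ref': ...}
# report = importer.generate_report(result['obj_ref'], params)      # report name and ref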