def mock_gapi_fetch_sequence(params):
    logging.info('Mocking `gapi.fetch_sequence(%s)`' % str(params))

    upa = ref_leaf(params)
    fp = _glob_upa(FETCH_SEQUENCE_DIR, upa)

    # Download and cache
    if fp is None:
        logging.info('Calling in cache mode `gapi.fetch_sequence(%s)`' % str(params))

        gapi = GenericsAPI(os.environ['SDK_CALLBACK_URL'], service_ver='dev')
        fp_work = gapi.fetch_sequence(params)
        fp_cache = os.path.join(
            mkcache(FETCH_SEQUENCE_DIR),
            file_safe_ref(upa) + '.fa'
        )
        shutil.copyfile(fp_work, fp_cache)
        return fp_work

    # Pull from cache
    else:
        return fp
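# A minimal sketch (assumed, not from the source) of wiring the mock above into a
# unit test with `unittest.mock.patch`. The patch target
# 'kb_module.kb_moduleImpl.GenericsAPI' and the Impl method name are hypothetical;
# substitute the module path that actually instantiates GenericsAPI.
from unittest.mock import MagicMock, patch

def get_mock_gapi():
    mock_gapi = MagicMock()
    # route fetch_sequence calls through the caching mock defined above
    mock_gapi.fetch_sequence.side_effect = mock_gapi_fetch_sequence
    return mock_gapi

# Example usage inside a test case (hypothetical target and method):
# with patch('kb_module.kb_moduleImpl.GenericsAPI', new=lambda *a, **kw: get_mock_gapi()):
#     ret = self.serviceImpl.some_app_method(self.ctx, params)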
def update_clients():
    callback_url = os.environ['SDK_CALLBACK_URL']

    Var.update(
        dfu=DataFileUtil(callback_url),
        kbr=KBaseReport(callback_url),
        fpu=FunctionalProfileUtil(callback_url, service_ver='dev'),
        gapi=GenericsAPI(callback_url, service_ver='dev'),
    )
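# `Var` in these snippets acts as a module-level attribute container shared across
# an app run (`Var.update(...)`, `Var.some_field`, `reset_Var()`). A minimal sketch
# of such a container, assumed rather than the actual kb_* implementation:
class _AttrDict(dict):
    """dict whose items are also readable/writable as attributes."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError as e:
            raise AttributeError(name) from e

    def __setattr__(self, name, value):
        self[name] = value

Var = _AttrDict(debug=False)

def reset_Var():
    # clear all fields but `debug`, mirroring the comment in run_picrust2_pipeline
    debug = Var.get('debug', False)
    Var.clear()
    Var.debug = debug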
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.token = config['KB_AUTH_TOKEN']
    self.dfu = DataFileUtil(self.callback_url)
    self.report_util = kb_GenericsReport(self.callback_url)
    self.generics_api = GenericsAPI(self.callback_url)
    self.ws_large_data = WsLargeDataIO(self.callback_url)

    logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                        level=logging.INFO)
def setUpClass(cls):
    token = os.environ.get('KB_AUTH_TOKEN', None)
    config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_clustering'):
        cls.cfg[nameval[0]] = nameval[1]

    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [{'service': 'kb_clustering',
                                    'method': 'please_never_use_it_in_production',
                                    'method_params': []}],
                    'authenticated': 1})

    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = Workspace(cls.wsURL)
    cls.serviceImpl = kb_clustering(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']

    suffix = int(time.time() * 1000)
    cls.wsName = "test_ContigFilter_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.wsId = ret[0]

    cls.dfu = DataFileUtil(cls.callback_url)
    cls.gen_api = GenericsAPI(cls.callback_url, service_ver='dev')

    cls.prepare_data()
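# A minimal sketch (assumed, not from the source) of a test method built on the
# scaffolding above; the Impl method name and its params are hypothetical.
# def test_run_clustering(self):
#     ret = self.serviceImpl.run_some_clustering_method(self.ctx, {
#         'workspace_name': self.wsName,
#         # ...app-specific params prepared by cls.prepare_data()...
#     })
#     self.assertIn('report_ref', ret[0])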
def run_picrust2_pipeline(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in
    a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_picrust2_pipeline

    logging.info(params)

    ### params, app-globals, directories, etc

    logging.info('BEGINNING KB_PICRUST2. params: %s' % str(params))

    params = Params(params)

    dprint('params', run=locals())

    reset_Var()  # clear all fields but `debug`
    Var.update(
        params=params,
        dfu=DataFileUtil(self.callback_url),
        kbr=KBaseReport(self.callback_url),
        fpu=FunctionalProfileUtil(self.callback_url, service_ver='beta'),
        gapi=GenericsAPI(self.callback_url),
        shared_folder=self.shared_folder,
        run_dir=os.path.join(self.shared_folder, 'run_dir_picrust2_' + str(uuid.uuid4())),
        warnings=[],
        objects_created=[],
    )

    os.mkdir(Var.run_dir)  # for this API-method run

    Var.update(return_dir=os.path.join(Var.run_dir, 'return'))
    os.mkdir(Var.return_dir)  # for return input/output/logs etc.

    if Var.debug:
        with open(os.path.join(Var.run_dir, '#params'), 'w') as fh:
            json.dump(params.params, fh)

    # TODO document `run_dir` structure

    ### obj

    # instantiate
    amp_mat = AmpliconMatrix(params['amplicon_matrix_upa'])
    if 'row_attributemapping_ref' in amp_mat.obj:
        row_attrmap = AttributeMapping(amp_mat.obj['row_attributemapping_ref'], amp_mat)
    else:
        msg = (
            "Input AmpliconMatrix "
            "does not have a row AttributeMapping to assign PICRUSt2 functions to."
        )
        logging.warning(msg)
        Var.warnings.append(msg)

    # validate input data
    amp_mat.validate_amplicon_abundance_data()

    # generate input files
    seq_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
    seq_abundance_table_flpth = os.path.join(Var.return_dir, 'study_seqs.tsv')

    amp_mat.to_fasta(seq_flpth)
    amp_mat.to_seq_abundance_table(seq_abundance_table_flpth)

    # objs should be app globals
    Var.amp_mat = amp_mat

    ### args

    # TODO get tee functionality working in run_check
    # to avoid extra cmd

    Var.out_dir = os.path.join(Var.return_dir, 'PICRUSt2_output')
    log_flpth = os.path.join(Var.return_dir, 'log.txt')

    p = 4

    cmd_pipeline = ' '.join([
        'set -o pipefail &&',
        'source activate picrust2 &&',
        'picrust2_pipeline.py',
        '-s', seq_flpth,
        '-i', seq_abundance_table_flpth,
        '-o', Var.out_dir,
        '--per_sequence_contrib',
        '-p', str(p),
        '|& tee', log_flpth,
    ])

    cmd_description = ' \\\n'.join([
        'cd %s &&' % Var.out_dir,
        'source activate picrust2 &&',
        'add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC',
        ' -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
        '&&',
        'add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO',
        ' -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
        '&&',
        'add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC',
        ' -o pathways_out/path_abun_unstrat_descrip.tsv.gz'
    ])

    get_cmd_func_l = lambda FUNC: [
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         f'hsp.py -i {FUNC} -t out.tre -o {FUNC}_predicted.tsv.gz -p {p}'),
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         'metagenome_pipeline.py '
         '-i ../%s ' % os.path.basename(seq_abundance_table_flpth) +
         '-m marker_predicted_and_nsti.tsv.gz '
         f'-f {FUNC}_predicted.tsv.gz '
         f'-o {FUNC}_metagenome_out')
    ] + ([] if FUNC == 'PHENO' else [  # no descriptions for IMG phenotype
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         f'add_descriptions.py -i {FUNC}_metagenome_out/pred_metagenome_unstrat.tsv.gz -m {FUNC} '
         f'-o {FUNC}_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz'),
    ])

    cmd_func_l = []
    for func in ['cog', 'pfam', 'tigrfam', 'pheno']:
        if params.getd(func) == 1:
            cmd_func_l.extend(get_cmd_func_l(func.upper()))

    ### run

    run_check(cmd_pipeline)
    run_check(cmd_description)

    for cmd_func in cmd_func_l:
        run_check(cmd_func)

    ### sanity checks

    if Var.debug:
        for func in Var.func_l:
            if not Var.params.getd(func):
                continue

            fp0 = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][0])
            fp1 = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][1])

            # Check dropped amplicons are the unaligned/distant ones (debug)
            appfile.check_dropped_amplicon_ids(fp0, amp_mat)
            # Check no samples dropped (debug)
            appfile.check_dropped_sample_ids(fp1, amp_mat)

    ### update/save Amplicon workflow objects

    path_abun_predictions_tsv_gz_flpth = os.path.join(
        Var.out_dir, 'pathways_out/path_abun_predictions.tsv.gz')

    attribute = 'MetaCyc Predictions'
    source = 'PICRUSt2'

    # if row AttributeMapping,
    # update that and referencing objs
    if amp_mat.row_attrmap_upa is not None:

        # update row AttributeMapping with traits
        id2attr = appfile.parse_picrust2_traits(path_abun_predictions_tsv_gz_flpth)
        ind, attribute = row_attrmap.add_attribute_slot(attribute, source)
        row_attrmap.map_update_attribute(ind, id2attr)
        row_attrmap_upa_new = row_attrmap.save()

        # update AmpliconMatrix which references row AttributeMapping
        amp_mat.obj['row_attributemapping_ref'] = row_attrmap_upa_new
        amp_mat_upa_new = amp_mat.save(name=params['output_name'])

        Var.objects_created.extend([
            {
                'ref': row_attrmap_upa_new,
                'description': 'Added attribute `%s`' % attribute,
            },
            {
                'ref': amp_mat_upa_new,
                'description': 'Updated amplicon AttributeMapping reference to `%s`' % row_attrmap_upa_new
            },
        ])

    ### html report w/ heatmaps

    logging.info('Beginning report business')

    ##
    ## report

    Var.report_dir = os.path.join(Var.run_dir, 'report')

    report_html_flpth = report.HTMLReportWriter(
        [cmd_pipeline, cmd_description] + cmd_func_l,
    ).write()

    html_links = [{
        'path': Var.report_dir,
        'name': os.path.basename(report_html_flpth),
    }]

    ### FunctionalProfile

    logging.info('Starting saving FunctionalProfiles if any')

    if Var.debug:
        # this makes mocking more flexible in case something makes a fake UPA
        FP_amp_mat_ref = params['amplicon_matrix_upa']
    else:
        # this AmpliconMatrix is new one with new AttributeMapping
        FP_amp_mat_ref = amp_mat_upa_new

    # gunzip TSVs out to another directory
    tsv_dir = os.path.join(Var.run_dir, 'decompressed_tsv')
    os.mkdir(tsv_dir)

    for func in Var.func_l:
        if not Var.params.getd(func):
            continue

        func_name = Var.func_2_cfg[func]['name']

        if Var.params.getd('create_amplicon_fps'):
            id = 'amplicon_' + func
            desc = 'Amplicon %s abundance' % func_name

            fp_src = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][0])
            fp_dst = os.path.join(tsv_dir, id + '.tsv')
            gunzip(fp_src, fp_dst)

            upa = Var.fpu.import_func_profile(dict(
                workspace_id=Var.params['workspace_id'],
                func_profile_obj_name='%s.%s' % (Var.params['output_name'], id),
                original_matrix_ref=FP_amp_mat_ref,
                profile_file_path=fp_dst,
                profile_type='amplicon',
                profile_category='organism',
                data_epistemology='predicted',
                epistemology_method='PICRUSt2',
                description=desc,
            ))['func_profile_ref']

            Var.objects_created.append(dict(ref=upa, description=desc))

        if Var.params.getd('create_sample_fps'):
            id = 'metagenome_' + func
            desc = 'Metagenome %s abundance' % func_name

            fp_src = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][1])
            fp_dst = os.path.join(tsv_dir, id + '.tsv')
            gunzip(fp_src, fp_dst)

            upa = Var.fpu.import_func_profile(dict(
                workspace_id=Var.params['workspace_id'],
                func_profile_obj_name='%s.%s' % (Var.params['output_name'], id),
                original_matrix_ref=FP_amp_mat_ref,
                profile_file_path=fp_dst,
                profile_type='mg',
                profile_category='community',
                data_epistemology='predicted',
                epistemology_method='PICRUSt2',
                description=desc,
            ))['func_profile_ref']

            Var.objects_created.append(dict(ref=upa, description=desc))

    # look at TSVs
    dprint(
        'ls -lh %s/*' % tsv_dir,
        #'file -i %s/*/*' % tsv_dir,
        run='cli')

    ### return files

    file_links = [{
        'path': Var.return_dir,
        'name': 'PICRUSt2_results.zip',
        'description': 'Input, output, cmd, intermediate files, log'
    }]

    params_report = {
        'warnings': Var.warnings,
        'objects_created': Var.objects_created,
        'file_links': file_links,
        'html_links': html_links,
        'direct_html_link_index': 0,
        'report_object_name': 'kb_PICRUSt2_report',
        'workspace_name': params['workspace_name'],
        'html_window_height': report.REPORT_HEIGHT,
    }

    Var.params_report = params_report

    obj = Var.kbr.create_extended_report(params_report)

    output = {
        'report_name': obj['name'],
        'report_ref': obj['ref'],
    }

    #END run_picrust2_pipeline

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_picrust2_pipeline return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.scratch = config['scratch']
        self.ws_url = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.scratch = scratch
        self.callback_url = callback_url
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)

    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        version = float(reader.metadata['fileformat'][4:6])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end - record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info

    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        with (gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not tokens[0].startswith('##fileformat'):
                log("Invalid VCF. ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF. ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())

            return vcf_version

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))

        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up directories for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only used to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version " + str(vcf_version) + ".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF Version not in file, or fileformat line malformatted, or not version >=4.0. '
                             'The file format line must be the first line of the vcf file and in appropriate syntax. '
                             'Check VCF file specifications: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename + ' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                # TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                                "\nFile is validated as of vcf spec v4.0")

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # if vcf file < v4.1, and valid, it will produce an IndexError above
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                            "\nFile is validated as of vcf spec v4.0")

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']
            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path,
                                   os.path.join(self.scratch, params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch, params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating sample attribute mapping file.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                Lines = vcf_handle.readlines()

                for line in Lines:
                    if line.startswith("#CHROM"):
                        header = line.lstrip().split("\t")

                        try:
                            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                                attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                                for i in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[i])
                                #attribute_mapping_handle.write("\n")

                                attribute_mapping_handle.write("label\t\t\t")
                                for j in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[j])
                                #attribute_mapping_handle.write("\n")
                        except IOError:
                            print("Could not write to file:", sample_attribute_mapping_file)
        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf
        if 'genome_ref' in params:
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if 'assembly_ref' in params:
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')

        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list
        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
        KBaseGwasData.Variations type spec

        /*
          Contig variation data
            contig_id - contig identifier
            totalvariants - total number of variants in each contig
            passvariants - total number of variants that pass quality variation filter in contig
            length - length of contig from assembly data
        */

        typedef structure {
            string contig_id;
            int totalvariants;
            int passvariants;
            int length; // from assembly
        } contig_info;
        """
        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']

        contigs = []

        contig_infos = self.vcf_info['contigs']

        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs

    def _bgzip_vcf(self, vcf_filepath):
        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]

        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        bgzip_file_path = vcf_filepath + ".gz"
        print(bgzip_file_path)

        return bgzip_file_path

    def _index_vcf(self, bgzip_file):
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        index_file_path = bgzip_filepath + ".tbi"

        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        })
        return file

    def _construct_variation(self, params, contigs_info):
        """
        KBaseGwasData.Variations type spec

        /*
          Variation object data structure
            num_genotypes - number of total genotypes within variant file
            num_variants - number of total variants within variant file
            contigs - list of contig ids and variant information
            attribute_ref - KBase reference to attribute mapping workspace object
            genome_ref - KBase reference to genome workspace object
            assembly_ref - KBase reference to assembly workspace object
            vcf_handle_ref - VCF handle reference to VCF file

            @optional genome_ref
        */

        typedef structure {
            int numgenotypes;
            int numvariants;
            list<contig_info> contigs;
            attribute_ref population;  // KBaseExperiments.AttributeMapping
            genome_ref genome_ref;     // KBaseGenomes.Genome
            assembly_ref assemby_ref;  // KBaseGenomeAnnotations.Assembly
            vcf_handle_ref vcf_handle_ref;
        } Variations;

        :param params: KBase ui input parameters
        :param population: previously constructed sample population data
        :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)

        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)

        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)

        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],
            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle': vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] = params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """
        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_' + str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object blank, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):
        #params["sample_attribute_ref"] = ''  # just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'
            }
            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)

        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
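# A minimal usage sketch (assumed, not from the source) of the class above; the
# config keys and params mirror what __init__ and import_vcf read, but the concrete
# values are hypothetical placeholders.
# vtv = VCFToVariation(config={'scratch': '/kb/module/work/tmp', 'workspace-url': ws_url},
#                      scratch='/kb/module/work/tmp',
#                      callback_url=os.environ['SDK_CALLBACK_URL'])
# var_obj_info, var = vtv.import_vcf({
#     'workspace_name': 'my_workspace',
#     'genome_or_assembly_ref': '1/2/3',
#     'assembly_ref': '1/2/3',
#     'vcf_staging_file_path': 'my_variants.vcf.gz',
#     'sample_attribute_ref': '4/5/6',
#     'variation_object_name': 'my_variation',
# })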
def run_FAPROTAX(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in
    a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_FAPROTAX

    logging.info(params)

    Var.update({  # carry over into globals `Var`, regardless of resetting, for all API-method runs
        'params': Params(params),
        'shared_folder': self.shared_folder,
        'kbase_endpoint': self.config['kbase-endpoint'],  # contains environment, for constructing Genome landing page url
        #---
        'ws': Workspace(self.workspace_url),
        'dfu': DataFileUtil(self.callback_url),  # instantiate here so within runtime of @patch
        'kbr': KBaseReport(self.callback_url),  # instantiate here so within runtime of @patch
        'gapi': GenericsAPI(self.callback_url),
        'fpu': FunctionalProfileUtil(self.callback_url, service_ver='beta'),  # TODO overhead?
        #---
        'warnings': [],
        #---
        'run_dir': os.path.join(self.shared_folder, 'kbfptx_' + str(uuid.uuid4())),
    })

    os.mkdir(Var.run_dir)

    Var.update({
        'return_dir': os.path.join(Var.run_dir, 'return'),
    })

    os.mkdir(Var.return_dir)

    ### detect input type

    oi = Var.ws.get_object_info3({'objects': [{'ref': params['input_upa']}]})['infos'][0]

    if oi[2].startswith('KBaseSearch.GenomeSet'):
        output = do_GenomeSet_workflow()
    elif oi[2].startswith('KBaseMatrices.AmpliconMatrix'):
        output = do_AmpliconMatrix_workflow()
    else:
        raise Exception('Unknown type `%s` for `input_upa`' % oi[2])

    #END run_FAPROTAX

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_FAPROTAX return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_classify(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in
    a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_classify

    logging.info(params)

    params = Params(params)
    Var.params = params

    '''
    tmp/                                        `shared_folder`
    └── kb_rdp_clsf_<uuid>/                     `run_dir`
        ├── return/                             `return_dir`
        |   ├── cmd.txt
        |   ├── study_seqs.fna
        |   └── RDP_Classifier_output/          `out_dir`
        |       ├── out_allRank.tsv
        |       └── out_fixedRank.tsv
        └── report/                             `report_dir`
            ├── pie_hist.html
            ├── suburst.html
            └── report.html
    '''

    ##
    ## set up globals ds `Var` for this API-method run
    ## which involves making this API-method run's directory structure

    Var.update({
        'run_dir': os.path.join(self.shared_folder, 'kb_rdp_clsf_' + str(uuid.uuid4())),
        'dfu': DataFileUtil(self.callback_url),
        'ws': Workspace(self.workspace_url),
        'gapi': GenericsAPI(self.callback_url),
        'kbr': KBaseReport(self.callback_url),
        'warnings': [],
    })

    os.mkdir(Var.run_dir)

    Var.update({
        'return_dir': os.path.join(Var.run_dir, 'return'),
        'report_dir': os.path.join(Var.run_dir, 'report'),
    })

    os.mkdir(Var.return_dir)
    os.mkdir(Var.report_dir)

    Var.update({'out_dir': os.path.join(Var.return_dir, 'RDP_Classifier_output')})

    os.mkdir(Var.out_dir)

    # cat and gunzip SILVA refdata
    # which has been split into ~99MB chunks to get onto Github
    #if params.is_custom():
    #    app_file.prep_refdata()

    ### load objects

    amp_mat = AmpliconMatrix(params['amp_mat_upa'])
    row_attr_map_upa = amp_mat.obj.get('row_attributemapping_ref')
    create_row_attr_map = row_attr_map_upa is None
    row_attr_map = AttributeMapping(row_attr_map_upa, amp_mat=amp_mat)

    ### cmd

    fasta_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
    Var.out_allRank_flpth = os.path.join(Var.out_dir, 'out_allRank.tsv')
    Var.out_shortSeq_flpth = os.path.join(Var.out_dir, 'out_unclassifiedShortSeqs.txt')  # seqs too short to classify

    shutil.copyfile(amp_mat.get_fasta(), fasta_flpth)

    cmd = ('java -Xmx4g -jar %s classify %s ' % (Var.classifier_jar_flpth, fasta_flpth)
           + ' '.join(params.cli_args) + ' '
           + '--format allRank '
           + '--outputFile %s --shortseq_outfile %s' % (Var.out_allRank_flpth, Var.out_shortSeq_flpth))

    run_check(cmd)

    ### extract classifications

    id2taxStr = app_file.get_fix_filtered_id2tax()

    # get ids of classified and unclassified seqs
    shortSeq_id_l = app_file.parse_shortSeq()  # sequences too short to get clsf
    classified_id_l = list(id2taxStr.keys())

    # make sure classifieds and shorts complement
    if Var.debug:
        ret = sorted(classified_id_l + shortSeq_id_l)
        mat = sorted(amp_mat.obj['data']['row_ids'])
        assert ret == mat, \
            'diff1: %s, diff2: %s' % (set(ret) - set(mat), set(mat) - set(ret))

    if len(classified_id_l) == 0:
        raise Exception('No sequences were long enough to be classified')

    # add in id->'' for unclassified seqs
    # so id2taxStr_l is complete
    # so no KeyErrors later
    for shortSeq_id in shortSeq_id_l:
        id2taxStr[shortSeq_id] = ''

    # add to globals for testing
    Var.shortSeq_id_l = shortSeq_id_l

    ### add to row AttributeMapping

    prose_args = params.get_prose_args()

    attribute = ('RDP Classifier Taxonomy (conf=%s, gene=%s)'
                 % (prose_args['conf'], prose_args['gene']))
    attribute_names = row_attr_map.get_attribute_names()
    if attribute in attribute_names:
        attribute = get_numbered_duplicate(attribute_names, attribute)
    source = 'RDP Classifier'

    ind, attribute = row_attr_map.add_attribute_slot(attribute, source)
    row_attr_map.update_attribute(ind, id2taxStr)

    ### save obj

    amp_mat_output_name = Var.params['output_name']
    attr_map_output_name = (amp_mat_output_name + '.Amplicon_attributes'
                            if create_row_attr_map else None)

    row_attr_map_upa_new = row_attr_map.save(name=attr_map_output_name)

    amp_mat.obj['row_attributemapping_ref'] = row_attr_map_upa_new
    amp_mat_upa_new = amp_mat.save(amp_mat_output_name)

    objects_created = [
        dict(  # row AttrMap
            ref=row_attr_map_upa_new,
            description='%sAdded attribute `%s`' % (
                'Created. ' if create_row_attr_map else '',
                attribute,
            )),
        dict(  # AmpMat
            ref=amp_mat_upa_new,
            description='Updated amplicon AttributeMapping reference to `%s`' % row_attr_map_upa_new),
    ]

    # testing
    if Var.debug:
        Var.update(dict(
            amp_mat=amp_mat,
            row_attr_map=row_attr_map,
        ))

    ### html report

    hrw = report.HTMLReportWriter(cmd_l=[cmd])
    html_flpth = hrw.write()

    html_links = [{
        'path': Var.report_dir,
        'name': os.path.basename(html_flpth),
    }]

    ### report

    file_links = [{
        'path': Var.run_dir,
        'name': 'RDP_Classifier_results.zip',
        'description': 'Input, output'
    }]

    params_report = {
        'warnings': Var.warnings,
        'objects_created': objects_created,
        'html_links': html_links,
        'direct_html_link_index': 0,
        'file_links': file_links,
        'workspace_id': params['workspace_id'],
        'html_window_height': Var.report_height,
    }

    # testing
    Var.params_report = params_report

    report_obj = Var.kbr.create_extended_report(params_report)

    output = {
        'report_name': report_obj['name'],
        'report_ref': report_obj['ref'],
    }

    #END run_classify

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_classify return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class Subsetting_Matrices:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)

        # set up directory for files folder
        self.output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(self.output_dir)
        self.files_folder = os.path.join(self.output_dir, 'files')
        os.mkdir(self.files_folder)

        self.file_paths = []
        self.html_paths = []

        self.GenAPI = GenericsAPI(self.callback_url)

    def _get_df(self, params):
        """
        Get Amplicon Matrix Data, then make pandas.DataFrame();
        also get taxonomy data and add it to df.
        """
        logging.info('Getting DataObject')

        # Amplicon data
        obj = self.dfu.get_objects({'object_refs': [params.get('input_obj_ref')]})
        self._make_fasta(obj_ref=obj['data'][0]['data']['amplicon_set_ref'])
        amp_data = obj['data'][0]['data']

        row_ids = amp_data['data']['row_ids']
        col_ids = amp_data['data']['col_ids']
        values = amp_data['data']['values']

        # Add 'taxonomy' column
        col_ids.append('taxonomy')

        # Make pandas DataFrame
        df = pd.DataFrame(index=row_ids, columns=col_ids)
        for i in range(len(row_ids)):
            df.iloc[i, :-1] = values[i]

        # Get row AttributeMapping object
        test_row_attributes_permanent_id = obj['data'][0]['data']['row_attributemapping_ref']
        obj = self.dfu.get_objects({'object_refs': [test_row_attributes_permanent_id]})
        tax_dict = obj['data'][0]['data']['instances']

        # Add taxonomy data
        for row_indx in df.index:
            df.loc[row_indx]['taxonomy'] = tax_dict[row_indx][0]

        return df

    def _get_mdf(self, params):
        """
        Get metadata object and make pd.DataFrame with samples as index
        and the specified subsetting column
        """
        logging.info('Getting MetadataObject')

        subsetting_field = params.get('subset_field')
        subsetting_field = subsetting_field['meta_group'][0]
        params['subset_field'] = subsetting_field

        # Get object
        obj = self.dfu.get_objects({'object_refs': [params.get('attribute_mapping_obj_ref')]})
        meta_dict = obj['data'][0]['data']['instances']
        attr_l = obj['data'][0]['data']['attributes']

        # Find index of specified category name
        indx = 0
        for i in range(len(attr_l)):
            if attr_l[i]['attribute'] == subsetting_field:
                indx = i
                break

        # Set metadata_samples
        metadata_samples = meta_dict.keys()

        # Make pandas DataFrame
        mdf = pd.DataFrame(index=metadata_samples, columns=[subsetting_field])
        i = 0
        for key, val in meta_dict.items():
            mdf.iloc[i] = val[indx]
            i += 1

        return mdf

    def insert_newlines(self, string, every):
        return '\n'.join(string[i:i + every] for i in range(0, len(string), every))

    def _make_fasta(self, obj_ref):
        logging.info('Making fasta file from AmpliconSet obj: {}'.format(obj_ref))

        set_obj = self.dfu.get_objects({'object_refs': [obj_ref]})
        OTUs = set_obj['data'][0]['data']['amplicons'].keys()

        with open(os.path.join(self.files_folder, "amp_set.fa"), 'w') as fa_file:
            logging.info('Writing to amp_set.fa file')
            for key in OTUs:
                con_str = '>' + key + '\n'
                con_str += self.insert_newlines(
                    set_obj['data'][0]['data']['amplicons'][key]['consensus_sequence'], 60)
                con_str += '\n'
                fa_file.write(con_str)

    def _make_group_dict(self, mdf, subset_field):
        """
        Make dictionary with a subsetting column value as key and the samples of
        that subsetting column value as values
        """
        logging.info('Making grouping dictionary')

        group_dict = {}
        for sample, group in zip(mdf.index, mdf[subset_field]):
            try:
                group_dict[group].append(sample)
            except KeyError:
                group_dict.update({group: [sample]})

        for group, sample_list in group_dict.items():
            group_dict[group].append('taxonomy')

        return group_dict

    def _create_subset_matrices(self, df, mdf, subset_field):
        """
        create dictionary of subset pd.DataFrames
        """
        logging.info('Creating matrices...')

        group_dict = self._make_group_dict(mdf=mdf, subset_field=subset_field)

        # Create dict of sub matrices
        dict_of_sub_matrices = {}
        for key, val in group_dict.items():
            data = df[val]
            dict_of_sub_matrices.update({key: data})

        # Drop rows that have all zero counts
        for key, matrix in dict_of_sub_matrices.items():
            to_drop = []
            for indx in matrix.index:
                if all(val == 0 for val in matrix.loc[indx][0:-1]):
                    to_drop.append(indx)
            dict_of_sub_matrices[key] = matrix.drop(to_drop)

        return dict_of_sub_matrices

    def _save_matrices(self, matrices):
        """
        takes a dictionary of pd.DataFrames and saves the matrices as
        tab-separated csv's, named after the keys
        """
        logging.info('Saving matrices: {}'.format(matrices.keys()))

        for group, matrix in matrices.items():
            name = group + '.csv'
            matrix.to_csv(os.path.join(self.files_folder, name), sep='\t')

    def _create_html_report(self):
        """
        Create html report of files in zip by walking through output folder
        """
        logging.info('Creating html report..')

        html_str = '<html>'
        html_str += '<h3>Files In Output Zip File:</h3>\n'
        for root, folders, files in os.walk(self.output_dir):
            # Find the output files by their extensions.
            for f in files:
                if re.match('^[a-zA-Z]+.*.(fa|csv)$', f):  # jpeg|jpg|bmp|png|tiff|pdf|ps|
                    html_str += '<p>' + f + '</p>\n'
        html_str += '</html>'

        with open(os.path.join(self.files_folder, "index.html"), 'w') as index_file:
            index_file.write(html_str)

        # have needed files saved to folder before shock
        shock = self.dfu.file_to_shock({
            'file_path': self.files_folder,
            'make_handle': 0,
            'pack': 'zip'
        })
        # list that goes to 'html_links'
        self.html_paths.append({
            'shock_id': shock['shock_id'],
            'name': 'index.html',
            'label': 'Report',
            'description': "files in zip"
        })
        # list that goes to 'file_paths'
        self.file_paths.append(os.path.join(self.files_folder, 'files.zip'))

    def _call_and_create_objects(self, params):
        logging.info('_call_and_create_objects method')

        list_of_matrix_files = []
        groups = []
        for root, folders, files in os.walk(self.files_folder):
            logging.info('Finding files..')
            # Find the output files by their extensions.
            for f in files:
                if re.match('^[a-zA-Z]+.*.(fa)$', f):
                    fa_file = os.path.join(root, f)
                if re.match('^[a-zA-Z]+.*.(csv)$', f):
                    groups.append(f[0:-4])
                    list_of_matrix_files.append(os.path.join(root, f))

        for csv_file_path, group_name in zip(list_of_matrix_files, groups):
            logging.info('Sending data to importer:\n'
                         'csv_file_path: {}\n'
                         'group_name: {}\n'
                         'fa_file: {}'.format(csv_file_path, group_name, fa_file))

            params['obj_type'] = 'AmpliconMatrix'
            params['matrix_name'] = group_name
            params['tsv_fasta'] = {
                'tsv_file_tsv_fasta': csv_file_path,
                'fasta_file_tsv_fasta': fa_file,
                'metadata_keys_tsv_fasta': 'taxonomy_id, taxonomy, taxonomy_source, consensus_sequence'
            }
            params['scale'] = 'raw'
            params['description'] = 'dsc'
            params['amplicon_set_name'] = group_name + '-set'
            params['sample_set_ref'] = params.get('attribute_mapping_obj_ref')
            params['input_local_file'] = True

            logging.info('Sending params: {}'.format(json.dumps(params, indent=1)))

            obj_run = self.GenAPI.import_matrix_from_biom(params=params)

            logging.info('Object run: {}'.format(obj_run))

    def _create_amp(self):
        amp_structure = {
            'data': [{
                'data': {
                    'amplicon_set_ref': '',
                    'col_attributemapping_ref': '',
                    'col_mapping': {},
                    'data': {},
                    'row_attributemapping_ref': '',
                    'row_mapping': {},
                    'scale': 'raw'
                },
                'info': [],
                'path': [''],
                'provenance': [],
                'creator': '',
                'orig_wsid': 0000,
                'created': '',
                'epoch': 0000,
                'refs': [],
                'copy_source_inaccessible': 0,
                'extracted_ids': {}
            }]
        }

    def run(self, params):
        logging.info('--->\nrunning Amp_Subset_Util with input\n' +
                     'params:\n{}'.format(json.dumps(params, indent=1)))

        df = self._get_df(params)
        mdf = self._get_mdf(params)
        matrices = self._create_subset_matrices(df=df, mdf=mdf,
                                                subset_field=params.get('subset_field'))
        self._save_matrices(matrices)
        self._create_html_report()
        self._call_and_create_objects(params)

        return {'file_paths': self.file_paths, 'html_paths': self.html_paths}
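# A minimal usage sketch (assumed, not from the source); config keys mirror what
# __init__ reads, and the params shown are hypothetical placeholders.
# subsetter = Subsetting_Matrices({
#     'workspace-url': ws_url,
#     'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
#     'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
#     'scratch': '/kb/module/work/tmp',
# })
# result = subsetter.run({
#     'input_obj_ref': '1/2/3',                       # AmpliconMatrix UPA
#     'attribute_mapping_obj_ref': '4/5/6',           # sample AttributeMapping UPA
#     'subset_field': {'meta_group': ['Treatment']},  # column to subset on
# })
# result['file_paths'], result['html_paths']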
class ImportAttributeMappingUtil:

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.genapi = GenericsAPI(self.callback_url)

    def import_attribute_mapping_from_staging(self, params):
        """
        import_attribute_mapping_from_staging: wrapper method for
        fba_tools.tsv_file_to_attribute_mapping

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        attribute_mapping_name: output conditionSet object name
        workspace_name: workspace name/ID of the object

        return:
        obj_ref: return object reference
        """
        log('--->\nrunning ImportConditionSetUtil.import_attribute_mapping_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_attribute_mapping_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        ws_id = params['workspace_id']
        import_attribute_mapping_params = {
            'output_obj_name': params['attribute_mapping_name'],
            'output_ws_id': ws_id,
            'input_file_path': scratch_file_path
        }

        ref = self.genapi.file_to_fbamodel_attribute_mapping(import_attribute_mapping_params)

        returnVal = {'obj_ref': ref.get('attribute_mapping_ref')}

        return returnVal

    @staticmethod
    def validate_import_attribute_mapping_from_staging_params(params):
        """
        validate_import_attribute_mapping_from_staging_params:
            validates params passed to import_attribute_mapping_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_id', 'attribute_mapping_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                 import_attribute_mapping_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "FBAModelSet Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'objects_created': [{'ref': obj_ref,
                                 'description': 'Imported FBAModelSet'}],
            'workspace_id': params['workspace_id'],
            'report_object_name': 'import_model_attri_mapping_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
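# A minimal usage sketch (assumed, not from the source); the config values and
# params below are hypothetical placeholders.
# importer = ImportAttributeMappingUtil({
#     'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
#     'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
# })
# ret = importer.import_attribute_mapping_from_staging({
#     'staging_file_subdir_path': 'subdir_1/my_attribute_mapping.tsv',
#     'attribute_mapping_name': 'my_fbamodel_attribute_mapping',
#     'workspace_id': 12345,
# })
# report = importer.generate_report(ret['obj_ref'], {
#     'staging_file_subdir_path': 'subdir_1/my_attribute_mapping.tsv',
#     'workspace_id': 12345,
# })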