def export_original_genbank(self, ctx, params):
    """Return the stored genbank handle result for the requested genome.

    Validates ``params``, fetches the genome's ``genbank_handle_ref`` field
    through GenomeAnnotationAPI, checks that the object is a
    ``KBaseGenomes.Genome`` and delegates to ``self.get_genbank_handle``.

    :param ctx: call context (not read by this method)
    :param params: dict holding ``genome_ref`` and, optionally,
        ``ref_path_to_genome``
    :return: whatever ``self.get_genbank_handle`` returns for the genome data
    :raises ValueError: if the fetched object is not a KBaseGenomes.Genome
    """
    # 1) validate parameters and extract defaults
    self.validate_params(params)

    # 2) get genome genbank handle reference
    genome_selector = {'ref': params['genome_ref']}
    if 'ref_path_to_genome' in params:
        genome_selector['ref_path_to_genome'] = params['ref_path_to_genome']
    request = {
        'genomes': [genome_selector],
        'included_fields': ['genbank_handle_ref'],
        'ignore_errors': 0,  # if we can't find the genome, throw an error
    }

    client = GenomeAnnotationAPI(self.cfg.callbackURL)
    fetched = client.get_genome_v1(request)['genomes'][0]
    info = fetched['info']
    data = fetched['data']

    # 3) make sure the type is valid (info[2] is "<type>-<version>")
    type_name = info[2].split('-')[0]
    if type_name != 'KBaseGenomes.Genome':
        raise ValueError('Object is not a Genome, it is a:' + str(info[2]))

    # 4) if the genbank handle is there, get it and return
    print('checking if genbank file is cached...')
    return self.get_genbank_handle(data)
def run_vcontact(self, ctx, params):
    """
    Fetch the genome, turn it into vConTACT inputs, run vConTACT and create
    a report in ``params['workspace_name']``.

    ``params`` is modified in place: the written input file paths are stored
    under ``gene2genome`` and ``sequences`` before vConTACT is run. Nothing
    is returned.

    :param params: instance of type "InParams" -> structure: parameter
       "genome" of type "obj_ref" (Insert your typespec information here.)
    """
    # ctx is the context object
    #BEGIN run_vcontact
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    vc = vConTACTUtils(self.config)
    self.genome_api = GenomeAnnotationAPI(self.callback_url)

    # Pull the genome object and derive the two vConTACT input tables.
    genome_request = {"genomes": [{"ref": params['genome']}]}
    genome_data = self.genome_api.get_genome_v1(genome_request)
    gene2genome, sequences = vc.genome_to_inputs(genome_data)

    # Write the inputs to disk and hand their paths to vConTACT via params.
    params['gene2genome'], params['sequences'] = vc.write_inputs(
        gene2genome, sequences)

    vc.run_vcontact(params)
    vc.vcontact_help()

    KBObjectUtils(self.config).create_report(params['workspace_name'])
    #END run_vcontact
    pass
def export(self, ctx, params):
    """Build a genbank file for the requested genome and return the result.

    Validates ``params``, fetches the genome through GenomeAnnotationAPI,
    checks that it is a ``KBaseGenomes.Genome`` and calls
    ``self.build_genbank_file`` with a ``KBase_derived_<name>.gbff`` file
    name. The returned dict is tagged with ``from_cache = 0``.

    :param ctx: call context (not read by this method)
    :param params: dict holding ``genome_ref`` and, optionally,
        ``ref_path_to_genome``
    :return: the result of ``self.build_genbank_file`` with ``from_cache``
        set to 0
    :raises ValueError: if the object is not a KBaseGenomes.Genome, or if
        ``build_genbank_file`` returns ``None``
    """
    # 1) validate parameters and extract defaults
    self.validate_params(params)

    # 2) get genome genbank handle reference
    genome_selector = {'ref': params['genome_ref']}
    if 'ref_path_to_genome' in params:
        genome_selector['ref_path_to_genome'] = params['ref_path_to_genome']
    request = {
        'genomes': [genome_selector],
        'included_fields': ['genbank_handle_ref'],
        'ignore_errors': 0,  # if we can't find the genome, throw an error
    }

    client = GenomeAnnotationAPI(self.cfg.callbackURL)
    fetched = client.get_genome_v1(request)['genomes'][0]
    info = fetched['info']
    # Not used below; the lookup is kept so that a response without a
    # 'data' entry still raises KeyError at this point.
    data = fetched['data']

    # 3) make sure the type is valid (info[2] is "<type>-<version>")
    type_name = info[2].split('-')[0]
    if type_name != 'KBaseGenomes.Genome':
        raise ValueError('Object is not a Genome, it is a:' + str(info[2]))

    # 4) build the genbank file and return it
    print('not cached, building file...')
    file_name = "KBase_derived_" + info[1] + ".gbff"
    result = self.build_genbank_file(request, file_name)
    if result is None:
        raise ValueError('Unable to generate file.  Something went wrong')
    result['from_cache'] = 0
    return result
def test_annotate_contigs(self):
    """Annotate an assembly addressed through a genome ref path.

    Saves a small assembly and a placeholder genome that points at it, runs
    ``annotate_contigs`` with the ``<genome_ref>;<assembly_ref>`` path, then
    checks that the report has a text message and that every feature's
    ``dna_sequence`` matches what AssemblySequenceAPI returns for its
    location.
    """
    # Copy the test FASTA into the work tmp directory.
    fasta_name = "small.fna"  # alternative input: "AP009048.fna"
    source_path = os.path.join("/kb/module/test/data", fasta_name)
    staged_path = os.path.join("/kb/module/work/tmp", fasta_name)
    shutil.copy(source_path, staged_path)

    assembly_name = 'Assembly.1'
    assembly_util = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': staged_path},
        'workspace_name': self.getWsName(),
        'assembly_name': assembly_name,
    })

    # Add a genome to the WS to test ref_paths
    genome_name = "Genome.1"
    placeholder_genome = {
        'id': 'Unknown',
        'features': [],
        'scientific_name': "",
        'domain': "",
        'genetic_code': 0,
        'assembly_ref': assembly_ref,
        'cdss': [],
        'mrnas': [],
        'source': 'Magic!',
        'gc_content': 0,
        'dna_size': 0,
        'reference_annotation': 0,
    }
    prov = self.getContext().provenance()
    genome_api = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
    info = genome_api.save_one_genome_v1({
        'workspace': self.getWsName(),
        'name': genome_name,
        'data': placeholder_genome,
        'provenance': prov,
    })['info']
    # Build "wsid/objid/version" from the object info tuple.
    saved_genome_ref = '/'.join(str(info[i]) for i in (6, 0, 4))

    annotate_params = {
        'assembly_ref': "{};{}".format(saved_genome_ref, assembly_ref),
        'output_workspace': self.getWsName(),
        'output_genome_name': genome_name,
        'evalue': None,
        'fast': 0,
        'gcode': 0,
        'genus': 'genus',
        'kingdom': 'Bacteria',
        'metagenome': 0,
        'mincontiglen': 1,
        'norrna': 0,
        'notrna': 0,
        'rawproduct': 0,
        'rfam': 1,
        'scientific_name': 'Super : diper - name;',
    }
    result = self.getImpl().annotate_contigs(self.getContext(),
                                             annotate_params)[0]

    report = self.getWsClient().get_objects(
        [{'ref': result['report_ref']}])[0]['data']
    self.assertTrue('text_message' in report)
    print("Report:\n" + str(report['text_message']))

    # Re-read the annotated genome by name and compare feature DNA.
    annotated_ref = self.getWsName() + "/" + genome_name
    annotated = self.getWsClient().get_objects(
        [{'ref': annotated_ref}])[0]['data']
    locations_by_id = {}
    for feature in annotated['features']:
        locations_by_id[feature['id']] = feature['location']

    seq_api = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'],
                                  token=self.getContext()['token'])
    dna_sequences = seq_api.get_dna_sequences({
        'requested_features': locations_by_id,
        'assembly_ref': annotated['assembly_ref'],
    })['dna_sequences']

    mismatches = sum(
        1 for feature in annotated['features']
        if feature['dna_sequence'] != dna_sequences[feature['id']])
    self.assertEqual(mismatches, 0)
def __init__(self, config):
    """Set up paths, credentials and service clients.

    Reads ``SDK_CALLBACK_URL`` and ``KB_AUTH_TOKEN`` from the environment
    and builds the Workspace, GenomeAnnotationAPI and AssemblyUtil clients.

    :param config: mapping providing ``scratch`` and ``workspace-url``
    :raises KeyError: if a required config key or environment variable is
        missing
    """
    # Assigned once; the original repeated this identical assignment.
    self.scratch = os.path.abspath(config['scratch'])
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.token = os.environ['KB_AUTH_TOKEN']
    self.ws = Workspace(config['workspace-url'], token=self.token)
    self.genome_api = GenomeAnnotationAPI(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
def load_genome_features_prepare_fasta(self, genome_refs,
                                       compliant_fasta_dir):
    """Write one protein FASTA per genome and run orthomclAdjustFasta on it.

    For each genome ref the object is resolved to ``wsid/objid/version``,
    loaded with ``get_genome_v1``, and every feature that has a
    ``protein_translation`` is written to ``<scratch>/<n>.fasta`` (``n`` is
    the 1-based genome position) using its 1-based feature position as the
    record id. ``orthomclAdjustFasta`` is then run with
    ``compliant_fasta_dir`` as working directory.

    :param genome_refs: iterable of genome object references
    :param compliant_fasta_dir: directory to create and run the adjuster in
    :return: dict keyed by ``"<genome n>|<feature n>"`` with ``fid``,
        ``fpos``, ``gref`` and ``func`` for each written protein
    """
    feature_info = {}
    os.makedirs(compliant_fasta_dir)

    for genome_pos, genome_ref in enumerate(genome_refs):
        # ---- Genome loading ----
        self.log_line("Loading Genome object from workspace for ref [" +
                      genome_ref + "]")
        info = self.ws.get_object_info_new(
            {"objects": [{"ref": genome_ref}]})[0]
        # Pin the reference to an explicit wsid/objid/version.
        genome_ref = "/".join(str(info[i]) for i in (6, 0, 4))

        gaapi = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'],
                                    token=self.token)
        response = gaapi.get_genome_v1({
            "genomes": [{"ref": genome_ref}],
            "included_fields": ["scientific_name"],
            "included_feature_fields": [
                "id", "protein_translation", "type", "function"
            ],
        })
        genome = response["genomes"][0]["data"]

        # ---- Features + Fasta ----
        self.log_line("Preparing fasta file for ref [" + genome_ref + "]")
        genome_id = str(genome_pos + 1)
        records = []
        for feature_pos, feature in enumerate(genome["features"]):
            feature_id = feature["id"]
            sequence = feature.get("protein_translation")
            if not sequence:
                continue
            record_id = str(feature_pos + 1)
            records.append(
                SeqRecord(Seq(sequence), id=record_id, description=""))
            feature_info[genome_id + "|" + record_id] = {
                "fid": feature_id,
                "fpos": feature_pos,
                "gref": genome_ref,
                "func": feature.get("function"),
            }
        fasta_file = self.scratch + "/" + genome_id + ".fasta"
        SeqIO.write(records, fasta_file, "fasta")

        # ---- Adjusting Fasta by Orthomcl ----
        self.log_line("Running orthomclAdjustFasta for ref [" + genome_ref +
                      "]")
        adjust_cmd = [
            "perl", self.plbin + "/orthomclAdjustFasta", genome_id,
            fasta_file, "1"
        ]
        self.log_process(
            subprocess.Popen(adjust_cmd,
                             cwd=compliant_fasta_dir,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE))
    return feature_info
def export(self, ctx, params):
    """Return a GFF (or GTF) file for the requested genome.

    Validates ``params``, fetches the genome's ``gff_handle_ref`` field,
    checks the object type and makes sure the target directory exists.
    Unless ``is_gtf`` is 1, ``self.get_gff_handle`` is tried first and its
    non-``None`` result is returned with ``from_cache = 1``. Otherwise the
    file is built with ``self.build_gff_file`` and returned with
    ``from_cache = 0``.

    :param ctx: call context (not read by this method)
    :param params: dict holding ``genome_ref`` and optionally
        ``ref_path_to_genome``, ``is_gtf`` (default 0) and ``target_dir``
    :return: result dict from the handle lookup or the file build, with
        ``from_cache`` set
    :raises ValueError: if the object is not a KBaseGenomes.Genome, or if
        ``build_gff_file`` returns ``None``
    """
    # 1) validate parameters and extract defaults
    self.validate_params(params)

    # 2) get genome gff handle reference
    genome_selector = {'ref': params['genome_ref']}
    if 'ref_path_to_genome' in params:
        genome_selector['ref_path_to_genome'] = params['ref_path_to_genome']
    request = {
        'genomes': [genome_selector],
        'included_fields': ['gff_handle_ref'],
        'ignore_errors': 0,  # if we can't find the genome, throw an error
    }

    client = GenomeAnnotationAPI(self.cfg.callbackURL)
    fetched = client.get_genome_v1(request)['genomes'][0]
    info = fetched['info']
    data = fetched['data']

    # 3) make sure the type is valid (info[2] is "<type>-<version>")
    type_name = info[2].split('-')[0]
    if type_name != 'KBaseGenomes.Genome':
        raise ValueError('Object is not a Genome, it is a:' + str(info[2]))

    is_gtf = params.get('is_gtf', 0)

    # Fall back to a timestamped folder under the shared folder.
    target_dir = params.get('target_dir')
    if not target_dir:
        stamp = str(int(time.time() * 1000))
        target_dir = os.path.join(self.cfg.sharedFolder, "gff_" + stamp)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # 4) if the GFF handle is there, get it and return
    if is_gtf != 1:
        print('checking if GFF file is cached...')
        cached = self.get_gff_handle(data, target_dir)
        if cached is not None:
            cached['from_cache'] = 1
            return cached
        print('not cached, building file...')

    # 5) otherwise, build the GFF file and return it
    built = self.build_gff_file(request, target_dir, info[1], is_gtf == 1)
    if built is None:
        raise ValueError('Unable to generate file.  Something went wrong')
    built['from_cache'] = 0
    return built
def annotate_genes(self, ctx, params):
    """
    Predict functions for a genome's proteins with ``kmer_search`` and save
    the updated genome.

    Protein translations are written to ``<scratch>/proteins.faa`` and fed
    to ``kmer_search`` on stdin; its tab-separated output
    (``<feature id>\\t<function>``) is read from ``<scratch>/output.txt``
    and used to overwrite ``function`` on matching features. Nothing is
    returned.

    :param params: instance of type "AnnotateGenesParams" -> structure:
       parameter "input_genome_ref" of String, parameter
       "output_workspace" of String, parameter "output_genome_name" of
       String
    """
    # ctx is the context object
    #BEGIN annotate_genes
    ga = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'],
                             token=ctx['token'])
    genome_request = {"genomes": [{"ref": params['input_genome_ref']}]}
    genome = ga.get_genome_v1(genome_request)["genomes"][0]["data"]

    # Collect one FASTA record per feature that has a translation.
    records = []
    for feature in genome["features"]:
        feature_id = feature["id"]
        translation = feature.get("protein_translation")
        if translation:
            records.append(
                SeqRecord(Seq(translation), id=feature_id, description=""))
    fasta_file = self.scratch + "/proteins.faa"
    SeqIO.write(records, fasta_file, "fasta")

    # Run kmer_search with the FASTA on stdin and the table on stdout.
    output_file = self.scratch + '/output.txt'
    with open(fasta_file, "r") as fasta_in, \
            open(output_file, "w") as table_out:
        proc = subprocess.Popen(
            "kmer_search -m 5 -g 200 -d /data/kmer/V2Data -a",
            shell=True,
            cwd=self.scratch,
            stdin=fasta_in,
            stdout=table_out,
            stderr=sys.stderr.fileno())
        proc.wait()

    # Parse "<fid>\t<function>" rows into a lookup table.
    fid_to_finc = {}
    with open(output_file, "r") as table_in:
        for row in table_in:
            columns = row.rstrip().split("\t")
            fid, func = columns[0], columns[1]
            print("Function prediction for feature id=" + fid + ": " + func)
            fid_to_finc[fid] = func

    # Apply the predictions to the genome features.
    for feature in genome["features"]:
        feature_id = feature["id"]
        if feature_id in fid_to_finc:
            feature['function'] = fid_to_finc[feature_id]

    prov = ctx.provenance()
    info = ga.save_one_genome_v1({
        'workspace': params['output_workspace'],
        'name': params['output_genome_name'],
        'data': genome,
        'provenance': prov,
    })['info']
    genome_ref = '/'.join(str(info[i]) for i in (6, 0, 4))
    print("Genome saved to " + genome_ref)
    #END annotate_genes
    pass
def setUpClass(cls):
    """Create the shared context, config, clients and test workspace.

    Reads the token and deployment config path from the environment, loads
    the ``GenomeFileUtil`` config section into ``cls.cfg``, builds the
    workspace and GenomeAnnotationAPI clients plus the service
    implementation, and creates one workspace used by all tests.
    """
    token = environ.get('KB_AUTH_TOKEN', None)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    provenance_entry = {
        'service': 'GenomeFileUtil',
        'method': 'please_never_use_it_in_production',
        'method_params': [],
    }
    cls.ctx.update({
        'token': token,
        'provenance': [provenance_entry],
        'authenticated': 1,
    })

    # Load the module's section of the deployment config into a plain dict.
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    parser = ConfigParser()
    parser.read(config_file)
    for key, value in parser.items('GenomeFileUtil'):
        cls.cfg[key] = value

    cls.wsURL = cls.cfg['workspace-url']
    cls.ws = workspaceService(cls.wsURL, token=token)
    cls.gaa = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
    cls.serviceImpl = GenomeFileUtil(cls.cfg)

    # create one WS for all tests
    workspace_name = "test_GenomeAnnotationAPI_" + str(
        int(time.time() * 1000))
    cls.ws.create_workspace({'workspace': workspace_name})
    cls.wsName = workspace_name
def __init__(self, config):
    """Store configuration values and build the service clients.

    :param config: mapping providing ``scratch``, ``ctx``,
        ``SDK_CALLBACK_URL`` and ``workspace-url``
    """
    self.scratch = config["scratch"]
    self.ctx = config['ctx']
    callback_url = config["SDK_CALLBACK_URL"]
    self.callback_url = callback_url

    # Workspace client first, then the callback-served clients.
    self.ws_client = workspaceService(config["workspace-url"])
    self.gfu = GenomeFileUtil(callback_url)
    self.au = AssemblyUtil(callback_url)
    self.kbr = KBaseReport(callback_url)
    self.dfu = DataFileUtil(callback_url)
    self.genome_api = GenomeAnnotationAPI(callback_url)

    # State that starts out unset or empty.
    self.sso_ref = None
    self.sso_event = None
    self.ec_to_sso = {}
    self.output_workspace = None
def load_genome_features_prepare_fasta(self, genome_refs,
                                       compliant_fasta_dir):
    """Write one protein FASTA per genome and run orthomclAdjustFasta on it.

    Each genome is resolved to ``wsid/objid/version`` and loaded with
    ``get_combined_data``. Every CDS that has a protein entry with a
    ``protein_amino_acid_sequence`` is written to ``<scratch>/<n>.fasta``
    (``n`` is the 1-based genome position) using its 1-based position in
    the CDS id list as record id. The function text comes from the
    protein's ``protein_function`` and falls back to the CDS's
    ``feature_function`` when that is missing or empty.

    :param genome_refs: iterable of genome object references
    :param compliant_fasta_dir: directory to create and run the adjuster in
    :return: dict keyed by ``"<genome n>|<feature n>"`` with ``fid``,
        ``fpos``, ``gref`` and ``func`` for each written protein
    """
    feature_info = {}
    os.makedirs(compliant_fasta_dir)

    for genome_pos, genome_ref in enumerate(genome_refs):
        # ---- Genome loading ----
        self.log_line("Loading Genome object from workspace for ref [" +
                      genome_ref + "]")
        info = self.ws.get_object_info_new(
            {"objects": [{"ref": genome_ref}]})[0]
        # Pin the reference to an explicit wsid/objid/version.
        genome_ref = "/".join(str(info[i]) for i in (6, 0, 4))

        gaapi = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'],
                                    token=self.token)
        combined = gaapi.get_combined_data({
            "ref": genome_ref,
            "exclude_genes": 1,
            "exclude_summary": 1,
        })
        cds_map = combined["feature_by_id_by_type"][combined["cds_type"]]
        protein_map = combined["protein_by_cds_id"]
        cds_ids = list(cds_map.keys())

        # ---- Features + Fasta ----
        self.log_line("Preparing fasta file for ref [" + genome_ref + "]")
        genome_id = str(genome_pos + 1)
        records = []
        for feature_pos, feature_id in enumerate(cds_ids):
            cds = cds_map[feature_id]
            if feature_id not in protein_map:
                continue
            protein = protein_map[feature_id]
            if "protein_amino_acid_sequence" not in protein:
                continue

            record_id = str(feature_pos + 1)
            records.append(
                SeqRecord(Seq(protein["protein_amino_acid_sequence"]),
                          id=record_id,
                          description=""))

            # Prefer the protein's function; fall back to the CDS's.
            func = protein.get("protein_function")
            if not func and "feature_function" in cds:
                func = cds["feature_function"]
            feature_info[genome_id + "|" + record_id] = {
                "fid": feature_id,
                "fpos": feature_pos,
                "gref": genome_ref,
                "func": func,
            }
        fasta_file = self.scratch + "/" + genome_id + ".fasta"
        SeqIO.write(records, fasta_file, "fasta")

        # ---- Adjusting Fasta by Orthomcl ----
        self.log_line("Running orthomclAdjustFasta for ref [" + genome_ref +
                      "]")
        adjust_cmd = [
            "perl", self.plbin + "/orthomclAdjustFasta", genome_id,
            fasta_file, "1"
        ]
        self.log_process(
            subprocess.Popen(adjust_cmd,
                             cwd=compliant_fasta_dir,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE))
    return feature_info
def setUpClass(cls):
    """Create the shared context, clients, workspace and test genomes.

    Loads the ``GenomeSearchUtil`` config section, resolves the user for
    the token, builds the method context and service implementation,
    creates a test workspace and loads three genomes through
    ``cls.load_genome_direct``.
    """
    token = environ.get('KB_AUTH_TOKEN', None)

    # Load the module's section of the deployment config into a plain dict.
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    parser = ConfigParser()
    parser.read(config_file)
    for key, value in parser.items('GenomeSearchUtil'):
        cls.cfg[key] = value

    # Resolve the user name behind the token.
    auth_url = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    user_id = _KBaseAuth(auth_url).get_user(token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    provenance_entry = {
        'service': 'GenomeSearchUtil',
        'method': 'please_never_use_it_in_production',
        'method_params': [],
    }
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [provenance_entry],
        'authenticated': 1,
    })

    # The index directory is pointed at scratch and debug is switched on.
    cls.cfg['genome-index-dir'] = cls.cfg['scratch']
    cls.cfg['debug'] = "1"
    cls.scratch = cls.cfg['scratch']
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = GenomeSearchUtil(cls.cfg)

    cls.wsName = "test_SaveGenomeTest_" + str(int(time.time() * 1000))
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.ga_client = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])

    # Test genomes shared by the test methods.
    cls.banno_ref = cls.load_genome_direct(
        'data/b.anno.2.genome.json',
        'b.anno.2',
        contigset_filename='data/b.anno.2.contigs.json')
    cls.rhodo_ref = cls.load_genome_direct(
        'data/rhodobacter.json',
        'rhodobacter',
        contigset_filename='data/rhodobacter_contigs.json')
    cls.eco_ref = cls.load_genome_direct(
        'data/new_ecoli_genome.json',
        'ecoli',
        'data/e_coli_assembly.fasta',
        gtype="KBaseGenomes.Genome")
def setUpClass(cls):
    """Create the shared context, clients and workspace, then load data.

    Loads the ``kb_functional_enrichment_1`` config section, resolves the
    user for the token, builds the method context, the service
    implementation and helper clients, creates a test workspace and calls
    ``cls.prepare_data()``.
    """
    token = environ.get('KB_AUTH_TOKEN', None)

    # Load the module's section of the deployment config into a plain dict.
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    parser = ConfigParser()
    parser.read(config_file)
    for key, value in parser.items('kb_functional_enrichment_1'):
        cls.cfg[key] = value

    # Getting username from Auth profile for token
    auth_url = cls.cfg['auth-service-url']
    user_id = _KBaseAuth(auth_url).get_user(token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    provenance_entry = {
        'service': 'kb_functional_enrichment_1',
        'method': 'please_never_use_it_in_production',
        'method_params': [],
    }
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [provenance_entry],
        'authenticated': 1,
    })

    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_functional_enrichment_1(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']

    # Helper clients used by the tests.
    cls.fe1_runner = FunctionalEnrichmentUtil(cls.cfg)
    cls.dfu = DataFileUtil(cls.callback_url)
    cls.gaa = GenomeAnnotationAPI(cls.callback_url)
    cls.ws = Workspace(cls.wsURL, token=token)

    cls.wsName = "test_kb_functional_enrichment_1_" + str(
        int(time.time() * 1000))
    cls.wsClient.create_workspace({'workspace': cls.wsName})
    cls.prepare_data()
def setUpClass(cls):
    """Create the shared context, clients and workspace, then load data.

    Loads the ``PanGenomeAPI`` config section, resolves the user for the
    token, builds the method context and service implementation, empties
    the scratch directory by removing and re-creating it, creates a test
    workspace and calls ``cls.prepare_data()``.
    """
    token = environ.get('KB_AUTH_TOKEN', None)

    # Load the module's section of the deployment config into a plain dict.
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    parser = ConfigParser()
    parser.read(config_file)
    for key, value in parser.items('PanGenomeAPI'):
        cls.cfg[key] = value

    # Getting username from Auth profile for token
    auth_url = cls.cfg['auth-service-url']
    user_id = _KBaseAuth(auth_url).get_user(token)

    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    provenance_entry = {
        'service': 'PanGenomeAPI',
        'method': 'please_never_use_it_in_production',
        'method_params': [],
    }
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [provenance_entry],
        'authenticated': 1,
    })

    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = PanGenomeAPI(cls.cfg)

    # Start from an empty scratch directory.
    cls.scratch = cls.cfg['scratch']
    shutil.rmtree(cls.scratch)
    os.mkdir(cls.scratch)

    cls.callback_url = os.environ['SDK_CALLBACK_URL']

    workspace_name = "test_pangenome_api_" + str(int(time.time() * 1000))
    cls.ws_info = cls.wsClient.create_workspace(
        {'workspace': workspace_name})

    cls.gcs = GenomeComparisonSDK(cls.callback_url)
    cls.gaa = GenomeAnnotationAPI(cls.callback_url)
    cls.prepare_data()
def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
    """
    Generate the cummerbund plot set and the differential expression
    statistics for a cuffdiff object and save both to the workspace.

    The cuffdiff object is read from the workspace, the functions of the
    referenced genome's features are loaded, every R plot script in the
    plot list is run, and a ``KBaseRNASeq.cummerbund_output`` object is
    saved. ``gene_exp.diff`` is then converted and saved as a
    ``KBaseRNASeq.DifferentialExpressionStat`` object.

    If the workspace returns no objects, or the cuffdiff data cannot be
    extracted (``extract_cuffdiff_data`` returns ``False``), the method
    returns early without saving anything.

    :param cummerbundstatParams: instance of type "cummerbundstatParams"
       -> structure: parameter "workspace" of String, parameter
       "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
       type "ws_diffstat_output" (Differential stat workspace id)
    :returns: instance of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plot2
    params = cummerbundstatParams
    returnVal = params['ws_cummerbund_output']

    # Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Read the input cuffdiff workspace object json file and get filehandle
    # for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace']
    }])

    # Check if workspace has data. This must happen before s_res[0] is
    # touched, otherwise an empty result raises IndexError instead.
    # NOTE(review): the early returns hand back the bare value while the
    # normal path returns [returnVal]; kept as in the original - confirm
    # which shape callers expect.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    # Single-argument print(...) behaves the same as the print statement
    # this Python 2 module uses elsewhere.
    print("Getting genome info")
    genome_ref = s_res[0]['data']['genome_id']
    print(genome_ref)

    gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
    genome = gaapi.get_genome_v1({
        "genomes": [{"ref": genome_ref}],
        "included_fields": ["scientific_name"],
        "included_feature_fields": ["id", "function", "type"]
    })["genomes"][0]["data"]

    # Map feature id -> function; missing or empty functions are 'Unknown'.
    genome_dict = {}
    for feature in genome['features']:
        genome_dict[feature['id']] = feature.get('function') or 'Unknown'

    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    # Test for failure before building the log message: concatenating
    # False to a string raises TypeError.
    if cuffdiff_dir is False:
        return returnVal
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    # Run R script to run cummerbund json and update the cummerbund output
    # json file

    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [
        {'file': "dispersionplot.R",
         'title': "Dispersion plot",
         'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},
        {'file': "fpkmscvplot.R",
         'title': "Genes CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},
        {'file': "isoformscvplot.R",
         'title': "Isoform CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},
        {'file': "densityplot.R",
         'title': "Density plot",
         'description': "The density plot shows the distribution of FPKM scores across samples"},
        {'file': "csdensityrepplot.R",
         'title': "Replicates density plot",
         'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},
        {'file': "boxplot.R",
         'title': "Box plots",
         'description': "The box plots show the FPKM distribution across samples."},
        {'file': "boxrepplot.R",
         'title': "Box plots of replicates",
         'description': "The box plots of replicates show the FPKM distribution across sample replicates."},
        {'file': "pairwisescatterplots.R",
         'title': "Pairwise scatter plots",
         'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},
        {'file': "volcanomatrixplot.R",
         'title': "Volcano matrix plots",
         'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},
        {'file': "pcaplot.R",
         'title': "PCA plot",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},
        {'file': "pcarepplot.R",
         'title': "PCA plot including replicates",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},
        {'file': "mdsplot.R",
         'title': "Multi-dimensional scaling plot",
         'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},
        {'file': "mdsrepplot.R",
         'title': "Multi-dimensional scaling plot including replicates",
         'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
    ]

    # Iterate through the plotlist and generate the images and json files.
    for plot in plotlist:
        status = script_util2.rplotandupload(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
            self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset,
            plot['title'], plot['description'], cuffdiff_dir)
        # Equality (not truthiness) is kept on purpose so that a None
        # status is still not reported as a failure.
        if status == False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset

    # TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

    ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]}]
    })

    # Convert gene_exp.diff into the stat JSON and save it as well.
    infile = join(cuffdiff_dir, "gene_exp.diff")
    outfile = join(cuffdiff_dir, "gene_exp_diff.out")
    v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
    with open(outfile) as f:
        statdata = json.load(f)
    ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.DifferentialExpressionStat",
            "data": statdata,
            "name": params["ws_diffstat_output"]}]
    })
    #END generate_cummerbund_plot2

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plot2 return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def get_enhancedFEM(self, params, tok):
    """
    implements get_enhancedFilteredExpressionMatrix() method

    Builds an "enhanced" filtered expression matrix: one row per row id of
    the input matrix, with the columns description, fold change, q value
    and the values returned by ``self.get_matrix_stats`` for that row.

    :param params: dict that must contain ``fem_object_ref``
    :param tok: auth token used for the Workspace and GenomeAnnotationAPI
        clients
    :return: dict with ``genome_ref``, ``scale`` and ``type`` copied from
        the input matrix and a ``data`` dict holding ``col_ids``,
        ``column_labels``, ``row_ids`` and ``values``
    :raises ValueError: if ``fem_object_ref`` is not in ``params``
    :raises Exception: if the input matrix has a different number of row
        ids and value rows
    """
    self.ws_client = Workspace(self.ws_url, token=tok)

    if 'fem_object_ref' not in params:
        raise ValueError(
            "fem_object_ref parameter not given to get_enhancedFilteredExpressionMatrix"
        )
    fem_object_ref = params['fem_object_ref']

    # The object's provenance is not needed here; the original indexed
    # provenance[0] into an unused local, which raised on objects whose
    # provenance list is empty.
    fem_obj_ret = self.ws_client.get_objects2(
        {'objects': [{'ref': fem_object_ref}]})['data'][0]
    fem = fem_obj_ret.get('data')

    # create the enhanced FEM, starting with the FEM
    efem = {k: fem.get(k) for k in ('genome_ref', 'scale', 'type')}
    efem['data'] = {
        'col_ids': [
            "description", "fold-change", "q-value", "min", "max", "mean",
            "std_dev", "is_missing_values"
        ],
        'column_labels': [
            "Description", "Fold change", "Q value", "Min. expression",
            "Max. expression", "Mean expression", "Std. dev.",
            "Missing values?"
        ],
    }

    fm = fem.get('data')
    row_ids = fm.get('row_ids')
    fvals = fm.get('values')
    efem['data']['row_ids'] = row_ids
    efem['data']['values'] = []

    n_efem_rows = len(row_ids)
    if len(fvals) != n_efem_rows:
        raise Exception(
            "length discrepancy in filtered expression matrix: {0} row_ids but {1} values"
            .format(n_efem_rows, len(fvals)))

    # Get genome object and feature descriptions as a handy feature-indexed
    # dict (moved from constructor)
    gaa = GenomeAnnotationAPI(self.serviceWizardURL, token=tok)
    feat_dict = gaa.get_feature_functions({
        'ref': fem.get('genome_ref'),
        'feature_id_list': None
    })

    # If this FEM carries a "diff_expr_matrix_ref", load that differential
    # expression matrix and convert it into a feature (=row_id) indexed
    # dict from which the FC and q values are taken.
    dem_ref = fem.get('diff_expr_matrix_ref')
    if dem_ref:
        dem_obj_ret = self.ws_client.get_objects2(
            {'objects': [{'ref': dem_ref}]})['data'][0]
        dem = dem_obj_ret.get('data')
        # convert to dictionary for quick lookups
        dem_dict = self.convert_dem_to_dict(dem.get('data'))
    else:
        dem_dict = {}

    # for each row
    for row_id, fm_val_row in zip(row_ids, fvals):
        # make a new row with NA for description, FC and q
        new_values_row = ['NA', 'NA', 'NA'] + self.get_matrix_stats(
            fm_val_row)

        # if we have a description for this feature (row_id) put it in the
        # first column; leave as 'NA' if no entry in feat_dict
        desc = feat_dict.get(row_id)
        if desc:
            new_values_row[0] = desc

        # if we have a DEM entry for this row, put FC and q into 2nd and
        # 3rd columns
        d = dem_dict.get(row_id)
        if d:
            new_values_row[1], new_values_row[2] = d

        # finally, add this row to the eFEM
        efem['data']['values'].append(new_values_row)

    return efem
class ProkkaUtils:
    """Helpers that run Prokka on an assembly or genome and save the result.

    The two entry points are annotate_assembly() (build a new Genome object
    from an Assembly) and annotate_genome() (re-annotate the features of an
    existing Genome).  Everything else is a step used by one or both of them.
    """

    def __init__(self, config):
        """Create the service clients from the module configuration.

        :param config: dict providing "scratch", "ctx", "SDK_CALLBACK_URL" and
                       "workspace-url"
        """
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list, or None for an empty/missing list
        """
        return qualifier[0] if qualifier else None

    @staticmethod
    def _mean_protein_length(prot_lengths):
        """Return the mean of prot_lengths truncated to an int (0 if empty)."""
        if not prot_lengths:
            return 0
        return int(sum(prot_lengths) / float(len(prot_lengths)))

    @staticmethod
    def _module_version_and_timestamp():
        """Return (module version from kbase.yml, current local timestamp string)."""
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)
        return version, time_string

    def _build_sso_terms(self, sso_items, evidence):
        """Map each SSO term in sso_items to its ontology-term record, keyed by term id."""
        return {
            sso_item["id"]: {"id": sso_item["id"],
                             "evidence": [evidence],
                             "term_name": sso_item["name"],
                             "ontology_ref": self.sso_ref,
                             "term_lineage": []}
            for sso_item in sso_items
        }

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        Also dumps the ontology to /kb/module/work/seed_so.json.

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects(
            [{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for term in sso["term_hash"].values():
            sso_name = term["name"]
            # Term names that carry an EC number end with "(EC x.x.x.x)"
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4: -1].strip()
                self.ec_to_sso.setdefault(ec, []).append(term)
        print("EC found in SSO: " + str(len(self.ec_to_sso)))

        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a namedtuple containing gc_content and dna_size
        :raises ValueError: if the assembly has 30,000 or more contigs
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))

        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            # Metadata does not say; count the contigs in the object itself
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])

        if n_contigs >= 30000:
            message = (
                "Hmmm. There are over 30,000 contigs in this Assembly.\n"
                "It looks like you are trying to run Prokka on a metagenome or "
                "non-prokaryotic data set.\n"
                "If this is a metagenome data set we recommend using an App like "
                "MaxBin to first bin the contigs into genome-like bins.\n"
                "These bins can then be individually annotated as a single genome "
                "using Prokka.\n"
                "If this data comes from a Eukaryotic sample, KBase does not "
                "currently have an annotation app designed for Eukaryotes.\n"
                "Alternatively, you can try reducing the number of contigs using "
                "a filter app."
            )
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath: path of the fasta file to rename
        :return: The path to the fasta file with renamed contigs the number of contigs,
                 the mapping from old ids to new ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            # The old id is kept in the description so it stays visible in the file
            record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter,
                                new_ids_to_old, records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        :raises ValueError: if the unsupported "gram" parameter is set
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses
        #                (default "Bacteria")
        kingdom = params.get("kingdom") or "Bacteria"

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir,
                           "--prefix", "mygenome", "--kingdom", kingdom]

        # --genus [X]  Genus name (triggers to use --usegenus)
        if params.get("genus"):
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]  Genetic code / Translation table (set if --kingdom is set)
        #              (default "0")
        if params.get("gcode"):
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]  Gram: -/neg +/pos (default "")
        if params.get("gram"):
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome  Improve gene predictions for highly fragmented genomes (default OFF)
        # --rawproduct  Do not clean up /product annotation (default OFF)
        # --fast        Fast mode - skip CDS /product searching (default OFF)
        for flag in ("metagenome", "rawproduct", "fast"):
            if params.get(flag) == 1:
                prokka_cmd_list.append("--" + flag)
        # --mincontiglen [N]  Minimum contig size [NCBI needs 200] (default "1")
        # --evalue [n.n]      Similarity e-value cut-off (default "1e-06")
        for option in ("mincontiglen", "evalue"):
            if params.get(option):
                prokka_cmd_list.extend(["--" + option, str(params[option])])
        # --rfam    Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        # --norrna  Don't run rRNA search (default OFF)
        # --notrna  Don't run tRNA search (default OFF)
        for flag in ("rfam", "norrna", "notrna"):
            if params.get(flag) == 1:
                prokka_cmd_list.append("--" + flag)

        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            # Best effort: the failure is logged here and surfaces later when
            # the expected output files turn out to be missing.
            pprint(e)
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir: directory returned by run_prokka
        :return: Sequences from the .faa .ffn files and the gff_filepath
        :raises ValueError: if the GFF output file does not exist
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)

        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)

        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object

        :param prokka_parse_parameters: gff_filepath, cds_to_dna, cds_to_prot and
                                        new_ids_to_old mappings
        :return: Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as gff_handle:
            for rec in GFF.parse(gff_handle):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    # Convert the parsed start to a 1-based inclusive coordinate
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    # Location starts at the high coordinate on the minus strand
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    # hashlib needs bytes; encoding keeps this working on Python 3
                    md5 = hashlib.md5(dna.encode("utf-8")).hexdigest()
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna),
                               }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_terms = self._build_sso_terms(self.ec_to_sso[ec], evidence)
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id, "location": location, "md5": md5,
                               "parent_gene": fid, "parent_mrna": mrna_id,
                               "function": (product if product else ""),
                               "ontology_terms": {}, "protein_translation": prot,
                               "protein_translation_length": prot_len, "aliases": aliases}
                        mrna = {"id": mrna_id, "location": location, "md5": md5,
                                "parent_gene": fid, "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + \
                  str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + \
                  str(genes_with_sso) + "\n"
        # Reported as 0 (instead of dividing by zero) when no protein was found
        report += "Average protein length: " + \
                  str(self._mean_protein_length(prot_lengths)) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly",
                                        "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """Collect the function and SSO terms Prokka assigned to each GFF record.

        :param gff_filepath: path of the GFF file produced by Prokka
        :return: A dictionary keyed by record id; each value holds "id" and, when
                 present in the GFF, "function" and "ontology_terms"
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as gff_handle:
            for rec in GFF.parse(gff_handle):
                gid = rec.id
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        sso_terms = {}
                        for ec in qualifiers["eC_number"]:
                            sso_terms.update(
                                self._build_sso_terms(self.ec_to_sso.get(ec, []), evidence))
                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """Write the DNA sequence of every genome feature to a fasta file in scratch.

        :param genome_data: genome object with the features under ["data"]["features"]
        :return: path of the fasta file
        :raises Exception: if no feature had both an id and a dna_sequence
        """
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        with open(fasta_for_prokka_filepath, "w") as fasta_file:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    fasta_file.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """Build the ontology event describing this Prokka run.

        :return: Ontology_event to be appended to the list of genome ontology events
        """
        version, time_string = self._module_version_and_timestamp()
        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """Build the evidence record attached to every ontology term Prokka assigns.

        :return: dict with the method name, module version and timestamp
        """
        version, time_string = self._module_version_and_timestamp()
        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """Append this run's SSO ontology event to the genome.

        :param genome_data: genome object, modified in place
        :return: namedtuple of the genome_data and the index of the new event in
                 its 'ontology_events' list
        """
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index += len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified',
                                         'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        """Store the full term records of new_ontology under feature's SSO terms.

        :param feature: genome feature, modified in place
        :param new_ontology: dict of term key -> term record
        :return: the feature
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        """Record, per SSO term id, the index of the ontology event that produced it.

        :param feature: genome feature, modified in place
        :param new_ontology: dict of term key -> term record (each with an "id")
        :param ontology_event_index: index into the genome's ontology_events list
        :return: the feature
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """Merge Prokka's annotations into a genome, write two reports and save it.

        :param annotation_args: genome_data, new_annotations from prokka, and the
                                output_genome_name
        :return: namedtuple of the saved genome_ref, the two report file paths and
                 the stats dict
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        # Genomes carrying 'feature_counts' use the event-index ontology layout
        new_genome = 'feature_counts' in genome_data['data']

        genome_obj_modified = self.create_genome_ontology_fields(genome_data)
        genome_data = genome_obj_modified.genome_data
        ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]),
                 "new_functions": 0, "found_functions": 0, "new_ontologies": 0}

        # Each path variable now names the file it actually points at (the two
        # were swapped before, so the returned fields were mislabeled).
        function_report_filepath = os.path.join(self.scratch, "function_report")
        ontology_report_filepath = os.path.join(self.scratch, "ontology_report")

        with open(function_report_filepath, "w") as func_r, \
                open(ontology_report_filepath, "w") as onto_r:
            func_r.write("function_id current_function new_function\n")
            onto_r.write("function_id current_ontology new_ontology\n")

            for i, feature in enumerate(genome_data["data"]["features"]):
                fid = feature["id"]
                current_function = feature.get("function", "")
                current_functions = feature.get("functions", [])
                current_ontology = feature.get("ontology_terms", None)
                new_function = ""
                new_ontology = dict()

                if fid in new_annotations:
                    # Set Function
                    new_function = new_annotations[fid].get("function", "")
                    if new_function and "hypothetical protein" not in new_function:
                        if (new_function != current_function and
                                new_function not in current_functions):
                            stats['new_functions'] += 1
                        genome_data["data"]["features"][i]["function"] = new_function
                        genome_data["data"]["features"][i]["functions"] = [new_function]
                        stats['found_functions'] += 1

                    # Set Ontologies
                    new_ontology = new_annotations[fid].get("ontology_terms", None)
                    if new_ontology:
                        stats['new_ontologies'] += 1
                        if new_genome:
                            genome_data["data"]["features"][i] = self.new_genome_ontologies(
                                feature, new_ontology, ontology_event_index)
                        else:
                            genome_data["data"]["features"][i] = self.old_genome_ontologies(
                                feature, new_ontology)

                if current_function:
                    func_r.write(
                        json.dumps([fid, [current_function], [new_function]]) + "\n")
                else:
                    func_r.write(
                        json.dumps([fid, current_functions, [new_function]]) + "\n")

                onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple(
            "annotated_genome",
            "genome_ref function_report_filepath ontology_report_filepath stats")

        return annotated_genome(genome_ref, function_report_filepath,
                                ontology_report_filepath, stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """ Upload a file to shock

        :param filepath: File to upload
        :param message: Optional Upload Message
        :return: file-link dict (shock_id, name, label, description)
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_report_filepath),
                      self.upload_file(genome.function_report_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"],
                                   stats["new_functions"], stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        genome_data = self.genome_api.get_genome_v1(
            {"genomes": [{"ref": genome_ref}], 'downgrade': 0})["genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps include to download the assembly as a fasta
        file, rename the contigs, run prokka against the contigs, parse the results, and finally,
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        # Element 10 of the info tuple is passed as the assembly metadata dict
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results (cds_to_prot used to be given the DNA mapping by mistake)
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = params.get('scientific_name') or 'Unknown'
        domain = params.get('kingdom') or "Bacteria"
        gcode = params.get('gcode') or 0

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
    """
    Generate the cummeRbund plot set and the differential expression statistics
    for a cuffdiff result, and save both to the workspace.

    :param cummerbundstatParams: instance of type "cummerbundstatParams" ->
       structure: parameter "workspace" of String, parameter
       "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
       type "ws_diffstat_output" (Differential stat workspace id)
    :returns: instance of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plot2

    params = cummerbundstatParams
    returnVal = params['ws_cummerbund_output']

    #Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace']
    }])

    # Check if workspace has data.  This must happen before s_res[0] is read;
    # previously the check came after the first index and could never trigger.
    # NOTE(review): the early exits return the bare name while the normal exit
    # returns a one-element list - confirm which shape callers expect.
    if not s_res:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    print("Getting genome info")

    genome_ref = s_res[0]['data']['genome_id']
    print(genome_ref)

    gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
    genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                  "included_fields": ["scientific_name"],
                                  "included_feature_fields": ["id", "function", "type"
                                                              ]})["genomes"][0]["data"]

    # feature id -> function text; missing or empty functions become 'Unknown'
    genome_dict = {}
    for feature in genome['features']:
        genome_dict[feature['id']] = feature.get('function') or 'Unknown'

    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [
        {'file': "dispersionplot.R",
         'title': "Dispersion plot",
         'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

        {'file': "fpkmscvplot.R",
         'title': "Genes CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

        {'file': "isoformscvplot.R",
         'title': "Isoform CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

        {'file': "densityplot.R",
         'title': "Density plot",
         'description': "The density plot shows the distribution of FPKM scores across samples"},

        {'file': "csdensityrepplot.R",
         'title': "Replicates density plot",
         'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

        {'file': "boxplot.R",
         'title': "Box plots",
         'description': "The box plots show the FPKM distribution across samples."},

        {'file': "boxrepplot.R",
         'title': "Box plots of replicates",
         'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

        {'file': "pairwisescatterplots.R",
         'title': "Pairwise scatter plots",
         'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

        {'file': "volcanomatrixplot.R",
         'title': "Volcano matrix plots",
         'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

        {'file': "pcaplot.R",
         'title': "PCA plot",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},

        {'file': "pcarepplot.R",
         'title': "PCA plot including replicates",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},

        {'file': "mdsplot.R",
         'title': "Multi-dimensional scaling plot",
         'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},

        {'file': "mdsrepplot.R",
         'title': "Multi-dimensional scaling plot including replicates",
         'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
    ]

    # Iterate through the plotlist and generate the images and json files.
    # A failed plot is logged and skipped; the remaining plots are still attempted.
    for plot in plotlist:
        status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                                             plot['file'], self.__SHOCK_URL, self.__HS_URL,
                                             user_token, cummerbundplotset, plot['title'],
                                             plot['description'], cuffdiff_dir)
        if status == False:
            self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset

    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

    ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]}]
    })

    # Convert cuffdiff's gene_exp.diff into the JSON stat file, then save it
    infile = join(cuffdiff_dir, "gene_exp.diff")
    outfile = join(cuffdiff_dir, "gene_exp_diff.out")
    v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
    with open(outfile) as f:
        statdata = json.load(f)
    ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.DifferentialExpressionStat",
            "data": statdata,
            "name": params["ws_diffstat_output"]}]
    })

    #END generate_cummerbund_plot2

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plot2 return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
class vConTACT:
    '''
    Module Name:
    vConTACT

    Module Description:
    A KBase module: vConTACT
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/bolduc/vcontact"
    GIT_COMMIT_HASH = "ff92f754f02d757aa925d2327fc8ef2bf0af4b07"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        """Keep the module configuration for later use by the service methods."""
        #BEGIN_CONSTRUCTOR
        self.config = config
        #END_CONSTRUCTOR

    def run_vcontact(self, ctx, params):
        """
        Fetch the genome, write the vConTACT input files, run vConTACT and
        create a report.  The paths of the generated input files are stored
        back into params under 'gene2genome' and 'sequences'.

        :param params: instance of type "InParams" -> structure: parameter
           "genome" of type "obj_ref" (Insert your typespec information here.)
        """
        # ctx is the context object
        #BEGIN run_vcontact
        callback_url = os.environ['SDK_CALLBACK_URL']
        self.callback_url = callback_url

        utils = vConTACTUtils(self.config)
        self.genome_api = GenomeAnnotationAPI(callback_url)

        # Pull the genome object referenced by the caller
        request = {"genomes": [{"ref": params['genome']}]}
        genome_data = self.genome_api.get_genome_v1(request)

        # Turn the genome into the two vConTACT inputs and write them to disk
        gene2genome, sequences = utils.genome_to_inputs(genome_data)
        params['gene2genome'], params['sequences'] = utils.write_inputs(
            gene2genome, sequences)

        utils.run_vcontact(params)
        utils.vcontact_help()

        KBObjectUtils(self.config).create_report(params['workspace_name'])
        #END run_vcontact

    def status(self, ctx):
        """Return a one-element list holding the module's state and version info."""
        #BEGIN_STATUS
        service_info = {}
        service_info['state'] = "OK"
        service_info['message'] = ""
        service_info['version'] = self.VERSION
        service_info['git_url'] = self.GIT_URL
        service_info['git_commit_hash'] = self.GIT_COMMIT_HASH
        #END_STATUS
        return [service_info]
def build_gff_file(self, getGenomeOptions, output_dir, output_filename, is_gtf):
    """Write the genes, mRNAs, exons and CDSs of a genome to a GFF/GTF file.

    The genome is fetched through ``GenomeAnnotationAPI.get_genome_v1``.
    Every entry of ``features`` is treated as a gene, ``cdss`` entries as CDSs
    and ``mrnas`` entries as mRNAs. A CDS whose parent gene or parent mRNA is
    missing gets a synthetic one derived from the CDS itself.

    :param getGenomeOptions: options dict for ``get_genome_v1``. It is copied,
        so the caller's dict is left untouched; field filters are removed and
        ``no_metadata`` is set on the copy.
    :param output_dir: directory the file is created in
    :param output_filename: file name without extension
    :param is_gtf: truthy to write GTF (only exon and CDS lines, ``.gtf``),
        falsy to write GFF (gene, mRNA, exon and CDS lines, ``.gff``)
    :return: None
    :raises ValueError: if anything fails while building or writing the file
    """
    # first get subdata needed; forget about the metadata.
    # Work on a shallow copy so the caller's options are not modified.
    options = dict(getGenomeOptions)
    options['no_metadata'] = 1
    options.pop('included_fields', None)
    options.pop('included_feature_fields', None)

    api = GenomeAnnotationAPI(self.cfg.callbackURL)
    genome_data = api.get_genome_v1(options)['genomes'][0]['data']

    # create the file
    try:
        file_ext = ".gtf" if is_gtf else ".gff"
        out_file_path = os.path.join(output_dir, output_filename + file_ext)
        print('Creating file: ' + str(out_file_path))

        # Flatten the three feature collections into one uniform list.
        features = []
        for f in genome_data.get('features', []):
            features.append({
                'id': f['id'],
                'type': f['type'],
                'location': f['location']
            })
        for f in genome_data.get('cdss', []):
            # Parents are optional: the linking step below synthesises
            # whatever is missing, so absent keys must not raise here.
            features.append({
                'id': f['id'],
                'type': 'CDS',
                'location': f['location'],
                'parent_gene': f.get('parent_gene'),
                'parent_mrna': f.get('parent_mrna')
            })
        for f in genome_data.get('mrnas', []):
            features.append({
                'id': f['id'],
                'type': 'mRNA',
                'location': f['location'],
                'parent_gene': f.get('parent_gene')
            })

        mrna_map = {}  # mrna_id -> <mRNA>
        gene_map = {}  # gene_id -> <gene>
        # gene is {'id': <>, 'location': [[contig,start,strand,len], ...],
        #          'mrna_cds_pairs': [[<mRNA>, <CDS>], ...]}
        for f in features:
            if f['type'] == 'mRNA':
                mrna_map[f['id']] = f
            elif f['type'] != 'CDS':
                # Anything that is neither mRNA nor CDS is handled as a gene.
                gene_map[f['id']] = f

        # Now let's go over CDSs and attach each one to a gene and an mRNA.
        for f in features:
            if f['type'] != 'CDS':
                continue

            gene_id = f.get('parent_gene')
            gene = gene_map.get(gene_id) if gene_id else None
            rename_cds = False
            if gene is None:
                if gene_id is None:
                    # The synthetic gene takes over the CDS id, so the CDS
                    # itself is renamed further down to keep ids distinct.
                    gene_id = f['id']
                    rename_cds = True
                gene = {
                    'id': gene_id,
                    'location': self.get_common_location(f['location'])
                }
                gene_map[gene_id] = gene

            mrna_id = f.get('parent_mrna')
            mrna = mrna_map.get(mrna_id) if mrna_id else None
            if mrna is None:
                if mrna_id is None:
                    mrna_id = f['id'] + '_mRNA'
                mrna = {'id': mrna_id, 'location': f['location']}
                mrna_map[mrna_id] = mrna

            if rename_cds:
                f['id'] = f['id'] + '_CDS'

            gene.setdefault('mrna_cds_pairs', []).append([mrna, f])

        # Let's sort genes by contigs, keeping contigs in first-seen order.
        contigs = []  # contig is {'id': <>, 'genes': []}
        contig_map = {}
        for gene_id in gene_map:
            gene = gene_map[gene_id]
            gene['start'] = self.get_start(gene['location'][0])
            contig_id = gene['location'][0][0]
            contig = contig_map.get(contig_id)
            if contig is None:
                contig = {'id': contig_id, 'genes': []}
                contig_map[contig_id] = contig
                contigs.append(contig)
            contig['genes'].append(gene)
        for contig in contigs:
            contig['genes'].sort(key=lambda gene: gene['start'])

        # write the file; the context manager closes the handle even when
        # one of the writers raises.
        with open(out_file_path, 'w') as output:
            exon_id_generation = 1
            for contig in contigs:
                contig_id = contig['id']
                for gene in contig['genes']:
                    gene_id = gene['id']
                    strand = gene['location'][0][2]
                    if not is_gtf:
                        self.write_gff_line(output, contig_id, 'gene',
                                            gene['start'],
                                            self.get_end(gene['location'][0]),
                                            strand, '.', gene_id, None)
                    if 'mrna_cds_pairs' not in gene:
                        continue
                    for mrna, cds in gene['mrna_cds_pairs']:
                        mrna_id = mrna['id']
                        mrna_loc = self.get_common_location(
                            mrna['location'])[0]
                        if not is_gtf:
                            self.write_gff_line(output, contig_id, 'mRNA',
                                                self.get_start(mrna_loc),
                                                self.get_end(mrna_loc),
                                                strand, '.', mrna_id, gene_id)

                        mrna_exons = self.get_location_as_sorted_exons(
                            mrna['location'], strand)
                        for exon in mrna_exons:
                            # The counter advances in GTF mode as well, even
                            # though only GFF lines carry the exon id.
                            exon_id = 'exon_' + str(exon_id_generation)
                            exon_id_generation += 1
                            if is_gtf:
                                self.write_gtf_line(output, contig_id, 'exon',
                                                    exon['start'], exon['end'],
                                                    strand, '.', gene_id,
                                                    mrna_id)
                            else:
                                self.write_gff_line(output, contig_id, 'exon',
                                                    exon['start'], exon['end'],
                                                    strand, '.', exon_id,
                                                    mrna_id)

                        cds_exons = self.get_location_as_sorted_exons(
                            cds['location'], strand)
                        cds_id = cds['id']
                        frame = 0
                        for exon in cds_exons:
                            f_start = exon['start']
                            f_end = exon['end']
                            f_length = f_end - f_start + 1
                            if is_gtf:
                                self.write_gtf_line(output, contig_id, 'CDS',
                                                    f_start, f_end, strand,
                                                    frame, gene_id, mrna_id)
                            else:
                                self.write_gff_line(output, contig_id, 'CDS',
                                                    f_start, f_end, strand,
                                                    frame, cds_id, mrna_id)
                            # Bases to skip at the start of the next segment
                            # to reach the first complete codon.
                            frame = (3 - ((f_length - frame) % 3)) % 3
    except Exception as e:
        raise ValueError("Failed to create file: {0}".format(e))
class vConTACTUtils:
    """Prepare inputs for vConTACT2, run it and build the KBase report."""

    # App parameter name -> vcontact2 command-line flag. Every listed
    # parameter is expected in the params dict handed to run_vcontact.
    _PARAM_FLAGS = {
        'gene2genome': '--proteins-fp',
        'sequences': '--raw-proteins',
        'db': '--db',
        'pcs_mode': '--pcs-mode',
        'vcs_mode': '--vcs-mode',
        'blast_evalue': '--blast-evalue',
        'pc_max_overlap': '--max-overlap',
        'pc_penalty': '--penalty',
        'pc_haircut': '--haircut',
        'pc_inflation': '--pc-inflation',
        'vc_inflation': '--vc-inflation',
        'vc_density': '--min-density',
        'vc_min_size': '--min-size',
        'vc_max_overlap': '--vc-overlap',
        'vc_penalty': '--vc-penalty',
        'vc_haircut': '--vc-haircut',
        'merge_method': '--merge-method',
        'similarity': '--similarity',
        'seed_method': '--seed-method',
        'min_significance': '--sig',
        'max_significance': '--max-sig',
        'module_inflation': '--mod-inflation',
        'mod_significance': '--mod-sig',
        'module_min_shared': '--mod-shared-min',
        'link_significance': '--link-sig',
        'link_proportion': '--link-prop'
    }

    # Location of the reference databases inside the image; only listed
    # for diagnostic output.
    _DB_DIR = '/miniconda/lib/python3.7/site-packages/vcontact2/data/'

    def __init__(self, config):
        """Create the service clients.

        :param config: mapping providing 'scratch' and 'workspace-url'
        :raises KeyError: if a config key or the SDK_CALLBACK_URL /
            KB_AUTH_TOKEN environment variable is missing
        """
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def vcontact_help(self):
        """Run ``vcontact --help`` and log its output."""
        command = "vcontact --help"
        self._run_command(command)

    def execute(self, command: list):
        """
        :param command: Command suitable for running in subprocess, must use a ['ls', '-l'] format
        :return: Response from command
        :raises subprocess.CalledProcessError: if the command exits non-zero
        """
        # logger.info('Running command: {}'.format(command))
        print('Running command: {}'.format(' '.join(command)))
        res = subprocess.run(command, shell=False, encoding='utf-8', check=True)

        return res

    def run_vcontact(self, params):
        """Prepare the inputs for params['genome'], run vcontact2 and report.

        Assemblies are first run through prodigal to predict proteins;
        genomes are converted directly from their features.

        :param params: app parameters; 'genome' plus every key of
            ``_PARAM_FLAGS`` other than 'gene2genome' and 'sequences',
            which are filled in here
        :return: dict with 'report_name' and 'report_ref'
        :raises ValueError: for unsupported object types or a failing command
        """
        # Determine KBase "inputs" for vConTACT2
        genome = params['genome']

        obj_type = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][2]
        type_key = obj_type.lower()

        if 'assembly' in type_key:  # If KBaseGenomeAnnotations.Assembly
            # Assembly requires annotation
            genome_fp = self.au.get_assembly_as_fasta({'ref': genome})['path']
            proteins_fp = os.path.join(self.scratch, 'proteins.faa')
            proteins_gbk = os.path.join(self.scratch, 'proteins.gbk')
            gene2genome_fp = os.path.join(self.scratch, 'gene2genome.csv')

            prodigal_cmd = ['prodigal', '-a', proteins_fp, '-o', proteins_gbk, '-f', 'gbk',
                            '-i', genome_fp, '-p', 'meta']
            self.execute(prodigal_cmd)

            rows = []
            with open(proteins_fp, 'r') as proteins_fh:
                for record in SeqIO.parse(proteins_fh, 'fasta'):
                    rows.append({
                        'protein_id': record.id,
                        # The contig id is the record id minus its last
                        # '_'-separated token.
                        'contig_id': record.id.rsplit('_', 1)[0],
                        'keywords': 'None'
                    })

            pd.DataFrame(rows).to_csv(gene2genome_fp, index=False)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = proteins_fp

        elif 'kbasegenomes' in type_key:  # If KBaseGenomes.Genome
            genome_data = self.genome_api.get_genome_v1({"genomes": [{"ref": genome}]})

            # Convert genome data into "reasonable" parse form and write to scratch filesystem
            gene2genome, sequences = self.genome_to_inputs(genome_data)
            gene2genome_fp, sequences_fp = self.write_inputs(gene2genome, sequences)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = sequences_fp

        elif 'binnedcontigs' in type_key:  # If KBaseMetagenomes.BinnedContigs
            # Raise instead of exit(): terminating the interpreter from
            # library code would take the whole calling service down.
            raise ValueError(
                'KBaseMetagenomes.BinnedContigs hasnt been enabled. Check back later.')

        else:
            print('Unknown error in identifying object types')
            # Only continue when the caller already supplied the input files;
            # otherwise fail here rather than with a KeyError further down.
            if 'gene2genome' not in params or 'sequences' not in params:
                raise ValueError(
                    'Unsupported object type for vConTACT2: {}'.format(obj_type))

        # Diagnostic output only; a missing directory must not abort the run.
        if os.path.isdir(self._DB_DIR):
            print('Available database files')
            print(os.listdir(self._DB_DIR))

        self._run_command(self._build_command(params))

        report = self._generate_report(params)

        return report

    def _build_command(self, params):
        """Return the vcontact2 shell command line for *params*.

        Values are shell-quoted because the command is executed through a
        shell and the parameters originate from user input.

        :raises KeyError: if a parameter listed in ``_PARAM_FLAGS`` is missing
        """
        import shlex

        command = 'vcontact2 --output-dir outdir'
        # Binaries
        command += ' --diamond-bin /usr/local/bin/diamond --c1-bin /usr/local/bin/cluster_one-1.0.jar'

        for param, flag in self._PARAM_FLAGS.items():
            command += ' {} {}'.format(flag, shlex.quote(str(params[param])))

        return command

    def _run_command(self, command):
        """
        _run_command: run command through the shell and log its stdout

        :raises ValueError: if the command exits with a non-zero status
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def genome_to_inputs(self, genome):
        """
        genome_to_inputs: convert genome annotation data (~json) to file inputs required by vConTACT

        Features lacking an id, a DNA sequence or a protein translation are
        reported and skipped; of the rest only 'gene' features are used.

        :param genome: response of get_genome_v1; the first entry of
            'genomes' is read
        :return: (gene2genome, records) - an OrderedDict keyed by feature id
            with 'contig_id', 'protein_id' and 'keywords', and the list of
            protein SeqRecords
        """

        records = []
        gene2genome = OrderedDict()

        genome_data = genome['genomes'][0]

        for item in genome_data['data']['features']:
            if 'id' not in item:
                print('This feature does not have a valid id')
                continue
            if 'dna_sequence' not in item or 'protein_translation' not in item:
                print('This feature {} does not have a valid DNA sequence.'.format(item['id']))
                continue
            if item['type'] != 'gene':
                continue

            # Create FASTA file
            desc = item.get('functions') or item.get('function', '')
            if isinstance(desc, (list, tuple)):
                # NOTE(review): 'functions' looks list-valued in newer genome
                # objects - confirm; the FASTA description has to be a string.
                desc = '; '.join(str(entry) for entry in desc)

            gene_record = SeqRecord(Seq(item['protein_translation']), id=item['id'],
                                    description=desc)
            records.append(gene_record)

            # Build gene2genome
            gene2genome[item['id']] = {
                # 'contig_id': genome_data['data']['contig_ids'][0],
                'contig_id': item['location'][0][0],
                'protein_id': item['id'],
                # Fall back to the description when 'function' is absent
                # instead of failing with a KeyError.
                'keywords': item.get('function', desc)
            }

        return gene2genome, records

    def write_inputs(self, mapping, sequences):
        """Write the protein FASTA and gene-to-genome CSV into scratch.

        :param mapping: dict of row dicts with 'contig_id', 'protein_id'
            and 'keywords'
        :param sequences: sequence records accepted by ``SeqIO.write``
        :return: (gene2genome CSV path, protein FASTA path)
        """
        fasta_for_proteins_fp = os.path.join(self.scratch, 'vConTACT_proteins.fasta')

        with open(fasta_for_proteins_fp, 'w') as fasta_for_proteins_fh:
            SeqIO.write(sequences, fasta_for_proteins_fh, 'fasta')

        genes_to_genomes_mapping_fp = os.path.join(self.scratch, 'vConTACT_gene2genome.csv')

        # newline='' lets the csv module control line endings itself.
        with open(genes_to_genomes_mapping_fp, 'w', newline='') as genes_to_genomes_mapping_fh:
            fields = ['contig_id', 'protein_id', 'keywords']
            writer = csv.DictWriter(genes_to_genomes_mapping_fh, fieldnames=fields)
            writer.writeheader()
            writer.writerows(mapping.values())

        return genes_to_genomes_mapping_fp, fasta_for_proteins_fp

    def _generate_report(self, params):
        """
        _generate_report: generate summary report

        Reads outdir/genome_by_genome_overview.csv from the working
        directory, renders it as an HTML table, uploads it and creates an
        extended KBase report.

        This will contain ALL the logic to generate the report, including areas that should/will be re-factored later

        :param params: app parameters; 'workspace_name' is read
        :return: dict with 'report_name' and 'report_ref'
        """

        # Get
        self.dfu = dfu(self.callback_url)

        # Get filepath of summary file
        summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv')

        summary_df = pd.read_csv(summary_fp, header=0, index_col=0)
        # The embedded quote in `classes` injects an id attribute into the
        # generated <table> tag.
        html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(html_table=html)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(' style="text-align: right;"', '').replace('thead>',
                                                                                        'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        # Fall back to the footer-less page so the report is never written
        # empty when no </tbody> is found.
        final_html = direct_html
        tbody_len = len("</tbody>")
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = (direct_html[:start_pos + tbody_len] + '\n' + new_text +
                          direct_html[start_pos + tbody_len:])

        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        result_fp = os.path.join(output_dir, 'index.html')

        with open(result_fp, 'w') as result_fh:
            result_fh.write(final_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(result_fp),
            'label': os.path.basename(result_fp),
            'description': 'HTML summary report for vConTACT2'
        }]

        report_params = {'message': 'Basic message to show in the report',
                         'workspace_name': params['workspace_name'],
                         'html_links': html_report,
                         'direct_html_link_index': 0,
                         'report_object_name': 'vConTACT_report_{}'.format(str(uuid.uuid4())),
                         # Don't use until have files to attach to report
                         # 'file_links': [{}],
                         # Don't use until data objects that are created as result of running app
                         # 'objects_created': [{'ref': matrix_obj_ref,
                         #                      'description': 'Imported Matrix'}],
                         }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path, including parents

        Does nothing for an empty path or an already existing directory.

        :raises OSError: if the path exists but is not a directory, or the
            directory cannot be created
        """
        if not path:
            return
        os.makedirs(path, exist_ok=True)