def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass
def __init__(self, config):
    self.callback_url = config['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.shock_url = config['shock-url']
    self.dfu = DataFileUtil(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.setapi = SetAPI(self.callback_url)
    self.wss = workspaceService(config['workspace-url'])
def __init__(self, config, logger=None):
    self.config = config
    self.logger = logger
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = os.path.join(config['scratch'], 'EAS_' + str(uuid.uuid4()))
    self.ws_url = config['workspace-url']
    self.ws_client = Workspace(self.ws_url)
    self.dfu = DataFileUtil(self.callback_url)
    self.setAPI = SetAPI(self.callback_url)
    pass
def __init__(self, config, logger=None):
    self.config = config
    self.logger = logger
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = os.path.join(config['scratch'], 'DEM_' + str(uuid.uuid4()))
    self.ws_url = config['workspace-url']
    self.ws_client = Workspace(self.ws_url)
    self.fv = KBaseFeatureValues(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url)
    self.setAPI = SetAPI(self.callback_url)
    self.gsu = GenomeSearchUtil(self.callback_url)
    self._mkdir_p(self.scratch)
def test_list_object_with_sets(self): ws_name1 = self.createWs() reads_obj_ref = self.__class__.example_reads_ref set_obj_name = "MyReadsSet.1" sapi = SetAPI(self.__class__.serviceWizardURL, token=self.getContext()['token'], service_ver=self.__class__.SetAPI_version) sapi.save_reads_set_v1({ 'workspace': ws_name1, 'output_object_name': set_obj_name, 'data': { 'description': '', 'items': [{ 'ref': reads_obj_ref }] } }) list_ret = self.getImpl().list_objects_with_sets( self.getContext(), {"ws_name": ws_name1})[0] ret = list_ret['data'] self.assertTrue(len(ret) > 0) set_count = 0 for item in ret: self.assertTrue("object_info" in item) if "set_items" in item: set_count += 1 set_items = item["set_items"]["set_items_info"] self.assertEqual(1, len(set_items)) self.assertEqual(1, set_count) self.assertIn('data_palette_refs', list_ret) ws_id = self.getWsClient().get_workspace_info({"workspace": ws_name1})[0] ret2 = self.getImpl().list_objects_with_sets( self.getContext(), {"ws_id": ws_id})[0]["data"] self.assertEqual(len(ret), len(ret2)) type_filter = "KBaseSets.ReadsSet" ret3 = self.getImpl().list_objects_with_sets(self.getContext(), { "types": [type_filter], "workspaces": [str(ws_id)] })[0]["data"] self.assertTrue(len(ret3) > 0) for item in ret3: info = item['object_info'] obj_type = info[2].split('-')[0] self.assertEqual(type_filter, obj_type) type_filter = "KBaseGenomes.Genome" ret4 = self.getImpl().list_objects_with_sets(self.getContext(), { "types": [type_filter], "workspaces": [str(ws_id)] })[0]["data"] self.assertTrue(len(ret4) == 0)
def fetch_reads_refs_from_sampleset(ref, ws_url, srv_wiz_url):
    """
    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or
    SingleEndLibrary refs that are a member of that ReadsSet. This is returned as a list of
    dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object,
        "name": reads object name (needed for saving an AlignmentSet)
    }
    The only required key is "ref"; all other keys may or may not be present, based on the
    reads object or the object type of the initial ref variable. E.g. an RNASeqSampleSet
    might have condition info for each reads object, but a single PairedEndLibrary may not
    have that info.
    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = get_object_type(ref, ws_url)
    refs = list()
    if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_client = SetAPI(srv_wiz_url)
        reads_set = set_client.get_reads_set_v1({
            "ref": ref,
            "include_item_info": 0,
            "include_set_item_ref_paths": 1
        })
        print("Got results from ReadsSet object")
        pprint(reads_set)
        ref_list = [r["ref_path"] for r in reads_set["data"]["items"]]
        reads_names = get_object_names(ref_list, ws_url)
        for reads in reads_set["data"]["items"]:
            ref = reads["ref_path"]
            refs.append({
                "ref": ref,
                "condition": reads["label"],
                "name": reads_names[ref]
            })
    elif ("KBaseAssembly.SingleEndLibrary" in obj_type or
          "KBaseFile.SingleEndLibrary" in obj_type or
          "KBaseAssembly.PairedEndLibrary" in obj_type or
          "KBaseFile.PairedEndLibrary" in obj_type):
        refs.append({
            "ref": ref,
            "name": get_object_names([ref], ws_url)[ref]
        })
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))
    return refs
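# A minimal usage sketch for fetch_reads_refs_from_sampleset above. It assumes a live
# KBase environment with get_object_type/get_object_names importable from the same module;
# the service URLs and the "12345/6/7" ReadsSet reference are placeholders, not real objects.
ws_url = "https://kbase.us/services/ws"
srv_wiz_url = "https://kbase.us/services/service_wizard"
reads_refs = fetch_reads_refs_from_sampleset("12345/6/7", ws_url, srv_wiz_url)
for r in reads_refs:
    # "condition" and "name" may be absent for plain reads libraries
    print("{} ({})".format(r["ref"], r.get("condition", "no condition")))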
def load_reads_set(callback_url, ws_name, reads_set, target_name):
    """
    Combine a list of reads references into a ReadsSet.
    if file_rev is None or not a present key, then this is treated as a single end reads.
    """
    set_client = SetAPI(callback_url)
    set_output = set_client.save_reads_set_v1({
        "workspace": ws_name,
        "output_object_name": target_name,
        "data": {
            "description": "reads set for testing",
            "items": reads_set
        }
    })
    return set_output["set_ref"]
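# A hedged example of calling load_reads_set from a test; the callback URL, workspace
# name, and reads references are placeholders supplied by the test harness.
reads_set_items = [
    {"ref": "12345/2/1", "label": "wild_type"},
    {"ref": "12345/3/1", "label": "mutant"}
]
set_ref = load_reads_set("http://localhost:9999", "my_test_workspace",
                         reads_set_items, "MyReadsSet.1")
print("Saved ReadsSet: " + set_ref)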
def setUpClass(cls): token = environ.get('KB_AUTH_TOKEN', None) config_file = environ.get('KB_DEPLOYMENT_CONFIG', None) cls.cfg = {} config = ConfigParser() config.read(config_file) for nameval in config.items('AlignmentSetEditor'): cls.cfg[nameval[0]] = nameval[1] # Getting username from Auth profile for token authServiceUrl = cls.cfg['auth-service-url'] auth_client = _KBaseAuth(authServiceUrl) user_id = auth_client.get_user(token) # WARNING: don't call any logging methods on the context object, # it'll result in a NoneType error cls.ctx = MethodContext(None) cls.ctx.update({'token': token, 'user_id': user_id, 'provenance': [ {'service': 'AlignmentSetEditor', 'method': 'please_never_use_it_in_production', 'method_params': [] }], 'authenticated': 1}) cls.wsURL = cls.cfg['workspace-url'] cls.wsClient = workspaceService(cls.wsURL) cls.serviceImpl = AlignmentSetEditor(cls.cfg) cls.scratch = cls.cfg['scratch'] cls.callback_url = os.environ['SDK_CALLBACK_URL'] cls.setAPI = SetAPI(cls.callback_url) cls.gfu = GenomeFileUtil(cls.callback_url) cls.ru = ReadsUtils(cls.callback_url) cls.rau = ReadsAlignmentUtils(cls.callback_url) suffix = int(time.time() * 1000) cls.wsName = "test_AlignmentSetEditor_" + str(suffix) cls.wsClient.create_workspace({'workspace': cls.wsName})
def setUpClass(cls): token = environ.get('KB_AUTH_TOKEN', None) config_file = environ.get('KB_DEPLOYMENT_CONFIG', None) cls.cfg = {} config = ConfigParser() config.read(config_file) for nameval in config.items('kb_cufflinks'): cls.cfg[nameval[0]] = nameval[1] # Getting username from Auth profile for token authServiceUrl = cls.cfg['auth-service-url'] auth_client = _KBaseAuth(authServiceUrl) user_id = auth_client.get_user(token) # WARNING: don't call any logging methods on the context object, # it'll result in a NoneType error cls.ctx = MethodContext(None) cls.ctx.update({'token': token, 'user_id': user_id, 'provenance': [ {'service': 'kb_cufflinks', 'method': 'please_never_use_it_in_production', 'method_params': [] }], 'authenticated': 1}) cls.wsURL = cls.cfg['workspace-url'] cls.wsClient = Workspace(url=cls.wsURL, token=token) cls.serviceImpl = kb_cufflinks(cls.cfg) cls.scratch = cls.cfg['scratch'] cls.callback_url = environ.get('SDK_CALLBACK_URL') cls.srv_wiz_url = cls.cfg['srv-wiz-url'] # cls.wsName = 'cufflinks_test_' + user_id # reuse existing workspace suffix = int(time.time() * 1000) cls.wsName = "test_kb_cufflinks_" + str(suffix) print('workspace_name: ' + cls.wsName) try: # reuse existing (previously torn down) workspace cls.wsClient.undelete_workspace({'workspace': cls.wsName}) print('reusing old workspace...') except BaseException: try: # create if workspace does not exist cls.wsClient.create_workspace({'workspace': cls.wsName}) except BaseException: # get workspace if it exists and was not previously deleted (previously # not torn down) ws_info = cls.wsClient.get_workspace_info({'workspace': cls.wsName}) print("creating new workspace: " + str(ws_info)) cls.dfu = DataFileUtil(cls.callback_url) cls.gfu = GenomeFileUtil(cls.callback_url) cls.ru = ReadsUtils(cls.callback_url) cls.rau = ReadsAlignmentUtils(cls.callback_url) cls.set_api = SetAPI(cls.srv_wiz_url, service_ver='dev') cls.cufflinks_runner = CufflinksUtils(cls.cfg) cls.prepare_data()
def test_two_users_set_inside_dp(self): ws_name1_1 = self.createWs() # Injecting reads object (real copy) into workspace1 orig_reads_obj_ref = self.__class__.example_reads_ref reads_obj_name = "TestReads" self.getWsClient().copy_object({'from': {'ref': orig_reads_obj_ref}, 'to': {'workspace': ws_name1_1, 'name': reads_obj_name}}) copy_reads_obj_ref = ws_name1_1 + '/' + reads_obj_name ws_name1_2 = self.createWs() set_obj_name = "MyReadsSet.1" sapi = SetAPI(self.__class__.serviceWizardURL, token=self.getContext()['token'], service_ver=self.__class__.SetAPI_version) sapi.save_reads_set_v1({'workspace': ws_name1_2, 'output_object_name': set_obj_name, 'data': {'description': '', 'items': [{'ref': copy_reads_obj_ref}]}}) orig_set_ref = ws_name1_2 + '/' + set_obj_name # Making DP-copy of reads set object by user2 ws_name2 = self.createWs2() # Let's share workspace containing set with user2 self.getWsClient().set_permissions({'workspace': ws_name1_2, 'new_permission': 'r', 'users': [self.getContext2()['user_id']]}) # Import reads set ref into DataPalette of third workspace dps = DataPaletteService(self.__class__.serviceWizardURL, token=self.getContext2()['token'], service_ver=self.__class__.DataPalette_version) dps.add_to_palette({'workspace': ws_name2, 'new_refs': [{'ref': orig_set_ref}]}) dp_ref_map = dps.list_data({'workspaces': [ws_name2]})['data_palette_refs'] set_ref_path = dp_ref_map.itervalues().next() + ';' + orig_set_ref reads_ref_path = set_ref_path + ';' + copy_reads_obj_ref # Un-share original workspace self.getWsClient().set_permissions({'workspace': ws_name1_2, 'new_permission': 'n', 'users': [self.getContext2()['user_id']]}) # Let's check that we can list set and see reads object as set item ret = self.getImpl().list_objects_with_sets(self.getContext2(), {"ws_name": ws_name2})[0]["data"] self.assertEqual(1, len(ret)) item = ret[0] self.assertTrue('set_items' in item) self.assertTrue('set_items_info' in item['set_items']) self.assertEqual(1, len(item['set_items']['set_items_info'])) # Check access to reads and to set objects info = self.getWsClient2().get_object_info_new({'objects': [{'ref': set_ref_path}]})[0] self.assertEqual(set_obj_name, info[1]) info = self.getWsClient2().get_object_info_new({'objects': [{'ref': reads_ref_path}]})[0] self.assertEqual(reads_obj_name, info[1])
def test_unique_items(self): # Create original workspace with reads object + ReadsSet object ws_name1 = self.createWs() foft = FakeObjectsForTests(os.environ['SDK_CALLBACK_URL']) reads_obj_name = "test.reads.1" foft.create_fake_reads({ 'ws_name': ws_name1, 'obj_names': [reads_obj_name] }) reads_obj_ref = ws_name1 + '/' + reads_obj_name set_obj_name = "test.reads_set.1" sapi = SetAPI(self.__class__.serviceWizardURL, token=self.getContext()['token'], service_ver=self.__class__.SetAPI_version) sapi.save_reads_set_v1({ 'workspace': ws_name1, 'output_object_name': set_obj_name, 'data': { 'description': '', 'items': [{ 'ref': reads_obj_ref }] } }) set_obj_ref = ws_name1 + '/' + set_obj_name # Create workspace with DataPalette copy of Reads object and copy of ReadsSet ws_name2 = self.createWs() dps = DataPaletteService( self.__class__.serviceWizardURL, token=self.getContext()['token'], service_ver=self.__class__.DataPalette_version) dps.add_to_palette({ 'workspace': ws_name2, 'new_refs': [{ 'ref': reads_obj_ref }, { 'ref': set_obj_ref }] }) # Check if listing in both these workspaces at the same time gives unique items ret = self.getImpl().list_objects_with_sets( self.getContext(), {"workspaces": [ws_name1, ws_name2]})[0]["data"] self.assertEqual(2, len(ret))
def setUpClass(cls): cls.token = environ.get('KB_AUTH_TOKEN', None) config_file = environ.get('KB_DEPLOYMENT_CONFIG', None) cls.cfg = {} config = ConfigParser() config.read(config_file) for nameval in config.items('kb_ballgown'): cls.cfg[nameval[0]] = nameval[1] # Getting username from Auth profile for token authServiceUrl = cls.cfg['auth-service-url'] auth_client = _KBaseAuth(authServiceUrl) user_id = auth_client.get_user(cls.token) # WARNING: don't call any logging methods on the context object, # it'll result in a NoneType error cls.ctx = MethodContext(None) cls.ctx.update({ 'token': cls.token, 'user_id': user_id, 'provenance': [{ 'service': 'kb_ballgown', 'method': 'please_never_use_it_in_production', 'method_params': [] }], 'authenticated': 1 }) cls.hs = HandleService(url=cls.cfg['handle-service-url'], token=cls.token) cls.shockURL = cls.cfg['shock-url'] cls.wsURL = cls.cfg['workspace-url'] cls.wsClient = workspaceService(cls.wsURL, token=cls.token) cls.serviceImpl = kb_ballgown(cls.cfg) cls.scratch = cls.cfg['scratch'] cls.callback_url = os.environ['SDK_CALLBACK_URL'] cls.gfu = GenomeFileUtil(cls.callback_url) cls.dfu = DataFileUtil(cls.callback_url) cls.ru = ReadsUtils(cls.callback_url) cls.rau = ReadsAlignmentUtils(cls.callback_url, service_ver='dev') cls.eu = ExpressionUtils(cls.callback_url, service_ver='dev') cls.set_api = SetAPI(cls.callback_url) suffix = int(time.time() * 1000) cls.wsName = "test_kb_ballgown_" + str(suffix) #cls.wsName = "test_kb_ballgown_1004" cls.wsClient.create_workspace({'workspace': cls.wsName}) cls.nodes_to_delete = [] cls.handles_to_delete = [] cls.prepare_data()
def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name):
    """
    Compiles and saves a set of alignment references (+ other stuff) into a
    KBaseRNASeq.RNASeqAlignmentSet. Returns the reference to the new alignment set.

    alignment_items: [{
        "ref": alignment_ref,
        "label": condition label
    }]
    # alignment_info = dict like this:
    # {
    #     reads_ref: {"ref": alignment_ref}
    # }
    # reads_info = dict like this:
    # {
    #     reads_ref: {"condition": "some condition"}
    # }
    # input_params = global input params to HISAT2, also has ws_name for the target workspace.
    # alignmentset_name = name of final set object.
    """
    print("Uploading completed alignment set")
    alignment_set = {
        "description": "Alignments using HISAT2, v.{}".format(HISAT_VERSION),
        "items": alignment_items
    }
    set_api = SetAPI(self.srv_wiz_url)
    set_info = set_api.save_reads_alignment_set_v1({
        "workspace": ws_name,
        "output_object_name": alignmentset_name,
        "data": alignment_set
    })
    return set_info["set_ref"]
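# Sketch of calling upload_alignment_set on an instance (here a hypothetical hisat2_util
# object that already has srv_wiz_url configured); the refs and names are placeholders.
alignment_items = [
    {"ref": "12345/10/1", "label": "wild_type"},
    {"ref": "12345/11/1", "label": "mutant"}
]
set_ref = hisat2_util.upload_alignment_set(alignment_items,
                                           "my_alignment_set",
                                           "my_test_workspace")
print("Saved alignment set: " + set_ref)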
class GenDiffExprMatrix: INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') def __init__(self, config, logger=None): self.config = config self.logger = logger self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = os.path.join(config['scratch'], 'DEM_' + str(uuid.uuid4())) self.ws_url = config['workspace-url'] self.ws_client = Workspace(self.ws_url) self.fv = KBaseFeatureValues(self.callback_url) self.dfu = DataFileUtil(self.callback_url) self.setAPI = SetAPI(self.callback_url) self.gsu = GenomeSearchUtil(self.callback_url) self._mkdir_p(self.scratch) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def setup_data(self): self.new_col_names = [ 'gene_id', 'log2_fold_change', 'p_value', 'q_value' ] def get_feature_ids(self, genome_ref): """ _get_feature_ids: get feature ids from genome """ feature_num = self.gsu.search({'ref': genome_ref})['num_found'] genome_features = self.gsu.search({ 'ref': genome_ref, 'limit': feature_num, 'sort_by': [['feature_id', True]] })['features'] features_ids = map( lambda genome_feature: genome_feature.get('feature_id'), genome_features) return list(set(features_ids)) def gen_matrix(self, infile, old_col_names, delimiter): with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = self.new_col_names[1:] row_names = [] values = [] for row in rdr: try: values.append([float(row[v]) for v in old_col_names[1:]]) except: values_list = [] for v in old_col_names[1:]: tmpval = row[v] if isinstance(tmpval, (int, long, float)): values_list.append(float(tmpval)) elif isinstance(tmpval, basestring): if 'na' in tmpval.lower( ) or 'none' in tmpval.lower(): values_list.append(None) else: tmpval = tmpval.replace("'", "") tmpval = tmpval.replace('"', '') values_list.append(float(tmpval)) else: raise ValueError( "invalid type in input file: {}".format( tmpval)) values.append(values_list) row_names.append(row[old_col_names[0]]) twoD_matrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return twoD_matrix def get_max_fold_change_to_handle_inf(self, infile): maxvalue = 0 with open(infile) as source: rdr = csv.DictReader(source, dialect='excel-tab') for line in rdr: log2fc_val = line.get('log2_fold_change') if not 'inf' in str(log2fc_val): log2fc = abs(float(log2fc_val)) if log2fc > maxvalue: maxvalue = log2fc print 'maxvalue: ', maxvalue return maxvalue def gen_cuffdiff_matrix(self, infile, delimiter='\t'): max_value = self.get_max_fold_change_to_handle_inf(infile) with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = self.new_col_names[1:] row_names = [] values = [] for row in rdr: log2fc_val = row.get('log2_fold_change') # print 'FC_VAL: ', log2fc_val if '-inf' in str(log2fc_val): row['log2_fold_change'] = -float(max_value) elif 'inf' in str(log2fc_val): row['log2_fold_change'] = float(max_value) elif 'nan' in str(log2fc_val): row['log2_fold_change'] = None try: values.append( [float(row[v]) for v in self.new_col_names[1:]]) except: values.append( [None] + [float(row[v]) for v in self.new_col_names[2:]]) row_names.append(row[self.new_col_names[0]]) tmatrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return tmatrix def save_diff_expr_matrix(self, obj_name, data_matrix, condition1, condition2): dem_data = { 'genome_ref': self.params.get('genome_ref'), 'data': data_matrix, 
'condition_mapping': { condition1: condition2 }, 'type': 'log2_level', 'scale': '1.0' } res = self.dfu.save_objects({ 'id': self.params.get('ws_id'), "objects": [{ "type": "KBaseFeatureValues.DifferentialExpressionMatrix", "data": dem_data, "name": obj_name, "extra_provenance_input_refs": [self.params.get('genome_ref')] }] })[0] ret_ref = str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) return ret_ref def save_diff_expr_matrix_set(self, obj_name, matrix_set): res = self.setAPI.save_differential_expression_matrix_set_v1({ "workspace": self.params.get('ws_name'), "output_object_name": obj_name, "data": matrix_set }) return res.get('set_ref') # # ballgown always outputs a linear fold change, which we need to convert to log2 # before storing # def safely_apply_log2_to_fc(self, row): if row[0]: fc = row[0] if fc < 1.0e-10: fc = fc + 1.0e-10 # incase fc is zero return ([log2(fc)] + row[1:]) else: return (row) def process_ballgown_file(self, diffexpr_filepath): ballgown_col_names = ['id', 'fc', 'pval', 'qval'] data_matrix = self.gen_matrix(diffexpr_filepath, ballgown_col_names, delimiter='\t') log2_data_matrix = data_matrix log2_data_matrix['values'] = map(self.safely_apply_log2_to_fc, data_matrix.get('values')) dem_ref = self.save_diff_expr_matrix( self.params.get('obj_name') + '_0', log2_data_matrix, None, None) set_items = [{ 'label': 'global Differential Expression Data', 'ref': dem_ref }] matrix_set = { 'description': 'ballgown Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) def process_deseq_file(self, diffexpr_filepath): deseq_col_names = ['geneID', 'log2FoldChange', 'pvalue', 'padj'] data_matrix = self.gen_matrix(diffexpr_filepath, deseq_col_names, delimiter=',') dem_ref = self.save_diff_expr_matrix( self.params.get('obj_name') + '_0', data_matrix, None, None) set_items = [{ 'label': 'global Differential Expression Data', 'ref': dem_ref }] matrix_set = { 'description': 'deseq Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) def process_cuffdiff_file(self, diffexpr_filepath): cuffdiff_col_names = [ 'gene', 'log2(fold_change)', 'p_value', 'q_value' ] ConditionPair = namedtuple("ConditionPair", ["condition1", "condition2"]) FileInfo = namedtuple('FileInfo', ['file_path', 'file_obj']) condPair_fileInfo = {} timestamp = str( int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)) with open(diffexpr_filepath, 'rb') as source: rdr = csv.DictReader(source, dialect='excel-tab') """ save the files opened for writing in outfiles list, so they can be closed later """ outfiles = list() for r in rdr: c1 = r['sample_1'] c2 = r['sample_2'] cond_pair = ConditionPair(condition1=c1, condition2=c2) tsv_file_info = condPair_fileInfo.get(cond_pair, None) if tsv_file_info is None: tsv_file_name = timestamp + '_' + c1 + '~~' + c2 tsv_file_path = os.path.join(self.scratch, tsv_file_name) outfile = open(tsv_file_path, 'wb') outfiles.append(outfile) csv_wtr = csv.DictWriter(outfile, delimiter='\t', fieldnames=self.new_col_names) csv_wtr.writerow( dict((cn, cn) for cn in self.new_col_names)) tsv_file_info = FileInfo(file_path=tsv_file_path, file_obj=csv_wtr) condPair_fileInfo[cond_pair] = tsv_file_info wtr = tsv_file_info.file_obj col_vals = [r[v] for v in cuffdiff_col_names] wtr.writerow(dict(zip(self.new_col_names, col_vals))) for ofile in outfiles: ofile.close() set_items = list() for cond_pair, file_info in condPair_fileInfo.iteritems(): 
print 'Cond_pair: ', cond_pair print 'File: ', file_info.file_path tsv_file = file_info.file_path data_matrix = self.gen_cuffdiff_matrix(tsv_file) object_name = self.get_obj_name(self.params['obj_name'], cond_pair.condition1, cond_pair.condition2) dem_ref = self.save_diff_expr_matrix(object_name, data_matrix, cond_pair.condition1, cond_pair.condition2) print('process_cuffdiff_file: DEM_REF: ' + dem_ref) set_items.append({ 'label': cond_pair.condition1 + ', ' + cond_pair.condition2, 'ref': dem_ref }) matrix_set = { 'description': 'cuffdiff Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) """ Functions for save_differentialExpressionMatrixSet """ def save_matrix(self, genome_ref, infile, in_col_names, delimiter): feature_ids = self.get_feature_ids(genome_ref) with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = in_col_names[1:] row_names = [] values = [] for row in rdr: if row[in_col_names[0]] in feature_ids: row_names.append(row[in_col_names[0]]) else: gene_ids = row[in_col_names[0]].strip().split(',') match = True mismatched_gene_ids = list() for gene_id in gene_ids: gene_id = gene_id.strip() if gene_id not in feature_ids: mismatched_gene_ids.append(gene_id) match = False if match: row_names.append(row[in_col_names[0]]) else: error_msg = 'Gene_id(s) "{}" is not a known feature in "{}"'.format( ', '.join(mismatched_gene_ids), self.params.get('genome_ref')) raise ValueError(error_msg) try: values.append([float(row[v]) for v in in_col_names[1:]]) except: values_list = [] for v in in_col_names[1:]: tmpval = row[v] if isinstance(tmpval, (int, long, float)): values_list.append(float(tmpval)) elif isinstance(tmpval, basestring): if 'na' in tmpval.lower( ) or 'none' in tmpval.lower(): values_list.append(None) else: tmpval = tmpval.replace("'", "") tmpval = tmpval.replace('"', '') values_list.append(float(tmpval)) else: raise ValueError( "invalid type in input file: {}".format( tmpval)) values.append(values_list) twoD_matrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return twoD_matrix @staticmethod def get_obj_name(obj_name, condition1, condition2): def sanitize(ws_name): # I'm not using translate because it's a mess with mixed unicode & strings return ws_name.replace("\t", " ").replace(" ", "_").replace("/", "|") return "{}-{}-{}".format(obj_name, sanitize(condition1), sanitize(condition2)) def gen_diffexpr_matrices(self, params): print('In GEN DEMs') self.params = params self.setup_data() diffexpr_filepath = self.params.get('diffexpr_filepath') if 'deseq' in self.params.get('tool_used').lower(): dem_ref = self.process_deseq_file(diffexpr_filepath) elif 'ballgown' in self.params.get('tool_used').lower(): dem_ref = self.process_ballgown_file(diffexpr_filepath) elif 'cuffdiff' in self.params.get('tool_used').lower(): dem_ref = self.process_cuffdiff_file(diffexpr_filepath) else: raise ValueError('"{}" is not a valid tool_used parameter'.format( self.params.get('tool_used'))) return dem_ref def save_diffexpr_matrices(self, params): print('In SAVE DEMs') self.params = params self.setup_data() set_items = list() for deFile in self.params.get('diffexpr_data'): condition_mapping = deFile.get('condition_mapping') diffexpr_filepath = deFile.get('diffexpr_filepath') if deFile.get('delimter', None) is not None: delimiter = deFile.get('delimter') else: delimiter = '\t' fileext = os.path.splitext(diffexpr_filepath)[1] if 'csv' in fileext.lower(): delimiter = ',' elif 
'tsv' in fileext.lower(): delimiter = '\t' else: print('Using tab delimiter') data_matrix = self.save_matrix(self.params.get('genome_ref'), diffexpr_filepath, self.new_col_names, delimiter) condition1, condition2 = condition_mapping.items()[0] object_name = self.get_obj_name(self.params['obj_name'], condition1, condition2) dem_ref = self.save_diff_expr_matrix(object_name, data_matrix, condition1, condition2) set_items.append({ 'label': condition1 + ', ' + condition2, 'ref': dem_ref }) matrix_set = { 'description': self.params.get('tool_used') + ' Differential Expression Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set)
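# Illustrative driver for GenDiffExprMatrix.gen_diffexpr_matrices above; the config dict,
# workspace identifiers, genome reference, and file path are placeholders, and the keys
# mirror what the class methods read from self.params.
gen_dem = GenDiffExprMatrix(config)
dem_set_ref = gen_dem.gen_diffexpr_matrices({
    'diffexpr_filepath': '/kb/module/work/tmp/gene_exp.diff',  # cuffdiff output (placeholder path)
    'tool_used': 'cuffdiff',
    'genome_ref': '12345/5/1',
    'ws_name': 'my_test_workspace',
    'ws_id': 67890,
    'obj_name': 'my_differential_expression_matrix_set'
})
print('Saved DifferentialExpressionMatrixSet: ' + dem_set_ref)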
def save_read_set(self, ctx, params): """ :param params: instance of type "save_read_set_params" (** ** Method for adding Reads objects to a Reads Set) -> structure: parameter "workspace_name" of String, parameter "output_readset_name" of String, parameter "input_reads_list" of list of String, parameter "desc" of String :returns: instance of type "save_read_set_output" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN save_read_set console = [] invalid_msgs = [] #self.log(console,'Running save_read_set with params=') #self.log(console, "\n"+pformat(params)) report = '' # report = 'Running KButil_Add_Genomes_to_GenomeSet with params=' # report += "\n"+pformat(params) #### do some basic checks # if 'workspace_name' not in params: raise ValueError('workspace_name parameter is required') if 'desc' not in params: raise ValueError('desc parameter is required') if 'input_reads_list' not in params: raise ValueError('input_reads_list parameter is required') #if 'input_readsset_name' not in params: # raise ValueError('input_readsset_name parameter is optional') if 'output_readset_name' not in params: raise ValueError('output_readset_name parameter is required') # Build ReadsSet # elements = dict() savereadssetparams = {} savereadssetparams['workspace_name'] = params['workspace_name'] savereadssetparams['output_object_name'] = params[ 'output_readset_name'] readsetdata = {} if (params['desc'] is not None): readsetdata['description'] = params['desc'] readsetdata['items'] = [] print "WS " + params['workspace_name'] print "READS " + str(params['input_reads_list']) # add new reads for reads_name in params['input_reads_list']: readssetitem = {} readssetitem['ref'] = params['workspace_name'] + '/' + reads_name readssetitem['label'] = '' readsetdata['items'].append(readssetitem) savereadssetparams['data'] = readsetdata # load the method provenance from the context object # #self.log(console,"Setting Provenance") # DEBUG provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference try: prov_defined = provenance[0]['input_ws_objects'] except: provenance[0]['input_ws_objects'] = [] for reads_name in params['input_reads_list']: provenance[0]['input_ws_objects'].append(params['workspace_name'] + '/' + reads_name) provenance[0]['service'] = 'ReadssetEditor' provenance[0]['method'] = 'save_read_set' # Save output object # #if len(invalid_msgs) == 0: # self.log(console,"Saving ReadssSet") set_api = SetAPI(url=self.servicewizardURL, token=ctx['token']) #set_api._service_ver = "dev" set_api.save_reads_set_v1(savereadssetparams) # build output report object # #self.log(console,"BUILDING REPORT") # DEBUG if len(invalid_msgs) == 0: #self.log(console,"reads in output set "+params['output_readset_name']+": "+str(len(elements.keys()))) report += 'reads in output set ' + params[ 'output_readset_name'] + ': ' + str(len( elements.keys())) + "\n" reportObj = { 'objects_created': [{ 'ref': params['workspace_name'] + '/' + params['output_readset_name'], 'description': 'save_read_set' }], 'text_message': report } else: report += "FAILURE:\n\n" + "\n".join(invalid_msgs) + "\n" reportObj = {'objects_created': [], 'text_message': report} reportName = 'save_read_set_report_' + str(hex(uuid.getnode())) ws = workspaceService(self.workspaceURL, token=ctx['token']) report_obj_info = ws.save_objects({ 'workspace': 
params['workspace_name'], 'objects': [{ 'type': 'KBaseReport.Report', 'data': reportObj, 'name': reportName, 'meta': {}, 'hidden': 1, 'provenance': provenance }] })[0] # Build report and return # #self.log(console,"BUILDING RETURN OBJECT") returnVal = { 'report_name': reportName, 'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4]), } #self.log(console,"save_read_set DONE") #END save_read_set # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method save_read_set return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
def fetch_reads_refs_from_sampleset(ref, ws_url, callback_url, params): """ From the given object ref, return a list of all reads objects that are a part of that object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows: { "ref": reads object reference, "condition": condition string associated with that reads object } The only one required is "ref", all other keys may or may not be present, based on the reads object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info for each reads object, but a single PairedEndLibrary may not have that info. If ref is already a Reads library, just returns a list with ref as a single element. """ obj_type = get_object_type(ref, ws_url) ws = Workspace(ws_url) refs = list() refs_for_ws_info = list() if "KBaseSets.ReadsSet" in obj_type: print("Looking up reads references in ReadsSet object") set_client = SetAPI(callback_url) reads_set = set_client.get_reads_set_v1({ "ref": ref, "include_item_info": 0 }) for reads in reads_set["data"]["items"]: refs.append({"ref": reads["ref"], "condition": reads["label"]}) refs_for_ws_info.append({'ref': reads['ref']}) elif "KBaseRNASeq.RNASeqSampleSet" in obj_type: print("Looking up reads references in RNASeqSampleSet object") sample_set = ws.get_objects2({"objects": [{ "ref": ref }]})["data"][0]["data"] for i in range(len(sample_set["sample_ids"])): refs.append({ "ref": sample_set["sample_ids"][i], "condition": sample_set["condition"][i] }) refs_for_ws_info.append({'ref': sample_set['sample_ids'][i]}) elif ("KBaseAssembly.SingleEndLibrary" in obj_type or "KBaseFile.SingleEndLibrary" in obj_type or "KBaseFile.SingleEndLibrary-2.0" in obj_type or "KBaseFile.SingleEndLibrary-2.1" in obj_type or "KBaseAssembly.PairedEndLibrary" in obj_type or "KBaseFile.PairedEndLibrary" in obj_type or "KBaseFile.PairedEndLibrary-2.0" in obj_type or "KBaseFile.PairedEndLibrary-2.1" in obj_type): refs.append({"ref": ref}) refs_for_ws_info.append({'ref': ref}) else: raise ValueError("Unable to fetch reads reference from object {} " "which is a {}".format(ref, obj_type)) # get object info so we can name things properly infos = ws.get_object_info3({'objects': refs_for_ws_info})['infos'] name_ext = '_alignment' if ('alignment_suffix' in params and params['alignment_suffix'] is not None): ext = params['alignment_suffix'].replace(' ', '') if ext: name_ext = ext unique_names = get_unique_names(infos) for k in range(0, len(refs)): refs[k]['info'] = infos[k] name = unique_names[k] + name_ext refs[k]['alignment_output_name'] = name return refs
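# Hypothetical call to the HISAT2-flavoured variant above; the URLs, the object
# reference, and the alignment suffix are placeholders.
reads_refs = fetch_reads_refs_from_sampleset(
    "12345/6/7",                        # ReadsSet or RNASeqSampleSet ref (placeholder)
    "https://kbase.us/services/ws",     # workspace URL (placeholder)
    "http://localhost:9999",            # SDK callback URL (placeholder)
    {"alignment_suffix": "_hisat2_alignment"})
for r in reads_refs:
    print("{} -> {}".format(r["ref"], r["alignment_output_name"]))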
class MetagenomeFileUtils: def _validate_merge_bins_from_binned_contig_params(self, params): """ _validate_merge_bins_from_binned_contig_params: validates params passed to merge_bins_from_binned_contig method """ log('Start validating merge_bins_from_binned_contig params') # check for required parameters for p in [ 'old_binned_contig_ref', 'bin_merges', 'output_binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) bin_merges = params.get('bin_merges') if not isinstance(bin_merges, list): error_msg = 'expecting a list for bin_merges param, ' error_msg += 'but getting a [{}]'.format(type(bin_merges)) raise ValueError(error_msg) for bin_merge in bin_merges: for p in ['new_bin_id', 'bin_to_merge']: if p not in bin_merge: raise ValueError( '"{}" key is required in bin_merges, but missing'. format(p)) bin_to_merge = bin_merge.get('bin_to_merge') if not isinstance(bin_to_merge, list): error_msg = 'expecting a list for bin_to_merge, ' error_msg += 'but getting a [{}]'.format(type(bin_to_merge)) raise ValueError(error_msg) def _validate_remove_bins_from_binned_contig_params(self, params): """ _validate_remove_bins_from_binned_contig_params: validates params passed to remove_bins_from_binned_contig method """ log('Start validating remove_bins_from_binned_contig params') # check for required parameters for p in [ 'old_binned_contig_ref', 'bins_to_remove', 'output_binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) bins_to_remove = params.get('bins_to_remove') if not isinstance(bins_to_remove, list): error_msg = 'expecting a list for bins_to_remove param, ' error_msg += 'but getting a [{}]'.format(type(bins_to_remove)) raise ValueError(error_msg) def _validate_file_to_binned_contigs_params(self, params): """ _validate_file_to_binned_contigs_params: validates params passed to file_to_binned_contigs method """ log('Start validating file_to_binned_contigs params') # check for required parameters for p in [ 'assembly_ref', 'file_directory', 'binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_binned_contigs_to_file_params(self, params): """ _validate_binned_contigs_to_file_params: validates params passed to binned_contigs_to_file method """ log('Start validating binned_contigs_to_file params') # check for required parameters for p in ['input_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_extract_binned_contigs_as_assembly_params(self, params): """ _validate_extract_binned_contigs_as_assembly_params: validates params passed to extract_binned_contigs_as_assembly method """ log('Start validating extract_binned_contigs_as_assembly params') # check for required parameters for p in [ 'binned_contig_obj_ref', 'extracted_assemblies', 'assembly_suffix', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # convert comma-separated list of bins into a list of individual ids (the python # comprehension construction deals with the fact that split(',') returns a list of # length one, [''], for an empty string input extracted_assemblies = [ x for x in params.get('extracted_assemblies').split(',') if x ] # parameter assembly_set_name is required if extracted_assemblies list has more # than one element if len(extracted_assemblies) > 1 and 
'assembly_set_name' not in params: raise ValueError( '"assembly_set_names" parameter is required for more than one extracted assembly' ) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _get_bin_ids(self, file_directory): """ _get_bin_ids: getting bin contig ids from files NOTE: This method is very specific to MaxBin2 app result. Bin contig files generated by MaxBin2 follow 'header.0xx.fasta' name pattern """ bin_ids = [] result_files = os.listdir(file_directory) for file in result_files: if re.match(r'.*\.\d{3}\.fasta', file): bin_ids.append(file) log('generated bin ids:\n{}'.format('\n'.join(bin_ids))) return bin_ids def _process_summary_file(self, bin_id, lines): """ _process_summary_file: process header.summary file content getting GC content (gc), Genome size (sum_contig_len) and Completeness (cov) from header.summary file NOTE: This method is very specific to MaxBin2 app result. header.summary file could be one of below fomat: Bin name Abundance Completeness Genome size GC content maxbin_output.001.fasta 0.00 97.2% 2690533 52.9 Bin name Completeness Genome size GC content maxbin_output.001.fasta 97.2% 2690533 52.9 """ for line in lines: line_list = line.split('\t') if line_list[0] == bin_id: if len(line_list) == 5: gc = round(float(line_list[4]) / 100, 5) sum_contig_len = int(line_list[3]) cov = round(float(line_list[2].partition('%')[0]) / 100, 5) elif len(line_list) == 4: gc = round(float(line_list[3]) / 100, 5) sum_contig_len = int(line_list[2]) cov = round(float(line_list[1].partition('%')[0]) / 100, 5) return gc, sum_contig_len, cov def _get_total_contig_len(self, file_directory): """ _get_total_contig_len: process header.summary file content getting total contig length from header.summary file NOTE: This method is very specific to MaxBin2 app result. """ log('generating total contig length') total_contig_len = 0 file_list = os.listdir(file_directory) for file in file_list: if file.endswith('.summary'): with open(os.path.join(file_directory, file), 'r') as summary_file: lines = summary_file.readlines() for line in lines[1:]: line_list = line.split('\t') if len(line_list) == 5: total_contig_len += int(line_list[3]) elif len(line_list) == 4: total_contig_len += int(line_list[2]) log('generated total contig length: {}'.format(total_contig_len)) return total_contig_len def _generate_contig_bin_summary(self, bin_id, file_directory): """ _generate_contig_bin_summary: getting ContigBin summary from header.summary file NOTE: This method is very specific to MaxBin2 app result. 
""" log('generating summary for bin_id: {}'.format(bin_id)) file_list = os.listdir(file_directory) for file in file_list: if file.endswith('.summary'): with open(os.path.join(file_directory, file), 'r') as summary_file: lines = summary_file.readlines() gc, sum_contig_len, cov = self._process_summary_file( bin_id, lines) log('generated GC content: {}, Genome size: {} '.format( gc, sum_contig_len)) log('and Completeness: {} for bin_id: {}'.format(cov, bin_id)) return gc, sum_contig_len, cov def _generate_contigs(self, file_name, file_directory, assembly_ref): """ _generate_contigs: generate contigs from assembly object file_name: file name of fasta file file_directory: fasta file directory assembly_ref: associated assembly object reference """ log('start generating contig objects for file: {}'.format(file_name)) assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] assembly_contigs = assembly.get('data').get('contigs') contigs = {} for record in SeqIO.parse(os.path.join(file_directory, file_name), "fasta"): contig_id = record.id contig = assembly_contigs.get(contig_id) if contig: # using assembly object data contig_gc = contig.get('gc_content') sequence_length = contig.get('length') else: log('cannot find contig [{}] from assembly.'.format(contig_id)) log('computing contig info') sequence = str(record.seq).upper() sequence_length = len(sequence) contig_gc_len = 0 contig_gc_len += sequence.count('G') contig_gc_len += sequence.count('C') contig_gc = round( float(contig_gc_len) / float(sequence_length), 5) contig = {'gc': contig_gc, 'len': sequence_length} contigs[contig_id] = contig log('complete generating contig objects for file: {}'.format( file_name)) return contigs def _generate_contig_bin(self, bin_id, file_directory, assembly_ref): """ _generate_contig_bin: gerneate ContigBin structure """ log('start generating BinnedContig info for bin: {}'.format(bin_id)) # generate ContigBin summery info gc, sum_contig_len, cov = self._generate_contig_bin_summary( bin_id, file_directory) # generate Contig info contigs = self._generate_contigs(bin_id, file_directory, assembly_ref) contig_bin = { 'bid': bin_id, 'contigs': contigs, 'n_contigs': len(contigs), 'gc': gc, 'sum_contig_len': sum_contig_len, 'cov': cov } log('complete generating BinnedContig info for bin: {}'.format(bin_id)) return contig_bin def _get_contig_file(self, assembly_ref): """ _get_contig_file: get contif file from GenomeAssembly object """ log('retrieving contig file from assembly: {}'.format(assembly_ref)) contig_file = self.au.get_assembly_as_fasta({ 'ref': assembly_ref }).get('path') sys.stdout.flush() contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path'] log('saved contig file to: {}'.format(contig_file)) return contig_file def _get_contig_string(self, contig_id, assembly_contig_file, parsed_assembly): """ _get_contig_string: find and return contig string from assembly contig file """ # parsed_assembly = SeqIO.to_dict(SeqIO.parse(assembly_contig_file, "fasta")) contig_record = parsed_assembly.get(contig_id) if contig_record: string_contig = '' string_contig += '>{}\n'.format(contig_id) string_contig += str(contig_record.seq).upper() string_contig += '\n' else: error_msg = 'Cannot find contig [{}] from file [{}].'.format( contig_id, assembly_contig_file) raise ValueError(error_msg) return string_contig def _pack_file_to_shock(self, result_files): """ _pack_file_to_shock: pack files in result_files list and save in shock """ log('start packing and uploading files:\n{}'.format( 
'\n'.join(result_files))) output_directory = os.path.join( self.scratch, 'packed_binned_contig_' + str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join( output_directory, 'packed_binned_contig_' + str(uuid.uuid4()) + '.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for file in result_files: zip_file.write(file, os.path.basename(file)) shock_id = self.dfu.file_to_shock({ 'file_path': result_file }).get('shock_id') log('saved file to shock: {}'.format(shock_id)) return shock_id def _generate_report(self, report_message, params): """ generate_report: generate summary report """ log('Generating report') uuid_string = str(uuid.uuid4()) upload_message = 'Job Finished\n\n' upload_message += report_message log('Report message:\n{}'.format(upload_message)) report_params = { 'message': upload_message, 'workspace_name': params.get('workspace_name'), 'report_object_name': 'MetagenomeUtils_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _generate_report_message(self, new_binned_contig_ref): """ _generate_report_message: generate a report message for BinnedContig object """ report_message = '' binned_contig = self.dfu.get_objects( {'object_refs': [new_binned_contig_ref]})['data'][0] binned_contig_info = binned_contig.get('info') binned_contig_name = binned_contig_info[1] report_message += 'Generated BinnedContig: {} [{}]\n'.format( binned_contig_name, new_binned_contig_ref) binned_contig_count = 0 total_bins = binned_contig.get('data').get('bins') total_bins_count = len(total_bins) bin_ids = [] for bin in total_bins: binned_contig_count += len(bin.get('contigs')) bin_ids.append(bin.get('bid')) report_message += '--------------------------\nSummary:\n\n' report_message += 'Binned contigs: {}\n'.format(binned_contig_count) report_message += 'Total size of bins: {}\n'.format(total_bins_count) report_message += 'Bin IDs:\n{}\n'.format('\n'.join(bin_ids)) return report_message def _merge_bins(self, new_bin_id, bin_objects_to_merge): """ _merge_bins: merge a list of bins into new_bin_id """ total_contigs = {} total_gc_count = 0 total_sum_contig_len = 0 total_cov_len = 0 for bin in bin_objects_to_merge: total_contigs.update(bin.get('contigs')) sum_contig_len = bin.get('sum_contig_len') total_sum_contig_len += sum_contig_len total_gc_count += sum_contig_len * bin.get('gc') total_cov_len += sum_contig_len * bin.get('cov') contig_bin = { 'bid': new_bin_id, 'contigs': total_contigs, 'n_contigs': len(total_contigs), 'gc': round(float(total_gc_count) / total_sum_contig_len, 5), 'sum_contig_len': total_sum_contig_len, 'cov': round(float(total_cov_len) / total_sum_contig_len, 5) } return contig_bin def _save_binned_contig(self, binned_contigs, workspace_name, binned_contig_name): """ _build_binned_contig: save BinnedContig object """ workspace_name = workspace_name if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) object_type = 'KBaseMetagenomes.BinnedContigs' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': binned_contigs, 'name': binned_contig_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] new_binned_contig_ref = str(dfu_oi[6]) + '/' + str( dfu_oi[0]) + '/' + str(dfu_oi[4]) return 
new_binned_contig_ref def _check_bin_merges(self, bin_merges): """ _check_bin_merges: checking bin_merges """ bin_id_list = map(lambda item: item.get('bin_to_merge'), bin_merges) bin_ids = [] map(lambda item: map(lambda bin_id: bin_ids.append(bin_id), item), bin_id_list) for bin_id in bin_id_list: if len(bin_id) <= 1: raise ValueError( "Please provide at least two bin_ids to merge") for id in bin_id: if bin_ids.count(id) > 1: raise ValueError( "Same bin [{}] appears in muliple merges".format(id)) new_bin_id_list = map(lambda item: item.get('new_bin_id'), bin_merges) for new_bin_id in new_bin_id_list: if new_bin_id_list.count(new_bin_id) > 1: raise ValueError( "Same new Bin ID [{}] appears in muliple merges".format( id)) def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.setapi = SetAPI(self.callback_url) self.wss = workspaceService(config['workspace-url']) def file_to_binned_contigs(self, params): """ file_to_binned_contigs: Generating BinnedContigs ojbect from files input params: file_directory: file directory containing compressed/unpacked contig file(s) to build BinnedContig object assembly_ref: metagenome assembly object reference binned_contig_name: BinnedContig object name workspace_name: the name/id of the workspace it gets saved to return params: binned_contig_obj_ref: generated result BinnedContig object reference """ log('--->\nrunning MetagenomeFileUtils.file_to_binned_contigs\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_file_to_binned_contigs_params(params) file_directory = params.get('file_directory') assembly_ref = params.get('assembly_ref') log('starting generating BinnedContig object') bin_ids = self._get_bin_ids(file_directory) bins = [] for bin_id in bin_ids: contig_bin = self._generate_contig_bin(bin_id, file_directory, assembly_ref) bins.append(contig_bin) log('finished generating BinnedContig object') total_contig_len = self._get_total_contig_len(file_directory) binned_contigs = { 'assembly_ref': assembly_ref, 'bins': bins, 'total_contig_len': total_contig_len } binned_contig_obj_ref = self._save_binned_contig( binned_contigs, params.get('workspace_name'), params.get('binned_contig_name')) returnVal = {'binned_contig_obj_ref': binned_contig_obj_ref} log('successfully saved BinnedContig object') return returnVal def binned_contigs_to_file(self, params): """ binned_contigs_to_file: Convert BinnedContig object to fasta files and pack them to shock input params: input_ref: BinnedContig object reference optional params: save_to_shock: saving result bin files to shock. 
default to True bin_id_list: only extract bin_id_list return params: shock_id: saved packed file shock id bin_file_directory: directory that contains all bin files """ log('--->\nrunning MetagenomeFileUtils.binned_contigs_to_file\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_binned_contigs_to_file_params(params) binned_contig_object = self.dfu.get_objects( {'object_refs': [params.get('input_ref')]})['data'][0] assembly_ref = binned_contig_object.get('data').get('assembly_ref') assembly_contig_file = self._get_contig_file(assembly_ref) log('parsing assembly file [{}] to dictionary'.format( assembly_contig_file)) parsed_assembly = SeqIO.to_dict( SeqIO.parse(assembly_contig_file, "fasta")) bins = binned_contig_object.get('data').get('bins') result_directory = os.path.join( self.scratch, 'binned_contig_files_' + str(uuid.uuid4())) self._mkdir_p(result_directory) result_files = [] bin_id_list = params.get('bin_id_list') for bin in bins: bin_id = bin.get('bid') if bin_id_list: if bin_id in bin_id_list: log('processing bin: {}'.format(bin_id)) with open(os.path.join(result_directory, bin_id), 'w') as file: contigs = bin.get('contigs') for contig_id in contigs.keys(): contig_string = self._get_contig_string( contig_id, assembly_contig_file, parsed_assembly) file.write(contig_string) result_files.append(os.path.join(result_directory, bin_id)) log('saved contig file to: {}'.format(result_files[-1])) else: log('processing bin: {}'.format(bin_id)) with open(os.path.join(result_directory, bin_id), 'w') as file: contigs = bin.get('contigs') for contig_id in contigs.keys(): contig_string = self._get_contig_string( contig_id, assembly_contig_file, parsed_assembly) file.write(contig_string) result_files.append(os.path.join(result_directory, bin_id)) log('saved contig file to: {}'.format(result_files[-1])) if params.get('save_to_shock') or params.get('save_to_shock') is None: shock_id = self._pack_file_to_shock(result_files) else: shock_id = None returnVal = { 'shock_id': shock_id, 'bin_file_directory': result_directory } return returnVal def _get_object_name_from_ref(self, obj_ref): """given the object reference, return the object_name as a string""" return (self.wss.get_object_info_new({"objects": [{ 'ref': obj_ref }]})[0][1]) def extract_binned_contigs_as_assembly(self, params): """ extract_binned_contigs_as_assembly: extract one/multiple Bins from BinnedContigs as Assembly input params: binned_contig_obj_ref: BinnedContig object reference extracted_assemblies: a string, a comma-separated list of bin_ids to be extracted workspace_name: the name of the workspace it gets saved to return params: assembly_ref_list: a list of generated result Assembly object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning MetagenomeFileUtils.extract_binned_contigs_as_assembly\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_extract_binned_contigs_as_assembly_params(params) # convert comma-separated list of bins into a list of individual ids (the python # comprehension construction deals with the fact that split(',') returns a list of # length one, [''], for an empty string input extracted_assemblies = [ x for x in params.get('extracted_assemblies').split(',') if x ] binned_contig_obj_ref = params.get('binned_contig_obj_ref') contigs_to_file_ret = self.binned_contigs_to_file({ 'input_ref': binned_contig_obj_ref, 'save_to_shock': False, 'bin_id_list': extracted_assemblies }) 
        bin_file_directory = contigs_to_file_ret.get('bin_file_directory')

        # bin_files will be either a list of the bin contig files corresponding to the
        # target bin ids, or a list of all bin contig files if extracted_assemblies is empty
        bin_files = os.listdir(bin_file_directory)

        # if extracted_assemblies is an empty list, create a full one here
        if not extracted_assemblies:
            extracted_assemblies = bin_files
            log("extracted_assemblies was empty, is now " + pformat(extracted_assemblies))

        generated_assembly_ref_list = []
        assembly_suffix = params.get('assembly_suffix').strip()
        bin_file_basenames = [os.path.basename(bin_file) for bin_file in bin_files]
        for bin_id in extracted_assemblies:
            if bin_id not in bin_file_basenames:
                error_msg = 'bin_id [{}] cannot be found in BinnedContig '.format(bin_id)
                error_msg += '[{}]'.format(binned_contig_obj_ref)
                raise ValueError(error_msg)
            else:
                output_assembly_name = bin_id + assembly_suffix
                log('saving assembly: {}'.format(output_assembly_name))
                for bin_file in bin_files:
                    if os.path.basename(bin_file) == bin_id:
                        log('starting generating assembly from {}'.format(bin_id))
                        assembly_params = {
                            'file': {'path': os.path.join(bin_file_directory, bin_file)},
                            'workspace_name': params.get('workspace_name'),
                            'assembly_name': output_assembly_name
                        }
                        assembly_ref = self.au.save_assembly_from_fasta(assembly_params)
                        log('finished generating assembly from {}'.format(bin_id))
                        generated_assembly_ref_list.append(assembly_ref)

        setref = None
        if len(generated_assembly_ref_list) > 1:
            binned_contig_object_name = self._get_object_name_from_ref(binned_contig_obj_ref)
            assembly_set_name = params.get('assembly_set_name')
            log("saving assembly set {0}".format(assembly_set_name))
            setref = self.setapi.save_assembly_set_v1({
                'workspace': params.get('workspace_name'),
                'output_object_name': assembly_set_name,
                'data': {
                    'description': 'binned assemblies from {0}'.format(binned_contig_object_name),
                    'items': [{'ref': r} for r in generated_assembly_ref_list]
                }
            })
            log("save assembly set_ref is {0}".format(setref.get('set_ref')))

        report_message = 'Generated Assembly Reference: {}'.format(
            ', '.join(generated_assembly_ref_list))

        reportVal = self._generate_report(report_message, params)

        returnVal = {'assembly_ref_list': generated_assembly_ref_list}
        returnVal.update(reportVal)
        if setref:
            returnVal.update({'assembly_set_ref': setref})

        return returnVal

    def remove_bins_from_binned_contig(self, params):
        """
        remove_bins_from_binned_contig: remove a list of bins from a BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace the new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """
        log('--->\nrunning MetagenomeFileUtils.remove_bins_from_binned_contig\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_remove_bins_from_binned_contig_params(params)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(binned_contig_object.get('data').get('total_contig_len'))

        old_bins = binned_contig_object.get('data').get('bins')
        bins_to_remove = params.get('bins_to_remove')

        for bin in list(old_bins):
            bin_id = bin.get('bid')
            if bin_id in bins_to_remove:
                log('removing bin_id: {}'.format(bin_id))
                old_bins.remove(bin)
                total_contig_len -= int(bin.get('sum_contig_len'))
                log('removed bin_id: {} from BinnedContig object'.format(bin_id))

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': old_bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal

    def merge_bins_from_binned_contig(self, params):
        """
        merge_bins_from_binned_contig: merge a list of bins from a BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bin_merges: a list of bin merge dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace the new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """
        log('--->\nrunning MetagenomeFileUtils.merge_bins_from_binned_contig\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_merge_bins_from_binned_contig_params(params)

        bin_merges = params.get('bin_merges')
        self._check_bin_merges(bin_merges)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(binned_contig_object.get('data').get('total_contig_len'))

        bins = binned_contig_object.get('data').get('bins')
        old_bin_ids = [item.get('bid') for item in bins]

        for bin_merge in bin_merges:
            new_bin_id = bin_merge.get('new_bin_id')
            bin_id_to_merge = bin_merge.get('bin_to_merge')

            if set(bin_id_to_merge) <= set(old_bin_ids):
                bin_objects_to_merge = []
                for bin in list(bins):
                    bin_id = bin.get('bid')
                    if bin_id in bin_id_to_merge:
                        bin_objects_to_merge.append(bin)
                        log('removing bin_id: {}'.format(bin_id))
                        bins.remove(bin)
                        total_contig_len -= int(bin.get('sum_contig_len'))
                        log('removed bin_id: {} from BinnedContig object'.format(bin_id))

                new_bin = self._merge_bins(new_bin_id, bin_objects_to_merge)
                log('appending bin_id: {}'.format(new_bin_id))
                bins.append(new_bin)
                total_contig_len += int(new_bin.get('sum_contig_len'))
                log('appended bin_id: {} to BinnedContig object'.format(new_bin_id))
            else:
                bad_bin_ids = list(set(bin_id_to_merge) - set(old_bin_ids))
                error_msg = 'bin_id: [{}] '.format(', '.join(bad_bin_ids))
                error_msg += 'is not listed in BinnedContig object'
                raise ValueError(error_msg)

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal

    def edit_bins_from_binned_contig(self, params):
        """
        edit_bins_from_binned_contig: merge/remove a list of bins from a BinnedContig object
        a wrapper method of:
        merge_bins_from_binned_contig
        remove_bins_from_binned_contig

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        bin_merges: a list of bin merge dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace the new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning MetagenomeFileUtils.edit_bins_from_binned_contig\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        input_params = params.copy()

        if params.get('bins_to_remove'):
            bins_to_remove = input_params.get('bins_to_remove')
            if isinstance(bins_to_remove, string_types):
                input_params['bins_to_remove'] = bins_to_remove.split(',')
            new_binned_contig_ref = self.remove_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')
            # chain the removal result into the merge step below
            input_params['old_binned_contig_ref'] = new_binned_contig_ref

        if params.get('bin_merges'):
            new_binned_contig_ref = self.merge_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}

        report_message = self._generate_report_message(new_binned_contig_ref)
        reportVal = self._generate_report(report_message, params)
        returnVal.update(reportVal)

        return returnVal
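# --- Illustrative sketch (not part of the module above) -------------------------------
# A minimal, self-contained sketch of the bin bookkeeping that
# remove_bins_from_binned_contig and merge_bins_from_binned_contig perform, using plain
# dicts instead of workspace objects. The field names ('bid', 'sum_contig_len') mirror
# the code above; the helper names and sample values are illustrative assumptions.

def remove_bins_sketch(bins, total_contig_len, bins_to_remove):
    """Drop the listed bins and subtract their lengths from the running total."""
    removed_len = sum(int(b['sum_contig_len']) for b in bins if b['bid'] in bins_to_remove)
    kept = [b for b in bins if b['bid'] not in bins_to_remove]
    return kept, total_contig_len - removed_len


def merge_bins_sketch(bins, total_contig_len, new_bin_id, bin_ids_to_merge):
    """Replace the listed bins with one bin whose length is the sum of theirs."""
    missing = set(bin_ids_to_merge) - {b['bid'] for b in bins}
    if missing:
        raise ValueError('bin_id: [{}] is not listed in BinnedContig object'.format(
            ', '.join(sorted(missing))))
    to_merge = [b for b in bins if b['bid'] in bin_ids_to_merge]
    kept = [b for b in bins if b['bid'] not in bin_ids_to_merge]
    kept.append({
        'bid': new_bin_id,
        'sum_contig_len': sum(int(b['sum_contig_len']) for b in to_merge),
    })
    # A merge leaves the total unchanged: the subtracted lengths equal the new bin's length.
    return kept, total_contig_len


if __name__ == '__main__':
    bins = [{'bid': 'bin.001', 'sum_contig_len': 1200},
            {'bid': 'bin.002', 'sum_contig_len': 800},
            {'bid': 'bin.003', 'sum_contig_len': 500}]
    bins, total = remove_bins_sketch(bins, 2500, ['bin.003'])
    assert total == 2000
    bins, total = merge_bins_sketch(bins, total, 'bin.004', ['bin.001', 'bin.002'])
    assert total == 2000 and bins[0]['bid'] == 'bin.004'
# ---------------------------------------------------------------------------------------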
class CufflinksUtils: CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/' GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/' def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass def parse_FPKMtracking_calc_TPM(self, filename): """ Generates TPM from FPKM :return: """ fpkm_dict = {} tpm_dict = {} gene_col = 0 fpkm_col = 9 sum_fpkm = 0.0 with open(filename) as f: next(f) for line in f: larr = line.split("\t") gene_id = larr[gene_col] if gene_id != "": fpkm = float(larr[fpkm_col]) sum_fpkm = sum_fpkm + fpkm fpkm_dict[gene_id] = math.log(fpkm + 1, 2) tpm_dict[gene_id] = fpkm if sum_fpkm == 0.0: log("Warning: Unable to calculate TPM values as sum of FPKM values is 0" ) else: for g in tpm_dict: tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2) return fpkm_dict, tpm_dict def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_cufflinks_params(self, params): """ _validate_run_cufflinks_params: Raises an exception if params are invalid """ log('Start validating run_cufflinks params') # check for required parameters for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _run_gffread(self, gff_path, gtf_path): """ _run_gffread: run gffread script ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility """ log('converting gff to gtf') command = self.GFFREAD_TOOLKIT_PATH + '/gffread ' command += "-E {0} -T -o {1}".format(gff_path, gtf_path) self._run_command(command) def _create_gtf_annotation_from_genome(self, genome_ref): """ Create reference annotation file from genome """ ref = self.ws.get_object_subset([{ 'ref': genome_ref, 'included': ['contigset_ref', 'assembly_ref'] }]) if 'contigset_ref' in ref[0]['data']: contig_id = ref[0]['data']['contigset_ref'] elif 'assembly_ref' in ref[0]['data']: contig_id = ref[0]['data']['assembly_ref'] if contig_id is None: raise ValueError( "Genome at {0} does not have reference to the assembly 
object". format(genome_ref)) print(contig_id) log("Generating GFF file from Genome") try: ret = self.au.get_assembly_as_fasta({'ref': contig_id}) output_file = ret['path'] mapping_filename = c_mapping.create_sanitized_contig_ids( output_file) os.remove(output_file) # get the GFF ret = self.gfu.genome_to_gff({'genome_ref': genome_ref}) genome_gff_file = ret['file_path'] c_mapping.replace_gff_contig_ids(genome_gff_file, mapping_filename, to_modified=True) gtf_ext = ".gtf" if not genome_gff_file.endswith(gtf_ext): gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf' self._run_gffread(genome_gff_file, gtf_path) else: gtf_path = genome_gff_file log("gtf file : " + gtf_path) except Exception: raise ValueError( "Generating GTF file from Genome Annotation object Failed : {}" .format("".join(traceback.format_exc()))) return gtf_path def _get_gtf_file(self, alignment_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch alignment_data = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0]['data'] genome_ref = alignment_data.get('genome_id') # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1] # ws_gtf = genome_name+"_GTF_Annotation" genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_gtf_file_from_genome_ref(self, genome_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_input_file(self, alignment_ref): """ _get_input_file: get input BAM file from Alignment object """ bam_file_dir = self.rau.download_alignment( {'source_ref': alignment_ref})['destination_dir'] files = os.listdir(bam_file_dir) bam_file_list = [ file for file in files if re.match(r'.*\_sorted\.bam', file) ] if not bam_file_list: bam_file_list = [ file for file in files if re.match(r'.*(?<!sorted)\.bam', file) ] if not bam_file_list: raise ValueError('Cannot find .bam file from alignment {}'.format( alignment_ref)) bam_file_name = bam_file_list[0] bam_file = os.path.join(bam_file_dir, bam_file_name) return bam_file def _generate_command(self, params): """ _generate_command: generate cufflinks command """ cufflinks_command = '/opt/cufflinks/cufflinks' cufflinks_command += (' -q --no-update-check -p ' + str(params.get('num_threads', 1))) if 'max_intron_length' in params and params[ 'max_intron_length'] is not None: cufflinks_command += (' --max-intron-length ' + str(params['max_intron_length'])) if 'min_intron_length' in params and params[ 'min_intron_length'] is not None: cufflinks_command += (' --min-intron-length ' + str(params['min_intron_length'])) if 'overhang_tolerance' in params and 
params[ 'overhang_tolerance'] is not None: cufflinks_command += (' --overhang-tolerance ' + str(params['overhang_tolerance'])) cufflinks_command += " -o {0} -G {1} {2}".format( params['result_directory'], params['gtf_file'], params['input_file']) log('Generated cufflinks command: {}'.format(cufflinks_command)) return cufflinks_command def _process_rnaseq_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing RNASeqAlignment object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) if '/' not in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_rnaseq_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params['gtf_file'], params['expression_suffix']) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _process_kbasesets_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing KBaseSets object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_kbasesets_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params.get('gtf_file'), params.get('expression_suffix')) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_html_report(self, result_directory, obj_ref): """ _generate_html_report: generate html summary report """ log('Start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_object_type = expression_object.get('info')[2] Overview_Content = '' if re.match('KBaseRNASeq.RNASeqExpression-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format( 
expression_object.get('info')[1]) elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data'][ 'sample_expression_ids']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] Overview_Content += '<p>{}</p>'.format(expression_name) elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): pprint(expression_object) Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data']['items']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref['ref'] }], includeMetadata=None)[0][1] condition = expression_ref['label'] Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format( condition, expression_name) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', Overview_Content) result_file.write(report_template) html_report.append({ 'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Cufflinks App' }) return html_report def _save_rnaseq_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_rnaseq_expression: save Expression object to workspace """ log('start saving Expression object') alignment_object_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_kbasesets_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils and SetAPI """ log('start saving Expression object') alignment_info = self.ws.get_object_info3( {'objects': [{ "ref": alignment_ref }]}) alignment_object_name = alignment_info['infos'][0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_rnaseq_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): 
""" _save_rnaseq_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _save_kbasesets_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_kbasesets_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _generate_report(self, obj_ref, workspace_name, result_directory, exprMatrix_FPKM_ref=None, exprMatrix_TPM_ref=None): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report(result_directory, obj_ref) expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_info = expression_object['info'] expression_data = expression_object['data'] expression_object_type = expression_info[2] if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseSets.ExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'ExpressionSet generated by Cufflinks' }] items = expression_data['items'] for item in items: objects_created.append({ 'ref': item['ref'], 'description': 'Expression generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_FPKM_ref, 'description': 'FPKM ExpressionMatrix generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_TPM_ref, 'description': 'TPM ExpressionMatrix generated by Cufflinks' }) report_params = { 'message': '', 'workspace_name': workspace_name, 'file_links': output_files, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': 
output['ref'] } return report_output def _parse_FPKMtracking(self, filename, metric): result = {} pos1 = 0 if metric == 'FPKM': pos2 = 7 if metric == 'TPM': pos2 = 8 with open(filename) as f: next(f) for line in f: larr = line.split("\t") if larr[pos1] != "": try: result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2) except ValueError: result[larr[pos1]] = math.log(1, 2) return result def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'cufflinks_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Cufflinks App' }) return output_files def _generate_expression_data(self, result_directory, alignment_ref, gtf_file, workspace_name, expression_suffix): """ _generate_expression_data: generate Expression object with cufflinks output files """ alignment_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0] # set expression name alignment_object_name = alignment_data_object['info'][1] if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_data = { 'id': expression_name, 'type': 'RNA-Seq', 'numerical_interpretation': 'FPKM', 'processing_comments': 'log2 Normalized', 'tool_used': self.tool_used, 'tool_version': self.tool_version } alignment_data = alignment_data_object['data'] condition = alignment_data.get('condition') expression_data.update({'condition': condition}) genome_id = alignment_data.get('genome_id') expression_data.update({'genome_id': genome_id}) read_sample_id = alignment_data.get('read_sample_id') expression_data.update( {'mapped_rnaseq_alignment': { read_sample_id: alignment_ref }}) exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM( os.path.join(result_directory, 'genes.fpkm_tracking')) expression_data.update({'expression_levels': exp_dict}) expression_data.update({'tpm_expression_levels': tpm_exp_dict}) handle = self.dfu.file_to_shock({ 'file_path': result_directory, 'pack': 'zip', 'make_handle': True })['handle'] expression_data.update({'file': handle}) return expression_data def _generate_expression_set_data(self, alignment_expression_map, alignment_set_ref, expression_set_name): """ _generate_expression_set_data: generate ExpressionSet object with cufflinks output files """ alignment_set_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_set_ref }]})['data'][0] alignment_set_data = alignment_set_data_object['data'] expression_set_data = { 'tool_used': self.tool_used, 'tool_version': self.tool_version, 'id': expression_set_name, 'alignmentSet_id': alignment_set_ref, 'genome_id': alignment_set_data.get('genome_id'), 'sampleset_id': alignment_set_data.get('sampleset_id') } sample_expression_ids = [] mapped_expression_objects = [] mapped_expression_ids 
= [] for alignment_expression in alignment_expression_map: alignment_ref = alignment_expression.get('alignment_ref') expression_ref = alignment_expression.get('expression_obj_ref') sample_expression_ids.append(expression_ref) mapped_expression_ids.append({alignment_ref: expression_ref}) alignment_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] mapped_expression_objects.append({alignment_name: expression_name}) expression_set_data['sample_expression_ids'] = sample_expression_ids expression_set_data[ 'mapped_expression_objects'] = mapped_expression_objects expression_set_data['mapped_expression_ids'] = mapped_expression_ids return expression_set_data def _process_alignment_set_object(self, params, alignment_object_type): """ _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object and KBaseSets.ReadsAlignmentSet type object """ log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object' '\nparams:\n{}'.format(json.dumps(params, indent=1))) alignment_set_ref = params.get('alignment_set_ref') if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): params['gtf_file'] = self._get_gtf_file(alignment_set_ref) else: if not '/' in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] params['gtf_file'] = self._get_gtf_file_from_genome_ref( params['genome_ref']) alignment_set = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) mul_processor_params = [] for alignment in alignment_set["data"]["items"]: alignment_ref = alignment['ref_path'] alignment_upload_params = params.copy() alignment_upload_params['alignment_ref'] = alignment_ref mul_processor_params.append(alignment_upload_params) # use the following when you want to run the cmd sequentially # self._process_kbasesets_alignment_object(mul_processor_params[0]) cpus = min(params.get('num_threads'), multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) alignment_expression_map = pool.map( self._process_kbasesets_alignment_object, mul_processor_params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) expression_items = list() for proc_alignment_return in alignment_expression_map: expression_obj_ref = proc_alignment_return.get( 'expression_obj_ref') alignment_ref = proc_alignment_return.get('alignment_ref') alignment_info = self.ws.get_object_info3({ 'objects': [{ "ref": alignment_ref }], 'includeMetadata': 1 }) condition = alignment_info['infos'][0][10]['condition'] expression_items.append({ "ref": expression_obj_ref, "label": condition, }) expression_name = self.ws.get_object_info( [{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] self._run_command('cp -R {} {}'.format( proc_alignment_return.get('result_directory'), os.path.join(result_directory, expression_name))) expression_set = { "description": "generated by kb_cufflinks", "items": expression_items } expression_set_info = self.set_api.save_expression_set_v1({ "workspace": params['workspace_name'], "output_object_name": params['expression_set_name'], "data": expression_set }) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_set_info['set_ref'] } widget_params = { "output": 
params.get('expression_set_name'), "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_output_object_name(self, params, alignment_object_type, alignment_object_name): """ Generates the output object name based on input object type and name and stores it in params with key equal to 'expression' or 'expression_set' based on whether the input object is an alignment or alignment_set. :param params: module input params :param alignment_object_type: input alignment object type :param alignment_object_name: input alignment object name :param alignment_object_data: input alignment object data """ expression_set_suffix = params['expression_set_suffix'] expression_suffix = params['expression_suffix'] if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): if re.match('.*_[Aa]lignment$', alignment_object_name): params['expression_name'] = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_name'] = alignment_object_name + expression_suffix if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix def _save_expression_matrix(self, expressionset_ref, workspace_name): """ _save_expression_matrix: save FPKM and TPM ExpressionMatrix """ log('start saving ExpressionMatrix object') expression_set_name = self.ws.get_object_info( [{ "ref": expressionset_ref }], includeMetadata=None)[0][1] output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '', expression_set_name) upload_expression_matrix_params = { 'expressionset_ref': expressionset_ref, 'output_obj_name': output_obj_name_prefix, 'workspace_name': workspace_name } expression_matrix_refs = self.eu.get_expressionMatrix( upload_expression_matrix_params) return expression_matrix_refs def run_cufflinks_app(self, params): log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_cufflinks_params(params) alignment_object_ref = params.get('alignment_object_ref') alignment_object_info = self.ws.get_object_info3( {"objects": [{ "ref": alignment_object_ref }]})['infos'][0] alignment_object_type = alignment_object_info[2] alignment_object_name = alignment_object_info[1] # get output object name self._generate_output_object_name(params, alignment_object_type, alignment_object_name) log('--->\nalignment object type: \n' + '{}'.format(alignment_object_type)) if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): params.update({'alignment_ref': alignment_object_ref}) returnVal = self._process_rnaseq_alignment_object(params) report_output = self._generate_report( returnVal.get('expression_obj_ref'), params.get('workspace_name'), returnVal.get('result_directory')) returnVal.update(report_output) elif 
re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \ re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): params.update({'alignment_set_ref': alignment_object_ref}) returnVal = self._process_alignment_set_object( params, alignment_object_type) expression_matrix_refs = self._save_expression_matrix( returnVal['expression_obj_ref'], params.get('workspace_name')) returnVal.update(expression_matrix_refs) report_output = self._generate_report( returnVal['expression_obj_ref'], params.get('workspace_name'), returnVal['result_directory'], expression_matrix_refs['exprMatrix_FPKM_ref'], expression_matrix_refs['exprMatrix_TPM_ref']) returnVal.update(report_output) else: raise ValueError( 'None RNASeqAlignment type\nObject info:\n{}'.format( alignment_object_info)) return returnVal
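# --- Illustrative sketch (not part of the module above) -------------------------------
# Standalone version of the FPKM -> TPM conversion performed by
# parse_FPKMtracking_calc_TPM: TPM_g = FPKM_g / sum(FPKM) * 1e6, and both dictionaries
# are stored on a log2(value + 1) scale. The gene ids and FPKM values below are
# illustrative; only the standard-library math module is assumed.
import math


def fpkm_to_log2_dicts(fpkm_by_gene):
    fpkm_dict = {g: math.log(f + 1, 2) for g, f in fpkm_by_gene.items()}
    sum_fpkm = sum(fpkm_by_gene.values())
    if sum_fpkm == 0.0:
        # mirrors the warning branch above: TPM cannot be computed when all FPKMs are 0
        tpm_dict = dict(fpkm_by_gene)
    else:
        tpm_dict = {g: math.log((f / sum_fpkm) * 1e6 + 1, 2)
                    for g, f in fpkm_by_gene.items()}
    return fpkm_dict, tpm_dict


# gene_B has 3x the FPKM of gene_A, so it also gets 3x the (pre-log) TPM:
# TPM(gene_A) = 10 / 40 * 1e6 = 250,000 and TPM(gene_B) = 750,000
fpkm, tpm = fpkm_to_log2_dicts({'gene_A': 10.0, 'gene_B': 30.0, 'gene_C': 0.0})
# ---------------------------------------------------------------------------------------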
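# --- Illustrative sketch (not part of the module above) -------------------------------
# Condensed version of the flag-assembly pattern in _generate_command: optional
# parameters contribute a command-line flag only when present and not None. The binary
# path and parameter names are taken from the code above; the function name and the
# example paths are placeholders.
def build_cufflinks_command_sketch(params):
    cmd = ('/opt/cufflinks/cufflinks -q --no-update-check -p ' +
           str(params.get('num_threads', 1)))
    optional_flags = [
        ('max_intron_length', '--max-intron-length'),
        ('min_intron_length', '--min-intron-length'),
        ('overhang_tolerance', '--overhang-tolerance'),
    ]
    for key, flag in optional_flags:
        if params.get(key) is not None:
            cmd += ' {} {}'.format(flag, params[key])
    cmd += ' -o {0} -G {1} {2}'.format(
        params['result_directory'], params['gtf_file'], params['input_file'])
    return cmd


# Example (paths are placeholders):
# build_cufflinks_command_sketch({'num_threads': 4, 'max_intron_length': 300000,
#                                 'result_directory': '/tmp/out',
#                                 'gtf_file': '/tmp/ref.gtf',
#                                 'input_file': '/tmp/aln_sorted.bam'})
# ---------------------------------------------------------------------------------------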
class EditAlignmentSet:
    """
    Contains a set of functions for editing alignment set objects
    (adding and removing alignments).
    """

    PARAM_IN_WS_NAME_ID = 'workspace_name'
    PARAM_IN_OBJ_NAME_ID = 'output_object_name'
    PARAM_IN_ALIGNSET_REF = 'alignment_set_ref'
    PARAM_IN_ALIGNS_ADD = 'alignments_to_add'
    PARAM_IN_ALIGNS_RM = 'alignments_to_remove'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'], 'EAS_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)

    def _process_params(self, params):
        """
        validates params passed to the edit_alignment_set method
        """
        for p in [self.PARAM_IN_ALIGNSET_REF,
                  self.PARAM_IN_OBJ_NAME_ID,
                  self.PARAM_IN_WS_NAME_ID]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME_ID)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD)
        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM)

        if alignments_to_add is None and alignments_to_remove is None:
            raise ValueError(
                'Either "alignments_to_remove" or "alignments_to_add" should be given')

        return ws_name_id

    def _get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def _get_obj_info(self, ref):
        return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]

    def _get_set_items(self, alignment_set_ref):
        obj_info = self._get_obj_info(alignment_set_ref)
        obj_type = self._get_type_from_obj_info(obj_info)

        if obj_type in ['KBaseSets.ReadsAlignmentSet']:
            set_data = self.setAPI.get_reads_alignment_set_v1({'ref': alignment_set_ref})
            items = set_data['data']['items']
        elif obj_type in ['KBaseRNASeq.RNASeqAlignmentSet']:
            alignmentset_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': alignment_set_ref}]})['data'][0]
            # Add each alignment object as an align_item and append it to the items list
            items = list()
            for alignment_ref in alignmentset_obj['data']['sample_alignments']:
                align_item = dict()
                align_item['ref'] = alignment_ref
                items.append(align_item)
        else:
            raise ValueError(
                '"alignment_set_ref" should be of type KBaseSets.ReadsAlignmentSet or ' +
                'KBaseRNASeq.RNASeqAlignmentSet')

        return items

    def _add_alignments(self, alignment_set_items, alignment_refs_list):
        for alignment_ref in alignment_refs_list:
            found = False
            for set_item in alignment_set_items:
                if set_item.get('ref') == alignment_ref:
                    print('{} already in the input Alignment Set. Not added'.format(
                        alignment_ref))
                    found = True
                    break
            if not found:
                alignment_set_items.append({'ref': alignment_ref})
        return alignment_set_items

    def _remove_alignments(self, input_alignment_set, alignment_set_items,
                           alignments_to_remove):
        for input_item in input_alignment_set:
            if input_item.get('ref') not in alignments_to_remove:
                alignment_set_items.append(input_item)
        return alignment_set_items

    def _save_alignment_set(self, ws_name, obj_name, set_data):
        res = self.setAPI.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": obj_name,
            "data": set_data
        })
        return res.get('set_ref')

    def edit_alignment_set(self, params):
        ws_name_id = self._process_params(params)

        obj_name = params.get(self.PARAM_IN_OBJ_NAME_ID)

        alignment_set_ref = params.get(self.PARAM_IN_ALIGNSET_REF)
        print('INPUT ALIGNMENT SET REF: ' + alignment_set_ref)

        input_alignment_set = self._get_set_items(alignment_set_ref)

        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM, None)
        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD, None)

        set_items = list()
        if alignments_to_remove is not None:
            set_items = self._remove_alignments(input_alignment_set, set_items,
                                                alignments_to_remove)
        if alignments_to_add is not None:
            set_items = self._add_alignments(set_items, alignments_to_add)

        set_data = {
            'description': 'Edited from {}'.format(alignment_set_ref),
            'items': set_items
        }

        output_alignment_set_ref = self._save_alignment_set(ws_name_id, obj_name, set_data)

        return output_alignment_set_ref
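# --- Illustrative sketch (not part of the class above) --------------------------------
# Service-free demonstration of the add/remove semantics used by edit_alignment_set:
# items are {'ref': ...} dicts, removals filter the input set, and additions are
# de-duplicated against what is already present. The function names and refs below are
# placeholders.

def remove_items_sketch(input_items, refs_to_remove):
    return [item for item in input_items if item['ref'] not in refs_to_remove]


def add_items_sketch(items, refs_to_add):
    present = {item['ref'] for item in items}
    for ref in refs_to_add:
        if ref in present:
            print('{} already in the input Alignment Set. Not added'.format(ref))
        else:
            items.append({'ref': ref})
            present.add(ref)
    return items


input_set = [{'ref': '123/4/1'}, {'ref': '123/5/1'}, {'ref': '123/6/1'}]
items = remove_items_sketch(input_set, ['123/5/1'])       # drop one alignment
items = add_items_sketch(items, ['123/7/1', '123/4/1'])   # add one, skip a duplicate
assert [i['ref'] for i in items] == ['123/4/1', '123/6/1', '123/7/1']
# ---------------------------------------------------------------------------------------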