def UploadFromMEME(self, ctx, params): """ :param params: instance of type "UploadGibbsInParams" -> structure: parameter "path" of String, parameter "ws_name" of String, parameter "obj_name" of String :returns: instance of type "UploadOutput" -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: output #BEGIN UploadFromMEME print('Extracting motifs') motifList = MU.parse_meme_output(params['path']) print(motifList) MSO = {} MSO['Condition'] = 'Temp' MSO['SequenceSet_ref'] = '123' MSO['Motifs'] = [] MSO['Alphabet'] = ['A', 'C', 'G', 'T'] MSO['Background'] = {} for letter in MSO['Alphabet']: MSO['Background'][letter] = 0.0 MSU.parseMotifList(motifList, MSO) MSU.CheckLength(MSO, params['min_len'], params['max_len']) if 'absolute_locations' in params: for motif in MSO['Motifs']: for loc in motif['Motif_Locations']: if loc['sequence_id'] in params['absolute_locations']: loc['sequence_id'] = params['contig'] absStart = int(params['start']) loc['start'] = absStart loc['end'] = absStart + loc['end'] dfu = DataFileUtil(self.callback_url) save_objects_params = {} save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name']) save_objects_params['objects'] = [{ 'type': 'KBaseGeneRegulation.MotifSet', 'data': MSO, 'name': params['obj_name'] }] info = dfu.save_objects(save_objects_params)[0] print('SAVED OBJECT') print(info) motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4]) print(motif_set_ref) output = {'obj_ref': motif_set_ref} print(output) #END UploadFromMEME # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method UploadFromMEME return value ' + 'output is not type dict as required.') # return the results return [output]
def _save_to_ws_and_report(self, ws_id, source, assembly_data): dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL']) workspace_id = dfu.ws_name_to_id(self.getWsName()) print("Workspace id: {}".format(workspace_id)) info = dfu.save_objects({ 'id': '18590', # Numerical id of workspace "objects": [{ "type": "KBaseGenomeAnnotations.Assembly-3.0", "data": assembly_data, "name": ws_id }] })[0] #print("Data from save to ws: {}".format(json.dumps(info, indent=2))) assembly_ref = "%s/%s/%s" % (info[6], info[0], info[4]) return assembly_ref
class FastaToAssembly: def __init__(self, callback_url, scratch): self.scratch = scratch self.dfu = DataFileUtil(callback_url) # Note added X due to kb|g.1886.fasta self.valid_chars = "-ACGTUWSMKRYBDHVNX" self.amino_acid_specific_characters = "PLIFQE" def import_fasta(self, ctx, params): print('validating parameters') self.validate_params(params) print('staging input files') fasta_file_path = self.stage_input(params) if 'min_contig_length' in params: min_contig_length = int(params['min_contig_length']) print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)') fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length) print('parsing FASTA file: ' + str(fasta_file_path)) assembly_data = self.parse_fasta(fasta_file_path, params) print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' + str(assembly_data['dna_size']) + 'bp') print('saving assembly to KBase') # save file to shock and build handle fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path) # construct the output object assembly_object_to_save = self.build_assembly_object(assembly_data, fasta_file_handle_info, params) # save to WS and return if 'workspace_id' in params: workspace_id = int(params['workspace_id']) else: workspace_id = self.dfu.ws_name_to_id(params['workspace_name']) assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'], assembly_object_to_save) return assembly_info def build_assembly_object(self, assembly_data, fasta_file_handle_info, params): ''' construct the WS object data to save based on the parsed info and params ''' assembly_data['assembly_id'] = params['assembly_name'] assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid'] assembly_data['fasta_handle_info'] = fasta_file_handle_info assembly_data['type'] = 'Unknown' if 'type' in params: assembly_data['type'] = params['type'] if 'taxon_ref' in params: assembly_data['taxon_ref'] = params['taxon_ref'] if 'external_source' in params: assembly_data['external_source'] = params['external_source'] if 'external_source_id' in params: assembly_data['external_source_id'] = params['external_source_id'] if 'external_source_origination_date' in params: assembly_data['external_source_origination_date'] = params['external_source_origination_date'] return assembly_data def parse_fasta(self, fasta_file_path, params): ''' Do the actual work of inspecting each contig ''' # variables to store running counts of things total_length = 0 base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0} md5_list = [] # map from contig_id to contig_info all_contig_data = {} extra_contig_info = {} if'contig_info' in params: extra_contig_info = params['contig_info'] for record in SeqIO.parse(fasta_file_path, "fasta"): # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()), # id='gi|113968346|ref|NC_008321.1|', # name='gi|113968346|ref|NC_008321.1|', # description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome', # dbxrefs=[]) sequence = str(record.seq).upper() contig_info = { 'contig_id': record.id, 'name': record.id, 'description': record.description[len(record.id):].strip(), 'length': len(record.seq) } # 1) compute sequence character statistics running total total_length += contig_info['length'] sequence_count_table = dict(Counter(sequence)) for character in sequence_count_table: if character in base_counts: base_counts[character] = base_counts[character] + sequence_count_table[character] else: base_counts[character] = sequence_count_table[character] if character not in self.valid_chars: if character in self.amino_acid_specific_characters: raise ValueError('This fasta file may have amino acids in it instead ' + 'of the required nucleotides.') raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character)) # 2) record number of 'N' characters (only set if there are some) Ncount = 0 if 'N' in sequence_count_table: Ncount = sequence_count_table['N'] contig_info['Ncount'] = Ncount # 2b) record if the contig is circular if record.id in extra_contig_info: if 'is_circ' in extra_contig_info[record.id]: contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ']) if 'description' in extra_contig_info[record.id]: contig_info['description'] = str(extra_contig_info[record.id]['description']) # 3) record md5 checksum contig_md5 = md5(sequence).hexdigest() contig_info['md5'] = contig_md5 md5_list.append(contig_md5) # 4) record the all important GC to ~3 significant digits GC_count = 0 for base in ['G', 'C']: if base in sequence_count_table: GC_count += sequence_count_table[base] contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5) # 5) add to contig list if contig_info['contig_id'] in all_contig_data: raise ValueError('The fasta header key ' + contig_info['contig_id'] + 'appears more than once in the file') all_contig_data[contig_info['contig_id']] = contig_info # Aggregate stats for the data total_gc_content = None if total_length > 0: total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5) assembly_data = { 'md5': md5(",".join(sorted(md5_list))).hexdigest(), 'base_counts': base_counts, 'dna_size': total_length, 'gc_content': total_gc_content, 'contigs': all_contig_data, 'num_contigs': len(all_contig_data) } return assembly_data def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length): ''' generates SeqRecords iterator for writing from a legacy contigset object ''' rows = 0 rows_added = 0 for record in fasta_record_iter: rows += 1 if len(record.seq) >= min_contig_length: rows_added += 1 yield record print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' + str(min_contig_length) + 'bp.') def filter_contigs_by_length(self, fasta_file_path, min_contig_length): ''' removes all contigs less than the min_contig_length provided ''' filtered_fasta_file_path = fasta_file_path + '.filtered.fa' fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta') SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length), filtered_fasta_file_path, 'fasta') return filtered_fasta_file_path def save_assembly_object(self, workspace_id, assembly_name, obj_data): print('Saving Assembly to Workspace') sys.stdout.flush() obj_info = self.dfu.save_objects({'id': workspace_id, 'objects': [{'type': 'KBaseGenomeAnnotations.Assembly', 'data': obj_data, 'name': assembly_name }] })[0] return obj_info def save_fasta_file_to_shock(self, fasta_file_path): ''' Given the path to the file, upload to shock and return Handle information returns: typedef structure { string shock_id; Handle handle; string node_file_name; string size; } FileToShockOutput; ''' print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK') sys.stdout.flush() return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1}) def stage_input(self, params): ''' Setup the input_directory by fetching the files and returning the path to the file''' file_path = None if 'file' in params: file_path = os.path.abspath(params['file']['path']) elif 'shock_id' in params: print('Downloading file from SHOCK node: ' + str(params['shock_id'])) sys.stdout.flush() input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4())) os.makedirs(input_directory) file_name = self.dfu.shock_to_file({'file_path': input_directory, 'shock_id': params['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) elif 'ftp_url' in params: print('Downloading file from: ' + str(params['ftp_url'])) sys.stdout.flush() file_path = self.dfu.download_web_file({'file_url': params['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] # extract the file if it is compressed if file_path is not None: unpacked_file = self.dfu.unpack_file({'file_path': file_path}) return unpacked_file['file_path'] raise ValueError('No valid fasta could be extracted based on the input parameters') def validate_params(self, params): for key in ('workspace_name', 'assembly_name'): if key not in params: raise ValueError('required "' + key + '" field was not defined') # one and only one of either 'file', 'shock_id', or ftp_url is required input_count = 0 for key in ('file', 'shock_id', 'ftp_url'): if key in params and params[key] is not None: input_count = input_count + 1 if key == 'file': if not isinstance(params[key], dict) or 'path' not in params[key]: raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"') if input_count == 0: raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"') if input_count > 1: raise ValueError('required exactly one fasta file as input source, you set more than one of ' + 'these fields: "file", "shock_id", or "ftp_url"')
class GenDiffExprMatrix: INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') def __init__(self, config, logger=None): self.config = config self.logger = logger self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = os.path.join(config['scratch'], 'DEM_' + str(uuid.uuid4())) self.ws_url = config['workspace-url'] self.ws_client = Workspace(self.ws_url) self.fv = KBaseFeatureValues(self.callback_url) self.dfu = DataFileUtil(self.callback_url) self.setAPI = SetAPI(self.callback_url) self.gsu = GenomeSearchUtil(self.callback_url) self._mkdir_p(self.scratch) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def setup_data(self): self.new_col_names = [ 'gene_id', 'log2_fold_change', 'p_value', 'q_value' ] def get_feature_ids(self, genome_ref): """ _get_feature_ids: get feature ids from genome """ feature_num = self.gsu.search({'ref': genome_ref})['num_found'] genome_features = self.gsu.search({ 'ref': genome_ref, 'limit': feature_num, 'sort_by': [['feature_id', True]] })['features'] features_ids = map( lambda genome_feature: genome_feature.get('feature_id'), genome_features) return list(set(features_ids)) def gen_matrix(self, infile, old_col_names, delimiter): with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = self.new_col_names[1:] row_names = [] values = [] for row in rdr: try: values.append([float(row[v]) for v in old_col_names[1:]]) except: values_list = [] for v in old_col_names[1:]: tmpval = row[v] if isinstance(tmpval, (int, long, float)): values_list.append(float(tmpval)) elif isinstance(tmpval, basestring): if 'na' in tmpval.lower( ) or 'none' in tmpval.lower(): values_list.append(None) else: tmpval = tmpval.replace("'", "") tmpval = tmpval.replace('"', '') values_list.append(float(tmpval)) else: raise ValueError( "invalid type in input file: {}".format( tmpval)) values.append(values_list) row_names.append(row[old_col_names[0]]) twoD_matrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return twoD_matrix def get_max_fold_change_to_handle_inf(self, infile): maxvalue = 0 with open(infile) as source: rdr = csv.DictReader(source, dialect='excel-tab') for line in rdr: log2fc_val = line.get('log2_fold_change') if not 'inf' in str(log2fc_val): log2fc = abs(float(log2fc_val)) if log2fc > maxvalue: maxvalue = log2fc print 'maxvalue: ', maxvalue return maxvalue def gen_cuffdiff_matrix(self, infile, delimiter='\t'): max_value = self.get_max_fold_change_to_handle_inf(infile) with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = self.new_col_names[1:] row_names = [] values = [] for row in rdr: log2fc_val = row.get('log2_fold_change') # print 'FC_VAL: ', log2fc_val if '-inf' in str(log2fc_val): row['log2_fold_change'] = -float(max_value) elif 'inf' in str(log2fc_val): row['log2_fold_change'] = float(max_value) elif 'nan' in str(log2fc_val): row['log2_fold_change'] = None try: values.append( [float(row[v]) for v in self.new_col_names[1:]]) except: values.append( [None] + [float(row[v]) for v in self.new_col_names[2:]]) row_names.append(row[self.new_col_names[0]]) tmatrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return tmatrix def save_diff_expr_matrix(self, obj_name, data_matrix, condition1, condition2): dem_data = { 'genome_ref': self.params.get('genome_ref'), 'data': data_matrix, 'condition_mapping': { condition1: condition2 }, 'type': 'log2_level', 'scale': '1.0' } res = self.dfu.save_objects({ 'id': self.params.get('ws_id'), "objects": [{ "type": "KBaseFeatureValues.DifferentialExpressionMatrix", "data": dem_data, "name": obj_name, "extra_provenance_input_refs": [self.params.get('genome_ref')] }] })[0] ret_ref = str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) return ret_ref def save_diff_expr_matrix_set(self, obj_name, matrix_set): res = self.setAPI.save_differential_expression_matrix_set_v1({ "workspace": self.params.get('ws_name'), "output_object_name": obj_name, "data": matrix_set }) return res.get('set_ref') # # ballgown always outputs a linear fold change, which we need to convert to log2 # before storing # def safely_apply_log2_to_fc(self, row): if row[0]: fc = row[0] if fc < 1.0e-10: fc = fc + 1.0e-10 # incase fc is zero return ([log2(fc)] + row[1:]) else: return (row) def process_ballgown_file(self, diffexpr_filepath): ballgown_col_names = ['id', 'fc', 'pval', 'qval'] data_matrix = self.gen_matrix(diffexpr_filepath, ballgown_col_names, delimiter='\t') log2_data_matrix = data_matrix log2_data_matrix['values'] = map(self.safely_apply_log2_to_fc, data_matrix.get('values')) dem_ref = self.save_diff_expr_matrix( self.params.get('obj_name') + '_0', log2_data_matrix, None, None) set_items = [{ 'label': 'global Differential Expression Data', 'ref': dem_ref }] matrix_set = { 'description': 'ballgown Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) def process_deseq_file(self, diffexpr_filepath): deseq_col_names = ['geneID', 'log2FoldChange', 'pvalue', 'padj'] data_matrix = self.gen_matrix(diffexpr_filepath, deseq_col_names, delimiter=',') dem_ref = self.save_diff_expr_matrix( self.params.get('obj_name') + '_0', data_matrix, None, None) set_items = [{ 'label': 'global Differential Expression Data', 'ref': dem_ref }] matrix_set = { 'description': 'deseq Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) def process_cuffdiff_file(self, diffexpr_filepath): cuffdiff_col_names = [ 'gene', 'log2(fold_change)', 'p_value', 'q_value' ] ConditionPair = namedtuple("ConditionPair", ["condition1", "condition2"]) FileInfo = namedtuple('FileInfo', ['file_path', 'file_obj']) condPair_fileInfo = {} timestamp = str( int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)) with open(diffexpr_filepath, 'rb') as source: rdr = csv.DictReader(source, dialect='excel-tab') """ save the files opened for writing in outfiles list, so they can be closed later """ outfiles = list() for r in rdr: c1 = r['sample_1'] c2 = r['sample_2'] cond_pair = ConditionPair(condition1=c1, condition2=c2) tsv_file_info = condPair_fileInfo.get(cond_pair, None) if tsv_file_info is None: tsv_file_name = timestamp + '_' + c1 + '~~' + c2 tsv_file_path = os.path.join(self.scratch, tsv_file_name) outfile = open(tsv_file_path, 'wb') outfiles.append(outfile) csv_wtr = csv.DictWriter(outfile, delimiter='\t', fieldnames=self.new_col_names) csv_wtr.writerow( dict((cn, cn) for cn in self.new_col_names)) tsv_file_info = FileInfo(file_path=tsv_file_path, file_obj=csv_wtr) condPair_fileInfo[cond_pair] = tsv_file_info wtr = tsv_file_info.file_obj col_vals = [r[v] for v in cuffdiff_col_names] wtr.writerow(dict(zip(self.new_col_names, col_vals))) for ofile in outfiles: ofile.close() set_items = list() for cond_pair, file_info in condPair_fileInfo.iteritems(): print 'Cond_pair: ', cond_pair print 'File: ', file_info.file_path tsv_file = file_info.file_path data_matrix = self.gen_cuffdiff_matrix(tsv_file) object_name = self.get_obj_name(self.params['obj_name'], cond_pair.condition1, cond_pair.condition2) dem_ref = self.save_diff_expr_matrix(object_name, data_matrix, cond_pair.condition1, cond_pair.condition2) print('process_cuffdiff_file: DEM_REF: ' + dem_ref) set_items.append({ 'label': cond_pair.condition1 + ', ' + cond_pair.condition2, 'ref': dem_ref }) matrix_set = { 'description': 'cuffdiff Diff Exp Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set) """ Functions for save_differentialExpressionMatrixSet """ def save_matrix(self, genome_ref, infile, in_col_names, delimiter): feature_ids = self.get_feature_ids(genome_ref) with open(infile, 'rb') as source: rdr = csv.DictReader(source, delimiter=delimiter) col_names = in_col_names[1:] row_names = [] values = [] for row in rdr: if row[in_col_names[0]] in feature_ids: row_names.append(row[in_col_names[0]]) else: gene_ids = row[in_col_names[0]].strip().split(',') match = True mismatched_gene_ids = list() for gene_id in gene_ids: gene_id = gene_id.strip() if gene_id not in feature_ids: mismatched_gene_ids.append(gene_id) match = False if match: row_names.append(row[in_col_names[0]]) else: error_msg = 'Gene_id(s) "{}" is not a known feature in "{}"'.format( ', '.join(mismatched_gene_ids), self.params.get('genome_ref')) raise ValueError(error_msg) try: values.append([float(row[v]) for v in in_col_names[1:]]) except: values_list = [] for v in in_col_names[1:]: tmpval = row[v] if isinstance(tmpval, (int, long, float)): values_list.append(float(tmpval)) elif isinstance(tmpval, basestring): if 'na' in tmpval.lower( ) or 'none' in tmpval.lower(): values_list.append(None) else: tmpval = tmpval.replace("'", "") tmpval = tmpval.replace('"', '') values_list.append(float(tmpval)) else: raise ValueError( "invalid type in input file: {}".format( tmpval)) values.append(values_list) twoD_matrix = { 'row_ids': row_names, 'col_ids': col_names, 'values': values } return twoD_matrix @staticmethod def get_obj_name(obj_name, condition1, condition2): def sanitize(ws_name): # I'm not using translate because it's a mess with mixed unicode & strings return ws_name.replace("\t", " ").replace(" ", "_").replace("/", "|") return "{}-{}-{}".format(obj_name, sanitize(condition1), sanitize(condition2)) def gen_diffexpr_matrices(self, params): print('In GEN DEMs') self.params = params self.setup_data() diffexpr_filepath = self.params.get('diffexpr_filepath') if 'deseq' in self.params.get('tool_used').lower(): dem_ref = self.process_deseq_file(diffexpr_filepath) elif 'ballgown' in self.params.get('tool_used').lower(): dem_ref = self.process_ballgown_file(diffexpr_filepath) elif 'cuffdiff' in self.params.get('tool_used').lower(): dem_ref = self.process_cuffdiff_file(diffexpr_filepath) else: raise ValueError('"{}" is not a valid tool_used parameter'.format( self.params.get('tool_used'))) return dem_ref def save_diffexpr_matrices(self, params): print('In SAVE DEMs') self.params = params self.setup_data() set_items = list() for deFile in self.params.get('diffexpr_data'): condition_mapping = deFile.get('condition_mapping') diffexpr_filepath = deFile.get('diffexpr_filepath') if deFile.get('delimter', None) is not None: delimiter = deFile.get('delimter') else: delimiter = '\t' fileext = os.path.splitext(diffexpr_filepath)[1] if 'csv' in fileext.lower(): delimiter = ',' elif 'tsv' in fileext.lower(): delimiter = '\t' else: print('Using tab delimiter') data_matrix = self.save_matrix(self.params.get('genome_ref'), diffexpr_filepath, self.new_col_names, delimiter) condition1, condition2 = condition_mapping.items()[0] object_name = self.get_obj_name(self.params['obj_name'], condition1, condition2) dem_ref = self.save_diff_expr_matrix(object_name, data_matrix, condition1, condition2) set_items.append({ 'label': condition1 + ', ' + condition2, 'ref': dem_ref }) matrix_set = { 'description': self.params.get('tool_used') + ' Differential Expression Matrix Set', 'items': set_items } return self.save_diff_expr_matrix_set(self.params.get('obj_name'), matrix_set)
class ExprMatrixUtils: """ Constains a set of functions for expression levels calculations. """ PARAM_IN_WS_NAME = 'workspace_name' PARAM_IN_OBJ_NAME = 'output_obj_name' PARAM_IN_EXPSET_REF = 'expressionset_ref' def __init__(self, config, logger=None): self.config = config self.logger = logger self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4())) self.ws_url = config['workspace-url'] self.ws_client = Workspace(self.ws_url) self.dfu = DataFileUtil(self.callback_url) pass def process_params(self, params): """ validates params passed to gen expression matrix method """ for p in [self.PARAM_IN_EXPSET_REF, self.PARAM_IN_OBJ_NAME, self.PARAM_IN_WS_NAME ]: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) ws_name_id = params.get(self.PARAM_IN_WS_NAME) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.ws_id = ws_name_id def get_expressionset_data(self, expressionset_ref): expr_set_obj = self.ws_client.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0] expr_set_obj_type = expr_set_obj.get('info')[2] expr_set_data = dict() expr_set_data['ws_name'] = expr_set_obj.get('info')[7] expr_set_data['obj_name'] = expr_set_obj.get('info')[1] if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type): expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id'] expr_obj_refs = list() for expr_obj in expr_set_obj['data']['mapped_expression_ids']: expr_obj_refs.append(expr_obj.values()[0]) expr_set_data['expr_obj_refs'] = expr_obj_refs elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type): items = expr_set_obj.get('data').get('items') expr_obj_refs = list() for item in items: expr_obj_refs.append(item['ref']) expr_obj = self.ws_client.get_objects2( {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0] expr_set_data['genome_ref'] = expr_obj['data']['genome_id'] expr_set_data['expr_obj_refs'] = expr_obj_refs else: raise TypeError(self.PARAM_IN_EXPSET_REF + ' should be of type ' + 'KBaseRNASeq.RNASeqExpressionSet ' + 'or KBaseSets.ExpressionSet') return expr_set_data def save_expression_matrix(self, tables, expr_set_data, em_obj_name, hidden = 0): all_rows = {} # build a dictionary of keys only which is a union of all row ids (gene_ids) self.logger.info( '***** length of tables is {0}'.format( len( tables ))) for table in tables: for r in table.keys(): all_rows[r] = [] for gene_id in all_rows.keys(): row = [] for table in tables: if ( gene_id in table ): #logger.info( 'append ' + gene_id ) #logger.info( pformat( table[gene_id])) #all_rows[gene_id].append( table[gene_id] ) row.append( table[gene_id] ) else: #logger.info( 'append 0' ) row.append( 0 ) all_rows[gene_id] = row #logger.info( all_rows[gene_id]) em_data = { 'genome_ref': expr_set_data['genome_ref'], 'scale': 'log2', 'type': 'level', 'data': { 'row_ids': [], 'values': [], 'col_ids': expr_set_data['expr_obj_names'] }, 'feature_mapping' : {}, 'condition_mapping': expr_set_data['condition_map'] } # we need to load row-by-row to preserve the order self.logger.info('loading expression matrix data') for gene_id in all_rows.keys(): em_data['feature_mapping'][gene_id] = gene_id em_data['data']['row_ids'].append(gene_id) em_data['data']['values'].append(all_rows[gene_id]) try: self.logger.info( 'saving em_data em_name {0}'.format(em_obj_name)) obj_info = self.dfu.save_objects({'id': self.ws_id, 'objects': [ { 'type': 'KBaseFeatureValues.ExpressionMatrix', 'data': em_data, 'name': em_obj_name, 'hidden': hidden, 'extra_provenance_input_refs': [ em_data.get('genome_ref'), self.params[self.PARAM_IN_EXPSET_REF]] } ]})[0] self.logger.info('ws save return:\n' + pformat(obj_info)) except Exception as e: self.logger.exception(e) raise Exception('Failed Saving Expression Matrix to Workspace') return str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4]) def get_expression_matrix(self, params): self.process_params(params) self.params = params expressionset_ref = params.get(self.PARAM_IN_EXPSET_REF) expr_set_data = self.get_expressionset_data(expressionset_ref) expr_obj_names = list() fpkm_tables = list() tpm_tables = list() condition_map = dict() tpm_table = None for expr_obj_ref in expr_set_data['expr_obj_refs']: try: self.logger.info('*** getting expression set {0} from workspace ****' .format(expr_obj_ref)) expr = self.ws_client.get_objects2( {'objects': [{'ref': expr_obj_ref}]})['data'][0] except Exception, e: self.logger.exception(e) raise Exception('Unable to download expression object {0} from workspace {1}'. format(expr_obj_ref, expr_set_data['ws_name'])) expr_name = expr.get('info')[1] expr_obj_names.append(expr_name) condition_map.update({expr_name: expr.get('data').get('condition')}) num_interp = expr.get('data').get('numerical_interpretation') if num_interp != 'FPKM': raise Exception( 'Did not get expected FPKM value from numerical interpretation key from \ Expression object {0}, instead got '.format(expr_obj_ref, num_interp)) pr_comments = expr.get('data').get('processing_comments', None) # log2 Normalized if pr_comments is not None: self.logger.info('pr_comments are {0}'.format(pr_comments)) fpkm_table = expr.get('data').get('expression_levels') # QUESTION: is this really FPKM levels? self.logger.info('FPKM keycount: {0}'.format(len(fpkm_table.keys()))) fpkm_tables.append(fpkm_table) tpm_table = None # Cufflinks doesn't generate TPM if 'tpm_expression_levels' in expr['data']: # so we need to check for this key tpm_table = expr.get('data').get('tpm_expression_levels') self.logger.info('TPM keycount: {0}'.format(len(tpm_table.keys()))) tpm_tables.append(tpm_table) expr_set_data['expr_obj_names'] = expr_obj_names expr_set_data['condition_map'] = condition_map output_obj_name = params.get(self.PARAM_IN_OBJ_NAME) fpkm_ref = self.save_expression_matrix(fpkm_tables, expr_set_data, '{0}_FPKM_ExpressionMatrix'.format(output_obj_name)) tpm_ref = None if tpm_table is not None: tpm_ref = self.save_expression_matrix(tpm_tables, expr_set_data, '{0}_TPM_ExpressionMatrix'.format(output_obj_name)) return fpkm_ref, tpm_ref
class ExpressionUtils: ''' Module Name: ExpressionUtils Module Description: A KBase module: ExpressionUtils This module is intended for use by Assemblers to upload RNASeq Expression files (gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent. The expression files are uploaded as a single compressed file.This module also generates expression levels and tpm expression levels from the input files and saves them in the workspace object. Once uploaded, the expression files can be downloaded onto an output directory. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.1.1" GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git" GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99" #BEGIN_CLASS_HEADER PARAM_IN_SRC_DIR = 'source_dir' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_ALIGNMENT_REF = 'alignment_ref' PARAM_IN_GENOME_REF = 'genome_ref' PARAM_IN_ANNOTATION_ID = 'annotation_id' PARAM_IN_BAM_FILE_PATH = 'bam_file_path' PARAM_IN_DESCRIPTION = 'description' PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level' PARAM_IN_PROC_COMMENTS = 'processing_comments' PARAM_IN_PLATFORM = 'platform' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_ORIG_MEDIAN = 'original_median' PARAM_IN_EXT_SRC_DATE = 'external_source_date' PARAM_IN_TRANSCRIPTS = 'transcripts' PARAM_IN_SRC = 'source' def _check_required_param(self, in_params, param_list): """ Check if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Check the validity of workspace and object params and return them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) dfu = DataFileUtil(self.callback_url) if not isinstance(ws_name_id, int): try: ws_name_id = dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _proc_upload_expression_params(self, ctx, params): """ Check the presence and validity of upload expression params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR, self.PARAM_IN_ALIGNMENT_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) source_dir = params.get(self.PARAM_IN_SRC_DIR) if not (os.path.isdir(source_dir)): raise ValueError('Source directory does not exist: ' + source_dir) if not os.listdir(source_dir): raise ValueError('Source directory is empty: ' + source_dir) return ws_name_id, obj_name_id, source_dir def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def _get_genome_ref(self, assembly_or_genome_ref, params): if self.PARAM_IN_GENOME_REF in params and params[ self.PARAM_IN_GENOME_REF] is not None: return params[self.PARAM_IN_GENOME_REF] obj_type = self._get_ws_info(assembly_or_genome_ref)[2] if obj_type.startswith('KBaseGenomes.Genome'): return assembly_or_genome_ref raise ValueError('Alignment object does not contain genome_ref; ' '"{}" parameter is required'.format( self.PARAM_IN_GENOME_REF)) def _get_expression_levels(self, source_dir, genome_ref, transcripts=False): fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking') if transcripts: fpkm_file_path = os.path.join(source_dir, 't_data.ctab') if not os.path.isfile(fpkm_file_path): raise ValueError('{} file is required'.format(fpkm_file_path)) id_col = 5 if transcripts else 0 self.__LOGGER.info( 'Generating expression levels from {}'.format(fpkm_file_path)) return self.expression_utils.get_expression_levels( fpkm_file_path, genome_ref, id_col) def _gen_ctab_files(self, params, alignment_ref): source_dir = params.get(self.PARAM_IN_SRC_DIR) if len(glob.glob(source_dir + '/*.ctab')) < 5: self.__LOGGER.info(' ======= Generating ctab files ==========') gtf_file = os.path.join(source_dir, 'transcripts.gtf') if not os.path.isfile(gtf_file): raise ValueError( "{} file is required to generate ctab files, found missing" .format(gtf_file)) if self.PARAM_IN_BAM_FILE_PATH in params and \ params[self.PARAM_IN_BAM_FILE_PATH] is not None: bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH] else: self.__LOGGER.info( 'Downloading bam file from alignment object') rau = ReadsAlignmentUtils(self.callback_url) alignment_retVal = rau.download_alignment( {'source_ref': alignment_ref}) alignment_dir = alignment_retVal.get('destination_dir') allbamfiles = glob.glob(alignment_dir + '/*.bam') if len(allbamfiles) == 0: raise ValueError('bam file does not exist in {}'.format(d)) elif len(allbamfiles) == 1: bam_file_path = allbamfiles[0] elif len(allbamfiles) > 1: tmp_file_path = os.path.join(alignment_dir, 'accepted_hits.bam') if os.path.isfile(tmp_file_path): bam_file_path = tmp_file_path else: tmp_file_path = os.path.join( alignment_dir, 'accepted_hits_sorted.bam') if os.path.isfile(tmp_file_path): bam_file_path = tmp_file_path else: raise ValueError( 'accepted_hits.bam, accepted_hits_sorted.bam or other bam file not found in {}' .format(alignment_dir)) result = self.table_maker.build_ctab_files( ref_genome_path=gtf_file, alignment_path=bam_file_path, output_dir=source_dir) if result != 0: raise ValueError('Tablemaker failed') #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.__LOGGER = logging.getLogger('ExpressionUtils') self.__LOGGER.setLevel(logging.INFO) streamHandler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s" ) formatter.converter = time.gmtime streamHandler.setFormatter(formatter) self.__LOGGER.addHandler(streamHandler) self.__LOGGER.info("Logger was set") self.config = config self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.ws_url = config['workspace-url'] self.config['SDK_CALLBACK_URL'] = self.callback_url self.expression_utils = Expression_Utils(self.config) self.dfu = DataFileUtil(self.callback_url) self.table_maker = TableMaker(config, self.__LOGGER) self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER) #END_CONSTRUCTOR pass def upload_expression(self, ctx, params): """ Uploads the expression * :param params: instance of type "UploadExpressionParams" (* Required input parameters for uploading a reads expression data string destination_ref - object reference of expression data. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id string source_dir - directory with the files to be uploaded string alignment_ref - alignment workspace object reference *) -> structure: parameter "destination_ref" of String, parameter "source_dir" of String, parameter "alignment_ref" of String, parameter "genome_ref" of String, parameter "annotation_id" of String, parameter "bam_file_path" of String, parameter "transcripts" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "data_quality_level" of Long, parameter "original_median" of Double, parameter "description" of String, parameter "platform" of String, parameter "source" of String, parameter "external_source_date" of String, parameter "processing_comments" of String :returns: instance of type "UploadExpressionOutput" (* Output from upload expression *) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_expression self.__LOGGER.info('Starting upload expression, parsing parameters ') pprint(params) ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params( ctx, params) alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF) try: alignment_obj = self.dfu.get_objects( {'object_refs': [alignment_ref]})['data'][0] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise alignment = alignment_obj['data'] assembly_or_genome_ref = alignment['genome_id'] genome_ref = self._get_genome_ref(assembly_or_genome_ref, params) expression_levels, tpm_expression_levels = self._get_expression_levels( source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS)) self._gen_ctab_files(params, alignment_ref) uploaded_file = self.dfu.file_to_shock({ 'file_path': source_dir, 'make_handle': 1, 'pack': 'zip' }) """ move the zipfile created in the source directory one level up """ path, dir = os.path.split(source_dir) zipfile = dir + '.zip' if os.path.isfile(os.path.join(source_dir, zipfile)): shutil.move(os.path.join(source_dir, zipfile), os.path.join(path, zipfile)) file_handle = uploaded_file['handle'] file_size = uploaded_file['size'] expression_data = { 'numerical_interpretation': 'FPKM', 'genome_id': genome_ref, 'mapped_rnaseq_alignment': { alignment['read_sample_id']: alignment_ref }, 'condition': alignment['condition'], 'file': file_handle, 'expression_levels': expression_levels, 'tpm_expression_levels': tpm_expression_levels } additional_params = [ self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION, self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM, self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID, self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE, self.PARAM_IN_SRC ] for opt_param in additional_params: if opt_param in params and params[opt_param] is not None: expression_data[opt_param] = params[opt_param] extra_provenance_input_refs = list() extra_provenance_input_refs.append( params.get(self.PARAM_IN_ALIGNMENT_REF)) if self.PARAM_IN_GENOME_REF in params and params.get( self.PARAM_IN_GENOME_REF) is not None: extra_provenance_input_refs.append( params.get(self.PARAM_IN_GENOME_REF)) self.__LOGGER.info('=========== Adding extra_provenance_refs') self.__LOGGER.info(str(extra_provenance_input_refs)) self.__LOGGER.info('==========================================') res = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": "KBaseRNASeq.RNASeqExpression", "data": expression_data, "name": obj_name_id, "extra_provenance_input_refs": extra_provenance_input_refs }] })[0] self.__LOGGER.info('save complete') returnVal = { 'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) } self.__LOGGER.info('Uploaded object: ') print(returnVal) #END upload_expression # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_expression return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def download_expression(self, ctx, params): """ Downloads expression * :param params: instance of type "DownloadExpressionParams" (* Required input parameters for downloading expression string source_ref - object reference of expression source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String :returns: instance of type "DownloadExpressionOutput" (* The output of the download method. *) -> structure: parameter "destination_dir" of String """ # ctx is the context object # return variables are: returnVal #BEGIN download_expression self.__LOGGER.info('Running download_expression with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required') try: expression = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'download_' + str(timestamp)) os.mkdir(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': expression[0]['data']['file']['id'], 'file_path': output_dir, 'unpack': 'unpack' }) if not os.listdir(output_dir): raise ValueError('No files were downloaded: ' + output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) returnVal = {'destination_dir': output_dir} #END download_expression # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_expression return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_expression(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download expressions from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting expression string source_ref - object reference of expression source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_expression inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required') try: expression = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': expression[0]['data']['file']['id']} #END export_expression # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_expression return value ' + 'output is not type dict as required.') # return the results return [output] def get_expressionMatrix(self, ctx, params): """ :param params: instance of type "getExprMatrixParams" (* Following are the required input parameters to get Expression Matrix *) -> structure: parameter "workspace_name" of String, parameter "output_obj_name" of String, parameter "expressionset_ref" of String :returns: instance of type "getExprMatrixOutput" -> structure: parameter "exprMatrix_FPKM_ref" of String, parameter "exprMatrix_TPM_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN get_expressionMatrix fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix( params) returnVal = { 'exprMatrix_FPKM_ref': fpkm_ref, 'exprMatrix_TPM_ref': tpm_ref } #END get_expressionMatrix # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method get_expressionMatrix return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
def upload_reads(self, ctx, params): """ Loads a set of reads to KBase data stores. :param params: instance of type "UploadReadsParams" (Input to the upload_reads function. Required parameters: fwd_id - the id of the shock node containing the reads data file: either single end reads, forward/left reads, or interleaved reads. sequencing_tech - the sequencing technology used to produce the reads. One of: wsid - the id of the workspace where the reads will be saved (preferred). wsname - the name of the workspace where the reads will be saved. One of: objid - the id of the workspace object to save over name - the name to which the workspace object will be saved Optional parameters: rev_id - the shock node id containing the reverse/right reads for paired end, non-interleaved reads. single_genome - whether the reads are from a single genome or a metagenome. Default is single genome. strain - information about the organism strain that was sequenced. source - information about the organism source. interleaved - specify that the fwd reads file is an interleaved paired end reads file as opposed to a single end reads file. Default true, ignored if rev_id is specified. read_orientation_outward - whether the read orientation is outward from the set of primers. Default is false and is ignored for single end reads. insert_size_mean - the mean size of the genetic fragments. Ignored for single end reads. insert_size_std_dev - the standard deviation of the size of the genetic fragments. Ignored for single end reads.) -> structure: parameter "fwd_id" of String, parameter "wsid" of Long, parameter "wsname" of String, parameter "objid" of Long, parameter "name" of String, parameter "rev_id" of String, parameter "sequencing_tech" of String, parameter "single_genome" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "strain" of type "StrainInfo" (Information about a strain. genetic_code - the genetic code of the strain. See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c genus - the genus of the strain species - the species of the strain strain - the identifier for the strain source - information about the source of the strain organelle - the organelle of interest for the related data (e.g. mitochondria) ncbi_taxid - the NCBI taxonomy ID of the strain location - the location from which the strain was collected @optional genetic_code source ncbi_taxid organelle location) -> structure: parameter "genetic_code" of Long, parameter "genus" of String, parameter "species" of String, parameter "strain" of String, parameter "organelle" of String, parameter "source" of type "SourceInfo" (Information about the source of a piece of data. source - the name of the source (e.g. NCBI, JGI, Swiss-Prot) source_id - the ID of the data at the source project_id - the ID of a project encompassing the data at the source @optional source source_id project_id) -> structure: parameter "source" of String, parameter "source_id" of type "source_id" (An ID used for a piece of data at its source. @id external), parameter "project_id" of type "project_id" (An ID used for a project encompassing a piece of data at its source. @id external), parameter "ncbi_taxid" of Long, parameter "location" of type "Location" (Information about a location. lat - latitude of the site, recorded as a decimal number. North latitudes are positive values and south latitudes are negative numbers. lon - longitude of the site, recorded as a decimal number. West longitudes are positive values and east longitudes are negative numbers. elevation - elevation of the site, expressed in meters above sea level. Negative values are allowed. date - date of an event at this location (for example, sample collection), expressed in the format YYYY-MM-DDThh:mm:ss.SSSZ description - a free text description of the location and, if applicable, the associated event. @optional date description) -> structure: parameter "lat" of Double, parameter "lon" of Double, parameter "elevation" of Double, parameter "date" of String, parameter "description" of String, parameter "source" of type "SourceInfo" (Information about the source of a piece of data. source - the name of the source (e.g. NCBI, JGI, Swiss-Prot) source_id - the ID of the data at the source project_id - the ID of a project encompassing the data at the source @optional source source_id project_id) -> structure: parameter "source" of String, parameter "source_id" of type "source_id" (An ID used for a piece of data at its source. @id external), parameter "project_id" of type "project_id" (An ID used for a project encompassing a piece of data at its source. @id external), parameter "interleaved" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "read_orientation_outward" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "insert_size_mean" of Double, parameter "insert_size_std_dev" of Double :returns: instance of type "UploadReadsOutput" (The output of the upload_reads function. obj_ref - a reference to the new Workspace object in the form X/Y/Z, where X is the workspace ID, Y is the object ID, and Z is the version.) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_reads self.log('Starting upload reads, parsing args') o, wsid, name, objid, kbtype, single_end, fwdid, revid = ( self._proc_upload_reads_params(ctx, params)) interleaved = 1 if (not single_end and not revid) else 0 fileinput = [{ 'shock_id': fwdid, 'file_path': self.scratch + '/fwd/', 'unpack': 'uncompress' }] if revid: fileinput.append({ 'shock_id': revid, 'file_path': self.scratch + '/rev/', 'unpack': 'uncompress' }) dfu = DataFileUtil(self.callback_url, token=ctx['token']) self.log('downloading reads files from Shock') files = dfu.shock_to_file_mass(fileinput) self.log('download complete, validating files') for f, i in zip(files, fileinput): if not self.validateFASTQ(ctx, [{ 'file_path': f['file_path'], 'interleaved': interleaved }])[0][0]['validated']: raise ValueError( 'Invalid fasta file {} from Shock node {}'.format( f['file_path'], i['shock_id'])) self.log('file validation complete') self.log('coercing forward reads node to my control, muhahahaha!') fwdr = dfu.own_shock_node({'shock_id': fwdid, 'make_handle': 1}) self.log('coercing complete, my evil schemes know no bounds') revr = None if revid: self.log('coercing reverse reads node to my control, muhahahaha!') revr = dfu.own_shock_node({'shock_id': revid, 'make_handle': 1}) self.log('coercing complete. Will I stop at nothing?') # TODO calculate gc content, read size, read_count (find a program) fwdfile = { 'file': fwdr['handle'], 'encoding': 'ascii', 'size': files[0]['size'], 'type': 'fq' } if single_end: o['lib'] = fwdfile else: o['lib1'] = fwdfile if revr: o['lib2'] = { 'file': revr['handle'], 'encoding': 'ascii', 'size': files[1]['size'], 'type': 'fq' } so = {'type': kbtype, 'data': o} if name: so['name'] = name else: so['objid'] = objid self.log('saving workspace object') oi = dfu.save_objects({'id': wsid, 'objects': [so]})[0] self.log('save complete') returnVal = { 'obj_ref': str(oi[6]) + '/' + str(oi[0]) + '/' + str(oi[4]) } #END upload_reads # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_reads return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
class TreeUtils: ''' Module Name: TreeUtils Module Description: ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "" GIT_COMMIT_HASH = "acb216cd302c161d5b4dfb272bd4bbae44cdac28" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.utils = Utils(config) self.scratch = config['scratch'] self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL']) self.ws = Workspace(config['workspace-url']) logging.basicConfig(level=logging.INFO) #END_CONSTRUCTOR pass def get_trees(self, ctx, params): """ :param params: instance of type "GetTreesParams" (tree_refs - (required) list of WS references included_fields - (optional) subset of tree fields to include) -> structure: parameter "tree_refs" of list of String, parameter "included_fields" of list of String :returns: instance of list of type "TreeData" -> structure: parameter "data" of type "Tree" (Data type for phylogenetic trees. @optional name description type tree_attributes @optional default_node_labels ws_refs kb_refs leaf_list) -> structure: parameter "name" of String, parameter "description" of String, parameter "type" of String, parameter "tree" of type "newick_tree" (Trees are represented in KBase by default in newick format (http://en.wikipedia.org/wiki/Newick_format) and are returned to you in this format by default.) -> type "tree" (A string representation of a phylogenetic tree. The format/syntax of the string is specified by using one of the available typedefs declaring a particular format, such as 'newick_tree', 'phylo_xml_tree' or 'json_tree'. When a format is not explictily specified, it is possible to return trees in different formats depending on addtional parameters. Regardless of format, all leaf nodes in trees built from MSAs are indexed to a specific MSA row. You can use the appropriate functionality of the API to replace these IDs with other KBase Ids instead. Internal nodes may or may not be named. Nodes, depending on the format, may also be annotated with structured data such as bootstrap values and distances.), parameter "tree_attributes" of mapping from String to String, parameter "default_node_labels" of mapping from type "node_id" to type "label", parameter "ws_refs" of mapping from type "node_id" to mapping from type "ref_type" (An enumeration of reference types for a node. Either the one letter abreviation or full name can be given. For large trees, it is strongly advised you use the one letter abreviations. Supported types are: g | genome => genome typed object or CDS data p | protein => protein sequence object or CDS data, often given as the MD5 of the sequence n | dna => dna sequence object or CDS data, often given as the MD5 of the sequence f | feature => feature object or CDS data) to list of type "ws_obj_id" (@id ws), parameter "kb_refs" of mapping from type "node_id" to mapping from type "ref_type" (An enumeration of reference types for a node. Either the one letter abreviation or full name can be given. For large trees, it is strongly advised you use the one letter abreviations. Supported types are: g | genome => genome typed object or CDS data p | protein => protein sequence object or CDS data, often given as the MD5 of the sequence n | dna => dna sequence object or CDS data, often given as the MD5 of the sequence f | feature => feature object or CDS data) to list of type "kbase_id" (A KBase ID is a string starting with the characters "kb|". KBase IDs are typed. The types are designated using a short string. For instance," g" denotes a genome, "tree" denotes a Tree, and "aln" denotes a sequence alignment. KBase IDs may be hierarchical. For example, if a KBase genome identifier is "kb|g.1234", a protein encoding gene within that genome may be represented as "kb|g.1234.peg.771". @id kb), parameter "leaf_list" of list of type "node_id", parameter "info" of type "object_info" (Information about an object, including user provided metadata. obj_id objid - the numerical id of the object. obj_name name - the name of the object. type_string type - the type of the object. timestamp save_date - the save date of the object. obj_ver ver - the version of the object. username saved_by - the user that saved or copied the object. ws_id wsid - the workspace containing the object. ws_name workspace - the workspace containing the object. string chsum - the md5 checksum of the object. int size - the size of the object in bytes. usermeta meta - arbitrary user-supplied metadata about the object.) -> tuple of size 11: parameter "objid" of type "obj_id" (The unique, permanent numerical ID of an object.), parameter "name" of type "obj_name" (A string used as a name for an object. Any string consisting of alphanumeric characters and the characters |._- that is not an integer is acceptable.), parameter "type" of type "type_string" (A type string. Specifies the type and its version in a single string in the format [module].[typename]-[major].[minor]: module - a string. The module name of the typespec containing the type. typename - a string. The name of the type as assigned by the typedef statement. major - an integer. The major version of the type. A change in the major version implies the type has changed in a non-backwards compatible way. minor - an integer. The minor version of the type. A change in the minor version implies that the type has changed in a way that is backwards compatible with previous type definitions. In many cases, the major and minor versions are optional, and if not provided the most recent version will be used. Example: MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, parameter "saved_by" of type "username" (Login name of a KBase user account.), parameter "wsid" of type "ws_id" (The unique, permanent numerical ID of a workspace.), parameter "workspace" of type "ws_name" (A string used as a name for a workspace. Any string consisting of alphanumeric characters and "_", ".", or "-" that is not an integer is acceptable. The name may optionally be prefixed with the workspace owner's user name and a colon, e.g. kbasetest:my_workspace.), parameter "chsum" of String, parameter "size" of Long, parameter "meta" of type "usermeta" (User provided metadata about an object. Arbitrary key-value pairs provided by the user.) -> mapping from String to String """ # ctx is the context object # return variables are: result #BEGIN get_trees logging.info("Starting 'get_trees' with params:{}".format(params)) self.utils.validate_params(params, ("tree_refs",), ("included_fields",)) ws_objs = [{'ref': r, 'included': params.get('included_fields', None)} for r in params['tree_refs']] result = self.ws.get_objects2({'objects': ws_objs})['data'] #END get_trees # At some point might do deeper type checking... if not isinstance(result, list): raise ValueError('Method get_trees return value ' + 'result is not type list as required.') # return the results return [result] def save_trees(self, ctx, params): """ :param params: instance of type "SaveTreesParams" -> structure: parameter "ws_id" of type "ws_id" (The unique, permanent numerical ID of a workspace.), parameter "trees" of list of type "ObjectSaveData" (An object and associated data required for saving. Required arguments: type_string type - the type of the object. Omit the version information to use the latest version. UnspecifiedObject data - the object data. Optional arguments: One of an object name or id. If no name or id is provided the name will be set to 'auto' with the object id appended as a string, possibly with -\d+ appended if that object id already exists as a name. obj_name name - the name of the object. obj_id objid - the id of the object to save over. usermeta meta - arbitrary user-supplied metadata for the object, not to exceed 16kb; if the object type specifies automatic metadata extraction with the 'meta ws' annotation, and your metadata name conflicts, then your metadata will be silently overwritten. list<ProvenanceAction> provenance - provenance data for the object. boolean hidden - true if this object should not be listed when listing workspace objects.) -> structure: parameter "type" of type "type_string" (A type string. Specifies the type and its version in a single string in the format [module].[typename]-[major].[minor]: module - a string. The module name of the typespec containing the type. typename - a string. The name of the type as assigned by the typedef statement. major - an integer. The major version of the type. A change in the major version implies the type has changed in a non-backwards compatible way. minor - an integer. The minor version of the type. A change in the minor version implies that the type has changed in a way that is backwards compatible with previous type definitions. In many cases, the major and minor versions are optional, and if not provided the most recent version will be used. Example: MyModule.MyType-3.1), parameter "data" of unspecified object, parameter "name" of type "obj_name" (A string used as a name for an object. Any string consisting of alphanumeric characters and the characters |._- that is not an integer is acceptable.), parameter "objid" of type "obj_id" (The unique, permanent numerical ID of an object.), parameter "meta" of type "usermeta" (User provided metadata about an object. Arbitrary key-value pairs provided by the user.) -> mapping from String to String, parameter "provenance" of list of type "ProvenanceAction" (A provenance action. A provenance action (PA) is an action taken while transforming one data object to another. There may be several PAs taken in series. A PA is typically running a script, running an api command, etc. All of the following fields are optional, but more information provided equates to better data provenance. resolved_ws_objects should never be set by the user; it is set by the workspace service when returning data. On input, only one of the time or epoch may be supplied. Both are supplied on output. The maximum size of the entire provenance object, including all actions, is 1MB. timestamp time - the time the action was started epoch epoch - the time the action was started. string caller - the name or id of the invoker of this provenance action. In most cases, this will be the same for all PAs. string service - the name of the service that performed this action. string service_ver - the version of the service that performed this action. string method - the method of the service that performed this action. list<UnspecifiedObject> method_params - the parameters of the method that performed this action. If an object in the parameters is a workspace object, also put the object reference in the input_ws_object list. string script - the name of the script that performed this action. string script_ver - the version of the script that performed this action. string script_command_line - the command line provided to the script that performed this action. If workspace objects were provided in the command line, also put the object reference in the input_ws_object list. list<obj_ref> input_ws_objects - the workspace objects that were used as input to this action; typically these will also be present as parts of the method_params or the script_command_line arguments. list<obj_ref> resolved_ws_objects - the workspace objects ids from input_ws_objects resolved to permanent workspace object references by the workspace service. list<string> intermediate_incoming - if the previous action produced output that 1) was not stored in a referrable way, and 2) is used as input for this action, provide it with an arbitrary and unique ID here, in the order of the input arguments to this action. These IDs can be used in the method_params argument. list<string> intermediate_outgoing - if this action produced output that 1) was not stored in a referrable way, and 2) is used as input for the next action, provide it with an arbitrary and unique ID here, in the order of the output values from this action. These IDs can be used in the intermediate_incoming argument in the next action. list<ExternalDataUnit> external_data - data external to the workspace that was either imported to the workspace or used to create a workspace object. list<SubAction> subactions - the subactions taken as a part of this action. mapping<string, string> custom - user definable custom provenance fields and their values. string description - a free text description of this action.) -> structure: parameter "time" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "epoch" of type "epoch" (A Unix epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" of String, parameter "service" of String, parameter "service_ver" of String, parameter "method" of String, parameter "method_params" of list of unspecified object, parameter "script" of String, parameter "script_ver" of String, parameter "script_command_line" of String, parameter "input_ws_objects" of list of type "obj_ref" (A string that uniquely identifies an object in the workspace service. There are two ways to uniquely identify an object in one string: "[ws_name or id]/[obj_name or id]/[obj_ver]" - for example, "MyFirstWorkspace/MyFirstObject/3" would identify the third version of an object called MyFirstObject in the workspace called MyFirstWorkspace. 42/Panic/1 would identify the first version of the object name Panic in workspace with id 42. Towel/1/6 would identify the 6th version of the object with id 1 in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]" - for example, "kb|ws.23.obj.567.ver.2" would identify the second version of an object with id 567 in a workspace with id 23. In all cases, if the version number is omitted, the latest version of the object is assumed.), parameter "resolved_ws_objects" of list of type "obj_ref" (A string that uniquely identifies an object in the workspace service. There are two ways to uniquely identify an object in one string: "[ws_name or id]/[obj_name or id]/[obj_ver]" - for example, "MyFirstWorkspace/MyFirstObject/3" would identify the third version of an object called MyFirstObject in the workspace called MyFirstWorkspace. 42/Panic/1 would identify the first version of the object name Panic in workspace with id 42. Towel/1/6 would identify the 6th version of the object with id 1 in the Towel workspace. "kb|ws.[ws_id].obj.[obj_id].ver.[obj_ver]" - for example, "kb|ws.23.obj.567.ver.2" would identify the second version of an object with id 567 in a workspace with id 23. In all cases, if the version number is omitted, the latest version of the object is assumed.), parameter "intermediate_incoming" of list of String, parameter "intermediate_outgoing" of list of String, parameter "external_data" of list of type "ExternalDataUnit" (An external data unit. A piece of data from a source outside the Workspace. On input, only one of the resource_release_date or resource_release_epoch may be supplied. Both are supplied on output. string resource_name - the name of the resource, for example JGI. string resource_url - the url of the resource, for example http://genome.jgi.doe.gov string resource_version - version of the resource timestamp resource_release_date - the release date of the resource epoch resource_release_epoch - the release date of the resource string data_url - the url of the data, for example http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? organism=BlaspURHD0036 string data_id - the id of the data, for example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a free text description of the data.) -> structure: parameter "resource_name" of String, parameter "resource_url" of String, parameter "resource_version" of String, parameter "resource_release_date" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "resource_release_epoch" of type "epoch" (A Unix epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "data_url" of String, parameter "data_id" of String, parameter "description" of String, parameter "subactions" of list of type "SubAction" (Information about a subaction that is invoked by a provenance action. A provenance action (PA) may invoke subactions (SA), e.g. calling a separate piece of code, a service, or a script. In most cases these calls are the same from PA to PA and so do not need to be listed in the provenance since providing information about the PA alone provides reproducibility. In some cases, however, SAs may change over time, such that invoking the same PA with the same parameters may produce different results. For example, if a PA calls a remote server, that server may be updated between a PA invoked on day T and another PA invoked on day T+1. The SubAction structure allows for specifying information about SAs that may dynamically change from PA invocation to PA invocation. string name - the name of the SA. string ver - the version of SA. string code_url - a url pointing to the SA's codebase. string commit - a version control commit ID for the SA. string endpoint_url - a url pointing to the access point for the SA - a server url, for instance.) -> structure: parameter "name" of String, parameter "ver" of String, parameter "code_url" of String, parameter "commit" of String, parameter "endpoint_url" of String, parameter "custom" of mapping from String to String, parameter "description" of String, parameter "hidden" of type "boolean" (A boolean. 0 = false, other = true.) :returns: instance of list of type "object_info" (Information about an object, including user provided metadata. obj_id objid - the numerical id of the object. obj_name name - the name of the object. type_string type - the type of the object. timestamp save_date - the save date of the object. obj_ver ver - the version of the object. username saved_by - the user that saved or copied the object. ws_id wsid - the workspace containing the object. ws_name workspace - the workspace containing the object. string chsum - the md5 checksum of the object. int size - the size of the object in bytes. usermeta meta - arbitrary user-supplied metadata about the object.) -> tuple of size 11: parameter "objid" of type "obj_id" (The unique, permanent numerical ID of an object.), parameter "name" of type "obj_name" (A string used as a name for an object. Any string consisting of alphanumeric characters and the characters |._- that is not an integer is acceptable.), parameter "type" of type "type_string" (A type string. Specifies the type and its version in a single string in the format [module].[typename]-[major].[minor]: module - a string. The module name of the typespec containing the type. typename - a string. The name of the type as assigned by the typedef statement. major - an integer. The major version of the type. A change in the major version implies the type has changed in a non-backwards compatible way. minor - an integer. The minor version of the type. A change in the minor version implies that the type has changed in a way that is backwards compatible with previous type definitions. In many cases, the major and minor versions are optional, and if not provided the most recent version will be used. Example: MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, parameter "saved_by" of type "username" (Login name of a KBase user account.), parameter "wsid" of type "ws_id" (The unique, permanent numerical ID of a workspace.), parameter "workspace" of type "ws_name" (A string used as a name for a workspace. Any string consisting of alphanumeric characters and "_", ".", or "-" that is not an integer is acceptable. The name may optionally be prefixed with the workspace owner's user name and a colon, e.g. kbasetest:my_workspace.), parameter "chsum" of String, parameter "size" of Long, parameter "meta" of type "usermeta" (User provided metadata about an object. Arbitrary key-value pairs provided by the user.) -> mapping from String to String """ # ctx is the context object # return variables are: result #BEGIN save_trees logging.info("Starting 'save_trees'") self.utils.validate_params(params, ("ws_id", "trees"), ('type',)) trees = [] for i, t in enumerate(params['trees']): self.utils.validate_params(t, ("data",), ("name", "hidden", "meta", "type")) if 'type' in t and t['type'] != 'KBaseTrees.Tree': raise ValueError("This method only saves KBaseTrees.Tree objects") if "tree" not in t['data']: raise ValueError("Object {} missing 'tree' attribute containing newick tree" .format(i)) if not Utils.validate_newick(t['data']['tree']): raise ValueError("Object {} has an invalid newick tree: {}" .format(i, t['data']['tree'])) t['type'] = 'KBaseTrees.Tree' trees.append(t) result = self.dfu.save_objects({"id": params["ws_id"], "objects": trees}) #END save_trees # At some point might do deeper type checking... if not isinstance(result, list): raise ValueError('Method save_trees return value ' + 'result is not type list as required.') # return the results return [result] def tree_to_newick_file(self, ctx, params): """ :param params: instance of type "TreeToNewickFileParams" -> structure: parameter "input_ref" of type "Tree_id" (@id kb KBaseTrees.Tree), parameter "destination_dir" of String :returns: instance of type "TreeToNewickFileOutput" -> structure: parameter "file_path" of String """ # ctx is the context object # return variables are: result #BEGIN tree_to_newick_file logging.info("Starting 'tree_to_newick' with params: {}".format(params)) self.utils.validate_params(params, ("destination_dir", "input_ref")) _, result = self.utils.to_newick(params) #END tree_to_newick_file # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method tree_to_newick_file return value ' + 'result is not type dict as required.') # return the results return [result] def export_tree_newick(self, ctx, params): """ :param params: instance of type "ExportTreeParams" -> structure: parameter "input_ref" of type "Tree_id" (@id kb KBaseTrees.Tree) :returns: instance of type "ExportTreeOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: result #BEGIN export_tree_newick logging.info("Starting 'export_tree_newick' with params:{}".format(params)) self.utils.validate_params(params, ("input_ref",)) params['destination_dir'] = self.scratch cs_id, files = self.utils.to_newick(params) result = self.utils.export(files['file_path'], cs_id, params['input_ref']) #END export_tree_newick # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method export_tree_newick return value ' + 'result is not type dict as required.') # return the results return [result] def status(self, ctx): #BEGIN_STATUS returnVal = {'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH} #END_STATUS return [returnVal]
class GenericsUtil: def _validate_fetch_data_params(self, params): """ _validate_fetch_data_params: validates params passed to fetch_data method """ log('start validating fetch_data params') # check for required parameters for p in ['obj_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_import_matrix_from_excel_params(self, params): """ _validate_import_matrix_from_excel_params: validates params passed to import_matrix_from_excel method """ log('start validating import_matrix_from_excel params') # check for required parameters for p in ['obj_type', 'matrix_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in MATRIX_TYPE: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) refs_key = [ 'col_conditionset_ref', 'row_conditionset_ref', 'genome_ref', 'diff_expr_matrix_ref' ] refs = {k: v for k, v in params.items() if k in refs_key} return (obj_type, file_path, params.get('workspace_name'), params.get('matrix_name'), refs) def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ log('Start uploading file to shock: {}'.format(file_path)) file_to_shock_params = {'file_path': file_path, 'pack': 'zip'} shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id') return shock_id def _upload_dir_to_shock(self, directory): """ _upload_dir_to_shock: upload target dir to shock using DataFileUtil """ log('Start uploading directory to shock: {}'.format(directory)) file_to_shock_params = {'file_path': directory, 'pack': 'zip'} shock_file = self.dfu.file_to_shock(file_to_shock_params) shock_id = shock_file.get('shock_id') return shock_id def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _generate_html_string(self, df): """ _generate_html_string: generating a html string from df template used: https://developers.google.com/chart/interactive/docs/gallery/table https://developers.google.com/chart/interactive/docs/reference#formatters """ dtypes = df.dtypes columns = df.columns column_str = '' number_columns = [] for idx, column in enumerate(columns): dtype = dtypes[idx].name if 'int' in dtype or 'float' in dtype: column_str += "data.addColumn('number', '{}')\n".format(column) number_columns.append(column) else: column_str += "data.addColumn('string', '{}')\n".format(column) data_str = "data.addRows({})".format(df.values.tolist()) formatter_str = '' for number_column in number_columns: mean = round(df[number_column].mean(), 2) column_n = columns.tolist().index(number_column) formatter_str += "var formatter_{} = ".format(column_n) formatter_str += "new google.visualization.BarFormat({base: " formatter_str += str(mean) formatter_str += ", width: 120});\n" formatter_str += "formatter_{}.format(data, {});\n".format( column_n, column_n) return column_str, data_str, formatter_str def _find_between(self, s, start, end): """ _find_between: find string in between start and end """ return re.search('{}(.*){}'.format(start, end), s).group(1) def _find_type_spec(self, obj_type): """ _find_type_spec: find body spec of type """ obj_type_name = self._find_between(obj_type, '\.', '\-') type_info = self.wsClient.get_type_info(obj_type) type_spec = type_info.get('spec_def') type_spec_list = type_spec.split(obj_type_name + ';') obj_type_spec = type_spec_list[0].split('structure')[-1] log('Found spec for {}\n{}\n'.format(obj_type, obj_type_spec)) return obj_type_spec def _find_constraints(self, obj_type): """ _find_constraints: retrieve constraints (@contains, rowsum, unique) """ type_info = self.wsClient.get_type_info(obj_type) type_desc = type_info.get('description') constraints = {'contains': [], 'rowsum': [], 'unique': []} unique = [ item.split('\n')[0].strip() for item in type_desc.split('@unique')[1:] ] constraints['unique'] = unique contains = [ item.split('\n')[0].strip() for item in type_desc.split('@contains')[1:] ] constraints['contains'] = contains return constraints def _find_generics_type(self, obj_type): """ _find_generics_type: try to find generics type in an object """ log('Start finding generics type and name') obj_type_spec = self._find_type_spec(obj_type) if not obj_type_spec: raise ValueError('Cannot retrieve spec for: {}'.format(obj_type)) generics_types = [ generics_type for generics_type in GENERICS_TYPE if generics_type in obj_type_spec ] if not generics_types: error_msg = 'Cannot find generics type in spec:\n{}\n'.format( obj_type_spec) raise ValueError(error_msg) generics_module = dict() for generics_type in generics_types: for item in obj_type_spec.split(generics_type)[1:]: generics_type_name = item.split(';')[0].strip().split( ' ')[-1].strip() generics_module.update({generics_type_name: generics_type}) log('Found generics type:\n{}\n'.format(generics_module)) return generics_module def _convert_data(self, data, generics_module): """ _convert_data: convert data to df based on data_type """ data_types = generics_module.values() if not set(GENERICS_TYPE) >= set(data_types): raise ValueError( 'Found unknown generics data type in:\n{}\n'.format( data_types)) if data_types == ['FloatMatrix2D']: key = generics_module.keys()[generics_module.values().index( 'FloatMatrix2D')] values = data[key]['values'] index = data[key]['row_ids'] columns = data[key]['col_ids'] df = pd.DataFrame(values, index=index, columns=columns) # elif 'FloatMatrix2D' in data_types: # default case # key = generics_module.keys()[generics_module.values().index('FloatMatrix2D')] # values = data[key]['values'] # index = data[key]['row_ids'] # columns = data[key]['col_ids'] # df = pd.DataFrame(values, index=index, columns=columns) else: raise ValueError('Unexpected Error') return df.to_json() def _retrieve_data(self, obj_ref, generics_module=None): """ _retrieve_data: retrieve object data and return a dataframe in json format """ log('Start retrieving data') obj_source = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0] obj_info = obj_source.get('info') obj_data = obj_source.get('data') if not generics_module: generics_module = self._find_generics_type(obj_info[2]) try: data = { k: v for k, v in obj_data.items() if k in generics_module.keys() } except KeyError: raise ValueError('Retrieved wrong generics type name') data_matrix = self._convert_data(data, generics_module) return data_matrix def _get_col_cond_list(self, col_mapping, col_conditionset_ref, cols): """ _get_col_cond_list: generate col condition list for excel """ col_cond_list = [] conditionset_data = self.dfu.get_objects( {"object_refs": [col_conditionset_ref]})['data'][0]['data'] col_condition_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] for col in cols: condition_id = col_mapping.get(col) if condition_id: col_cond_list.append( conditionset_data.get('conditions').get(condition_id)) else: col_cond_list.append([''] * len(col_condition_names)) col_cond_list = map(list, zip(*col_cond_list)) for idx, col_array in enumerate(col_cond_list): col_array.insert(0, col_condition_names[idx]) return col_cond_list def _get_row_cond_list(self, row_mapping, row_conditionset_ref, rows): """ _get_row_cond_list: generate row condition list for excel """ row_cond_list = [] conditionset_data = self.dfu.get_objects( {"object_refs": [row_conditionset_ref]})['data'][0]['data'] row_condition_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] row_cond_list.append(row_condition_names) for row in rows: condition_id = row_mapping.get(row) if condition_id: row_cond_list.append( conditionset_data.get('conditions').get(condition_id)) else: row_cond_list.append([''] * len(row_condition_names)) return row_cond_list def _get_data_list(self, cols, rows, values): """ _get_data_list: generate data value list for excel """ data_arrays = [] cols.insert(0, '') data_arrays.append(cols) for idx, row in enumerate(rows): values[idx].insert(0, row) data_arrays += values return data_arrays def _merge_cond_list(self, excel_list, col_cond_list, row_cond_list): """ _merge_cond_list: merge lists for excel """ col_cond_len = len(col_cond_list) for item in excel_list[:col_cond_len]: row_len = len(row_cond_list[0]) if row_cond_list else 0 item[0:0] = [''] * row_len if row_cond_list: for idx, item in enumerate(excel_list[col_cond_len:]): item[0:0] = row_cond_list[idx] def _is_number(s): """ _is_number: string is a numeric """ try: float(s) return True except ValueError: pass return False def _gen_excel(self, excel_list, obj_name): """ _gen_excel: create excel """ result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name)) log('Start writing to file: {}'.format(file_path)) workbook = xlsxwriter.Workbook(file_path, {'nan_inf_to_errors': True}) worksheet = workbook.add_worksheet() row = 1 for data_entry in excel_list: for idx, cell_data in enumerate(data_entry): worksheet.write(row, idx, cell_data) row += 1 workbook.close() return file_path def _write_mapping_sheet(self, file_path, sheet_name, mapping, index): """ _write_mapping_sheet: write mapping to sheet """ df_dict = collections.OrderedDict() df_dict[index[0]] = [] df_dict[index[1]] = [] for key, value in mapping.items(): df_dict.get(index[0]).append(key) df_dict.get(index[1]).append(value) df = pd.DataFrame.from_dict(df_dict) with pd.ExcelWriter(file_path, engine='openpyxl') as writer: writer.book = load_workbook(file_path) df.to_excel(writer, sheet_name=sheet_name) def _filter_constraints(self, constraints, data): contains_constraints = constraints.get('contains') filtered_constraints = [] for contains_constraint in contains_constraints: in_values = contains_constraint.split(' ')[1:] missing_key = True for in_value in in_values: if in_value.startswith('values'): search_value = re.search('{}(.*){}'.format('\(', '\)'), in_value).group(1) unique_list = search_value.split('.') key = unique_list[0] elif ':' in in_value: key = in_value.split(':')[0] else: unique_list = in_value.split('.') key = unique_list[0] if key in data: missing_key = False break if missing_key: filtered_constraints.append(contains_constraint) for x in filtered_constraints: contains_constraints.remove(x) return constraints def _retrieve_value(self, data, value): log('Getting value for {}'.format(value)) retrieve_data = [] m_data = DotMap(data) if value.startswith( 'values'): # TODO: nested values e.g. values(values(ids)) search_value = re.search('{}(.*){}'.format('\(', '\)'), value).group(1) unique_list = search_value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = m_data_cp.values() elif ':' in value: obj_ref = getattr(m_data, value.split(':')[0]) if obj_ref: included = value.split(':')[1] included = '/' + included.replace('.', '/') ref_data = self.wsClient.get_objects2( {'objects': [{ 'ref': obj_ref, 'included': [included] }]})['data'][0]['data'] m_ref_data = DotMap(ref_data) if ref_data: if '*' not in included: for key in included.split('/')[1:]: m_ref_data = getattr(m_ref_data, key) else: keys = included.split('/')[1:] m_ref_data = [ x.get(keys[2]) for x in ref_data.get(keys[0]) ] # TODO: only works for 2 level nested data like '/features/[*]/id' retrieve_data = list(m_ref_data) else: unique_list = value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp) log('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20])) return retrieve_data def _generate_report(self, matrix_obj_ref, workspace_name): """ _generate_report: generate summary report """ report_params = { 'message': '', 'objects_created': [{ 'ref': matrix_obj_ref, 'description': 'Imported Matrix' }], 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_excel_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _validate(self, constraints, data): """ _validate: validate data """ validated = True failed_constraints = {'contains': [], 'rowsum': [], 'unique': []} unique_constraints = constraints.get('unique') for unique_constraint in unique_constraints: retrieved_value = self._retrieve_value(data, unique_constraint) if len(set(retrieved_value)) != len(retrieved_value): validated = False failed_constraints['unique'].append(unique_constraint) contains_constraints = constraints.get('contains') for contains_constraint in contains_constraints: value = contains_constraint.split(' ')[0] in_values = contains_constraint.split(' ')[1:] retrieved_in_values = [] for in_value in in_values: retrieved_in_values += self._retrieve_value(data, in_value) if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)): validated = False failed_constraints['contains'].append(contains_constraint) return validated, failed_constraints def _process_mapping_sheet(self, file_path, sheet_name): """ _process_mapping: process mapping sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name) except XLRDError: return dict() else: mapping = {value[0]: value[1] for value in df.values.tolist()} return mapping def _process_conditionset_sheet(self, file_path, sheet_name, matrix_name, workspace_id): """ _process_conditionset_sheet: process condition set sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name) except XLRDError: return '' else: obj_name = '{}_{}'.format(sheet_name, matrix_name) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name)) df.to_excel(file_path) import_condition_set_params = { 'output_obj_name': obj_name, 'output_ws_id': workspace_id, 'input_file_path': file_path } ref = self.cu.file_to_condition_set(import_condition_set_params) return ref.get('condition_set_ref') def _file_to_data(self, file_path, refs, matrix_name, workspace_id): log('Start reading and converting excel file data') data = refs try: pd.read_excel(file_path) except XLRDError: # TODO: convert csv file to excel log('Found csv file') raise ValueError('Please provide .xlsx file only') # processing data sheet try: df = pd.read_excel(file_path, sheet_name='data') except XLRDError: raise ValueError('Cannot find <data> sheetss') else: df.fillna(0, inplace=True) matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist() } data.update({'data': matrix_data}) # processing col/row_mapping col_mapping = self._process_mapping_sheet(file_path, 'col_mapping') data.update({'col_mapping': col_mapping}) row_mapping = self._process_mapping_sheet(file_path, 'row_mapping') data.update({'row_mapping': row_mapping}) # processing col/row_conditionset col_conditionset_ref = self._process_conditionset_sheet( file_path, 'col_conditionset', matrix_name, workspace_id) data.update({'col_conditionset_ref': col_conditionset_ref}) row_conditionset_ref = self._process_conditionset_sheet( file_path, 'row_conditionset', matrix_name, workspace_id) data.update({'row_conditionset_ref': row_conditionset_ref}) # processing metadata metadata = self._process_mapping_sheet(file_path, 'metadata') data.update(metadata) return data def _build_header_str(self, factor_names): header_str = '' width = 100.0 / len(factor_names) header_str += '<tr class="header">' header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format( width) for factor_name in factor_names: header_str += '<th style="width:{0:.2f}%;"'.format(width) header_str += '>{}</th>'.format(factor_name) header_str += '</tr>' return header_str def _build_html_str(self, row_mapping, conditionset_data, row_ids): log('Start building html replacement') factor_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] header_str = self._build_header_str(factor_names) table_str = '' conditions = conditionset_data.get('conditions') for feature_id, factor_id in row_mapping.items(): if feature_id in row_ids: feature_conditions = conditions.get(factor_id) table_str += '<tr>' table_str += '<td>{}</td>'.format(feature_id) for feature_condition in feature_conditions: table_str += '<td>{}</td>'.format(feature_condition) table_str += '</tr>' return header_str, table_str def _generate_search_html_report(self, header_str, table_str): html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'search.html') shutil.copy2(os.path.join(os.path.dirname(__file__), 'kbase_icon.png'), output_directory) shutil.copy2( os.path.join(os.path.dirname(__file__), 'search_icon.png'), output_directory) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'search_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '//HEADER_STR', header_str) report_template = report_template.replace( '//TABLE_STR', table_str) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Search Matrix App' }) return html_report def _generate_search_report(self, header_str, table_str, workspace_name): log('Start creating report') output_html_files = self._generate_search_html_report( header_str, table_str) report_params = { 'message': '', 'workspace_name': workspace_name, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_matrix_filter_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _filter_value_data(self, value_data, feature_ids): filtered_value_data = dict() filtered_value_data['col_ids'] = value_data['col_ids'] feature_ids = feature_ids.split(',') filtered_value_data['row_ids'] = feature_ids filtered_value_data['values'] = list() values = value_data['values'] row_ids = value_data['row_ids'] for feature_id in feature_ids: idx = row_ids.index(feature_id) filtered_value_data['values'].append(values[idx]) return filtered_value_data def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.srv_wiz_url = config['srv-wiz-url'] self.scratch = config['scratch'] self.dfu = DataFileUtil(self.callback_url) self.wsClient = workspaceService(self.ws_url, token=self.token) self.cu = ConditionUtils(self.callback_url, service_ver="dev") def filter_matrix(self, params): """ filter_matrix: create sub-matrix based on input feature_ids or group by factor name arguments: matrix_obj_ref: object reference of a matrix workspace_name: workspace name feature_ids: string of feature ids that result matrix contains filtered_matrix_name: name of newly created filtered matrix object """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') feature_ids = params.get('feature_ids') filtered_matrix_name = params.get('filtered_matrix_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_info = matrix_source.get('info') matrix_data = matrix_source.get('data') matrix_type = self._find_between(matrix_info[2], '\.', '\-') value_data = matrix_data.get('data') filtered_value_data = self._filter_value_data(value_data, feature_ids) matrix_data['data'] = filtered_value_data if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name filtered_matrix_obj_ref = self.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(matrix_type), 'obj_name': filtered_matrix_name, 'data': matrix_data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]} report_output = self._generate_report(filtered_matrix_obj_ref, workspace_name) returnVal.update(report_output) return returnVal def search_matrix(self, params): """ search_matrix: generate a HTML report that allows users to select feature ids arguments: matrix_obj_ref: object reference of a matrix workspace_name: workspace name """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_data = matrix_source.get('data') row_mapping = matrix_data.get('row_mapping') row_conditionset_ref = matrix_data.get('row_conditionset_ref') row_ids = matrix_data['data']['row_ids'] if not (row_mapping and row_conditionset_ref): raise ValueError( 'Matrix obejct is missing either row_mapping or row_conditionset_ref' ) conditionset_data = self.dfu.get_objects( {"object_refs": [row_conditionset_ref]})['data'][0]['data'] header_str, table_str = self._build_html_str(row_mapping, conditionset_data, row_ids) returnVal = self._generate_search_report(header_str, table_str, workspace_name) return returnVal def import_matrix_from_excel(self, params): """ import_matrix_from_excel: import matrix object from excel arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_conditionset_ref: column ConditionSet reference row_conditionset_ref: row ConditionSet reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (obj_type, file_path, workspace_name, matrix_name, refs) = self._validate_import_matrix_from_excel_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name data = self._file_to_data(file_path, refs, matrix_name, workspace_id) matrix_obj_ref = self.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_ref': matrix_obj_ref} report_output = self._generate_report(matrix_obj_ref, workspace_name) returnVal.update(report_output) return returnVal def save_object(self, params): """ save_object: validate data constraints and save matrix object arguments: obj_type: saving object data type obj_name: saving object name data: data to be saved workspace_name: workspace name matrix object to be saved to return: obj_ref: object reference """ log('Starting saving object') obj_type = params.get('obj_type') module_name = obj_type.split('.')[0] type_name = obj_type.split('.')[1] types = self.wsClient.get_module_info({ 'mod': module_name }).get('types') for module_type in types: if self._find_between(module_type, '\.', '\-') == type_name: obj_type = module_type break data = dict((k, v) for k, v in params.get('data').iteritems() if v) validate = self.validate_data({'obj_type': obj_type, 'data': data}) if not validate.get('validated'): log('Data failed type checking') failed_constraints = validate.get('failed_constraints') error_msg = 'Object {} failed type checking:\n'.format( params.get('obj_name')) if failed_constraints.get('unique'): unique_values = failed_constraints.get('unique') error_msg += 'Object should have unique field: {}\n'.format( unique_values) if failed_constraints.get('contains'): contained_values = failed_constraints.get('contains') for contained_value in contained_values: subset_value = contained_value.split(' ')[0] super_value = ' '.join(contained_value.split(' ')[1:]) error_msg += 'Object field [{}] should contain field [{}]\n'.format( super_value, subset_value) raise ValueError(error_msg) workspace_name = params.get('workspace_name') if not isinstance(workspace_name, int): ws_name_id = self.dfu.ws_name_to_id(workspace_name) else: ws_name_id = workspace_name info = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data": data, "name": params.get('obj_name') }] })[0] return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])} def validate_data(self, params): """ validate_data: validate data arguments: obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1' data: obj data to be validated return: validated: True or False """ constraints = self._find_constraints(params.get('obj_type')) data = params.get('data') constraints = self._filter_constraints(constraints, data) validated, failed_constraints = self._validate(constraints, data) returnVal = { 'validated': validated, 'failed_constraints': failed_constraints } return returnVal def generate_matrix_html(self, params): """ generate_matrix_html: generate a html page for given data arguments: df: a pandas dataframe return: html_string: html as a string format """ column_str, data_str, formatter_str = self._generate_html_string( params.get('df')) with open( os.path.join(os.path.dirname(__file__), 'matrix_page_template.html'), 'r') as matrix_page_template_file: html_string = matrix_page_template_file.read() html_string = html_string.replace('// ADD_COL', column_str) html_string = html_string.replace('// ADD_DATA', data_str) html_string = html_string.replace('// ADD_FORMATTER', formatter_str) returnVal = {'html_string': html_string} return returnVal def fetch_data(self, params): """ fetch_data: fetch generics data as pandas dataframe for a generics data object arguments: obj_ref: generics object reference optional arguments: generics_module: the generics data module to be retrieved from e.g. for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; generics_module should be {'data': 'FloatMatrix2D', 'condition_set_ref': 'condition_set_ref'} return: data_matrix: a pandas dataframe in json format """ log('--->\nrunning GenericsUtil.fetch_data\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_fetch_data_params(params) try: data_matrix = self._retrieve_data(params.get('obj_ref'), params.get('generics_module')) except Exception: error_msg = 'Running fetch_data returned an error:\n{}\n'.format( traceback.format_exc()) error_msg += 'Please try to specify generics type and name as generics_module\n' raise ValueError(error_msg) returnVal = {'data_matrix': data_matrix} return returnVal def export_matrix(self, params): """ export_matrix: univeral downloader for matrix data object arguments: obj_ref: generics object reference optional arguments: generics_module: select the generics data to be retrieved from e.g. for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; and only data is needed generics_module should be {'data': 'FloatMatrix2D'} """ log('Start exporting matrix') if 'input_ref' in params: params['obj_ref'] = params.pop('input_ref') obj_source = self.dfu.get_objects( {"object_refs": [params.get('obj_ref')]})['data'][0] obj_data = obj_source.get('data') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_source.get('info')[1])) data_matrix = self.fetch_data(params).get('data_matrix') df = pd.read_json(data_matrix) df.to_excel(file_path, sheet_name='data') if obj_data.get('col_mapping'): self._write_mapping_sheet(file_path, 'col_mapping', obj_data.get('col_mapping'), ['col_name', 'condition_name']) obj_data.pop('col_mapping') if obj_data.get('row_mapping'): self._write_mapping_sheet(file_path, 'row_mapping', obj_data.get('row_mapping'), ['row_name', 'condition_name']) obj_data.pop('row_mapping') try: obj_data.pop('data') except KeyError: log('Missing key [data]') self._write_mapping_sheet(file_path, 'metadata', obj_data, ['name', 'value']) shock_id = self._upload_to_shock(file_path) return {'shock_id': shock_id}
def find_motifs(self, ctx, params): """ :param params: instance of type "get_promoter_for_gene_input" (Genome is a KBase genome Featureset is a KBase featureset Promoter_length is the length of promoter requested for all genes) -> structure: parameter "workspace_name" of String, parameter "genome_ref" of String, parameter "featureSet_ref" of String, parameter "promoter_length" of Long :returns: instance of type "get_promoter_for_gene_output_params" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN find_motifs #TODO: have these guys return output paths for key, value in params.iteritems(): print key if 'motif_min_length' not in params: params['motif_min_length'] = 8 if 'motif_max_length' not in params: params['motif_max_length'] = 16 motMin = params['motif_min_length'] motMax = params['motif_max_length'] promoterFastaFilePath = self.get_promoter_for_gene(ctx, params)[0] gibbsCommandList = [] for i in range(motMin, motMax + 1, 2): gibbsCommandList.append( GU.build_gibbs_command(promoterFastaFilePath, i)) for g in gibbsCommandList: GU.run_gibbs_command(g) #gibbsCommand = GU.build_gibbs_command(promoterFastaFilePath) #GU.run_gibbs_command(gibbsCommand) #print(promoterFastaFilePath) homerMotifCommand = HU.build_homer_motif_command(promoterFastaFilePath) homerLocationCommand = HU.build_homer_location_command( promoterFastaFilePath) os.mkdir(self.shared_folder + '/homer_out') #print(homerMotifCommand) HU.run_homer_command(homerMotifCommand) HU.run_homer_command(homerLocationCommand) MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath) MEU.run_meme_command(MEMEMotifCommand) gibbsMotifList = GU.parse_gibbs_output(motMin, motMax) homerMotifList = HU.parse_homer_output() memeMotifList = MEU.parse_meme_output() timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) timestamp = str(timestamp) htmlDir = self.shared_folder + '/html' + timestamp os.mkdir(htmlDir) lineCount = 0 with open(promoterFastaFilePath, 'r') as pFile: for line in pFile: lineCount += 1 numFeat = lineCount / 2 with open(promoterFastaFilePath, 'r') as pFile: fileStr = pFile.read() promHtmlStr = '<html><body> ' + fileStr + ' </body></html>' with open(htmlDir + '/promoters.html', 'w') as promHTML: promHTML.write(promHtmlStr) JsonPath = '/kb/module/work/tmp' subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/gibbs.json', htmlDir + '/gibbs.html', str(numFeat) ]) subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/homer_out/homer.json', htmlDir + '/homer.html', str(numFeat) ]) subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/meme_out/meme.json', htmlDir + '/meme.html', str(numFeat) ]) fullMotifList = [] for h in homerMotifList: add = True for g in gibbsMotifList: if h['Iupac_signature'] == g['Iupac_signature']: add = False break for m in memeMotifList: if m['Iupac_signature'] == h['Iupac_signature']: add = False break if add: fullMotifList.append(h) for g in gibbsMotifList: add = True for m in memeMotifList: if m['Iupac_signature'] == g['Iupac_signature']: add = False break if add: fullMotifList.append(g) for m in memeMotifList: fullMotifList.append(m) #What needs to happen here: #call makeLogo for each of the json outputs(capture these from somewhere) dfu = DataFileUtil(self.callback_url) parsed = ['gibbs.html', 'homer.html', 'meme.html', 'promoters.html'] indexHtmlStr = '<html>' #use js to load the page content for p in parsed: indexHtmlStr += '<head><script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script> <script> $(function(){$("#' + p.replace( '.html', '_content') + '").load("' + p + '"); });</script> ' indexHtmlStr += """<style> body {font-family: Arial;} /* Style the tab */ .tab { overflow: hidden; border: 1px solid #ccc; background-color: #f1f1f1; } /* Style the buttons inside the tab */ .tab button { background-color: inherit; float: left; border: none; outline: none; cursor: pointer; padding: 14px 16px; transition: 0.3s; font-size: 17px; } /* Change background color of buttons on hover */ .tab button:hover { background-color: #ddd; } /* Create an active/current tablink class */ .tab button.active { background-color: #ccc; } /* Style the tab content */ .tabcontent { display: none; padding: 6px 12px; border: 1px solid #ccc; border-top: none; } </style></head> """ indexHtmlStr += '<body>' #adding tabs indexHtmlStr += '<div class="tab">\n' for p in parsed: indexHtmlStr += '<button class="tablinks" onclick="openReport(event, \'' + p.replace( '.html', '_content') + '\')">' + p.replace('.html', '') + '</button>' indexHtmlStr += '</div>' for p in parsed: indexHtmlStr += '<div id="' + p.replace( '.html', '_content') + '" class="tabcontent"></div>' indexHtmlStr += """<script> function openReport(evt, reportName) { var i, tabcontent, tablinks; tabcontent = document.getElementsByClassName("tabcontent"); for (i = 0; i < tabcontent.length; i++) { tabcontent[i].style.display = "none"; } tablinks = document.getElementsByClassName("tablinks"); for (i = 0; i < tablinks.length; i++) { tablinks[i].className = tablinks[i].className.replace(" active", ""); } document.getElementById(reportName).style.display = "block"; evt.currentTarget.className += " active"; } </script>""" #for p in parsed: # indexHtmlStr += '<a href="' + p + '">' + p.replace('.html','') +' Output</a>\n' #indexHtmlStr += '</body></html>' with open(htmlDir + '/index.html', 'w') as html_handle: html_handle.write(str(indexHtmlStr)) #plt.rcParams['figure.dpi'] = 300 #htmlFiles = ['index.html','gibbs.html','homer.html'] #shockParamsList = [] #for f in htmlFiles: # shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'}) try: html_upload_ret = dfu.file_to_shock({ 'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip' }) except: raise ValueError('error uploading HTML file to shock') #Create motif set object from MotifList #TODO set parameters correctly #add narrative support to set MSO = {} MSO['Condition'] = 'Temp' MSO['FeatureSet_ref'] = '123' MSO['Motifs'] = [] MSO['Alphabet'] = ['A', 'C', 'G', 'T'] MSO['Background'] = {} for letter in MSO['Alphabet']: MSO['Background'][letter] = 0.0 MSU.parseMotifList(fullMotifList, MSO) objname = 'MotifSet' + str( int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)) #Pass motif set into this save_objects_params = {} #save_objects_params['id'] = self.ws_info[0] #save_objects_params['id'] = long(params['workspace_name'].split('_')[1]) save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) save_objects_params['objects'] = [{ 'type': 'KBaseGwasData.MotifSet', 'data': MSO, 'name': objname }] info = dfu.save_objects(save_objects_params)[0] motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4]) #object_upload_ret = dfu.file_to_shock() reportName = 'identify_promoter_report_' + str(uuid.uuid4()) reportObj = { 'objects_created': [{ 'ref': motif_set_ref, 'description': 'Motif Set generated by identify promoter' }], 'message': '', 'direct_html': None, 'direct_html_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # attach to report obj #reportObj['direct_html'] = None reportObj['direct_html'] = '' reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{ 'shock_id': html_upload_ret['shock_id'], #'name': 'promoter_download.zip', 'name': 'index.html', 'label': 'Save promoter_download.zip' }] report = KBaseReport(self.callback_url, token=ctx['token']) #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']}) report_info = report.create_extended_report(reportObj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END find_motifs # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method find_motifs return value ' + 'output is not type dict as required.') # return the results return [output]
class variation_importer_utils: def __init__(self, utility_params): self.params = utility_params # self.scratch = utility_params['scratch'] self.scratch = os.path.join(utility_params['scratch'], 'variation_importer_' + str(uuid.uuid4())) os.mkdir(self.scratch) self.service_wiz_url = utility_params['srv-wiz-url'] self.callback_url = utility_params['callback_url'] self.dfu = DataFileUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url, token=utility_params['token']) def _create_fake_location_data(self): location = { 'lat': random.uniform(-90, 90), 'lon': random.uniform(-180, 180), 'elevation': random.uniform(0, 100), 'description': "".join([random.choice(string.ascii_letters) for n in xrange(20)]) } return location def _create_fake_straininfo(self, genotype_id): straininfo = { 'source_id': genotype_id, 'location_info': self._create_fake_location_data() } return straininfo def _create_fake_population(self, genotypes): population = {'description': 'Faker population data.', 'strains': []} for genome in genotypes: population['strains'].append(self._create_fake_straininfo(genome)) return population def _create_fake_kinship_matrix(self): kinship = { 'row_ids': ['one', 'two'], 'col_ids': ['one', 'two'], 'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]] } return kinship def _compare(self, s, t): return Counter(s) == Counter(t) def pretend_download_staging_file(self, vcf_filename, scratch): vcf_filepath = os.path.join(scratch, vcf_filename) shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath) return {'copy_file_path': vcf_filepath} def _generate_population(self, location_filepath, genotypes, population_description="None Provided"): locations = pd.read_csv(location_filepath, delimiter='\t') # Drop any missing data from id, latitude, or longitude. locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True) # Compare the location IDs with the genotype IDs if not (self._compare(locations.iloc[:, 0].astype(str).tolist(), genotypes)): log("Location IDs do not match Sample IDs in Variation file!") raise ValueError( "Location IDs do not match Sample IDs in Variation file!") col_names = [x.lower() for x in locations.columns.values] expected_columns = ['id', 'latitude', 'longitude'] optional_columns = ['elevation', 'description'] # CHeck that first three columns match the expected columns. if not (self._compare(col_names[0:3], expected_columns)): raise ValueError("Missing or unexpected column names in {}".format( location_filepath)) # If optional columns are not present, give default value for each. for col in optional_columns: if col not in col_names: if col == 'elevation': locations[col] = 0.0 else: locations[col] = "None provided." population = {'description': population_description, 'strains': []} for idx, row in locations.iterrows(): population['strains'].append({ 'source_id': str(row['id']), 'location_info': { 'lat': row['latitude'], 'lon': row['longitude'], 'elevation': row['elevation'], 'description': row['description'] } }) return population def _validate_vcf(self, vcf_filepath, vcf_version): validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4())) os.mkdir(validation_output_dir) if vcf_version >= 4.1: print("Using vcf_validator_linux...") validator_cmd = ["vcf_validator_linux"] validator_cmd.append("-i") validator_cmd.append(vcf_filepath) validator_cmd.append("-o") validator_cmd.append(validation_output_dir) else: print("Using vcftools to validate...") validator_cmd = ["vcf-validator"] validator_cmd.append(vcf_filepath) print("VCF version below 4.1. No validation logging.") print("Validator command: {}".format(validator_cmd)) p = subprocess.Popen(validator_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) validator_output = [] while True: line = p.stdout.readline() if not line: break validator_output.append(line) p.wait() validation_output_filename = [ f for f in os.listdir(validation_output_dir) if f.endswith('.txt') ][0] validation_output_filepath = os.path.join(validation_output_dir, validation_output_filename) if not validation_output_filename: print('Validator did not generate log file!') raise Exception("Validator did not generate a log file.") log("Validator output filepath: {}".format(validation_output_filepath)) log("Return code from validator {}".format(p.returncode)) return validation_output_filepath, p.returncode # Retrieve contigs from assembly file. def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'): try: assembly_data = self.dfu.get_objects( {'object_refs': [assembly_ref]})['data'][0]['data'] except Exception as e: print("Unable to retrieve Assembly reference: {}".format( assembly_ref)) raise ValueError(e) raw_contigs = assembly_data['contigs'] contigs = {} # Contigs returns just a dict with key and contig_id for key, value in raw_contigs.iteritems(): contigs[str(key)] = value['contig_id'] return raw_contigs def _get_version_contigs_genotypes(self, vcf_filepath): contigs = [] genotypes = [] version = '' with (gzip.open if vcf_filepath.endswith('.gz') else open)( vcf_filepath, 'rt') as vcf: line = vcf.readline() tokens = line.split('=') if not (tokens[0].startswith('##fileformat')): log("Invalid VCF. ##fileformat line in meta is improperly formatted." ) raise ValueError( "Invalid VCF. ##fileformat line in meta is improperly formatted." ) version = float(tokens[1][-4:].rstrip()) log("VCF version: {}".format(version)) for line in vcf: if line.startswith("#CHROM"): log("#CHROM encountered, exiting loop.") genotypes = line.split()[9:] log("Number Genotypes in vcf: {}".format(len(genotypes))) break tokens = line.split("=") if tokens[0].startswith('##contig'): contigs.append(tokens[2][:-2]) return version, contigs, genotypes # Arabidopsis ref: 18590/2/8 def _get_assembly_ref_from_genome(self, genome_ref): ga = GenomeAnnotationAPI(self.service_wiz_url) inputs_get_assembly = {'ref': genome_ref} try: assembly_object_ref = ga.get_assembly(inputs_get_assembly) except Exception as e: print( "Unable to retrieve Assembly reference ID from Genome ref_id: {}" .format(genome_ref)) raise Exception(e) return assembly_object_ref def _generate_output_file_list(self): log('Start packing result files') output_files = list() result_file = os.path.join(self.scratch, 'variation_importer_results.zip') excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store'] with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(self.scratch): for file in files: if not (file.endswith(tuple(excluded_extensions)) # file.endswith('.zip') or # file.endswith('.vcf') or # file.endswith('.vcf.gz') or # file.endswith('.html') or # file.endswith('.DS_Store') ): zip_file.write(os.path.join(root, file), file) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Variation Importer' }) log("Importer output generated: {}".format(output_files)) return output_files def _generate_report(self, params, variation_results, variation_file_path): stats_results = self._generate_variation_stats( params['additional_output_type'], variation_file_path) html_report = self._generate_html_report(variation_results, stats_results) file_links = self._generate_output_file_list() objects = [] if (variation_results['valid_variation_file']): objects = [{ 'ref': variation_results['variation_obj_ref'], 'description': 'Variation Object created by VCF Importer' }] report_params = { 'objects_created': objects, 'message': '', 'direct_html_link_index': 0, 'file_links': file_links, 'html_links': html_report, 'html_window_height': 330, 'workspace_name': params['workspace_name'], 'report_object_name': 'variation_importer_report_' + str(uuid.uuid4()) } kbr_output = self.kbr.create_extended_report(report_params) report_output = { 'report_name': kbr_output['name'], 'report_ref': kbr_output['ref'], 'variation_ref': variation_results['variation_obj_ref'] } log("Returning from _generate_report!") return report_output def _generate_html_report(self, variation_results, stats_output=None): """ _generate_html_report: generate html report from output files """ html_report = list() print("Validation output filepath passed to html report: {}".format( variation_results['validation_output_filepath'])) try: report_dir = os.path.join(self.scratch, 'html') os.mkdir(report_dir) with open(template_dir, 'r') as html, open( variation_results['validation_output_filepath'], 'r') as validation: validation_content = '<p><h4>{} '.format( variation_results['variation_filename']) if variation_results.get('valid_variation_file'): validation_content += '<em><i>is</i> a valid </em> variation file.' else: validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.' validation_content += '</h4></p>' report = html.read() # Discard the first line of the validation file. It is irrelevant. validation.readline() validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>' validation_content += '<ul>' for line in validation.readlines(): validation_content += '<li>{}</li>'.format(line) validation_content += '</ul>' if variation_results.get('invalid_contigs'): validation_content += '<h4>The following Contigs were not found in the reference genome. The possible contigs have been written to the file {}. Please see the associated links to download.</h4>'.format( variation_results.get('genome_ref'), 'valid_contigs.txt') validation_content += '<ul>' for contig in variation_results.get('invalid_contigs'): validation_content += '<li>{}</li>'.format(contig) validation_content += '</ul>' # if not variation_results.get('contigs'): # validation_content += '<h4>No contig information was included in the VCF file header! Please recreate the VCF file with each contig described in the meta description </h4>' report = report.replace('Validation_Results', validation_content) if (stats_output.get('stats_file_dir')): summary_results = '<p><h4>Summary Statistics</h4></p>' summary_results += ''' <table> <tr> <th>Number of SNPs</th> <th>Number of Genotypes </th> </tr> ''' summary_results += '<tr>' summary_results += '<td>{}</td><td>{}</td>'.format( 'To be added later', variation_results['num_genotypes']) summary_results += '</tr></table>' report = report.replace('Variation_Statistics', summary_results) # visualization image_content = '' if (stats_output.get('stats_img_dir')): image_dir = stats_output.get('stats_img_dir') for file in glob.glob(os.path.join(image_dir, '*.png')): shutil.move(file, report_dir) for image in glob.glob(report_dir + "/*.png"): image = image.replace(report_dir + '/', '') caption = image.replace(report_dir + '/', '').replace('.png', '') image_content += '<p style="text-align:center"><img align="center" src="{}" ' \ '></a><a target="_blank"><br>' \ '<p align="center">{}</p></p>'.format(image, caption) else: image_content += 'No visualizations generated.' report = report.replace("Visualization_Results", image_content) except Exception as e: print("Error generating HTML report.") raise report_file_path = os.path.join(report_dir, 'index.html') with open(report_file_path, 'w') as output: output.write(report) try: html_upload_ret = self.dfu.file_to_shock({ 'file_path': report_file_path, 'make_handle': 0, 'pack': 'zip' }) log("Variation HTML report to shock ref: {}".format( html_upload_ret)) except: raise ValueError('Error uploading HTML to shock') html_report.append({ 'shock_id': html_upload_ret['shock_id'], 'name': os.path.basename(report_file_path), 'label': os.path.basename(report_file_path), 'description': 'HTML report for Variation Importer' }) return html_report def _generate_variation_stats(self, additional_output_type, variation_filepath): """ :param commments go here """ file_output_directory = os.path.join(self.scratch, 'stats_' + str(uuid.uuid4())) os.mkdir(file_output_directory) image_output_directory = os.path.join( self.scratch, 'stats_images_' + str(uuid.uuid4())) os.mkdir(image_output_directory) # TODO: Validate user supplied params and build PLINK command plink_cmd = ["plink"] plink_cmd.append('--vcf') plink_cmd.append(variation_filepath) # plink_cmd.append('--recode12') # plink_cmd.append('transpose') # plink_cmd.append('--output-missing-genotype') # plink_cmd.append("0") plink_cmd.append('--freq') plink_cmd.append('--hardy') # plink_cmd.append('gz') plink_cmd.append('--out') plink_cmd.append(variation_filepath) print("PLINK arguments: {}".format(plink_cmd)) plink_output = { "errors": [], "warnings": [] # "notes" : [] } p = subprocess.Popen(plink_cmd, cwd=file_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) while True: line = p.stdout.readline() if not line: break # log(line) tokens = line.split(':') if (tokens[0] == 'Error'): plink_output['errors'].append(line) raise ValueError('PLINK 1.9 error: ' + line) elif (tokens[0] == 'Warning'): plink_output['warnings'].append(line) print(line) # elif(tokens[0] == 'Note'): # plink_output['notes'].append(line) # print(line) p.stdout.close() p.wait() plink_output_filepath = os.path.join(file_output_directory, 'plink_cli_output.txt') with open(plink_output_filepath, 'w') as plink: for data in plink_output: plink.write("{}: {}\n".format(data, plink_output[data])) plink_output_files = [ f for f in os.listdir(self.scratch) if f.startswith(os.path.basename(variation_filepath) + '.') ] for file in plink_output_files: shutil.move(os.path.join(self.scratch, file), file_output_directory) if p.returncode != 0: log("PLINK encountered an error during runtime. Please see log file." ) variation_filename = os.path.basename(variation_filepath) base_filepath = os.path.join(file_output_directory, variation_filename) freq_filepath = base_filepath + '.frq' maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R' hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R' log("Frequency filepath: {}".format(freq_filepath)) # TODO: make function to do Rscript calls. # generate visualizations and store in directory maf_command = ['Rscript'] maf_command.append('--no-save') maf_command.append('--vanilla') maf_command.append(maf_script_filepath) maf_command.append(freq_filepath) maf_command.append("Minor Allele Frequencies.png") r = subprocess.Popen(maf_command, cwd=image_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) r.wait() if r.returncode != 0: log("Error creating MAF histogram in R") hwe_filepath = base_filepath + '.hwe' zoom_filepath = hwe_filepath + '.zoom' zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format( hwe_filepath, zoom_filepath) try: z = subprocess.Popen(zoom_command, cwd=file_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) z.wait() if z.returncode != 0: log("Error creating HWE zoom file.") except Exception as e: log("Error creating zoom HWE file: {}".format(e)) hwe_command = ['Rscript'] hwe_command.append('--no-save') hwe_command.append('--vanilla') hwe_command.append(hwe_script_filepath) hwe_command.append(hwe_filepath) hwe_command.append("Hardy-Weinberg Equilibrium.png") hwe_command.append(zoom_filepath) hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png") print("MAF command: {}".format(hwe_command)) h = subprocess.Popen(hwe_command, cwd=image_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) h.wait() if h.returncode != 0: log("Error generating HWE Zoom plot") return { 'stats_file_dir': file_output_directory, 'stats_img_dir': image_output_directory } def _save_variation_to_ws(self, workspace_name, variation_obj, variation_filepath, kinship_matrix): ws_id = self.dfu.ws_name_to_id(workspace_name) try: vcf_shock_return = self.dfu.file_to_shock({ 'file_path': variation_filepath, 'make_handle': 1, 'pack': 'gzip' }) except Exception as e: print("Error uploading file to shock!") raise ValueError(e) variation_obj['variation_file_reference'] = vcf_shock_return.get( 'shock_id') info = self.dfu.save_objects({ 'id': ws_id, 'objects': [{ 'type': 'KBaseGwasData.Variations', 'data': variation_obj, 'name': 'TestVariationImporterName' }] })[0] variation_ref = "%s/%s/%s" % (info[6], info[0], info[4]) log("Variation reference created: {}".format(variation_ref)) return variation_ref def validate_vcf(self, params): """ :param params: dict containing all input parameters. """ returnVal = {} valid_vcf_file = True try: vcf_filepath = self.pretend_download_staging_file( params['staging_file_subdir_path'], self.scratch).get('copy_file_path') location_filepath = self.pretend_download_staging_file( params['location_file_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['staging_file_subdir_path'])) try: location_filepath = self.pretend_download_staging_file( params['location_file_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['location_file_subdir_path'])) # Check file size log("{} file size: {}".format(vcf_filepath, os.path.getsize(vcf_filepath))) log('\nValidating {}...'.format(vcf_filepath)) vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes( vcf_filepath) if not vcf_contigs: log("No contig data in {} header.".format(vcf_filepath)) raise ValueError( "No contig data in {} header.".format(vcf_filepath)) if (vcf_version < 4.1): log("VCF file is version {}. Must be at least version 4.1".format( vcf_version)) raise ValueError( "VCF file is version {}. Must be at least version 4.1".format( vcf_version)) # Generate population object population = self._generate_population(location_filepath, vcf_genotypes) # Retrieve Assembly object reference associated with genome. try: assembly_ref = self._get_assembly_ref_from_genome( params['genome_ref']) except Exception as e: print("Unable to retrieve {}".format(params['genome_ref'])) raise ValueError(e) # Retrieve contig list from Assembly object. try: assembly_contigs = self._get_contigs_from_assembly(assembly_ref) except Exception as e: print("Unable to retrieve contigs from Assembly ref: {}".format( assembly_ref)) raise ValueError(e) log("Length of assembly contigs: {}".format(len(assembly_contigs))) # Compare contig IDs from VCF to those in the Assembly object invalid_contigs = [] for contig in vcf_contigs: if contig not in assembly_contigs.keys(): invalid_contigs.append(contig) if invalid_contigs: log("Invalid contig IDs found in {}".format(vcf_filepath)) valid_contig_filepath = os.path.join(self.scratch, 'valid_contigs.txt') log("Writing valid contigs to file: {}".format( valid_contig_filepath)) with open(valid_contig_filepath, 'w') as icf: for contig in assembly_contigs: icf.write(contig + '\n') valid_vcf_file = False validation_output_filepath, returncode = self._validate_vcf( vcf_filepath, vcf_version) if returncode != 0: valid_vcf_file = False kinship_matrix = self._create_fake_kinship_matrix() variation_obj_ref = '' if valid_vcf_file: variation_object = { "genome": params['genome_ref'], "population": population, "contigs": vcf_contigs, "comment": "Comments go here", "assay": "Assay data goes gere.", "originator": "PI/Lab info goes here", "pubmed_id": "PubMed ID goes here", "kinship_info": kinship_matrix } variation_obj_ref = self._save_variation_to_ws( params['workspace_name'], variation_object, vcf_filepath, kinship_matrix) log("Variation object reference: {}".format(variation_obj_ref)) variation_report_metadata = { 'valid_variation_file': valid_vcf_file, 'variation_obj_ref': variation_obj_ref, 'variation_filename': os.path.basename(vcf_filepath), 'validation_output_filepath': validation_output_filepath, 'vcf_version': vcf_version, 'num_genotypes': len(vcf_genotypes), 'num_contigs': len(vcf_contigs), 'invalid_contigs': invalid_contigs } returnVal = self._generate_report(params, variation_report_metadata, vcf_filepath) return returnVal
class GenomeInterface: def _validate_save_one_genome_params(self, params): """ _validate_save_one_genome_params: validates params passed to save_one_genome method """ log('start validating save_one_genome params') # check for required parameters for p in ['workspace', 'name', 'data']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _check_shock_response(self, response, errtxt): """ _check_shock_response: check shock node response (Copied from DataFileUtil) """ log('start checking shock response') if not response.ok: try: err = json.loads(response.content)['error'][0] except: # this means shock is down or not responding. self.log("Couldn't parse response error content from Shock: " + response.content) response.raise_for_status() raise ValueError(errtxt + str(err)) def _own_handle(self, genome_data, handle_property): """ _own_handle: check that handle_property point to shock nodes owned by calling user """ log('start checking handle {} ownership'.format(handle_property)) if handle_property in genome_data: handle_id = genome_data[handle_property] hs = HandleService(self.handle_url, token=self.token) handles = hs.hids_to_handles([handle_id]) shock_id = handles[0]['id'] # Copy from DataFileUtil.own_shock_node implementation: header = {'Authorization': 'Oauth {}'.format(self.token)} res = requests.get(self.shock_url + '/node/' + shock_id + '/acl/?verbosity=full', headers=header, allow_redirects=True) self._check_shock_response( res, 'Error getting ACLs for Shock node {}: '.format(shock_id)) owner = res.json()['data']['owner']['username'] user_id = self.auth_client.get_user(self.token) if owner != user_id: log('start copying node to owner: {}'.format(user_id)) dfu_shock = self.dfu.copy_shock_node({ 'shock_id': shock_id, 'make_handle': True }) handle_id = dfu_shock['handle']['hid'] genome_data[handle_property] = handle_id def _check_dna_sequence_in_features(self, genome): """ _check_dna_sequence_in_features: check dna sequence in each feature """ log('start checking dna sequence in each feature') if 'features' in genome: features_to_work = {} for feature in genome['features']: if not ('dna_sequence' in feature and feature['dna_sequence']): features_to_work[feature['id']] = feature['location'] if len(features_to_work) > 0: aseq = AssemblySequenceAPI(self.sw_url, token=self.token) get_dna_params = {'requested_features': features_to_work} if 'assembly_ref' in genome: get_dna_params['assembly_ref'] = genome['assembly_ref'] elif 'contigset_ref' in genome: get_dna_params['contigset_ref'] = genome['contigset_ref'] else: # Nothing to do (it may be test genome without contigs)... return dna_sequences = aseq.get_dna_sequences( get_dna_params)['dna_sequences'] for feature in genome['features']: if feature['id'] in dna_sequences: feature['dna_sequence'] = dna_sequences[feature['id']] feature['dna_sequence_length'] = len( feature['dna_sequence']) def __init__(self, config): self.ws_url = config.workspaceURL self.handle_url = config.handleURL self.shock_url = config.shockURL self.sw_url = config.srvWizURL self.token = config.token self.auth_service_url = config.authServiceUrl self.callback_url = config.callbackURL self.ws = Workspace(self.ws_url, token=self.token) self.auth_client = _KBaseAuth(self.auth_service_url) self.dfu = DataFileUtil(self.callback_url) def save_one_genome(self, params): log('start saving genome object') self._validate_save_one_genome_params(params) workspace = params['workspace'] name = params['name'] data = params['data'] # check all handles point to shock nodes owned by calling user self._own_handle(data, 'genbank_handle_ref') self._own_handle(data, 'gff_handle_ref') self._check_dna_sequence_in_features(data) if 'hidden' in params and str( params['hidden']).lower() in ('yes', 'true', 't', '1'): hidden = 1 else: hidden = 0 if isinstance(workspace, int) or workspace.isdigit(): workspace_id = workspace else: workspace_id = self.dfu.ws_name_to_id(workspace) dfu_save_params = { 'id': workspace_id, 'objects': [{ 'type': 'KBaseGenomes.Genome', 'data': data, 'name': name, 'hidden': hidden }] } dfu_oi = self.dfu.save_objects(dfu_save_params)[0] returnVal = {'info': dfu_oi} return returnVal
class CufflinksUtils: CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/' GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/' def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass def parse_FPKMtracking_calc_TPM(self, filename): """ Generates TPM from FPKM :return: """ fpkm_dict = {} tpm_dict = {} gene_col = 0 fpkm_col = 9 sum_fpkm = 0.0 with open(filename) as f: next(f) for line in f: larr = line.split("\t") gene_id = larr[gene_col] if gene_id != "": fpkm = float(larr[fpkm_col]) sum_fpkm = sum_fpkm + fpkm fpkm_dict[gene_id] = math.log(fpkm + 1, 2) tpm_dict[gene_id] = fpkm if sum_fpkm == 0.0: log("Warning: Unable to calculate TPM values as sum of FPKM values is 0" ) else: for g in tpm_dict: tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2) return fpkm_dict, tpm_dict def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_cufflinks_params(self, params): """ _validate_run_cufflinks_params: Raises an exception if params are invalid """ log('Start validating run_cufflinks params') # check for required parameters for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _run_gffread(self, gff_path, gtf_path): """ _run_gffread: run gffread script ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility """ log('converting gff to gtf') command = self.GFFREAD_TOOLKIT_PATH + '/gffread ' command += "-E {0} -T -o {1}".format(gff_path, gtf_path) self._run_command(command) def _create_gtf_annotation_from_genome(self, genome_ref): """ Create reference annotation file from genome """ ref = self.ws.get_object_subset([{ 'ref': genome_ref, 'included': ['contigset_ref', 'assembly_ref'] }]) if 'contigset_ref' in ref[0]['data']: contig_id = ref[0]['data']['contigset_ref'] elif 'assembly_ref' in ref[0]['data']: contig_id = ref[0]['data']['assembly_ref'] if contig_id is None: raise ValueError( "Genome at {0} does not have reference to the assembly object". format(genome_ref)) print(contig_id) log("Generating GFF file from Genome") try: ret = self.au.get_assembly_as_fasta({'ref': contig_id}) output_file = ret['path'] mapping_filename = c_mapping.create_sanitized_contig_ids( output_file) os.remove(output_file) # get the GFF ret = self.gfu.genome_to_gff({'genome_ref': genome_ref}) genome_gff_file = ret['file_path'] c_mapping.replace_gff_contig_ids(genome_gff_file, mapping_filename, to_modified=True) gtf_ext = ".gtf" if not genome_gff_file.endswith(gtf_ext): gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf' self._run_gffread(genome_gff_file, gtf_path) else: gtf_path = genome_gff_file log("gtf file : " + gtf_path) except Exception: raise ValueError( "Generating GTF file from Genome Annotation object Failed : {}" .format("".join(traceback.format_exc()))) return gtf_path def _get_gtf_file(self, alignment_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch alignment_data = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0]['data'] genome_ref = alignment_data.get('genome_id') # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1] # ws_gtf = genome_name+"_GTF_Annotation" genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_gtf_file_from_genome_ref(self, genome_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_input_file(self, alignment_ref): """ _get_input_file: get input BAM file from Alignment object """ bam_file_dir = self.rau.download_alignment( {'source_ref': alignment_ref})['destination_dir'] files = os.listdir(bam_file_dir) bam_file_list = [ file for file in files if re.match(r'.*\_sorted\.bam', file) ] if not bam_file_list: bam_file_list = [ file for file in files if re.match(r'.*(?<!sorted)\.bam', file) ] if not bam_file_list: raise ValueError('Cannot find .bam file from alignment {}'.format( alignment_ref)) bam_file_name = bam_file_list[0] bam_file = os.path.join(bam_file_dir, bam_file_name) return bam_file def _generate_command(self, params): """ _generate_command: generate cufflinks command """ cufflinks_command = '/opt/cufflinks/cufflinks' cufflinks_command += (' -q --no-update-check -p ' + str(params.get('num_threads', 1))) if 'max_intron_length' in params and params[ 'max_intron_length'] is not None: cufflinks_command += (' --max-intron-length ' + str(params['max_intron_length'])) if 'min_intron_length' in params and params[ 'min_intron_length'] is not None: cufflinks_command += (' --min-intron-length ' + str(params['min_intron_length'])) if 'overhang_tolerance' in params and params[ 'overhang_tolerance'] is not None: cufflinks_command += (' --overhang-tolerance ' + str(params['overhang_tolerance'])) cufflinks_command += " -o {0} -G {1} {2}".format( params['result_directory'], params['gtf_file'], params['input_file']) log('Generated cufflinks command: {}'.format(cufflinks_command)) return cufflinks_command def _process_rnaseq_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing RNASeqAlignment object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) if '/' not in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_rnaseq_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params['gtf_file'], params['expression_suffix']) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _process_kbasesets_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing KBaseSets object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_kbasesets_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params.get('gtf_file'), params.get('expression_suffix')) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_html_report(self, result_directory, obj_ref): """ _generate_html_report: generate html summary report """ log('Start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_object_type = expression_object.get('info')[2] Overview_Content = '' if re.match('KBaseRNASeq.RNASeqExpression-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data'][ 'sample_expression_ids']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] Overview_Content += '<p>{}</p>'.format(expression_name) elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): pprint(expression_object) Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data']['items']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref['ref'] }], includeMetadata=None)[0][1] condition = expression_ref['label'] Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format( condition, expression_name) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', Overview_Content) result_file.write(report_template) html_report.append({ 'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Cufflinks App' }) return html_report def _save_rnaseq_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_rnaseq_expression: save Expression object to workspace """ log('start saving Expression object') alignment_object_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_kbasesets_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils and SetAPI """ log('start saving Expression object') alignment_info = self.ws.get_object_info3( {'objects': [{ "ref": alignment_ref }]}) alignment_object_name = alignment_info['infos'][0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_rnaseq_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_rnaseq_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _save_kbasesets_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_kbasesets_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _generate_report(self, obj_ref, workspace_name, result_directory, exprMatrix_FPKM_ref=None, exprMatrix_TPM_ref=None): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report(result_directory, obj_ref) expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_info = expression_object['info'] expression_data = expression_object['data'] expression_object_type = expression_info[2] if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseSets.ExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'ExpressionSet generated by Cufflinks' }] items = expression_data['items'] for item in items: objects_created.append({ 'ref': item['ref'], 'description': 'Expression generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_FPKM_ref, 'description': 'FPKM ExpressionMatrix generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_TPM_ref, 'description': 'TPM ExpressionMatrix generated by Cufflinks' }) report_params = { 'message': '', 'workspace_name': workspace_name, 'file_links': output_files, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _parse_FPKMtracking(self, filename, metric): result = {} pos1 = 0 if metric == 'FPKM': pos2 = 7 if metric == 'TPM': pos2 = 8 with open(filename) as f: next(f) for line in f: larr = line.split("\t") if larr[pos1] != "": try: result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2) except ValueError: result[larr[pos1]] = math.log(1, 2) return result def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'cufflinks_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Cufflinks App' }) return output_files def _generate_expression_data(self, result_directory, alignment_ref, gtf_file, workspace_name, expression_suffix): """ _generate_expression_data: generate Expression object with cufflinks output files """ alignment_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0] # set expression name alignment_object_name = alignment_data_object['info'][1] if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_data = { 'id': expression_name, 'type': 'RNA-Seq', 'numerical_interpretation': 'FPKM', 'processing_comments': 'log2 Normalized', 'tool_used': self.tool_used, 'tool_version': self.tool_version } alignment_data = alignment_data_object['data'] condition = alignment_data.get('condition') expression_data.update({'condition': condition}) genome_id = alignment_data.get('genome_id') expression_data.update({'genome_id': genome_id}) read_sample_id = alignment_data.get('read_sample_id') expression_data.update( {'mapped_rnaseq_alignment': { read_sample_id: alignment_ref }}) exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM( os.path.join(result_directory, 'genes.fpkm_tracking')) expression_data.update({'expression_levels': exp_dict}) expression_data.update({'tpm_expression_levels': tpm_exp_dict}) handle = self.dfu.file_to_shock({ 'file_path': result_directory, 'pack': 'zip', 'make_handle': True })['handle'] expression_data.update({'file': handle}) return expression_data def _generate_expression_set_data(self, alignment_expression_map, alignment_set_ref, expression_set_name): """ _generate_expression_set_data: generate ExpressionSet object with cufflinks output files """ alignment_set_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_set_ref }]})['data'][0] alignment_set_data = alignment_set_data_object['data'] expression_set_data = { 'tool_used': self.tool_used, 'tool_version': self.tool_version, 'id': expression_set_name, 'alignmentSet_id': alignment_set_ref, 'genome_id': alignment_set_data.get('genome_id'), 'sampleset_id': alignment_set_data.get('sampleset_id') } sample_expression_ids = [] mapped_expression_objects = [] mapped_expression_ids = [] for alignment_expression in alignment_expression_map: alignment_ref = alignment_expression.get('alignment_ref') expression_ref = alignment_expression.get('expression_obj_ref') sample_expression_ids.append(expression_ref) mapped_expression_ids.append({alignment_ref: expression_ref}) alignment_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] mapped_expression_objects.append({alignment_name: expression_name}) expression_set_data['sample_expression_ids'] = sample_expression_ids expression_set_data[ 'mapped_expression_objects'] = mapped_expression_objects expression_set_data['mapped_expression_ids'] = mapped_expression_ids return expression_set_data def _process_alignment_set_object(self, params, alignment_object_type): """ _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object and KBaseSets.ReadsAlignmentSet type object """ log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object' '\nparams:\n{}'.format(json.dumps(params, indent=1))) alignment_set_ref = params.get('alignment_set_ref') if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): params['gtf_file'] = self._get_gtf_file(alignment_set_ref) else: if not '/' in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] params['gtf_file'] = self._get_gtf_file_from_genome_ref( params['genome_ref']) alignment_set = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) mul_processor_params = [] for alignment in alignment_set["data"]["items"]: alignment_ref = alignment['ref_path'] alignment_upload_params = params.copy() alignment_upload_params['alignment_ref'] = alignment_ref mul_processor_params.append(alignment_upload_params) # use the following when you want to run the cmd sequentially # self._process_kbasesets_alignment_object(mul_processor_params[0]) cpus = min(params.get('num_threads'), multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) alignment_expression_map = pool.map( self._process_kbasesets_alignment_object, mul_processor_params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) expression_items = list() for proc_alignment_return in alignment_expression_map: expression_obj_ref = proc_alignment_return.get( 'expression_obj_ref') alignment_ref = proc_alignment_return.get('alignment_ref') alignment_info = self.ws.get_object_info3({ 'objects': [{ "ref": alignment_ref }], 'includeMetadata': 1 }) condition = alignment_info['infos'][0][10]['condition'] expression_items.append({ "ref": expression_obj_ref, "label": condition, }) expression_name = self.ws.get_object_info( [{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] self._run_command('cp -R {} {}'.format( proc_alignment_return.get('result_directory'), os.path.join(result_directory, expression_name))) expression_set = { "description": "generated by kb_cufflinks", "items": expression_items } expression_set_info = self.set_api.save_expression_set_v1({ "workspace": params['workspace_name'], "output_object_name": params['expression_set_name'], "data": expression_set }) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_set_info['set_ref'] } widget_params = { "output": params.get('expression_set_name'), "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_output_object_name(self, params, alignment_object_type, alignment_object_name): """ Generates the output object name based on input object type and name and stores it in params with key equal to 'expression' or 'expression_set' based on whether the input object is an alignment or alignment_set. :param params: module input params :param alignment_object_type: input alignment object type :param alignment_object_name: input alignment object name :param alignment_object_data: input alignment object data """ expression_set_suffix = params['expression_set_suffix'] expression_suffix = params['expression_suffix'] if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): if re.match('.*_[Aa]lignment$', alignment_object_name): params['expression_name'] = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_name'] = alignment_object_name + expression_suffix if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix def _save_expression_matrix(self, expressionset_ref, workspace_name): """ _save_expression_matrix: save FPKM and TPM ExpressionMatrix """ log('start saving ExpressionMatrix object') expression_set_name = self.ws.get_object_info( [{ "ref": expressionset_ref }], includeMetadata=None)[0][1] output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '', expression_set_name) upload_expression_matrix_params = { 'expressionset_ref': expressionset_ref, 'output_obj_name': output_obj_name_prefix, 'workspace_name': workspace_name } expression_matrix_refs = self.eu.get_expressionMatrix( upload_expression_matrix_params) return expression_matrix_refs def run_cufflinks_app(self, params): log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_cufflinks_params(params) alignment_object_ref = params.get('alignment_object_ref') alignment_object_info = self.ws.get_object_info3( {"objects": [{ "ref": alignment_object_ref }]})['infos'][0] alignment_object_type = alignment_object_info[2] alignment_object_name = alignment_object_info[1] # get output object name self._generate_output_object_name(params, alignment_object_type, alignment_object_name) log('--->\nalignment object type: \n' + '{}'.format(alignment_object_type)) if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): params.update({'alignment_ref': alignment_object_ref}) returnVal = self._process_rnaseq_alignment_object(params) report_output = self._generate_report( returnVal.get('expression_obj_ref'), params.get('workspace_name'), returnVal.get('result_directory')) returnVal.update(report_output) elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \ re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): params.update({'alignment_set_ref': alignment_object_ref}) returnVal = self._process_alignment_set_object( params, alignment_object_type) expression_matrix_refs = self._save_expression_matrix( returnVal['expression_obj_ref'], params.get('workspace_name')) returnVal.update(expression_matrix_refs) report_output = self._generate_report( returnVal['expression_obj_ref'], params.get('workspace_name'), returnVal['result_directory'], expression_matrix_refs['exprMatrix_FPKM_ref'], expression_matrix_refs['exprMatrix_TPM_ref']) returnVal.update(report_output) else: raise ValueError( 'None RNASeqAlignment type\nObject info:\n{}'.format( alignment_object_info)) return returnVal
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.dfu = DataFileUtil(self.cfg.callbackURL) def import_file(self, params): # 1) validate parameters self._validate_import_file_params(params) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) # 4) do the upload result = self.upload_genome( shock_service_url=self.cfg.shockURL, handle_service_url=self.cfg.handleURL, workspace_service_url=self.cfg.workspaceURL, callback_url=self.cfg.callbackURL, input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], taxon_wsname=params['taxon_wsname'], taxon_reference=params['taxon_reference'], source=params['source'], genome_type=params['type'], release=params['release']) # 5) generate report output_data_ref = params['workspace_name'] + "/" + params['genome_name'] reportObj = { 'objects_created': [{ 'ref': output_data_ref, 'description': 'KBase Genome object' }], 'text_message': result['report_string'] } reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL']) report_info = reportClient.create({ 'report': reportObj, 'workspace_name': params['workspace_name'] }) # 6) clear the temp directory shutil.rmtree(input_directory) # 7) return the result info = result['genome_info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info, 'report_name': report_info['name'], 'report_ref': report_info['ref'] } return details def upload_genome(self, shock_service_url=None, handle_service_url=None, workspace_service_url=None, callback_url=None, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", taxon_wsname='ReferenceTaxons', taxon_reference=None, source=None, release=None, genome_type=None): # retrieve taxon taxonomy, taxon_reference = self._retrieve_taxon( taxon_reference, taxon_wsname, scientific_name) # reading in Fasta file assembly = self._retrieve_fasta_file(input_fasta_file, core_genome_name, scientific_name, source) if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference # reading in GFF file feature_list = self._retrieve_gff_file(input_gff_file) # compile links between features feature_hierarchy = self._generate_feature_hierarchy(feature_list) # retrieve genome feature list (genome_features_list, genome_mrnas_list, genome_cdss_list) = self._retrieve_genome_feature_list( feature_list, feature_hierarchy, assembly) # remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file) workspace_id = self.dfu.ws_name_to_id(workspace_name) genome_info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": genome }] })[0] report_string = '' return {'genome_info': genome_info, 'report_string': report_string} def _validate_import_file_params(self, params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(file.keys()) raise ValueError(error_msg) # check for valid type param valid_types = ['Reference', 'User upload', 'Representative'] if params.get('type') and params['type'] not in valid_types: error_msg = 'Entered value for type is not one of the valid entries of ' error_msg += '[' + ''.join('"' + str(e) + '", ' for e in valid_types)[0:-2] + ']' raise ValueError(error_msg) def _set_parsed_params(self, params): log('Setting params') # default params default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'type': 'User upload', 'metadata': {} } for field in default_params: if field not in params: params[field] = default_params[field] log(json.dumps(params, indent=1)) return params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: print("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name): """ _retrieve_taxon: retrieve taxonomy and taxon_reference """ taxon_id = -1 taxon_object_name = "unknown_taxon" # retrieve lookup object if scientific name provided if (taxon_reference is None and scientific_name is not "unknown_taxon"): # retrieve taxon lookup object then find taxon id taxon_lookup = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/taxon_lookup"], 'ignore_errors': 0 })['data'][0]['data']['taxon_lookup'] if (scientific_name[0:3] in taxon_lookup and scientific_name in taxon_lookup[scientific_name[0:3]]): taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name] taxon_object_name = "{}_taxon".format(str(taxon_id)) # retrieve Taxon object taxon_info = {} if (taxon_reference is None): taxon_info = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/" + taxon_object_name], 'ignore_errors': 0 })['data'][0] taxon_reference = "{}/{}/{}".format(taxon_info['info'][6], taxon_info['info'][0], taxon_info['info'][4]) else: taxon_info = self.dfu.get_objects({ "object_refs": [taxon_reference], 'ignore_errors': 0 })['data'][0] taxonomy = taxon_info['data']['scientific_lineage'] return taxonomy, taxon_reference def _retrieve_fasta_file(self, input_fasta_file, core_genome_name, scientific_name, source): """ _retrieve_fasta_file: retrieve info from fasta_file https://www.biostars.org/p/710/ """ log("Reading FASTA file") assembly = { "contigs": {}, "dna_size": 0, "gc_content": 0, "md5": [], "base_counts": {} } contig_seq_start = 0 input_file_handle = open(input_fasta_file, 'rb') # alternate header and sequence faiter = (x[1] for x in itertools.groupby(input_file_handle, lambda line: line[0] == ">")) for header in faiter: # drop the ">" header = header.next()[1:].strip() # join all sequence lines to one. seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None # Handle record seq = seq.upper() # Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) # to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict # used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" return assembly def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = dict() is_phytozome = 0 is_patric = 0 gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if ("phytozome" in source_id or "Phytozome" in source_id): is_phytozome = 1 #Checking to see if Phytozome if ("PATRIC" in source_id): is_patric = 1 #PATRIC prepends their contig ids with some gibberish if (is_patric and "|" in contig_id): contig_id = contig_id.split("|", 1)[1] #Features grouped by contigs first if (contig_id not in feature_list): feature_list[contig_id] = list() #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': attributes } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") ftr[key] = value feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if (is_phytozome): self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline if (is_phytozome): self._print_phytozome_gff(input_gff_file, feature_list) return feature_list def _add_missing_identifiers(self, feature_list): #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list.keys(): for i in range(len(feature_list[contig])): if ("ID" not in feature_list[contig][i]): for key in ("transcriptId", "proteinId", "PACid", "pacid", "Parent"): if (key in feature_list[contig][i]): feature_list[contig][i]['ID'] = feature_list[ contig][i][key] break #If the process fails, throw an error for ftr_type in ("gene", "mRNA", "CDS"): if (ftr_type not in feature_list[contig][i]): continue if ("ID" not in feature_list[contig][i]): log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \ feature_list[contig][i]['contig']+"."+ \ feature_list[contig][i]['source']+"."+ \ feature_list[contig][i]['type']+": "+ \ feature_list[contig][i]['attributes']) return feature_list def _generate_feature_hierarchy(self, feature_list): feature_hierarchy = {contig: {} for contig in feature_list} #Need to remember mRNA/gene links for CDSs mRNA_gene_dict = {} exon_list_position_dict = {} for contig in feature_list: for i in range(len(feature_list[contig])): ftr = feature_list[contig][i] if ("gene" in ftr["type"]): feature_hierarchy[contig][ftr["ID"]] = { "utrs": [], "mrnas": [], "cdss": [], "index": i } if ("UTR" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ ftr["Parent"]]]["utrs"].append({ "id": ftr["ID"], "index": i }) if ("RNA" in ftr["type"]): feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({ "id": ftr["ID"], "index": i, "cdss": [] }) mRNA_gene_dict[ftr["ID"]] = ftr["Parent"] exon_list_position_dict[ftr["ID"]] = len( feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1 if ("CDS" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\ [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } ) return feature_hierarchy def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list.keys(): ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list def _update_phytozome_features(self, feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list.keys(): feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("ID", "PACid", "pacid"): if (key in feature_list[contig][i]): old_id = feature_list[contig][i][key] break if (old_id is None): #This should be an error print ("Cannot find unique ID, PACid, or pacid in GFF attributes: ",\ feature_list[contig][i][contig],feature_list[contig][i][source],feature_list[contig][i][attributes]) continue #Retain old_id feature_position_dict[old_id] = i #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("Name" in feature_list[contig][i]): feature_list[contig][i]["ID"] = feature_list[contig][i][ "Name"] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended #4) CDS appended with an incremented digit CDS_count_dict = dict() mRNA_parent_dict = dict() for contig in feature_list.keys(): for ftr in feature_list[contig]: if ("Parent" in ftr): #Retain old_id of parents old_id = ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] if ("CDS" in ftr["type"]): #Increment CDS identifier if (ftr["ID"] not in CDS_count_dict): CDS_count_dict[ftr["ID"]] = 1 else: CDS_count_dict[ftr["ID"]] += 1 ftr["ID"] = ftr["ID"] + "." + str( CDS_count_dict[ftr["ID"]]) #Recall new mRNA id for parent ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]] return feature_list def _print_phytozome_gff(self, input_gff_file, feature_list): #Write modified feature ids to new file input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz" try: print "Printing to new file: " + input_gff_file gff_file_handle = gzip.open(input_gff_file, 'wb') except: print "Failed to open" for contig in sorted(feature_list.iterkeys()): for ftr in feature_list[contig]: #Re-build attributes attributes_dict = {} for attribute in ftr["attributes"].split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") if (ftr[key] != value): value = ftr[key] attributes_dict[key] = value ftr["attributes"] = ";".join(key + "=" + attributes_dict[key] for key in attributes_dict.keys()) new_line = "\t".join( str(ftr[key]) for key in [ 'contig', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ]) gff_file_handle.write(new_line) gff_file_handle.close() return def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy, assembly): genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() genome_translation_issues = list() for contig in feature_hierarchy: for gene in feature_hierarchy[contig]: #We only iterate through the gene objects #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly ftr = feature_list[contig][feature_hierarchy[contig][gene] ["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] gene_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Add non-optional terms gene_ftr["mrnas"] = list() gene_ftr["cdss"] = list() gene_ftr["ontology_terms"] = dict() #Retaining longest sequences for gene feature longest_protein_length = 0 longest_protein_sequence = "" for mRNA in feature_hierarchy[contig][gene]["mrnas"]: ######################################################## # Construct mRNA Ftr ######################################################## ftr = feature_list[contig][mRNA["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] mRNA_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Modify mrna object for use in mrna array #Objects will be un-used until further notice mRNA_ftr['parent_gene'] = gene_ftr['id'] #If there are CDS, then New CDS ID without incrementation as they were aggregated if (len(mRNA['cdss']) > 0): mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS" else: mRNA_ftr['cds'] = "" #Add to mrnas array genome_mrnas_list.append(mRNA_ftr) #Add ids to gene_ftr arrays gene_ftr["mrnas"].append(mRNA_ftr["id"]) ######################################################## # Construct transcript, protein sequence, UTR, CDS locations ######################################################## #At time of writing, all of this aggregation should probably be done in a single function cds_exons_locations_array = list() cds_cdna_sequence = str() protein_sequence = str() if (len(mRNA["cdss"]) > 0): (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \ self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues) UTRs = list() if ("utrs" in feature_hierarchy[contig][gene] and len(feature_hierarchy[contig][gene]["utrs"]) > 0): for UTR in feature_hierarchy[contig][gene]["utrs"]: ftr = feature_list[contig][UTR["index"]] if ("Parent" in ftr and ftr["Parent"] == mRNA_ftr["id"]): UTRs.append(ftr) mrna_exons_locations_array = copy.deepcopy( cds_exons_locations_array) mrna_transcript_sequence = str(cds_cdna_sequence) if (len(UTRs) > 0): (mrna_exons_locations_array, mrna_transcript_sequence) = \ self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence) #Update sequence and locations mRNA_ftr["dna_sequence"] = mrna_transcript_sequence mRNA_ftr["dna_sequence_length"] = len( mrna_transcript_sequence) mRNA_ftr["location"] = mrna_exons_locations_array mRNA_ftr["md5"] = hashlib.md5( mRNA_ftr["dna_sequence"]).hexdigest() #Remove DNA del mRNA_ftr["dna_sequence"] del mRNA_ftr["dna_sequence_length"] #Skip CDS if not present if (len(mRNA["cdss"]) == 0): continue #Remove asterix representing stop codon if present if (len(protein_sequence) > 0 and protein_sequence[-1] == '*'): protein_sequence = protein_sequence[:-1] #Save longest sequence if (len(protein_sequence) > longest_protein_length): longest_protein_length = len(protein_sequence) longest_protein_sequence = protein_sequence ######################################################## # Construct CDS Ftr ######################################################## CDS_ftr = dict() CDS_ftr['type'] = 'CDS' #New CDS ID without incrementation as they were aggregated CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS' #Add gene/mrna links CDS_ftr['parent_gene'] = gene_ftr['id'] CDS_ftr['parent_mrna'] = mRNA_ftr['id'] #Update sequence and locations CDS_ftr["dna_sequence"] = cds_cdna_sequence CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence) CDS_ftr["location"] = cds_exons_locations_array CDS_ftr["md5"] = hashlib.md5( CDS_ftr["dna_sequence"]).hexdigest() #Add protein CDS_ftr["protein_translation"] = str( protein_sequence).upper() CDS_ftr["protein_translation_length"] = len( CDS_ftr["protein_translation"]) #Only generate md5 for dna sequences #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest() #Add empty non-optional fields for populating in future CDS_ftr["ontology_terms"] = dict() if ("aliases" not in CDS_ftr): CDS_ftr["aliases"] = list() if ("function" not in CDS_ftr): CDS_ftr["function"] = "" #Add to cdss array genome_cdss_list.append(CDS_ftr) #Add ids to gene_ftr arrays gene_ftr["cdss"].append(CDS_ftr["id"]) gene_ftr["protein_translation"] = longest_protein_sequence gene_ftr["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_ftr) msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format( len(genome_features_list), len(genome_mrnas_list), len(genome_cdss_list)) msg += "{} mRNA(s) had errors during translation".format( len(genome_translation_issues)) log(msg) return genome_features_list, genome_mrnas_list, genome_cdss_list def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = gff_file_to_shock['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref return genome def _convert_ftr_object(self, old_ftr, contig): new_ftr = dict() new_ftr["id"] = old_ftr["ID"] dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]], IUPAC.ambiguous_dna) # reverse complement if (old_ftr["strand"] == "-"): dna_sequence = dna_sequence.reverse_complement() old_start = old_ftr["start"] old_ftr["start"] = old_ftr["end"] old_ftr["end"] = old_start new_ftr["dna_sequence"] = str(dna_sequence).upper() new_ftr["dna_sequence_length"] = len(dna_sequence) new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest() new_ftr["location"] = [[ old_ftr["contig"], old_ftr["start"], old_ftr["strand"], len(dna_sequence) ]] new_ftr["type"] = old_ftr["type"] new_ftr["aliases"] = list() for key in ("transcriptId", "proteinId", "PACid", "pacid"): if (key in old_ftr.keys()): new_ftr["aliases"].append(key + ":" + old_ftr[key]) return new_ftr def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence): #create copies of locations and transcript utrs_exons = list(exons) utr_exon_sequence = exon_sequence five_prime_dna_sequence = "" three_prime_dna_sequence = "" five_prime_locations = list() three_prime_locations = list() for UTR in (utr_list): contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"] UTR_ftr = self._convert_ftr_object( UTR, contig_sequence ) #reverse-complementation for negative strands done here #aggregate sequences and locations if ("five_prime" in UTR_ftr["id"]): five_prime_dna_sequence += UTR_ftr["dna_sequence"] five_prime_locations.append(UTR_ftr["location"][0]) if ("three_prime" in UTR_ftr["id"]): three_prime_dna_sequence += UTR_ftr["dna_sequence"] three_prime_locations.append(UTR_ftr["location"][0]) #Handle five_prime UTRs if (len(five_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file) five_prime_locations = sorted(five_prime_locations, key=lambda x: x[1]) #Merge last UTR with CDS if "next" to each other if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \ ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ): #Remove last UTR last_five_prime_location = five_prime_locations[-1] five_prime_locations = five_prime_locations[:-1] #"Add" last UTR to first exon utrs_exons[0][1] = last_five_prime_location[1] utrs_exons[0][3] += last_five_prime_location[3] #Prepend other UTRs if available if (len(five_prime_locations) > 0): utrs_exons = five_prime_locations + utrs_exons utr_exon_sequence = five_prime_dna_sequence + utr_exon_sequence #Handle three_prime UTRs if (len(three_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file three_prime_locations = sorted(three_prime_locations, key=lambda x: x[1]) #Merge first UTR with CDS if "next to each other if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \ ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ): #Remove first UTR first_three_prime_location = three_prime_locations[0] three_prime_locations = three_prime_locations[1:] #"Add" first UTR to last exon utrs_exons[-1][3] += first_three_prime_location[3] #Append other UTRs if available if (len(three_prime_locations) > 0): utrs_exons = utrs_exons + three_prime_locations utr_exon_sequence += three_prime_dna_sequence return (utrs_exons, utr_exon_sequence) def _cds_aggregation_translation(self, cds_list, feature_list, assembly, issues): dna_sequence = "" locations = list() # collect phases, and lengths of exons # right now, this is only for the purpose of error reporting phases = list() exons = list() #Saving parent mRNA identifier Parent_mRNA = cds_list[0]["id"] for CDS in (cds_list): ftr = feature_list[CDS["index"]] phases.append(ftr["phase"]) Parent_mRNA = ftr["Parent"] contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"] CDS_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here exons.append(len(CDS_ftr["dna_sequence"])) # Remove base(s) according to phase, but only for first CDS if (CDS == cds_list[0] and int(ftr["phase"]) != 0): log("Adjusting phase for first CDS: " + CDS["id"]) CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][ int(ftr["phase"]):] #aggregate sequences and locations dna_sequence += CDS_ftr["dna_sequence"] locations.append(CDS_ftr["location"][0]) # translate sequence dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna) rna_sequence = dna_sequence_obj.transcribe() # incomplete gene model with no start codon if str(rna_sequence.upper())[:3] not in codon_table.start_codons: msg = "Missing start codon for {}. Possibly incomplete gene model.".format( Parent_mRNA) log(msg) # You should never have this problem, needs to be reported rather than "fixed" codon_count = len(str(rna_sequence)) % 3 if codon_count != 0: msg = "Number of bases for RNA sequence for {} ".format( Parent_mRNA) msg += "is not divisible by 3. " msg += "The resulting protein may well be mis-translated." log(msg) issues.append(Parent_mRNA) protein_sequence = Seq("") try: protein_sequence = rna_sequence.translate() except CodonTable.TranslationError as te: log("TranslationError for: " + feature_object["id"], phases, exons, " : " + str(te)) return (locations, dna_sequence.upper(), str(protein_sequence).upper())
class Utils: def __init__(self, config): self.cfg = config self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.kbse = KBaseSearchEngine(config['search-url']) self.gen_api = GenericsAPI(self.callback_url) self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom" self.DEFAULT_ONTOLOGY_ID = "Custom:Term" self.DEFAULT_UNIT_ID = "Custom:Unit" @staticmethod def validate_params(params, expected, opt_param=set()): """Validates that required parameters are present. Warns if unexpected parameters appear""" expected = set(expected) opt_param = set(opt_param) pkeys = set(params) if expected - pkeys: raise ValueError( "Required keys {} not in supplied parameters".format( ", ".join(expected - pkeys))) defined_param = expected | opt_param for param in params: if param not in defined_param: logging.warning( "Unexpected parameter {} supplied".format(param)) def get_conditions(self, params): data = self.dfu.get_objects( {'object_refs': [params['condition_set_ref']]})['data'][0]['data'] conditions = {} keep_keys = params.get('conditions', data['conditions'].keys()) for key in keep_keys: conditions[key] = defaultdict(list) for factor, val in zip(data['factors'], data['conditions'][key]): ont_abriv = factor['factor_ont_id'].split(":")[0] factor['value'] = val conditions[key][ont_abriv].append(copy.copy(factor)) return {"conditions": conditions} def file_to_condition_set(self, params): """Convert a user supplied file to a compound set""" if 'input_file_path' in params: scratch_file_path = params['input_file_path'] elif 'input_shock_id' in params: scratch_file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') else: raise ValueError( "Must supply either a input_shock_id or input_file_path") try: df = pd.read_excel(scratch_file_path, dtype='str') except XLRDError: df = pd.read_csv(scratch_file_path, sep="\t", dtype='str') comp_set = self._df_to_cs_obj(df) info = self.dfu.save_objects({ "id": params['output_ws_id'], "objects": [{ "type": "KBaseExperiments.ConditionSet", "data": comp_set, "name": params['output_obj_name'] }] })[0] return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])} def _conditionset_data_to_df(self, data): """ Converts a compound set object data to a dataframe """ factors = pd.DataFrame(data['factors']) factors.rename(columns=lambda x: x.replace("ont", "ontology"). capitalize().replace("_", " ")) conditions = pd.DataFrame(data['conditions']) cs_df = factors.join(conditions) return cs_df def _clusterset_data_to_df(self, data): """ Converts a cluster set object data to a dataframe """ original_matrix_ref = data.get('original_data') data_matrix = self.gen_api.fetch_data({ 'obj_ref': original_matrix_ref }).get('data_matrix') data_df = pd.read_json(data_matrix) clusters = data.get('clusters') id_name_list = [ cluster.get('id_to_data_position').keys() for cluster in clusters ] id_names = [item for sublist in id_name_list for item in sublist] if set(data_df.columns.tolist()) == set( id_names): # cluster is based on condition data_df = data_df.T cluster_names = [None] * data_df.index.size cluster_id = 0 for cluster in clusters: item_ids = cluster.get('id_to_data_position').keys() item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids] for idx in item_idx: cluster_names[idx] = cluster_id cluster_id += 1 data_df['cluster'] = cluster_names return data_df def _ws_obj_to_df(self, input_ref): """Converts workspace obj to a dataframe""" res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0] name = res['info'][1] obj_type = res['info'][2] if "KBaseExperiments.ConditionSet" in obj_type: cs_df = self._conditionset_data_to_df(res['data']) elif "KBaseExperiments.ClusterSet" in obj_type: cs_df = self._clusterset_data_to_df(res['data']) else: err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type) err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet' raise ValueError("err_msg") return name, cs_df, obj_type def _df_to_cs_obj(self, cs_df): """Converts a dataframe from a user file to a compound set object""" condition_set = {'ontology_mapping_method': "User Curation"} cs_df.fillna('', inplace=True) if not len(cs_df): raise ValueError("No factors in supplied files") factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor") condition_df = cs_df.drop(factor_df.columns, axis=1) if not len(condition_df.columns): raise ValueError( "Unable to find any condition columns in supplied file") factor_df.rename( columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(), inplace=True) if "factor" not in factor_df.columns: raise ValueError( "Unable to find a 'Factor' column in supplied file") factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id') factors = factor_df.filter(items=factor_fields).to_dict('records') condition_set['factors'] = [ self._add_ontology_info(f) for f in factors ] condition_set['conditions'] = condition_df.to_dict('list') return condition_set def _search_ontologies(self, term, closest=False): """ Match to an existing KBase ontology term :param term: Test to match :param closest: if false, term must exactly match an ontology ID :return: dict(ontology_ref, id) """ params = { "object_types": ["OntologyTerm"], "match_filter": { "lookup_in_keys": { "id": { "value": term } } }, "access_filter": { "with_private": 0, "with_public": 1 }, "pagination": { "count": 1 }, "post_processing": { "skip_data": 1 } } if closest: params['match_filter'] = {"full_text_in_all": term} res = self.kbse.search_objects(params) if not res['objects']: return None term = res['objects'][0] return { "ontology_ref": term['guid'].split(":")[1], "id": term['key_props']['id'] } def _add_ontology_info(self, factor): """Searches KBASE ontologies for terms matching the user supplied factors and units. Add the references if found""" optionals = { "unit", "unit_ont_id", "unit_ont_ref", } factor = { k: v for k, v in factor.items() if k not in optionals or v != "" } ont_info = self._search_ontologies( factor.get('factor_ont_id', "").replace("_", ":")) if ont_info: factor['factor_ont_ref'] = ont_info['ontology_ref'] factor['factor_ont_id'] = ont_info['id'] else: factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID if factor.get('unit'): ont_info = self._search_ontologies( factor.get('unit_ont_id', '').replace("_", ":")) if ont_info: factor['unit_ont_ref'] = ont_info['ontology_ref'] factor['unit_ont_id'] = ont_info['id'] else: factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF factor['unit_ont_id'] = self.DEFAULT_UNIT_ID return factor def to_tsv(self, params): """Convert an compound set to TSV file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".tsv") df.to_csv(files['file_path'], sep="\t", index=False) return _id, files def to_excel(self, params): """Convert an compound set to Excel file""" files = {} _id, df, obj_type = self._ws_obj_to_df(params['input_ref']) files['file_path'] = os.path.join(params['destination_dir'], _id + ".xlsx") writer = pd.ExcelWriter(files['file_path']) if "KBaseExperiments.ConditionSet" in obj_type: df.to_excel(writer, "Conditions", index=False) elif "KBaseExperiments.ClusterSet" in obj_type: df.to_excel(writer, "ClusterSet", index=True) # else is checked in `_ws_obj_to_df` writer.save() return _id, files def export(self, file, name, input_ref): """Saves a set of files to SHOCK for export""" export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4())) os.makedirs(export_package_dir) shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [input_ref] }) return {'shock_id': package_details['shock_id']}
class ReadsAlignmentUtils: ''' Module Name: ReadsAlignmentUtils Module Description: A KBase module: ReadsAlignmentUtils This module is intended for use by Aligners and Assemblers to upload and download alignment files. The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files in sam and bai formats from the downloaded bam file. This utility also generates stats from the stored alignment. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git" GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a" #BEGIN_CLASS_HEADER PARAM_IN_FILE = 'file_path' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_CONDITION = 'condition' PARAM_IN_READ_LIB_REF = 'read_library_ref' PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref' PARAM_IN_ALIGNED_USING = 'aligned_using' PARAM_IN_ALIGNER_VER = 'aligner_version' PARAM_IN_ALIGNER_OPTS = 'aligner_opts' PARAM_IN_REPLICATE_ID = 'replicate_id' PARAM_IN_PLATFORM = 'platform' PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index' PARAM_IN_SAMPLESET_REF = 'sampleset_ref' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_DOWNLOAD_SAM = 'downloadSAM' PARAM_IN_DOWNLOAD_BAI = 'downloadBAI' PARAM_IN_VALIDATE = 'validate' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') INVALID_WS_NAME_RE = re.compile('[^\\w:._-]') def _get_file_path_info(self, file_path): """ Given a file path, returns the directory, file name, file base and file extension """ dir, file_name = os.path.split(file_path) file_base, file_ext = os.path.splitext(file_name) return dir, file_name, file_base, file_ext def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _check_required_param(self, in_params, param_list): """ Checks if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Checks the validity of workspace and object params and returns them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def _proc_upload_alignment_params(self, ctx, params): """ Checks the presence and validity of upload alignment params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION, self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) file_path = params.get(self.PARAM_IN_FILE) if not (os.path.isfile(file_path)): raise ValueError('File does not exist: ' + file_path) lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2] if lib_type.startswith('KBaseFile.SingleEndLibrary') or \ lib_type.startswith('KBaseFile.PairedEndLibrary') or \ lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \ lib_type.startswith('KBaseAssembly.PairedEndLibrary'): pass else: raise ValueError(self.PARAM_IN_READ_LIB_REF + ' parameter should be of type' + ' KBaseFile.SingleEndLibrary or' + ' KBaseFile.PairedEndLibrary or' + ' KBaseAssembly.SingleEndLibrary or' + ' KBaseAssembly.PairedEndLibrary') obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2] if obj_type.startswith('KBaseGenomes.Genome') or \ obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \ obj_type.startswith('KBaseGenomes.ContigSet'): pass else: raise ValueError(self.PARAM_IN_ASM_GEN_REF + ' parameter should be of type' + ' KBaseGenomes.Genome or' + ' KBaseGenomeAnnotations.Assembly or' + ' KBaseGenomes.ContigSet') return ws_name_id, obj_name_id, file_path, lib_type def _get_aligner_stats(self, bam_file): """ Gets the aligner stats from BAM file """ path, file = os.path.split(bam_file) return self.samtools.get_stats(file, path) def _validate(self, params): samt = SamTools(self.config, self.__LOGGER) if 'ignore' in params: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path, ignore=params['ignore']) else: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path) return rval #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.__LOGGER = logging.getLogger('KBaseRNASeq') if 'log_level' in config: self.__LOGGER.setLevel(config['log_level']) else: self.__LOGGER.setLevel(logging.INFO) streamHandler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - \ %(levelname)s - %(message)s") formatter.converter = time.gmtime streamHandler.setFormatter(formatter) self.__LOGGER.addHandler(streamHandler) self.__LOGGER.info("Logger was set") script_utils.check_sys_stat(self.__LOGGER) self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.ws_url = config['workspace-url'] self.dfu = DataFileUtil(self.callback_url) self.samtools = SamTools(config) #END_CONSTRUCTOR pass def validate_alignment(self, ctx, params): """ :param params: instance of type "ValidateAlignmentParams" (* Input parameters for validating a reads alignment. For validation errors to ignore, see http://broadinstitute.github.io/picard/command-line-overview.html#V alidateSamFile) -> structure: parameter "file_path" of String, parameter "ignore" of list of String :returns: instance of type "ValidateAlignmentOutput" (* Results from validate alignment *) -> structure: parameter "validated" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)) """ # ctx is the context object # return variables are: returnVal #BEGIN validate_alignment rval = self._validate(params) if rval == 0: returnVal = {'validated': True} else: returnVal = {'validated': False} #END validate_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method validate_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def upload_alignment(self, ctx, params): """ Validates and uploads the reads alignment * :param params: instance of type "UploadAlignmentParams" (* Required input parameters for uploading a reads alignment string destination_ref - object reference of alignment destination. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id file_path - File with the path of the sam or bam file to be uploaded. If a sam file is provided, it will be converted to the sorted bam format before being saved read_library_ref - workspace object ref of the read sample used to make the alignment file condition - assembly_or_genome_ref - workspace object ref of genome assembly or genome object that was used to build the alignment *) -> structure: parameter "destination_ref" of String, parameter "file_path" of String, parameter "read_library_ref" of String, parameter "condition" of String, parameter "assembly_or_genome_ref" of String, parameter "aligned_using" of String, parameter "aligner_version" of String, parameter "aligner_opts" of mapping from String to String, parameter "replicate_id" of String, parameter "platform" of String, parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter "sampleset_ref" of type "ws_Sampleset_ref", parameter "mapped_sample_id" of mapping from String to mapping from String to String, parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "UploadAlignmentOutput" (* Output from uploading a reads alignment *) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_alignment self.__LOGGER.info( 'Starting upload Reads Alignment, parsing parameters ') pprint(params) ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params( ctx, params) dir, file_name, file_base, file_ext = self._get_file_path_info( file_path) if self.PARAM_IN_VALIDATE in params and params[ self.PARAM_IN_VALIDATE] is True: if self._validate(params) == 1: raise Exception('{0} failed validation'.format(file_path)) bam_file = file_path if file_ext.lower() == '.sam': bam_file = os.path.join(dir, file_base + '.bam') self.samtools.convert_sam_to_sorted_bam(ifile=file_name, ipath=dir, ofile=bam_file) uploaded_file = self.dfu.file_to_shock({ 'file_path': bam_file, 'make_handle': 1 }) file_handle = uploaded_file['handle'] file_size = uploaded_file['size'] aligner_stats = self._get_aligner_stats(file_path) aligner_data = { 'file': file_handle, 'size': file_size, 'condition': params.get(self.PARAM_IN_CONDITION), 'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF), 'library_type': lib_type, 'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF), 'alignment_stats': aligner_stats } optional_params = [ self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER, self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID, self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX, self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID ] for opt_param in optional_params: if opt_param in params and params[opt_param] is not None: aligner_data[opt_param] = params[opt_param] self.__LOGGER.info('========= Adding extra_provenance_refs') self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF)) self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF)) self.__LOGGER.info('=======================================') res = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": "KBaseRNASeq.RNASeqAlignment", "data": aligner_data, "name": obj_name_id, "extra_provenance_input_refs": [ params.get(self.PARAM_IN_READ_LIB_REF), params.get(self.PARAM_IN_ASM_GEN_REF) ] }] })[0] self.__LOGGER.info('save complete') returnVal = { 'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) } self.__LOGGER.info('Uploaded object: ') self.__LOGGER.info(returnVal) #END upload_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def download_alignment(self, ctx, params): """ Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats * :param params: instance of type "DownloadAlignmentParams" (* Required input parameters for downloading a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "DownloadAlignmentOutput" (* The output of the download method. *) -> structure: parameter "destination_dir" of String, parameter "stats" of type "AlignmentStats" -> structure: parameter "properly_paired" of Long, parameter "multiple_alignments" of Long, parameter "singletons" of Long, parameter "alignment_rate" of Double, parameter "unmapped_reads" of Long, parameter "mapped_reads" of Long, parameter "total_reads" of Long """ # ctx is the context object # return variables are: returnVal #BEGIN download_alignment self.__LOGGER.info('Running download_alignment with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir uuid_str = str(uuid.uuid4()) output_dir = os.path.join(self.scratch, 'download_' + uuid_str) self._mkdir_p(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': alignment[0]['data']['file']['id'], 'file_path': output_dir }) if zipfile.is_zipfile(file_ret.get('file_path')): with zipfile.ZipFile(file_ret.get('file_path')) as z: z.extractall(output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) bam_files = glob.glob(output_dir + '/*.bam') uuid_prefix = uuid_str[:8] if len(bam_files) == 0: raise ValueError("Alignment object does not contain a bam file") for bam_file_path in bam_files: dir, file_name, file_base, file_ext = self._get_file_path_info( bam_file_path) if params.get(self.PARAM_IN_VALIDATE, False): validate_params = {'file_path': bam_file_path} if self._validate(validate_params) == 1: raise Exception( '{0} failed validation'.format(bam_file_path)) if params.get('downloadBAI', False): bai_file = uuid_prefix + '_' + file_base + '.bai' bai_file_path = os.path.join(output_dir, bai_file) self.samtools.create_bai_from_bam(ifile=file_name, ipath=output_dir, ofile=bai_file) if not os.path.isfile(bai_file_path): raise ValueError('Error creating {}'.format(bai_file_path)) if params.get('downloadSAM', False): sam_file = uuid_prefix + '_' + file_base + '.sam' sam_file_path = os.path.join(output_dir, sam_file) self.samtools.convert_bam_to_sam(ifile=file_name, ipath=output_dir, ofile=sam_file) if not os.path.isfile(sam_file_path): raise ValueError('Error creating {}'.format(sam_file_path)) returnVal = { 'destination_dir': output_dir, 'stats': alignment[0]['data']['alignment_stats'] } #END download_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_alignment(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download alignments from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_alignment inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) if params.get(self.PARAM_IN_VALIDATE, False) or \ params.get('exportBAI', False) or \ params.get('exportSAM', False): """ Need to validate or convert files. Use download_alignment """ download_params = {} for key, val in params.iteritems(): download_params[key.replace('export', 'download')] = val download_retVal = self.download_alignment(ctx, download_params)[0] export_dir = download_retVal['destination_dir'] # package and load to shock ret = self.dfu.package_for_download({ 'file_path': export_dir, 'ws_refs': [inref] }) output = {'shock_id': ret['shock_id']} else: """ return shock id from the object """ try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': alignment[0]['data']['file']['id']} #END export_alignment # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_alignment return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
def upload_genome(shock_service_url=None, handle_service_url=None, workspace_service_url=None, callback_url=None, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", taxon_wsname='ReferenceTaxons', taxon_reference=None, source=None, release=None, genome_type=None): assembly_ref = None gff_handle_ref = None time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) dfUtil = DataFileUtil(callback_url) ########################################### #Retrieve taxon #Taxon lookup dependent on full genus #Example: Athaliana Arabidopsis thaliana ########################################### #default to taxon_id = -1 taxon_object_name = "unknown_taxon" #Retrieve lookup object if scientific name provided if (taxon_reference is None and scientific_name is not "unknown_taxon"): #Need to retrieve taxon lookup object then find taxon id taxon_lookup = dfUtil.get_objects({ 'object_refs': [taxon_wsname + "/taxon_lookup"], 'ignore_errors': 0 })['data'][0]['data']['taxon_lookup'] if (scientific_name[0:3] in taxon_lookup and scientific_name in taxon_lookup[scientific_name[0:3]]): taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name] taxon_object_name = "%s_taxon" % (str(taxon_id)) #Retrieve Taxon object taxon_info = {} if (taxon_reference is None): taxon_info = dfUtil.get_objects({ 'object_refs': [taxon_wsname + "/" + taxon_object_name], 'ignore_errors': 0 })['data'][0] taxon_reference = "%s/%s/%s" % (taxon_info['info'][6], taxon_info['info'][0], taxon_info['info'][4]) else: taxon_info = dfUtil.get_objects([{ "object_refs": [taxon_reference], 'ignore_errors': 0 }])['data'][0] taxonomy = taxon_info['data']['scientific_lineage'] ########################################### #End taxonomy retrieval ########################################### ########################################### #Create logger ########################################### logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) # send messages to sys.stderr streamHandler = logging.StreamHandler(sys.stderr) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s" ) formatter.converter = time.gmtime streamHandler.setFormatter(formatter) logger.addHandler(streamHandler) ########################################### #End logger creation ########################################### ########################################## #Reading in Fasta file, Code taken from https://www.biostars.org/p/710/ ########################################## logger.info("Reading FASTA file.") assembly = { "contigs": {}, "dna_size": 0, "gc_content": 0, "md5": [], "base_counts": {} } contig_seq_start = 0 input_file_handle = open(input_fasta_file, 'rb') # alternate header and sequence faiter = (x[1] for x in itertools.groupby(input_file_handle, lambda line: line[0] == ">")) for header in faiter: # drop the ">" header = header.next()[1:].strip() # join all sequence lines to one. seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None #Handle record seq = seq.upper() #Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) #to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict #used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference logger.info("Reading GFF file.") header = list() feature_list = dict() original_CDS_count = dict() original_feature_ids = dict() # gff_file_handle = gzip.open(input_gff_file, 'rb') gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() gff_object = dict() while (current_line != ''): current_line = current_line.strip() if (current_line.startswith("##") or current_line.startswith("#!")): header.append(current_line) if ('headers' not in gff_object): gff_object['headers'] = list() gff_object['headers'].append(current_line) else: if ('features' not in gff_object): gff_object['features'] = list() contig_id, source_id, feature_type, start, end, score, strand, phase, attributes = current_line.split( '\t') attributes_dict = dict() for attribute in attributes.split(";"): if (attribute == "" or "=" not in attribute): continue key, value = attribute.split("=", 1) attributes_dict[key] = value #ID should be transferred from Name or Parent old_id = None for key in ("ID", "PACid", "pacid"): if (key in attributes_dict): old_id = attributes_dict[key] break if (old_id is None): eprint( "Cannot find unique ID, PACid, or pacid in GFF attributes: " + attributes) continue if ("Name" in attributes_dict): attributes_dict["ID"] = attributes_dict["Name"] else: attributes_dict["ID"] = original_feature_ids[ attributes_dict["Parent"]] + "." + feature_type #if CDS have to increment if (feature_type == "CDS"): if (attributes_dict["ID"] not in original_CDS_count): original_CDS_count[attributes_dict["ID"]] = 1 else: original_CDS_count[attributes_dict["ID"]] += 1 attributes_dict["ID"] += "." + str( original_CDS_count[attributes_dict["ID"]]) #Update parent if ("Parent" in attributes_dict): attributes_dict["Parent"] = original_feature_ids[ attributes_dict["Parent"]] original_feature_ids[old_id] = attributes_dict["ID"] #recreate line for GFF partial_line, attributes = current_line.rsplit('\t', 1) new_line = partial_line + "\t" + ";".join( key + "=" + attributes_dict[key] for key in attributes_dict.keys()) gff_object['features'].append(new_line) if (contig_id not in assembly["contigs"]): logger.warn("Missing contig: " + contig_id) if (contig_id not in feature_list): feature_list[contig_id] = list() feature = { 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase } for attribute in attributes.split(";"): if (attribute == "" or "=" not in attribute): continue key, value = attribute.split("=", 1) feature[key] = value #Append contig identifier feature["contig"] = contig_id feature_list[contig_id].append(feature) current_line = gff_file_handle.readline() gff_file_handle.close() #Writing updated lines to gff_file_handle input_gff_file = input_gff_file.replace("gene", "edited_gene") gff_file_handle = gzip.open(input_gff_file, 'wb') if ('headers' in gff_object): gff_file_handle.write("\n".join(gff_object["headers"])) gff_file_handle.write("\n".join(gff_object["features"])) gff_file_handle.close() #New code inserted to better handle feature identifiers #Start by extracting and group them first features_identifiers_dict = dict() features_identifiers_list = list() features_identifiers_count = dict() features_parents_dict = dict() features_name_id_dict = dict() CDS_count = dict() for contig in sorted(feature_list): for feature in feature_list[contig]: #We're only considering gene, mRNA, and CDS for brevity's sake if (feature["type"] not in ("gene", "mRNA", "CDS")): continue #gene and mRNA always have name, CDS do not if ("Name" not in feature): feature["Name"] = None #Update parent following name/id switch if ("Parent" in feature and feature["Parent"] in features_name_id_dict): feature["Parent"] = features_name_id_dict[feature["Parent"]] #ID should be transferred to Name, but need to maintain parent if (feature["Name"] is not None): features_name_id_dict[feature["ID"]] = feature["Name"] feature["ID"] = feature["Name"] else: feature["ID"] = feature["Parent"] + "." + feature["type"] #if CDS have to increment if (feature["type"] == "CDS"): if (feature["ID"] not in CDS_count): CDS_count[feature["ID"]] = 1 else: CDS_count[feature["ID"]] += 1 feature["ID"] += "." + str(CDS_count[feature["ID"]]) #Collect if (feature["type"] == "gene"): features_identifiers_dict[feature["ID"]] = dict() if (feature["type"] == "mRNA"): features_identifiers_dict[feature["Parent"]][ feature["ID"]] = dict() features_parents_dict[feature["ID"]] = feature["Parent"] if (feature["type"] == "CDS"): features_identifiers_dict[features_parents_dict[ feature["Parent"]]][feature["Parent"]][feature["ID"]] = 1 features_identifiers_list.append(feature) features_identifiers_count[ feature["ID"]] = len(features_identifiers_list) - 1 updated_features_identifiers_dict = dict() updated_features_list = list() updated_features_identifiers_count = dict() updated_features_parents_dict = dict() updated_CDS_count = dict() for gene in sorted(features_identifiers_dict): #retrieve original object gene_ftr = features_identifiers_list[features_identifiers_count[gene]] #store gene updated_features_identifiers_dict[gene_ftr["ID"]] = dict() updated_features_list.append(gene_ftr) updated_features_identifiers_count[ gene_ftr["ID"]] = len(updated_features_list) - 1 for mRNA in sorted(features_identifiers_dict[gene], key=lambda x: features_identifiers_count[x]): #retrieve feature mRNA_ftr = features_identifiers_list[ features_identifiers_count[mRNA]] if ("PAC" in mRNA[0:3]): if ("Name" in mRNA_ftr): mRNA_ftr["ID"] = mRNA_ftr["Name"] updated_features_identifiers_dict[gene_ftr["ID"]][ mRNA_ftr["ID"]] = dict() updated_features_parents_dict[mRNA_ftr["ID"]] = mRNA_ftr["Parent"] updated_features_list.append(mRNA_ftr) updated_features_identifiers_count[ mRNA_ftr["ID"]] = len(updated_features_list) - 1 for CDS in sorted(features_identifiers_dict[gene][mRNA], key=lambda x: features_identifiers_count[x]): #retrieve feature CDS_ftr = features_identifiers_list[ features_identifiers_count[CDS]] if ("PAC" in CDS[0:3]): CDS_ftr["ID"] = mRNA_ftr["ID"] + ".CDS" if (CDS_ftr["ID"] not in updated_CDS_count): updated_CDS_count[CDS_ftr["ID"]] = 1 else: updated_CDS_count[CDS_ftr["ID"]] += 1 CDS_ftr["ID"] += "." + str( updated_CDS_count[CDS_ftr["ID"]]) CDS_ftr["Parent"] = mRNA_ftr["ID"] updated_features_identifiers_dict[gene_ftr["ID"]][ mRNA_ftr["ID"]][CDS_ftr["ID"]] = 1 updated_features_parents_dict[ CDS_ftr["ID"]] = CDS_ftr["Parent"] updated_features_list.append(CDS_ftr) updated_features_identifiers_count[ CDS_ftr["ID"]] = len(updated_features_list) - 1 genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() for gene in sorted(updated_features_identifiers_dict): #retrieve updated object gene_ftr = updated_features_list[ updated_features_identifiers_count[gene]] gene_object = convert_ftr_object( gene_ftr, assembly["contigs"][gene_ftr["contig"]]["sequence"]) gene_object["type"] = "gene" #New terms, TODO, move to end of gene loop gene_object["cdss"] = list() gene_object["mrnas"] = list() #use function of longest CDS for gene longest_protein_length = 0 longest_protein_sequence = "" for mRNA in sorted( updated_features_identifiers_dict[gene], key=lambda x: updated_features_identifiers_count[x]): #retrieve updated object mRNA_ftr = updated_features_list[ updated_features_identifiers_count[mRNA]] feature_object = convert_ftr_object( mRNA_ftr, assembly["contigs"][mRNA_ftr["contig"]]["sequence"]) feature_object['parent_gene'] = gene_object['id'] mrna_object = copy.deepcopy(feature_object) cds_object = copy.deepcopy(feature_object) cds_object['id'] = mrna_object['id'] + ".CDS" mrna_object['cds'] = cds_object['id'] cds_object['parent_mrna'] = mrna_object['id'] del mrna_object["dna_sequence"] del mrna_object["dna_sequence_length"] cds_object["ontology_terms"] = dict() gene_object["mrnas"].append(mrna_object["id"]) gene_object["cdss"].append(cds_object["id"]) #CDS aggregation needs to be done in order to build protein sequence and list of locations CDS_list = sorted( updated_features_identifiers_dict[gene][mRNA], key=lambda x: updated_features_identifiers_count[x]) dna_sequence = "" locations = list() #collect phases, and lengths of exons #right now, this is only for the purpose of error reporting phases = list() exons = list() for CDS in (CDS_list): #retrieve updated partial CDS add_ftr = updated_features_list[ updated_features_identifiers_count[CDS]] phases.append(add_ftr["phase"]) add_ftr_obj = convert_ftr_object( add_ftr, assembly["contigs"][add_ftr["contig"]]["sequence"]) exons.append(len(add_ftr_obj["dna_sequence"])) #Remove base(s) according to phase, but only for first CDS if (CDS == CDS_list[0] and int(add_ftr["phase"]) != 0): logger.info("Adjusting phase for first CDS: " + CDS) add_ftr_obj["dna_sequence"] = add_ftr_obj["dna_sequence"][ int(add_ftr["phase"]):] dna_sequence += add_ftr_obj["dna_sequence"] locations.append(add_ftr_obj["location"][0]) #translate sequence dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna) rna_sequence = dna_sequence_obj.transcribe() #Incomplete gene model with no start codon #Translate as is if str(rna_sequence.upper())[:3] not in codon_table.start_codons: logger.info("Missing start codon for " + feature_object["id"] + " Assuming incomplete gene model.") #temp_seq = 'AUG'+str(rna_sequence.upper())[3:] #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna) #You should never have this problem, needs to be reported rather than "fixed" codon_count = len(str(rna_sequence)) % 3 if codon_count != 0: logger.info( "Number of bases for RNA sequence for " + feature_object["id"] + " is not divisible by 3. The resulting protein may well be mis-translated." ) #temp_seq = str(rna_sequence.upper())+"N" #if codon_count == 1: # temp_seq+="N" #new_codon_count=len(temp_seq) % 3 #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna) protein_sequence = Seq("") try: protein_sequence = rna_sequence.translate() #cds=True) except CodonTable.TranslationError as te: logger.info("TranslationError for: " + feature_object["id"], phases, exons, " : " + str(te)) cds_object["protein_translation"] = str(protein_sequence).upper() cds_object["protein_translation_length"] = len( cds_object["protein_translation"]) cds_object["md5"] = hashlib.md5( cds_object["protein_translation"]).hexdigest() if (cds_object["protein_translation_length"] > longest_protein_length): longest_protein_length = cds_object[ "protein_translation_length"] longest_protein_sequence = cds_object["protein_translation"] del cds_object["dna_sequence"] del cds_object["dna_sequence_length"] if ("aliases" not in cds_object): cds_object["aliases"] = list() if ("function" not in cds_object): cds_object["function"] = "" #End of mRNA loop genome_mrnas_list.append(mrna_object) genome_cdss_list.append(cds_object) #End of gene loop gene_object["ontology_terms"] = dict() gene_object["protein_translation"] = longest_protein_sequence gene_object["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_object) #remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] # assembly_string = simplejson.dumps(assembly, sort_keys=True, indent=4, ensure_ascii=False) # assembly_file = open("Bulk_Phytozome_Upload/"+assembly["name"]+'.json', 'w+') # assembly_file.write(assembly_string) # assembly_file.close() if (assembly_ref == None): #Upload FASTA to shock #Need to gunzip file first gunzipped_fasta_file = input_fasta_file # gunzipped_fasta_file=input_fasta_file[0:-3] # with gzip.open(input_fasta_file, 'rb') as f_in: # with open(gunzipped_fasta_file, 'wb') as f_out: # shutil.copyfileobj(f_in, f_out) token = os.environ.get('KB_AUTH_TOKEN') logger.info("Attempting Assembly save for %s" % (assembly["assembly_id"])) aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': gunzipped_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) logger.info("Assembly saved for %s" % (assembly["name"])) #Remove gunzipped file #os.remove(input_fasta_file[0:-3]) genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy UserMeta = dict() UserMeta['Taxonomy'] = taxonomy UserMeta['Source'] = source UserMeta['Domain'] = "Eukaryota" UserMeta['Source ID'] = core_genome_name UserMeta['Name'] = scientific_name UserMeta['Genetic code'] = 1 UserMeta['GC content'] = assembly["gc_content"] UserMeta['Size'] = assembly["dna_size"] UserMeta['Number contigs'] = assembly['num_contigs'] #id_source_version_array = core_genome_name.split("_") #version = "_".join(id_source_version_array[2:]) #UserMeta['Version']=version #UserMeta['url']=''; if (gff_handle_ref == None): token = os.environ.get('KB_AUTH_TOKEN') file_upload = dfUtil.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = file_upload['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref # genome_string = simplejson.dumps(genome, sort_keys=True, indent=4, ensure_ascii=False) # genome_file = open("Bulk_Phytozome_Upload/"+core_genome_name+'.json', 'w+') # genome_file.write(genome_string) # genome_file.close() logger.info("Attempting Genome save for %s" % (core_genome_name)) workspace_id = dfUtil.ws_name_to_id(workspace_name) genome_info = dfUtil.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": genome }] })[0] logger.info("Genome saved for %s" % (core_genome_name)) return {'genome_info': genome_info, 'report_string': ""}