def _get_cached_index(self, assembly_info, validated_params): try: # note: list_reference_objects does not yet support reference paths, so we need to call # with the direct reference. So we won't get a cache hit if you don't have direct access # to the assembly object right now (although you can still always build the assembly object) # Once this call supports paths, this should be changed to set ref = assembly_info['ref'] info = assembly_info['info'] ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]) objs = self.ws.list_referencing_objects([{'ref': ref}])[0] # iterate through each of the objects that reference the assembly bwa_indexes = [] for o in objs: if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'): bwa_indexes.append(o) # Nothing refs this assembly, so cache miss if len(bwa_indexes) == 0: return False # if there is more than one hit, get the most recent one # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that) bwa_indexes.sort(key=lambda x: x[3]) bwa_index_info = bwa_indexes[-1] index_ref = str(bwa_index_info[6]) + '/' + str(bwa_index_info[0]) + '/' + str(bwa_index_info[4]) # get the object data index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data'] # download the handle object os.makedirs(validated_params['output_dir']) dfu = DataFileUtil(self.callback_url) dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'), 'handle_id': index_obj_data['handle']['hid'], 'unpack': 'unpack'}) print('Cache hit: ') pprint(index_obj_data) return {'output_dir': validated_params['output_dir'], 'index_files_basename': index_obj_data['index_files_basename']} except Exception: # if we fail in saving the cached object, don't worry print('WARNING: exception encountered when trying to lookup in cache:') print(traceback.format_exc()) print('END WARNING: exception encountered when trying to lookup in cache.') return None
class VariationToVCF:
    """Exports KBaseGwasData.Variations objects as VCF files."""

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    @staticmethod
    def is_gz_file(filepath):
        """Return True if the file at ``filepath`` starts with the gzip magic bytes.

        Fix: this was declared without ``self`` and without ``@staticmethod``, so
        calling it through an instance would have bound the instance to ``filepath``.
        """
        with open(filepath, 'rb') as test_f:
            return binascii.hexlify(test_f.read(2)) == b'1f8b'

    def export_as_vcf(self, params):
        """Package the VCF for a Variation object for download.

        params: dict with 'input_var_ref' (workspace reference).
        Returns: dict with 'shock_id' of the packaged download.
        Raises: ValueError when 'input_var_ref' is missing.
        """
        if 'input_var_ref' not in params:
            raise ValueError('Cannot export Variation- no input_var_ref field defined.')

        file = self.variation_to_vcf({'variation_ref': params['input_var_ref']})

        export_dir = os.path.join(self.scratch, file['variation_name'])
        os.makedirs(export_dir)

        try:
            shutil.move(file['path'],
                        os.path.join(export_dir, os.path.basename(file['path'])))
        except shutil.Error as e:
            # NOTE(review): exiting the process on a move failure is drastic for
            # library code; kept for behavioral compatibility — consider re-raising.
            exit(e)

        dfupkg = self.dfu.package_for_download({
            'file_path': export_dir,
            'ws_refs': [params['input_var_ref']]
        })

        return {'shock_id': dfupkg['shock_id']}

    def variation_to_vcf(self, params):
        """Download a Variation object's VCF to scratch.

        Returns: dict with 'path' (downloaded file) and 'variation_name'.
        Raises: ValueError for a missing ref or an unsupported workspace type.
        """
        self.validate_params(params)

        print('downloading ws object data: ' + params["variation_ref"])

        variation_obj = self.dfu.get_objects(
            {'object_refs': [params['variation_ref']]})['data'][0]
        ws_type = variation_obj['info'][2]
        obj_name = variation_obj['info'][1]

        if 'KBaseGwasData.Variations' in ws_type:
            dl_path = self.process_vcf(self.scratch, variation_obj['data'])
        else:
            raise ValueError('Cannot write data to VCF; invalid WS type (' +
                             ws_type + '). Supported types is KBaseGwasData.Variations')

        return {'path': dl_path, 'variation_name': obj_name}

    def process_vcf(self, output_vcf_file_path, data):
        """Fetch the VCF referenced by the object's handle into the given path."""
        obj = self.dfu.shock_to_file({
            'handle_id': data['vcf_handle_ref'],
            'file_path': output_vcf_file_path,
        })
        return obj['file_path']

    def validate_params(self, params):
        """Raise ValueError unless every required key is present."""
        for key in ['variation_ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
def get_annotated_metagenome_assembly_features(self, params):
    """
    params:
        ref - workspace reference for KBaseMetagenomes.AnnotatedMetagenomeAssembly object
        feature_type - (optional) restrict output to one feature type
        only_ids - (optional) truthy: return only the 'id' field of each feature
    output:
        features - list of features, each representing a dict.
    """
    ref = params['ref']
    self._confirm_ws_type(ref)

    # only pull the handle to the (potentially large) features file
    ret = self.ws.get_objects2(
        {"objects": [{
            "ref": ref,
            "included": ["features_handle_ref"]
        }]})['data']
    features_handle_ref = ret[0]['data']['features_handle_ref']

    # download and uncompress the features JSON from shock
    dfu = DataFileUtil(self.cb_url, token=self.token)
    file_name = 'features.json.gz'
    file_path = os.path.join(self.scratch, file_name)
    shock_ret = dfu.shock_to_file({
        'handle_id': features_handle_ref,
        'file_path': file_path,
        'unpack': "uncompress"
    })
    file_path = shock_ret['file_path']

    with open(file_path) as fd:
        json_features = json.load(fd)

    if params.get('feature_type'):
        accepted_feature_types = [
            "cds", "gene", "mrna", "trna", "rrna", "repeat_region"
        ]
        feat_type = params['feature_type']
        if feat_type.lower() not in accepted_feature_types:
            # fix: second literal was missing the f prefix, so the accepted list
            # was never interpolated into the error message
            raise ValueError(
                f"{feat_type} not an accepted feature type; accepted feature"
                f" types (in lower case) are {accepted_feature_types}")
        json_features = [
            feature for feature in json_features
            if feature['type'].lower() == feat_type.lower()
        ]

    if params.get('only_ids'):
        json_features = [{
            'id': feature['id']
        } for feature in json_features]

    return {'features': json_features}
class AssemblyToFasta:
    """Downloads KBase Assembly / legacy ContigSet objects as FASTA files."""

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        """Used almost exclusively for download only.

        params: dict with 'input_ref' (workspace reference).
        Returns: dict with 'shock_id' of the packaged download.
        """
        # validate parameters
        if 'input_ref' not in params:
            # fix: message previously read "not input_ref field defined"
            raise ValueError(
                'Cannot export Assembly- no input_ref field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        """Main function that accepts a ref to an object and writes a FASTA file.

        Returns: dict with 'path' to the file and 'assembly_name'.
        Raises: ValueError for a missing ref or an unsupported workspace type.
        """
        self.validate_params(params)

        print(f'downloading ws object data ({ params["ref"]})')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])
        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                '). Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        """Generates SeqRecords iterator for writing from a legacy contigset object."""
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            # NOTE(review): SingleLetterAlphabet is passed as a class, matching the
            # legacy Biopython alphabet API — verify against the pinned Biopython
            # version before upgrading, since alphabets were removed in 1.78.
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        """Write a legacy ContigSet's contigs out as FASTA."""
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        """Fetch an Assembly's FASTA from shock via its handle, uncompressed."""
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        """Raise ValueError unless every required key is present."""
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
class GenbankToGenome:
    """Uploader that converts a Genbank-format file into a KBase Genome object.

    Parses contigs/features with Biopython, saves the sequence as an Assembly
    (or validates a supplied one), resolves gene/mRNA/CDS parent-child
    relationships, collects ontology terms and data-quality warnings, and saves
    the resulting Genome via GenomeInterface.
    """

    def __init__(self, config):
        # service clients
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        # timestamp used to tag ontology events created during this run
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        # module version scraped from the SDK's kbase.yml
        yml_text = open('/kb/module/kbase.yml').read()
        self.version = re.search("module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        # flags populated from params in refactored_import
        self.generate_parents = False
        self.generate_ids = False
        # per-run accumulators for parsed features (OrderedDict preserves file order)
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()
        # NOTE(review): attribute names "skiped_features" / "features_spaning_zero"
        # are misspelled but kept — renaming would break any external users.
        self.skiped_features = Counter()
        self.feature_counts = Counter()
        # counts of features whose parent gene could not be found, per type
        self.orphan_types = Counter()
        # contig id -> upper-cased sequence, filled by _save_assembly
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spaning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        # data-quality defect counters folded into warnings in parse_genbank
        self.defects = Counter()
        self.spoofed_genes = 0
        # feature types that are never imported
        self.excluded_features = ('source', 'exon', 'fasta_record')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        # NCBI translation table; overridden by params or the genome record
        self.code_table = 11
        self.re_api_url = config.re_api_url
        # dict with feature 'id's that have been used more than once.
        self.used_twice_identifiers = {}
        self.default_params = {
            'source': 'Genbank',
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'],
            'ontology_wsname': self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name':
            self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name':
            self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release': None,
            'genetic_code': 11,
            'generate_ids_if_needed': 0,
            'metadata': {}
        }

    @property
    def messages(self):
        # accumulated log messages joined into one printable string
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        """Top-level entry point: stage input, parse it, and save the Genome.

        Returns a dict with 'genome_ref' and 'genome_info'.
        """
        # 1) validate parameters and extract defaults
        self.validate_params(params)
        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)
        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']
        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}"
        logging.info(f"Genome saved to {ref}")
        # 5) clear the temp directory
        shutil.rmtree(input_directory)
        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}
        return details

    @staticmethod
    def validate_params(params):
        """Raise ValueError unless required params are present and well-formed."""
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')
        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        sources = ('path', 'shock_id', 'ftp_url')
        n_valid_fields = sum(1 for f in sources if file.get(f))
        if n_valid_fields < 1:
            raise ValueError(f'required "file" field must include one source: '
                             f'{", ".join(sources)}')
        if n_valid_fields > 1:
            raise ValueError(
                f'required "file" field has too many sources specified: '
                f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(f"Invalid genetic code specified: {params}")

    def stage_input(self, params):
        """
        Setup the input_directory by fetching the files and uncompressing if needed.
        """
        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}')
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it. If it
        # is just sitting there, then use it. Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if file.get('path') is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(
                input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)
        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            logging.info(
                f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}'
            )
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)
        if 'ftp_url' in file and file['ftp_url'] is not None:
            logging.info('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url': file['ftp_url'],
                'download_type': 'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(
                input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            logging.info("staged input file =" + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})
        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        """Parse the consolidated Genbank file into a Genome dict ready to save."""
        logging.info("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])
        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            # record-level date, used for external_source_origination_date below
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once(on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                # genetic_code is expected to have been set on the genome by the
                # taxon-data helpers above
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        # flag the genome as suspect when more than 2% of CDS translations mismatch
        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'],
                    len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert genbank file to fasta and save as assembly.

        When params['use_existing_assembly'] is given, validates that assembly's
        contigs/md5s against the file instead of saving a new one.
        """
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            # record circularity and remember each contig's sequence for later
            # feature extraction
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match("\d+\/\d+\/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")

            ret = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            # every contig in the file must exist in the supplied assembly with
            # a matching sequence md5
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))

            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref

        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {
                'path': fasta_file
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_id,
            'type': params.get('genome_type', 'isolate'),
            'contig_info': extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref

    def _find_input_files(self, input_directory):
        """Return paths of all Genbank-suffixed files in the staging directory."""
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """Applies strip to each line of each input file.

        Args:
            input_files: Paths to input files in Genbank format.

        Returns:
            Path to resulting file (currently it's the same file as input).
        """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break
        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            # NOTE(review): non-gene features fall through to reuse the current
            # gene orphan counter without incrementing it — confirm this id
            # sharing with the parent gene is intentional.
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"
        return _id

    def _parse_features(self, record, source):
        """Parse all features of one contig record into the per-run accumulators."""

        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    # Biopython locations are 0-based; KBase is 1-based
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc

        def _warn(message):
            # attach a warning to the current feature, deduplicated
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            # a location whose parts are out of order is suspect unless it is
            # flagged trans_splicing or wraps the origin of a circular contig
            if 'trans_splicing' in out_feat.get('flags', []):
                return
            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return
            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return
            if parent and parent['id'] in self.features_spaning_zero:
                return
            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)
            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)
            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)
            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            # first use of this ontology in the run: register a new event
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # classify each db_xref by its prefix into a known ontology or a plain xref
        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        """Unfortunately, Genbank files don't have a parent ID and the features can
        be out of order at times. To account for this, this function works
        backwards from the end of list of IDs and stops when it finds a parent
        with valid coordinates or it hits the maximum number of tries"""
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique modifier
        to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        """Register a gene feature, de-duplicating its id if already used."""
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        """Attach a non-coding feature to its parent gene (if any) and return it."""
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        """Register an mRNA feature, linking it to (or spoofing) its parent gene."""
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA",
                 str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        """Register a CDS: link to parent gene/mRNA and validate its translation."""
        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            # spoof a parent gene from the CDS itself
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        if not prot_seq:
            # no translation supplied in the file: compute one ourselves
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation": prot_seq,
            "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length": len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings

        self.cdss[out_feat['id']] = out_feat
class AttributesUtil:
    """Builds, extends and exports KBaseExperiments.AttributeMapping objects
    from user-supplied TSV/Excel/ISA files, using the KBase data services.
    """

    def __init__(self, config):
        # Service endpoints / credentials come from the deploy config
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        # Fallbacks/delimiters used when a term has no recognized ontology
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present.
        Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set.

        Accepts either 'input_file_path' (local) or 'input_shock_id' and
        saves the parsed AttributeMapping; returns its workspace ref.
        """
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """
        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]
        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            # default: suffix the old name with a timestamp to keep it unique
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id": output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):
        """Append a staged attribute file to a matrix's row/col attribute
        mapping, re-save the matrix pointing at the new mapping, and create a
        report. Returns the new refs and report name/ref."""
        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        # workspace_name may already be a numeric workspace id
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):
        """Merge appended attributes/instances into a copy of the old mapping.

        Raises ValueError on duplicate attribute names or on instances present
        in the old mapping but missing from the appended one.
        """
        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping misses [{}] instances'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts a compound set object data to a dataframe
        """
        attributes = pd.DataFrame(data['attributes'])
        # BUG FIX: DataFrame.rename returns a new frame; the result was
        # previously discarded, so the renaming never took effect.
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").capitalize().
            replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]
        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            # BUG FIX: previously raised the literal string "err_msg"
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        """Parse an Excel/CSV file into an AttributeMapping dict, dispatching
        on whether the file uses the simple or the ISA layout."""
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')

        print(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        """Convert an ISA-format dataframe to an AttributeMapping dict."""
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        # pick an ID column: 'Sample Name', then 'Assay Name', then column 0,
        # requiring the chosen column to be duplicate-free
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            # BUG FIX: corrected typo "unigue" -> "unique"
            raise ValueError(
                "Unable to detect an ID column that was unique for each row. "
                f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}"
            )
        # BUG FIX: iteritems() is deprecated/removed in modern pandas
        self._validate_attribute_values(isa_df.items())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        """Run the per-attribute validators (if any) and raise with a summary
        if any attribute fails."""
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                # no validator defined for this attribute: nothing to check
                continue

        if errors:
            for attr, attr_errors in errors.items():
                # BUG FIX: the missing "+" made the f-string act as the join
                # separator instead of a message prefix
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\n See the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        """Extract attribute descriptors (with units/ontology links) from an
        ISA dataframe. Returns (attributes, updated skip_columns)."""
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term Accession is a web link only grab the last bit
                # Similarly, sometimes the number is prefixed with the term
                # source e.x. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
            if '_unit_ont' in attribute:
                unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                    isa_df[attribute.pop('_unit_accession')], ":").unique()
                # BUG FIX: previously tested len(units), which belongs to the
                # '_unit' branch above (and is unbound when '_unit' is absent)
                if len(unit_ont) > 1:
                    raise ValueError(
                        "More than one unit ontology is supplied for attribute "
                        "{}: {}".format(attribute['attribute'], unit_ont))
                attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Test to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied
        attributes and units. Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            # BUG FIX: previously checked 'attribute_ont_id' here (copy/paste
            # error), which could KeyError on attribute['unit_ont_id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`
        # NOTE(review): ExcelWriter.save() is deprecated in pandas >= 1.5 in
        # favour of close(); kept for compatibility with the deployed pandas.
        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
class FastaGFFToGenome:
    """Importer that combines a FASTA assembly and a GFF annotation file into
    a KBaseGenomes.Genome (or AnnotatedMetagenomeAssembly) object.
    """

    def __init__(self, config):
        # config is the module-wide SDK config object (attribute access,
        # plus a raw dict under .raw) — not a plain dict
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        # timestamp used to tag ontology events created during this run
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        # pull the deployed module version out of kbase.yml (None if absent)
        yml_text = open('/kb/module/kbase.yml').read()
        mod_match = re.search(r'module-version:\n\W+(.+)\n', yml_text)
        if mod_match:
            self.version = mod_match.group(1)
        else:
            self.version = None
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11  # NCBI translation table; overridden per import
        # GFF feature types that are never imported as standalone features
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region', 'chromosome',
                           'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.is_metagenome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []  # type: list
        self.feature_dict = collections.OrderedDict()  # type: dict
        self.cdss = set()  # type: set
        self.ontologies_present = collections.defaultdict(dict)  # type: dict
        self.ontology_events = list()  # type: list
        self.skiped_features = collections.Counter(
        )  # type: collections.Counter
        self.feature_counts = collections.Counter(
        )  # type: collections.Counter
        self.re_api_url = config.re_api_url

    def warn(self, message):
        # Collect non-fatal problems; attached to the genome object on save.
        self.warnings.append(message)

    def generate_genome_json(self, params):
        """Stage the input files and build the genome dict (not yet saved).

        Returns (genome dict, staging directory path); the caller is
        responsible for removing the staging directory.
        """
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)
        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)
        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(params, file_paths["gff_file"],
                                       file_paths["fasta_file"])

        return genome, input_directory

    def import_file(self, params):
        """Import a FASTA+GFF pair and save the resulting genome object.

        Returns {'genome_ref', 'genome_info'}; when params['is_metagenome']
        is set the keys are prefixed with 'meta'.
        """
        self.is_metagenome = params.get('is_metagenome', False)
        if self.is_metagenome:
            ws_datatype = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
        else:
            ws_datatype = "KBaseGenomes.Genome"

        genome, input_directory = self.generate_genome_json(params)

        # dump a copy of the genome JSON for debugging / provenance
        json.dump(genome,
                  open(f"{self.cfg.sharedFolder}/{genome['id']}.json", 'w'),
                  indent=4)
        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
            'workspace_datatype': ws_datatype,
        })
        feature_types = "\n".join(
            [f"{k}: {v}" for k, v in genome['feature_counts'].items()])
        report_string = (
            f"A genome with {len(genome['contig_ids'])} contigs and the following feature "
            f"types was imported: \n{feature_types}")
        # XXX report_string is unused except for this log
        logging.info(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        prefix = ''
        if self.is_metagenome:
            prefix = 'meta'
        details = {
            prefix + 'genome_ref': f'{info[6]}/{info[0]}/{info[4]}',
            prefix + 'genome_info': info
        }

        return details
    def _gen_genome_json(self, params, input_gff_file, input_fasta_file):
        """Parse the GFF + FASTA pair into a genome dict.

        Saves (or validates) the assembly as a side effect and returns the
        assembled genome dict ready for saving.
        """
        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            # NOTE(review): molecule_type is only bound if the FASTA has at
            # least one record; an empty FASTA would raise NameError below.
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        # any GFF contig not present in the FASTA is a warning (fatal when
        # strict mode is on)
        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn(
                f"Sequence name {cid} does not match a sequence id in the FASTA file."
                f"{len(features_by_contig[cid])} features will not be imported."
            )
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        prot_fasta_path = f"{self.cfg.sharedFolder}/{params['genome_name']}_protein.fasta"
        # if is a metagenome, the following function writes a protein fasta
        self._process_cdss(prot_fasta_path)

        # save assembly file
        '''
        Metagenome Changes:
            if we want to pass more stuff to AssemblyUtil, do here.
        TODO: add flag to save_assembly_from_fasta
        '''
        if self.is_metagenome:
            genome_type = "metagenome"
        else:
            genome_type = params.get('genome_type', 'isolate')
        if params.get('existing_assembly_ref'):
            # reuse a pre-saved assembly instead of saving the FASTA again
            assembly_ref = params['existing_assembly_ref']

            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]

            assembly_obj_type = ret['info'][2].split('-')[0]
            valid_assembly_types = [
                "KBaseGenomeAnnotations.Assembly", "KBaseGenomes.ContigSet"
            ]
            if assembly_obj_type not in valid_assembly_types:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            assembly_data = ret['data']
            # should do more thorough check of sequences.
            if not validate_lists_have_same_elements(
                    assembly_data['contigs'].keys(), contig_ids):
                raise ValueError(
                    f"provided assembly with ref {assembly_ref} does not "
                    "have matching contig ids to provided input fasta.")

            logging.info(f"Using supplied assembly: {assembly_ref}")
        else:
            assembly_ref = self.au.save_assembly_from_fasta({
                'file': {
                    'path': input_fasta_file
                },
                'workspace_name': params['workspace_name'],
                'assembly_name': params['genome_name'] + ".assembly",
                'type': genome_type,
            })
            assembly_data = self.dfu.get_objects({
                'object_refs': [assembly_ref],
                'ignore_errors': 0
            })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(assembly_ref, assembly_data,
                                       input_gff_file, molecule_type,
                                       prot_fasta_path, params)

        if self.spoof_gene_count > 0:
            self.warn(warnings['spoofed_genome'].format(self.spoof_gene_count))
            genome['suspect'] = 1
        if self.warnings:
            genome['warnings'] = self.warnings

        return genome

    @staticmethod
    def _location(in_feature):
        # Normalise the strand value ("-1"/"+1" etc. via strand_table) and
        # emit a KBase location: [contig, start, strand, length]; for the
        # minus strand the location anchor is the feature's end coordinate.
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'], start, in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
            validates params passed to FastaGFFToGenome.import_file method

        Requires workspace_name/genome_name plus 'fasta_file' and 'gff_file'
        dicts, each with exactly one of 'path' or 'shock_id'; the optional
        genetic_code must be an int in 1..31.
        """
        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(f'Required "{key}" field must be a map/dict')
            sources = ('path', 'shock_id')
            n_valid_fields = sum(1 for f in sources if file.get(f))
            print(f"inputs: {n_valid_fields}")
            if n_valid_fields < 1:
                raise ValueError(
                    f'Required "{key}" field must include one source: '
                    f'{", ".join(sources)}')
            if n_valid_fields > 1:
                raise ValueError(
                    f'Required "{key}" field has too many sources specified: '
                    f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))
print(f"inputs: {n_valid_fields}") if n_valid_fields < 1: raise ValueError( f'Required "{key}" field must include one source: ' f'{", ".join(sources)}') if n_valid_fields > 1: raise ValueError( f'Required "{key}" field has too many sources specified: ' f'{", ".join(file.keys())}') if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError( "Invalid genetic code specified: {}".format(params)) def _set_parsed_params(self, params): logging.info('Setting params') default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'source': 'User', 'release': None, 'metadata': {}, 'source_id': 'unknown', } default_params.update(params) logging.info(json.dumps(default_params, indent=1)) return default_params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None ''' below seems like weird if statement ''' if file.get('path') is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) logging.info( f'Moving file from {local_file_path} to {file_path}') # Metagenome Updates # not sure if we have to be careful about moving the objects # around if os.path.isfile(local_file_path): shutil.copy2(local_file_path, file_path) else: raise FileNotFoundError( f"Input {key} file {local_file_path} not found") err_msg = "Shutil copy unsucessful" elif file.get('shock_id') is not None: # handle shock file logging.info(f'Downloading file from SHOCK node: ' f'{self.cfg.sharedFolder}-{file["shock_id"]}') sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) err_msg = "Shock retrieval" # extract the file 
if it is compressed ''' Metagenome Changes: may have to make check here to see if the the file is too big for working dir. ''' if file_path is not None: logging.info("staged input file =" + file_path) sys.stdout.flush() if not os.path.isfile(file_path): raise FileNotFoundError(f"{file_path} not a file") dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] err_msg = "DataFielUtil 'unpack_file' function call" else: raise ValueError( 'No valid files could be extracted based on the input') if not os.path.isfile(file_path): raise ValueError(f"{err_msg} for {key} file to {file_path}") return file_paths def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ logging.info("Reading GFF file") feature_list = collections.defaultdict(list) # type: dict is_patric = 0 ''' Metagenome Changes: the lines below iterate through the entire gff input file, which for a Metagenome may be an issue. ! Only a problem if there are space limits on processing in this request ''' for current_line in open(input_gff_file): if current_line.isspace( ) or current_line == "" or current_line.startswith("#"): continue # Split line try: (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') except ValueError: raise ValueError(f"unable to parse {current_line}") ''' Do Metagenomes need this phytozome/PATRIC stuff??''' # Checking to see if Phytozome if "phytozome" in source_id.lower(): self.is_phytozome = True # Checking to see if Phytozome if "PATRIC" in source_id: is_patric = True # PATRIC prepends their contig ids with some gibberish if is_patric and "|" in contig_id: contig_id = contig_id.split("|", 1)[1] # Populating basic feature object ftr: dict = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': collections.defaultdict(list) } # 
Populating with attribute key-value pair # This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() # Sometimes empty string if not attribute: continue # Use of 1 to limit split as '=' character can also be made available later # Sometimes lack of "=", assume spaces instead if "=" in attribute: key, value = attribute.split("=", 1) elif " " in attribute: key, value = attribute.split(" ", 1) else: logging.debug(f'Unable to parse {attribute}') continue ftr['attributes'][make_snake_case(key)].append( parse.unquote(value.strip('"'))) ftr['attributes']['raw'] = attributes if "id" in ftr['attributes']: ftr['ID'] = ftr['attributes']['id'][0] if "parent" in ftr['attributes']: ftr['Parent'] = ftr['attributes']['parent'][0] feature_list[contig_id].append(ftr) # Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) # Most bacterial files have only CDSs # In order to work with prokaryotic and eukaryotic gene structure synonymously # Here we add feature dictionaries representing the parent gene and mRNAs # feature_list = self._add_missing_parents(feature_list) # Phytozome has the annoying habit of editing their identifiers so we fix them if self.is_phytozome: self._update_phytozome_features(feature_list) # All identifiers need to be checked so that they follow the same general rules # Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) return feature_list def _add_missing_identifiers(self, feature_list): logging.info("Adding missing identifiers") # General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list: for i, feat in enumerate(feature_list[contig]): if "ID" not in feature_list[contig][i]: # all of the following are not guaranteed to be unique ID's # for key in ("transcriptid", "proteinid", "pacid", # "parent", "name", 'transcript_id'): for key in ("protein_id", 
"name", "pacid", "parent"): if key in feature_list[contig][i]['attributes']: feature_list[contig][i]['ID'] = feature_list[ contig][i]['attributes'][key][0] break if feat['type'] not in self.skip_types: self.feature_counts[feat['type']] += 1 # If the process fails, throw an error if "ID" not in feature_list[contig][i]: feat[ 'ID'] = f"{feat['type']}_{self.feature_counts[feat['type']]}" return feature_list def _add_missing_parents(self, feature_list): # General rules is if CDS or RNA missing parent, add them for contig in feature_list: ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ftrs[i]["type"] in self.skip_types: continue if "Parent" not in ftrs[i]: # Assuming parent doesn't exist at all, so create de novo instead of trying to find it if "RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]: new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if "CDS" in ftrs[i]["type"]: new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list @staticmethod def _update_phytozome_features(feature_list): # General rule is to use the "Name" field where possible # And update parent attribute correspondingly for contig in feature_list: feature_position_dict = {} for i in range(len(feature_list[contig])): # Maintain old_id for reference # Sometimes ID isn't available, so use PACid old_id = None for key in ("id", "pacid"): if key in feature_list[contig][i]['attributes']: old_id = feature_list[contig][i]['attributes'][key][0] break if old_id is None: continue # Retain old_id feature_position_dict[old_id] = i # Clip off the increment on CDS IDs so fragments of the same # CDS share the same ID if "CDS" in feature_list[contig][i]["ID"]: feature_list[contig][i]["ID"] = feature_list[contig][i][ "ID"].rsplit('.', 1)[0] # In Phytozome, 
gene and mRNA have "Name" field, CDS do not if "name" in feature_list[contig][i]['attributes']: feature_list[contig][i]["ID"] = feature_list[contig][i][ 'attributes']['name'][0] if "Parent" in feature_list[contig][i]: # Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): # General rules: # 1) Genes keep identifier # 2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" # 3) CDS always uses RNA identifier with ".CDS" appended mRNA_parent_dict = dict() for contig in feature_list: for ftr in feature_list[contig]: if ftr["type"] in self.skip_types: continue if "Parent" in ftr: # Retain old_id of parents old_id = ftr["ID"] if ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]: ftr["ID"] = ftr["Parent"] + "." + ftr["type"] # link old to new ids for mRNA to use with CDS if "RNA" in ftr["type"]: mRNA_parent_dict[old_id] = ftr["ID"] return feature_list def _check_location_order(self, locations): """If order looks good return None. 
If out of order return warning If on multiple strands return warning""" strand = None last_start = 0 for location in locations: if strand is None: strand = location[2] elif strand != location[2]: return warnings["both_strand_coordinates"] if strand == "-": locations = reversed(locations) for location in locations: if last_start > location[1]: return warnings["out_of_order"] else: last_start = location[1] return None def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError( "{} is not a supported ontology".format(ontology_type)) if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = collections.defaultdict(dict) # type: dict db_xrefs = [] # these are keys are formatted strangely and require special parsing for key in ("go_process", "go_function", "go_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') # CATH terms are not distinct from EC numbers so myst be found by key for term in feature.get('cath_funfam', []) + feature.get('cath', []): for ref in term.split(','): 
ontology['CATH'][ref] = [self._create_ontology_event("CATH")] self.ontologies_present['CATH'][ref] = self.ont_mappings[ 'CATH'].get(ref, '') search_keys = [ 'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam', 'pfam', 'cog', 'go', 'po', 'ko' ] ont_terms = [] # type: list # flatten out into list of values for key in search_keys: if key in feature: ont_terms += [x for y in feature[key] for x in y.split(',')] for ref in ont_terms: if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') elif ":" not in ref: db_xrefs.append(tuple(["Unknown_Source", ref])) else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), db_xrefs ''' Metagenome Changes: okay looks like this might be the real meat of it ''' def _transform_feature(self, contig, in_feature): """Converts a feature from the gff ftr format into the appropriate format for a genome object """ def _aliases(feat): keys = ('locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'ec_number', 'gene_synonym') alias_list = [] for key in keys: if key in 
feat['attributes']: alias_list.extend([(key, val) for val in feat['attributes'][key]]) return alias_list if in_feature['start'] < 1 or in_feature['end'] > len(contig): self.warn( f"Feature with invalid location for specified contig: {in_feature}" ) if self.strict: raise ValueError( "Features must be completely contained within the Contig in the " f"Fasta file. Feature: in_feature") return feat_seq = contig.seq[in_feature['start'] - 1:in_feature['end']].upper() if in_feature['strand'] in {'-', '-1'}: feat_seq = feat_seq.reverse_complement() # if the feature ID is duplicated (CDS or transpliced gene) we only # need to update the location and dna_sequence if in_feature.get('ID') in self.feature_dict: existing = self.feature_dict[in_feature['ID']] existing['location'].append(self._location(in_feature)) existing['dna_sequence'] = existing.get('dna_sequence', '') + str(feat_seq) existing['dna_sequence_length'] = len(existing['dna_sequence']) return # The following is common to all the feature types out_feat = { "id": in_feature.get('ID'), "type": in_feature['type'], "location": [self._location(in_feature)], "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), "warnings": [], "flags": [], } # add optional fields if 'note' in in_feature['attributes']: out_feat['note'] = in_feature['attributes']["note"][0] ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes']) if ont: out_feat['ontology_terms'] = ont aliases = _aliases(in_feature) if aliases: out_feat['aliases'] = aliases if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'product' in in_feature['attributes']: out_feat['functions'] = in_feature['attributes']["product"] if 'product_name' in in_feature['attributes']: if "functions" in out_feat: out_feat['functions'].extend( in_feature['attributes']["product_name"]) else: out_feat['functions'] = in_feature['attributes'][ "product_name"] if 'function' in in_feature['attributes']: 
out_feat['functional_descriptions'] = in_feature['attributes'][ "function"] if 'inference' in in_feature['attributes']: GenomeUtils.parse_inferences(in_feature['attributes']['inference']) if 'trans-splicing' in in_feature['attributes'].get('exception', []): out_feat['flags'].append('trans_splicing') if 'pseudo' in in_feature['attributes'].get('exception', []): out_feat['flags'].append('pseudo') if 'ribosomal-slippage' in in_feature['attributes'].get( 'exception', []): out_feat['flags'].append('ribosomal_slippage') parent_id = in_feature.get('Parent', '') if parent_id and parent_id not in self.feature_dict: raise ValueError( f"Parent ID: {parent_id} was not found in feature ID list.") # if the feature is a exon or UTR, it will only be used to update the # location and sequence of it's parent, we add the info to it parent # feature but not the feature dict if in_feature['type'] in self.skip_types: if parent_id and in_feature['type'] in { 'exon', 'five_prime_UTR', 'three_prime_UTR' }: parent = self.feature_dict[parent_id] if in_feature['type'] not in parent: parent[in_feature['type']] = [] parent[in_feature['type']].append(out_feat) return # add type specific features elif 'gene' in in_feature['type']: out_feat['protein_translation_length'] = 0 out_feat['cdss'] = [] elif in_feature['type'] == 'CDS': if parent_id: parent = self.feature_dict[parent_id] if 'cdss' in parent: # parent must be a gene if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "genes_CDS_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "CDS_fail_child_of_gene_coordinate_validation"]. format(parent_id)) parent['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id else: # parent must be mRNA if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["mRNA_fail_parent_coordinate_validation"]. 
format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "CDS_fail_child_of_mRNA_coordinate_validation"]. format(parent_id)) parent['cds'] = in_feature['ID'] out_feat['parent_mrna'] = parent_id parent_gene = self.feature_dict[parent['parent_gene']] parent_gene['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent['parent_gene'] # keep track of CDSs for post processing self.cdss.add(out_feat['id']) elif in_feature['type'] == 'mRNA': if parent_id: parent = self.feature_dict[parent_id] if 'mrnas' not in parent: parent['mrnas'] = [] if 'cdss' in parent: # parent must be a gene parent['mrnas'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["genes_mRNA_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append( warnings["mRNAs_parent_gene_fails_location_validation"] .format(parent_id)) else: out_feat["type"] = in_feature['type'] # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] if parent_id: parent = self.feature_dict[parent_id] if 'children' not in parent: parent['children'] = [] parent['children'].append(out_feat['id']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "generic_parents_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "generic_childs_parent_fails_location_validation"]. 
format(parent_id)) # cleanup empty optional arrays for key in ['warnings', 'flags']: if not out_feat[key]: del out_feat[key] self.feature_dict[out_feat['id']] = out_feat def _process_cdss(self, prot_fasta_path): """Because CDSs can have multiple fragments, it's necessary to go back over them to calculate a final protein sequence""" if self.is_metagenome: prot_fasta = {} # type: dict untranslatable_prot = set() for cds_id in self.cdss: cds = self.feature_dict[cds_id] try: prot_seq = str( Seq(cds['dna_sequence']).translate(self.code_table, cds=True).strip("*")) except TranslationError as e: cds['warnings'] = cds.get('warnings', []) + [str(e)] # NOTE: we may need a different way of handling this for metagenomes. prot_seq = "" if self.is_metagenome: untranslatable_prot.add(cds_id) if self.is_metagenome: if prot_seq != "": protein_id = "" if cds.get("aliases"): aliases = cds['aliases'] for key, val in aliases: if key == "protein_id": protein_id = val if not protein_id: protein_id = cds['id'] # assign to some default else: # log a warning here? pass # TODO: update header to reflect what we actually want people # to see. 
if protein_id in prot_fasta: prot_fasta[protein_id][0] += "|" + cds['id'] else: fasta_seq_data = ">" + protein_id + " cds_ids:" + cds[ 'id'] prot_fasta[protein_id] = [fasta_seq_data, prot_seq] else: pass else: cds.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if 'parent_gene' in cds: parent_gene = self.feature_dict[cds['parent_gene']] # no propigation for now propagate_cds_props_to_gene(cds, parent_gene, self.is_metagenome) elif self.generate_genes: spoof = copy.copy(cds) spoof['type'] = 'gene' spoof['id'] = cds['id'] + "_gene" spoof['cdss'] = [cds['id']] spoof['warnings'] = [ warnings['spoofed_gene'].format(cds['id']) ] self.feature_dict[spoof['id']] = spoof cds['parent_gene'] = spoof['id'] self.spoof_gene_count += 1 else: raise ValueError(warnings['no_spoof']) self.feature_dict[cds['id']] = cds if self.is_metagenome: with open(prot_fasta_path, 'w') as fid: for key, line in prot_fasta.items(): fid.write('\n'.join(line)) # do something with 'untranslatable_prot' def _update_from_exons(self, feature): """This function updates the sequence and location of a feature based on it's UTRs, CDSs and exon information""" # note that start and end here are in direction of translation def start(loc): return loc[0][1] def end(loc): if loc[-1][2] == "+": return loc[-1][1] + loc[-1][3] + 1 else: return loc[-1][1] - loc[-1][3] - 1 if 'exon' in feature: # update the feature with the exon locations and sequences feature['location'] = [x['location'][0] for x in feature['exon']] feature['dna_sequence'] = "".join(x['dna_sequence'] for x in feature['exon']) feature['dna_sequence_length'] = len(feature['dna_sequence']) # construct feature location from utrs and cdss if present elif 'cds' in feature: cds = [copy.deepcopy(self.feature_dict[feature['cds']])] locs = [] # type: list seq = "" for frag in feature.get('five_prime_UTR', []) + cds + \ feature.get('three_prime_UTR', []): # 
merge into last location if adjacent if locs and abs(end(locs) - start(frag['location'])) == 1: # extend the location length by the length of the first # location in the fragment first = frag['location'].pop(0) locs[-1][3] += first[3] locs.extend(frag['location']) seq += frag['dna_sequence'] feature['location'] = locs feature['dna_sequence'] = seq feature['dna_sequence_length'] = len(seq) # remove these properties as they are no longer needed for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']: feature.pop(x, None) else: ValueError( 'Feature {feature["id"]} must contain either exon or cds data to ' 'construct an accurate location and sequence') def _gen_genome_info(self, assembly_ref, assembly, input_gff_file, molecule_type, prot_fasta_path, params): """ _gen_genome_info: generate genome info Here is the meat of the saving operation. Genome Fields: features: protein encoding genes cdss: mrnas: mrna sequences non_coding_features: everything that doesn't fall into 'features', 'cdss', 'mrnas' """ features = [] cdss = [] mrnas = [] non_coding_features = [] genome = { "id": params.get('genome_name'), "scientific_name": params.get('scientific_name', "Unknown"), "assembly_ref": assembly_ref, 'molecule_type': molecule_type, "gc_content": assembly["gc_content"], "dna_size": assembly["dna_size"], 'md5': assembly['md5'], 'num_contigs': len(assembly['contigs']), 'ontologies_present': dict(self.ontologies_present), 'ontology_events': self.ontology_events, } if self.is_metagenome: metagenome_fields = [ ("publications", []), ("external_source_origination_date", None), ("original_source_file_name", None), ("notes", None), # NOTE: in the future environment should use an ontology. 
("environment", None), ] # type: list for field, default in metagenome_fields: genome[field] = params.get(field, default) # save protein fasta to shock prot_to_shock = self.dfu.file_to_shock({ 'file_path': prot_fasta_path, 'make_handle': 1, 'pack': 'gzip' }) genome['protein_handle_ref'] = prot_to_shock['handle']['hid'] genome['contig_ids'], genome['contig_lengths'] = zip( *[(k, v['length']) for k, v in assembly['contigs'].items()]) if self.is_metagenome: genome['source'], _ = self.gi.determine_tier(params.get('source')) else: genome['source'], genome['genome_tiers'] = self.gi.determine_tier( params.get('source')) # Set taxonomy-related fields in the genome data if params.get('taxon_id'): GenomeUtils.set_taxon_data(int(params['taxon_id']), self.re_api_url, genome) else: GenomeUtils.set_default_taxon_data(genome) # handle optional fields for key in ('release', 'genetic_code', 'genome_type', 'source_id'): if params.get(key): genome[key] = params[key] # Phytozome gff files are not compatible with the RNASeq Pipeline # so it's better to build from the object than cache the file if self.is_phytozome or self.is_metagenome: gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid'] for feature in self.feature_dict.values(): self.feature_counts[feature['type']] += 1 if 'exon' in feature or feature['type'] == 'mRNA': self._update_from_exons(feature) # Test if location order is in order. is_transpliced = "flags" in feature and "trans_splicing" in feature[ "flags"] if not is_transpliced and len(feature["location"]) > 1: # Check the order only if not trans_spliced and has more than 1 location. 
location_warning = self._check_location_order( feature["location"]) if location_warning is not None: feature["warnings"] = feature.get('warnings', []) + [location_warning] contig_len = genome["contig_lengths"][genome["contig_ids"].index( feature["location"][0][0])] feature = check_full_contig_length_or_multi_strand_feature( feature, is_transpliced, contig_len, self.skip_types) # sort features into their respective arrays if feature['type'] == 'CDS': if not self.is_metagenome: del feature['type'] cdss.append(feature) elif feature['type'] == 'mRNA': if not self.is_metagenome: del feature['type'] mrnas.append(feature) elif feature['type'] == 'gene': # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in feature: feature[key] = list(set(feature[key])) if feature['cdss']: if not self.is_metagenome: del feature['type'] self.feature_counts["protein_encoding_gene"] += 1 features.append(feature) else: feature.pop('mrnas', None) feature.pop('cdss', None) feature.pop('protein_translation_length', None) self.feature_counts["non_coding_gene"] += 1 non_coding_features.append(feature) else: non_coding_features.append(feature) # if input is metagenome, save features, cdss, non_coding_features, and # mrnas to shock if self.is_metagenome: # TODO: make this section more efficient by editing the above. 
metagenome_features = features + cdss + mrnas + non_coding_features genome['num_features'] = len(metagenome_features) genome_name = params['genome_name'] json_file_path = f'{self.cfg.sharedFolder}/{genome_name}_features.json' # save to json files first with open(json_file_path, 'w') as fid: json.dump(metagenome_features, fid) # write json to shock json_to_shock = self.dfu.file_to_shock({ 'file_path': json_file_path, 'make_handle': 1, 'pack': 'gzip' }) self.feature_counts["non_coding_features"] = len( non_coding_features) genome['features_handle_ref'] = json_to_shock['handle']['hid'] # remove json file to avoid disk overload os.remove(json_file_path) # delete python objects to reduce overhead del metagenome_features del features, cdss, mrnas, non_coding_features else: # TODO determine whether we want to deepcopy here instead of reference. genome['features'] = features genome['cdss'] = cdss genome['mrnas'] = mrnas genome['non_coding_features'] = non_coding_features self.feature_counts["non_coding_features"] = len( genome['non_coding_features']) if self.warnings: genome['warnings'] = self.warnings genome['feature_counts'] = dict(self.feature_counts) return genome
class DataUtil:
    """Validates generics data against workspace type constraints and saves
    matrix-like objects."""

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end

        `start` and `end` are regex fragments; raises AttributeError if no
        match is found.
        """
        return re.search('{}(.*){}'.format(start, end), s).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique,
                           conditionally_required)

        Parses the type's description text: each `@tag ...` line yields the
        whitespace-split tokens after the tag.
        """
        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [line.strip().split()[1:]
                                for line in type_desc.split("\n")
                                if line.startswith(f'@{tag}')]

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys

        Mutates constraints['contains'] in place (dropping constraints whose
        referenced keys are all absent from `data`) and returns `constraints`.
        """
        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    # raw strings: '\(' / '\)' are invalid escape sequences
                    search_value = re.search('{}(.*){}'.format(r'\(', r'\)'),
                                             in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):
            # TODO: nested values e.g. values(values(ids))
            # raw strings: '\(' / '\)' are invalid escape sequences
            search_value = re.search('{}(.*){}'.format(r'\(', r'\)'),
                                     value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            # value references another workspace object: '<ref_field>:<path>'
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{'ref': obj_ref,
                                  'included': [included]}]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        # TODO: only works for 2 level nested data like
                        # '/features/[*]/id'
                        m_ref_data = [x.get(keys[2])
                                      for x in ref_data.get(keys[0])]
                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(
            retrieve_data[:20]))

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data

        Returns (validated, failed_constraints) where failed_constraints maps
        a constraint tag to the list of failing constraint descriptions.
        """
        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(
                    " ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [key for key in required_keys
                                if key not in data]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path (no-op if it already exists)
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = ['Object {} failed type checking:'.format(
            params.get('obj_name'))]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append('Object should have unique field: {}'.format(
                unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append('Column attribute mapping instances should contain all '
                                     'column index from original data')
                if 'row_mapping' in super_value:
                    error_msg.append('Row attribute mapping instances should contain all row '
                                     'index from original data')
                error_msg.append('Object field [{}] should contain field [{}]'.format(
                    super_value, subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            # FIX: message typo ("than" -> "then")
            error_msg.append('If object field "{}" is present then object field(s) {} should '
                             'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [x['type_def'] for module in GENERICS_MODULES
                     for x in self.wsClient.get_all_type_info(module)]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics
                    data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """
        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {'validated': validated,
                'failed_constraints': failed_constraints}

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting validating and saving object data')

        obj_type = params.get('obj_type').split('-')[0]

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({'mod': module_name}).get('types')

        for module_type in types:
            # raw strings: '\.' / '\-' are invalid escape sequences
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type,
                                       'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # make sure users with shared object have access to the handle file
        # upon saving
        handle = data.get('sequencing_file_handle')
        if handle:
            output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            logging.info('Downloading consensus sequence file in {}'.format(
                output_directory))
            self._mkdir_p(output_directory)
            # NOTE(review): the download target is self.scratch although
            # output_directory was just created — confirm intended path.
            matrix_fasta_file = self.dfu.shock_to_file({
                'handle_id': handle,
                'file_path': self.scratch}).get('file_path')
            logging.info('Saving consensus sequence file to shock: {}'.format(
                matrix_fasta_file))
            handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file,
                                                'make_handle': True})['handle']['hid']
            data['sequencing_file_handle'] = handle_id

        # cast data
        int_data_names = ['sequencing_quality_filter_cutoff', 'read_length_cutoff']
        for data_name in int_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to int'.format(data_name))
                    data[data_name] = int(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requests {} to be an integer value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        float_data_names = ['barcode_error_rate', 'sequence_error_cutoff',
                            'clustering_cutoff']
        for data_name in float_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to float'.format(data_name))
                    data[data_name] = float(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requests {} to be a float value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        ws_name_id = params.get('workspace_id')
        workspace_name = params.get('workspace_name')
        if not ws_name_id:
            if not isinstance(workspace_name, int):
                ws_name_id = self.dfu.ws_name_to_id(workspace_name)
            else:
                ws_name_id = workspace_name

        try:
            logging.info('Starting saving object via DataFileUtil')
            info = self.dfu.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data": data,
                    "name": params.get('obj_name')
                }]
            })[0]
        except Exception:
            logging.info('Saving object via DataFileUtil failed')
            logging.info('Starting saving object via WsLargeDataIO')
            data_path = os.path.join(self.scratch,
                                     params.get('obj_name') + "_" +
                                     str(uuid.uuid4()) + ".json")
            # FIX: use a context manager; the original json.dump(..., open(...))
            # leaked the file handle
            with open(data_path, 'w') as data_file:
                json.dump(data, data_file)
            info = self.ws_large_data.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data_json_file": data_path,
                    "name": params.get('obj_name')
                }]
            })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
class FastaGFFToGenome:
    """Builds a KBase Genome object from a FASTA assembly plus a GFF3/GTF
    annotation file: parses features, reconstructs gene/mRNA/CDS hierarchy,
    translates CDSs, and assembles the final genome dict."""

    def __init__(self, config):
        self.cfg = config
        self.au = AssemblyUtil(config.callbackURL)
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(self.cfg)
        self.taxon_wsname = self.cfg.raw['taxon-workspace-name']
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        # read the module version out of kbase.yml (close the handle; the
        # original leaked it)
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        self.version = re.search("module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11  # default NCBI genetic code (bacterial)
        # feature types that only update their parent, never become features
        self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR',
                           'start_codon', 'stop_codon', 'region',
                           'chromosome', 'scaffold')
        self.spoof_gene_count = 0
        self.is_phytozome = False
        self.strict = True
        self.generate_genes = False
        self.warnings = []
        self.feature_dict = collections.OrderedDict()
        self.cdss = set()
        self.ontologies_present = collections.defaultdict(dict)
        self.ontology_events = list()
        # NOTE(review): attribute name is a typo ("skiped") but kept as-is in
        # case other code in this module references it
        self.skiped_features = collections.Counter()
        self.feature_counts = collections.Counter()

    def warn(self, message):
        """Record a non-fatal warning to attach to the genome object."""
        self.warnings.append(message)

    def generate_genome_json(self, params):
        """Validate params, stage input files and build the genome dict.

        Returns (genome_dict, input_directory); the caller is responsible
        for cleaning up input_directory.
        """
        # 1) validate parameters
        self._validate_import_file_params(params)
        self.code_table = params.get('genetic_code', 11)

        # 2) construct the input directory staging area
        input_directory = os.path.join(
            self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)

        # 3) extract out the parameters
        params = self._set_parsed_params(params)
        if params.get('generate_missing_genes'):
            self.generate_genes = True

        # 4) do the upload
        genome = self._gen_genome_json(
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            source=params['source'],
            source_id=params['source_id'],
            release=params['release'],
        )
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']
        return genome, input_directory

    def import_file(self, params):
        """Full import: build the genome JSON, save it to the workspace and
        return {'genome_ref', 'genome_info'}."""
        genome, input_directory = self.generate_genome_json(params)

        # dump a copy of the genome for debugging (with a closed handle; the
        # original leaked it)
        with open("{}/{}.json".format(self.cfg.sharedFolder,
                                      genome['id']), 'w') as json_file:
            json.dump(genome, json_file, indent=4)

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params.get('metadata', {}),
        })
        report_string = 'A genome with {} contigs and the following feature ' \
                        'types was imported: {}'\
            .format(len(genome['contig_ids']), "\n".join(
                [k + ": " + str(v)
                 for k, v in genome['feature_counts'].items()]))
        log(report_string)

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {
            'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info
        }
        return details

    def _gen_genome_json(self, input_gff_file=None, input_fasta_file=None,
                         workspace_name=None, core_genome_name=None,
                         scientific_name="unknown_taxon", source=None,
                         source_id=None, release=None):
        """Parse the GFF + FASTA pair into a genome dict and save the
        assembly object."""
        # reading in GFF file
        features_by_contig = self._retrieve_gff_file(input_gff_file)
        contig_ids = set()

        # parse feature information
        fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta")
        for contig in fasta_contigs:
            molecule_type = str(contig.seq.alphabet).replace(
                'IUPACAmbiguous', '').strip('()')
            contig_ids.add(contig.id)
            for feature in features_by_contig.get(contig.id, []):
                self._transform_feature(contig, feature)

        # any GFF sequence id with no matching FASTA record is a warning
        # (or an error in strict mode)
        for cid in set(features_by_contig.keys()) - contig_ids:
            self.warn("Sequence name {} does not match a sequence id in the "
                      "FASTA file. {} features will not be imported.".format(
                          cid, len(features_by_contig[cid])))
            if self.strict:
                raise ValueError(
                    "Every feature sequence id must match a fasta sequence id")
        self._process_cdss()

        # save assembly file
        assembly_ref = self.au.save_assembly_from_fasta({
            'file': {'path': input_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': core_genome_name + ".assembly"
        })
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, source, source_id,
                                       assembly_data, input_gff_file,
                                       molecule_type)
        genome['release'] = release
        if self.spoof_gene_count > 0:
            genome['warnings'] = genome.get('warnings', []) + \
                [warnings['spoofed_genome'].format(self.spoof_gene_count)]
            genome['suspect'] = 1
        return genome

    @staticmethod
    def _location(in_feature):
        """Convert a gff ftr dict into a KBase location tuple
        [contig, start, strand, length]; start is the 5' end in the
        direction of transcription."""
        in_feature['strand'] = in_feature['strand'].replace(
            "-1", "-").translate(strand_table)
        if in_feature['strand'] == '+':
            start = in_feature['start']
        elif in_feature['strand'] == '-':
            start = in_feature['end']
        else:
            raise ValueError('Invalid feature strand: {}'.format(
                in_feature['strand']))
        return [
            in_feature['contig'],
            start,
            in_feature['strand'],
            in_feature['end'] - in_feature['start'] + 1
        ]

    @staticmethod
    def _validate_import_file_params(params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method
        """
        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                # counted, then rejected: ftp is recognized but unsupported
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(key)
                error_msg += str(list(file.keys()))
                raise ValueError(error_msg)
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(
                    "Invalid genetic code specified: {}".format(params))

    def _set_parsed_params(self, params):
        """Merge caller params over the importer defaults."""
        log('Setting params')
        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'metadata': {},
            'source_id': 'unknown',
        }
        default_params.update(params)
        log(json.dumps(default_params, indent=1))
        return default_params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed
        """
        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path, file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                log("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')
        return file_paths

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
        """
        log("Reading GFF file")

        feature_list = collections.defaultdict(list)
        is_patric = False

        # 'with' + line iteration replaces the original unclosed
        # readline() loop; the per-line handling is unchanged
        with open(input_gff_file) as gff_file_handle:
            for current_line in gff_file_handle:
                current_line = current_line.strip()
                if (current_line.isspace() or current_line == ""
                        or current_line.startswith("#")):
                    continue

                # Split line
                (contig_id, source_id, feature_type, start, end, score,
                 strand, phase, attributes) = current_line.split('\t')

                # Checking to see if Phytozome
                if "phytozome" in source_id.lower():
                    self.is_phytozome = True

                # Checking to see if PATRIC
                if "PATRIC" in source_id:
                    is_patric = True

                # PATRIC prepends their contig ids with some gibberish
                if is_patric and "|" in contig_id:
                    contig_id = contig_id.split("|", 1)[1]

                # Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': collections.defaultdict(list)
                }

                # Populating with attribute key-value pair
                # This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    # Sometimes empty string
                    if not attribute:
                        continue

                    # Use of 1 to limit split as '=' character can also be made available later
                    # Sometimes lack of "=", assume spaces instead
                    if "=" in attribute:
                        key, value = attribute.split("=", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    elif " " in attribute:
                        key, value = attribute.split(" ", 1)
                        ftr['attributes'][key.lower()].append(
                            parse.unquote(value.strip('"')))
                    else:
                        pass
                        # log("Warning: attribute "+attribute+" cannot be separated into key,value pair")

                ftr['attributes']['raw'] = attributes
                if "id" in ftr['attributes']:
                    ftr['ID'] = ftr['attributes']['id'][0]
                if "parent" in ftr['attributes']:
                    ftr['Parent'] = ftr['attributes']['parent'][0]

                feature_list[contig_id].append(ftr)

        # Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        # Most bacterial files have only CDSs
        # In order to work with prokaryotic and eukaryotic gene structure synonymously
        # Here we add feature dictionaries representing the parent gene and mRNAs
        # feature_list = self._add_missing_parents(feature_list)

        # Phytozome has the annoying habit of editing their identifiers so we fix them
        if self.is_phytozome:
            self._update_phytozome_features(feature_list)

        # All identifiers need to be checked so that they follow the same general rules
        # Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        return feature_list

    def _add_missing_identifiers(self, feature_list):
        """Backfill an 'ID' for features that lack one, falling back on
        other common attribute keys, then on a type+counter name."""
        log("Adding missing identifiers")
        # General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list:
            for i, feat in enumerate(feature_list[contig]):
                if "ID" not in feature_list[contig][i]:
                    for key in ("transcriptid", "proteinid", "pacid",
                                "parent", "name", 'transcript_id'):
                        if key in feature_list[contig][i]['attributes']:
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i]['attributes'][key][0]
                            break
                    if feat['type'] not in self.skip_types:
                        self.feature_counts[feat['type']] += 1

                    # If the process fails, fall back to a synthetic id
                    if "ID" not in feature_list[contig][i]:
                        feat['ID'] = "{}_{}".format(
                            feat['type'], self.feature_counts[feat['type']])
                    # log("Warning: Could find unique ID to utilize in GFF attributes: {}. "
                    #     "ID '{}' has been assigned".format(feat['attributes'], feat['ID']))
        return feature_list

    def _add_missing_parents(self, feature_list):
        """Create spoofed gene/mRNA parents for orphan RNA/CDS features.
        (Currently unused; the call site in _retrieve_gff_file is commented
        out.)"""
        # General rules is if CDS or RNA missing parent, add them
        for contig in feature_list:
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ftrs[i]["type"] in self.skip_types:
                    continue
                if "Parent" not in ftrs[i]:
                    # Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if "RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]:
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)
                        if "CDS" in ftrs[i]["type"]:
                            new_rna_ftr = copy.deepcopy(ftrs[i])
                            new_rna_ftr["type"] = "mRNA"
                            new_ftrs.append(new_rna_ftr)
                            ftrs[i]["Parent"] = new_rna_ftr["ID"]
                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list

    @staticmethod
    def _update_phytozome_features(feature_list):
        """Normalize Phytozome identifiers in place (mutates feature_list)."""
        # General rule is to use the "Name" field where possible
        # And update parent attribute correspondingly
        for contig in feature_list:
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):
                # Maintain old_id for reference
                # Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("id", "pacid"):
                    if key in feature_list[contig][i]['attributes']:
                        old_id = feature_list[contig][i]['attributes'][key][0]
                        break
                if old_id is None:
                    # This should be an error
                    # log("Cannot find unique ID, PACid, or pacid in GFF "
                    #     "attributes: " + feature_list[contig][i][contig])
                    continue

                # Retain old_id
                feature_position_dict[old_id] = i

                # Clip off the increment on CDS IDs so fragments of the same
                # CDS share the same ID
                if "CDS" in feature_list[contig][i]["ID"]:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "ID"].rsplit('.', 1)[0]

                # In Phytozome, gene and mRNA have "Name" field, CDS do not
                if "name" in feature_list[contig][i]['attributes']:
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        'attributes']['name'][0]

                if "Parent" in feature_list[contig][i]:
                    # Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]
        return feature_list

    def _update_identifiers(self, feature_list):
        """Make child feature ids derive from their parents.

        General rules:
        1) Genes keep identifier
        2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        3) CDS always uses RNA identifier with ".CDS" appended
        """
        mRNA_parent_dict = dict()
        for contig in feature_list:
            for ftr in feature_list[contig]:
                if ftr["type"] in self.skip_types:
                    continue
                if "Parent" in ftr:
                    # Retain old_id of parents
                    old_id = ftr["ID"]
                    if ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]:
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]
                    # link old to new ids for mRNA to use with CDS
                    if "RNA" in ftr["type"]:
                        mRNA_parent_dict[old_id] = ftr["ID"]
        return feature_list

    def _check_location_order(self, locations):
        """If order looks good return None.
           If out of order return warning
           If on multiple strands return warning"""
        strand = None
        last_start = 0
        for location in locations:
            if strand is None:  # fixed: was `== None`
                strand = location[2]
            elif strand != location[2]:
                return warnings["both_strand_coordinates"]
        if strand == "-":
            locations = reversed(locations)
        for location in locations:
            if last_start > location[1]:
                return warnings["out_of_order"]
            else:
                last_start = location[1]
        return None

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(
                "{} is not a supported ontology".format(ontology_type))
        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })
        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = collections.defaultdict(dict)
        db_xrefs = []
        # these keys are formatted strangely and require special parsing
        for key in ("go_process", "go_function", "go_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        # CATH terms are not distinct from EC numbers so must be found by key
        for term in feature.get('cath_funfam', []) + feature.get('cath', []):
            for ref in term.split(','):
                ontology['CATH'][ref] = [self._create_ontology_event("CATH")]
                self.ontologies_present['CATH'][ref] = self.ont_mappings[
                    'CATH'].get(ref, '')

        search_keys = [
            'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam',
            'pfam', 'cog', 'go', 'po', 'ko'
        ]
        ont_terms = []
        # flatten out into list of values
        for key in search_keys:
            if key in feature:
                ont_terms += [x for y in feature[key] for x in y.split(',')]

        # classify each term by its prefix; unknown prefixes become db_xrefs
        for ref in ont_terms:
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))
        return dict(ontology), db_xrefs

    def _transform_feature(self, contig, in_feature):
        """Converts a feature from the gff ftr format into the appropriate
        format for a genome object """
        def _aliases(feat):
            # collect (key, value) alias pairs from the known alias keys
            keys = ('locus_tag', 'old_locus_tag', 'protein_id',
                    'transcript_id', 'gene', 'ec_number', 'gene_synonym')
            alias_list = []
            for key in keys:
                if key in feat['attributes']:
                    alias_list.extend([(key, val)
                                       for val in feat['attributes'][key]])
            return alias_list

        if in_feature['start'] < 1 or in_feature['end'] > len(contig):
            self.warn("Feature with invalid location for specified "
                      "contig: " + str(in_feature))
            if self.strict:
                raise ValueError(
                    "Features must be completely contained within the Contig in the "
                    "Fasta file. Feature: " + str(in_feature))
            return

        feat_seq = contig.seq[in_feature['start'] - 1:in_feature['end']].upper()
        if in_feature['strand'] in {'-', '-1'}:
            feat_seq = feat_seq.reverse_complement()

        # if the feature ID is duplicated (CDS or transpliced gene) we only
        # need to update the location and dna_sequence
        if in_feature.get('ID') in self.feature_dict:
            existing = self.feature_dict[in_feature['ID']]
            existing['location'].append(self._location(in_feature))
            existing['dna_sequence'] = existing.get('dna_sequence', '') + str(feat_seq)
            existing['dna_sequence_length'] = len(existing['dna_sequence'])
            return

        # The following is common to all the feature types
        out_feat = {
            "id": in_feature.get('ID'),
            "type": in_feature['type'],
            "location": [self._location(in_feature)],
            "dna_sequence": str(feat_seq),
            "dna_sequence_length": len(feat_seq),
            "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
        }

        # add optional fields
        if 'note' in in_feature['attributes']:
            out_feat['note'] = in_feature['attributes']["note"][0]
        ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes'])
        if ont:
            out_feat['ontology_terms'] = ont
        aliases = _aliases(in_feature)
        if aliases:
            out_feat['aliases'] = aliases
        if db_xrefs:
            out_feat['db_xrefs'] = db_xrefs
        if 'product' in in_feature['attributes']:
            out_feat['functions'] = in_feature['attributes']["product"]
        if 'product_name' in in_feature['attributes']:
            if "functions" in out_feat:
                out_feat['functions'].extend(
                    in_feature['attributes']["product_name"])
            else:
                out_feat['functions'] = in_feature['attributes']["product_name"]
        if 'function' in in_feature['attributes']:
            out_feat['functional_descriptions'] = in_feature['attributes'][
                "function"]
        if 'inference' in in_feature['attributes']:
            # NOTE(review): return value is discarded here, as in the
            # original — presumably intentional; confirm with GenomeUtils
            GenomeUtils.parse_inferences(in_feature['attributes']['inference'])
        if 'trans-splicing' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing']
        if 'pseudo' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['pseudo']
        if 'ribosomal-slippage' in in_feature['attributes'].get('exception', []):
            out_feat['flags'] = out_feat.get('flags', []) + ['ribosomal_slippage']

        parent_id = in_feature.get('Parent', '')
        if parent_id and parent_id not in self.feature_dict:
            raise ValueError(
                "Parent ID: {} was not found in feature ID list.".format(
                    parent_id))

        # if the feature is a exon or UTR, it will only be used to update the
        # location and sequence of it's parent, we add the info to it parent
        # feature but not the feature dict
        if in_feature['type'] in self.skip_types:
            if parent_id and in_feature['type'] in {
                    'exon', 'five_prime_UTR', 'three_prime_UTR'}:
                parent = self.feature_dict[parent_id]
                if in_feature['type'] not in parent:
                    parent[in_feature['type']] = []
                parent[in_feature['type']].append(out_feat)
            return

        # add type specific features
        elif 'gene' in in_feature['type']:
            out_feat['protein_translation_length'] = 0
            out_feat['cdss'] = []

        elif in_feature['type'] == 'CDS':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'cdss' in parent:  # parent must be a gene
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["genes_CDS_child_fails_location_validation"].format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings["CDS_fail_child_of_gene_coordinate_validation"].format(parent_id)
                        ]
                    parent['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                else:  # parent must be mRNA
                    if not is_parent(parent, out_feat):
                        parent["warnings"] = parent.get('warnings', []) + [
                            warnings["mRNA_fail_parent_coordinate_validation"].format(out_feat["id"])
                        ]
                        out_feat["warnings"] = out_feat.get('warnings', []) + [
                            warnings["CDS_fail_child_of_mRNA_coordinate_validation"].format(parent_id)
                        ]
                    parent['cds'] = in_feature['ID']
                    out_feat['parent_mrna'] = parent_id
                    parent_gene = self.feature_dict[parent['parent_gene']]
                    parent_gene['cdss'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent['parent_gene']
            # keep track of CDSs for post processing
            self.cdss.add(out_feat['id'])

        elif in_feature['type'] == 'mRNA':
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'mrnas' not in parent:
                    parent['mrnas'] = []
                if 'cdss' in parent:  # parent must be a gene
                    parent['mrnas'].append(in_feature['ID'])
                    out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["genes_mRNA_child_fails_location_validation"].format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings["mRNAs_parent_gene_fails_location_validation"].format(parent_id)
                    ]

        else:
            out_feat["type"] = in_feature['type']
            # this prevents big misc_features from blowing up the genome size
            if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
                del out_feat['dna_sequence']
            if parent_id:
                parent = self.feature_dict[parent_id]
                if 'children' not in parent:
                    parent['children'] = []
                parent['children'].append(out_feat['id'])
                out_feat['parent_gene'] = parent_id
                if not is_parent(parent, out_feat):
                    parent["warnings"] = parent.get('warnings', []) + [
                        warnings["generic_parents_child_fails_location_validation"].format(out_feat["id"])
                    ]
                    out_feat["warnings"] = out_feat.get('warnings', []) + [
                        warnings["generic_childs_parent_fails_location_validation"].format(parent_id)
                    ]

        self.feature_dict[out_feat['id']] = out_feat

    def _process_cdss(self):
        """Because CDSs can have multiple fragments, it's necessary to go
        back over them to calculate a final protein sequence"""
        for cds_id in self.cdss:
            cds = self.feature_dict[cds_id]
            try:
                prot_seq = str(
                    Seq(cds['dna_sequence']).translate(self.code_table,
                                                       cds=True).strip("*"))
            except TranslationError as e:
                cds['warnings'] = cds.get('warnings', []) + [str(e)]
                prot_seq = ""
            cds.update({
                "protein_translation": prot_seq,
                "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
                "protein_translation_length": len(prot_seq),
            })
            if 'parent_gene' in cds:
                parent_gene = self.feature_dict[cds['parent_gene']]
                # no propigation for now
                propagate_cds_props_to_gene(cds, parent_gene)
            elif self.generate_genes:
                spoof = copy.copy(cds)
                spoof['type'] = 'gene'
                spoof['id'] = cds['id'] + "_gene"
                spoof['cdss'] = [cds['id']]
                spoof['warnings'] = [
                    warnings['spoofed_gene'].format(cds['id'])
                ]
                self.feature_dict[spoof['id']] = spoof
                cds['parent_gene'] = spoof['id']
                self.spoof_gene_count += 1
            else:
                raise ValueError(warnings['no_spoof'])
            self.feature_dict[cds['id']] = cds

    def _update_from_exons(self, feature):
        """This function updates the sequence and location of a feature based
        on it's UTRs, CDSs and exon information"""
        # note that start and end here are in direction of translation
        def start(loc):
            return loc[0][1]

        def end(loc):
            if loc[-1][2] == "+":
                return loc[-1][1] + loc[-1][3] + 1
            else:
                return loc[-1][1] - loc[-1][3] - 1

        if 'exon' in feature:
            # update the feature with the exon locations and sequences
            feature['location'] = [x['location'][0] for x in feature['exon']]
            feature['dna_sequence'] = "".join(x['dna_sequence']
                                              for x in feature['exon'])
            feature['dna_sequence_length'] = len(feature['dna_sequence'])

        # construct feature location from utrs and cdss if present
        elif 'cds' in feature:
            cds = [copy.deepcopy(self.feature_dict[feature['cds']])]
            locs = []
            seq = ""
            for frag in feature.get('five_prime_UTR', []) + cds + \
                    feature.get('three_prime_UTR', []):
                # merge into last location if adjacent
                if locs and abs(end(locs) - start(frag['location'])) == 1:
                    # extend the location length by the length of the first
                    # location in the fragment
                    first = frag['location'].pop(0)
                    locs[-1][3] += first[3]
                locs.extend(frag['location'])
                seq += frag['dna_sequence']
            feature['location'] = locs
            feature['dna_sequence'] = seq
            feature['dna_sequence_length'] = len(seq)

            # remove these properties as they are no longer needed
            for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']:
                feature.pop(x, None)

        else:
            # FIX: the original built this ValueError but never raised it,
            # silently leaving the feature un-updated
            raise ValueError('Feature {} must contain either exon or cds data to '
                             'construct an accurate location and sequence'.format(
                                 feature['id']))

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         source, source_id, assembly, input_gff_file,
                         molecule_type):
        """
        _gen_genome_info: generate genome info
        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome['molecule_type'] = molecule_type
        genome["features"] = []
        genome["cdss"] = []
        genome["mrnas"] = []
        genome['non_coding_features'] = []
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]
        genome['md5'] = assembly['md5']
        genome['contig_ids'], genome['contig_lengths'] = zip(
            *[(k, v['length']) for k, v in assembly['contigs'].items()])
        genome['num_contigs'] = len(assembly['contigs'])
        genome['ontologies_present'] = dict(self.ontologies_present)
        genome['ontology_events'] = self.ontology_events
        genome['taxonomy'], genome['taxon_ref'], genome['domain'], \
            genome["genetic_code"] = self.gi.retrieve_taxon(
                self.taxon_wsname, genome['scientific_name'])
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(source)
        genome['source_id'] = source_id

        # Phytozome gff files are not compatible with the RNASeq Pipeline
        # so it's better to build from the object than cache the file
        if self.is_phytozome:
            gff_file_to_shock = self.dfu.file_to_shock({
                'file_path': input_gff_file,
                'make_handle': 1,
                'pack': "gzip"
            })
            genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid']

        for feature in self.feature_dict.values():
            self.feature_counts[feature['type']] += 1
            if 'exon' in feature or feature['type'] == 'mRNA':
                self._update_from_exons(feature)

            # Test if location order is in order.
            is_transpliced = "flags" in feature and "trans_splicing" in feature["flags"]
            if not is_transpliced and len(feature["location"]) > 1:
                # Check the order only if not trans_spliced and has more than 1 location.
                location_warning = self._check_location_order(feature["location"])
                if location_warning is not None:
                    feature["warnings"] = feature.get('warnings', []) + [location_warning]

            contig_len = genome["contig_lengths"][genome["contig_ids"].index(
                feature["location"][0][0])]
            feature = check_full_contig_length_or_multi_strand_feature(
                feature, is_transpliced, contig_len, self.skip_types)

            # sort features into their respective arrays
            if feature['type'] == 'CDS':
                del feature['type']
                genome['cdss'].append(feature)
            elif feature['type'] == 'mRNA':
                del feature['type']
                genome['mrnas'].append(feature)
            elif feature['type'] == 'gene':
                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in feature:
                        feature[key] = list(set(feature[key]))
                if feature['cdss']:
                    del feature['type']
                    self.feature_counts["protein_encoding_gene"] += 1
                    genome['features'].append(feature)
                else:
                    feature.pop('mrnas', None)
                    feature.pop('cdss', None)
                    feature.pop('protein_translation_length', None)
                    self.feature_counts["non_coding_features"] += 1
                    genome['non_coding_features'].append(feature)
            else:
                genome['non_coding_features'].append(feature)

        if self.warnings:
            genome['warnings'] = self.warnings
        genome['feature_counts'] = dict(self.feature_counts)
        return genome
    def _compare_features(self, metagenome_orig, metagenome_new):
        """Assert that two metagenome objects carry equivalent feature lists.

        Downloads and unpacks each object's features JSON via its
        `features_handle_ref` shock handle, then compares feature-by-feature
        (keyed on 'id') in two passes:

        1st pass: compare after sorting order-insensitive list fields
                  (aliases, db_xrefs, functions).
        2nd pass: for first-pass mismatches, drop fields known to differ
                  benignly ('note', 'inference_data', and 'warnings' present
                  only on the original) and compare again; a remaining
                  mismatch fails with a full assertEqual diff.

        NOTE(review): mutates the downloaded dicts in place and assumes every
        original id exists in the new object (KeyError otherwise).
        """
        scratch_dir = self.cfg['scratch']
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        orig_file_name = dfu.shock_to_file({'file_path': scratch_dir,
                                            'handle_id': metagenome_orig['features_handle_ref'],
                                            'unpack': 'unpack'
                                            })['file_path']
        new_file_name = dfu.shock_to_file({'file_path': scratch_dir,
                                           'handle_id': metagenome_new['features_handle_ref'],
                                           'unpack': 'unpack'
                                           })['file_path']
        # open json files
        with open(orig_file_name) as fid:
            metagenome_orig_data = json.load(fid)
        with open(new_file_name) as fid:
            metagenome_new_data = json.load(fid)

        print('Testing length or original vs new genome')
        self.assertTrue(len(metagenome_orig_data) == len(metagenome_new_data),
                        "list is not of equal length in Original and New Genomes.")
        print("\n\n" + " TOTAL NUMBER:" + str(len(metagenome_orig_data)))

        # index both feature lists by id for pairwise comparison
        orig_dict = dict([(x['id'], x) for x in metagenome_orig_data])
        new_dict = dict([(x['id'], x) for x in metagenome_new_data])
        first_pass_matches = 0
        first_pass_non_match = 0
        second_pass_matches = 0
        print('Testing keys in metagenomes....')
        for key in orig_dict:
            orig_feature = orig_dict[key]
            new_feature = new_dict[key]
            # normalize unordered list fields before comparing
            if "aliases" in orig_feature:
                orig_feature['aliases'] = sorted(orig_feature.get('aliases', []))
                new_feature['aliases'] = sorted(new_feature.get('aliases', []))
            if "db_xrefs" in orig_feature:
                orig_feature['db_xrefs'] = sorted(orig_feature.get('db_xrefs', []))
                new_feature['db_xrefs'] = sorted(new_feature.get('db_xrefs', []))
            if "functions" in orig_feature:
                orig_feature["functions"] = sorted(orig_feature.get('functions', []))
                new_feature["functions"] = sorted(new_feature.get('functions', []))
            if orig_feature == new_feature:
                first_pass_matches += 1
            else:
                first_pass_non_match += 1
                # second pass: strip benignly-differing fields and retry
                orig_feature.pop("note", None)
                new_feature.pop("note", None)
                orig_feature.pop('inference_data', None)
                new_feature.pop('inference_data', None)
                if "warnings" in orig_feature and "warnings" not in new_feature:
                    del(orig_feature["warnings"])
                if orig_feature == new_feature:
                    second_pass_matches += 1
                else:
                    # surface the full diff on a genuine mismatch
                    self.maxDiff = None
                    self.assertEqual(orig_feature, new_feature)
        # every feature must have matched on one of the two passes
        self.assertEqual(
            len(orig_dict), (first_pass_matches + second_pass_matches),
            (f"There were {first_pass_matches} first pass matches "
             f"and {second_pass_matches} second pass matches out of "
             f"{len(orig_dict)} items in features")
        )
def run_kb_dramv_annotate(self, ctx, params):
    """
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_kb_dramv_annotate
    warnings.filterwarnings("ignore")

    # setup: pull the user-supplied run parameters
    affi_contigs_shock_ids = params['affi_contigs_shock_id']
    min_contig_size = params['min_contig_size']
    trans_table = str(params['trans_table'])
    bitscore = params['bitscore']
    rbh_bitscore = params['rbh_bitscore']

    assembly_util = AssemblyUtil(self.callback_url)
    datafile_util = DataFileUtil(self.callback_url)

    # get contigs and merge all input assemblies into a single FASTA file
    assemblies = assembly_util.get_fastas(
        {'ref_lst': [params['assembly_input_ref']]})
    fasta = os.path.join(self.shared_folder, 'merged_contigs.fasta')
    with open(fasta, 'w') as f:
        for assembly_ref, assembly_data in assemblies.items():
            fasta_path = assembly_data['paths'][0]
            # 'with' guarantees the source handle is closed; the original
            # leaked one open file per assembly, and copyfileobj avoids the
            # line-by-line copy loop
            with open(fasta_path) as source:
                shutil.copyfileobj(source, f)

    # get affi contigs files from shock, read all and merge into one .tab file
    affi_contigs_path = os.path.join(self.shared_folder,
                                     'VIRSorter_affi-contigs.tab')
    with open(affi_contigs_path, 'w') as f:
        for affi_contigs_shock_id in affi_contigs_shock_ids:
            temp_affi_contigs_path = os.path.join(
                self.shared_folder, 'temp_VIRSorter_affi-contigs.tab')
            temp_affi_contigs = datafile_util.shock_to_file({
                'shock_id': affi_contigs_shock_id,
                'file_path': temp_affi_contigs_path,
                'unpack': 'unpack'
            })['file_path']
            # close the temp file before removing it (the original removed a
            # file whose read handle was still open)
            with open(temp_affi_contigs) as source:
                shutil.copyfileobj(source, f)
            os.remove(temp_affi_contigs)

    # set DRAM database locations
    print('DRAM version: %s' % dram_version)
    import_config('/data/DRAM_databases/CONFIG')
    # This is a hack to get around a bug in my database setup
    set_database_paths(
        description_db_loc='/data/DRAM_databases/description_db.sqlite')
    print_database_locations()

    # clean the merged fasta and the affi contigs file of characters DRAM rejects
    cleaned_fasta = os.path.join(
        self.shared_folder, '%s.cleaned.fasta' % os.path.basename(fasta))
    remove_bad_chars(input_fasta=fasta, output=cleaned_fasta)
    cleaned_affi_contigs = os.path.join(
        self.shared_folder, 'VIRSorter_affi-contigs.cleaned.tab')
    remove_bad_chars(input_virsorter_affi_contigs=affi_contigs_path,
                     output=cleaned_affi_contigs)

    # annotate and distill
    output_dir = os.path.join(self.shared_folder, 'DRAM_annos')
    annotate_vgfs(cleaned_fasta, cleaned_affi_contigs, output_dir,
                  min_contig_size, trans_table=trans_table,
                  bit_score_threshold=bitscore,
                  rbh_bit_score_threshold=rbh_bitscore, low_mem_mode=True,
                  keep_tmp_dir=False, threads=THREADS, verbose=False)
    output_files = get_annotation_files(output_dir)
    distill_output_dir = os.path.join(output_dir, 'distilled')
    summarize_vgfs(output_files['annotations']['path'], distill_output_dir,
                   groupby_column='scaffold')
    output_files = get_viral_distill_files(distill_output_dir, output_files)

    # generate report
    product_html_loc = os.path.join(distill_output_dir, 'product.html')
    report = generate_product_report(self.callback_url,
                                     params['workspace_name'], output_dir,
                                     product_html_loc, output_files)
    output = {
        'report_name': report['name'],
        'report_ref': report['ref'],
    }
    #END run_kb_dramv_annotate

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_kb_dramv_annotate return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class PDBUtil: # “Expect Value” threshold to restrict which alignments will be significant E_VALUE_THRESH = 1e-20 # BLAST sequence identity threshold to determine which pdb structures will be # matched to a KBase genome/feature B_IDENTITY_THRESH = 0.6 def _validate_import_pdb_file_params(self, params): """ _validate_import_pdb_file_params: validates input params to import_model_pdb_file and import_experiment_pdb_file """ # check for required parameters for p in ['structure_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) return file_path, params.get('workspace_name'), params.get( 'structure_name') def _model_file_to_data(self, file_path, params): """ _model_file_to_data: Do the PDB conversion--parse the model pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.PDBParser(PERMISSIVE=1) pdb1 = file_path pp_no = 0 data = {} try: structure = parser.get_structure("test", pdb1) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'PDBParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 # logging.info(f'Getting pdb structure data for {structure}!') (compound, source) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) 
(num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) model = structure[0] protein_data = self._get_proteins_by_structure( structure, model.get_id(), file_path) (protein_data, params) = self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): data = { 'name': structure.header.get('name', ''), 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'compound': compound, 'source': source, 'proteins': protein_data } else: logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' ) data = {} finally: return data, pp_no, params def _exp_file_to_data(self, file_path, params): """ _exp_file_to_data: Do the PDB conversion--parse the experiment pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.MMCIFParser() cif = file_path pp_no = 0 mmcif_data = None try: structure = parser.get_structure("PHA-L", cif) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'MMCIFParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 struc_name = structure.header.get('name', '') hd = self._upload_to_shock(file_path) # logging.info(f'Getting pdb structure data for {structure}!') (cpd, src) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) (num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) protein_data = self._get_proteins_by_structure( structure, model_ids[0], file_path) (protein_data, params) = 
self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): mmcif_data = { 'name': struc_name, 'head': structure.header.get('head', ''), 'rcsb_id': structure.header.get('rcsb_id', ''), 'deposition_date': structure.header.get('deposition_date', ''), 'release_date': structure.header.get('release_date', ''), 'structure_method': structure.header.get('structure_method', ''), 'resolution': structure.header.get('resolution', 0.0), 'structure_reference': structure.header.get('structure_reference', []), 'keywords': structure.header.get('keywords', ''), 'author': structure.header.get('author', ''), 'compound': cpd, 'source': src, 'num_models': num_models, 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'num_het_atoms': structure.header.get('num_het_atoms', 0), 'num_water_atoms': structure.header.get('num_water_atoms', 0), 'num_disordered_atoms': structure.header.get('num_disordered_atoms', 0), 'num_disordered_residues': structure.header.get('num_disordered_residues', 0), 'pdb_handle': hd, 'mmcif_handle': hd, 'xml_handle': hd, 'proteins': protein_data } else: mmcif_data = {} logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' 
) finally: return mmcif_data, pp_no, params def _match_features(self, params, protein_data): """ _match_features: match the protein_translation in feature_id with chain sequences in protein_data and compute the seq_identity and determine the exact_match example (in appdev): genome_obj = '57196/6/1', genome_name = 'Synthetic_bacterium_JCVI_Syn3.0_genome' feature_id = 'JCVISYN3_0004_CDS_1', feature_type = 'CDS' OR feature_id = 'JCVISYN3_0004', feature_type = 'gene' """ pdb_info = params.get('pdb_info', None) if pdb_info: kb_feature_type = '' kb_feature_seq = '' genome_name = pdb_info['genome_name'] narr_id = pdb_info['narrative_id'] feature_id = pdb_info['feature_id'] logging.info( f"Looking up for feature {feature_id} in genome {genome_name}'s features" ) # 1. Get the genome's features and reference (gn_ref, kb_genome_features) = self._get_genome_ref_features( narr_id, genome_name) if not gn_ref: logging.info( f"Given genome {genome_name} does not exist in workspace {narr_id}!" ) return protein_data, params pdb_info['genome_ref'] = gn_ref # 2. Match the genome features with the specified feature_id to obtain feature sequence for feat in kb_genome_features: if feat['id'] == feature_id: logging.info( f'Found genome feature match for {feature_id}') kb_feature_type = self._get_feature_type(feat) kb_feature_seq = feat.get('protein_translation', '') break pdb_info['feature_type'] = kb_feature_type # 3. 
Call self._compute_sequence_identity with the feature sequence and the the pdb # proteins' translations to to get the seq_identity and exact_match if kb_feature_seq: logging.info( f"Finding seq_identity and exact_match for feature {feature_id}" f" in genome {genome_name}'s features...") pdb_chain_ids = [] pdb_model_ids = [] pdb_seq_idens = [] pdb_exact_matches = [] for prot in protein_data: seq_idens, seq_mats = self._compute_sequence_identity( kb_feature_seq, prot.get('sequence', '')) if seq_idens: seq_idens.sort() max_iden = seq_idens.pop() if max_iden >= self.B_IDENTITY_THRESH: # get the good matches prot['seq_identity'] = max_iden prot['exact_match'] = 1 if max_iden > 0.99 else 0 prot['genome_ref'] = gn_ref prot['feature_id'] = feature_id prot['feature_type'] = kb_feature_type pdb_chain_ids.append(prot['chain_id']) pdb_model_ids.append(str(prot['model_id'])) pdb_seq_idens.append(str(prot['seq_identity'])) pdb_exact_matches.append(str(prot['exact_match'])) if pdb_seq_idens: pdb_info['sequence_identities'] = ','.join(pdb_seq_idens) if pdb_chain_ids: pdb_info['chain_ids'] = ','.join(pdb_chain_ids) if pdb_model_ids: pdb_info['model_ids'] = ','.join(pdb_model_ids) if pdb_exact_matches: pdb_info['exact_matches'] = ','.join(pdb_exact_matches) else: logging.info( f'Found NO feature in genome that matches with {feature_id}' ) else: logging.info( 'NO KBase genome/feature object info were given for uploading') return protein_data, params def _compute_sequence_identity(self, seq1, seq2): """ _compute_sequence_identity: Given two input sequences, do a blast identity check and then compute and return the matching percentage. 
""" # Create two sequence files Seq1 = SeqRecord(Seq(seq1), id="query_seq") Seq2 = SeqRecord(Seq(seq2), id="subject_seq") blast_dir = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(blast_dir) query_seq = os.path.join(blast_dir, 'seq_qry.fasta') subject_seq = os.path.join(blast_dir, 'seq_sbj.fasta') SeqIO.write(Seq1, query_seq, "fasta") SeqIO.write(Seq2, subject_seq, "fasta") # on my laptop: blastp_path = '/Users/qzhang/miniconda3/bin/blastp' blastp_path = 'blastp' output_file_path = os.path.join(blast_dir, 'blast_output.xml') # Build the BLASTp command blastp_cmd = [blastp_path] blastp_cmd.append('-out') blastp_cmd.append(output_file_path) blastp_cmd.append('-outfmt') blastp_cmd.append('5') blastp_cmd.append('-query') blastp_cmd.append(query_seq) blastp_cmd.append('-subject') blastp_cmd.append(subject_seq) # Run BLASTp and parse the output as XML and then parse the xml file for identity matches exact_matches = [] idens = [] try: p = subprocess.Popen(blastp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) output, errors = p.communicate() if not output: logging.info(f'BLASTp returned: {p.returncode}') logging.info(f'OK> output: {output}') if errors: e = subprocess.CalledProcessError(p.returncode, blastp_cmd, output=output) raise e except OSError as e: logging.info(f'OSError > {e.errno}') logging.info(f'OSError > {e.strerror}') logging.info(f'OSError > {e.filename}') except subprocess.CalledProcessError as e: logging.info(f'CalledError > {e.returncode}') logging.info(f'CalledError > {e.output}') except: logging.info(f'Unexpected error > {sys.exc_info()[0]}') else: with open(output_file_path) as blast_fhd: blast_record = NCBIXML.read(blast_fhd) if blast_record: logging.info(f'query: {blast_record.query[:100]}') for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect < self.E_VALUE_THRESH: logging.info('****Alignment****') logging.info(f'sequence: {alignment.title}') logging.info(f'length: 
{alignment.length}') logging.info(f'e value: {hsp.expect}') logging.info(f'hsp query: {hsp.query}') logging.info(f'hsp match: {hsp.match}') logging.info(f'hsp subject: {hsp.sbjct}') logging.info( f'hsp identities: {hsp.identities}') logging.info(f'hsp positives: {hsp.positives}') iden = round(hsp.identities / hsp.positives, 6) logging.info(f'identity={iden}') idens.append(iden) if hsp.positives == hsp.identities: exact_matches.append(alignment.title[:100]) return idens, exact_matches def _get_genome_ref_features(self, narr_id, genome_name): """ _get_genome_ref_features: Get the genome reference and features for genome_name """ genome_ref = '' genome_features = [] (genome_info, genome_data) = self._get_object_info_data(narr_id, genome_name) if genome_info and genome_data: genome_ref = '/'.join( [str(narr_id), str(genome_info[0]), str(genome_info[4])]) genome_features = genome_data['features'] return (genome_ref, genome_features) def _get_feature_type(self, feature_obj): """ _get_feature_type: Get the type for the feature object of given feature_obj """ feat_type = feature_obj.get('type', '') if not feat_type: if feature_obj.get('protein_translation'): feat_type = 'gene' else: feat_type = 'other' return feat_type def _get_object_info_data(self, narr_id, obj_name): """ _get_object_info_data: Get the object info/data with given obj_name in narrative narr_id """ obj_info = None obj_data = None if narr_id and obj_name: try: obj_data_res = self.ws_client.get_objects2( {'objects': [{ 'wsid': narr_id, 'name': obj_name }]})['data'][0] obj_info = obj_data_res['info'] obj_data = obj_data_res['data'] except: logging.info( f'No object with name {obj_name} exists in workspace {narr_id}' ) logging.info( f'Unexpected error occurred while getting object for {obj_name}' ) pass return (obj_info, obj_data) def _get_atoms_from_structure(self, pdb_structure): """ _get_atoms_from_structure: Given a pdb_structure object, parse atoms into a list of atoms and return it """ atom_ids = [] 
num_atoms = 0 my_residues = pdb_structure.get_residues() for r_ele in my_residues: for a_ele in r_ele.get_atoms(): num_atoms += 1 atom_ids.append(a_ele.get_id()) return (num_atoms, atom_ids) def _get_residues_from_structure(self, pdb_structure): """ _get_residues_from_structure: Given a pdb_structure object, parse residues into a list and return it """ res_ids = [] num_res = 0 my_res = pdb_structure.get_residues() for r_ele in my_res: if PDB.is_aa(r_ele): num_res += 1 res_ids.append(r_ele.get_id()) return (num_res, res_ids) def _get_chains_from_structure(self, pdb_structure): """ _get_chains: Given a pdb_structure object, parse chain ids into a list and return it """ chain_ids = [] num_chains = 0 my_chains = pdb_structure.get_chains() for c_ele in my_chains: if (c_ele): num_chains += 1 chain_ids.append(c_ele.get_id()) return (num_chains, chain_ids) def _get_models_from_structure(self, pdb_structure): """ _get_models_from_structure: Given a pdb_structure object, parse model ids into a list and return it """ model_ids = [] num_models = 0 my_models = pdb_structure.get_models() for m_ele in my_models: if (m_ele): num_models += 1 model_ids.append(m_ele.get_id()) return (num_models, model_ids) def _get_compound_source(self, structure): """ _get_compound_source: Parse data from given structure for compound and source """ cpd_dict = dict() cpd = structure.header.get('compound', {}) # logging.info(f'Compound:\n {cpd}') if cpd and cpd.get('1'): cpd_dict = cpd.get('1') src_dict = dict() src = structure.header.get('source', {}) # logging.info(f'Source:\n {src}') if src and src.get('1'): src_dict = src.get('1') return (cpd_dict, src_dict) def _get_proteins_by_structure(self, pdb_structure, model, file_path): """ _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data """ ppb = PPBuilder() protein_data = [] # Parse for the chain_id and chain sequence for c_ele in pdb_structure.get_chains(): if (c_ele): c_ppd_list = [] for c_ppd in 
ppb.build_peptides(c_ele): c_pp_seq = str(c_ppd.get_sequence()) c_ppd_list.append(c_pp_seq) c_seq = ''.join(c_ppd_list) protein_data.append({ 'id': os.path.basename(file_path), 'model_id': model, 'chain_id': c_ele.get_id(), 'sequence': c_seq, 'md5': hashlib.md5(c_seq.encode()).hexdigest() }) return protein_data def _validate_file(self, file_path): """ _validate_file: Check if file_path is accessable, if yes, return the handle """ try: fh = open(file_path, 'r') except IOError as e: if e.errno == errno.ENOENT: # No such file or directory raise ValueError(f'"{file_path}" does not exist!') elif e.errno == errno.EACCES: # Permission denied raise ValueError(f'"{file_path}" cannot be read!') else: raise ValueError(f'"{e.strerror}" error occurred') else: fh.close() return True def _dfu_get_objects(self, obj_ref): """ _dfu_get_objects: call dfu.get_objects to return object data and info """ obj = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0] return obj['data'], obj['info'] def _get_pdb_shock_id(self, obj_ref): """ _get_pdb_shock_id: Return the shock id for the PDB file """ obj_data, obj_info = self._dfu_get_objects(obj_ref) return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id'] def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info(f'Start uploading file to shock: {file_path}') file_to_shock_params = { 'file_path': file_path, 'pack': 'gzip', 'make_handle': True, } shock_id = self.dfu.file_to_shock( file_to_shock_params)['handle']['hid'] return shock_id def _generate_report_html(self, pdb_name, pdb_path): """ _generate_report_html: generates the HTML for the upload report """ html_report = list() # Make report directory and copy over files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) result_file_path = os.path.join(output_directory, 'viewer.html') new_pdb_path = os.path.join(output_directory, os.path.basename(pdb_path)) 
shutil.copy(pdb_path, new_pdb_path) # Fill in template HTML with open( os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html')) as report_template_file: report_template = report_template_file.read()\ .replace('*PDB_NAME*', pdb_name)\ .replace('*PDB_PATH*', os.path.basename(pdb_path)) with open(result_file_path, 'w') as result_file: result_file.write(report_template) html_report.append({ 'path': output_directory, 'name': os.path.basename(result_file_path), 'description': 'HTML report for PDB upload' }) return html_report def _generate_report(self, method_name, pdb_obj_ref, workspace_name, n_poly_pep, pdb_name, pdb_path): """ _generate_report: generate summary report for upload """ output_html_files = self._generate_report_html(pdb_name, pdb_path) report_params = { 'message': f'You uploaded a PDB file. {n_poly_pep} polypeptides detected.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': pdb_obj_ref, 'description': 'Imported PDB' }], 'workspace_name': workspace_name, 'report_object_name': method_name + '_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _validate_batch_import_pdbs_params(self, params): """ _validate_batch_import_pdbs_params: validates params passed to batch_import_pdbs method """ # check for required parameters for p in [ 'structures_name', 'workspace_name', 'metadata_staging_file_path' ]: if p not in params: raise ValueError(f'"{p}" parameter is required, but missing') # metadata_staging_file_path must be from the staging area--must have the staging dir prefix if params.get('metadata_staging_file_path', None): staging_file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('metadata_staging_file_path') }).get('copy_file_path') return 
(staging_file_path, params['workspace_name'], params['structures_name']) else: error_msg = "Must supply a 'metadata_staging_file_path'" raise ValueError(error_msg) def _read_file_by_type(self, file_path): """ _read_file_by_type: read the file given by file_path depending on its type, return a DataFrame object """ logging.info(f'Reading input from file: {file_path}...') if not self._validate_file(file_path): raise ValueError('Input file is invalid or not found!') df = None file_ext = pathlib.Path(file_path).suffix try: # read the data from file_path depending on its extension if 'csv' in file_ext: df = pd.read_csv(file_path) elif 'tsv' in file_ext: df = pd.read_csv(file_path, '\t') elif 'xls' in file_ext or 'od' in file_ext: # handle xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions df = pd.read_excel(file_path, index_col=None, engine='openpyxl') else: # invalid file type error_msg = "Invalid input file type, only 'csv/tsv/xlsx' are accepted!" raise ValueError(error_msg) # strip off the leading and trailing whitespaces of the column names df.columns = df.columns.str.strip() except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: logging.info( f'Reading file {file_path} errored with message: {e.message} and data: {e.data}' ) raise return df def _parse_metadata_file(self, metadata_file_path, ws_id): """ _parse_metadata_file: From metadata_file_path, a spreadsheet file, sort out the model_pdb_file_paths, exp_pdb_file_paths and the kbase_meta_data return: lists model_pdb_file_paths, exp_pdb_file_paths and dict kbase_meta_data """ logging.info( f'parsing metadata from input file {metadata_file_path}...') required_columns = [ 'Narrative ID', 'Object name (Genome AMA feature set)', 'Feature ID', 'PDB filename', 'Is model', 'From RCSB' ] pdb_file_paths = list() narrative_ids = list() genome_names = list() feature_ids = list() # df_meta_data is a Panda DataFrame object df_meta_data = self._read_file_by_type(metadata_file_path) df_col_list = 
df_meta_data.columns.values.tolist() # check if required columns are read in correctly for col in required_columns: if col not in df_col_list: missing_required = f"Required column '{col}' is missing!" raise ValueError(missing_required) df_indexes = df_meta_data.columns for i in range(len(df_meta_data[df_indexes[0]])): narr_id = int(df_meta_data[df_indexes[0]][i]) if not pd.isna(narr_id): narrative_ids.append(narr_id) else: missing_narr_id = "Please fill all the rows in column 'Narrative ID'!" raise ValueError(missing_narr_id) obj_name = df_meta_data[df_indexes[1]][i] if not pd.isna(obj_name): genome_names.append(obj_name) else: missing_obj_name = "Please fill all the rows in column 'Object name'!" raise ValueError(missing_obj_name) feat_id = df_meta_data[df_indexes[2]][i] if not pd.isna(feat_id): feature_ids.append(feat_id) else: missing_feature_id = f"Please fill all the rows in column '{required_columns[2]}'!" raise ValueError(missing_feature_id) pdb_fn = df_meta_data[df_indexes[3]][ i] # pdb_fn does not have staging dir prefix if pd.isna(pdb_fn): missing_pdb_file = f"Please fill all the rows in column '{required_columns[3]}'!" raise ValueError(missing_pdb_file) (struct_name, ext) = os.path.splitext(os.path.basename(pdb_fn)) from_rcsb = df_meta_data[df_indexes[5]][ i] # pdb file source, default to 'yes' if pd.isna(from_rcsb): from_rcsb = 'yes' is_model = df_meta_data[df_indexes[4]][i] if not pd.isna(is_model): pdb_file_paths.append({ 'file_path': pdb_fn, 'structure_name': struct_name, 'narrative_id': narr_id, 'genome_name': obj_name, 'feature_id': feat_id, 'is_model': 'y' in is_model or 'Y' in is_model, 'from_rcsb': 'y' in from_rcsb or 'Y' in from_rcsb }) else: missing_pdb_md = f"Please fill all the rows in columns '{required_columns[4]}'!" raise ValueError(missing_pdb_md) if not pdb_file_paths: error_msg = "No PDB file info is provided!" 
raise ValueError(error_msg) return (pdb_file_paths, narrative_ids, genome_names, feature_ids) def _generate_batch_report(self, workspace_name, structs_ref, structs_name, pdb_infos, failed_pdbs): """ _generate_batch_report: generate summary report for upload """ output_html_files = self._generate_batch_report_html( structs_name, pdb_infos) description = ( f'Imported PDBs into a ProteinStructures object "{structs_ref}", ' f'named "{structs_name}".') if failed_pdbs: failed_files = ','.join(failed_pdbs) description += f' These files "{failed_files}" failed to load.' report_params = { 'message': f'You have uploaded a batch of PDB files into {structs_name}.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': structs_ref, 'description': description }], 'workspace_name': workspace_name, 'report_object_name': 'batch_import_pdb_files_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _write_pdb_htmls(self, output_dir, succ_pdb_infos): """ _write_pdb_htmls: write the batch pdb info as a jQuery DataTable into HTML files """ pdb_html = '' srv_domain = urlparse( self.shock_url).netloc # parse url to get the domain portion srv_base_url = f'https://{srv_domain}' logging.info(f'Get the url for building the anchors: {srv_base_url}') dir_name = os.path.dirname(__file__) molstar_html_file = os.path.join(dir_name, 'templates', 'molstar_viewer.html') molstar_js_file = os.path.join(dir_name, 'templates', 'molstar.js') molstar_css_file = os.path.join(dir_name, 'templates', 'molstar.css') shutil.copy(molstar_html_file, os.path.join(output_dir, 'molstar_viewer.html')) shutil.copy(molstar_js_file, os.path.join(output_dir, 'molstar.js')) shutil.copy(molstar_css_file, os.path.join(output_dir, 'molstar.css')) for succ_pdb in 
succ_pdb_infos: row_html = '<tr>' file_path = succ_pdb['file_path'] pdb_file_path = succ_pdb[ 'scratch_path'] # This is the scratch path for this pdb file new_pdb_path = os.path.join(output_dir, os.path.basename(file_path)) shutil.copy(pdb_file_path, new_pdb_path) struct_nm = succ_pdb['structure_name'].upper() genome_name = succ_pdb['genome_name'] genome_ref = succ_pdb['genome_ref'] feat_id = succ_pdb['feature_id'] feat_type = succ_pdb['feature_type'] src_rcsb = succ_pdb['from_rcsb'] pdb_chains = [] pdb_models = [] seq_idens = [] if succ_pdb.get('chain_ids', None): pdb_chains = succ_pdb['chain_ids'].split() if succ_pdb.get('model_ids', None): pdb_models = succ_pdb['model_ids'].split() if succ_pdb.get('sequence_identities', None): seq_idens = succ_pdb['sequence_identities'].split() if src_rcsb: row_html += ( f'<td>{struct_nm}<a href="https://www.rcsb.org/3d-view/{struct_nm}"' f' target="_blank"> RCSB Structure</a></td>') else: row_html += (f'<td>{struct_nm}<a href="./molstar_viewer.html"' f' or <a href="molstar_viewer.html"' f' target="_blank"> MolStar Viewer</a></td>') row_html += (f'<td><a href="{srv_base_url}/#dataview/{genome_ref}"' f' target="_blank">{genome_name}</a></td>' f'<td>{feat_id}</td><td>{feat_type}</td>') row_html += f'<td>{pdb_models}</td>' row_html += f'<td>{pdb_chains}</td>' row_html += f'<td>{seq_idens}</td>' row_html += '</tr>' pdb_html += row_html return pdb_html def _generate_batch_report_html(self, prot_structs_name, succ_pdb_infos): """ _generate_batch_report_html: generates the HTML for the upload report """ html_report = list() # Make report directory and copy over uploaded pdb files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) # Create the template html file for reporting batch-uploaded pdb files batch_html_report_path = os.path.join(output_directory, 'batch_pdb_viewer.html') pdb_html = self._write_pdb_htmls(output_directory, succ_pdb_infos) # Fetch & fill in detailed info into template HTML 
with open( os.path.join( os.path.dirname(__file__), 'templates', 'batch_pdb_template.html')) as batch_template_html: batch_html_report = batch_template_html.read()\ .replace('<!--replace this content-->', pdb_html) with open(batch_html_report_path, 'w') as html_report_file: html_report_file.write(batch_html_report) print( f'Full batch_html_report has been written to {batch_html_report_path}' ) html_report.append({ 'path': output_directory, 'name': os.path.basename(batch_html_report_path), 'description': 'HTML report for PDB upload' }) return html_report def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.user_id = config['USER_ID'] self.dfu = DataFileUtil(self.callback_url) self.hs = AbstractHandle(config['handle-service-url']) self.ws_client = Workspace(config['workspace-url']) self.shock_url = config['shock-url'] def import_model_pdb_file(self, params, create_report=True): """ import_model_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ModelProteinStructure object """ logging.info( f'import_model_pdb_file to a pdb data structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params( params) (data, n_polypeptides, params) = self._model_file_to_data(file_path, params) if not data: logging.info( f'PDB file {file_path} import with "Import ModelProteinStructure" failed!' 
) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(f'Model structure data:{data}') return data, pdb_info def import_experiment_pdb_file(self, params, create_report=True): """ import_experiment_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ExperimentalProteinStructure object """ logging.info( f'import_experiment_pdb_file to a pdb structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, mmcif_name = self._validate_import_pdb_file_params( params) # Parse the experimental pdb file for an experimental data structure (data, n_polypeptides, params) = self._exp_file_to_data(file_path, params) if not data: logging.info( f'Import {file_path} with "Import ExperimentalProteinStructure" failed!' ) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(data) return data, pdb_info def _export_pdb(self, params): """ _export_pdb: return the shock_id of the uploaded pdb object """ if "input_ref" not in params: raise ValueError("'input_ref' not in supplied params") return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def _structure_to_pdb_file(self, params): """ _structure_to_pdb_file: get the file path for the given pdb object """ if "input_ref" not in params: raise ValueError("input_ref not in supplied params") if "destination_dir" not in params: raise ValueError("destination_dir not in supplied params") shock_id = self._get_pdb_shock_id(params['input_ref']) file_path = self.dfu.shock_to_file({ 'shock_id': shock_id, 'file_path': params['destination_dir'], 'unpack': 'uncompress' })['file_path'] return {'file_path': 
file_path} def export_pdb_structures(self, params): """ export_pdb_structures: return the shock_ids of the ProteinStructures object """ if 'input_ref' not in params: raise ValueError("'input_ref' not in supplied params") model_pdbs = [] exp_pdbs = [] # shock_ids = [] for m_pdb in model_pdbs: pass for e_pdb in exp_pdbs: pass return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def batch_import_pdbs(self, params): """ batch_import_pdbs: upload two sets of pdb files and create a KBaseStructure.ProteinStructures object required params: metadata_staging_file_path: a metafile from the user's staging area that must be a subdirectory file path in staging area, e.g., /data/bulk/user_name/metadata_staging_file_path staging_file_subdir_path is metadata_staging_file_path structures_name: name of the ProteinStructures object to be generated workspace_name: workspace name that the protein structure(s) will be saved return: structures_ref: return ProteinStructures object reference report_name: name of generated report (if any) report_ref: report reference (if any) 1. call _validate_batch_import_pdbs_params to validate input params 2. call _parse_metadata to parse for model_pdb_files, exp_pdb_files and kbase_meta_data 3. call import_model_pdb_file on each entry in model_pdb_paths, and call import_experiment_pdb_file on each entry in exp_pdb_paths 4. assemble the data for a ProteinStructures and save the data object 5. 
call _generate_batch_report to generate a report for batch_import_pdbs' result """ (metadata_file_path, workspace_name, structures_name) = self._validate_batch_import_pdbs_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name params['workspace_id'] = workspace_id (pdb_file_paths, narrative_ids, genome_names, feature_ids) = self._parse_metadata_file(metadata_file_path, workspace_id) model_pdb_objects = list() exp_pdb_objects = list() pdb_infos = list() successful_files = list() failed_files = list() protein_structures = dict() total_structures = 0 pdb_params = {} # loop through the list of pdb_file_paths for pdb in pdb_file_paths: pdb_params['pdb_info'] = pdb pdb_params['input_staging_file_path'] = pdb['file_path'] pdb_params['input_file_path'] = None pdb_params['input_shock_id'] = None pdb_params['workspace_name'] = workspace_name pdb_params['structure_name'] = pdb['structure_name'] if pdb['is_model']: model_pdb_data, pdb_info = self.import_model_pdb_file( pdb_params, False) if model_pdb_data: model_pdb_objects.append(model_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) else: exp_pdb_data, pdb_info = self.import_experiment_pdb_file( pdb_params, False) if exp_pdb_data: exp_pdb_objects.append(exp_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) if not model_pdb_objects: logging.info("No model pdb structure was created/saved!") return {} protein_structures['model_structures'] = model_pdb_objects protein_structures['experimental_structures'] = exp_pdb_objects protein_structures['total_structures'] = total_structures protein_structures['description'] = ( f'Created {total_structures} ' f'structures in {structures_name}') logging.info( f'ProteinStructures data structure to be 
saved:\n{protein_structures}' ) returnVal = {} try: info = self.dfu.save_objects({ 'id': workspace_id, 'objects': [{ 'type': 'KBaseStructure.ProteinStructures', 'name': structures_name, 'data': protein_structures }] })[0] except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: err_msg = f'DFU.save_objects errored with message: {e.message} and data: {e.data}' logging.info(err_msg) raise ValueError(err_msg) else: structs_ref = f"{info[6]}/{info[0]}/{info[4]}" returnVal = {'structures_ref': structs_ref} report_output = self._generate_batch_report( workspace_name, structs_ref, structures_name, pdb_infos, failed_files) returnVal.update(report_output) finally: return returnVal
class MotifParser:
    """Dispatches a motif file (from shock, an ftp url, or a local path) to the
    parser matching its declared format."""

    def __init__(self, config):
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.Homer = Homer(config)
        self.Gibbs = Gibbs(config)
        self.MEME = MEME(config)
        self.MFMD = MFMD(config)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def get_motif_format(self, format):
        """Return the parser object for `format`.

        Returns None for formats that are recognized but not yet supported
        (JASPAR, TRANSFAC); raises KeyError for unknown format names.
        """
        supported_formats = {
            "MEME": self.MEME,
            "JASPAR": None,
            "GIBBS": self.Gibbs,
            "HOMER": self.Homer,
            "TRANSFAC": None,
            "MFMD": self.MFMD,
        }

        return supported_formats[format]

    def parseMotif(self, params):
        """Resolve the motif file location described by params['file'] to a
        local path and delegate to the format-specific parser.

        :param params: dict with 'format' (e.g. 'MEME') and 'file', a dict
            carrying exactly one of 'shock_id', 'ftp_url', or 'path'
        :return: parsed motif data from the format-specific parser
        :raises ValueError: bad file spec; NotImplementedError: unsupported format
        """
        # remove empty keys from file
        removekeys = []
        file = params['file']
        motifformat = params['format']
        for k, v in file.items():
            if v == '' or v is None or v == 'NA':
                removekeys.append(k)
        for k in removekeys:
            file.pop(k)

        if not len(file) == 1:
            raise ValueError('Please input a single file location within the parameters:\n' +
                             'file = {\n' +
                             '\t\'shock_id\': \'SHOCKID\',\n' +
                             '\t\'ftp_url\': \'FTPURL\',\n' +
                             '\t\'path\': \'CONTAINERFILEPATH\'\n' +
                             '}\n\n')

        if 'shock_id' in file:
            # TODO: verify this works with directories and compressed files
            mfile = self.dfu.shock_to_file({
                'shock_id': file['shock_id'],
                'handle_id': '',
                'file_path': self.scratch
            })
            # BUGFIX: DataFileUtil.shock_to_file returns the local path under
            # the key 'file_path' (see other call sites in this file), not
            # 'path' -- the old lookup raised KeyError.
            self.motif_file = mfile['file_path']
        elif 'ftp_url' in file:
            # TODO: verify this works with directories and compressed files
            try:
                parse.urlparse(file['ftp_url'])
            except Exception:
                # BUGFIX: added the missing space ('an' + 'invalid' previously
                # rendered as "aninvalid url").
                raise ValueError('Input parameter motif file is specified as an ftp-url with an ' +
                                 'invalid url: ' + str(file['ftp_url']))

            self.motif_file = request.urlretrieve(file['ftp_url'], self.scratch)[0]
        elif 'path' in file:
            if not os.path.exists(file['path']):
                raise ValueError('The file specified from the input parameter file, does not exists')
            else:
                self.motif_file = file['path']

        motifinfo = self.get_motif_format(motifformat)

        if motifinfo is None:
            raise NotImplementedError(f'Motif format ({motifformat}) is not supported yet')

        return motifinfo.parse(self.motif_file, params)
class GenomeToGenbank(object):
    """Exports a KBase Genome object as a Genbank (.gbff) file, either by
    building the file from the object data or by retrieving a cached copy
    from Shock."""

    def __init__(self, sdk_config):
        self.cfg = sdk_config
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(sdk_config)

    def validate_params(self, params):
        """Raise ValueError unless the required 'genome_ref' key is present."""
        if 'genome_ref' not in params:
            raise ValueError('required "genome_ref" field was not defined')

    def _fetch_checked_genome(self, params):
        """Validate params, fetch the object, and verify it is a Genome.

        Returns the (data, info) pair from GenomeInterface.get_one_genome.
        """
        self.validate_params(params)
        data, info = self.gi.get_one_genome(
            {'objects': [{"ref": params['genome_ref']}]})
        # info[2] is the full workspace type, e.g. 'KBaseGenomes.Genome-1.0'
        core_type = info[2].split(".")[1].split('-')[0]
        if core_type != 'Genome':
            raise ValueError('Object is not a Genome, it is a:' + str(info[2]))
        return data, info

    def export(self, ctx, params):
        """Build a Genbank file from the genome object and return its path."""
        data, info = self._fetch_checked_genome(params)
        log('not cached, building file...')
        out_name = "KBase_derived_" + info[1] + ".gbff"
        result = self.build_genbank_file(data, out_name, params['genome_ref'])
        if result is None:
            raise ValueError('Unable to generate file. Something went wrong')
        result['from_cache'] = 0
        return result

    def export_original_genbank(self, ctx, params):
        """Return the originally-uploaded Genbank file if one is cached,
        otherwise None."""
        data, _ = self._fetch_checked_genome(params)
        log('checking if genbank file is cached...')
        return self.get_genbank_handle(data)

    def get_genbank_handle(self, data):
        """Download the cached Genbank file referenced by 'genbank_handle_ref',
        or return None when no cached handle exists."""
        handle_ref = data.get('genbank_handle_ref')
        if handle_ref is None:
            return None

        log('pulling cached genbank file from Shock: ' + str(handle_ref))
        download = self.dfu.shock_to_file({'handle_id': handle_ref,
                                           'file_path': self.cfg.sharedFolder,
                                           'unpack': 'unpack'})
        return {'genbank_file': {'file_path': download['file_path']}}

    def build_genbank_file(self, genome_data, output_filename, genome_ref):
        """Write the genome data to a Genbank file in the shared folder."""
        writer = GenomeFile(self.cfg, genome_data, genome_ref)
        target_path = self.cfg.sharedFolder + "/" + output_filename
        writer.write_genbank_file(target_path)
        return {'genbank_file': {'file_path': target_path}}
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

    This module is intended for use by Aligners and Assemblers to upload and
    download alignment files. The alignment may be uploaded as a sam or bam
    file. If a sam file is given, it is converted to the sorted bam format
    and saved. Upon downloading, optional parameters may be provided to get
    files in sam and bai formats from the downloaded bam file. This utility
    also generates stats from the stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.3.6"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725"

    #BEGIN_CLASS_HEADER

    # Keys used in input parameter dicts throughout this module
    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    # NOTE(review): these regexes are compiled but not referenced anywhere in
    # this class as shown -- presumably used by code outside this chunk.
    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and
        file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)
        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path; a no-op if the directory
        already exists or path is empty
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate an existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        (present and truthy); raises ValueError on the first missing one
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them.

        The destination ref has the form 'ws_name_or_id/obj_name_or_id';
        os.path.split is used to separate the two parts. The workspace name
        is translated to its numeric id via DataFileUtil.
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                # surface only the first sentence of the DFU error message
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):
        """Return the workspace object_info tuple for the given reference."""
        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params:
        required keys, destination ref, existence of the local file, and
        the workspace types of the read library and assembly/genome refs.
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        # info[2] is the full workspace type string, e.g. 'KBaseFile.PairedEndLibrary-2.0'
        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')

        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file

        How we compute this stats:

        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`, the second element as `flag`

            if the last bit (0x1) of flag is `1`:
                we treat this segment as paired end reads
            otherwise:
                we treat this segment as single end reads

        For single end reads:
            if the 3rd last bit (0x8) of flag is `1`:
                we increment unmapped_reads_count
            else:
                we treat this `reads_id` as mapped

            for all mapped `reads_ids`:
                if it appears only once:
                    we treat this `reads_id` as `singletons`
                else:
                    we treat this `reads_id` as `multiple_alignments`

            lastly, total_reads = unmapped_reads_count + identical mapped `reads_id`

        For paired end reads:
            if the 7th last bit (0x40) of flag is `1`:
                if the 3rd last bit (0x8) of flag is `1`:
                    we increment unmapped_left_reads_count
                else:
                    we treat this `reads_id` as mapped

            if the 8th last bit (0x80) of flag is `1`:
                if the 3rd last bit (0x8) of flag is `1`:
                    we increment unmapped_right_reads_count
                else:
                    we treat this `reads_id` as mapped

            for all mapped `reads_ids`:
                if it appears only once:
                    we treat this `reads_id` as `singletons`
                else:
                    we treat this `reads_id` as `multiple_alignments`

            lastly, total_reads = unmapped_left_reads_count +
                unmapped_right_reads_count + identical mapped `reads_id`
        """
        path, file = os.path.split(bam_file)

        self.__LOGGER.info('Start to generate aligner stats')
        start_time = time.time()

        infile = pysam.AlignmentFile(bam_file, 'r')

        properly_paired = 0
        unmapped_reads_count = 0
        unmapped_left_reads_count = 0
        unmapped_right_reads_count = 0
        mapped_reads_ids = []
        mapped_left_reads_ids = []
        mapped_right_reads_ids = []
        paired = False
        for alignment in infile:
            seg = alignment.to_string().split('\t')
            reads_id = seg[0]
            # left-pad the binary representation of the SAM flag so that
            # negative indexing (flag[-1], flag[-7], ...) is always valid
            flag = "0000000" + "{0:b}".format(int(seg[1]))

            if flag[-1] == '1':
                paired = True

            if paired:
                # process paired end sequence
                if flag[-7] == '1':  # first sequence of a pair
                    if flag[-3] == '1':
                        unmapped_left_reads_count += 1
                    else:
                        mapped_left_reads_ids.append(reads_id)

                if flag[-8] == '1':  # second sequence of a pair
                    if flag[-3] == '1':
                        unmapped_right_reads_count += 1
                    else:
                        mapped_right_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1
            else:
                # process single end sequence
                if flag[-3] == '1':
                    unmapped_reads_count += 1
                else:
                    mapped_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1

        infile.close()

        if paired:
            mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids
            unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count

            # count distinct read ids; ids appearing once are singletons
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons
            total_reads = unmapped_reads_count + mapped_reads_count
            # each properly-paired fragment is counted once per mate above
            properly_paired = properly_paired // 2
        else:
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons
            total_reads = unmapped_reads_count + mapped_reads_count

        try:
            alignment_rate = round(
                float(mapped_reads_count) / total_reads * 100, 3)
        except ZeroDivisionError:
            # empty alignment file: no reads at all
            alignment_rate = 0

        if alignment_rate > 100:
            alignment_rate = 100.0

        elapsed_time = time.time() - start_time
        self.__LOGGER.info('Used: {}'.format(
            time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

        stats_data = {
            "alignment_rate": alignment_rate,
            "mapped_reads": mapped_reads_count,
            "multiple_alignments": multiple_alignments,
            "properly_paired": properly_paired,
            "singletons": singletons,
            "total_reads": total_reads,
            "unmapped_reads": unmapped_reads_count
        }
        return stats_data

    def _validate(self, params):
        """Run SamTools validation on params['file_path']; returns the
        SamTools validate() return code (0 on success)."""
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        Validates a reads alignment file via SamTools/Picard.

        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#ValidateSamFile)
           -> structure: parameter "file_path" of String, parameter "ignore"
           of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        # a return code of 0 from SamTools means the file validated cleanly
        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment.

        If a sam file is given it is converted to a sorted bam before being
        saved. BAM stats are computed as described in _get_aligner_stats.

        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref - object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path - File with the path of the sam or bam file to be
           uploaded. If a sam file is provided, it will be converted to the
           sorted bam format before being saved read_library_ref - workspace
           object ref of the read sample used to make the alignment file
           condition - assembly_or_genome_ref - workspace object ref of
           genome assembly or genome object that was used to build the
           alignment *) -> structure: parameter "destination_ref" of String,
           parameter "file_path" of String, parameter "read_library_ref" of
           String, parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (* Output from
           uploading a reads alignment *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            # convert the sam input to a sorted bam before uploading
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        # NOTE(review): stats are computed from the original file_path, not
        # from the converted/sorted bam_file -- confirm this is intended when
        # a .sam input is supplied.
        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }

        # copy through any optional parameters the caller supplied
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('========= Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqAlignment",
                "data": aligner_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        # object reference is 'workspace_id/object_id/version'
        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)
        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also
        downloads alignment stats *

        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref - object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (* The output of
           the download method. *) -> structure: parameter "destination_dir"
           of String, parameter "stats" of type "AlignmentStats" ->
           structure: parameter "properly_paired" of Long, parameter
           "multiple_alignments" of Long, parameter "singletons" of Long,
           parameter "alignment_rate" of Double, parameter "unmapped_reads"
           of Long, parameter "mapped_reads" of Long, parameter "total_reads"
           of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir; uuid keeps parallel downloads from colliding
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': alignment[0]['data']['file']['id'],
            'file_path': output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        # remove the archive(s); only the extracted contents are returned
        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)

            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
                # create a .bai index next to each downloaded .bam
                bai_file = file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
                # also provide a .sam conversion of each downloaded .bam
                sam_file = file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download
        alignments from shock *

        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref -
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            # translate exportXXX keys into the downloadXXX keys that
            # download_alignment expects
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        """Return the standard KBase SDK module status structure."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class GenomeToGFF:
    """
    Exports a KBase Genome or AnnotatedMetagenomeAssembly object as a
    GFF3/GTF file.

    typedef structure {
        string genome_ref;
        list <string> ref_path_to_genome;
        int is_gtf;
    } GenomeToGFFParams;

    /* from_cache is 1 if the file already exists and was just returned, 0 if
    the file was generated during this call. */

    typedef structure {
        File file_path;
        boolean from_cache;
    } GenomeToGFFResult;

    funcdef genome_to_gff(GenomeToGFFParams params)
        returns (GenomeToGFFResult result) authentication required;
    """

    def __init__(self, sdk_config):
        # sdk_config exposes callbackURL and sharedFolder (see export()).
        self.cfg = sdk_config
        self.dfu = DataFileUtil(self.cfg.callbackURL)
        self.gi = GenomeInterface(sdk_config)
        # feature id -> feature, for CDS/mRNA children that are emitted
        # beneath their parent gene rather than as top-level rows
        self.child_dict = {}
        self.transcript_counter = defaultdict(int)

    def export(self, ctx, params):
        """Entry point: validate params, fetch the object, and build (or
        fetch from cache) the GFF/GTF file. Returns a dict with 'file_path'
        and 'from_cache'."""
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) get genome info
        data, info = self.gi.get_one_genome({'objects': [{"ref": params['genome_ref']}]})

        # 3) make sure the type is valid
        ws_type_name = info[2].split('.')[1].split('-')[0]
        if ws_type_name != 'Genome' and ws_type_name != 'AnnotatedMetagenomeAssembly':
            raise ValueError('Object is not a Genome or an AnnotatedMetagenomeAssembly, it is a:'
                             + str(info[2]))

        is_gtf = params.get('is_gtf', 0)

        target_dir = params.get('target_dir')
        if not target_dir:
            # default to a timestamped directory in the shared scratch area
            target_dir = os.path.join(self.cfg.sharedFolder,
                                      "gff_" + str(int(time.time() * 1000)))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        is_metagenome = 'AnnotatedMetagenomeAssembly' in info[2]

        if is_metagenome:
            # if the type is metagenome, get from shock
            # (metagenome objects carry a pre-built GFF in 'gff_handle_ref')
            result = self.get_gff_handle(data, target_dir)
        else:
            # 4) Build the GFF/GTF file and return it
            result = self.build_gff_file(data, target_dir, info[1],
                                         is_gtf == 1, is_metagenome)
        if result is None:
            raise ValueError('Unable to generate file. Something went wrong')
        result['from_cache'] = int(is_metagenome)
        return result

    def get_gff_handle(self, data, output_dir):
        """Get the gff file directly from the 'gff_handle_ref' field in the object"""
        if not data.get('gff_handle_ref'):
            return None
        print('pulling cached GFF file from Shock: ' + str(data['gff_handle_ref']))
        file_ret = self.dfu.shock_to_file(
            {'handle_id': data['gff_handle_ref'],
             'file_path': output_dir,
             'unpack': 'unpack'})
        return {'file_path': file_ret['file_path']}

    def build_gff_file(self, genome_data, output_dir, output_filename, is_gtf,
                       is_metagenome):
        """Write the features of a (meta)genome object out as GFF3 or GTF.

        Returns {'file_path': <path to written file>}.
        """
        def feature_sort(feat):
            # Sort key: (start position on contig, type priority), so that
            # parents (genes) come before their mRNAs/CDSs at the same start.
            order = ('gene', 'mRNA', 'CDS')
            if feat.get('children'):
                priority = 0
            elif feat['type'] not in order:
                priority = len(order)
            else:
                priority = order.index(feat['type'])
            return get_start(self.get_common_location(
                feat['location'])), priority

        gff_header = ['seqname', 'source', 'type', 'start', 'end', 'score',
                      'strand', 'frame', 'attribute']

        # create the file
        file_ext = ".gtf" if is_gtf else ".gff"
        out_file_path = os.path.join(output_dir, output_filename + file_ext)
        print('Creating file: ' + str(out_file_path))

        if is_metagenome:
            # Metagenome features are stored as a JSON blob in Shock rather
            # than inline on the object.
            json_file_path = os.path.join(output_dir,
                                          output_filename + '_features.json')
            json_res = self.dfu.shock_to_file({
                'handle_id': genome_data['features_handle_ref'],
                'file_path': json_file_path
            })
            with open(json_res['file_path']) as json_fid:
                features = json.load(json_fid)
            features_by_contig = defaultdict(list)
            for feature in features:
                if 'type' not in feature:
                    feature['type'] = 'gene'
                elif feature['type'] == 'CDS' or feature['type'] == 'mRNA':
                    # NOTE(review): reconstructed nesting — in this branch
                    # child features with a parent_gene are registered in
                    # child_dict; confirm against the original indentation.
                    if feature.get('parent_gene'):
                        self.child_dict[feature['id']] = feature
                features_by_contig[feature['location'][0][0]].append(feature)
        else:
            """There is two ways of printing, if a feature has a parent_gene,
            it will be printed breadth first when it's parent gene is printed.
            if not, it needs to be added to the features_by_contig to be printed"""
            # sort every feature in the feat_arrays into a dict by contig
            features_by_contig = defaultdict(list)
            for feature in genome_data['features'] + genome_data.get(
                    'non_coding_features', []):
                # type is not present in new gene array
                if 'type' not in feature:
                    feature['type'] = 'gene'
                features_by_contig[feature['location'][0][0]].append(feature)

            for mrna in genome_data.get('mrnas', []):
                mrna['type'] = 'mRNA'
                if mrna.get('parent_gene'):
                    self.child_dict[mrna['id']] = mrna
                else:
                    features_by_contig[mrna['location'][0][0]].append(mrna)

            for cds in genome_data.get('cdss', []):
                cds['type'] = 'CDS'
                if cds.get('parent_gene') or cds.get('parent_mrna'):
                    self.child_dict[cds['id']] = cds
                else:
                    features_by_contig[cds['location'][0][0]].append(cds)

        file_handle = open(out_file_path, 'w')
        writer = csv.DictWriter(file_handle, gff_header, delimiter="\t",
                                escapechar='\\', quotechar="'")
        for contig in genome_data.get('contig_ids', features_by_contig.keys()):
            file_handle.write("##sequence-region {}\n".format(contig))
            features_by_contig[contig].sort(key=feature_sort)
            for feature in features_by_contig[contig]:
                writer.writerows(self.make_feature_group(feature, is_gtf))

        return {'file_path': out_file_path}

    def make_feature_group(self, feature, is_gtf):
        """Return the list of row dicts for a feature and (recursively) its
        children (mRNAs, CDSs, synthesized exons)."""
        # RNA types make exons if they have compound locations
        if feature['type'] in {'RNA', 'mRNA', 'tRNA', 'rRNA', 'misc_RNA',
                               'transcript'}:
            loc = self.get_common_location(feature['location'])
            lines = [self.make_feature(loc, feature, is_gtf)]
            for i, loc in enumerate(feature['location']):
                # one synthesized exon per location segment
                exon = {'id': "{}_exon_{}".format(feature['id'], i + 1),
                        'parent_gene': feature.get('parent_gene', ""),
                        'parent_mrna': feature['id']}
                lines.append(self.make_feature(loc, exon, is_gtf))
        # other types duplicate the feature
        else:
            lines = [self.make_feature(loc, feature, is_gtf)
                     for loc in feature['location']]

        # if this is a gene with mRNAs, make the mrna (and subfeatures)
        if feature.get('mrnas', False):
            for mrna_id in feature['mrnas']:
                lines += self.make_feature_group(self.child_dict[mrna_id], is_gtf)
        # if no mrnas are present in a gene and there are CDS, make them here
        elif feature.get('cdss', False):
            for cds_id in feature['cdss']:
                lines += self.make_feature_group(self.child_dict[cds_id], is_gtf)
        # if this is a mrna with a child CDS, make it here
        elif feature.get('cds', False):
            lines += self.make_feature_group(self.child_dict[feature['cds']],
                                             is_gtf)
        return lines

    def make_feature(self, location, in_feature, is_gtf):
        """Make a single feature line for the file"""
        try:
            out_feature = {
                'seqname': location[0],
                'source': 'KBase',
                # synthesized exons carry no 'type' key, hence the default
                'type': in_feature.get('type', 'exon'),
                'start': str(get_start(location)),
                'end': str(get_end(location)),
                'score': '.',
                'strand': location[2],
                'frame': '0',
            }
            if is_gtf:
                out_feature['attribute'] = self.gen_gtf_attr(in_feature)
            else:
                out_feature['attribute'] = self.gen_gff_attr(in_feature)
        except Exception as e:
            traceback.print_exc()
            raise Exception(f'Unable to parse {in_feature}:{e}')
        return out_feature

    @staticmethod
    def gen_gtf_attr(feature):
        """Makes the attribute line for a feature in gtf style"""
        if feature.get('type') == 'gene':
            return f'gene_id "{feature["id"]}"; transcript_id ""'

        if "parent" in feature:
            feature['parent_gene'] = feature['parent']

        return (f'gene_id "{feature.get("parent_gene", feature["id"])}"; '
                f'transcript_id "{feature.get("parent_mrna", feature["id"])}"')

    @staticmethod
    def gen_gff_attr(feature):
        """Makes the attribute line for a feature in gff style"""
        def _one_attr(k, val):
            # percent-encode attribute values, leaving ' ', '/' and ':' intact
            return f'{k}={urllib.parse.quote(val, " /:")}'

        # don't add an attribute that could be 0 without refactor
        # NOTE: if both keys are present, 'parent_mrna' overwrites
        # 'parent_gene' as the Parent attribute.
        for key in ('parent_gene', 'parent_mrna'):
            if key in feature:
                feature['parent'] = feature[key]
        attr_keys = (('id', 'ID'), ('parent', 'Parent'), ('note', 'note'))
        attrs = [_one_attr(pair[1], feature[pair[0]])
                 for pair in attr_keys if feature.get(pair[0])]
        attrs.extend([_one_attr('db_xref', '{}:{}'.format(*x))
                      for x in feature.get('db_xrefs', [])])
        attrs.extend([_one_attr(pair[0], pair[1])
                      for pair in feature.get('aliases', [''])
                      if isinstance(pair, list)])
        if feature.get('functional_descriptions'):
            attrs.append(_one_attr('function', ";".join(
                feature['functional_descriptions'])))
        if feature.get('functions'):
            attrs.append(_one_attr('product', ";".join(feature['functions'])))
        elif feature.get('function'):
            attrs.append(_one_attr('product', feature['function']))
        for ont in feature.get('ontology_terms', []):
            attrs.extend([_one_attr(ont.lower(), x)
                          for x in feature['ontology_terms'][ont]])
        if 'inference_data' in feature:
            attrs.extend([_one_attr(
                'inference', ":".join([x[y] for y in
                                       ('category', 'type', 'evidence')
                                       if x[y]]))
                for x in feature['inference_data']])
        if 'trans_splicing' in feature.get('flags', []):
            attrs.append(_one_attr("exception", "trans-splicing"))
        return "; ".join(attrs)

    @staticmethod
    def get_common_location(location_array):
        """Merges a compound location array into an overall location"""
        contig = location_array[0][0]
        strand = location_array[0][2]
        min_pos = min([get_start(loc) for loc in location_array])
        max_pos = max([get_end(loc) for loc in location_array])
        common_length = max_pos - min_pos + 1
        # on the minus strand the location anchor is the maximum coordinate
        common_start = min_pos if strand == '+' else max_pos
        return [contig, common_start, strand, common_length]

    @staticmethod
    def validate_params(params):
        """Raise ValueError if the required 'genome_ref' parameter is absent."""
        if 'genome_ref' not in params:
            raise ValueError('required "genome_ref" field was not defined')
class PDBUtil:
    """Utility for importing PDB protein-structure files into KBase workspace
    objects and exporting them back out via Shock."""

    def _validate_import_pdb_file_params(self, params):
        """
        _validate_import_pdb_file_params:
            validates params passed to import_model_pdb_file and resolves the
            input file from one of three mutually exclusive sources
            (direct path, shock node, staging area), checked in that order.
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file(
                {'shock_id': params['input_shock_id'],
                 'file_path': self.scratch}).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file(
                {'staging_file_subdir_path': params.get('input_staging_file_path')}
            ).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get('structure_name')

    def _file_to_data(self, file_path):
        """Do the PDB conversion"""
        # `parser` (a Bio.PDB parser) and `ppb` (a polypeptide builder) are
        # module-level objects defined elsewhere in this file.
        pdb1 = file_path
        structure = parser.get_structure("test", pdb1)
        model = structure[0]
        chain_no = 0
        res_no = 0
        atom_no = 0
        pp_list = []
        pp_no = 0
        # NOTE(review): loop nesting reconstructed — residues/atoms are
        # counted per model (amino-acid residues only); confirm against the
        # original indentation.
        for model in structure:
            for chain in model:
                chain_no += 1
            for residue in model.get_residues():
                if PDB.is_aa(residue):
                    res_no += 1
                    for atom in residue.get_atoms():
                        atom_no += 1
        for pp in ppb.build_peptides(structure):
            pp_no += 1
            my_seq = pp.get_sequence()
            # += with a string extends pp_list one character at a time
            pp_list += str(my_seq)
        seq = ''.join(pp_list)
        return {
            'name': os.path.basename(file_path),
            'num_chains': chain_no,
            'num_residues': res_no,
            'num_atoms': atom_no,
            'protein': {
                'id': os.path.basename(file_path),
                'sequence': seq,
                'md5': hashlib.md5(seq.encode()).hexdigest()
            },
        }

    def _get_pdb_shock_id(self, obj_ref):
        """Return the shock id for the PDB file"""
        obj_data = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]['data']
        # the object stores a handle id; translate it to the shock node id
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        NOTE: despite the local variable name, this returns the HANDLE id
        ('handle']['hid']), not the raw shock node id.
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['handle']['hid']

        return shock_id

    def _generate_html_report(self, header_str, table_str):
        #TODO: make this work with the PDB viewer
        # Renders templates/viewer_template.html with the given header/table
        # strings, zips the directory into Shock, and returns the html_links
        # entry for a KBaseReport.
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates',
                                   'viewer_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('//HEADER_STR', header_str)
                report_template = report_template.replace('//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Search Matrix App'})

        return html_report

    def _generate_report(self, pdb_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """
        # included as an example. Replace with your own implementation
        # output_html_files = self._generate_html_report(header_str, table_str)

        report_params = {'message': 'You uploaded a PDB file!',
                         #'html_links': output_html_files,
                         #'direct_html_link_index': 0,
                         'objects_created': [{'ref': pdb_obj_ref,
                                              'description': 'Imported PDB'}],
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_pdb_from_staging_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])

    def import_model_pdb_file(self, params):
        """Import a model PDB file, save it as a
        KBaseStructure.ModelProteinStructure object, and generate a report.
        Returns {'structure_obj_ref', 'report_name', 'report_ref'}."""
        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(params)

        # workspace_name may already be a numeric workspace id
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path)
        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        logging.info(data)

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [
                {'type': 'KBaseStructure.ModelProteinStructure',
                 'name': pdb_name,
                 'data': data}]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'structure_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_pdb(self, params):
        """Return the shock id of the stored PDB file for download."""
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def structure_to_pdb_file(self, params):
        """Download the PDB file behind a structure object to
        params['destination_dir'] and return its local path."""
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}
class BiomUtil:
    """Imports amplicon matrices into KBase from BIOM / TSV / FASTA inputs,
    building AttributeMapping objects and an HTML heatmap report."""

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate a pre-existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        """Validate import_matrix_from_biom params and resolve the input file
        group. Returns (biom_file, tsv_file, fasta_file, mode, metadata_keys)
        where mode is 'biom_fasta' or 'tsv_fasta'. Mutates params in place
        (normalized target_subfragment / taxon_calling fields)."""
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_id', 'scale',
                  'amplicon_type', 'sequencing_technology',
                  'sequencing_instrument', 'target_gene', 'target_subfragment',
                  'taxon_calling']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check sequencing_technology and sequencing_instrument matching
        sequencing_technology = params.get('sequencing_technology')
        sequencing_instrument = params.get('sequencing_instrument')
        if sequencing_technology not in SEQ_INSTRUMENTS_MAP:
            raise ValueError('Unexpected sequencing technology: {}'.format(
                sequencing_technology))
        expected_instruments = SEQ_INSTRUMENTS_MAP.get(sequencing_technology)
        if sequencing_instrument not in expected_instruments:
            raise ValueError(
                'Please select sequencing instrument among {} for {}'.format(
                    expected_instruments, sequencing_technology))

        # check target_gene and target_subfragment matching
        target_gene = params.get('target_gene')
        # deduplicate the user-supplied subfragment list
        target_subfragment = list(set(params.get('target_subfragment')))
        params['target_subfragment'] = target_subfragment
        if target_gene not in TARGET_GENE_SUBFRAGMENT_MAP:
            raise ValueError('Unexpected target gene: {}'.format(target_gene))
        expected_subfragments = TARGET_GENE_SUBFRAGMENT_MAP.get(target_gene)
        if not set(target_subfragment) <= set(expected_subfragments):
            raise ValueError(
                'Please select target subfragments among {} for {}'.format(
                    expected_subfragments, target_gene))

        # check taxon_calling
        taxon_calling = params.get('taxon_calling')
        taxon_calling_method = list(
            set(taxon_calling.get('taxon_calling_method')))
        params['taxon_calling_method'] = taxon_calling_method

        if 'denoising' in taxon_calling_method:
            denoise_method = taxon_calling.get('denoise_method')
            sequence_error_cutoff = taxon_calling.get('sequence_error_cutoff')

            if not (denoise_method and sequence_error_cutoff):
                raise ValueError(
                    'Please provide denoise_method and sequence_error_cutoff')

            params['denoise_method'] = denoise_method
            params['sequence_error_cutoff'] = sequence_error_cutoff

        if 'clustering' in taxon_calling_method:
            clustering_method = taxon_calling.get('clustering_method')
            clustering_cutoff = taxon_calling.get('clustering_cutoff')

            if not (clustering_method and clustering_cutoff):
                raise ValueError(
                    'Please provide clustering_method and clustering_cutoff')

            params['clustering_method'] = clustering_method
            params['clustering_cutoff'] = clustering_cutoff

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        # when True, file params are local paths; otherwise staging paths
        input_local_file = params.get('input_local_file', False)

        if params.get('taxonomic_abundance_tsv') and params.get(
                'taxonomic_fasta'):
            tsv_file = params.get('taxonomic_abundance_tsv')
            fasta_file = params.get('taxonomic_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')

            metadata_keys_str = params.get('metadata_keys')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            if not input_local_file:
                biom_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': biom_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path': fasta_file
                }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        else:
            raise ValueError('missing valide file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))

    def _validate_fasta_file(self, df, fasta_file):
        """Check that every matrix row id (df.index) has a matching record in
        the FASTA file; raise ValueError otherwise."""
        logging.info('start validating FASTA file')
        try:
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide valide FASTA file')

        matrix_ids = df.index
        file_ids = fastq_dict.keys()

        unmatched_ids = set(matrix_ids) - set(file_ids)

        if unmatched_ids:
            raise ValueError(
                'FASTA file does not have [{}] OTU id'.format(unmatched_ids))

    def _file_to_amplicon_data(self, biom_file, tsv_file, fasta_file, mode,
                               refs, matrix_name, workspace_id, scale,
                               description, metadata_keys=None):
        """Parse the BIOM or TSV input into the AmpliconMatrix 'data' dict,
        attaching row/col attribute mappings and search attributes."""

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)

            # NOTE(review): reaches into biom.Table private attributes
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata
            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                # sep=None lets pandas sniff the delimiter; re-read with the
                # inferred separator and the first column as the index
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError('Cannot parse file. Please provide valide tsv file')
            else:
                self._validate_fasta_file(df, fasta_file)
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        # split metadata columns out of the numeric matrix
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found some non-float values. Matrix contains only numeric values\n'
                    err_msg += 'Please list any non-numeric column names in Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                df.index = df.index.astype('str')
                df.columns = df.columns.astype('str')
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

                logging.info('start building attribute mapping object')
                amplicon_data.update(
                    self.get_attribute_mapping("row", observation_metadata,
                                               matrix_data, matrix_name, refs,
                                               workspace_id,
                                               metadata_df=metadata_df))
                amplicon_data.update(
                    self.get_attribute_mapping("col", sample_metadata,
                                               matrix_data, matrix_name, refs,
                                               workspace_id))

                amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name,
                              refs, workspace_id, metadata_df=None):
        """Build or validate the AttributeMapping for one axis ('row'/'col').
        Returns a dict with '<axis>_mapping' and possibly
        '<axis>_attributemapping_ref'. Sources, in priority order: sample set
        (col only), caller-supplied mapping ref, BIOM metadata, metadata_df."""
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get('sample_set_ref') and axis == 'col':
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._sample_set_to_attribute_mapping(
                    axis_ids, refs.get('sample_set_ref'), name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif refs.get(f'{axis}_attributemapping_ref'):
            # caller supplied a mapping — verify every matrix id is covered
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]}
            )['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        """Save a KBaseExperiments.AttributeMapping built from a metadata
        DataFrame; parses 'taxonomy' into an extra parsed_user_taxonomy
        attribute. Returns the saved object ref."""
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        metadata_df = metadata_df.astype(str)
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        if 'taxonomy' in attribute_keys:
            data['attributes'].append({
                'attribute': 'parsed_user_taxonomy',
                'source': 'upload'
            })

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()
            if 'taxonomy' in attribute_keys:
                parsed_user_taxonomy = None
                taxonomy_index = attribute_keys.index('taxonomy')
                taxonomy_str = metadata_df.loc[axis_id].tolist()[taxonomy_index]
                parsed_user_taxonomy = self.taxon_util.process_taxonomic_str(
                    taxonomy_str)
                data['instances'][axis_id].append(parsed_user_taxonomy)

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref,
                                         obj_name, ws_id):
        """Save an AttributeMapping derived from a sample set; all matrix
        column ids must be present in the sample set. Returns the object ref."""
        am_data = self.sampleservice_util.sample_set_to_attribute_mapping(
            sample_set_ref)

        unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
        if unmatched_ids:
            name = "Column"
            raise ValueError(
                f"The following {name} IDs from the uploaded matrix do not match "
                f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                f"\nPlease verify the input data or upload an excel file with a"
                f"{name} mapping tab.")

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": am_data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        """Save an AttributeMapping built from BIOM per-axis metadata.
        Attribute keys are sampled from the first 25 entries only. Returns
        the object ref."""
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_visualization_content(self, output_directory, heatmap_dir,
                                        data_df, top_heatmap_dir, top_percent,
                                        display_count):
        """Build the tab-bar and tab-content HTML fragments (summary stats,
        optional top-N heatmap, full heatmap) for the report template."""

        row_data_summary = data_df.T.describe().round(2).to_string()
        col_data_summary = data_df.describe().round(2).to_string()

        tab_def_content = ''
        tab_content = ''

        # data summary tab (opened by default)
        viewer_name = 'data_summary'
        tab_def_content += '''\n<div class="tab">\n'''
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += ''' id="defaultOpen"'''
        tab_def_content += '''>Matrix Statistics</button>\n'''

        tab_content += '''\n<div id="{}" class="tabcontent" style="overflow:auto">'''.format(
            viewer_name)
        tab_content += '''\n<h5>Amplicon Matrix Size: {} x {}</h5>'''.format(
            len(data_df.index), len(data_df.columns))
        tab_content += '''\n<h5>Row Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(row_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '''\n<br>'''
        tab_content += '''\n<hr style="height:2px;border-width:0;color:gray;background-color:gray">'''
        tab_content += '''\n<br>'''
        tab_content += '''\n<h5>Column Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(col_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '\n</div>\n'

        if top_heatmap_dir:
            viewer_name = 'TopHeatmapViewer'
            tab_def_content += '''\n<button class="tablinks" '''
            tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
                viewer_name)
            tab_def_content += '''>Top {}% ({} Rows) Heatmap</button>\n'''.format(
                round(top_percent, 2), display_count)

            heatmap_report_files = os.listdir(top_heatmap_dir)

            heatmap_index_page = None
            for heatmap_report_file in heatmap_report_files:
                if heatmap_report_file.endswith('.html'):
                    heatmap_index_page = heatmap_report_file

                shutil.copy2(
                    os.path.join(top_heatmap_dir, heatmap_report_file),
                    output_directory)

            if heatmap_index_page:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                msg = 'Top {} percent of matrix sorted by sum of abundance values.'.format(
                    round(top_percent, 2))
                tab_content += '''<p style="color:red;" >{}</p>'''.format(msg)

                tab_content += '\n<iframe height="1300px" width="100%" '
                tab_content += 'src="{}" '.format(heatmap_index_page)
                tab_content += 'style="border:none;"></iframe>'
                tab_content += '\n</div>\n'
            else:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                tab_content += '''\n<p style="color:red;" >'''
                tab_content += '''Heatmap is too large to be displayed.</p>\n'''
                tab_content += '\n</div>\n'

        viewer_name = 'MatrixHeatmapViewer'
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += '''>Matrix Heatmap</button>\n'''

        heatmap_report_files = os.listdir(heatmap_dir)

        heatmap_index_page = None
        for heatmap_report_file in heatmap_report_files:
            if heatmap_report_file.endswith('.html'):
                heatmap_index_page = heatmap_report_file

            shutil.copy2(os.path.join(heatmap_dir, heatmap_report_file),
                         output_directory)

        if heatmap_index_page:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)

            tab_content += '\n<iframe height="1300px" width="100%" '
            tab_content += 'src="{}" '.format(heatmap_index_page)
            tab_content += 'style="border:none;"></iframe>'
            tab_content += '\n</div>\n'
        else:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)
            tab_content += '''\n<p style="color:red;" >'''
            tab_content += '''Heatmap is too large to be displayed.</p>\n'''
            tab_content += '\n</div>\n'

        tab_def_content += '\n</div>\n'

        return tab_def_content + tab_content

    def _generate_heatmap_html_report(self, data):
        """Render the matrix as heatmap HTML pages (clustered only for small
        matrices), zip them into Shock, and return the html_links list for a
        KBaseReport."""
        logging.info('Start generating heatmap report page')

        data_df = pd.DataFrame(data['values'],
                               index=data['row_ids'],
                               columns=data['col_ids'])
        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        tsv_file_path = os.path.join(
            result_directory, 'heatmap_data_{}.tsv'.format(str(uuid.uuid4())))
        data_df.to_csv(tsv_file_path)

        if data_df.index.size < 10000:
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path': tsv_file_path,
                'cluster_data': True
            })['html_dir']
        else:
            logging.info(
                'Original matrix is too large. Skip clustering data in report.'
            )
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path': tsv_file_path,
                'cluster_data': False
            })['html_dir']

        top_heatmap_dir = None
        top_percent = 100
        display_count = 200  # roughly count for display items
        if len(data_df.index) > 1000:
            top_percent = min(display_count / data_df.index.size * 100, 100)
            top_heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path': tsv_file_path,
                'sort_by_sum': True,
                'top_percent': top_percent
            })['html_dir']

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info(
            'Start generating html report in {}'.format(output_directory))

        html_report = list()

        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'matrix_viewer_report.html')

        visualization_content = self._generate_visualization_content(
            output_directory, heatmap_dir, data_df, top_heatmap_dir,
            top_percent, display_count)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'matrix_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Import Amplicon Matrix App'
        })

        return html_report

    def _generate_report(self, matrix_obj_ref, new_row_attr_ref,
                         new_col_attr_ref, workspace_id, data=None):
        """
        _generate_report: generate summary report
        """
        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref': new_row_attr_ref,
                'description': 'Imported Amplicons(Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref': new_col_attr_ref,
                'description': 'Imported Samples(Column) Attribute Mapping'
            })

        if data:
            # include the heatmap HTML report only when matrix data is given
            output_html_files = self._generate_heatmap_html_report(data)

            report_params = {
                'message': '',
                'objects_created': objects_created,
                'workspace_id': workspace_id,
                'html_links': output_html_files,
                'direct_html_link_index': 0,
                'html_window_height': 1400,
                'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
            }
        else:
            report_params = {
                'message': '',
                'objects_created': objects_created,
                'workspace_id': workspace_id,
                'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
            }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.report_util = kb_GenericsReport(self.callback_url)
        self.data_util = DataUtil(config)
        self.sampleservice_util = SampleServiceUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.taxon_util = TaxonUtil(config)
        # bare type names (e.g. 'AmpliconMatrix') of supported generic types
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.taxon_cache = dict()

    def fetch_sequence(self, matrix_ref):
        """Write the consensus sequences referenced by an AmpliconMatrix
        (via its sequencing_file_handle) to a FASTA file, keeping only the
        matrix row ids, and return the file path."""
        logging.info('start to fetch consensus sequence')

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        matrix_name = input_matrix_info[1]
        matrix_type = input_matrix_info[2]
        matrix_data = input_matrix_obj['data']

        if 'KBaseMatrices.AmpliconMatrix' not in matrix_type:
            raise ValueError('Unexpected data type: {}'.format(matrix_type))

        handle = matrix_data.get('sequencing_file_handle')
        if not handle:
            raise ValueError(
                'Missing sequencing_file_handle from the matrix object')

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info('Start generating consensus sequence file in {}'.format(
            output_directory))
        self._mkdir_p(output_directory)

        matrix_fasta_file = self.dfu.shock_to_file({
            'handle_id': handle,
            'file_path': self.scratch
        }).get('file_path')

        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(matrix_fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide valide FASTA file')

        row_ids = matrix_data['data']['row_ids']

        fasta_file_path = os.path.join(
            output_directory, matrix_name + 'consensus_sequence.fasta')

        with open(fasta_file_path, 'w') as f:
            for row_id in row_ids:
                consensus_sequence = str(fastq_dict.get(row_id).seq)
                f.write('>' + str(row_id) + '\n')
                f.write(consensus_sequence + '\n')

        return fasta_file_path

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_id: workspace id matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """
        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_id = params.get('workspace_id')
        matrix_name = params.get('matrix_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        # NOTE(review): this method continues in the next chunk of the file;
        # the final token below deliberately ends mid-statement.
        amplicon_data 
= self._file_to_amplicon_data(biom_file, tsv_file, fasta_file, mode, refs, matrix_name, workspace_id, scale, description, metadata_keys) for key in [ 'amplicon_type', 'amplification', 'extraction', 'target_gene', 'target_subfragment', 'pcr_primers', 'library_kit', 'library_layout', 'library_screening_strategy', 'sequencing_center', 'sequencing_date', 'sequencing_technology', 'sequencing_instrument', 'sequencing_quality_filter_cutoff', 'read_length_cutoff', 'read_pairing', 'barcode_error_rate', 'chimera_detection_and_removal', 'taxon_calling_method', 'denoise_method', 'sequence_error_cutoff', 'clustering_method', 'clustering_cutoff', 'sample_set_ref', 'reads_set_ref' ]: if params.get(key): amplicon_data[key] = params[key] new_row_attr_ref = None if not params.get('row_attributemapping_ref'): new_row_attr_ref = amplicon_data.get('row_attributemapping_ref') new_col_attr_ref = None if not params.get('col_attributemapping_ref'): new_col_attr_ref = amplicon_data.get('col_attributemapping_ref') if fasta_file: logging.info( 'start saving consensus sequence file to shock: {}'.format( fasta_file)) handle_id = self.dfu.file_to_shock({ 'file_path': fasta_file, 'make_handle': True })['handle']['hid'] amplicon_data['sequencing_file_handle'] = handle_id logging.info('start saving Matrix object: {}'.format(matrix_name)) matrix_obj_ref = self.data_util.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': amplicon_data, 'workspace_id': workspace_id })['obj_ref'] if params.get('sample_set_ref'): self.matrix_util._link_matrix_to_samples(matrix_obj_ref, amplicon_data, params['sample_set_ref']) returnVal = {'matrix_obj_ref': matrix_obj_ref} report_output = self._generate_report(matrix_obj_ref, new_row_attr_ref, new_col_attr_ref, workspace_id, data=amplicon_data['data']) returnVal.update(report_output) return returnVal
class FastaToAssembly:
    """Imports a FASTA file (local, Shock node, or FTP URL) as a
    KBaseGenomeAnnotations.Assembly workspace object."""

    def __init__(self, callback_url, scratch, ws_url):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        # characters that only occur in protein sequences; used to give a
        # friendlier error when an amino-acid FASTA is supplied by mistake
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        """Validate params, stage and optionally length-filter the FASTA,
        parse the contigs, upload the file to Shock and save the Assembly
        object. Returns the saved object's info tuple."""
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        # fixed message: space after the comma was missing
        print(f' - parsed {assembly_data["num_contigs"]} contigs, {assembly_data["dna_size"]} bp')
        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        # debug copy of the object; 'with' closes the handle (the original
        # passed an unclosed open() to json.dump and leaked it)
        with open(self.scratch + "/example.json", 'w') as debug_json:
            json.dump(assembly_object_to_save, debug_json)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        # (removed a no-op self-assignment of fasta_file_handle_info['handle'])
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            # resolve the supplied reference to an absolute ws/obj/ver ref
            info = self.ws.get_object_info3({'objects': [{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        # optional provenance fields copied straight from params
        for key in ('external_source', 'external_source_id',
                    'external_source_origination_date'):
            if key in params:
                assembly_data[key] = params[key]

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                # description = header text after the id
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                # fixed message: leading space before 'appears' was missing
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {min_contig_length} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        """Save obj_data as a KBaseGenomeAnnotations.Assembly; raises
        ValueError when there are no contigs to save."""
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application specified a file ('
                                 + params['file']['path'] + ') that is missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')

    @staticmethod
    def validate_params(params):
        """Check required fields and that exactly one of 'file', 'shock_id'
        or 'ftp_url' is supplied; raises ValueError on any problem."""
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')