class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.au = AssemblyUtil(config.callbackURL) self.dfu = DataFileUtil(self.cfg.callbackURL) self.gi = GenomeInterface(self.cfg) self.taxon_wsname = self.cfg.raw['taxon-workspace-name'] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() self.version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR', 'start_codon', 'stop_codon', 'region', 'chromosome', 'scaffold') self.spoof_gene_count = 0 self.is_phytozome = False self.strict = True self.generate_genes = False self.warnings = [] self.feature_dict = collections.OrderedDict() self.cdss = set() self.ontologies_present = collections.defaultdict(dict) self.ontology_events = list() self.skiped_features = collections.Counter() self.feature_counts = collections.Counter() def warn(self, message): self.warnings.append(message) def generate_genome_json(self, params): # 1) validate parameters self._validate_import_file_params(params) self.code_table = params.get('genetic_code', 11) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) if params.get('generate_missing_genes'): self.generate_genes = True # 4) do the upload genome = self._gen_genome_json( input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], source=params['source'], source_id=params['source_id'], release=params['release'], ) if params.get('genetic_code'): genome["genetic_code"] = params['genetic_code'] return genome, input_directory def import_file(self, params): genome, input_directory = self.generate_genome_json(params) json.dump(genome, open( "{}/{}.json".format(self.cfg.sharedFolder, genome['id']), 'w'), indent=4) result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params.get('metadata', {}), }) report_string = 'A genome with {} contigs and the following feature ' \ 'types was imported: {}'\ .format(len(genome['contig_ids']), "\n".join( [k + ": " + str(v) for k, v in genome['feature_counts'].items()])) log(report_string) # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info } return details def _gen_genome_json(self, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", source=None, source_id=None, release=None): # reading in GFF file features_by_contig = self._retrieve_gff_file(input_gff_file) contig_ids = set() # parse feature information fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta") for contig in fasta_contigs: molecule_type = str(contig.seq.alphabet).replace( 'IUPACAmbiguous', '').strip('()') contig_ids.add(contig.id) for feature in features_by_contig.get(contig.id, []): self._transform_feature(contig, feature) for cid in set(features_by_contig.keys()) - contig_ids: self.warn("Sequence name {} does not match a sequence id in the " "FASTA file. {} features will not be imported.".format( cid, len(features_by_contig[cid]))) if self.strict: raise ValueError( "Every feature sequence id must match a fasta sequence id") self._process_cdss() # save assembly file assembly_ref = self.au.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file }, 'workspace_name': workspace_name, 'assembly_name': core_genome_name + ".assembly" }) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, source, source_id, assembly_data, input_gff_file, molecule_type) genome['release'] = release if self.spoof_gene_count > 0: genome['warnings'] = genome.get('warnings', []) + \ [warnings['spoofed_genome'].format(self.spoof_gene_count)] genome['suspect'] = 1 return genome @staticmethod def _location(in_feature): in_feature['strand'] = in_feature['strand'].replace( "-1", "-").translate(strand_table) if in_feature['strand'] == '+': start = in_feature['start'] elif in_feature['strand'] == '-': start = in_feature['end'] else: raise ValueError('Invalid feature strand: {}'.format( in_feature['strand'])) return [ in_feature['contig'], start, in_feature['strand'], in_feature['end'] - in_feature['start'] + 1 ] @staticmethod def _validate_import_file_params(params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(list(file.keys())) raise ValueError(error_msg) if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError( "Invalid genetic code specified: {}".format(params)) def _set_parsed_params(self, params): log('Setting params') default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'metadata': {}, 'source_id': 'unknown', } default_params.update(params) log(json.dumps(default_params, indent=1)) return default_params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: log("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = collections.defaultdict(list) is_patric = 0 gff_file_handle = open(input_gff_file) current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if "phytozome" in source_id.lower(): self.is_phytozome = True #Checking to see if Phytozome if "PATRIC" in source_id: is_patric = True #PATRIC prepends their contig ids with some gibberish if is_patric and "|" in contig_id: contig_id = contig_id.split("|", 1)[1] #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': collections.defaultdict(list) } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if not attribute: continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) ftr['attributes'][key.lower()].append( parse.unquote(value.strip('"'))) elif (" " in attribute): key, value = attribute.split(" ", 1) ftr['attributes'][key.lower()].append( parse.unquote(value.strip('"'))) else: pass #log("Warning: attribute "+attribute+" cannot be separated into key,value pair") ftr['attributes']['raw'] = attributes if "id" in ftr['attributes']: ftr['ID'] = ftr['attributes']['id'][0] if "parent" in ftr['attributes']: ftr['Parent'] = ftr['attributes']['parent'][0] feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs #feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if self.is_phytozome: self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) return feature_list def _add_missing_identifiers(self, feature_list): log("Adding missing identifiers") #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list: for i, feat in enumerate(feature_list[contig]): if "ID" not in feature_list[contig][i]: for key in ("transcriptid", "proteinid", "pacid", "parent", "name", 'transcript_id'): if key in feature_list[contig][i]['attributes']: feature_list[contig][i]['ID'] = feature_list[ contig][i]['attributes'][key][0] break if feat['type'] not in self.skip_types: self.feature_counts[feat['type']] += 1 #If the process fails, throw an error if "ID" not in feature_list[contig][i]: feat['ID'] = "{}_{}".format( feat['type'], self.feature_counts[feat['type']]) #log("Warning: Could find unique ID to utilize in GFF attributes: {}. " # "ID '{}' has been assigned".format(feat['attributes'], feat['ID'])) return feature_list def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list: ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ftrs[i]["type"] in self.skip_types: continue if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list @staticmethod def _update_phytozome_features(feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list: feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("id", "pacid"): if (key in feature_list[contig][i]['attributes']): old_id = feature_list[contig][i]['attributes'][key][0] break if (old_id is None): #This should be an error #log("Cannot find unique ID, PACid, or pacid in GFF " # "attributes: " + feature_list[contig][i][contig]) continue #Retain old_id feature_position_dict[old_id] = i # Clip off the increment on CDS IDs so fragments of the same # CDS share the same ID if "CDS" in feature_list[contig][i]["ID"]: feature_list[contig][i]["ID"] = feature_list[contig][i][ "ID"].rsplit('.', 1)[0] #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("name" in feature_list[contig][i]['attributes']): feature_list[contig][i]["ID"] = feature_list[contig][i][ 'attributes']['name'][0] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended mRNA_parent_dict = dict() for contig in feature_list: for ftr in feature_list[contig]: if ftr["type"] in self.skip_types: continue if ("Parent" in ftr): #Retain old_id of parents old_id = ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] return feature_list def _check_location_order(self, locations): """If order looks good return None. If out of order return warning If on multiple strands return warning""" strand = None last_start = 0 for location in locations: if strand == None: strand = location[2] elif strand != location[2]: return warnings["both_strand_coordinates"] if strand == "-": locations = reversed(locations) for location in locations: if last_start > location[1]: return warnings["out_of_order"] else: last_start = location[1] return None def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError( "{} is not a supported ontology".format(ontology_type)) if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = collections.defaultdict(dict) db_xrefs = [] # these are keys are formatted strangely and require special parsing for key in ("go_process", "go_function", "go_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') # CATH terms are not distinct from EC numbers so myst be found by key for term in feature.get('cath_funfam', []) + feature.get('cath', []): for ref in term.split(','): ontology['CATH'][ref] = [self._create_ontology_event("CATH")] self.ontologies_present['CATH'][ref] = self.ont_mappings[ 'CATH'].get(ref, '') search_keys = [ 'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam', 'pfam', 'cog', 'go', 'po', 'ko' ] ont_terms = [] # flatten out into list of values for key in search_keys: if key in feature: ont_terms += [x for y in feature[key] for x in y.split(',')] for ref in ont_terms: if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), db_xrefs def _transform_feature(self, contig, in_feature): """Converts a feature from the gff ftr format into the appropriate format for a genome object """ def _aliases(feat): keys = ('locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'ec_number', 'gene_synonym') alias_list = [] for key in keys: if key in feat['attributes']: alias_list.extend([(key, val) for val in feat['attributes'][key]]) return alias_list if in_feature['start'] < 1 or in_feature['end'] > len(contig): self.warn("Feature with invalid location for specified " "contig: " + str(in_feature)) if self.strict: raise ValueError( "Features must be completely contained within the Contig in the " "Fasta file. Feature: " + str(in_feature)) return feat_seq = contig.seq[in_feature['start'] - 1:in_feature['end']].upper() if in_feature['strand'] in {'-', '-1'}: feat_seq = feat_seq.reverse_complement() # if the feature ID is duplicated (CDS or transpliced gene) we only # need to update the location and dna_sequence if in_feature.get('ID') in self.feature_dict: existing = self.feature_dict[in_feature['ID']] existing['location'].append(self._location(in_feature)) existing['dna_sequence'] = existing.get('dna_sequence', '') + str(feat_seq) existing['dna_sequence_length'] = len(existing['dna_sequence']) return # The following is common to all the feature types out_feat = { "id": in_feature.get('ID'), "type": in_feature['type'], "location": [self._location(in_feature)], "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), } # add optional fields if 'note' in in_feature['attributes']: out_feat['note'] = in_feature['attributes']["note"][0] ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes']) if ont: out_feat['ontology_terms'] = ont aliases = _aliases(in_feature) if aliases: out_feat['aliases'] = aliases if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'product' in in_feature['attributes']: out_feat['functions'] = in_feature['attributes']["product"] if 'product_name' in in_feature['attributes']: if "functions" in out_feat: out_feat['functions'].extend( in_feature['attributes']["product_name"]) else: out_feat['functions'] = in_feature['attributes'][ "product_name"] if 'function' in in_feature['attributes']: out_feat['functional_descriptions'] = in_feature['attributes'][ "function"] if 'inference' in in_feature['attributes']: GenomeUtils.parse_inferences(in_feature['attributes']['inference']) if 'trans-splicing' in in_feature['attributes'].get('exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing'] if 'pseudo' in in_feature['attributes'].get('exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['pseudo'] if 'ribosomal-slippage' in in_feature['attributes'].get( 'exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['ribosomal_slippage'] parent_id = in_feature.get('Parent', '') if parent_id and parent_id not in self.feature_dict: raise ValueError( "Parent ID: {} was not found in feature ID list.".format( parent_id)) # if the feature is a exon or UTR, it will only be used to update the # location and sequence of it's parent, we add the info to it parent # feature but not the feature dict if in_feature['type'] in self.skip_types: if parent_id and in_feature['type'] in { 'exon', 'five_prime_UTR', 'three_prime_UTR' }: parent = self.feature_dict[parent_id] if in_feature['type'] not in parent: parent[in_feature['type']] = [] parent[in_feature['type']].append(out_feat) return # add type specific features elif 'gene' in in_feature['type']: out_feat['protein_translation_length'] = 0 out_feat['cdss'] = [] elif in_feature['type'] == 'CDS': if parent_id: parent = self.feature_dict[parent_id] if 'cdss' in parent: # parent must be a gene if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "genes_CDS_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "CDS_fail_child_of_gene_coordinate_validation"] .format(parent_id) ] parent['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id else: # parent must be mRNA if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["mRNA_fail_parent_coordinate_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "CDS_fail_child_of_mRNA_coordinate_validation"] .format(parent_id) ] parent['cds'] = in_feature['ID'] out_feat['parent_mrna'] = parent_id parent_gene = self.feature_dict[parent['parent_gene']] parent_gene['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent['parent_gene'] # keep track of CDSs for post processing self.cdss.add(out_feat['id']) elif in_feature['type'] == 'mRNA': if parent_id: parent = self.feature_dict[parent_id] if 'mrnas' not in parent: parent['mrnas'] = [] if 'cdss' in parent: # parent must be a gene parent['mrnas'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["genes_mRNA_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings["mRNAs_parent_gene_fails_location_validation"] .format(parent_id) ] else: out_feat["type"] = in_feature['type'] # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] if parent_id: parent = self.feature_dict[parent_id] if 'children' not in parent: parent['children'] = [] parent['children'].append(out_feat['id']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "generic_parents_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "generic_childs_parent_fails_location_validation"]. format(parent_id) ] self.feature_dict[out_feat['id']] = out_feat def _process_cdss(self): """Because CDSs can have multiple fragments, it's necessary to go back over them to calculate a final protein sequence""" for cds_id in self.cdss: cds = self.feature_dict[cds_id] try: prot_seq = str( Seq(cds['dna_sequence']).translate(self.code_table, cds=True).strip("*")) except TranslationError as e: cds['warnings'] = cds.get('warnings', []) + [str(e)] prot_seq = "" cds.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if 'parent_gene' in cds: parent_gene = self.feature_dict[cds['parent_gene']] # no propigation for now propagate_cds_props_to_gene(cds, parent_gene) elif self.generate_genes: spoof = copy.copy(cds) spoof['type'] = 'gene' spoof['id'] = cds['id'] + "_gene" spoof['cdss'] = [cds['id']] spoof['warnings'] = [ warnings['spoofed_gene'].format(cds['id']) ] self.feature_dict[spoof['id']] = spoof cds['parent_gene'] = spoof['id'] self.spoof_gene_count += 1 else: raise ValueError(warnings['no_spoof']) self.feature_dict[cds['id']] = cds def _update_from_exons(self, feature): """This function updates the sequence and location of a feature based on it's UTRs, CDSs and exon information""" # note that start and end here are in direction of translation def start(loc): return loc[0][1] def end(loc): if loc[-1][2] == "+": return loc[-1][1] + loc[-1][3] + 1 else: return loc[-1][1] - loc[-1][3] - 1 if 'exon' in feature: # update the feature with the exon locations and sequences feature['location'] = [x['location'][0] for x in feature['exon']] feature['dna_sequence'] = "".join(x['dna_sequence'] for x in feature['exon']) feature['dna_sequence_length'] = len(feature['dna_sequence']) # construct feature location from utrs and cdss if present elif 'cds' in feature: cds = [copy.deepcopy(self.feature_dict[feature['cds']])] locs = [] seq = "" for frag in feature.get('five_prime_UTR', []) + cds + \ feature.get('three_prime_UTR', []): # merge into last location if adjacent if locs and abs(end(locs) - start(frag['location'])) == 1: # extend the location length by the length of the first # location in the fragment first = frag['location'].pop(0) locs[-1][3] += first[3] locs.extend(frag['location']) seq += frag['dna_sequence'] feature['location'] = locs feature['dna_sequence'] = seq feature['dna_sequence_length'] = len(seq) # remove these properties as they are no longer needed for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']: feature.pop(x, None) else: ValueError('Feature {} must contain either exon or cds data to ' 'construct an accurate location and sequence'.format( feature['id'])) def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, source, source_id, assembly, input_gff_file, molecule_type): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome['molecule_type'] = molecule_type genome["features"] = [] genome["cdss"] = [] genome["mrnas"] = [] genome['non_coding_features'] = [] genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] genome['md5'] = assembly['md5'] genome['contig_ids'], genome['contig_lengths'] = zip( *[(k, v['length']) for k, v in assembly['contigs'].items()]) genome['num_contigs'] = len(assembly['contigs']) genome['ontologies_present'] = dict(self.ontologies_present) genome['ontology_events'] = self.ontology_events genome['taxonomy'], genome['taxon_ref'], genome['domain'], \ genome["genetic_code"] = self.gi.retrieve_taxon(self.taxon_wsname, genome['scientific_name']) genome['source'], genome['genome_tiers'] = self.gi.determine_tier( source) genome['source_id'] = source_id # Phytozome gff files are not compatible with the RNASeq Pipeline # so it's better to build from the object than cache the file if self.is_phytozome: gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid'] for feature in self.feature_dict.values(): self.feature_counts[feature['type']] += 1 if 'exon' in feature or feature['type'] == 'mRNA': self._update_from_exons(feature) # Test if location order is in order. is_transpliced = "flags" in feature and "trans_splicing" in feature[ "flags"] if not is_transpliced and len(feature["location"]) > 1: # Check the order only if not trans_spliced and has more than 1 location. location_warning = self._check_location_order( feature["location"]) if location_warning is not None: feature["warnings"] = feature.get('warnings', []) + [location_warning] contig_len = genome["contig_lengths"][genome["contig_ids"].index( feature["location"][0][0])] feature = check_full_contig_length_or_multi_strand_feature( feature, is_transpliced, contig_len, self.skip_types) # sort features into their respective arrays if feature['type'] == 'CDS': del feature['type'] genome['cdss'].append(feature) elif feature['type'] == 'mRNA': del feature['type'] genome['mrnas'].append(feature) elif feature['type'] == 'gene': # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in feature: feature[key] = list(set(feature[key])) if feature['cdss']: del feature['type'] self.feature_counts["protein_encoding_gene"] += 1 genome['features'].append(feature) else: feature.pop('mrnas', None) feature.pop('cdss', None) feature.pop('protein_translation_length', None) self.feature_counts["non_coding_features"] += 1 genome['non_coding_features'].append(feature) else: genome['non_coding_features'].append(feature) if self.warnings: genome['warnings'] = self.warnings genome['feature_counts'] = dict(self.feature_counts) return genome
class GenbankToGenome: def __init__(self, config): self.cfg = config self.gi = GenomeInterface(config) self.dfu = DataFileUtil(config.callbackURL) self.aUtil = AssemblyUtil(config.callbackURL) self.ws = Workspace(config.workspaceURL) self._messages = [] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() self.version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) self.generate_parents = False self.generate_ids = False self.genes = OrderedDict() self.mrnas = OrderedDict() self.cdss = OrderedDict() self.noncoding = [] self.ontologies_present = defaultdict(dict) self.ontology_events = list() self.skiped_features = Counter() self.feature_counts = Counter() self.orphan_types = Counter() self.contig_seq = {} self.circ_contigs = set() self.features_spaning_zero = set() self.genome_warnings = [] self.genome_suspect = False self.defects = Counter() self.spoofed_genes = 0 self.excluded_features = ('source', 'exon') self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.default_params = { 'source': 'Genbank', 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'], 'taxon_reference': None, 'ontology_wsname': self.cfg.raw['ontology-workspace-name'], 'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'], 'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'], 'release': None, 'genetic_code': 11, 'generate_ids_if_needed': 0, 'metadata': {} } def log(self, message): self._messages.append(message) print('{0:.2f}'.format(time.time()) + ': ' + str(message)) @property def messages(self): return "\n".join(self._messages) def refactored_import(self, ctx, params): # 1) validate parameters and extract defaults self.validate_params(params) # 2) construct the input directory staging area input_directory = self.stage_input(params) # 3) update default params self.default_params.update(params) params = self.default_params self.generate_parents = params.get('generate_missing_genes') self.generate_ids = params.get('generate_ids_if_needed') if params.get('genetic_code'): self.code_table = params['genetic_code'] # 4) Do the upload files = self._find_input_files(input_directory) consolidated_file = self._join_files_skip_empty_lines(files) genome = self.parse_genbank(consolidated_file, params) if params.get('genetic_code'): genome["genetic_code"] = params['genetic_code'] ### # DEBUGGING CODE INSTRUCTIONS TO BE KEPT FOR DETERMINING ISSUE WITH A PARTICULAR FILE # THAT FAILS TYPESPEC CHECKING - THIS ALLOW YOU TO LOOK AT THE JSON BEFORE SAVED: # Turn this on : # 1) uncomment the TWO LINES for json printing lines below the ### # 2) move skip/utility_test/problem_genome_test.py into the test dir # 3) change the file location in the test_problem_genome_for_json test # 4) add your test file to the test/data dir # 5) run kb-sdk test as normal # 6) after test completes go and find the json file at test_local/workdir/tmp/ProblemGenome.json ### #with open(self.cfg.sharedFolder+'/ProblemGenome.json', 'w') as outfile: # json.dump(genome, outfile, indent=4) # json.dump(genome, outfile) result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params['metadata'], }) ref = "{}/{}/{}".format(result['info'][6], result['info'][0], result['info'][4]) self.log("Genome saved to {}".format(ref)) # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] details = {'genome_ref': ref, 'genome_info': info} return details @staticmethod def validate_params(params): if 'workspace_name' not in params: raise ValueError('required "workspace_name" field was not defined') if 'genome_name' not in params: raise ValueError('required "genome_name" field was not defined') if 'file' not in params: raise ValueError('required "file" field was not defined') # one and only one of 'path', 'shock_id', or 'ftp_url' is required file = params['file'] if not isinstance(file, dict): raise ValueError('required "file" field must be a map/dict') n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 if n_valid_fields < 1: raise ValueError('required "file" field must include one source: ' 'path | shock_id | ftp_url') if n_valid_fields > 1: raise ValueError('required "file" field has too many sources ' 'specified: ' + str(list(file.keys()))) if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError( "Invalid genetic code specified: {}".format(params)) def stage_input(self, params): """ Setup the input_directory by fetching the files and uncompressing if needed. """ # construct the input directory where we stage files input_directory = os.path.join( self.cfg.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4())) os.makedirs(input_directory) # at this point, the 'file' input is validated, so we don't have to catch any special cases # we expect one and only one of path, shock_id, or ftp_url # determine how to get the file: if it is from shock, download it. If it # is just sitting there, then use it. Move the file to the staging input directory file = params['file'] genbank_file_path = None if 'path' in file and file['path'] is not None: # copy the local file to the input staging directory # (NOTE: could just move it, but then this method would have the side effect of moving your # file which another SDK module might have an open handle on) local_file_path = file['path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file self.log('Downloading file from SHOCK node: {} - {}'.format( self.cfg.shockURL, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] genbank_file_path = os.path.join(input_directory, file_name) if 'ftp_url' in file and file['ftp_url'] is not None: self.log('Downloading file from: ' + str(file['ftp_url'])) local_file_path = self.dfu.download_web_file({ 'file_url': file['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) # extract the file if it is compressed if genbank_file_path is not None: self.log("staged input file =" + genbank_file_path) self.dfu.unpack_file({'file_path': genbank_file_path}) else: raise ValueError( 'No valid files could be extracted based on the input') return input_directory def parse_genbank(self, file_path, params): self.log("Saving original file to shock") shock_res = self.dfu.file_to_shock({ 'file_path': file_path, 'make_handle': 1, 'pack': 'gzip', }) # Write and save assembly file assembly_ref = self._save_assembly(file_path, params) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] genome = { "id": params['genome_name'], "original_source_file_name": os.path.basename(file_path), "assembly_ref": assembly_ref, "gc_content": assembly_data['gc_content'], "dna_size": assembly_data['dna_size'], "md5": assembly_data['md5'], "genbank_handle_ref": shock_res['handle']['hid'], "publications": set(), "contig_ids": [], "contig_lengths": [], } genome['source'], genome['genome_tiers'] = \ self.gi.determine_tier(params['source']) dates = [] # Parse data from genbank file contigs = Bio.SeqIO.parse(file_path, "genbank") for record in contigs: r_annot = record.annotations self.log("parsing contig: " + record.id) if 'date' in r_annot: dates.append(time.strptime(r_annot['date'], "%d-%b-%Y")) genome['contig_ids'].append(record.id) genome['contig_lengths'].append(len(record)) genome["publications"] |= self._get_pubs(r_annot) organism = r_annot.get('organism', 'Unknown Organism') if 'scientific_name' not in genome: genome['scientific_name'] = organism elif genome['scientific_name'] != organism: warn = "Multiple organism in provided files: {}, {}".format( genome['scientific_name'], organism) genome['warnings'] = genome.get('warnings', []) + [warn] # only do the following once(on the first contig) if "source_id" not in genome: genome["source_id"] = record.id.split('.')[0] genome['taxonomy'], genome['taxon_ref'], genome['domain'], \ genome['genetic_code'] = self.gi.retrieve_taxon( params['taxon_wsname'], genome['scientific_name']) self.code_table = genome['genetic_code'] genome["molecule_type"] = r_annot.get('molecule_type', 'DNA') genome['notes'] = r_annot.get('comment', "").replace('\\n', '\n') self._parse_features(record, params['source']) genome.update(self.get_feature_lists()) genome['num_contigs'] = len(genome['contig_ids']) # add dates dates.sort() if dates: genome['external_source_origination_date'] = time.strftime( "%d-%b-%Y", dates[0]) if dates[0] != dates[-1]: genome['external_source_origination_date'] += " _ " + \ time.strftime("%d-%b-%Y", dates[-1]) if self.ontologies_present: genome['ontologies_present'] = dict(self.ontologies_present) genome["ontology_events"] = self.ontology_events genome['feature_counts'] = dict(self.feature_counts) # can't serialize a set genome['publications'] = list(genome['publications']) if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] / float(len(genome['cdss'])) > 0.02): self.genome_warnings.append( warnings["genome_inc_translation"].format( self.defects['cds_seq_not_matching'], len(genome['cdss']))) self.genome_suspect = 1 if self.defects['bad_parent_loc']: self.genome_warnings.append( "There were {} parent/child " "relationships that were not able to be determined. Some of " "these may have splice variants that may be valid " "relationships.".format(self.defects['bad_parent_loc'])) if self.defects['spoofed_genes']: self.genome_warnings.append(warnings['spoofed_genome'].format( self.defects['spoofed_genes'])) genome['suspect'] = 1 if self.defects['not_trans_spliced']: self.genome_warnings.append( warnings['genome_not_trans_spliced'].format( self.defects['not_trans_spliced'])) genome['suspect'] = 1 if self.genome_warnings: genome['warnings'] = self.genome_warnings if self.genome_suspect: genome['suspect'] = 1 self.log("Feature Counts: {}".format(genome['feature_counts'])) return genome def _save_assembly(self, genbank_file, params): """Convert genbank file to fasta and sve as assembly""" contigs = Bio.SeqIO.parse(genbank_file, "genbank") assembly_id = "{}_assembly".format(params['genome_name']) fasta_file = "{}/{}_assembly.fasta".format(self.cfg.sharedFolder, params['genome_name'], self.time_string) out_contigs = [] extra_info = defaultdict(dict) for in_contig in contigs: if in_contig.annotations.get('topology', "") == 'circular': extra_info[in_contig.id]['is_circ'] = 1 self.circ_contigs.add(in_contig.id) elif in_contig.annotations.get('topology', "") == 'linear': extra_info[in_contig.id]['is_circ'] = 0 out_contigs.append(in_contig) self.contig_seq[in_contig.id] = in_contig.seq.upper() assembly_ref = params.get("use_existing_assembly") if assembly_ref: if not re.match("\d+\/\d+\/\d+", assembly_ref): raise ValueError("Assembly ref: {} is not a valid format. Must" " be in numerical <ws>/<object>/<version>" " format.".format(assembly_ref)) ret = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]: raise ValueError("{} is not a reference to an assembly".format( assembly_ref)) unmatched_ids = list() unmatched_ids_md5s = list() for current_contig in self.contig_seq.keys(): current_contig_md5 = hashlib.md5( str(self.contig_seq[current_contig]).encode( 'utf8')).hexdigest() if current_contig in ret['data']['contigs']: if current_contig_md5 != ret['data']['contigs'][ current_contig]['md5']: unmatched_ids_md5s.append(current_contig) else: unmatched_ids.append(current_contig) if len(unmatched_ids) > 0: raise ValueError(warnings['assembly_ref_extra_contigs'].format( ", ".join(unmatched_ids))) if len(unmatched_ids_md5s) > 0: raise ValueError(warnings["assembly_ref_diff_seq"].format( ", ".join(unmatched_ids_md5s))) self.log("Using supplied assembly: {}".format(assembly_ref)) return assembly_ref self.log("Saving sequence as Assembly object") Bio.SeqIO.write(out_contigs, fasta_file, "fasta") assembly_ref = self.aUtil.save_assembly_from_fasta({ 'file': { 'path': fasta_file }, 'workspace_name': params['workspace_name'], 'assembly_name': assembly_id, 'contig_info': extra_info }) self.log("Assembly saved to {}".format(assembly_ref)) return assembly_ref def _find_input_files(self, input_directory): self.log("Scanning for Genbank Format files.") valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"] files = os.listdir(os.path.abspath(input_directory)) self.log("Genbank Files : " + ", ".join(files)) genbank_files = [ x for x in files if os.path.splitext(x)[-1] in valid_extensions ] if len(genbank_files) == 0: raise Exception( "The input directory does not have any files with one of the " "following extensions %s." % (",".join(valid_extensions))) self.log("Found {} genbank files".format(len(genbank_files))) input_files = [] for genbank_file in genbank_files: input_files.append(os.path.join(input_directory, genbank_file)) return input_files def _join_files_skip_empty_lines(self, input_files): """ Applies strip to each line of each input file. Args: input_files: Paths to input files in Genbank format. Returns: Path to resulting file (currenly it's the same file as input). """ if len(input_files) == 0: raise ValueError("NO GENBANK FILE") temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined") if not os.path.exists(temp_dir): os.makedirs(temp_dir) ret_file = os.path.join(temp_dir, os.path.basename(input_files[0])) # take in Genbank file and remove all empty lines from it. with open(ret_file, 'w', buffering=2**20) as f_out: for input_file in input_files: with open(input_file, 'r') as f_in: for line in f_in: line = line.rstrip('\r\n') if line.strip(): f_out.write(line + '\n') return ret_file def _get_pubs(self, r_annotations): """Get a contig's publications""" pub_list = [] for in_pub in r_annotations.get('references', []): # don't add blank pubs if not in_pub.authors: continue out_pub = [ 0, # pmid "", # source in_pub.title, "", # web address "", # date in_pub.authors, in_pub.journal, ] date_match = re.match("\((\d{4})\)", in_pub.journal) if date_match: out_pub[4] = date_match.group(1) if in_pub.pubmed_id: out_pub[0:4] = [ int(in_pub.pubmed_id), "PubMed", in_pub.title, "http://www.ncbi.nlm.nih.gov/pubmed/{}".format( in_pub.pubmed_id) ] pub_list.append(tuple(out_pub)) self.log("Parsed {} publication records".format(len(pub_list))) return set(pub_list) def _parse_features(self, record, source): def _get_id(feat, tags=None): """Assign a id to a feature based on the first tag that exists""" _id = "" if not tags: tags = ['locus_tag', 'kbase_id'] for t in tags: _id = feat.qualifiers.get(t, [""])[0] if _id: break if not _id: if feat.type == 'gene': if not self.generate_ids: raise ValueError( "Unable to find a valid id for genes " "among these tags: {}. Correct the " "file or rerun with generate_ids".format( ", ".join(tags))) self.orphan_types['gene'] += 1 _id = "gene_{}".format(self.orphan_types['gene']) if 'rna' in feat.type.lower() or feat.type in { 'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR' }: _id = "gene_{}".format(self.orphan_types['gene']) return _id def _location(feat): """Convert to KBase style location objects""" strand_trans = ("", "+", "-") loc = [] for part in feat.location.parts: contig_id = part.ref if part.ref else record.id if part.strand >= 0: begin = int(part.start) + 1 else: begin = int(part.end) loc.append( (contig_id, begin, strand_trans[part.strand], len(part))) return loc def _warn(message): if message not in out_feat.get('warnings', []): out_feat['warnings'] = out_feat.get('warnings', []) + [message] def _check_suspect_location(parent=None): if 'trans_splicing' in out_feat.get('flags', []): return if out_feat['location'] == sorted( out_feat['location'], reverse=(in_feature.location.strand == -1)): return if record.id in self.circ_contigs and \ in_feature.location.start == 0 \ and in_feature.location.end == len(record): self.features_spaning_zero.add(out_feat['id']) return if parent and parent['id'] in self.features_spaning_zero: return _warn(warnings['not_trans_spliced']) self.defects['not_trans_spliced'] += 1 for in_feature in record.features: if in_feature.type in self.excluded_features: self.skiped_features[in_feature.type] += 1 continue feat_seq = self._get_seq(in_feature, record.id) if source == "Ensembl": _id = _get_id(in_feature, ['gene', 'locus_tag']) else: _id = _get_id(in_feature) # The following is common to all the feature types out_feat = { "id": "_".join([_id, in_feature.type]), "location": _location(in_feature), "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), } if not _id: out_feat['id'] = in_feature.type # validate input feature # note that end is the larger number regardless of strand if int(in_feature.location.end) > len(record): self.genome_warnings.append( warnings["coordinates_off_end"].format(out_feat['id'])) self.genome_suspect = 1 continue for piece in in_feature.location.parts: if not isinstance(piece.start, ExactPosition) \ or not isinstance(piece.end, ExactPosition): _warn(warnings["non_exact_coordinates"]) self.feature_counts[in_feature.type] += 1 # add optional fields if 'note' in in_feature.qualifiers: out_feat['note'] = in_feature.qualifiers["note"][0] out_feat.update(self._get_aliases_flags_functions(in_feature)) ont, db_xrefs = self._get_ontology_db_xrefs(in_feature) if ont: out_feat['ontology_terms'] = ont if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'inference' in in_feature.qualifiers: out_feat['inference_data'] = parse_inferences( in_feature.qualifiers['inference']) _check_suspect_location(self.genes.get(_id)) # add type specific features if in_feature.type == 'CDS': self.process_cds(_id, feat_seq, in_feature, out_feat) elif in_feature.type == 'gene': self.process_gene(_id, out_feat) elif in_feature.type == 'mRNA': self.process_mrna(_id, out_feat) else: self.noncoding.append( self.process_noncoding(_id, in_feature.type, out_feat)) def get_feature_lists(self): """sort genes into their final arrays""" coding = [] for g in self.genes.values(): if len(g['cdss']): if g['mrnas'] and len(g['mrnas']) != len(g['cdss']): msg = "The length of the mrna and cdss arrays are not equal" g['warnings'] = g.get('warnings', []) + [msg] # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in g: g[key] = list(set(g[key])) if not g['mrnas']: del g['mrnas'] del g['type'] coding.append(g) self.feature_counts["protein_encoding_gene"] += 1 else: del g['mrnas'], g['cdss'] self.noncoding.append(g) self.feature_counts["non_coding_features"] += 1 return { 'features': coding, 'non_coding_features': self.noncoding, 'cdss': list(self.cdss.values()), 'mrnas': list(self.mrnas.values()) } def _get_seq(self, feat, contig): """Extract the DNA sequence for a feature""" seq = [] for part in feat.location.parts: strand = part.strand # handle trans-splicing across contigs if part.ref: part_contig = part.ref else: part_contig = contig if strand >= 0: seq.append( str(self.contig_seq[part_contig][part.start:part.end])) else: seq.append( str(self.contig_seq[part_contig] [part.start:part.end].reverse_complement())) return "".join(seq) def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError( "{} is not a supported ontology".format(ontology_type)) if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = defaultdict(dict) db_xrefs = [] for key in ("GO_process", "GO_function", "GO_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.qualifiers.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') for ref in feature.qualifiers.get('db_xref', []): if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), sorted(db_xrefs) @staticmethod def _get_aliases_flags_functions(feat): """Get the values for aliases flags and features from qualifiers""" alias_keys = { 'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'EC_number', 'gene_synonym' } result = defaultdict(list) for key, val_list in feat.qualifiers.items(): if key in alias_keys: result['aliases'].extend([(key, val) for val in val_list]) # flags have no other information associated with them if val_list == ['']: result['flags'].append(key) if key == 'function': result['functional_descriptions'].extend( val_list[0].split('; ')) if key == 'product': result['functions'] = val_list return result def _find_parent_gene(self, potential_id, feature): if potential_id in self.genes: lookup_attempts = 0 while lookup_attempts < MAX_PARENT_LOOKUPS: if is_parent(self.genes[potential_id], feature): return potential_id lookup_attempts += 1 try: potential_id = list( self.genes.keys())[-(lookup_attempts + 1)] except IndexError: break # no more genes that could match exist self.defects['bad_parent_loc'] += 1 return None def process_gene(self, _id, out_feat): out_feat.update({ "id": _id, "type": 'gene', "mrnas": [], 'cdss': [], }) if _id in self.genes: raise ValueError("Duplicate gene ID: {}".format(_id)) self.genes[_id] = out_feat def process_noncoding(self, gene_id, feat_type, out_feat): out_feat["type"] = feat_type # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] gene_id = self._find_parent_gene(gene_id, out_feat) if gene_id: if 'children' not in self.genes[gene_id]: self.genes[gene_id]['children'] = [] out_feat['id'] += "_" + str( len(self.genes[gene_id]['children']) + 1) self.genes[gene_id]['children'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types[feat_type] += 1 out_feat['id'] += "_" + str(self.orphan_types[feat_type]) return out_feat def process_mrna(self, gene_id, out_feat): if gene_id not in self.genes and self.generate_parents: self.process_gene(gene_id, copy.copy(out_feat)) gene_id = self._find_parent_gene(gene_id, out_feat) if gene_id: out_feat['id'] = "_".join( (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1))) self.genes[gene_id]['mrnas'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types['mrna'] += 1 out_feat['id'] = "mRNA_{}".format(self.orphan_types['mrna']) out_feat['warnings'] = out_feat.get('warnings', []) + [ 'Unable to find parent gene for ' + str(out_feat['id']) ] self.mrnas[out_feat['id']] = out_feat def process_cds(self, gene_id, feat_seq, in_feature, out_feat): # Associate CDS with parents if gene_id not in self.genes: if not self.generate_parents: self.log("Expected gene id: {}".format(gene_id)) raise ValueError(warnings['no_spoof']) new_feat = copy.copy(out_feat) new_feat['id'] = gene_id new_feat['warnings'] = [warnings['spoofed_gene']] self.feature_counts['gene'] += 1 self.defects['spoofed_genes'] += 1 self.process_gene(gene_id, new_feat) gene_id = self._find_parent_gene(gene_id, out_feat) if gene_id: out_feat['id'] = "_".join( (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1))) self.genes[gene_id]['cdss'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types['cds'] += 1 out_feat['id'] = "CDS_{}".format(self.orphan_types['cds']) out_feat['warnings'] = out_feat.get('warnings', []) + [ 'Unable to find parent gene for ' + str(out_feat['id']) ] # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1 mrna_id = out_feat["id"].replace('CDS', 'mRNA') if mrna_id in self.mrnas: if not is_parent(self.mrnas[mrna_id], out_feat): out_feat['warnings'] = out_feat.get('warnings', []) + [ warnings['cds_mrna_cds'].format(mrna_id) ] self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get( 'warnings', []) + [warnings['cds_mrna_mrna']] self.defects['bad_parent_loc'] += 1 else: out_feat['parent_mrna'] = mrna_id self.mrnas[mrna_id]['cds'] = out_feat['id'] # process protein prot_seq = in_feature.qualifiers.get("translation", [""])[0] # allow a little slack to account for frameshift and stop codon if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4: out_feat['warnings'] = out_feat.get('warnings', []) + [ warnings["inconsistent_CDS_length"].format( len(feat_seq), len(prot_seq)) ] self.genome_warnings.append( warnings['genome_inc_CDS_length'].format( out_feat['id'], len(feat_seq), len(prot_seq))) self.genome_suspect = 1 try: if prot_seq and prot_seq != Seq.translate( feat_seq, self.code_table, cds=True).strip("*"): out_feat['warnings'] = out_feat.get( 'warnings', []) + [warnings["inconsistent_translation"]] self.defects['cds_seq_not_matching'] += 1 except TranslationError as e: out_feat['warnings'] = out_feat.get('warnings', []) + [ "Unable to verify protein sequence:" + str(e) ] if not prot_seq: try: prot_seq = Seq.translate(feat_seq, self.code_table, cds=True).strip("*") out_feat['warnings'] = out_feat.get( 'warnings', []) + [warnings["no_translation_supplied"]] except TranslationError as e: out_feat['warnings'] = out_feat.get('warnings', []) + [ warnings["no_translation_supplied"] + str(e) ] out_feat.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if out_feat.get('parent_gene'): propagate_cds_props_to_gene(out_feat, self.genes[gene_id]) self.cdss[out_feat['id']] = out_feat