def get_bindingdb_data_file(self):
    """Make sure the BindingDB data file is available locally.

    An existing file at ``bindingdb_all_data_filepath`` is reused only when
    ``self.use_existing_bindingdb_data`` is truthy; in every other case a new
    file is retrieved with ``retrieve_all_BindingDB_data`` (``decompress=False``).
    """
    # Short-circuit keeps the original evaluation order: the attribute is only
    # read when the file exists.
    reuse_existing_file = (
        os.path.exists(bindingdb_all_data_filepath)
        and self.use_existing_bindingdb_data
    )
    if not reuse_existing_file:
        logger.info('Retrieving new BindingDB data file from BindingDB server...')
        retrieve_all_BindingDB_data(bindingdb_all_data_filepath, decompress=False)
        return

    logger.info('BindingDB data file found at: {0}'.format(bindingdb_all_data_filepath))
def setup(self):
    """Prepare paths, the timestamp and the crawl number used by later steps.

    Creates the ``UniProt`` directory below ``external_data_dirpath`` if it does
    not exist yet, records the XML output path and the domain-names filename,
    stores the current UTC time in ``self.now`` and reads the current crawl
    number from the first ``models.CrawlData`` row.
    """
    uniprot_dir = os.path.join(external_data_dirpath, 'UniProt')
    self.uniprot_data_dir = uniprot_dir
    if not os.path.exists(uniprot_dir):
        os.mkdir(uniprot_dir)

    self.uniprot_xml_out_filepath = os.path.join(uniprot_dir, 'uniprot-search.xml')
    self.domain_names_filename = 'selected_domain_names.txt'
    self.now = datetime.datetime.utcnow()

    # get current crawl number
    self.current_crawl_number = models.CrawlData.query.first().current_crawl_number
    logger.info('Current crawl number: {0}'.format(self.current_crawl_number))
def get_bindingdb_data_file(self):
    """Reuse the local BindingDB data file, or retrieve a new one.

    Unless ``self.use_existing_bindingdb_data`` is truthy and a file already
    exists at ``bindingdb_all_data_filepath``, a new file is retrieved from
    BindingDB via ``retrieve_all_BindingDB_data`` (``decompress=False``).
    """
    data_path = bindingdb_all_data_filepath
    if os.path.exists(data_path) and self.use_existing_bindingdb_data:
        found_message = 'BindingDB data file found at: {0}'.format(data_path)
        logger.info(found_message)
        return

    logger.info('Retrieving new BindingDB data file from BindingDB server...')
    retrieve_all_BindingDB_data(data_path, decompress=False)
def setup(self):
    """Set up the UniProt data directory, output paths, timestamp and crawl number.

    The ``UniProt`` directory under ``external_data_dirpath`` is created when it
    is missing. The crawl number is taken from the first ``models.CrawlData``
    row and logged.
    """
    self.uniprot_data_dir = os.path.join(external_data_dirpath, 'UniProt')
    directory_missing = not os.path.exists(self.uniprot_data_dir)
    if directory_missing:
        os.mkdir(self.uniprot_data_dir)

    xml_filename = 'uniprot-search.xml'
    self.uniprot_xml_out_filepath = os.path.join(self.uniprot_data_dir, xml_filename)
    self.domain_names_filename = 'selected_domain_names.txt'
    self.now = datetime.datetime.utcnow()

    # get current crawl number
    first_crawldata_row = models.CrawlData.query.first()
    self.current_crawl_number = first_crawldata_row.current_crawl_number
    crawl_message = 'Current crawl number: {0}'.format(self.current_crawl_number)
    logger.info(crawl_message)
def get_uniprot_data(self):
    """Load the UniProt XML document into ``self.uniprot_xml``.

    The file at ``self.uniprot_xml_out_filepath`` is reused when it exists and
    ``self.use_existing_data`` is truthy. Otherwise the document is retrieved
    with ``retrieve_uniprot(self.uniprot_query)`` and written to that path
    first. In both cases the file is then parsed and its root element stored.

    Raises:
        Exception: if the retrieved document is empty.
    """
    xml_path = self.uniprot_xml_out_filepath
    reuse_existing = os.path.exists(xml_path) and self.use_existing_data

    if not reuse_existing:
        logger.info('Retrieving new XML document from UniProt website.')
        xml_text = retrieve_uniprot(self.uniprot_query)
        if len(xml_text) == 0:
            raise Exception('UniProt search returned no entries.')
        logger.info('Saving new XML document as: {0}'.format(xml_path))
        with open(xml_path, 'w') as uniprot_xml_file:
            uniprot_xml_file.write(xml_text + '\n')
    else:
        logger.info('UniProt XML document found at: {0}'.format(xml_path))

    logger.info('Reading UniProt XML document: {0}'.format(self.uniprot_xml_out_filepath))
    parsed_document = etree.parse(self.uniprot_xml_out_filepath, xml_parser)
    self.uniprot_xml = parsed_document.getroot()
def get_uniprot_data(self):
    """Read the UniProt XML document, retrieving a new copy when required.

    A new document is retrieved via ``retrieve_uniprot(self.uniprot_query)`` and
    saved to ``self.uniprot_xml_out_filepath`` unless that file already exists
    and ``self.use_existing_data`` is truthy. The file is then parsed with
    ``xml_parser`` and the root element is stored in ``self.uniprot_xml``.

    Raises:
        Exception: if the retrieved document is empty.
    """
    if os.path.exists(self.uniprot_xml_out_filepath) and self.use_existing_data:
        found_message = 'UniProt XML document found at: {0}'.format(
            self.uniprot_xml_out_filepath)
        logger.info(found_message)
    else:
        logger.info('Retrieving new XML document from UniProt website.')
        xml_text = retrieve_uniprot(self.uniprot_query)
        if len(xml_text) == 0:
            raise Exception('UniProt search returned no entries.')

        saving_message = 'Saving new XML document as: {0}'.format(
            self.uniprot_xml_out_filepath)
        logger.info(saving_message)
        with open(self.uniprot_xml_out_filepath, 'w') as uniprot_xml_file:
            uniprot_xml_file.write(xml_text + '\n')

    reading_message = 'Reading UniProt XML document: {0}'.format(
        self.uniprot_xml_out_filepath)
    logger.info(reading_message)
    document = etree.parse(self.uniprot_xml_out_filepath, xml_parser)
    self.uniprot_xml = document.getroot()
def check_all_gather_scripts_have_been_run(self):
    """
    Test whether each of the gather scripts have been run, and whether they
    have been updated in the correct order.

    For each data type, the ``<data_type>_datestamp`` attribute of
    ``self.current_crawl_datestamps_row`` is compared with
    ``self.safe_crawl_datestamp``. A data type fails when its datestamp is
    missing or is not strictly newer than the safe-crawl datestamp. The result
    for every data type is logged before the overall verdict is given.

    Raises:
        DatabaseException: if at least one data type failed the check.
    """
    def _format_datestamp(datestamp):
        # Looked up lazily so nothing is formatted for data types without data.
        return datestamp.strftime(targetexplorer.core.datestamp_format_string)

    data_problem = False
    for data_type in ['uniprot', 'ncbi_gene', 'bindingdb', 'pdb', 'cbioportal', 'chembl']:
        datestamp_type = data_type + '_datestamp'
        current_crawl_datatype_datestamp = getattr(self.current_crawl_datestamps_row, datestamp_type)

        # None is a singleton: test identity rather than equality.
        if current_crawl_datatype_datestamp is None:
            logger.info('data_type "%s" FAIL: no data found in db' % data_type)
            data_problem = True

        elif current_crawl_datatype_datestamp <= self.safe_crawl_datestamp:
            logger.info(
                'data_type "%s" FAIL: current data (%s) is older than or as old as safe-crawl data (%s)'
                % (
                    data_type,
                    _format_datestamp(current_crawl_datatype_datestamp),
                    _format_datestamp(self.safe_crawl_datestamp),
                )
            )
            data_problem = True

        elif current_crawl_datatype_datestamp > self.safe_crawl_datestamp:
            logger.info(
                'data_type "%s" PASS: current data (%s) is newer than safe-crawl data (%s)'
                % (
                    data_type,
                    _format_datestamp(current_crawl_datatype_datestamp),
                    _format_datestamp(self.safe_crawl_datestamp),
                )
            )

    if data_problem:
        raise DatabaseException('Commit aborted.')
    logger.info('Proceeding to commit to master db...')
def check_all_gather_scripts_have_been_run(self):
    """
    Test whether each of the gather scripts have been run, and whether they
    have been updated in the correct order.

    For each data type, the ``<data_type>_datestamp`` attribute of
    ``self.current_crawl_datestamps_row`` is compared with
    ``self.safe_crawl_datestamp``. A data type fails when its datestamp is
    missing or is not strictly newer than the safe-crawl datestamp. The result
    for every data type is logged before the overall verdict is given.

    Raises:
        DatabaseException: if at least one data type failed the check.
    """
    def _format_datestamp(datestamp):
        # Looked up lazily so nothing is formatted for data types without data.
        return datestamp.strftime(targetexplorer.core.datestamp_format_string)

    data_problem = False
    for data_type in ['uniprot', 'ncbi_gene', 'bindingdb', 'pdb', 'cbioportal']:
        datestamp_type = data_type + '_datestamp'
        current_crawl_datatype_datestamp = getattr(self.current_crawl_datestamps_row, datestamp_type)

        # None is a singleton: test identity rather than equality.
        if current_crawl_datatype_datestamp is None:
            logger.info('data_type "%s" FAIL: no data found in db' % data_type)
            data_problem = True

        elif current_crawl_datatype_datestamp <= self.safe_crawl_datestamp:
            logger.info(
                'data_type "%s" FAIL: current data (%s) is older than or as old as safe-crawl data (%s)'
                % (
                    data_type,
                    _format_datestamp(current_crawl_datatype_datestamp),
                    _format_datestamp(self.safe_crawl_datestamp),
                )
            )
            data_problem = True

        elif current_crawl_datatype_datestamp > self.safe_crawl_datestamp:
            logger.info(
                'data_type "%s" PASS: current data (%s) is newer than safe-crawl data (%s)'
                % (
                    data_type,
                    _format_datestamp(current_crawl_datatype_datestamp),
                    _format_datestamp(self.safe_crawl_datestamp),
                )
            )

    if data_problem:
        raise DatabaseException('Commit aborted.')
    logger.info('Proceeding to commit to master db...')
def delete_old_crawls(self):
    """Delete the rows of every crawl beyond the newest ``ncrawls_to_save``.

    Crawl numbers are read from ``models.DateStamps``. When there are more than
    ``self.project_config['ncrawls_to_save']`` of them, the rows carrying the
    lowest crawl numbers are deleted from every table named in
    ``models.table_class_names`` except ``CrawlData``.
    """
    stored_crawl_numbers = [row.crawl_number for row in models.DateStamps.query.all()]
    ncrawls_to_save = self.project_config['ncrawls_to_save']
    if len(stored_crawl_numbers) <= ncrawls_to_save:
        return

    logger.info('More than %d crawls found.' % ncrawls_to_save)
    # Newest first, so everything after the first `ncrawls_to_save` is expired.
    newest_first = sorted(stored_crawl_numbers, reverse=True)
    expired_crawl_numbers = newest_first[ncrawls_to_save:]

    # iterate through crawls to delete
    for expired_crawl_number in expired_crawl_numbers:
        logger.info('Deleting crawl %d...' % expired_crawl_number)
        # iterate through tables
        for table_class_name in models.table_class_names:
            if table_class_name != 'CrawlData':
                table_class = getattr(models, table_class_name)
                expired_rows = table_class.query.filter_by(crawl_number=expired_crawl_number)
                logger.info(' - %s - %d rows' % (table_class_name, expired_rows.count()))
                expired_rows.delete()
def extract_sifts_seq(sifts_filepath, uniprot_ac, uniprot_entry_name, pdb_id, chain_id, uniprot_sequence):
    """Extract and align the sequence of one PDB chain from a gzipped SIFTS XML file.

    Args:
        sifts_filepath: path of the gzip-compressed SIFTS XML file.
        uniprot_ac: UniProt accession the chain is expected to cross-reference.
        uniprot_entry_name: UniProt entry name (only used in log output).
        pdb_id: PDB identifier (used for per-PDB residue-number corrections and
            in messages).
        chain_id: id of the chain whose residues are extracted.
        uniprot_sequence: full UniProt sequence; its length fixes the length of
            the alignments.

    Returns:
        dict with the keys 'chain_id', 'experimental_seq',
        'experimental_seq_aln_conflicts', 'observed_seq_aln_exp',
        'observed_seq_aln', 'observed_ss_aln' and 'exception_message'.
        'exception_message' is None, or 'DELETE_ME' when the chain's first
        UniProt crossref has a different accession, when more than 90% of the
        residues are unobserved, or when no residue has a crossref to
        ``uniprot_ac``.

    Raises:
        Exception: if a residue has no name, or a PDB residue number cannot be
            converted to an int.
        KeyError: if a residue name cannot be converted to a single-letter code.
    """
    exception_message = None

    # Close the gzip handle deterministically instead of leaking it.
    with gzip.open(sifts_filepath, 'r') as sifts_file:
        sifts = etree.fromstring(sifts_file.read())

    # First check whether the first residue with matching chainID and a UniProt crossref has the same
    # UniProt AC as was picked up from UniProt (by gather-uniprot.py).
    # 3O50 and 3O51 are picked up by gather-uniprot.py from uniprot AC O14965. But these have uniprot AC
    # B4DX16 in the sifts .xml files, which is a TrEMBL entry. Sequences are almost identical except for
    # deletion of ~70 residues prior to PK domain of B4DX16. This means that experimental_sequence_aln and
    # related sequences are not added by gather-pdb.py. Need to sort out a special case for these pdbs.
    # Should check for similar cases in other kinases.
    # 3O50 and 3O51 can be ignored. (Plenty of other PDBs for that protein)
    # 3OG7 is picked up from uniprot AC P15056, but the PDB entry links to Q5IBP5 - this is the AKAP9-BRAF
    # fusion protein.
    # XXX TODO XXX 3OG7 will be ignored for now, but at some point should make separate entries for fusion
    # proteins, and add the PDB files accordingly.
    first_matching_uniprot_resi = sifts.find('entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/../crossRefDb[@dbSource="UniProt"]' % chain_id)
    sifts_uniprot_ac = first_matching_uniprot_resi.get('dbAccessionId')
    if uniprot_ac != sifts_uniprot_ac:
        logger.info('PDB %s chain %s picked up from UniProt entry %s %s. Non-matching UniProtAC in sifts: %s. This chain will be deleted.' % (pdb_id, chain_id, uniprot_entry_name, uniprot_ac, sifts_uniprot_ac))
        exception_message = 'DELETE_ME'

    #
    #
    # TODO check if there are any PDBs where two proteins share the same chainID (I seem to remember that
    # there are - check previous scripts)
    #
    #

    # ======
    # Extract sequence data from the SIFTS XML
    # ======

    # These are the sifts residues which include a PDB crossref with matching chainID
    chain_residues = sifts.findall('entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/..' % chain_id)
    experimental_sequence = ''
    experimental_sequence_pdb_resids = []
    experimental_sequence_uniprot_res_indices = []
    observed_sequence_aln_exp = ''
    # This will contain the alignment of the experimental sequence against the full UniProt sequence.
    # Conflicting residues will be added if they are contiguous with non-conflicting segments.
    # NOTE: this is no longer added to the database.
    experimental_sequence_aln = ['-'] * len(uniprot_sequence)
    # Same, but conflicting residues are added as lower case
    experimental_sequence_aln_conflicts = ['-'] * len(uniprot_sequence)
    # This will contain the alignment of the observed sequence against the full UniProt sequence.
    # Conflicting residues will be ignored.
    observed_sequence_aln = ['-'] * len(uniprot_sequence)
    # This will contain the alignment of the secondary structure codes against the full UniProt sequence.
    # Conflicting residues will be ignored.
    ss_aln = ['-'] * len(uniprot_sequence)

    # Residue names missing from the BioPython table, mapped to the single-letter code used for them.
    extra_single_letter_codes = {
        'CAS': 'C',  # S-(dimethylarsenic)cysteine
        'MHO': 'M',  # S-oxymethionine
        'LGY': 'K',  # 3NX8. (E)-N-(4-oxobutylidene)lysine
        'AME': 'M',  # N-acetylmethionine
        'NMM': 'R',  # 3KB7
        'OCY': 'C',  # 2R9S
        'CY0': 'C',  # 2J5E
        'CY7': 'C',  # 2JIV
    }

    for r in chain_residues:
        residue_details = r.findall('residueDetail')
        residue_detail_texts = [detail.text.strip() for detail in residue_details]  # list of strings
        ss = r.findtext('residueDetail[@property="codeSecondaryStructure"]')
        resname = r.attrib['dbResName']
        # NOTE(review): a missing 'dbResName' attribute presumably raises KeyError on the line above,
        # so this check may be unreachable - confirm.
        if resname is None:
            print('ERROR: UniProt crossref not found for conflicting residue! %s %s %s %s' % (uniprot_ac, pdb_id, chain_id, r.attrib))
            raise Exception
        try:
            # Note that this BioPython dict converts a modified aa to the single-letter code of its
            # unmodified parent (e.g. "TPO":"T")
            single_letter = Bio.Data.SCOPData.protein_letters_3to1[resname]
        except KeyError:
            if resname == 'ACE':  # Just ignore N-terminal ACE
                continue
            if resname not in extra_single_letter_codes:
                print('KeyError: Problem converting resname %s to single letter code. %s %s' % (resname, chain_id, r.attrib))
                raise KeyError
            single_letter = extra_single_letter_codes[resname]

        # Add residue to experimental_sequence
        experimental_sequence += single_letter

        # Also save the pdb resids, which we will use later
        pdb_resid = r.find('crossRefDb[@dbSource="PDB"]').attrib['dbResNum']
        # TODO need to generalize this. Shift to manual_overrides.yaml or do something else? In the
        # short-term, perhaps just skip these PDBs?
        # Some pdb resids are e.g. '464A'
        if not pdb_resid.isdigit():
            if pdb_id in ['1O6L', '2JDO', '2JDR', '2UW9', '2X39', '2XH5']:
                # These pdbs include three residues with pdb resids 464A, 464B, 464C, (all with UniProt
                # crossrefs) then continues from 465. We will change this so that the pdb resids
                # continue to iterate
                corrected_pdb_resids = {'464A': 465, '464B': 466, '464C': 467}
                if pdb_resid in corrected_pdb_resids:
                    pdb_resid = corrected_pdb_resids[pdb_resid]
                # NOTE(review): this branch only sees non-numeric resids, so purely numeric resids
                # above 464 are not shifted here - confirm this nesting matches the intent.
                elif int(pdb_resid[0:3]) > 464:
                    pdb_resid = int(pdb_resid) + 3
            # Otherwise just extract the number (this will also detect negative numbers)
            else:
                pdb_resid = ''.join([char for char in pdb_resid if (char.isdigit() or char == '-')])
        try:
            experimental_sequence_pdb_resids.append(int(pdb_resid))
        except (TypeError, ValueError):
            print('Problem converting pdb_resid into int. %s %s %s %s' % (uniprot_ac, pdb_id, chain_id, pdb_resid))
            raise Exception

        # Also add residue to experimental_sequence_aln. Residues which do not match the uniprot sequence
        # (and thus do not have a uniprot crossref) will be added later
        crossref_uniprot = r.find('crossRefDb[@dbSource="UniProt"][@dbAccessionId="%s"]' % uniprot_ac)
        if crossref_uniprot is not None:
            index = int(crossref_uniprot.attrib['dbResNum']) - 1
            experimental_sequence_aln[index] = single_letter
            if 'Conflict' in residue_detail_texts or 'Engineered mutation' in residue_detail_texts:
                experimental_sequence_aln_conflicts[index] = single_letter.lower()
            else:
                experimental_sequence_aln_conflicts[index] = single_letter
            experimental_sequence_uniprot_res_indices.append(index)
            # Add residue to observed_sequence_aln if it is observed and is not a conflict
            if 'Not_Observed' not in residue_detail_texts and ('Conflict' not in residue_detail_texts or 'Engineered mutation' in residue_detail_texts):
                observed_sequence_aln[index] = single_letter
                if ss is not None:
                    ss_aln[index] = ss
        else:
            experimental_sequence_uniprot_res_indices.append(None)

        # Add residue to observed_sequence_aln_exp if it is observed, otherwise '-'
        if 'Not_Observed' in residue_detail_texts:
            observed_sequence_aln_exp += '-'
        else:
            observed_sequence_aln_exp += single_letter

    # Now check whether the number of non-observed residues is more than 90% of the experimental sequence
    # length
    n_unobserved_residues = observed_sequence_aln_exp.count('-')
    if (float(n_unobserved_residues) / float(len(experimental_sequence))) > 0.9:
        exception_message = 'DELETE_ME'

    # ======
    # Now we add the residues which do not have a UniProt crossref
    # ======

    i = 0

    # But first we have to deal with cases where residues have been added at the N-terminus which extend
    # before the start of the uniprot sequence. The excess residues will be ignored.
    # Get the uniprot residue index of the first residue with a uniprot crossref
    first_exp_seq_uniprot_res_index = None
    corresponding_pdb_resid = None
    for s, UP_res_index in enumerate(experimental_sequence_uniprot_res_indices):
        if UP_res_index is not None:
            first_exp_seq_uniprot_res_index = UP_res_index
            # And the corresponding pdb resid
            corresponding_pdb_resid = experimental_sequence_pdb_resids[s]
            break

    # And get the pdb resid of the first residue in the experimental sequence
    first_exp_seq_pdb_resid = None
    for s in experimental_sequence_pdb_resids:
        if s is not None:
            first_exp_seq_pdb_resid = s
            break

    ignore_excess_Nterm_residues_flag = False
    if first_exp_seq_uniprot_res_index is None:
        # No residue has a crossref to uniprot_ac. This used to surface as an unbound-variable error
        # swallowed by a bare except; it is now tested for explicitly.
        # XXX should do something better than this
        # exception occurs with P27791 (KAPCA_RAT)
        exception_message = 'DELETE_ME'
    # If the experimental sequence includes the first residue of the full uniprot sequence
    elif first_exp_seq_uniprot_res_index == 0:
        # And if the value of the first pdb resid is lower than that of the pdb resid corresponding to
        # the first uniprot residue
        if first_exp_seq_pdb_resid < corresponding_pdb_resid:
            # Then we will ignore the excess residues
            ignore_excess_Nterm_residues_flag = True

    # Now iterate through the residues in the experimental sequence and add residues which do not have a
    # uniprot crossref, but are contiguous in terms of PDB numbering
    # NOTE(review): for i == 0 the `[i-1]` lookups below wrap around to the last residue, and
    # `last_uniprot_res_index` is None when the previous residue has no crossref - confirm neither case
    # occurs in practice.
    while i < len(experimental_sequence):
        resname_i = experimental_sequence[i]
        uniprot_res_index_i = experimental_sequence_uniprot_res_indices[i]
        pdb_resid_i = experimental_sequence_pdb_resids[i]

        if ignore_excess_Nterm_residues_flag and (pdb_resid_i < corresponding_pdb_resid):
            pass  # we ignore these residues

        # If this residue does not have a uniprot crossref
        elif uniprot_res_index_i is None:
            # Start a list of residues with no uniprot crossref
            contiguous_noUP_residues = [resname_i]
            # Then check the next residue
            j = i + 1
            while j < len(experimental_sequence):
                resname_j = experimental_sequence[j]
                uniprot_res_index_j = experimental_sequence_uniprot_res_indices[j]
                pdb_resid_j = experimental_sequence_pdb_resids[j]

                # If this residue also has no uniprot crossref, and is contiguous in terms of pdb resnum,
                # then add it to the list, and move on to the next one
                if (uniprot_res_index_j is None) and ((pdb_resid_j - pdb_resid_i) == (j - i)):
                    contiguous_noUP_residues.append(resname_j)

                # If this residue does have a uniprot crossref, and if it is contiguous in terms of pdb
                # resnum, then we add the list of residues without uniprot crossrefs at this position
                elif (uniprot_res_index_j is not None) and ((pdb_resid_j - pdb_resid_i) == (j - i)):
                    experimental_sequence_aln[(uniprot_res_index_j - j):uniprot_res_index_j] = contiguous_noUP_residues
                    experimental_sequence_aln_conflicts[(uniprot_res_index_j - j):uniprot_res_index_j] = list(''.join(contiguous_noUP_residues).lower())
                    i = j
                    break

                # If this residue is not contiguous in terms of pdb resnum, go back and check if the
                # first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so,
                # add contiguous_noUP_residues
                elif (pdb_resid_j - pdb_resid_i) != (j - i):
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i - 1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i - 1]
                        experimental_sequence_aln[last_uniprot_res_index + 1:last_uniprot_res_index + 1 + (j - i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[last_uniprot_res_index + 1:last_uniprot_res_index + 1 + (j - i)] = list(''.join(contiguous_noUP_residues).lower())
                    i = j - 1
                    break

                # If we have reached the end of experimental_sequence, go back and check if the first of
                # contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add
                # contiguous_noUP_residues
                if j == len(experimental_sequence) - 1:
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i - 1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i - 1]
                        experimental_sequence_aln[last_uniprot_res_index + 1:last_uniprot_res_index + 2 + (j - i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[last_uniprot_res_index + 1:last_uniprot_res_index + 2 + (j - i)] = list(''.join(contiguous_noUP_residues).lower())
                    i = j
                    break
                j += 1

        i += 1

    # ======
    # Some final processing
    # ======

    # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer
    # than uniprot_sequence by 1
    # Handle this by removing the extraneous sequence
    if len(experimental_sequence_aln) != len(uniprot_sequence):
        experimental_sequence_aln = experimental_sequence_aln[0:len(uniprot_sequence)]
        experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[0:len(uniprot_sequence)]

    experimental_sequence_aln = ''.join(experimental_sequence_aln)
    experimental_sequence_aln_conflicts = ''.join(experimental_sequence_aln_conflicts)
    observed_sequence_aln = ''.join(observed_sequence_aln)
    ss_aln = ''.join(ss_aln)

    chain_results_dict = {
        'chain_id': chain_id,
        'experimental_seq': experimental_sequence,
        'experimental_seq_aln_conflicts': experimental_sequence_aln_conflicts,
        'observed_seq_aln_exp': observed_sequence_aln_exp,
        'observed_seq_aln': observed_sequence_aln,
        'observed_ss_aln': ss_aln,
        'exception_message': exception_message,
    }
    return chain_results_dict
def extract_detailed_uniprot_data(self, uniprot_entry_node):
    """Parse one UniProt entry node and add the derived rows to ``db.session``.

    Extracts names, taxonomy, function/disease/subcellular-location comments,
    isoforms, domain features, cross-references (NCBI Gene, Ensembl, HGNC),
    family information and PDB chains from ``uniprot_entry_node``, builds the
    corresponding ``models`` objects and adds them to the session. Nothing is
    committed here.

    Args:
        uniprot_entry_node: lxml element for a single UniProt entry.

    Returns:
        None. The method returns early, without adding anything, when the entry
        is listed in ``self.skip_uniprot_entries`` or has no domain features.
    """
    # = IDs and names =
    ac = uniprot_entry_node.findtext('./accession')
    entry_name = uniprot_entry_node.findtext('./name')

    if self.skip_uniprot_entries and entry_name in self.skip_uniprot_entries:
        skip_message = self.skip_uniprot_entries[entry_name]
        logger.info(
            'OVERRIDE: Skipping UniProt entry {0} - reason: {1}'.format(
                entry_name, skip_message
            )
        )
        return

    recommended_name = uniprot_entry_node.findtext('./protein/recommendedName/fullName')
    gene_name_data = [
        models.UniProtGeneName(
            crawl_number=self.current_crawl_number,
            gene_name=gene_name_node.text,
            gene_name_type=gene_name_node.get('type')
        )
        for gene_name_node in uniprot_entry_node.findall('./gene/name')
    ]

    # = Date entry was last modified in UniProt =
    last_uniprot_update = uniprot_entry_node.get('modified')

    # = Taxonomy =
    uniprot_organism_node = uniprot_entry_node.find('organism')
    ncbi_taxon_id = uniprot_organism_node.find('dbReference[@type="NCBI Taxonomy"]').get('id')
    taxon_name_scientific = uniprot_organism_node.findtext('name[@type="scientific"]')
    taxon_name_common = uniprot_organism_node.findtext('name[@type="common"]')
    lineage = uniprot_organism_node.find('lineage')
    # Iterating an element yields its children; getchildren() is deprecated.
    lineage_csv = ','.join([taxon.text for taxon in lineage])

    # = Functions, disease associations, subcellular locations =
    functions = [
        models.UniProtFunction(
            crawl_number=self.current_crawl_number,
            function=comment.findtext('./text')
        )
        for comment in uniprot_entry_node.findall('./comment[@type="function"]')
    ]
    disease_associations = [
        models.UniProtDiseaseAssociation(
            crawl_number=self.current_crawl_number,
            disease_association=comment.findtext('./text')
        )
        for comment in uniprot_entry_node.findall('./comment[@type="disease"]')
    ]
    subcellular_locations = [
        models.UniProtSubcellularLocation(
            crawl_number=self.current_crawl_number,
            subcellular_location=comment.findtext('./subcellularLocation/location')
        )
        for comment in uniprot_entry_node.findall('./comment[@type="subcellular location"]')
    ]

    # = Canonical isoform =
    isoforms = []

    # Returned UniProt XML contains sequence data only for the canonical isoform
    uniprot_canonical_sequence_node = uniprot_entry_node.find(
        './sequence[@length][@mass]'
    )
    canonical_sequence = ''.join(uniprot_canonical_sequence_node.text.split())
    canseq_length = uniprot_canonical_sequence_node.get('length')
    canseq_mass = uniprot_canonical_sequence_node.get('mass')
    canseq_date_modified = uniprot_canonical_sequence_node.get('modified')
    canseq_version = uniprot_canonical_sequence_node.get('version')
    uniprot_isoform = models.UniProtIsoform(
        crawl_number=self.current_crawl_number,
        ac=ac+'-1',
        is_canonical=True,
        length=canseq_length,
        mass=canseq_mass,
        date_modified=canseq_date_modified,
        version=canseq_version,
        sequence=canonical_sequence
    )
    # empty list for notes (which do not exist for the canonical sequence)
    isoforms.append((uniprot_isoform, []))

    # = Alternative isoforms =
    # Canonical isoform is given the attrib type="displayed", meaning that the sequence is displayed in
    # the HTML version of the entry
    # Example alt isoform:
    #     <comment>
    #         <isoform>
    #             <id>P00519-2</id>
    #             <name>IB</name>
    #             <sequence type="described" ref="VSP_004957"/>
    #             <note>Contains a N-myristoyl glycine at position 2.</note>
    #         </isoform>
    #     </comment>
    for uniprot_isoform_node in uniprot_entry_node.findall('comment/isoform'):
        isoform_ac = uniprot_isoform_node.findtext('id')
        seq_node = uniprot_isoform_node.find('sequence')
        notes = [
            models.UniProtIsoformNote(
                crawl_number=self.current_crawl_number,
                note=node.text
            )
            for node in uniprot_isoform_node.findall('note')
        ]
        if seq_node.get('type') != 'displayed':
            uniprot_isoform = models.UniProtIsoform(
                crawl_number=self.current_crawl_number,
                ac=isoform_ac,
                is_canonical=False
            )
            isoforms.append((uniprot_isoform, notes))

    # = UniProt "Protein kinase" domain annotations =
    # XXX TODO Generalize

    # if self.uniprot_domain_regex != None:
    #     selected_domains = uniprot_entry_node.xpath(
    #         'feature[@type="domain"][match_regex(@description, "{0}")]'.format(
    #             self.uniprot_domain_regex
    #         ),
    #         extensions={(None, 'match_regex'): xpath_match_regex_case_sensitive}
    #     )
    # else:
    domains = uniprot_entry_node.findall('feature[@type="domain"]')

    # Skip if no matching domains found
    if len(domains) < 1:
        return

    # Finally, add the domains to the new database
    domain_objs = []
    target_iter = 0
    for domain_id, domain in enumerate(domains):
        # First calculate the PK domain length and sequence
        domain_description = domain.get('description')
        # Reset per domain so a non-target domain can never pick up the id of an earlier target domain
        # (or hit an unbound name on the first iteration).
        target_id = None
        pseudodomain_notes = None
        if self.uniprot_domain_regex and re.match(self.uniprot_domain_regex, domain_description):
            is_target_domain = True
            target_id = entry_name + '_D' + str(target_iter)
            target_iter += 1
        else:
            is_target_domain = False

        begin = int(domain.find('./location/begin').get('position'))
        end = int(domain.find('./location/end').get('position'))
        length = end - begin + 1
        domain_seq = canonical_sequence[begin-1:end]

        if (self.pseudodomain_manual_annotations
                and entry_name in self.pseudodomain_manual_annotations
                and domain_description == self.pseudodomain_manual_annotations[entry_name].get('description')):
            pseudodomain_notes = self.pseudodomain_manual_annotations[entry_name].get('message')
            logger.info(
                'OVERRIDE: Labeling domain "{0}" as a pseudodomain - reason: {1}'.format(
                    # Non-target domains have no target id; identify them by description instead.
                    target_id if is_target_domain else domain_description,
                    pseudodomain_notes
                )
            )
            is_pseudodomain = True
        else:
            is_pseudodomain = False

        domain_obj = models.UniProtDomain(
            crawl_number=self.current_crawl_number,
            domain_id=domain_id,
            target_id=target_id,
            is_target_domain=is_target_domain,
            description=domain_description,
            is_pseudodomain=is_pseudodomain,
            pseudodomain_notes=pseudodomain_notes,
            begin=begin,
            end=end,
            length=length,
            sequence=domain_seq
        )
        domain_objs.append(domain_obj)

    # = References to other DBs =
    # NCBI Gene
    ncbi_gene_entries = []
    gene_ids = [
        int(dbref.get('id'))
        for dbref in uniprot_entry_node.findall('./dbReference[@type="GeneID"]')
    ]

    # manual annotations
    if self.ncbi_gene_id_manual_annotations and entry_name in self.ncbi_gene_id_manual_annotations:
        gene_ids = self.ncbi_gene_id_manual_annotations[entry_name].get('gene_ids')
        gene_ids_message = self.ncbi_gene_id_manual_annotations[entry_name].get('message')
        logger.info(
            'OVERRIDE: Manually annotating Gene IDs for entry {0} - reason: {1}'.format(
                entry_name, gene_ids_message
            )
        )

    for gene_id in gene_ids:
        # manual override skips
        if self.skip_ncbi_gene_entries and gene_id in self.skip_ncbi_gene_entries:
            skip_gene_id_message = self.skip_ncbi_gene_entries[gene_id]
            logger.info(
                'OVERRIDE: Skipping Gene ID {0} for entry {1} - reason: {2}'.format(
                    gene_id, entry_name, skip_gene_id_message
                )
            )
            continue
        ncbi_gene_entries.append(
            models.NCBIGeneEntry(
                crawl_number=self.current_crawl_number,
                gene_id=gene_id
            )
        )

    # Ensembl

    # transcript_data = {
    #     'ENSMUST00000003710':
    #         {
    #             'gene':
    #                 'ENSG000...',
    #             'protein':
    #                 'ENSP000...',
    #         }
    # }
    ensembl_transcript_nodes = uniprot_entry_node.findall(
        './dbReference[@type="Ensembl"]'
    )

    ensembl_data = {}
    ensembl_transcript_matched_to_uniprot_isoform = False
    for transcript_node in ensembl_transcript_nodes:
        ensembl_transcript_id = transcript_node.get('id')

        ensembl_gene_nodes = transcript_node.findall('property[@type="gene ID"]')
        if len(ensembl_gene_nodes) > 1:
            logger.info(
                'WARNING: Ensembl transcript {0} linked with > 1 gene ID'.format(
                    ensembl_transcript_id
                )
            )
        ensembl_gene_id = ensembl_gene_nodes[0].get('value')

        ensembl_protein_nodes = transcript_node.findall('property[@type="protein sequence ID"]')
        if len(ensembl_protein_nodes) > 1:
            logger.info(
                'WARNING: Ensembl transcript {0} linked with > 1 protein ID'.format(
                    ensembl_transcript_id
                )
            )
        ensembl_protein_id = ensembl_protein_nodes[0].get('value')

        uniprot_isoform_molecule_node = transcript_node.find('molecule')
        if uniprot_isoform_molecule_node is not None:
            uniprot_isoform_ac = uniprot_isoform_molecule_node.get('id')
            if uniprot_isoform_ac == isoforms[0][0].ac:
                ensembl_transcript_matched_to_uniprot_isoform = True
        elif uniprot_isoform_molecule_node is None and ensembl_transcript_matched_to_uniprot_isoform is False:
            # The first transcript without a molecule node is assigned to the canonical isoform.
            uniprot_isoform_ac = isoforms[0][0].ac
            ensembl_transcript_matched_to_uniprot_isoform = True
        else:
            uniprot_isoform_ac = None

        ensembl_data[ensembl_transcript_id] = {
            'gene': ensembl_gene_id,
            'protein': ensembl_protein_id,
            'uniprot_isoform_ac': uniprot_isoform_ac
        }

    # HGNC
    hgnc_entries = []
    hgnc_dbrefs = uniprot_entry_node.findall('./dbReference[@type="HGNC"]')
    for hgnc_dbref in hgnc_dbrefs:
        hgnc_gene_id = hgnc_dbref.get('id')
        approved_symbol = hgnc_dbref.find('property[@type="gene designation"]').get('value')
        hgnc_entries.append(
            models.HGNCEntry(
                crawl_number=self.current_crawl_number,
                gene_id=hgnc_gene_id,
                approved_symbol=approved_symbol
            )
        )

    # = Family information =
    similarity_comments = uniprot_entry_node.xpath('./comment[@type="similarity"]')
    family = False
    for similarity_comment in similarity_comments:
        for similarity_text, family_name in kinase_family_uniprot_similarity_text.items():
            if similarity_text in similarity_comment.findtext('text'):
                family = family_name

    # = PDB entries (from UniProt XML) =
    # keep X-ray and NMR structures (not "Model")
    pdbs = uniprot_entry_node.xpath(
        './dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
    )
    pdb_data = []
    for p in pdbs:
        pdb_id = p.get('id')
        if self.skip_pdbs and pdb_id in self.skip_pdbs:
            skip_pdb_message = self.skip_pdbs[pdb_id]
            logger.info(
                'OVERRIDE: Skipping PDB {0} for entry {1} - reason: {2}'.format(
                    pdb_id, entry_name, skip_pdb_message
                )
            )
            continue
        pdb_method = p.find('property[@type="method"]').get('value')
        resolution_node = p.find('property[@type="resolution"]')
        resolution = resolution_node.get('value') if resolution_node is not None else None
        chains_span_str = p.find('property[@type="chains"]').get('value')
        chains_span = parse_uniprot_pdbref_chains(chains_span_str)
        chain_data_dicts = []
        for chain_id, chain_span in chains_span.items():
            pdb_begin = chain_span[0]
            pdb_end = chain_span[1]
            # Use the begin and end info to decide if this pdb chain includes the pk_domain. But we will
            # get other sequence info from sifts XML files, using gather-pdb.py
            # Have to check against each PK domain
            for domain in domain_objs:
                pk_begin = domain.begin
                pk_end = domain.end
                # Logical `and` (was a bitwise `&` on two booleans).
                if (pdb_begin < pk_begin+30) and (pdb_end > pk_end-30):
                    chain_obj = models.PDBChain(
                        crawl_number=self.current_crawl_number,
                        chain_id=chain_id,
                        begin=pdb_begin,
                        end=pdb_end
                    )
                    chain_data_dicts.append({
                        'chain_obj': chain_obj,
                        'domain_obj': domain
                    })

        if len(chain_data_dicts) > 0:
            pdb_obj = models.PDBEntry(
                crawl_number=self.current_crawl_number,
                pdb_id=pdb_id,
                method=pdb_method,
                resolution=resolution
            )
            pdb_data.append({'pdb_obj': pdb_obj, 'chain_data_dicts': chain_data_dicts})

    # ========
    # Construct data objects and add to db
    # ========

    db_entry = models.DBEntry(
        crawl_number=self.current_crawl_number,
        npdbs=len(pdb_data),
        ndomains=len(domain_objs),
        nisoforms=len(isoforms),
        nfunctions=len(functions),
        ndisease_associations=len(disease_associations),
    )
    db.session.add(db_entry)
    uniprot_entry = models.UniProtEntry(
        crawl_number=self.current_crawl_number,
        ac=ac,
        entry_name=entry_name,
        last_uniprot_update=last_uniprot_update,
        ncbi_taxon_id=ncbi_taxon_id,
        db_entry=db_entry,
        recommended_name=recommended_name,
        taxon_name_scientific=taxon_name_scientific,
        taxon_name_common=taxon_name_common,
        lineage=lineage_csv,
    )
    if family:
        uniprot_entry.family = family
    db.session.add(uniprot_entry)

    for function_obj in functions:
        function_obj.db_entry = db_entry
        function_obj.uniprot_entry = uniprot_entry
        db.session.add(function_obj)
    for disease_association_obj in disease_associations:
        disease_association_obj.db_entry = db_entry
        disease_association_obj.uniprot_entry = uniprot_entry
        db.session.add(disease_association_obj)
    for subcellular_location_obj in subcellular_locations:
        subcellular_location_obj.db_entry = db_entry
        subcellular_location_obj.uniprot_entry = uniprot_entry
        db.session.add(subcellular_location_obj)
    for isoform_obj, notes in isoforms:
        isoform_obj.db_entry = db_entry
        isoform_obj.uniprot_entry = uniprot_entry
        db.session.add(isoform_obj)
        for note_obj in notes:
            note_obj.uniprotisoform = isoform_obj
            db.session.add(note_obj)
    for domain_obj in domain_objs:
        domain_obj.db_entry = db_entry
        domain_obj.uniprot_entry = uniprot_entry
        db.session.add(domain_obj)
    for pdb_data_dict in pdb_data:
        pdb_obj = pdb_data_dict['pdb_obj']
        chain_data_dicts = pdb_data_dict['chain_data_dicts']
        pdb_obj.db_entry = db_entry
        db.session.add(pdb_obj)
        for chain_data_dict in chain_data_dicts:
            chain_obj = chain_data_dict['chain_obj']
            domain_obj = chain_data_dict['domain_obj']
            chain_obj.pdb_entry = pdb_obj
            chain_obj.uniprot_domain = domain_obj
            db.session.add(chain_obj)
    for gene_name_obj in gene_name_data:
        gene_name_obj.db_entry = db_entry
        db.session.add(gene_name_obj)
    for ncbi_gene_entry in ncbi_gene_entries:
        ncbi_gene_entry.db_entry = db_entry
        db.session.add(ncbi_gene_entry)
    for hgnc_entry in hgnc_entries:
        hgnc_entry.db_entry = db_entry
        db.session.add(hgnc_entry)
    for ensembl_transcript_id in ensembl_data:
        ensembl_gene_id = ensembl_data[ensembl_transcript_id]['gene']
        ensembl_gene_row = models.EnsemblGene(
            crawl_number=self.current_crawl_number,
            gene_id=ensembl_gene_id,
            db_entry=db_entry,
        )
        db.session.add(ensembl_gene_row)

        ensembl_transcript_row = models.EnsemblTranscript(
            crawl_number=self.current_crawl_number,
            transcript_id=ensembl_transcript_id,
            ensembl_gene=ensembl_gene_row,
        )
        ensembl_transcript_uniprot_isoform_ac = ensembl_data[ensembl_transcript_id]['uniprot_isoform_ac']
        if ensembl_transcript_uniprot_isoform_ac is not None:
            matching_uniprot_isoform_obj = [
                isoform[0] for isoform in isoforms
                if isoform[0].ac == ensembl_transcript_uniprot_isoform_ac
            ]
            if len(matching_uniprot_isoform_obj) != 0:
                ensembl_transcript_row.uniprot_isoform = matching_uniprot_isoform_obj[0]
        db.session.add(ensembl_transcript_row)

        ensembl_protein_id = ensembl_data[ensembl_transcript_id]['protein']
        ensembl_protein_row = models.EnsemblProtein(
            crawl_number=self.current_crawl_number,
            protein_id=ensembl_protein_id,
            ensembl_gene=ensembl_gene_row,
            ensembl_transcript=ensembl_transcript_row,
        )
        db.session.add(ensembl_protein_row)
def finish(self):
    """Log a completion message; no other work is done here."""
    completion_message = 'Done.'
    logger.info(completion_message)
def analyze_domain_selections(self):
    """
    Logs and saves useful info on the domains selected by uniprot_domain_regex.

    The summary of domain names which match the regex is logged; the same
    summary followed by the domain names which do not match is written to
    self.domain_names_filename. Counts for the non-matching names are only
    included when self.count_nonselected_domain_names is true.
    """
    # Local import so this block does not rely on the file-level imports.
    from collections import Counter

    all_domains = self.uniprot_xml.findall('./entry/feature[@type="domain"]')
    # A single pass over the domain features replaces one XPath query per
    # domain name; it also copes with names containing a double quote, which
    # cannot be embedded in the '[@description="..."]' predicate.
    description_counts = Counter(d.get('description') for d in all_domains)

    selected_domain_names = list(
        set(d.get('description') for d in self.selected_domains))
    selected_domain_name_counts = [
        description_counts[name] for name in selected_domain_names
    ]

    counted_row = '{:^{name_width}s} : {:>{pop_width}d}\n'
    name_only_row = '{:^{name_width}s}\n'

    domain_names_str = 'Regex: %s\n' % self.uniprot_domain_regex
    domain_names_str += 'Number of domains matching regex: %d\n\n' % len(
        self.selected_domains)
    domain_names_str += '= Unique domain names which match regex =\n'
    if selected_domain_names:
        # Column widths are loop-invariant, so compute them once.
        name_width = max(len(n) + 4 for n in selected_domain_names)
        pop_width = max(len(str(p)) + 1 for p in selected_domain_name_counts)
        domain_names_str += ''.join(
            counted_row.format(name, count,
                               name_width=name_width, pop_width=pop_width)
            for name, count in zip(selected_domain_names,
                                   selected_domain_name_counts))
    domain_names_str += '\n'

    logger.info(domain_names_str)
    logger.info(
        '(Unique domain names which do not match regex will be output to {0})'
        .format(self.domain_names_filename))

    domain_names_str += '= Unique domain names which do not match regex =\n'
    selected_name_set = set(selected_domain_names)
    nonselected_domain_names = [
        name for name in description_counts if name not in selected_name_set
    ]

    if nonselected_domain_names:
        name_width = max(len(n) + 4 for n in nonselected_domain_names)
        if self.count_nonselected_domain_names:
            nonselected_domain_name_counts = [
                description_counts[name] for name in nonselected_domain_names
            ]
            pop_width = max(
                len(str(p)) + 1 for p in nonselected_domain_name_counts)
            domain_names_str += ''.join(
                counted_row.format(name, count,
                                   name_width=name_width, pop_width=pop_width)
                for name, count in zip(nonselected_domain_names,
                                       nonselected_domain_name_counts))
        else:
            domain_names_str += ''.join(
                name_only_row.format(name, name_width=name_width)
                for name in nonselected_domain_names)
    domain_names_str += '\n'

    with open(self.domain_names_filename, 'w') as domain_names_file:
        domain_names_file.write(domain_names_str)
def extract_detailed_uniprot_data(self, uniprot_entry_node):
    """
    Builds the database rows for a single UniProt XML entry and adds them to
    the db session.

    Names, taxonomy, comments (function, disease, subcellular location),
    isoforms, domain features and cross-references (NCBI Gene, Ensembl, HGNC,
    PDB) are read from uniprot_entry_node, turned into models.* objects and
    added to db.session. Nothing is committed here.

    Always returns None. Nothing is added to the session when the entry is
    listed in self.skip_uniprot_entries, or when it has no
    feature[@type="domain"] nodes.

    uniprot_entry_node: the 'entry' element to process. .xpath() is called on
        it, so it is presumably an lxml element -- TODO confirm.
    """
    # = IDs and names =
    ac = uniprot_entry_node.findtext('./accession')
    entry_name = uniprot_entry_node.findtext('./name')

    if self.skip_uniprot_entries and entry_name in self.skip_uniprot_entries:
        skip_message = self.skip_uniprot_entries[entry_name]
        logger.info(
            'OVERRIDE: Skipping UniProt entry {0} - reason: {1}'.format(
                entry_name, skip_message))
        return

    recommended_name = uniprot_entry_node.findtext(
        './protein/recommendedName/fullName')
    gene_name_data = [
        models.UniProtGeneName(
            crawl_number=self.current_crawl_number,
            gene_name=gene_name_node.text,
            gene_name_type=gene_name_node.get('type'))
        for gene_name_node in uniprot_entry_node.findall('./gene/name')
    ]

    # = Date entry was last modified in UniProt =
    last_uniprot_update = uniprot_entry_node.get('modified')

    # = Taxonomy =
    uniprot_organism_node = uniprot_entry_node.find('organism')
    ncbi_taxon_id = uniprot_organism_node.find(
        'dbReference[@type="NCBI Taxonomy"]').get('id')
    taxon_name_scientific = uniprot_organism_node.findtext(
        'name[@type="scientific"]')
    taxon_name_common = uniprot_organism_node.findtext(
        'name[@type="common"]')
    lineage = uniprot_organism_node.find('lineage')
    # Iterating an element yields its children; getchildren() is deprecated.
    lineage_csv = ','.join([taxon.text for taxon in lineage])

    # = Functions, disease associations, subcellular locations =
    functions = [
        models.UniProtFunction(
            crawl_number=self.current_crawl_number,
            function=comment.findtext('./text'))
        for comment in uniprot_entry_node.findall(
            './comment[@type="function"]')
    ]
    disease_associations = [
        models.UniProtDiseaseAssociation(
            crawl_number=self.current_crawl_number,
            disease_association=comment.findtext('./text'))
        for comment in uniprot_entry_node.findall(
            './comment[@type="disease"]')
    ]
    subcellular_locations = [
        models.UniProtSubcellularLocation(
            crawl_number=self.current_crawl_number,
            subcellular_location=comment.findtext(
                './subcellularLocation/location'))
        for comment in uniprot_entry_node.findall(
            './comment[@type="subcellular location"]')
    ]

    # = Canonical isoform =
    # Each item of isoforms is a tuple: (isoform object, list of note objects)
    isoforms = []

    # Returned UniProt XML contains sequence data only for the canonical isoform
    uniprot_canonical_sequence_node = uniprot_entry_node.find(
        './sequence[@length][@mass]')
    canonical_sequence = ''.join(
        uniprot_canonical_sequence_node.text.split())
    canonical_isoform = models.UniProtIsoform(
        crawl_number=self.current_crawl_number,
        ac=ac + '-1',
        is_canonical=True,
        length=uniprot_canonical_sequence_node.get('length'),
        mass=uniprot_canonical_sequence_node.get('mass'),
        date_modified=uniprot_canonical_sequence_node.get('modified'),
        version=uniprot_canonical_sequence_node.get('version'),
        sequence=canonical_sequence)
    # empty list for notes (which do not exist for the canonical sequence)
    isoforms.append((canonical_isoform, []))

    # = Alternative isoforms =
    # Canonical isoform is given the attrib type="displayed", meaning that the
    # sequence is displayed in the HTML version of the entry
    # Example alt isoform:
    # <comment>
    #   <isoform>
    #     <id>P00519-2</id>
    #     <name>IB</name>
    #     <sequence type="described" ref="VSP_004957"/>
    #     <note>Contains a N-myristoyl glycine at position 2.</note>
    #   </isoform>
    # </comment>
    for uniprot_isoform_node in uniprot_entry_node.findall('comment/isoform'):
        isoform_ac = uniprot_isoform_node.findtext('id')
        seq_node = uniprot_isoform_node.find('sequence')
        notes = [
            models.UniProtIsoformNote(
                crawl_number=self.current_crawl_number, note=node.text)
            for node in uniprot_isoform_node.findall('note')
        ]
        if seq_node.get('type') != 'displayed':
            alternative_isoform = models.UniProtIsoform(
                crawl_number=self.current_crawl_number,
                ac=isoform_ac,
                is_canonical=False)
            isoforms.append((alternative_isoform, notes))

    # = UniProt "Protein kinase" domain annotations =
    # XXX TODO Generalize
    # All domain features are stored; those matching uniprot_domain_regex are
    # flagged as target domains below.
    domains = uniprot_entry_node.findall('feature[@type="domain"]')

    # Skip if no domains found
    if not domains:
        return

    # Finally, add the domains to the new database
    domain_objs = []
    target_iter = 0
    for domain_id, domain in enumerate(domains):
        domain_description = domain.get('description')
        if self.uniprot_domain_regex and re.match(
                self.uniprot_domain_regex, domain_description):
            is_target_domain = True
            target_id = entry_name + '_D' + str(target_iter)
            target_iter += 1
        else:
            is_target_domain = False
            # Reset explicitly: otherwise target_id is unbound for a leading
            # non-target domain, or left over from a previous target domain.
            target_id = None

        # Calculate the domain length and sequence
        begin = int(domain.find('./location/begin').get('position'))
        end = int(domain.find('./location/end').get('position'))
        length = end - begin + 1
        domain_seq = canonical_sequence[begin - 1:end]

        if (self.pseudodomain_manual_annotations
                and entry_name in self.pseudodomain_manual_annotations
                and domain_description ==
                self.pseudodomain_manual_annotations[entry_name].get(
                    'description')):
            pseudodomain_notes = self.pseudodomain_manual_annotations[
                entry_name].get('message')
            # Non-target domains have no target ID, so identify them by their
            # description in the log message instead.
            logger.info(
                'OVERRIDE: Labeling domain "{0}" as a pseudodomain - reason: {1}'
                .format(target_id if is_target_domain else domain_description,
                        pseudodomain_notes))
            is_pseudodomain = True
        else:
            is_pseudodomain = False
            pseudodomain_notes = None

        domain_objs.append(
            models.UniProtDomain(
                crawl_number=self.current_crawl_number,
                domain_id=domain_id,
                target_id=target_id,
                is_target_domain=is_target_domain,
                description=domain_description,
                is_pseudodomain=is_pseudodomain,
                pseudodomain_notes=pseudodomain_notes,
                begin=begin,
                end=end,
                length=length,
                sequence=domain_seq))

    # = References to other DBs =
    # NCBI Gene
    ncbi_gene_entries = []
    gene_ids = [
        int(gene_ref.get('id')) for gene_ref in uniprot_entry_node.findall(
            './dbReference[@type="GeneID"]')
    ]
    # manual annotations
    if (self.ncbi_gene_id_manual_annotations
            and entry_name in self.ncbi_gene_id_manual_annotations):
        gene_ids = self.ncbi_gene_id_manual_annotations[entry_name].get(
            'gene_ids')
        gene_ids_message = self.ncbi_gene_id_manual_annotations[
            entry_name].get('message')
        logger.info(
            'OVERRIDE: Manually annotating Gene IDs for entry {0} - reason: {1}'
            .format(entry_name, gene_ids_message))

    for gene_id in gene_ids:
        # manual override skips
        if self.skip_ncbi_gene_entries and gene_id in self.skip_ncbi_gene_entries:
            skip_gene_id_message = self.skip_ncbi_gene_entries[gene_id]
            logger.info(
                'OVERRIDE: Skipping Gene ID {0} for entry {1} - reason: {2}'
                .format(gene_id, entry_name, skip_gene_id_message))
            continue
        ncbi_gene_entries.append(
            models.NCBIGeneEntry(crawl_number=self.current_crawl_number,
                                 gene_id=gene_id))

    # Ensembl
    # ensembl_data = {
    #     'ENSMUST00000003710': {
    #         'gene': 'ENSG000...',
    #         'protein': 'ENSP000...',
    #         'uniprot_isoform_ac': '...' or None,
    #     }
    # }
    ensembl_transcript_nodes = uniprot_entry_node.findall(
        './dbReference[@type="Ensembl"]')
    ensembl_data = {}
    canonical_isoform_ac = isoforms[0][0].ac
    ensembl_transcript_matched_to_uniprot_isoform = False
    for transcript_node in ensembl_transcript_nodes:
        ensembl_transcript_id = transcript_node.get('id')

        ensembl_gene_nodes = transcript_node.findall(
            'property[@type="gene ID"]')
        if len(ensembl_gene_nodes) > 1:
            logger.info(
                'WARNING: Ensembl transcript {0} linked with > 1 gene ID'.
                format(ensembl_transcript_id))
        ensembl_gene_id = ensembl_gene_nodes[0].get('value')

        ensembl_protein_nodes = transcript_node.findall(
            'property[@type="protein sequence ID"]')
        if len(ensembl_protein_nodes) > 1:
            logger.info(
                'WARNING: Ensembl transcript {0} linked with > 1 protein ID'
                .format(ensembl_transcript_id))
        ensembl_protein_id = ensembl_protein_nodes[0].get('value')

        uniprot_isoform_molecule_node = transcript_node.find('molecule')
        if uniprot_isoform_molecule_node is not None:
            uniprot_isoform_ac = uniprot_isoform_molecule_node.get('id')
            if uniprot_isoform_ac == canonical_isoform_ac:
                ensembl_transcript_matched_to_uniprot_isoform = True
        elif not ensembl_transcript_matched_to_uniprot_isoform:
            # The first transcript without a <molecule> node is assigned to
            # the canonical isoform, unless one has already been matched.
            uniprot_isoform_ac = canonical_isoform_ac
            ensembl_transcript_matched_to_uniprot_isoform = True
        else:
            uniprot_isoform_ac = None

        ensembl_data[ensembl_transcript_id] = {
            'gene': ensembl_gene_id,
            'protein': ensembl_protein_id,
            'uniprot_isoform_ac': uniprot_isoform_ac
        }

    # HGNC
    hgnc_entries = []
    for hgnc_dbref in uniprot_entry_node.findall(
            './dbReference[@type="HGNC"]'):
        hgnc_gene_id = hgnc_dbref.get('id')
        approved_symbol = hgnc_dbref.find(
            'property[@type="gene designation"]').get('value')
        hgnc_entries.append(
            models.HGNCEntry(crawl_number=self.current_crawl_number,
                             gene_id=hgnc_gene_id,
                             approved_symbol=approved_symbol))

    # = Family information =
    similarity_comments = uniprot_entry_node.xpath(
        './comment[@type="similarity"]')
    family = False
    for similarity_comment in similarity_comments:
        for similarity_text in kinase_family_uniprot_similarity_text.keys():
            if similarity_text in similarity_comment.findtext('text'):
                family = kinase_family_uniprot_similarity_text[
                    similarity_text]

    # = PDB entries (from UniProt XML) =
    # keep X-ray and NMR structures (not "Model")
    pdbs = uniprot_entry_node.xpath(
        './dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
    )
    pdb_data = []
    for pdb_ref in pdbs:
        pdb_id = pdb_ref.get('id')
        if self.skip_pdbs and pdb_id in self.skip_pdbs:
            skip_pdb_message = self.skip_pdbs[pdb_id]
            logger.info(
                'OVERRIDE: Skipping PDB {0} for entry {1} - reason: {2}'.
                format(pdb_id, entry_name, skip_pdb_message))
            continue
        pdb_method = pdb_ref.find('property[@type="method"]').get('value')
        resolution_node = pdb_ref.find('property[@type="resolution"]')
        resolution = (resolution_node.get('value')
                      if resolution_node is not None else None)
        chains_span_str = pdb_ref.find(
            'property[@type="chains"]').get('value')
        chains_span = parse_uniprot_pdbref_chains(chains_span_str)
        chain_data_dicts = []
        for chain_id in chains_span.keys():
            pdb_begin = chains_span[chain_id][0]
            pdb_end = chains_span[chain_id][1]
            # Use the begin and end info to decide if this pdb chain includes
            # the domain. But we will get other sequence info from sifts XML
            # files, using gather-pdb.py
            # Have to check against each domain
            for domain_obj in domain_objs:
                # A chain is kept if it covers the domain to within 30
                # residues at each end.
                if (pdb_begin < domain_obj.begin + 30
                        and pdb_end > domain_obj.end - 30):
                    chain_obj = models.PDBChain(
                        crawl_number=self.current_crawl_number,
                        chain_id=chain_id,
                        begin=pdb_begin,
                        end=pdb_end)
                    chain_data_dicts.append({
                        'chain_obj': chain_obj,
                        'domain_obj': domain_obj
                    })

        if len(chain_data_dicts) > 0:
            pdb_obj = models.PDBEntry(
                crawl_number=self.current_crawl_number,
                pdb_id=pdb_id,
                method=pdb_method,
                resolution=resolution)
            pdb_data.append({
                'pdb_obj': pdb_obj,
                'chain_data_dicts': chain_data_dicts
            })

    # ========
    # Construct data objects and add to db
    # ========

    db_entry = models.DBEntry(
        crawl_number=self.current_crawl_number,
        npdbs=len(pdb_data),
        ndomains=len(domain_objs),
        nisoforms=len(isoforms),
        nfunctions=len(functions),
        ndisease_associations=len(disease_associations),
    )
    db.session.add(db_entry)

    uniprot_entry = models.UniProtEntry(
        crawl_number=self.current_crawl_number,
        ac=ac,
        entry_name=entry_name,
        last_uniprot_update=last_uniprot_update,
        ncbi_taxon_id=ncbi_taxon_id,
        db_entry=db_entry,
        recommended_name=recommended_name,
        taxon_name_scientific=taxon_name_scientific,
        taxon_name_common=taxon_name_common,
        lineage=lineage_csv,
    )
    if family:
        uniprot_entry.family = family
    db.session.add(uniprot_entry)

    for function_obj in functions:
        function_obj.db_entry = db_entry
        function_obj.uniprot_entry = uniprot_entry
        db.session.add(function_obj)

    for disease_association_obj in disease_associations:
        disease_association_obj.db_entry = db_entry
        disease_association_obj.uniprot_entry = uniprot_entry
        db.session.add(disease_association_obj)

    for subcellular_location_obj in subcellular_locations:
        subcellular_location_obj.db_entry = db_entry
        subcellular_location_obj.uniprot_entry = uniprot_entry
        db.session.add(subcellular_location_obj)

    for isoform_obj, notes in isoforms:
        isoform_obj.db_entry = db_entry
        isoform_obj.uniprot_entry = uniprot_entry
        db.session.add(isoform_obj)
        for note_obj in notes:
            note_obj.uniprotisoform = isoform_obj
            db.session.add(note_obj)

    for domain_obj in domain_objs:
        domain_obj.db_entry = db_entry
        domain_obj.uniprot_entry = uniprot_entry
        db.session.add(domain_obj)

    for pdb_data_dict in pdb_data:
        pdb_obj = pdb_data_dict['pdb_obj']
        pdb_obj.db_entry = db_entry
        db.session.add(pdb_obj)
        for chain_data_dict in pdb_data_dict['chain_data_dicts']:
            chain_obj = chain_data_dict['chain_obj']
            chain_obj.pdb_entry = pdb_obj
            chain_obj.uniprot_domain = chain_data_dict['domain_obj']
            db.session.add(chain_obj)

    for gene_name_obj in gene_name_data:
        gene_name_obj.db_entry = db_entry
        db.session.add(gene_name_obj)

    # Loop variables are deliberately not named after the model classes.
    for ncbi_gene_entry in ncbi_gene_entries:
        ncbi_gene_entry.db_entry = db_entry
        db.session.add(ncbi_gene_entry)

    for hgnc_entry in hgnc_entries:
        hgnc_entry.db_entry = db_entry
        db.session.add(hgnc_entry)

    for ensembl_transcript_id in ensembl_data:
        transcript_data = ensembl_data[ensembl_transcript_id]

        ensembl_gene_row = models.EnsemblGene(
            crawl_number=self.current_crawl_number,
            gene_id=transcript_data['gene'],
            db_entry=db_entry,
        )
        db.session.add(ensembl_gene_row)

        ensembl_transcript_row = models.EnsemblTranscript(
            crawl_number=self.current_crawl_number,
            transcript_id=ensembl_transcript_id,
            ensembl_gene=ensembl_gene_row,
        )
        ensembl_transcript_uniprot_isoform_ac = transcript_data[
            'uniprot_isoform_ac']
        if ensembl_transcript_uniprot_isoform_ac is not None:
            matching_uniprot_isoform_objs = [
                isoform[0] for isoform in isoforms
                if isoform[0].ac == ensembl_transcript_uniprot_isoform_ac
            ]
            if len(matching_uniprot_isoform_objs) != 0:
                ensembl_transcript_row.uniprot_isoform = (
                    matching_uniprot_isoform_objs[0])
        db.session.add(ensembl_transcript_row)

        ensembl_protein_row = models.EnsemblProtein(
            crawl_number=self.current_crawl_number,
            protein_id=transcript_data['protein'],
            ensembl_gene=ensembl_gene_row,
            ensembl_transcript=ensembl_transcript_row,
        )
        db.session.add(ensembl_protein_row)
def extract_mutation_data(self):
    """
    Adds a CbioportalMutation row (and, where needed, a CbioportalCase row)
    to the db session for each row of self.maf_df whose Transcript_ID matches
    an EnsemblTranscript already in the database.

    MAF rows with no matching transcript are skipped. For missense mutations
    the amino acid change is split into reference residue, position and
    variant residue; a mutation is flagged as in_uniprot_domain when that
    position lies within one of the UniProt domains of the matching DB entry.
    Nothing is committed here.
    """
    case_rows = {}
    n_mutations_added = 0
    for _, maf_row in self.maf_df.iterrows():
        oncotator_ensembl_transcript_id = maf_row.Transcript_ID
        matching_db_ensembl_transcript_row = models.EnsemblTranscript.query.filter_by(
            transcript_id=oncotator_ensembl_transcript_id).first()
        if matching_db_ensembl_transcript_row is None:
            continue

        study = 'internal'
        case_id = maf_row.Tumor_Sample_Barcode
        if case_id not in case_rows:
            case_rows[case_id] = models.CbioportalCase(
                crawl_number=self.current_crawl_number,
                case_id=case_id,
                study=study)
            db.session.add(case_rows[case_id])

        # Not called 'type', which would shadow the builtin.
        variant_classification = maf_row.Variant_Classification
        chromosome_index = maf_row.Chromosome
        chromosome_startpos = maf_row.Start_Position
        chromosome_endpos = maf_row.End_Position
        reference_dna_allele = maf_row.Reference_Allele
        # Use whichever tumor allele differs from the reference; fall back to
        # allele 1 if neither does.
        if maf_row.Tumor_Seq_Allele1 != reference_dna_allele:
            variant_dna_allele = maf_row.Tumor_Seq_Allele1
        elif maf_row.Tumor_Seq_Allele2 != reference_dna_allele:
            variant_dna_allele = maf_row.Tumor_Seq_Allele2
        else:
            variant_dna_allele = maf_row.Tumor_Seq_Allele1

        cbioportal_aa_change_string = None
        oncotator_reference_aa = None
        oncotator_aa_pos = None
        oncotator_variant_aa = None
        aa_change = maf_row.Amino_Acid_Change
        # Test for a missing value by type and value: "is not np.nan" is an
        # identity test, which fails for NaN floats that are not the np.nan
        # object itself.
        aa_change_is_missing = aa_change is None or (
            isinstance(aa_change, float) and np.isnan(aa_change))
        if not aa_change_is_missing:
            aa_change_regex_match = re.match(self.aa_change_regex, aa_change)
            if aa_change_regex_match:
                cbioportal_aa_change_string = aa_change_regex_match.groups()[0]
                if variant_classification == 'Missense_Mutation':
                    aa_change_split_regex_match = re.match(
                        self.aa_change_split_regex,
                        cbioportal_aa_change_string)
                    if aa_change_split_regex_match:
                        aa_change_groups = aa_change_split_regex_match.groups()
                        oncotator_reference_aa = aa_change_groups[0]
                        oncotator_aa_pos = int(aa_change_groups[1])
                        oncotator_variant_aa = aa_change_groups[2]

        validation_status = maf_row.Validation_Status
        functional_impact_score = maf_row['MA:FImpact']

        # Per-row debug trace, sent through the logger like the rest of the
        # file's output rather than printed unconditionally.
        logger.debug(
            '%s %s %s %s %s', variant_classification,
            cbioportal_aa_change_string, oncotator_reference_aa,
            oncotator_aa_pos, oncotator_variant_aa)

        db_entry = matching_db_ensembl_transcript_row.ensembl_gene.db_entry
        mutation_row = models.CbioportalMutation(
            crawl_number=self.current_crawl_number,
            type=variant_classification,
            cbioportal_aa_change_string=cbioportal_aa_change_string,
            mutation_origin=None,
            validation_status=validation_status,
            functional_impact_score=functional_impact_score,
            chromosome_index=chromosome_index,
            chromosome_startpos=chromosome_startpos,
            chromosome_endpos=chromosome_endpos,
            reference_dna_allele=reference_dna_allele,
            variant_dna_allele=variant_dna_allele,
            oncotator_aa_pos=oncotator_aa_pos,
            oncotator_reference_aa=oncotator_reference_aa,
            oncotator_variant_aa=oncotator_variant_aa,
            oncotator_ensembl_transcript_id=oncotator_ensembl_transcript_id,
            db_entry=db_entry,
            cbioportal_case=case_rows[case_id],
            in_uniprot_domain=False,
        )

        # is mutation within a uniprot domain?
        # Only mutations with a parsed amino acid position can be placed;
        # comparing None with the domain bounds is not meaningful.
        if oncotator_aa_pos is not None:
            matching_uniprot_domains = db_entry.uniprot_domains.all()
            for domain in matching_uniprot_domains:
                if domain.begin <= oncotator_aa_pos <= domain.end:
                    if oncotator_reference_aa != cbioportal_aa_change_string[0]:
                        continue
                    mutation_row.in_uniprot_domain = True
                    mutation_row.uniprot_domain = domain

        db.session.add(mutation_row)
        n_mutations_added += 1

    logger.info(
        'From {} mutation annotations, added {} mutations and {} cases.'.
        format(len(self.maf_df), n_mutations_added, len(case_rows)))
def commit(self):
    """
    Commits the db session, then logs the current crawl number (as the new
    "safe" crawl number) and the next one (as the new current crawl number).
    """
    db.session.commit()
    crawl_number = self.current_crawl_number
    messages = (
        'Database committed.',
        'New safe crawl number: {0}'.format(crawl_number),
        'New current crawl number: {0}'.format(crawl_number + 1),
        'Done.',
    )
    for message in messages:
        logger.info(message)
def extract_sifts_seq(sifts_filepath, uniprot_ac, uniprot_entry_name, pdb_id,
                      chain_id, uniprot_sequence):
    """
    Extracts sequence data for one PDB chain from a gzipped SIFTS XML file and
    aligns it against the full UniProt sequence.

    sifts_filepath: path to the gzipped SIFTS XML file
    uniprot_ac: UniProt accession the chain was picked up from
    uniprot_entry_name: UniProt entry name (used only in a log message)
    pdb_id: PDB ID (used for the hard-coded residue numbering fix and in
        messages)
    chain_id: PDB chain ID to extract
    uniprot_sequence: full UniProt sequence; its length sets the length of the
        alignment strings

    Returns a dict with keys 'chain_id', 'experimental_seq',
    'experimental_seq_aln_conflicts', 'observed_seq_aln_exp',
    'observed_seq_aln', 'observed_ss_aln' and 'exception_message'.
    'exception_message' is 'DELETE_ME' when the UniProt AC found in the SIFTS
    file does not match uniprot_ac, when more than 90% of the residues are
    unobserved, or when no residue has a UniProt crossref; otherwise None.

    Raises KeyError if a residue name cannot be converted to a single-letter
    code, and Exception if a PDB residue number cannot be converted to int.
    """
    # Modified residues which are missing from the BioPython 3-to-1 table
    extra_single_letter_codes = {
        'CAS': 'C',  # S-(dimethylarsenic)cysteine
        'MHO': 'M',  # S-oxymethionine
        'LGY': 'K',  # 3NX8. (E)-N-(4-oxobutylidene)lysine
        'AME': 'M',  # N-acetylmethionine
        'NMM': 'R',  # 3KB7
        'OCY': 'C',  # 2R9S
        'CY0': 'C',  # 2J5E
        'CY7': 'C',  # 2JIV
    }

    exception_message = None

    # Close the gzip handle once the file has been read
    with gzip.open(sifts_filepath, 'r') as sifts_file:
        sifts = etree.fromstring(sifts_file.read())

    # First check whether the first residue with matching chainID and a UniProt crossref has the same UniProt AC as was picked up from UniProt (by gather-uniprot.py).
    # 3O50 and 3O51 are picked up by gather-uniprot.py from uniprot AC O14965. But these have uniprot AC B4DX16 in the sifts .xml files, which is a TrEMBL entry. Sequences are almost identical except for deletion of ~70 residues prior to PK domain of B4DX16. This means that experimental_sequence_aln and related sequences are not added by gather-pdb.py. Need to sort out a special case for these pdbs. Should check for similar cases in other kinases.
    # 3O50 and 3O51 can be ignored. (Plenty of other PDBs for that protein)
    # 3OG7 is picked up from uniprot AC P15056, but the PDB entry links to Q5IBP5 - this is the AKAP9-BRAF fusion protein.
    # XXX TODO XXX 3OG7 will be ignored for now, but at some point should make separate entries for fusion proteins, and add the PDB files accordingly.
    first_matching_uniprot_resi = sifts.find(
        'entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/../crossRefDb[@dbSource="UniProt"]'
        % chain_id)
    sifts_uniprot_ac = first_matching_uniprot_resi.get('dbAccessionId')
    if uniprot_ac != sifts_uniprot_ac:
        logger.info(
            'PDB %s chain %s picked up from UniProt entry %s %s. Non-matching UniProtAC in sifts: %s. This chain will be deleted.'
            % (pdb_id, chain_id, uniprot_entry_name, uniprot_ac,
               sifts_uniprot_ac))
        exception_message = 'DELETE_ME'

    # TODO check if there are any PDBs where two proteins share the same chainID (I seem to remember that there are - check previous scripts)

    # ======
    # Extract sequence data from the SIFTS XML
    # ======

    # These are the sifts residues which include a PDB crossref with matching chainID
    chain_residues = sifts.findall(
        'entity[@type="protein"]/segment/listResidue/residue/crossRefDb[@dbSource="PDB"][@dbChainId="%s"]/..'
        % chain_id)
    experimental_sequence = ''
    experimental_sequence_pdb_resids = []
    experimental_sequence_uniprot_res_indices = []
    observed_sequence_aln_exp = ''
    # This will contain the alignment of the experimental sequence against the full UniProt sequence. Conflicting residues will be added if they are contiguous with non-conflicting segments. NOTE: this is no longer added to the database.
    experimental_sequence_aln = ['-'] * len(uniprot_sequence)
    # Same, but conflicting residues are added as lower case
    experimental_sequence_aln_conflicts = ['-'] * len(uniprot_sequence)
    # This will contain the alignment of the observed sequence against the full UniProt sequence. Conflicting residues will be ignored.
    observed_sequence_aln = ['-'] * len(uniprot_sequence)
    # This will contain the alignment of the secondary structure codes against the full UniProt sequence. Conflicting residues will be ignored.
    ss_aln = ['-'] * len(uniprot_sequence)

    for r in chain_residues:
        residue_details = r.findall('residueDetail')
        residue_detail_texts = [
            detail.text.strip() for detail in residue_details
        ]  # list of strings
        ss = r.findtext('residueDetail[@property="codeSecondaryStructure"]')
        resname = r.attrib['dbResName']
        if resname is None:
            raise Exception(
                'ERROR: UniProt crossref not found for conflicting residue! {0} {1} {2} {3}'
                .format(uniprot_ac, pdb_id, chain_id, r.attrib))
        try:
            # Note that this BioPython dict converts a modified aa to the single-letter code of its unmodified parent (e.g. "TPO":"T")
            single_letter = Bio.Data.SCOPData.protein_letters_3to1[resname]
        except KeyError:
            if resname == 'ACE':  # Just ignore N-terminal ACE
                continue
            if resname not in extra_single_letter_codes:
                raise KeyError(
                    'Problem converting resname {0} to single letter code. {1} {2}'
                    .format(resname, chain_id, r.attrib))
            single_letter = extra_single_letter_codes[resname]

        # Add residue to experimental_sequence
        experimental_sequence += single_letter

        # Also save the pdb resids, which we will use later
        pdb_resid = r.find('crossRefDb[@dbSource="PDB"]').attrib['dbResNum']
        # TODO need to generalize this. Shift to manual_overrides.yaml or do something else? In the short-term, perhaps just skip these PDBs?
        # Some pdb resids are e.g. '464A'
        if not pdb_resid.isdigit():
            if pdb_id in ['1O6L', '2JDO', '2JDR', '2UW9', '2X39', '2XH5']:
                # These pdbs include three residues with pdb resids 464A, 464B, 464C, (all with UniProt crossrefs) then continues from 465. We will change this so that the pdb resids continue to iterate
                # NOTE(review): purely numeric resids (e.g. '465') never reach this branch, so they are not shifted - confirm this is intended
                corrected_pdb_resids = {'464A': 465, '464B': 466, '464C': 467}
                if pdb_resid in corrected_pdb_resids:
                    pdb_resid = corrected_pdb_resids[pdb_resid]
                elif int(pdb_resid[0:3]) > 464:
                    pdb_resid = int(pdb_resid) + 3
            # Otherwise just extract the number (this will also detect negative numbers)
            else:
                pdb_resid = ''.join([
                    char for char in pdb_resid
                    if (char.isdigit() or char == '-')
                ])
        try:
            experimental_sequence_pdb_resids.append(int(pdb_resid))
        except (TypeError, ValueError):
            raise Exception(
                'Problem converting pdb_resid into int. {0} {1} {2} {3}'
                .format(uniprot_ac, pdb_id, chain_id, pdb_resid))

        # Also add residue to experimental_sequence_aln. Residues which do not match the uniprot sequence (and thus do not have a uniprot crossref) will be added later
        crossref_uniprot = r.find(
            'crossRefDb[@dbSource="UniProt"][@dbAccessionId="%s"]'
            % uniprot_ac)
        if crossref_uniprot is not None:
            index = int(crossref_uniprot.attrib['dbResNum']) - 1
            experimental_sequence_aln[index] = single_letter
            if 'Conflict' in residue_detail_texts or 'Engineered mutation' in residue_detail_texts:
                experimental_sequence_aln_conflicts[index] = single_letter.lower()
            else:
                experimental_sequence_aln_conflicts[index] = single_letter
            experimental_sequence_uniprot_res_indices.append(index)
            # Add residue to observed_sequence_aln if it is observed and is not a conflict
            if 'Not_Observed' not in residue_detail_texts and (
                    'Conflict' not in residue_detail_texts
                    or 'Engineered mutation' in residue_detail_texts):
                observed_sequence_aln[index] = single_letter
                if ss is not None:
                    ss_aln[index] = ss
        else:
            experimental_sequence_uniprot_res_indices.append(None)

        # Add residue to observed_sequence_aln_exp if it is observed, otherwise '-'
        if 'Not_Observed' in residue_detail_texts:
            observed_sequence_aln_exp += '-'
        else:
            observed_sequence_aln_exp += single_letter

    # Now check whether the number of non-observed residues is more than 90% of the experimental sequence length
    n_unobserved_residues = observed_sequence_aln_exp.count('-')
    if (float(n_unobserved_residues) / float(len(experimental_sequence))) > 0.9:
        exception_message = 'DELETE_ME'

    # ======
    # Now we add the residues which do not have a UniProt crossref
    # ======

    i = 0

    # But first we have to deal with cases where residues have been added at the N-terminus which extend before the start of the uniprot sequence. The excess residues will be ignored.
    # Get the uniprot residue index of the first residue with a uniprot crossref
    first_exp_seq_uniprot_res_index = None
    corresponding_pdb_resid = None
    for s, UP_res_index in enumerate(experimental_sequence_uniprot_res_indices):
        if UP_res_index is not None:
            first_exp_seq_uniprot_res_index = UP_res_index
            # And the corresponding pdb resid
            corresponding_pdb_resid = experimental_sequence_pdb_resids[s]
            break

    # And get the pdb resid of the first residue in the experimental sequence
    first_exp_seq_pdb_resid = None
    for s in experimental_sequence_pdb_resids:
        if s is not None:
            first_exp_seq_pdb_resid = s
            break

    ignore_excess_Nterm_residues_flag = False
    if first_exp_seq_uniprot_res_index is None:
        # No residue has a UniProt crossref (previously detected via a bare
        # except around the comparison below)
        # XXX should do something better than this
        # occurs with P27791 (KAPCA_RAT)
        exception_message = 'DELETE_ME'
    # If the experimental sequence includes the first residue of the full uniprot sequence
    elif first_exp_seq_uniprot_res_index == 0:
        # And if the value of the first pdb resid is lower than that of the pdb resid corresponding to the first uniprot residue
        if first_exp_seq_pdb_resid < corresponding_pdb_resid:
            # Then we will ignore the excess residues
            ignore_excess_Nterm_residues_flag = True

    # Now iterate through the residues in the experimental sequence and add residues which do not have a uniprot crossref, but are contiguous in terms of PDB numbering
    while i < len(experimental_sequence):
        resname_i = experimental_sequence[i]
        uniprot_res_index_i = experimental_sequence_uniprot_res_indices[i]
        pdb_resid_i = experimental_sequence_pdb_resids[i]

        if ignore_excess_Nterm_residues_flag and (pdb_resid_i < corresponding_pdb_resid):
            pass  # we ignore these residues

        # If this residue does not have a uniprot crossref
        elif uniprot_res_index_i is None:
            # Start a list of residues with no uniprot crossref
            contiguous_noUP_residues = [resname_i]
            # Then check the next residue
            j = i + 1
            while j < len(experimental_sequence):
                resname_j = experimental_sequence[j]
                uniprot_res_index_j = experimental_sequence_uniprot_res_indices[j]
                pdb_resid_j = experimental_sequence_pdb_resids[j]

                # If this residue also has no uniprot crossref, and is contiguous in terms of pdb resnum, then add it to the list, and move on to the next one
                if (uniprot_res_index_j is None) and ((pdb_resid_j - pdb_resid_i) == (j - i)):
                    contiguous_noUP_residues.append(resname_j)

                # If this residue does have a uniprot crossref, and if it is contiguous in terms of pdb resnum, then we add the list of residues without uniprot crossrefs at this position
                elif (uniprot_res_index_j is not None) and ((pdb_resid_j - pdb_resid_i) == (j - i)):
                    experimental_sequence_aln[
                        (uniprot_res_index_j - j):uniprot_res_index_j] = contiguous_noUP_residues
                    experimental_sequence_aln_conflicts[
                        (uniprot_res_index_j - j):uniprot_res_index_j] = list(
                            ''.join(contiguous_noUP_residues).lower())
                    i = j
                    break

                # If this residue is not contiguous in terms of pdb resnum, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                elif (pdb_resid_j - pdb_resid_i) != (j - i):
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i - 1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i - 1]
                        experimental_sequence_aln[
                            last_uniprot_res_index + 1:last_uniprot_res_index + 1 + (j - i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[
                            last_uniprot_res_index + 1:last_uniprot_res_index + 1 + (j - i)] = list(
                                ''.join(contiguous_noUP_residues).lower())
                    i = j - 1
                    break

                # If we have reached the end of experimental_sequence, go back and check if the first of contiguous_noUP_residues is pdb-contiguous with the previous residue - if so, add contiguous_noUP_residues
                if j == len(experimental_sequence) - 1:
                    if (pdb_resid_i - experimental_sequence_pdb_resids[i - 1]) == 1:
                        last_uniprot_res_index = experimental_sequence_uniprot_res_indices[i - 1]
                        experimental_sequence_aln[
                            last_uniprot_res_index + 1:last_uniprot_res_index + 2 + (j - i)] = contiguous_noUP_residues
                        experimental_sequence_aln_conflicts[
                            last_uniprot_res_index + 1:last_uniprot_res_index + 2 + (j - i)] = list(
                                ''.join(contiguous_noUP_residues).lower())
                    i = j
                    break
                j += 1

        i += 1

    # ======
    # Some final processing
    # ======

    # In cases such as 3LAU and 1O6L, additional sequence at end makes experimental_sequence_aln longer than uniprot_sequence by 1
    # Handle this by removing the extraneous sequence
    if len(experimental_sequence_aln) != len(uniprot_sequence):
        experimental_sequence_aln = experimental_sequence_aln[
            0:len(uniprot_sequence)]
        experimental_sequence_aln_conflicts = experimental_sequence_aln_conflicts[
            0:len(uniprot_sequence)]

    experimental_sequence_aln = ''.join(experimental_sequence_aln)
    experimental_sequence_aln_conflicts = ''.join(
        experimental_sequence_aln_conflicts)
    observed_sequence_aln = ''.join(observed_sequence_aln)
    ss_aln = ''.join(ss_aln)

    chain_results_dict = {
        'chain_id': chain_id,
        'experimental_seq': experimental_sequence,
        'experimental_seq_aln_conflicts': experimental_sequence_aln_conflicts,
        'observed_seq_aln_exp': observed_sequence_aln_exp,
        'observed_seq_aln': observed_sequence_aln,
        'observed_ss_aln': ss_aln,
        'exception_message': exception_message,
    }
    return chain_results_dict
def analyze_domain_selections(self):
    """
    Logs a summary of the domains selected by uniprot_domain_regex and writes
    a fuller report to self.domain_names_filename.

    The report has two sections:
      * unique domain descriptions found in self.selected_domains, each with
        the number of matching domain features in self.uniprot_xml (this
        section is also sent to the logger);
      * unique domain descriptions present in self.uniprot_xml but not in the
        selected set. Counts for these are only computed (via an XPath
        count() query) when self.count_nonselected_domain_names is true.

    Names are listed in set-iteration order, i.e. no particular order.
    """
    selected_domain_names = list(set(
        [d.get('description') for d in self.selected_domains]
    ))
    selected_domain_name_counts = [
        len(self.uniprot_xml.findall('entry/feature[@type="domain"][@description="%s"]' % name))
        for name in selected_domain_names
    ]

    # Collect the report as a list of fragments and join once at each point
    # where the text is needed, instead of growing a string with +=.
    report_parts = [
        'Regex: %s\n' % self.uniprot_domain_regex,
        'Number of domains matching regex: %d\n\n' % len(self.selected_domains),
        '= Unique domain names which match regex =\n',
    ]

    if selected_domain_names:
        # Column widths are the same for every row, so compute them once.
        # max() is only called on non-empty lists (it raises on empty input).
        name_width = max(len(n) + 4 for n in selected_domain_names)
        pop_width = max(len(str(p)) + 1 for p in selected_domain_name_counts)
        for name, count in zip(selected_domain_names, selected_domain_name_counts):
            report_parts.append(
                '{:^{name_width}s} : {:>{pop_width}d}\n'.format(
                    name, count, name_width=name_width, pop_width=pop_width
                )
            )
    report_parts.append('\n')

    # Only the "selected" section is logged; the rest goes to the file.
    logger.info(''.join(report_parts))
    logger.info(
        '(Unique domain names which do not match regex will be output to {0})'.format(
            self.domain_names_filename
        )
    )

    all_domains = self.uniprot_xml.findall('./entry/feature[@type="domain"]')
    report_parts.append('= Unique domain names which do not match regex =\n')

    # Set membership keeps the filter linear in the number of domain features.
    selected_name_lookup = set(selected_domain_names)
    nonselected_domain_names = list(set([
        d.get('description') for d in all_domains
        if d.get('description') not in selected_name_lookup
    ]))

    if nonselected_domain_names:
        nonselected_name_width = max(len(n) + 4 for n in nonselected_domain_names)
    else:
        nonselected_name_width = 0

    if self.count_nonselected_domain_names:
        nonselected_domain_name_counts = [
            int(
                self.uniprot_xml.xpath(
                    'count(entry/feature[@type="domain"][@description="{0}"])'.format(name)
                )
            )
            for name in nonselected_domain_names
        ]
        if nonselected_domain_name_counts:
            nonselected_pop_width = max(
                len(str(p)) + 1 for p in nonselected_domain_name_counts
            )
        else:
            nonselected_pop_width = 0
        for name, count in zip(nonselected_domain_names, nonselected_domain_name_counts):
            report_parts.append(
                '{:^{name_width}s} : {:>{pop_width}d}\n'.format(
                    name,
                    count,
                    name_width=nonselected_name_width,
                    pop_width=nonselected_pop_width,
                )
            )
    else:
        for name in nonselected_domain_names:
            report_parts.append(
                '{:^{name_width}s}\n'.format(name, name_width=nonselected_name_width)
            )
    report_parts.append('\n')

    with open(self.domain_names_filename, 'w') as domain_names_file:
        domain_names_file.write(''.join(report_parts))
def extract_mutation_data(self):
    """
    Builds case and mutation rows from self.maf_df and adds them to db.session.

    For each row of self.maf_df:
      * the row's Transcript_ID is looked up in models.EnsemblTranscript;
        rows with no matching transcript are skipped;
      * a models.CbioportalCase is created the first time each
        Tumor_Sample_Barcode is seen (study is always 'internal');
      * the variant DNA allele is taken from whichever of Tumor_Seq_Allele1 /
        Tumor_Seq_Allele2 differs from Reference_Allele (Allele1 if neither
        differs);
      * Amino_Acid_Change is parsed with self.aa_change_regex and, for
        'Missense_Mutation' rows only, split into reference residue, position
        and variant residue with self.aa_change_split_regex;
      * a models.CbioportalMutation is created and flagged in_uniprot_domain
        when its amino-acid position falls inside one of the db entry's
        UniProt domains and the reference residue equals the first character
        of the amino-acid change string.

    Rows are only added to the session here; this method does not commit.
    """
    study = 'internal'
    case_rows = {}
    n_mutations_added = 0

    for _, maf_row in self.maf_df.iterrows():
        oncotator_ensembl_transcript_id = maf_row.Transcript_ID
        matching_db_ensembl_transcript_row = models.EnsemblTranscript.query.filter_by(
            transcript_id=oncotator_ensembl_transcript_id
        ).first()
        if matching_db_ensembl_transcript_row is None:
            continue

        case_id = maf_row.Tumor_Sample_Barcode
        if case_id not in case_rows:
            case_rows[case_id] = models.CbioportalCase(
                crawl_number=self.current_crawl_number,
                case_id=case_id,
                study=study
            )
            db.session.add(case_rows[case_id])

        # Named to avoid shadowing the builtin `type`.
        variant_classification = maf_row.Variant_Classification
        chromosome_index = maf_row.Chromosome
        chromosome_startpos = maf_row.Start_Position
        chromosome_endpos = maf_row.End_Position
        reference_dna_allele = maf_row.Reference_Allele

        if maf_row.Tumor_Seq_Allele1 != reference_dna_allele:
            variant_dna_allele = maf_row.Tumor_Seq_Allele1
        elif maf_row.Tumor_Seq_Allele2 != reference_dna_allele:
            variant_dna_allele = maf_row.Tumor_Seq_Allele2
        else:
            variant_dna_allele = maf_row.Tumor_Seq_Allele1

        cbioportal_aa_change_string = None
        oncotator_reference_aa = None
        oncotator_aa_pos = None
        oncotator_variant_aa = None

        # A missing value may be None or any float NaN, not necessarily the
        # np.nan singleton, so test by value rather than by identity.
        aa_change_raw = maf_row.Amino_Acid_Change
        aa_change_missing = aa_change_raw is None or (
            isinstance(aa_change_raw, float) and np.isnan(aa_change_raw)
        )
        if not aa_change_missing:
            aa_change_regex_match = re.match(self.aa_change_regex, aa_change_raw)
            if aa_change_regex_match:
                cbioportal_aa_change_string = aa_change_regex_match.groups()[0]
                if variant_classification == 'Missense_Mutation':
                    aa_change_split_regex_match = re.match(
                        self.aa_change_split_regex, cbioportal_aa_change_string
                    )
                    if aa_change_split_regex_match:
                        split_groups = aa_change_split_regex_match.groups()
                        oncotator_reference_aa = split_groups[0]
                        oncotator_aa_pos = int(split_groups[1])
                        oncotator_variant_aa = split_groups[2]

        validation_status = maf_row.Validation_Status
        functional_impact_score = maf_row['MA:FImpact']

        # Per-row diagnostics go to the debug log instead of stdout.
        logger.debug(
            '%s %s %s %s %s',
            variant_classification,
            cbioportal_aa_change_string,
            oncotator_reference_aa,
            oncotator_aa_pos,
            oncotator_variant_aa,
        )

        db_entry = matching_db_ensembl_transcript_row.ensembl_gene.db_entry

        mutation_row = models.CbioportalMutation(
            crawl_number=self.current_crawl_number,
            type=variant_classification,
            cbioportal_aa_change_string=cbioportal_aa_change_string,
            mutation_origin=None,
            validation_status=validation_status,
            functional_impact_score=functional_impact_score,
            chromosome_index=chromosome_index,
            chromosome_startpos=chromosome_startpos,
            chromosome_endpos=chromosome_endpos,
            reference_dna_allele=reference_dna_allele,
            variant_dna_allele=variant_dna_allele,
            oncotator_aa_pos=oncotator_aa_pos,
            oncotator_reference_aa=oncotator_reference_aa,
            oncotator_variant_aa=oncotator_variant_aa,
            oncotator_ensembl_transcript_id=oncotator_ensembl_transcript_id,
            db_entry=db_entry,
            cbioportal_case=case_rows[case_id],
            in_uniprot_domain=False,
        )

        # is mutation within a uniprot domain?
        # Without a parsed position there is nothing to compare, so the
        # domain query is skipped rather than comparing None to integers.
        if oncotator_aa_pos is not None:
            for domain in db_entry.uniprot_domains.all():
                if not (oncotator_aa_pos >= domain.begin and oncotator_aa_pos <= domain.end):
                    continue
                if oncotator_reference_aa != cbioportal_aa_change_string[0]:
                    continue
                # No break: if several domains contain the position, the
                # last one iterated is the one recorded.
                mutation_row.in_uniprot_domain = True
                mutation_row.uniprot_domain = domain

        db.session.add(mutation_row)
        n_mutations_added += 1

    logger.info('From {} mutation annotations, added {} mutations and {} cases.'.format(
        len(self.maf_df), n_mutations_added, len(case_rows))
    )