# NOTE: these excerpts rely on module-level context that is not shown here:
# PDBMapSQLdb, PDBMapTranscriptBase, SQL_EXTENDED_LOCK_WAIT and LOGGER are assumed
# to be imported/defined by the surrounding modules.
import string
from typing import Dict, List, Tuple

def get_info(self, modelid: str) -> Dict[str, str]:
    """Return the entire Modbase2016 summary row for a specific model (database_id)"""
    with PDBMapSQLdb() as db:
        db.activate_dict_cursor()
        rows_selected = db.execute("SELECT * FROM Modbase2016 WHERE database_id = %s", (modelid,))
        assert rows_selected == 1
        fetched_data = db.fetchone()
        return fetched_data
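
# Usage sketch (illustrative, not from the original source).  The enclosing class
# name, constructor and model id below are assumptions; only get_info() and the
# database_id column are confirmed by the code above.
#
#     modbase2016 = PDBMapModbase2016(config_dict)        # assumed class / constructor
#     summary_row = modbase2016.get_info(some_database_id)
#     print(summary_row['database_id'])
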
def transcript2modelids(self, transcript: PDBMapTranscriptBase,
                        max_target_beg: int = 2000000000,
                        min_target_end: int = 1) -> List[Tuple[str, str]]:
    """Return a list of (model_id,database_id) tuples for any transcript, via its amino acid sequence"""
    transcript_model_matches = []
    with PDBMapSQLdb() as db:
        rows_selected = db.execute(
            "SELECT model_id,database_id FROM Modbase2020 WHERE seq_id = %s and target_beg <= %s and target_end >= %s",
            (transcript.md5sum + transcript.aa_seq[0:4] + transcript.aa_seq[-4:], max_target_beg, min_target_end))
        if rows_selected:
            row_tuples = db.fetchall()
            transcript_model_matches = [(row[0], row[1]) for row in row_tuples]
            assert rows_selected == len(transcript_model_matches)
    return transcript_model_matches if transcript_model_matches else []
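
# Usage sketch (illustrative).  transcript2modelids() matches on
# md5sum + first four + last four residues of the amino-acid sequence, so any
# transcript object with md5sum and aa_seq populated should work.  The instance
# names below are assumptions.
#
#     for model_id, database_id in modbase2020.transcript2modelids(transcript):
#         print(model_id, database_id)
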
def ensp2modelids(self, ensp_id: str) -> List[Tuple[str, str]]:
    """Return a list of (model_id,database_id) tuples for an ENSP id"""
    ensp_model_matches = []
    with PDBMapSQLdb() as db:
        rows_selected = db.execute(
            "SELECT model_id,database_id FROM Modbase2020 WHERE database_id LIKE %(ensp_id)s",
            {'ensp_id': ensp_id + '%%'})
        if rows_selected:
            row_tuples = db.fetchall()
            ensp_model_matches = [(row[0], row[1]) for row in row_tuples]
            assert rows_selected == len(ensp_model_matches)
    return ensp_model_matches if ensp_model_matches else []
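
# Usage sketch (illustrative).  ensp2modelids() performs a SQL LIKE prefix match
# on database_id, so versioned ids that begin with the given ENSP accession are
# also returned.  The instance and accession below are assumptions.
#
#     for model_id, database_id in modbase2020.ensp2modelids('ENSP00000354587'):
#         print(model_id, database_id)
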
def get_info(self, model_id_database_id_tuple: Tuple[str, str]) -> Dict[str, str]:
    """Return the entire row of information from the summary file for a specific (model_id,database_id) Tuple"""
    with PDBMapSQLdb() as db:
        db.activate_dict_cursor()
        # OK - so we only use the model_id part of the tuple :)
        rows_selected = db.execute("SELECT * FROM Modbase2020 WHERE model_id = %s", (model_id_database_id_tuple[0],))
        assert rows_selected == 1
        fetched_data = db.fetchone()
        return fetched_data
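
# Usage sketch (illustrative).  get_info() only consults the model_id half of the
# tuple, so it pairs naturally with the (model_id,database_id) tuples returned by
# transcript2modelids()/ensp2modelids().  The instance name is an assumption.
#
#     for model_tuple in modbase2020.transcript2modelids(transcript):
#         summary_row = modbase2020.get_info(model_tuple)
#         print(summary_row['model_id'], summary_row['target_beg'], summary_row['target_end'])
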
def transcript2summary_rows(self, transcript: PDBMapTranscriptBase,
                            max_target_beg: int = 2000000000,
                            min_target_end: int = 1) -> List[Dict]:
    """Return a list of dictionaries that contain the rows of the Modbase summary file that match a sequence of interest"""
    modbase2020_summary_rows = []
    with PDBMapSQLdb() as db:
        db.activate_dict_cursor()
        rows_selected_count = db.execute(
            "SELECT * FROM Modbase2020 WHERE seq_id = %s and target_beg <= %s and target_end >= %s",
            (transcript.md5sum + transcript.aa_seq[0:4] + transcript.aa_seq[-4:], max_target_beg, min_target_end))
        if rows_selected_count:
            summary_rows = db.fetchall()
            # Convert the summary_rows tuple of dicts returned by SQL to a list of dicts
            modbase2020_summary_rows = [summary_row for summary_row in summary_rows]
            assert rows_selected_count == len(modbase2020_summary_rows)
    return modbase2020_summary_rows if modbase2020_summary_rows else []
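
# Usage sketch (illustrative).  Because the query requires
# target_beg <= max_target_beg and target_end >= min_target_end, passing the same
# residue number for both restricts the hits to models whose alignment covers that
# residue; e.g. models covering residue 150 of the transcript:
#
#     rows = modbase2020.transcript2summary_rows(transcript,
#                                                max_target_beg=150,
#                                                min_target_end=150)
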
def load_aa_seq_from_sql(self):
    if not self._aa_seq:
        query = """
SELECT unp,uniparc,fasta
FROM Idmapping
LEFT JOIN Uniparc on Uniparc.uniparc = Idmapping.ID
where Id_Type = 'UniParc' and unp = %(unp)s"""
        with PDBMapSQLdb() as sql_connection:
            sql_connection.execute(query, {'unp': self.uniprot_id})
            row = sql_connection.fetchone()
            if not row or len(row) != 3:
                return (False,
                        "unp %s is invalid, or its UniParc ID sequence unavailable" % self.uniprot_id)
            self._uniparc_id = str(row[1]).strip()
            self._aa_seq = str(row[2]).strip()
    return (True, self.aa_seq)
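
# Usage sketch (illustrative).  load_aa_seq_from_sql() lazily fills _uniparc_id and
# _aa_seq from the Idmapping/Uniparc tables and returns a (success, payload) tuple,
# where payload is the amino-acid sequence on success or an error message on
# failure.  The transcript class name and accession below are assumptions.
#
#     transcript = PDBMapTranscriptUniprot('P04637')    # hypothetical construction
#     success, payload = transcript.load_aa_seq_from_sql()
#     if not success:
#         LOGGER.warning(payload)
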
def write_unwritten_GenomicDataINFOs():
    nonlocal unwritten_GenomicDataINFOs
    with PDBMapSQLdb() as db:
        db.activate_dict_cursor()
        db.set_session_transaction_isolation_read_committed()
        insert_GenomicData = (
            SQL_EXTENDED_LOCK_WAIT +
            "INSERT IGNORE INTO GenomicData "
            "(label,chrom,pos,end,id,variation,vtype,svtype,ref_allele,alt_allele,"
            "svlen,quality,avgpost,rsq,erate,theta,ldaf,ac,an,aa,da,maf,amr_af,asn_af,"
            "eas_af,sas_af,afr_af,eur_af,ens_gene,hgnc_gene,"
            "snpsource,format,gt) VALUES "
            "(%(LABEL)s,%(CHROM)s,%(POS)s,%(END)s,%(ID)s,%(EXISTING)s,%(VT)s,"
            "%(SVTYPE)s,%(REF)s,%(ALT)s,%(SVLEN)s,%(QUAL)s,%(AVGPOST)s,%(RSQ)s,"
            "%(ERATE)s,%(THETA)s,%(LDAF)s,%(AC)s,%(AN)s,%(AA)s,%(DA)s,"
            "%(AF)s,%(AMR_AF)s,%(ASN_AF)s,%(EAS_AF)s,%(SAS_AF)s,%(AFR_AF)s,%(EUR_AF)s,"
            "%(GENE)s,%(HGNC)s,%(SNPSOURCE)s,%(FORMAT)s,%(GT)s)")
        rows_inserted = db.executemany(insert_GenomicData, unwritten_GenomicDataINFOs)
    # We flushed the rows so now we re-initialize for more rows
    unwritten_GenomicDataINFOs = []
    return rows_inserted

def write_unwritten_GenomicConsequenceCSQs():
    nonlocal unwritten_GenomicConsequenceCSQs
    with PDBMapSQLdb() as db:
        db.activate_dict_cursor()
        db.set_session_transaction_isolation_read_committed()
        # Upload each consequence to GenomicConsequence
        insert_GenomicConsequence = (
            SQL_EXTENDED_LOCK_WAIT +
            "INSERT IGNORE INTO GenomicConsequence "
            "(label,chrom,pos,end,id,transcript,protein,uniprot,canonical,allele,"
            "consequence,cdna_pos,cds_pos,protein_pos,ref_amino_acid,"
            "alt_amino_acid,ref_codon,alt_codon,polyphen,sift,biotype,"
            "domains) VALUES "
            "(%(LABEL)s,%(CHROM)s,%(POS)s,%(END)s,%(ID)s,"
            "%(Feature)s,%(ENSP)s,%(SWISSPROT)s,%(CANONICAL)s,%(Allele)s,"
            "%(Consequence)s,%(cDNA_position)s,%(CDS_position)s,"
            "%(Protein_position)s,%(Ref_AminoAcid)s,%(Alt_AminoAcid)s,"
            "%(Ref_Codon)s,%(Alt_Codon)s,%(PolyPhen)s,%(SIFT)s,"
            "%(BIOTYPE)s,%(DOMAINS)s)")
        rows_inserted = db.executemany(insert_GenomicConsequence, unwritten_GenomicConsequenceCSQs)
    # We flushed the rows so now we re-initialize for more rows
    unwritten_GenomicConsequenceCSQs = []
    return rows_inserted
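
# Illustrative note (not from the original source).  Both helpers above follow the
# same buffered-flush pattern: dict rows accumulate in a nonlocal list whose keys
# match the %(NAME)s placeholders of the INSERT, and executemany() writes them in
# one round trip before the list is reset.  A sketch of the calling pattern, with
# buffer_size and the row dict contents as placeholders:
#
#     unwritten_GenomicDataINFOs.append({'LABEL': ..., 'CHROM': ..., 'POS': ...,
#                                        'END': ..., 'ID': ..., 'REF': ..., 'ALT': ...,
#                                        ...})   # one key per %(NAME)s placeholder
#     if len(unwritten_GenomicDataINFOs) >= buffer_size:
#         write_unwritten_GenomicDataINFOs()
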
def vcf_file_to_sql_table(self, table_name: str, vcf_filename: str, buffer_size: int = 1,
                          local_db_config: Dict[str, str] = None):
    """
    Create a supplementary SQL table that echoes a VCF data file.

    The first 7 VCF tab-delimited data columns are extracted.  The 8th column,
    INFO, is parsed apart and a SQL column is created for each element.
    For more details, be sure to google the VCF 4.x specification .pdf and the
    PyVCF library documentation
    """
    if not local_db_config:
        local_db_config = {}
        for dbparam in ['dbhost', 'dbname', 'dbuser', 'dbpass']:
            assert dbparam in self._config_dict, "%s missing from configuration dictionary" % dbparam
            local_db_config[dbparam] = self._config_dict[dbparam]
    for dbparam in ['dbhost', 'dbname', 'dbuser', 'dbpass']:
        assert dbparam in local_db_config, "%s missing from local_db_config dictionary" % dbparam

    # Open a PyVCF parser and initialize vcf_reader.infos from the ## header
    # The parser will later iterate over vcf records in the input vcf_filename
    #
    # The parameter prepend_chr=True prepends the string 'chr' to all the chromosome
    # values found (as in chr1/chrX/chrEtc)
    vcf_reader = self._my_vcf_file_Reader(vcf_filename, prepend_chr=('liftover' not in vcf_filename))

    # The first 7 mandatory vcf columns are consistent from vcf file to vcf file
    # var_header = ["chr","start","name","ref","alt","qual","filter"]
    # 2019-Nov. Chris Moth strongly prefers to maintain source data field names
    first_7_columns = ["chrom", "pos", "id", "ref", "alt", "qual", "filter"]
    first_7_comments = ["Chromosome", "position", "identifier",
                        "reference base(s)", "alternate base(s)", "quality",
                        "filter status PASS/MISSING/etc"]
    # SQL types for these first 7 standard mandatory fields
    first_7_types = ["VARCHAR(100)", "BIGINT", "VARCHAR(100)", "VARCHAR(100)"]
    first_7_types += ["VARCHAR(100)", "DOUBLE", "VARCHAR(100)"]

    primary_key_components = ['chrom', 'pos', 'ref', 'vcf_record_number']

    # Replace INFO keys that overlap with the first_7 column headers
    for info_key in list(vcf_reader.infos.keys()):
        if info_key.lower() in first_7_columns:
            # If the INFO has a key same as first 7
            # then rename that as key2 to avoid confusion
            vcf_reader.infos["%s2" % info_key] = vcf_reader.infos[info_key]
            del vcf_reader.infos[info_key]  # Remove the duplicating info_key from the dictionary

    # Extract info fields
    info_header = list(vcf_reader.infos.keys())
    # replace single quotes in the comment strings with two single adjacent quotes, for SQL
    info_comments = [info.desc.replace("'", "''") for info in list(vcf_reader.infos.values())]

    # Extract and convert info types from VCF INFO ## Meta information to SQL types
    type_conv = {"Integer": "BIGINT", "Float": "DOUBLE", "Flag": "TINYINT(1)", "String": "TEXT"}
    info_types = [type_conv[info.type] for info in list(vcf_reader.infos.values())]

    # replace all punctuation with underscores in the info keys that could foul up SQL
    punctuation_to_underscore = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    info_header = [f.translate(punctuation_to_underscore) for f in info_header]

    header = first_7_columns + info_header      # + csq_header
    comments = first_7_comments + info_comments # + csq_header

    # Use the first row to infer data types for the INFO fields
    record = next(vcf_reader)
    types = first_7_types + info_types          # + csq_types

    # Set default values for each type
    # In our Mariadb 5.5, TEXT has no DEFAULT
    sql_defaults = {"BIGINT": 0, "DOUBLE": 0.0, "TINYINT(1)": 0, "VARCHAR(100)": "''"}
    sql_notnull_types = set()  # Not sure why Mike had notnull on these: was-> set(["TINYINT","TEXT","VARCHAR(100)","DOUBLE"])

    # don't worry about type conversion for now, Nones are causing an issue
    formatter = {"BIGINT": "%s", "DOUBLE": "%s", "TEXT": "%s", "TINYINT(1)": "%s", "VARCHAR(100)": "%s"}

    # Generate a create statement for this table
    # First, generate the column definitions
    # Each line defines:
    #   a table column name, backquoted to avoid conflicts with SQL reserved words
    #   the sql type of the column
    #   A default value
    #   A non null specifier
    table_columns_def = (
        ["vcf_record_number BIGINT NOT NULL AUTO_INCREMENT"] +
        ["`%s` %s %s %s COMMENT '%s'" % (
            header[i].lower(),
            types[i],
            ("DEFAULT %s" % sql_defaults[types[i]]) if types[i] in sql_defaults else "",
            "NOT NULL" if (header[i].lower() in primary_key_components) or (types[i] in sql_notnull_types) else "",
            comments[i])
         for i in range(len(header))]
    )

    # Include as many non-VARCHAR(65000)/TEXT columns in primary key as allowed (16)
    # Additional INFO (like END) may justify duplicates within the standard VCF fields
    dbname_dot_table_name = local_db_config['dbname'] + "." + table_name

    # Dropping the table is a bad idea if 25 Slurm tasks are attempting to load it
    # with PDBMapSQLdb() as db:
    #     db.execute("DROP TABLE IF EXISTS %s;" % dbname_dot_table_name)

    create_statement = "CREATE TABLE IF NOT EXISTS %s.%s\n(%s, PRIMARY KEY(%s), UNIQUE KEY(vcf_record_number))\n%s\n%s"
    query = create_statement % (
        local_db_config['dbname'],
        table_name,
        ',\n'.join(table_columns_def),     # The columns with their types and defaults
        ',\n'.join(primary_key_components),
        "CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci",
        "COMMENT 'created from %s'" % vcf_filename)
    LOGGER.info("Creating table %s with 'query'=\n%s" % (table_name, query))

    # Create the table
    with PDBMapSQLdb() as db:
        db.execute(query)

    # Populate the table with contents of VCF file reformatted for clean INSERT
    # Strategy is one INSERT with many many value tuples
    def record2row(record, infos):  # ,csq_header=None
        row = [record.CHROM, record.POS, record.ID]
        row += [record.REF, record.ALT, record.QUAL, record.FILTER]
        # Only retain the most frequent alternate allele (force biallelic)
        row += [record.INFO[f] if f in record.INFO else None for f in list(infos.keys())]
        # Replace any empty lists with None
        row = [r if type(r) != list or len(r) < 1 else r[0] for r in row]
        row = [r if r != [] else None for r in row]
        # Change None to DEFAULT in
        return row

    # Add the first record to insert rows
    unwritten_vcf_rows = [record2row(record, vcf_reader.infos)]
    rows_inserted = 0

    query = SQL_EXTENDED_LOCK_WAIT
    query += "INSERT INTO %s " % dbname_dot_table_name
    query += "(%s) VALUES " % ','.join(['`%s`' % h for h in header])
    query += "(%s)" % ','.join([formatter[f] for f in types])

    def write_unwritten_vcf_rows():
        nonlocal rows_inserted, query, unwritten_vcf_rows
        # try:
        with PDBMapSQLdb() as db:
            db.set_session_transaction_isolation_read_committed()
            rows_inserted += db.executemany(query, unwritten_vcf_rows)
        # We flushed the rows so now we re-initialize for more rows
        unwritten_vcf_rows = []
        return rows_inserted

    for i, record in enumerate(vcf_reader):
        # Buffered upload - Flush as specified in config
        if not (i + 1) % buffer_size:
            # Flush this one!
            write_unwritten_vcf_rows()
            unwritten_vcf_rows = []
        # More often, we keep building up our big INSERT statement
        unwritten_vcf_rows.extend([record2row(record, vcf_reader.infos)])

    if unwritten_vcf_rows:
        write_unwritten_vcf_rows()

    # Return the number of rows uploaded
    return rows_inserted
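
# Usage sketch (illustrative).  vcf_file_to_sql_table() creates the table on first
# use (CREATE TABLE IF NOT EXISTS) and streams the VCF records into it in
# buffer_size-row batches, returning the number of rows inserted.  The owning
# class, table name, and file path below are assumptions.
#
#     vep = PDBMapVEP(config_dict)                       # assumed owning class
#     n_rows = vep.vcf_file_to_sql_table('clinvar_vcf', '/path/to/clinvar.vcf.gz',
#                                        buffer_size=5000)
#     LOGGER.info("%d VCF rows loaded", n_rows)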