Example #1
 def get_info(self,modelid: str) -> Dict[str, str]:
     with PDBMapSQLdb() as db:
         db.activate_dict_cursor()
         rows_selected =  db.execute("SELECT * FROM Modbase2016 WHERE database_id = %s",(modelid,))
         assert rows_selected == 1
         fetched_data = db.fetchone()
         return fetched_data
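
A hedged usage sketch for get_info above. The containing class name (PDBMapModbase2016 here) and the database_id value are assumptions; fetchone() on the dict cursor returns a column-name to value mapping for the matching Modbase2016 row.

    modbase2016 = PDBMapModbase2016()                     # assumed containing class
    info = modbase2016.get_info("placeholder_model_id")   # placeholder database_id value
    print(info['database_id'])                            # row is a dict keyed by column name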
Example #2
 def transcript2modelids(self,transcript: PDBMapTranscriptBase, max_target_beg:int =2000000000, min_target_end:int = 1) -> List[Tuple[str,str]]:
     """Return a list of (model_id,database_id) tuples for any transcript, via its amino acid sequence"""
     transcript_model_matches = []
     with PDBMapSQLdb() as db:
         rows_selected =  db.execute("SELECT model_id,database_id FROM Modbase2020 WHERE seq_id = %s and target_beg <= %s and target_end >= %s",(transcript.md5sum + transcript.aa_seq[0:4] + transcript.aa_seq[-4:],max_target_beg,min_target_end))
         if rows_selected:
             row_tuples = db.fetchall()
             transcript_model_matches = [(row[0],row[1]) for row in row_tuples]
         assert rows_selected == len(transcript_model_matches)
     return transcript_model_matches if transcript_model_matches else []
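
The seq_id lookup key is the transcript's MD5 sum concatenated with the first four and last four residues of its amino-acid sequence. A minimal sketch of how such a key could be built, assuming the md5sum attribute is the hex digest of the full sequence (the sample sequence below is a placeholder):

    import hashlib

    aa_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"      # placeholder amino-acid sequence
    md5sum = hashlib.md5(aa_seq.encode()).hexdigest() # assumed definition of transcript.md5sum
    seq_id = md5sum + aa_seq[0:4] + aa_seq[-4:]       # key matched against Modbase2020.seq_id
    print(seq_id)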
Example #3
 def ensp2modelids(self,ensp_id: str) -> List[Tuple[str,str]]:
     """Return a list of (model_id,database_id) tuples for an ENSP id"""
     ensp_model_matches = []
     with PDBMapSQLdb() as db:
         rows_selected =  db.execute("SELECT model_id,database_id FROM Modbase2020 WHERE database_id LIKE %(ensp_id)s",{'ensp_id': ensp_id+'%%'})
         if rows_selected:
             row_tuples = db.fetchall()
             ensp_model_matches = [(row[0],row[1]) for row in row_tuples]
         assert rows_selected == len(ensp_model_matches)
     return ensp_model_matches if ensp_model_matches else []
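
A hedged usage sketch for ensp2modelids. The containing class name (PDBMapModbase2020) is an assumption; the trailing SQL wildcard appended to the ENSP id lets the LIKE clause match database_id values that carry a version or other suffix.

    modbase2020 = PDBMapModbase2020()                  # assumed containing class
    for model_id, database_id in modbase2020.ensp2modelids("ENSP00000269305"):  # placeholder ENSP id
        print(model_id, database_id)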
Example #4
        def write_unwritten_vcf_rows():
            nonlocal rows_inserted, query, unwritten_vcf_rows
            # try:

            with PDBMapSQLdb() as db:
                db.set_session_transaction_isolation_read_committed()
                rows_inserted += db.executemany(query,unwritten_vcf_rows)
            # We flushed the rows so now we re-initialize for more rows
            unwritten_vcf_rows  = []
            return rows_inserted
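
A hedged sketch of how this flush helper is typically driven: rows accumulate in unwritten_vcf_rows, and the helper is called whenever the buffer reaches a chosen size (buffer_size and the row iterable below are placeholders; the loop mirrors the one in Example #10):

    buffer_size = 5000                                 # placeholder flush threshold
    for i, row in enumerate(all_vcf_rows):             # all_vcf_rows: placeholder iterable of value tuples
        unwritten_vcf_rows.append(row)
        if not (i + 1) % buffer_size:                  # flush once the buffer is "full"
            write_unwritten_vcf_rows()
    if unwritten_vcf_rows:                             # flush whatever remains at the end
        write_unwritten_vcf_rows()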
Example #5
    def get_info(self,model_id_database_id_tuple: Tuple[str,str]) -> Dict[str, str]:
        """Return the entire row of information from the summary file for a specific 
        (model_id,database_id) Tuple"""

        with PDBMapSQLdb() as db:
            db.activate_dict_cursor()
            # OK - so we only use the model_id part of the tuple :)
            rows_selected =  db.execute("SELECT * FROM Modbase2020 WHERE model_id = %s",(model_id_database_id_tuple[0],))
            assert rows_selected == 1
            fetched_data = db.fetchone()
            return fetched_data
Example #6
 def transcript2summary_rows(self,transcript: PDBMapTranscriptBase,max_target_beg:int =2000000000, min_target_end:int = 1) -> List[Dict]:
     """Return a list of Dictionaries that contain the rows of the Modbase summary file that match a sequence of interest"""
     modbase2020_summary_rows = []
     with PDBMapSQLdb() as db:
         db.activate_dict_cursor()
         rows_selected_count =  db.execute(
             "SELECT * FROM Modbase2020 WHERE seq_id = %s and target_beg <= %s and target_end >= %s",
             (transcript.md5sum + transcript.aa_seq[0:4] + transcript.aa_seq[-4:],max_target_beg,min_target_end))
         if rows_selected_count:
             summary_rows = db.fetchall()
             # Convert the summary_rows Tuple of Dicts returned by SQL to a list of dicts
             modbase2020_summary_rows = [summary_row for summary_row in summary_rows]
         assert rows_selected_count == len(modbase2020_summary_rows)
     return modbase2020_summary_rows if modbase2020_summary_rows else []
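
A hedged usage sketch for transcript2summary_rows. PDBMapModbase2020 as the containing class is an assumption, and transcript is assumed to be an already-loaded PDBMapTranscriptBase; with the dict cursor active, each returned row maps Modbase2020 column names to values.

    modbase2020 = PDBMapModbase2020()                  # assumed containing class
    for summary_row in modbase2020.transcript2summary_rows(transcript):
        print(summary_row['model_id'], summary_row['target_beg'], summary_row['target_end'])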
Example #7
    def load_aa_seq_from_sql(self):
        if not self._aa_seq:
            query = """
SELECT unp,uniparc,fasta FROM Idmapping 
LEFT JOIN Uniparc on Uniparc.uniparc = Idmapping.ID  
where Id_Type = 'UniParc' and unp = %(unp)s"""
            with PDBMapSQLdb() as sql_connection:
                sql_connection.execute(query, {'unp': self.uniprot_id})
                row = sql_connection.fetchone()
                if not row or len(row) != 3:
                    return (
                        False,
                        "unp %s is invalid, or its UniParc ID sequence unavailable"
                        % self.uniprot_id)
                self._uniparc_id = str(row[1]).strip()
                self._aa_seq = str(row[2]).strip()
        return (True, self.aa_seq)
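
load_aa_seq_from_sql returns a (success, payload) tuple: (True, aa_seq) when the Idmapping/Uniparc join finds a sequence, and (False, message) otherwise. A hedged usage sketch, assuming the method lives on a UniProt transcript class whose constructor takes a UniProt accession (the class name and accession below are placeholders):

    transcript = PDBMapTranscriptUniprot("P04637")     # assumed class; placeholder accession
    success, payload = transcript.load_aa_seq_from_sql()
    if success:
        print("amino-acid sequence length:", len(payload))
    else:
        print("lookup failed:", payload)               # payload is the error message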
Example #8
        def write_unwritten_GenomicDataINFOs():
            with PDBMapSQLdb() as db:
                nonlocal unwritten_GenomicDataINFOs
                db.activate_dict_cursor()
                db.set_session_transaction_isolation_read_committed()

                insert_GenomicData   = (SQL_EXTENDED_LOCK_WAIT+"INSERT IGNORE INTO GenomicData "
                     "(label,chrom,pos,end,id,variation,vtype,svtype,ref_allele,alt_allele,"
                     "svlen,quality,avgpost,rsq,erate,theta,ldaf,ac,an,aa,da,maf,amr_af,asn_af,"
                     "eas_af,sas_af,afr_af,eur_af,ens_gene,hgnc_gene,"
                     "snpsource,format,gt) VALUES "
                     "(%(LABEL)s,%(CHROM)s,%(POS)s,%(END)s,%(ID)s,%(EXISTING)s,%(VT)s,"
                     "%(SVTYPE)s,%(REF)s,%(ALT)s,%(SVLEN)s,%(QUAL)s,%(AVGPOST)s,%(RSQ)s,"
                     "%(ERATE)s,%(THETA)s,%(LDAF)s,%(AC)s,%(AN)s,%(AA)s,%(DA)s,"
                     "%(AF)s,%(AMR_AF)s,%(ASN_AF)s,%(EAS_AF)s,%(SAS_AF)s,%(AFR_AF)s,%(EUR_AF)s,"
                     "%(GENE)s,%(HGNC)s,%(SNPSOURCE)s,%(FORMAT)s,%(GT)s)")

                rows_inserted = db.executemany(insert_GenomicData,unwritten_GenomicDataINFOs)
                # We flushed the rows so now we re-initialize for more rows
                unwritten_GenomicDataINFOs = []
                return rows_inserted
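
executemany() is driven here by named %(KEY)s placeholders, so every entry buffered in unwritten_GenomicDataINFOs must be a dict supplying all of the keys referenced in the INSERT. A hedged sketch of one such entry (values are placeholders, and only a subset of the required keys is shown):

    genomic_data_info = {
        'LABEL': 'mycohort', 'CHROM': 'chr1', 'POS': 12345, 'END': 12346,
        'ID': 'rs0000001', 'EXISTING': None, 'VT': 'SNP', 'SVTYPE': None,
        'REF': 'G', 'ALT': 'A',
        # ... every remaining %(KEY)s placeholder (SVLEN, QUAL, AVGPOST, ...) must also be present
    }
    unwritten_GenomicDataINFOs.append(genomic_data_info)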
Example #9
        def write_unwritten_GenomicConsequenceCSQs():
            with PDBMapSQLdb() as db:
                nonlocal unwritten_GenomicConsequenceCSQs
                db.activate_dict_cursor()
                db.set_session_transaction_isolation_read_committed()

                # Upload each consequence to GenomicConsequence
                insert_GenomicConsequence  = (SQL_EXTENDED_LOCK_WAIT+"INSERT IGNORE INTO GenomicConsequence "
                     "(label,chrom,pos,end,id,transcript,protein,uniprot,canonical,allele,"
                     "consequence,cdna_pos,cds_pos,protein_pos,ref_amino_acid,"
                     "alt_amino_acid,ref_codon,alt_codon,polyphen,sift,biotype,"
                     "domains) VALUES "
                     "(%(LABEL)s,%(CHROM)s,%(POS)s,%(END)s,%(ID)s,"
                     "%(Feature)s,%(ENSP)s,%(SWISSPROT)s,%(CANONICAL)s,%(Allele)s,"
                     "%(Consequence)s,%(cDNA_position)s,%(CDS_position)s,"
                     "%(Protein_position)s,%(Ref_AminoAcid)s,%(Alt_AminoAcid)s,"
                     "%(Ref_Codon)s,%(Alt_Codon)s,%(PolyPhen)s,%(SIFT)s,"
                     "%(BIOTYPE)s,%(DOMAINS)s)")

                rows_inserted = db.executemany(insert_GenomicConsequence,unwritten_GenomicConsequenceCSQs)
                # We flushed the rows so now we re-initialize for more rows
                unwritten_GenomicConsequenceCSQs = []
                return rows_inserted
Example #10
    def vcf_file_to_sql_table(self,table_name: str,vcf_filename: str,buffer_size:int =1,local_db_config: Dict[str,str] = None):
        """ Creates a supplementary SQL table as echo of  VCF datafile
            The first 7 VCF tab-delimited data columns are extracted.
            The 8th column, INFO, is parsed apart and SQL columns created
            for each element

            For more details, be sure to google the VCF4.x specification .pdf
            and PyVCF library documentation """

        if not local_db_config:
            local_db_config = {}
            for dbparam in ['dbhost','dbname','dbuser','dbpass']:
                assert dbparam in self._config_dict,"%s missing from configuration dictionary"%dbparam
                local_db_config[dbparam] = self._config_dict[dbparam]

        for dbparam in ['dbhost','dbname','dbuser','dbpass']:
            assert dbparam in local_db_config,"%s missing from local_db_config dictionary"%dbparam

        # Open a PyVCF parser and initialize vcf_reader.infos from the ## header lines.
        # The parser will later iterate over vcf records in the input vcf_filename
        #
        # The parameter prepend_chr=True prepends the string 'chr' to all the
        # chromosome values found (as in chr1/chrX/etc.)
        vcf_reader = self._my_vcf_file_Reader(vcf_filename,prepend_chr=(not 'liftover' in vcf_filename))

        # The first 7 mandatory vcf columns are consistent from vcf file to vcf file
        # var_header  = ["chr","start","name","ref","alt","qual","filter"]
        # 2019-Nov.  Chris Moth strongly prefers to maintain source data field names
        first_7_columns  = ["chrom","pos","id","ref","alt","qual","filter"]
        first_7_comments = ["Chromosome","position","identifier",
                            "reference base(s)","alternate base(s)","quality",
                            "filter status PASS/MISSING/etc"]
                            
        # SQL types for these first 7 standard mandatory fields
        first_7_types  = ["VARCHAR(100)","BIGINT","VARCHAR(100)","VARCHAR(100)"]
        first_7_types += ["VARCHAR(100)","DOUBLE","VARCHAR(100)"]

        primary_key_components = ['chrom','pos','ref','vcf_record_number']

        # Replace INFO keys that overlap with the first_7 column headers
        for info_key in list(vcf_reader.infos.keys()):
            if info_key.lower() in first_7_columns:
              # If the INFO has a key same as first 7
              # then rename that as key2 to avoid confusion
              vcf_reader.infos["%s2"%info_key] = vcf_reader.infos[info_key]
              del vcf_reader.infos[info_key] # Remove the duplicating info_key from the dictionary

        # Extract info fields
        info_header = list(vcf_reader.infos.keys())
        # replace single quotes in the comment strings with two single adjacent quotes, for SQL
        info_comments = [info.desc.replace("'","''") for info in list(vcf_reader.infos.values())]
        # Extract and convert info types from VCF INFO ## Meta information to SQL types
        type_conv = {"Integer":"BIGINT","Float":"DOUBLE","Flag":"TINYINT(1)","String":"TEXT"}
        info_types  = [type_conv[info.type] for info in list(vcf_reader.infos.values())]

        # replace all punctuation with underscores in the info keys that could foul up SQL
        punctuation_to_underscore = str.maketrans(string.punctuation,'_'*len(string.punctuation))
        info_header = [f.translate(punctuation_to_underscore)  for f in info_header]

        header = first_7_columns + info_header #+ csq_header
        comments = first_7_comments + info_comments #+ csq_header
        # Read the first record now; it becomes the first buffered INSERT row below
        record = next(vcf_reader)
        types    = first_7_types + info_types #+ csq_types

        # Set default values for each type
        # In our Mariadb 5.5, TEXT has no DEFAULT
        sql_defaults = {"BIGINT":0,"DOUBLE":0.0,"TINYINT(1)":0,"VARCHAR(100)":"''"}
        sql_notnull_types  = set() # Not sure why Mike had notnull on these: was-> set["TINYINT","TEXT","VARCHAR(100)","DOUBLE"])
        # don't worry about type conversion for now, Nones are causing an issue
        formatter = {"BIGINT":"%s","DOUBLE":"%s","TEXT":"%s","TINYINT(1)":"%s","VARCHAR(100)":"%s"}
        # Generate a create statement for this table
        # First, generate the column definitions
        # Each line defines:
        # a table column name, backquoted to avoid conflicts with SQL reserved words
        # the sql type of the column
        # A default value
        # A non null specifier
        table_columns_def = (
            ["vcf_record_number BIGINT NOT NULL AUTO_INCREMENT"] + 
            ["`%s` %s %s %s COMMENT '%s'"%(header[i].lower(), \
               types[i], \
               ("DEFAULT %s"%sql_defaults[types[i]]) if types[i] in sql_defaults else "", \
               "NOT NULL" if (header[i].lower in primary_key_components) or (types[i] in sql_notnull_types) else "", \
               comments[i]) \
                        for i in range(len(header))]
            )
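        # For illustration (hypothetical output): the generated definition for the
        # mandatory `pos` column would resemble
        #     `pos` BIGINT DEFAULT 0 NOT NULL COMMENT 'position'
        # i.e. back-quoted lower-cased name, SQL type, optional DEFAULT, optional
        # NOT NULL (for primary-key components), and the column comment.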

        # Include as many non-VARCHAR(65000)/TEXT columns in primary key as allowed (16)
        # Additional INFO (like END) may justify duplicates within the standard VCF fields
        dbname_dot_table_name = local_db_config['dbname'] + "." + table_name

        # Dropping the table is a bad idea if 25 Slurm tasks attempting to load it
        # with PDBMapSQLdb() as db:
        #     db.execute("DROP TABLE IF EXISTS %s;"%dbname_dot_table_name)
        create_statement = "CREATE TABLE IF NOT EXISTS %s.%s\n(%s, PRIMARY KEY(%s), UNIQUE KEY(vcf_record_number))\n%s\n%s"
        query = create_statement%(local_db_config['dbname'],
            table_name,
            ',\n'.join(table_columns_def), # The columns with their types and defaults
            ',\n'.join(primary_key_components),
            "CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci",
            "COMMENT 'created from %s'"%vcf_filename)
        LOGGER.info("Creating table %s with 'query'=\n%s"%(table_name,query))

        # Create the table
        with PDBMapSQLdb() as db:
            db.execute(query)


        # Populate the table with contents of VCF file reformatted for clean INSERT
        # Strategy is one INSERT with many many value tuples
        def record2row(record,infos):#,csq_header=None):
            row  = [record.CHROM,record.POS,record.ID]
            row += [record.REF,record.ALT,record.QUAL,record.FILTER]
            # Only retain the most frequent alternate allele (force biallelic)
            row += [record.INFO[f] if f in record.INFO else None for f in list(infos.keys())]
            # Replace any empty lists with None
            row =  [r if type(r)!=list or len(r)<1 else r[0] for r in row]
            row =  [r if r!=[] else None for r in row]
            # Change None to DEFAULT in 
            return row
        # Add the first record to insert rows
        unwritten_vcf_rows   = [record2row(record,vcf_reader.infos)]
        rows_inserted = 0
        query  = SQL_EXTENDED_LOCK_WAIT 
        query += "INSERT INTO %s "%dbname_dot_table_name
        query += "(%s) VALUES "%','.join(['`%s`'%h for h in header])
        query += "(%s)"%','.join([formatter[f] for f in types])

        def write_unwritten_vcf_rows():
            nonlocal rows_inserted, query, unwritten_vcf_rows
            # try:

            with PDBMapSQLdb() as db:
                db.set_session_transaction_isolation_read_committed()
                rows_inserted += db.executemany(query,unwritten_vcf_rows)
            # We flushed the rows so now we re-initialize for more rows
            unwritten_vcf_rows  = []
            return rows_inserted


        for i,record in enumerate(vcf_reader):
            # Buffered upload - Flush as specified in config
            if not (i+1) % buffer_size: # Flush this one!
                write_unwritten_vcf_rows()
                unwritten_vcf_rows  = []

            # More often, we keep building up our big INSERT statement
            unwritten_vcf_rows.extend([record2row(record,vcf_reader.infos)])

        if unwritten_vcf_rows:
            write_unwritten_vcf_rows()


        # Return the number of rows uploaded
        return rows_inserted
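
A hedged usage sketch for vcf_file_to_sql_table. The containing loader class and all argument values below are placeholders; the method returns the number of rows inserted into the newly created (or pre-existing) echo table.

    loader = PDBMapVCFLoader(config_dict)              # assumed containing class and config
    rows_inserted = loader.vcf_file_to_sql_table(
        table_name='my_vcf_echo',                      # placeholder table name
        vcf_filename='/path/to/variants.vcf.gz',       # placeholder VCF path
        buffer_size=10000)                             # flush an INSERT every 10000 records
    print('rows inserted:', rows_inserted)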