def UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100):
    '''
    Update the hash columns (hashfull, hash150, hash100) of one sequence record

    Parameters
    ----------
    con,cur
    seq_id
        id of the row in annotationschematest.sequencestable to update
    hash_seq_full
        hash for full
    hash_seq_150
        hash for first 150 characters
    hash_seq_100
        hash for first 100 characters

    Returns
    -------
    bool
        True if the update was executed and committed, False if any error was encountered
    '''
    debug(1, 'UpdateHash')
    try:
        # values are bound by the database driver instead of being formatted into the SQL text
        cur.execute(
            'update annotationschematest.sequencestable set hashfull=%s,hash150=%s,hash100=%s where id=%s',
            [hash_seq_full, hash_seq_150, hash_seq_100, seq_id])
        con.commit()
    except Exception as e:
        # the True/False contract is kept, but the failure is no longer silent
        debug(7, 'UpdateHash failed for seqid %s: %s' % (seq_id, e))
        return False
    return True
def AddSequenceTax(con, cur, seq_id, col, value):
    '''
    Update a single taxonomy column of one sequence record

    Parameters
    ----------
    con,cur
    seq_id
        id of the row in annotationschematest.sequencestable to update
    col : str
        taxonomyrank column name. Must be a plain identifier
        (ASCII letters, digits and underscore, not starting with a digit)
    value
        taxonomyrank value

    Returns
    -------
    bool
        True if the update was executed and committed,
        False if col is not a plain identifier or any error was encountered
    '''
    import re

    debug(1, 'AddSequenceTax')
    # a column name cannot be bound as a query parameter, so it is validated
    # before being placed into the SQL text
    if not isinstance(col, str) or re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', col) is None:
        debug(7, 'AddSequenceTax: invalid column name %r' % (col,))
        return False
    try:
        # %%s survives the column substitution as %s and is then bound by the driver
        query = 'update annotationschematest.sequencestable set %s=%%s where id=%%s' % (col,)
        cur.execute(query, [value, seq_id])
        con.commit()
    except Exception as e:
        # the True/False contract is kept, but the failure is no longer silent
        debug(7, 'AddSequenceTax failed for seqid %s column %s: %s' % (seq_id, col, e))
        return False
    return True
def GetSequenceWithNoHashID(con, cur):
    '''
    Look for one sequence whose hash columns are all empty

    Queries annotationschematest.sequencestable (limit 1) for a row where
    hashfull, hash150 and hash100 are each NULL or ''.

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        '' if such a sequence was found, 'no missing hash' otherwise
    seqid
        the id column of the row found, or -1 if no row matched
    '''
    debug(1, 'GetSequenceWithNoHashID')
    query = "select id from annotationschematest.sequencestable where (COALESCE(hashfull,'')='' AND COALESCE(hash150,'')='' AND COALESCE(hash100,'')='') limit 1"
    cur.execute(query)
    if cur.rowcount != 0:
        row = cur.fetchone()
        return '', row[0]
    # nothing left to hash
    not_found = 'no missing hash'
    debug(1, not_found)
    return not_found, -1
def get_primers(con, cur):
    '''Get information about all the sequencing primers listed in PrimersTable

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        always '' (no error path in this function)
    primers : list of dict
        one dict per PrimersTable row, with keys:
        'primerid': the id column (i.e. 1 for v4, etc.)
        'name': the regionname column (i.e. 'v4', 'its1', etc.)
        'fprimer': the forwardprimer column (i.e. 515f, etc.)
        'rprimer': the reverseprimer column
    '''
    debug(1, 'get_primers')
    cur.execute('SELECT id, regionname, forwardprimer, reverseprimer FROM PrimersTable')
    primers = [
        {'primerid': row[0], 'name': row[1], 'fprimer': row[2], 'rprimer': row[3]}
        for row in cur.fetchall()
    ]
    debug(1, 'found %d primers' % len(primers))
    return '', primers
def SequencesWholeToFile(con, cur, fileName, dbid):
    '''
    Save to file the sequences that have no wholeseqidstable record for the given database.
    The file is used later by the 'whole' ids script

    Each sequence is written as two lines: '>' followed by the sequence id, then the sequence itself.

    Parameters
    ----------
    con,cur
    fileName
        output file name (overwritten if it exists)
    dbid
        type of db (e.g. silva); compared against the dbid column of wholeseqidstable

    Returns
    -------
    str
        '' if ok, otherwise the database error message
    '''
    debug(1, 'SequencesWholeToFile')
    try:
        # dbid is bound by the database driver instead of being formatted into the SQL text
        cur.execute(
            'SELECT id,sequence,ggid FROM sequencestable where id not in (select distinct dbbactid from wholeseqidstable where dbid=%s)',
            [dbid])
        seq_count = 0
        with open(fileName, 'w') as fl:
            for cres in cur:
                fl.write('>%s\n%s\n' % (cres[0], cres[1]))
                seq_count += 1
        debug(1, 'wrote %d sequences to %s' % (seq_count, fileName))
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ''
def GetSequenceStrByID(con, cur, seq_id):
    '''
    Get sequence from seqid

    Parameters
    ----------
    con,cur
    seq_id: int
        the dbbact seqid

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (sequence not found)
    sequence
        the sequence column of the matching row.
        NOTE: when no row is found, seq_id itself is returned in this position
    '''
    debug(1, 'GetSequenceStrByID')
    # seq_id is bound by the database driver instead of being formatted into the SQL text
    cur.execute('select sequence from annotationschematest.sequencestable where id=%s', [seq_id])
    if cur.rowcount == 0:
        errmsg = 'no sequeence for seqid %s' % seq_id
        debug(1, errmsg)
        return errmsg, seq_id
    res = cur.fetchone()
    return '', res[0]
def get_taxonomy_seqids(con, cur, taxonomy, userid=None):
    '''Get the ids of all dbbact sequences with a taxonomy rank matching the given taxonomy

    The lower-cased taxonomy is used as an ILIKE pattern against each of the
    taxrootrank, taxdomain, taxphylum, taxclass, taxfamily, taxgenus and taxorder columns.
    No wildcards are added here, so the caller supplies any '%' needed for substring matching.

    Parameters
    ----------
    con,cur
    taxonomy : str
        the taxonomy pattern to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    list of int
        The sequenceids for all sequences matching the taxonomy
    '''
    pattern = taxonomy.lower()
    debug(1, 'GetTaxonomyAnnotationIDS for taxonomy %s' % pattern)
    # the same pattern is tested against each of the 7 taxonomy rank columns
    cur.execute(
        'SELECT id from SequencesTable where (taxrootrank ILIKE %s OR taxdomain ILIKE %s OR taxphylum ILIKE %s OR taxclass ILIKE %s OR taxfamily ILIKE %s OR taxgenus ILIKE %s OR taxorder ILIKE %s)',
        [pattern] * 7)
    seqids = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d matching sequences for the taxonomy' % len(seqids))
    return seqids
def get_taxonomy_seqids(con, cur, taxonomy, userid=None):
    '''Get the ids of all dbbact sequences with a taxonomy rank matching the given taxonomy

    The lower-cased taxonomy is used as an ILIKE pattern against each of the
    taxrootrank, taxdomain, taxphylum, taxclass, taxfamily, taxgenus and taxorder columns.
    No wildcards are added here, so the caller supplies any '%' needed for substring matching.

    Parameters
    ----------
    con,cur
    taxonomy : str
        the taxonomy pattern to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    list of int
        The sequenceids for all sequences matching the taxonomy
    '''
    pattern = taxonomy.lower()
    debug(1, 'GetTaxonomyAnnotationIDS for taxonomy %s' % pattern)
    query = 'SELECT id from SequencesTable where (taxrootrank ILIKE %s OR taxdomain ILIKE %s OR taxphylum ILIKE %s OR taxclass ILIKE %s OR taxfamily ILIKE %s OR taxgenus ILIKE %s OR taxorder ILIKE %s)'
    # one copy of the pattern per taxonomy rank column in the query
    cur.execute(query, [pattern] * 7)
    seqids = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d matching sequences for the taxonomy' % len(seqids))
    return seqids
def SequencesWholeToFile(con, cur, fileName, dbid):
    '''
    Save to file the sequences that have no wholeseqidstable record for the given database.
    The file is used later by the 'whole' ids script

    Each sequence is written as two lines: '>' followed by the sequence id, then the sequence itself.

    Parameters
    ----------
    con,cur
    fileName
        output file name (overwritten if it exists)
    dbid
        type of db (e.g. silva); compared against the dbid column of wholeseqidstable

    Returns
    -------
    str
        '' if ok, otherwise the database error message
    '''
    debug(1, 'SequencesWholeToFile')
    try:
        # dbid is bound by the database driver instead of being formatted into the SQL text
        cur.execute(
            'SELECT id,sequence,ggid FROM sequencestable where id not in (select distinct dbbactid from wholeseqidstable where dbid=%s)',
            [dbid])
        seq_count = 0
        with open(fileName, 'w') as fl:
            for cres in cur:
                fl.write('>%s\n%s\n' % (cres[0], cres[1]))
                seq_count += 1
        debug(1, 'wrote %d sequences to %s' % (seq_count, fileName))
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ''
def get_primers(con, cur):
    '''Get information about all the sequencing primers listed in PrimersTable

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        always '' (no error path in this function)
    primers : list of dict
        one dict per PrimersTable row, with keys:
        'primerid': the id column (i.e. 1 for v4, etc.)
        'name': the regionname column (i.e. 'v4', 'its1', etc.)
        'fprimer': the forwardprimer column (i.e. 515f, etc.)
        'rprimer': the reverseprimer column
    '''
    debug(1, 'get_primers')
    cur.execute('SELECT id, regionname, forwardprimer, reverseprimer FROM PrimersTable')
    rows = cur.fetchall()
    primers = [
        {'primerid': row[0], 'name': row[1], 'fprimer': row[2], 'rprimer': row[3]}
        for row in rows
    ]
    debug(1, 'found %d primers' % len(primers))
    return '', primers
def AddWholeSeqId(con, cur, dbidVal, dbbactidVal, wholeseqidVal, noTest=False):
    '''
    Add record to wholeseqidstable table

    Unless noTest is True, the table is first checked (via WholeSeqIdExists) for a
    'na' record with the same dbid and dbbactid: if one exists, the records for that
    dbid/dbbactid are updated to wholeseqidVal, otherwise a new record is inserted.

    Parameters
    ----------
    con,cur
    dbidVal
        db type (e.g. silva, gg)
    dbbactidVal
        sequnence id in dbbact
    wholeseqidVal
        the id in different db (e.g. silva, gg)
    noTest : bool (optional)
        True to always insert without checking for an existing record

    Returns
    -------
    str
        "" if ok, otherwise the error message
    '''
    debug(1, 'AddWholeSeqId')
    insert_sql = 'INSERT INTO wholeseqidstable (dbid, dbbactid, wholeseqid) VALUES (%s, %s, %s)'
    try:
        if noTest is True:
            cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            err, existFlag = WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, 'na')
            if err:
                # the lookup itself failed, so we cannot tell whether to insert or update
                return err
            if existFlag is False:
                cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
            else:
                cur.execute(
                    'UPDATE wholeseqidstable set wholeseqid = %s where (dbid = %s and dbbactid = %s)',
                    [wholeseqidVal, dbidVal, dbbactidVal])
        con.commit()
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ""
def WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, wholeseqidVal=''):
    '''
    Check if a record already exists in the wholeseqidstable table

    Parameters
    ----------
    con,cur
    dbidVal
        db type (e.g. silva, gg)
    dbbactidVal
        sequnence id in dbbact
    wholeseqidVal
        the id in different db (e.g. silva, gg).
        If empty, any record for dbidVal/dbbactidVal whose wholeseqid is not 'na' counts as a match

    Returns
    -------
    err : str
        "" if ok, otherwise the database error message
    exists : bool
        True if at least one matching record was found, False if none (or on error)
    '''
    debug(1, 'WholeSeqIdExists')
    try:
        if not wholeseqidVal:
            # no specific id requested - look for any real (non 'na') id
            cur.execute(
                "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid != 'na'",
                [dbidVal, dbbactidVal])
        else:
            cur.execute(
                "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid = %s ",
                [dbidVal, dbbactidVal, wholeseqidVal])
        return ("", True) if cur.rowcount > 0 else ("", False)
    except psycopg2.DatabaseError as db_err:
        debug(7, 'database error %s' % db_err)
        return "database error %s" % db_err, False
def GetSequenceStrByID(con, cur, seq_id):
    '''
    Get sequence from seqid

    Parameters
    ----------
    con,cur
    seq_id: int
        the dbbact seqid

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (sequence not found)
    sequence
        the sequence column of the matching row.
        NOTE: when no row is found, seq_id itself is returned in this position
    '''
    debug(1, 'GetSequenceStrByID')
    # seq_id is bound by the database driver instead of being formatted into the SQL text
    cur.execute('select sequence from annotationschematest.sequencestable where id=%s', [seq_id])
    if cur.rowcount == 0:
        errmsg = 'no sequeence for seqid %s' % seq_id
        debug(1, errmsg)
        return errmsg, seq_id
    res = cur.fetchone()
    return '', res[0]
def GetSequenceWithNoTaxonomyID(con, cur):
    '''
    Look for one sequence whose taxonomy columns are all empty

    Queries annotationschematest.sequencestable (limit 1) for a row where taxrootrank,
    taxdomain, taxphylum, taxclass, taxfamily, taxgenus and taxorder are each NULL or ''.

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        '' if such a sequence was found, 'no missing taxonomy' otherwise
    seqid
        the id column of the row found, or -1 if no row matched
    '''
    debug(1, 'GetSequenceWithNoTaxonomy')
    query = "select id from annotationschematest.sequencestable where (COALESCE(taxrootrank,'')='' AND COALESCE(taxdomain,'')='' AND COALESCE(taxphylum,'')='' AND COALESCE(taxclass,'')='' AND COALESCE(taxfamily,'')='' AND COALESCE(taxgenus,'')='' AND COALESCE(taxorder,'')='') limit 1"
    cur.execute(query)
    if cur.rowcount != 0:
        row = cur.fetchone()
        return '', row[0]
    # every sequence already has some taxonomy
    not_found = 'no missing taxonomy'
    debug(1, not_found)
    return not_found, -1
def GetGgAnnotationIDs(con, cur, gg_str, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the gg id

    The matching sequences are obtained from get_seqs_from_db_id() using the lower-cased gg id.

    Parameters
    ----------
    con,cur
    gg_str : str
        the gg id to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (the 3 lists are then empty)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a matching sequence,
        and the number of matching sequences in that annotation
    seqids : list of int
        the sequenceids matching the gg id
    seqnames : list of str
        the sequences matching the gg id (same order as seqids)
    '''
    gg_key = gg_str.lower()
    debug(1, 'GetGgAnnotationIDs for gg %s' % gg_key)
    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'gg', gg_key)
    if err != '':
        return err, [], [], []

    # count, per annotation, how many of the matching sequences it contains
    hits = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            hits[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the gg' % len(hits))
    return '', list(hits.items()), seqids, seqnames
def GetSequenceTaxonomy(con, cur, sequence, region=None, userid=0):
    """
    Get the taxonomy string stored for a given sequence

    Parameters
    ----------
    con,cur :
    sequence : str ('ACGT')
        the sequence to search for in the database (lower-cased before the lookup)
    region : int (optional)
        currently not used by this function
    userid : int (optional)
        currently not used by this function

    Returns
    -------
    err : str
        always '' (no error path in this function)
    taxonomy : str
        The taxonomy string (of format d__XXX;p__YYYY;...), built from the non-empty
        domain/phylum/class/order/family/genus columns, or 'NA' if the sequence is not found
    """
    debug(1, 'GetSequenceTaxonomy sequence %s' % sequence)
    lowered = sequence.lower()
    cur.execute(
        "SELECT coalesce(taxdomain,''),coalesce(taxphylum,''), coalesce(taxclass,''),coalesce(taxorder,''),coalesce(taxfamily,''), coalesce(taxgenus,'') as taxonomy_str FROM SequencesTable WHERE sequence=%s",
        [lowered])
    if cur.rowcount == 0:
        debug(1, 'taxonomy not found for sequence %s' % lowered)
        return '', 'NA'

    row = cur.fetchone()
    # prefixes are in the same order as the columns of the query
    rank_prefixes = ("d__", "p__", "c__", "o__", "f__", "g__")
    parts = [prefix + row[pos] for pos, prefix in enumerate(rank_prefixes) if row[pos]]
    return '', ';'.join(parts)
def GetGgAnnotationIDs(con, cur, gg_str, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the gg id

    The matching sequences are obtained from get_seqs_from_db_id() using the lower-cased gg id.

    Parameters
    ----------
    con,cur
    gg_str : str
        the gg id to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (the 3 lists are then empty)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a matching sequence,
        and the number of matching sequences in that annotation
    seqids : list of int
        the sequenceids matching the gg id
    seqnames : list of str
        the sequences matching the gg id (same order as seqids)
    '''
    gg_key = gg_str.lower()
    debug(1, 'GetGgAnnotationIDs for gg %s' % gg_key)
    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'gg', gg_key)
    if err != '':
        return err, [], [], []

    # annotationid -> number of matching sequences appearing in it
    per_annotation = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the gg' % len(per_annotation))
    annotationids = list(per_annotation.items())
    return '', annotationids, seqids, seqnames
def GetSequenceTaxonomy(con, cur, sequence, region=None, userid=0):
    """
    Get the taxonomy string stored for a given sequence

    Parameters
    ----------
    con,cur :
    sequence : str ('ACGT')
        the sequence to search for in the database (lower-cased before the lookup)
    region : int (optional)
        currently not used by this function
    userid : int (optional)
        currently not used by this function

    Returns
    -------
    err : str
        always '' (no error path in this function)
    taxonomy : str
        The taxonomy string (of format d__XXX;p__YYYY;...), built from the non-empty
        domain/phylum/class/order/family/genus columns, or 'NA' if the sequence is not found
    """
    debug(1, 'GetSequenceTaxonomy sequence %s' % sequence)
    lowered = sequence.lower()
    query = "SELECT coalesce(taxdomain,''),coalesce(taxphylum,''), coalesce(taxclass,''),coalesce(taxorder,''),coalesce(taxfamily,''), coalesce(taxgenus,'') as taxonomy_str FROM SequencesTable WHERE sequence=%s"
    cur.execute(query, [lowered])
    if cur.rowcount == 0:
        debug(1, 'taxonomy not found for sequence %s' % lowered)
        return '', 'NA'

    row = cur.fetchone()
    # prefixes are in the same order as the columns of the query
    rank_prefixes = ("d__", "p__", "c__", "o__", "f__", "g__")
    named_ranks = []
    for pos, prefix in enumerate(rank_prefixes):
        if row[pos]:
            named_ranks.append(prefix + row[pos])
    return '', ';'.join(named_ranks)
def get_annotaiton_parents():
    '''Get the AnnotationParentsTable entries of an annotation, grouped by annotation detail

    NOTE(review): this function takes no parameters but reads ``cur`` and ``annotationid``.
    Both need to be resolvable from an outer scope when it is called - confirm against the caller.

    Returns
    -------
    err : str
        '' if ok, otherwise the error message (no rows found for the annotationid)
    parents : dict
        keyed by the annotationdetail column, each value is the list of the ontology
        column values found for that detail. Empty dict if no rows were found
    '''
    cur.execute('SELECT annotationdetail,ontology FROM AnnotationParentsTable WHERE idannotation=%s', [annotationid])
    if cur.rowcount == 0:
        errmsg = 'No Annotation Parents found for annotationid %d in AnnotationParentsTable' % annotationid
        debug(3, errmsg)
        return (errmsg, {})

    parents = {}
    for row in cur.fetchall():
        # row is (annotationdetail, ontology)
        parents.setdefault(row[0], []).append(row[1])
    debug(1, 'found %d detail types' % len(parents))
    return '', parents
def GetTaxonomyAnnotationIDs(con, cur, taxonomy, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the taxonomy

    The matching sequences are obtained from get_taxonomy_seqids().

    Parameters
    ----------
    con,cur
    taxonomy : str
        the taxonomy to look for
    userid : int (optional)
        the userid of the querying user (passed on to get_taxonomy_seqids())

    Returns
    -------
    err : str
        always '' (no error path in this function)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a sequence with the taxonomy,
        and the number of sequences with the taxonomy in that annotation
    seqids : list of int
        the sequenceids matching the taxonomy
    '''
    seqids = get_taxonomy_seqids(con, cur, taxonomy=taxonomy, userid=userid)

    # count, per annotation, how many of the matching sequences it contains
    hits = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            hits[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the taxonomy' % len(hits))
    return '', list(hits.items()), seqids
def GetTaxonomyAnnotationIDs(con, cur, taxonomy, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the taxonomy

    The matching sequences are obtained from get_taxonomy_seqids().

    Parameters
    ----------
    con,cur
    taxonomy : str
        the taxonomy to look for
    userid : int (optional)
        the userid of the querying user (passed on to get_taxonomy_seqids())

    Returns
    -------
    err : str
        always '' (no error path in this function)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a sequence with the taxonomy,
        and the number of sequences with the taxonomy in that annotation
    seqids : list of int
        the sequenceids matching the taxonomy
    '''
    seqids = get_taxonomy_seqids(con, cur, taxonomy=taxonomy, userid=userid)

    # annotationid -> number of matching sequences appearing in it
    per_annotation = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the taxonomy' % len(per_annotation))
    annotationids = list(per_annotation.items())
    return '', annotationids, seqids
def GetSequenceIdFromGG(con, cur, ggid):
    '''
    Get the sequence ids for a given greengenes id (from rep. set 97%)

    Parameters
    ----------
    con,cur : database connection and cursor
    ggid : int
        The greengenes (rep_set 97%) identifier of the sequence

    Returns
    -------
    errmsg : str
        "" if ok, error msg if the ggid was not found
    sid : list of int
        the ids of the matching sequences (empty list if not found)
        Note: can be more than one as several dbbact sequences can map to same ggid
    '''
    debug(1, 'get id for ggid %d' % ggid)
    cur.execute('SELECT id FROM SequencesTable WHERE ggid=%s', [ggid])
    if cur.rowcount == 0:
        errmsg = 'ggid %s not found in database' % ggid
        debug(1, errmsg)
        return errmsg, []

    sid = [row[0] for row in cur.fetchall()]
    debug(1, 'found %d sequences for ggid %d' % (len(sid), ggid))
    return '', sid
def get_seqs_from_db_id(con, cur, db_name, db_seq_id):
    '''Get all sequences that match the db_seq_id supplied for silva/greengenes

    Parameters
    ----------
    con, cur
    db_name: str
        name of the database from which the id originates. can be "silva" or "gg"
    db_seq_id: str
        the sequence identifier in the database (i.e. 'FJ978486.1.1387' for silva or '1111883' for greengenes).
        It is lower-cased before the lookup

    Returns
    -------
    error: str or '' if ok
    list of int
        the dbbact ids for all the dbbact sequences matching the db_seq_id
    list of str
        the actual sequences for the dbbact sequences matching the db_seq_id (same order)
    '''
    # maps the database name to the dbid value used in WholeSeqIDsTable
    database_ids = {'silva': 1, 'gg': 2}
    if db_name not in database_ids:
        # both placeholders need a value - a single non-tuple argument raises TypeError
        err = 'database id %s not found. options are: %s' % (db_name, list(database_ids))
        debug(9, err)
        return err, [], []
    db_id = database_ids[db_name]

    db_seq_id = db_seq_id.lower()
    cur.execute(
        "SELECT id,sequence FROM SequencesTable where id in (select distinct dbbactid from WholeSeqIDsTable where WholeSeqID=%s AND dbid=%s)",
        [db_seq_id, db_id])
    seq_ids = []
    sequences = []
    for cres in cur.fetchall():
        seq_ids.append(cres[0])
        sequences.append(cres[1])
    debug(1, 'found %d dbbact sequences for seqid %s' % (len(seq_ids), db_seq_id))
    return '', seq_ids, sequences
def AddWholeSeqId(con, cur, dbidVal, dbbactidVal, wholeseqidVal, noTest=False):
    '''
    Add record to wholeseqidstable table

    Unless noTest is True, the table is first checked (via WholeSeqIdExists) for a
    'na' record with the same dbid and dbbactid: if one exists, the records for that
    dbid/dbbactid are updated to wholeseqidVal, otherwise a new record is inserted.

    Parameters
    ----------
    con,cur
    dbidVal
        db type (e.g. silva, gg)
    dbbactidVal
        sequnence id in dbbact
    wholeseqidVal
        the id in different db (e.g. silva, gg)
    noTest : bool (optional)
        True to always insert without checking for an existing record

    Returns
    -------
    str
        "" if ok, otherwise the error message
    '''
    debug(1, 'AddWholeSeqId')
    insert_sql = 'INSERT INTO wholeseqidstable (dbid, dbbactid, wholeseqid) VALUES (%s, %s, %s)'
    try:
        if noTest is True:
            cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
        else:
            err, existFlag = WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, 'na')
            if err:
                # the lookup itself failed, so we cannot tell whether to insert or update
                return err
            if existFlag is False:
                cur.execute(insert_sql, [dbidVal, dbbactidVal, wholeseqidVal])
            else:
                cur.execute(
                    'UPDATE wholeseqidstable set wholeseqid = %s where (dbid = %s and dbbactid = %s)',
                    [wholeseqidVal, dbidVal, dbbactidVal])
        con.commit()
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e
    return ""
def GetSequenceWithNoTaxonomyID(con, cur):
    '''
    Look for one sequence whose taxonomy columns are all empty

    Queries annotationschematest.sequencestable (limit 1) for a row where taxrootrank,
    taxdomain, taxphylum, taxclass, taxfamily, taxgenus and taxorder are each NULL or ''.

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        '' if such a sequence was found, 'no missing taxonomy' otherwise
    seqid
        the id column of the row found, or -1 if no row matched
    '''
    debug(1, 'GetSequenceWithNoTaxonomy')
    cur.execute(
        "select id from annotationschematest.sequencestable where (COALESCE(taxrootrank,'')='' AND COALESCE(taxdomain,'')='' AND COALESCE(taxphylum,'')='' AND COALESCE(taxclass,'')='' AND COALESCE(taxfamily,'')='' AND COALESCE(taxgenus,'')='' AND COALESCE(taxorder,'')='') limit 1")
    if cur.rowcount != 0:
        found_row = cur.fetchone()
        return '', found_row[0]
    # every sequence already has some taxonomy
    message = 'no missing taxonomy'
    debug(1, message)
    return message, -1
def GetSequenceWithNoHashID(con, cur):
    '''
    Look for one sequence whose hash columns are all empty

    Queries annotationschematest.sequencestable (limit 1) for a row where
    hashfull, hash150 and hash100 are each NULL or ''.

    Parameters
    ----------
    con,cur

    Returns
    -------
    err : str
        '' if such a sequence was found, 'no missing hash' otherwise
    seqid
        the id column of the row found, or -1 if no row matched
    '''
    debug(1, 'GetSequenceWithNoHashID')
    cur.execute(
        "select id from annotationschematest.sequencestable where (COALESCE(hashfull,'')='' AND COALESCE(hash150,'')='' AND COALESCE(hash100,'')='') limit 1")
    if cur.rowcount != 0:
        found_row = cur.fetchone()
        return '', found_row[0]
    # nothing left to hash
    message = 'no missing hash'
    debug(1, message)
    return message, -1
def AddSequenceTax(con, cur, seq_id, col, value):
    '''
    Update a single taxonomy column of one sequence record

    Parameters
    ----------
    con,cur
    seq_id
        id of the row in annotationschematest.sequencestable to update
    col : str
        taxonomyrank column name. Must be a plain identifier
        (ASCII letters, digits and underscore, not starting with a digit)
    value
        taxonomyrank value

    Returns
    -------
    bool
        True if the update was executed and committed,
        False if col is not a plain identifier or any error was encountered
    '''
    import re

    debug(1, 'AddSequenceTax')
    # a column name cannot be bound as a query parameter, so it is validated
    # before being placed into the SQL text
    if not isinstance(col, str) or re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', col) is None:
        debug(7, 'AddSequenceTax: invalid column name %r' % (col,))
        return False
    try:
        # %%s survives the column substitution as %s and is then bound by the driver
        query = 'update annotationschematest.sequencestable set %s=%%s where id=%%s' % (col,)
        cur.execute(query, [value, seq_id])
        con.commit()
    except Exception as e:
        # the True/False contract is kept, but the failure is no longer silent
        debug(7, 'AddSequenceTax failed for seqid %s column %s: %s' % (seq_id, col, e))
        return False
    return True
def WholeSeqIdExists(con, cur, dbidVal, dbbactidVal, wholeseqidVal=''):
    '''
    Check if a record already exists in the wholeseqidstable table

    Parameters
    ----------
    con,cur
    dbidVal
        db type (e.g. silva, gg)
    dbbactidVal
        sequnence id in dbbact
    wholeseqidVal
        the id in different db (e.g. silva, gg).
        If empty, any record for dbidVal/dbbactidVal whose wholeseqid is not 'na' counts as a match

    Returns
    -------
    err : str
        "" if ok, otherwise the database error message
    exists : bool
        True if at least one matching record was found, False if none (or on error)
    '''
    debug(1, 'WholeSeqIdExists')
    try:
        if not wholeseqidVal:
            # no specific id requested - look for any real (non 'na') id
            query = "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid != 'na'"
            params = [dbidVal, dbbactidVal]
        else:
            query = "SELECT * FROM wholeseqidstable where dbid = %s and dbbactid = %s and wholeseqid = %s "
            params = [dbidVal, dbbactidVal, wholeseqidVal]
        cur.execute(query, params)
        return ("", True) if cur.rowcount > 0 else ("", False)
    except psycopg2.DatabaseError as db_err:
        debug(7, 'database error %s' % db_err)
        return "database error %s" % db_err, False
def get_seqs_from_db_id(con, cur, db_name, db_seq_id):
    '''Get all sequences that match the db_seq_id supplied for silva/greengenes

    Parameters
    ----------
    con, cur
    db_name: str
        name of the database from which the id originates. can be "silva" or "gg"
    db_seq_id: str
        the sequence identifier in the database (i.e. 'FJ978486.1.1387' for silva or '1111883' for greengenes).
        It is lower-cased before the lookup

    Returns
    -------
    error: str or '' if ok
    list of int
        the dbbact ids for all the dbbact sequences matching the db_seq_id
    list of str
        the actual sequences for the dbbact sequences matching the db_seq_id (same order)
    '''
    # maps the database name to the dbid value used in WholeSeqIDsTable
    database_ids = {'silva': 1, 'gg': 2}
    if db_name not in database_ids:
        # both placeholders need a value - a single non-tuple argument raises TypeError
        err = 'database id %s not found. options are: %s' % (db_name, list(database_ids))
        debug(9, err)
        return err, [], []
    db_id = database_ids[db_name]

    db_seq_id = db_seq_id.lower()
    cur.execute(
        "SELECT id,sequence FROM SequencesTable where id in (select distinct dbbactid from WholeSeqIDsTable where WholeSeqID=%s AND dbid=%s)",
        [db_seq_id, db_id])
    seq_ids = []
    sequences = []
    for cres in cur.fetchall():
        seq_ids.append(cres[0])
        sequences.append(cres[1])
    debug(1, 'found %d dbbact sequences for seqid %s' % (len(seq_ids), db_seq_id))
    return '', seq_ids, sequences
def UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100):
    '''
    Update the hash columns (hashfull, hash150, hash100) of one sequence record

    Parameters
    ----------
    con,cur
    seq_id
        id of the row in annotationschematest.sequencestable to update
    hash_seq_full
        hash for full
    hash_seq_150
        hash for first 150 characters
    hash_seq_100
        hash for first 100 characters

    Returns
    -------
    bool
        True if the update was executed and committed, False if any error was encountered
    '''
    debug(1, 'UpdateHash')
    try:
        # values are bound by the database driver instead of being formatted into the SQL text
        cur.execute(
            'update annotationschematest.sequencestable set hashfull=%s,hash150=%s,hash100=%s where id=%s',
            [hash_seq_full, hash_seq_150, hash_seq_100, seq_id])
        con.commit()
    except Exception as e:
        # the True/False contract is kept, but the failure is no longer silent
        debug(7, 'UpdateHash failed for seqid %s: %s' % (seq_id, e))
        return False
    return True
def GetHashAnnotations(con, cur, hash_str, userid=None):
    '''
    Get the details of all annotations containing any sequence matching the hash

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for
    userid : int (optional)
        the userid of the querying user (passed on to GetHashAnnotationIDs())

    Returns
    -------
    err : str
        '' if ok, otherwise the error message
    annotations : list of tuples (annotation, counts)
        the details of all annotations that contain a sequence matching the hash (None on error)
        annotation - (see dbannotations.GetAnnotationsFromID() )
        counts - the number of sequences matching the hash appearing in this annotation
    seqids : list of int
        the sequenceids matching the hash (empty on error)
    seqnames : list of str
        the sequences matching the hash (empty on error)
    '''
    debug(1, 'GetHashAnnotations for hash %s' % hash_str)
    # get the annotation ids
    err, annotationids, seqids, seqnames = GetHashAnnotationIDs(con, cur, hash_str, userid)
    if err:
        errmsg = 'Failed to get annotationIDs for hash_str %s: %s' % (hash_str, err)
        debug(6, errmsg)
        # same number of values as the success return, so callers can always unpack 4 items
        return errmsg, None, [], []
    # and get the annotation details for each
    annotations = []
    for cid, ccount in annotationids:
        err, cdetails = dbbact.dbannotations.GetAnnotationsFromID(con, cur, cid)
        if err:
            # skip annotations whose details cannot be retrieved
            debug(6, err)
            continue
        annotations.append((cdetails, ccount))
    debug(1, 'got %d details' % len(annotations))
    return '', annotations, seqids, seqnames
def GetSilvaAnnotationIDs(con, cur, silva_str, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the silva id

    The matching sequences are obtained from get_seqs_from_db_id().

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    err: str
        the error encountered or '' if successful (on error the 3 lists are empty)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a sequence with the silvaID,
        and the number of sequences with the silvaID in that annotation
    seqids : list of int
        the sequenceids that have this silvaID
    seqnames: list of str
        the sequences matching the silvaID
    '''
    debug(1, 'GetSilvaAnnotationIDs for Silva %s' % silva_str)
    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'silva', silva_str)
    if err != '':
        return err, [], [], []
    debug(1, 'found %d matching sequences for the silva' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    hits = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            hits[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Silva' % len(hits))
    return '', list(hits.items()), seqids, seqnames
def GetSilvaAnnotations(con, cur, silva_str, userid=None):
    '''
    Get the details of all annotations containing any sequence matching the silvaID

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for
    userid : int (optional)
        the userid of the querying user (passed on to GetSilvaAnnotationIDs())

    Returns
    -------
    err : str
        '' if ok, otherwise the error message
    annotations : list of tuples (annotation, counts)
        the details of all annotations that contain a sequence matching the silvaID (None on error)
        annotation - (see dbannotations.GetAnnotationsFromID() )
        counts - the number of sequences matching the silvaID appearing in this annotation
    seqids : list of int
        the sequenceids matching the silvaID (empty on error)
    seqnames : list of str
        the sequences matching the silvaID (empty on error)
    '''
    debug(1, 'GetSilvaAnnotations for silva ID %s' % silva_str)
    # get the annotation ids
    err, annotationids, seqids, seqnames = GetSilvaAnnotationIDs(con, cur, silva_str, userid)
    if err:
        errmsg = 'Failed to get annotationIDs for silva_str %s: %s' % (silva_str, err)
        debug(6, errmsg)
        # same number of values as the success return, so callers can always unpack 4 items
        return errmsg, None, [], []
    # and get the annotation details for each
    annotations = []
    for cid, ccount in annotationids:
        err, cdetails = dbannotations.GetAnnotationsFromID(con, cur, cid)
        if err:
            # skip annotations whose details cannot be retrieved
            debug(6, err)
            continue
        annotations.append((cdetails, ccount))
    debug(1, 'got %d details' % len(annotations))
    return '', annotations, seqids, seqnames
def GetSilvaAnnotationIDs(con, cur, silva_str, userid=None):
    '''
    Get annotationids for all annotations containing any sequence matching the silva id

    The matching sequences are obtained from get_seqs_from_db_id().

    Parameters
    ----------
    con,cur
    silva_str : str
        the silva id to look for
    userid : int (optional)
        the userid of the querying user (currently not used by this function)

    Returns
    -------
    err: str
        the error encountered or '' if successful (on error the 3 lists are empty)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a sequence with the silvaID,
        and the number of sequences with the silvaID in that annotation
    seqids : list of int
        the sequenceids that have this silvaID
    seqnames: list of str
        the sequences matching the silvaID
    '''
    debug(1, 'GetSilvaAnnotationIDs for Silva %s' % silva_str)
    err, seqids, seqnames = get_seqs_from_db_id(con, cur, 'silva', silva_str)
    if err != '':
        return err, [], [], []
    debug(1, 'found %d matching sequences for the silva' % len(seqids))

    # annotationid -> number of matching sequences appearing in it
    per_annotation = defaultdict(int)
    for seqid in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [seqid])
        for row in cur.fetchall():
            per_annotation[row[0]] += 1
    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Silva' % len(per_annotation))
    annotationids = list(per_annotation.items())
    return '', annotationids, seqids, seqnames
def GetHashAnnotationIDs(con, cur, hash_str, userid=None):
    '''
    Get the ids of all annotations containing any sequence whose hash matches hash_str

    The hash is lower-cased and compared with ILIKE (no wildcards are added here) against
    the hashfull, hash150 and hash100 columns of SequencesTable.

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for
    userid : int (optional)
        the userid of the querying user. Not used by this function yet
        (see the NOTE about user validation in the code)

    Returns
    -------
    err : str
        '' (this function does not produce an error message itself)
    annotationids : list of (int, int) (annotationid, count)
        the ids of all annotations that contain a sequence with the hash, and for each
        the number of matching sequences in that annotation
    seqids : list of int
        list of the sequenceids that have this hash
    seqnames : list of str
        the matching sequences
    '''
    hash_str = hash_str.lower()
    debug(1, 'GetHashAnnotationIDS for Hash %s' % hash_str)

    cur.execute('SELECT id,sequence from SequencesTable where (hashfull ILIKE %s or hash150 ILIKE %s or hash100 ILIKE %s)', [hash_str] * 3)
    rows = cur.fetchall()
    seqids = [row[0] for row in rows]
    seqnames = [row[1] for row in rows]
    debug(1, 'found %d matching sequences for the Hash' % len(seqids))

    # count, per annotation, how many of the matching sequences it contains
    seqs_per_annotation = defaultdict(int)
    for sequence_id in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [sequence_id])
        for row in cur.fetchall():
            seqs_per_annotation[row[0]] += 1

    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Hash' % len(seqs_per_annotation))
    annotationids = list(seqs_per_annotation.items())
    return '', annotationids, seqids, seqnames
def GetHashAnnotationIDs(con, cur, hash_str, userid=None):
    '''
    Find the annotations that contain sequences with a given hash

    hash_str is lower-cased and matched with ILIKE (as given, without added wildcards)
    against the hashfull, hash150 and hash100 columns of SequencesTable.

    Parameters
    ----------
    con,cur
    hash_str : str
        the hash to look for
    userid : int (optional)
        the userid of the querying user. Currently unused here
        (user validation is still missing, see the NOTE in the code)

    Returns
    -------
    err : str
        '' (no error message is generated by this function)
    annotationids : list of (int, int) (annotationid, count)
        every annotation containing a matching sequence, with the number of
        matching sequences it contains
    seqids : list of int
        the ids of the matching sequences
    seqnames : list of str
        the matching sequences
    '''
    hash_str = hash_str.lower()
    debug(1, 'GetHashAnnotationIDS for Hash %s' % hash_str)

    query_args = [hash_str, hash_str, hash_str]
    cur.execute('SELECT id,sequence from SequencesTable where (hashfull ILIKE %s or hash150 ILIKE %s or hash100 ILIKE %s)', query_args)
    seqids = []
    seqnames = []
    for match in cur.fetchall():
        seqids.append(match[0])
        seqnames.append(match[1])
    debug(1, 'found %d matching sequences for the Hash' % len(seqids))

    annotation_counts = defaultdict(int)
    for cseq in seqids:
        cur.execute('SELECT annotationid from sequencesAnnotationTable where seqid=%s', [cseq])
        for annotation_row in cur.fetchall():
            annotation_id = annotation_row[0]
            annotation_counts[annotation_id] = annotation_counts[annotation_id] + 1

    # NOTE: need to add user validation for the ids!!!!!!
    debug(1, 'found %d unique annotations for the Hash' % len(annotation_counts))
    annotationids = [(annotation_id, count) for annotation_id, count in annotation_counts.items()]
    return '', annotationids, seqids, seqnames
def hash_sequences(filename, short_len=100):
    '''hash all the sequences in a fasta file

    Sequences shorter than short_len are only counted (and their header is kept in all_ids);
    they are not added to any of the hashes.

    Parameters
    ----------
    filename: str
        the fasta file
    short_len: int (optional)
        the length of the sequence prefix used as key in short_hash

    Returns
    -------
    all_ids: set
        the headers of all the sequences in the file (including the too short ones)
    seq_hash: dict of {seq: header}
    seq_lens : set of int
        all the sequence lengths in the fasta file (so we can hash all the lengths in the queries)
    short_hash: dict of {short_seq: {seq: header}}
        for each short_len prefix, the sequences starting with it
    '''
    all_ids = set()
    seq_hash = {}
    seq_lens = set()
    short_hash = defaultdict(dict)
    num_too_short = 0

    for cseq, chead in iter_fasta_seqs(filename):
        all_ids.add(chead)
        clen = len(cseq)
        if clen >= short_len:
            short_hash[cseq[:short_len]][cseq] = chead
            seq_lens.add(clen)
            seq_hash[cseq] = chead
        else:
            num_too_short += 1

    debug(2, 'processed %d sequences.' % len(seq_hash))
    debug(2, 'lens: %s' % seq_lens)
    debug(2, 'num too short: %d' % num_too_short)
    return all_ids, seq_hash, seq_lens, short_hash
def hash_sequences(filename, short_len=100):
    '''Build lookup tables for all the sequences in a fasta file

    A sequence shorter than short_len is skipped (only its header is stored in all_ids
    and it is counted for the debug output).

    Parameters
    ----------
    filename: str
        the fasta file
    short_len: int (optional)
        number of leading characters of each sequence used as the short_hash key

    Returns
    -------
    all_ids: set
        the header of every sequence in the file, skipped ones included
    seq_hash: dict of {seq: header}
    seq_lens : set of int
        the lengths of the sequences that were hashed
    short_hash: dict of {short_seq: {seq: header}}
    '''
    too_short_count = 0
    headers = set()
    full_lookup = {}
    lengths = set()
    prefix_lookup = defaultdict(dict)

    for sequence, header in iter_fasta_seqs(filename):
        headers.add(header)
        sequence_len = len(sequence)
        if sequence_len < short_len:
            too_short_count += 1
            continue
        prefix = sequence[:short_len]
        prefix_lookup[prefix][sequence] = header
        lengths.add(sequence_len)
        full_lookup[sequence] = header

    debug(2, 'processed %d sequences.' % len(full_lookup))
    debug(2, 'lens: %s' % lengths)
    debug(2, 'num too short: %d' % too_short_count)
    return headers, full_lookup, lengths, prefix_lookup
#Update silva summary_str += "Silva script started at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" main_func_silva() summary_str += "Silva script ended at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" #Update tax summary_str += "Tax script started at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" main_func_tax() summary_str += "Tax script ended at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" #Update hash for sequence summary_str += "Seq hash script started at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" main_func_seq_hash() summary_str += "Seq hash script ended at : " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" summary_str += "Sleep sleep at: " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" maint_log += summary_str ##summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist) saveStringToFile("maint_summary_log_" + date_time_str,summary_str) saveStringToFile("maint_log_" + date_time_str,maint_log) #Sleep until the next time debug(2, "go to sleep") time.sleep(sleep_time) saveStringToFile("maint_summary_log_" + date_time_str,summary_str) saveStringToFile("maint_log_" + date_time_str,maint_log)
def main_func_seq_hash():
    '''
    Compute and store the md5 hashes of the database sequences that have no hash yet

    Repeatedly asks dbsequences.GetSequenceWithNoHashID() for a sequence id, hashes the
    upper-cased sequence (full length, first 150 and first 100 characters; 'na' is stored
    when the sequence is too short for a given hash) and writes the result using
    dbsequences.UpdateHash(). Progress is written with saveStringToFile() to
    "hash_log_<start time>" and "hash_summary_log_<start time>".

    The function returns when no sequence without hash is left, and stops when a file
    named "stop_seq_hash" exists (as reported by isFileExist), when a sequence cannot be
    read, or when a hash update fails.
    '''
    SetDebugLevel(0)
    time_format = "%Y-%m-%d--%H:%M:%S"
    date_time_str = datetime.datetime.now().strftime(time_format)

    # connect to the db
    con, cur = db_access.connect_db()
    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    count_seq_success = 0
    count_seq_failure = 0
    hash_log = ""

    while isFileExist("stop_seq_hash") == False:
        err, seq_id = dbsequences.GetSequenceWithNoHashID(con, cur)
        if err or seq_id == -1:
            # no sequence without hash is left. Instead of sleeping we return, since one
            # master script runs all the maintenance scripts
            debug(2, "go to sleep")
            hash_log += "sleep start " + datetime.datetime.now().strftime(time_format) + "\n"
            saveStringToFile("hash_summary_log_sleep_" + date_time_str,
                             "sleep started " + datetime.datetime.now().strftime(time_format))
            return

        hash_log += "sequence id = " + str(seq_id) + "\n"
        err, seq_str = dbsequences.GetSequenceStrByID(con, cur, seq_id)
        if err:
            # the original appended to the undefined name tax_log here, which raised
            # UnboundLocalError instead of recording the problem
            hash_log += "Fatal Error, could not find sequence " + "\n"
            saveStringToFile("hash_log_" + date_time_str, hash_log)
            break

        # 'na' marks a hash that cannot be calculated since the sequence is too short
        hash_seq_full = 'na'
        hash_seq_150 = 'na'
        hash_seq_100 = 'na'
        seq_str = seq_str.upper()
        seq_len = len(seq_str)
        if seq_len > 0:
            hash_seq_full = hashlib.md5(seq_str.encode('utf-8')).hexdigest()
        if seq_len >= 150:
            hash_seq_150 = hashlib.md5(seq_str[:150].encode('utf-8')).hexdigest()
        if seq_len >= 100:
            hash_seq_100 = hashlib.md5(seq_str[:100].encode('utf-8')).hexdigest()

        hash_log += "id: " + str(seq_id) + "\n"
        hash_log += "hash: " + str(hash_seq_full) + "\n"
        hash_log += "hash 150: " + str(hash_seq_150) + "\n"
        hash_log += "hash 100: " + str(hash_seq_100) + "\n"

        updated = dbsequences.UpdateHash(con, cur, seq_id, hash_seq_full, hash_seq_150, hash_seq_100) == True
        if updated:
            hash_log += " SUCCESS" + "\n"
            count_seq_success += 1
        else:
            hash_log += " FAILED" + "\n"
            count_seq_failure += 1

        summary_str = "count_seq_success = %s\ncount_seq_failure = %s\n" % (count_seq_success, count_seq_failure)
        saveStringToFile("hash_summary_log_" + date_time_str, summary_str)
        saveStringToFile("hash_log_" + date_time_str, hash_log)

        debug(2, 'found sequence %s' % seq_str)
        debug(2, 'return %s,%s,%s' % (hash_seq_full, hash_seq_150, hash_seq_100))

        # stop the script in case of error
        if not updated:
            break
).strftime("%Y-%m-%d--%H:%M:%S") + "\n" #Update tax summary_str += "Tax script started at : " + datetime.datetime.now( ).strftime("%Y-%m-%d--%H:%M:%S") + "\n" main_func_tax() summary_str += "Tax script ended at : " + datetime.datetime.now( ).strftime("%Y-%m-%d--%H:%M:%S") + "\n" #Update hash for sequence summary_str += "Seq hash script started at : " + datetime.datetime.now( ).strftime("%Y-%m-%d--%H:%M:%S") + "\n" main_func_seq_hash() summary_str += "Seq hash script ended at : " + datetime.datetime.now( ).strftime("%Y-%m-%d--%H:%M:%S") + "\n" summary_str += "Sleep sleep at: " + datetime.datetime.now().strftime( "%Y-%m-%d--%H:%M:%S") + "\n" maint_log += summary_str ##summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist) saveStringToFile("maint_summary_log_" + date_time_str, summary_str) saveStringToFile("maint_log_" + date_time_str, maint_log) #Sleep until the next time debug(2, "go to sleep") time.sleep(sleep_time) saveStringToFile("maint_summary_log_" + date_time_str, summary_str) saveStringToFile("maint_log_" + date_time_str, maint_log)
def AddSequences(con, cur, sequences, taxonomies=None, ggids=None, primer='V4', commit=True):
    """
    Add sequence entries to database if they do not exist yet

    input:
    con,cur : database connection and cursor
    sequences: list of str
        the sequences to add
    taxonomies: list of str (optional)
        taxonomy of each sequence (stored lower-cased) or None to add 'na'
    ggids: list of int (optional)
        list of GreenGenes id for each sequence or None to add 0
    primer: str (optional)
        Name of the primer (from PrimersTable). default is V4
    commit : bool (optional)
        True (default) to commit, False to wait with the commit

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    seqids : list of int or None
        one id per input sequence (of the newly added or of the already existing sequence),
        or None if error encountered
    """
    # get the primer region id
    idprimer = dbbact.primers.GetIdFromName(con, cur, primer)
    if idprimer < 0:
        debug(2, 'primer %s not found' % primer)
        return "primer %s not found" % primer, None
    debug(1, 'primerid %s' % idprimer)

    seqids = []
    numadded = 0
    try:
        for idx, cseq in enumerate(sequences):
            if len(cseq) < SEED_SEQ_LEN:
                errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, cseq)
                debug(4, errmsg)
                return errmsg, None

            # look for an identical sequence (same length, same primer) already in the database
            err, cseqid = GetSequenceId(con, cur, sequence=cseq, idprimer=idprimer, no_shorter=True, no_longer=True)
            if len(cseqid) == 0:
                # not found, so need to add this sequence
                ctax = 'na' if taxonomies is None else taxonomies[idx].lower()
                cggid = 0 if ggids is None else ggids[idx]
                cseq = cseq.lower()
                insert_values = [idprimer, cseq, len(cseq), ctax, cggid, cseq[:SEED_SEQ_LEN]]
                cur.execute('INSERT INTO SequencesTable (idPrimer,sequence,length,taxonomy,ggid,seedsequence) VALUES (%s,%s,%s,%s,%s,%s) RETURNING id', insert_values)
                cseqid = cur.fetchone()
                numadded += 1
            if len(cseqid) > 1:
                debug(8, 'AddSequences - Same sequence appears twice in database: %s' % cseq)
            seqids.append(cseqid[0])

        if commit:
            con.commit()
        debug(3, "Added %d sequences (out of %d)" % (numadded, len(sequences)))
        return "", seqids
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, None
# main_func_silva: maintenance routine that links database sequences to SILVA ids
# (seqdbid = 1).
#
# What the code below does, in order of appearance:
#   * connects with db_access.connect_db() and loops while isFileExist("stop_silva") == False
#   * writes sequences to 'tempSilvaScript.fasta' with dbsequences.SequencesWholeToFile() and
#     hashes that file with hash_sequences(short_len=150)
#   * if hash_sequences() reports no ids, saves a "sleep started" note and returns
#     (one master script runs all the maintenance scripts, so there is no sleep here)
#   * otherwise adds an 'na' entry per id with dbsequences.AddWholeSeqId(..., noTest=True)
#   * scans 'SILVA_132_SSURef_tax_silva.fasta' with iter_fasta_seqs(), looking up every
#     150 character window of each SILVA sequence in short_hash; for a database sequence
#     contained in the SILVA sequence, the SILVA id is taken from the header (first word,
#     last two dot separated parts removed when there are more than two, lower-cased) and
#     recorded with dbsequences.WholeSeqIdExists() / dbsequences.AddWholeSeqId()
#   * goes over all_ids again with WholeSeqIdExists() / AddWholeSeqId(..., 'na')
#   * logs are written with saveStringToFile() to "silva_log_<start time>" and
#     "silva_summary_log_<start time>"
#
# NOTE(review): the indentation of this function was lost in this copy, so the nesting of
#   the loops and of the `break` statements cannot be determined from here - verify against
#   the original file before changing the control flow.
# NOTE(review): both summary strings pass count_seq_dummy_exist twice; the value printed as
#   "is exist dummy error" is therefore not count_seq_is_exist_dummy_failure, which is
#   incremented but never reported.
# NOTE(review): range(len(cseq) - short_len) does not visit the last possible window start
#   (len(cseq) - short_len) - confirm whether this is intended.
# NOTE(review): the existence check uses the dotted id, but the dots are removed from cid
#   before AddWholeSeqId() is called - confirm which form is stored in the database.
# NOTE(review): count_success, count_failure, count_dummy_success, count_dummy_failure,
#   count, hash_log, sleep_time, num_matches, seq_hash and seq_lens are assigned but not
#   read in this function; isFound is only assigned.
def main_func_silva(): SetDebugLevel(0) date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") #connect to the db con, cur = db_access.connect_db() debug(2, 'Started') if 'OPENU_FLAG' in os.environ: debug(2, 'Openu') else: debug(2, 'normal') count_success = 0 count_failure = 0 count_dummy_success = 0 count_dummy_failure = 0 count_seq_success = 0 count_seq_failure = 0 count_seq_is_exist_failure = 0 count_seq_exist = 0 count_seq_is_exist_dummy_failure = 0 count_seq_dummy_exist = 0 count_seq_dummy_failure = 0 count_seq_dummy_success = 0 count = 1 hash_log = "" sleep_time = 86400 #sleep_time = 10 short_len = 150 seqdbid = 1 # SILVA silva_log = "" tempFileName = 'tempSilvaScript.fasta' silvaFileName = 'SILVA_132_SSURef_tax_silva.fasta' while isFileExist("stop_silva") == False: #Create the file and read it dbsequences.SequencesWholeToFile(con, cur, tempFileName, seqdbid) all_ids, seq_hash, seq_lens, short_hash = hash_sequences( filename=tempFileName, short_len=150) #nothing to do, go to sleep if len(all_ids) == 0: debug(2, "go to sleep") silva_log += "sleep start " + datetime.datetime.now().strftime( "%Y-%m-%d--%H:%M:%S") + "\n" saveStringToFile( "silva_summary_log_sleep_" + date_time_str, "sleep started " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")) #continue return # insted of sleep, one master file run all scripts else: for seq_id in all_ids: err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id, 'na', noTest=True) if err: debug(2, "failed to add dummy") silva_log += "failed to add\n" count_seq_dummy_failure += 1 else: debug(2, "add dummy") silva_log += "added\n" count_seq_dummy_success += 1 idx = 0 num_matches = 0 for cseq, chead in iter_fasta_seqs(silvaFileName): isFound = False idx += 1 if idx % 1000 == 0: debug(2, "count: %d" % idx) summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % 
( count_seq_failure, count_seq_success, count_seq_exist, count_seq_is_exist_failure, count_seq_dummy_failure, count_seq_dummy_success, count_seq_dummy_exist, count_seq_dummy_exist) saveStringToFile("silva_summary_log_" + date_time_str, summary_str) saveStringToFile("silva_log_" + date_time_str, silva_log) for cpos in range(len(cseq) - short_len): ccseq = cseq[cpos:cpos + short_len] if ccseq in short_hash: for k, v in short_hash[ccseq].items(): if k in cseq: cid = chead.split(' ')[0] # remove the tail from the id split_cid = cid.split('.') if len(split_cid) > 2: cid = ".".join(split_cid[:-2]) else: cid = ".".join(split_cid) cid = cid.lower() silva_log += "rec found: seq id %s , db bact id %s, id %s\n" % ( seqdbid, v, cid) #check if already exist err, existFlag = dbsequences.WholeSeqIdExists( con, cur, seqdbid, v, cid) if err: count_seq_is_exist_failure += 1 silva_log += "failed to found\n" if existFlag: count_seq_exist += 1 silva_log += "found\n" isFound = True break else: debug(2, "add normal") cid = cid.replace('.', '') cid = cid.lower() err = dbsequences.AddWholeSeqId( con, cur, seqdbid, v, cid) if err: silva_log += "failed to add\n" count_seq_failure += 1 break else: silva_log += "added\n" count_seq_success += 1 isFound = True break #go over all ids, if not exist add record for seq_id in all_ids: err, existFlag = dbsequences.WholeSeqIdExists( con, cur, seqdbid, seq_id) if err: count_seq_is_exist_dummy_failure += 1 silva_log += "failed to found\n" if existFlag: count_seq_dummy_exist += 1 silva_log += "found\n" isFound = True break else: debug(2, "add dummy") err = dbsequences.AddWholeSeqId(con, cur, seqdbid, seq_id, 'na') if err: silva_log += "failed to add\n" count_seq_dummy_failure += 1 break else: silva_log += "added\n" count_seq_dummy_success += 1 break debug(2, 'done') summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy 
error %s\n" % ( count_seq_failure, count_seq_success, count_seq_exist, count_seq_is_exist_failure, count_seq_dummy_failure, count_seq_dummy_success, count_seq_dummy_exist, count_seq_dummy_exist) saveStringToFile("silva_summary_log_" + date_time_str, summary_str) saveStringToFile("silva_log_" + date_time_str, silva_log)
def AddSequences(con, cur, sequences, taxonomies=None, ggids=None, primer='V4', commit=True):
    """
    Make sure every sequence in the list has an entry in SequencesTable

    A sequence which is already in the database (same primer, exactly the same length)
    is not added again; its existing id is returned instead.

    input:
    con,cur : database connection and cursor
    sequences: list of str
        the sequences to add
    taxonomies: list of str (optional)
        taxonomy of each sequence (saved in lower case) or None to add 'na'
    ggids: list of int (optional)
        list of GreenGenes id for each sequence or None to add 0
    primer: str (optional)
        Name of the primer (from PrimersTable). default is V4
    commit : bool (optional)
        True (default) to commit, False to wait with the commit

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    seqids : list of int or None
        the database id of each sequence (in the order of sequences) or None if error encountered
    """
    new_count = 0
    seqids = []

    # get the primer region id
    idprimer = dbbact.primers.GetIdFromName(con, cur, primer)
    if idprimer < 0:
        debug(2, 'primer %s not found' % primer)
        return "primer %s not found" % primer, None
    debug(1, 'primerid %s' % idprimer)

    try:
        for position, cseq in enumerate(sequences):
            if len(cseq) < SEED_SEQ_LEN:
                errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, cseq)
                debug(4, errmsg)
                return errmsg, None

            # test if already exists, skip it
            err, cseqid = GetSequenceId(con, cur, sequence=cseq, idprimer=idprimer, no_shorter=True, no_longer=True)
            if len(cseqid) == 0:
                # not found, so need to add this sequence
                if taxonomies is not None:
                    ctax = taxonomies[position].lower()
                else:
                    ctax = 'na'
                if ggids is not None:
                    cggid = ggids[position]
                else:
                    cggid = 0
                cseq = cseq.lower()
                seed = cseq[:SEED_SEQ_LEN]
                cur.execute('INSERT INTO SequencesTable (idPrimer,sequence,length,taxonomy,ggid,seedsequence) VALUES (%s,%s,%s,%s,%s,%s) RETURNING id', [idprimer, cseq, len(cseq), ctax, cggid, seed])
                cseqid = cur.fetchone()
                new_count += 1
            if len(cseqid) > 1:
                debug(8, 'AddSequences - Same sequence appears twice in database: %s' % cseq)
            seqids.append(cseqid[0])

        if commit:
            con.commit()
        debug(3, "Added %d sequences (out of %d)" % (new_count, len(sequences)))
        return "", seqids
    except psycopg2.DatabaseError as e:
        debug(7, 'database error %s' % e)
        return "database error %s" % e, None
def GetSequenceId(con, cur, sequence, idprimer=None, no_shorter=False, no_longer=False):
    """
    Get sequence ids for a sequence

    The candidates are all database sequences sharing the first SEED_SEQ_LEN characters
    with the (lower-cased) query. A candidate matches if it is identical to the query over
    the compared length (the shorter of the two, or the query length when no_shorter is set).

    input:
    con,cur : database connection and cursor
    sequence : str (ACGT sequences)
        if made only of digits, it is treated as a greengenes id and the result of
        GetSequenceIdFromGG() is returned
    idprimer : int (optional)
        if supplied, only sequences from this idPrimer are returned
    no_shorter : bool (optional)
        False (default) to enable shorter db sequences matching sequence, True to require
        the database sequence to be at least as long as the query sequence
    no_longer : bool (optional)
        False (default) to enable longer db sequences matching sequence, True to require
        the database sequence to be not longer than the query sequence

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    sid : list of int
        the ids of the matching sequences (empty list if not found)
        Note: can be more than one as we also look for short subsequences / long supersequences
    """
    # check if the sequence is made only of digits assume it is a greengenes id
    if sequence.isdigit():
        debug(1, 'getting id for ggid %s' % sequence)
        return GetSequenceIdFromGG(con, cur, int(sequence))

    sid = []
    cseq = sequence.lower()
    if len(cseq) < SEED_SEQ_LEN:
        errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, cseq)
        debug(4, errmsg)
        return errmsg, sid

    # look for all sequences matching the seed. The primer is fetched in the same query
    # so no additional query per candidate is needed
    cseedseq = cseq[:SEED_SEQ_LEN]
    cur.execute('SELECT id,sequence,idPrimer FROM SequencesTable WHERE seedsequence=%s', [cseedseq])
    if cur.rowcount == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid

    cseqlen = len(cseq)
    for cres in cur.fetchall():
        resid = cres[0]
        resseq = cres[1]
        resprimer = cres[2]
        if no_shorter and len(resseq) < cseqlen:
            continue
        if no_longer and len(resseq) > cseqlen:
            continue
        comparelen = min(len(resseq), cseqlen)
        if cseq[:comparelen] != resseq[:comparelen]:
            continue
        # each matching id is added once: always when no primer was requested, otherwise
        # only when the primer matches (the original could add the same id twice)
        if idprimer is None or resprimer == idprimer:
            sid.append(resid)

    if len(sid) == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid
    return '', sid
# main_func_gg: maintenance routine that links database sequences to GreenGenes ids
# (seqdbid = 2).
#
# What the code below does, in order of appearance:
#   * connects with db_access.connect_db() and loops while isFileExist("stop_gg") == False
#   * writes sequences to 'tempGgScript.fasta' with dbsequences.SequencesWholeToFile() and
#     hashes that file with hash_sequences(short_len=150)
#   * if hash_sequences() reports no ids, saves a "sleep started" note and returns
#     (one master script runs all the maintenance scripts, so there is no sleep here)
#   * otherwise adds an 'na' entry per id with dbsequences.AddWholeSeqId(..., noTest = True)
#   * scans 'gg_13_5.fasta' with iter_fasta_seqs(), looking up every 150 character window
#     of each GreenGenes sequence in short_hash; for a database sequence contained in the
#     GreenGenes sequence, the id (first word of the fasta header) is recorded with
#     dbsequences.WholeSeqIdExists() / dbsequences.AddWholeSeqId()
#   * goes over all_ids again with WholeSeqIdExists() / AddWholeSeqId(..., 'na')
#   * logs are written with saveStringToFile() to "gg_log_<start time>" and
#     "gg_summary_log_<start time>"
#
# NOTE(review): the indentation of this function was lost in this copy, so the nesting of
#   the loops and of the `break` statements cannot be determined from here - verify against
#   the original file before changing the control flow.
# NOTE(review): both summary strings pass count_seq_dummy_exist twice; the value printed as
#   "is exist dummy error" is therefore not count_seq_is_exist_dummy_failure, which is
#   incremented but never reported.
# NOTE(review): range(len(cseq) - short_len) does not visit the last possible window start
#   (len(cseq) - short_len) - confirm whether this is intended.
# NOTE(review): count_success, count_failure, count_dummy_success, count_dummy_failure,
#   count, hash_log, sleep_time, num_matches, seq_hash and seq_lens are assigned but not
#   read in this function; isFound is only assigned.
def main_func_gg(): SetDebugLevel(0) date_time_str = datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") #connect to the db con, cur = db_access.connect_db() debug(2, 'Started') if 'OPENU_FLAG' in os.environ: debug(2, 'Openu') else: debug(2, 'normal') count_success = 0 count_failure = 0 count_dummy_success = 0 count_dummy_failure = 0 count_seq_success = 0 count_seq_failure = 0 count_seq_is_exist_failure = 0 count_seq_exist = 0 count_seq_is_exist_dummy_failure = 0 count_seq_dummy_exist = 0 count_seq_dummy_failure = 0 count_seq_dummy_success = 0 count = 1 hash_log = "" sleep_time = 86400 #sleep_time = 10 short_len=150 seqdbid = 2 # GG gg_log = "" tempFileName = 'tempGgScript.fasta' #ggFileName = '/Volumes/Photos/Temporary Studies/gg_13_5.fasta' ggFileName = 'gg_13_5.fasta' while isFileExist("stop_gg") == False: #Create the file and read it dbsequences.SequencesWholeToFile(con, cur, tempFileName, seqdbid) all_ids , seq_hash, seq_lens, short_hash = hash_sequences(filename=tempFileName, short_len=150) #nothing to do, go to sleep if len(all_ids) == 0: debug(2, "go to sleep") gg_log += "sleep start " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S") + "\n" saveStringToFile("gg_summary_log_sleep_" + date_time_str,"sleep started " + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")) #time.sleep(sleep_time) #continue return # insted of sleep, one master file run all scripts else: for seq_id in all_ids: err = dbsequences.AddWholeSeqId(con,cur, seqdbid, seq_id, 'na', noTest = True) if err: debug(2, "failed to add dummy") gg_log += "failed to add\n" count_seq_dummy_failure += 1 else: debug(2, "add dummy") gg_log += "added\n" count_seq_dummy_success += 1 idx = 0 num_matches = 0 for cseq, chead in iter_fasta_seqs(ggFileName): isFound = False idx += 1 if idx % 1000 == 0: debug(2, "count: %d" % idx) summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = 
%s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist) saveStringToFile("gg_summary_log_" + date_time_str,summary_str) saveStringToFile("gg_log_" + date_time_str,gg_log) for cpos in range(len(cseq) - short_len): ccseq = cseq[cpos:cpos + short_len] if ccseq in short_hash: for k, v in short_hash[ccseq].items(): if k in cseq: cid = chead.split(' ')[0] gg_log += "rec found: seq id %s , db bact id %s, id %s\n" % (seqdbid, v, cid) #check if already exist err, existFlag = dbsequences.WholeSeqIdExists(con,cur, seqdbid, v, cid); if err: count_seq_is_exist_failure += 1 gg_log += "failed to found\n" if existFlag: count_seq_exist += 1 gg_log += "found\n" isFound = True break else: debug(2, "add normal") err = dbsequences.AddWholeSeqId(con,cur, seqdbid, v, cid) if err: gg_log += "failed to add\n" count_seq_failure += 1 break else: gg_log += "added\n" count_seq_success += 1 isFound = True break #go over all ids, if not exist add record for seq_id in all_ids: err, existFlag = dbsequences.WholeSeqIdExists(con,cur, seqdbid, seq_id) if err: count_seq_is_exist_dummy_failure += 1 gg_log += "failed to found\n" if existFlag: count_seq_dummy_exist += 1 gg_log += "found\n" isFound = True break else: debug(2, "add dummy") err = dbsequences.AddWholeSeqId(con,cur, seqdbid, seq_id, 'na') if err: gg_log += "failed to add\n" count_seq_dummy_failure += 1 break else: gg_log += "added\n" count_seq_dummy_success += 1 break debug(2, 'done') summary_str = "failed count = %s\nsuccess count = %s\nis exist count = %s\nis exist error = %s\nfailed dummy count = %s\nsuccess dummy count = %s\nis exist dummy count = %s\nis exist dummy error %s\n" % (count_seq_failure,count_seq_success,count_seq_exist,count_seq_is_exist_failure,count_seq_dummy_failure,count_seq_dummy_success,count_seq_dummy_exist,count_seq_dummy_exist) saveStringToFile("gg_summary_log_" + 
date_time_str,summary_str) saveStringToFile("gg_log_" + date_time_str,gg_log)
def connect_db(servertype='main', schema='AnnotationSchemaTest'):
    """
    connect to the postgres database and return the connection and cursor

    input:
    servertype : str (optional)
        the database to access. options are:
            'main' (default) - the main remote production database
            'develop' - the remote development database
            'local' - a local postgres instance of the database
            'amnon' - the local mac installed veriosn of dbbact
            'openu' - the connection is opened without specifying a host
        for any other value a message is printed and the default settings are used
    schema : str (optional)
        name of the schema containing the annotation database (used for the search_path)

    output:
    con : the database connection
    cur : the database cursor (a psycopg2.extras.DictCursor)

    raises SystemError if psycopg2 reports a DatabaseError
    """
    debug(1, 'connecting to database')
    try:
        # settings used when the server type is not recognized
        database, user, password, port = 'scdb', '******', '******', 5432
        host = 'localhost'

        if servertype == 'main':
            debug(1, 'servertype is main')
            database, user, password, port = 'scdb', '******', '******', 29546
        elif servertype == 'develop':
            debug(1, 'servertype is develop')
            database, user, password, port = 'scdb_develop', '******', '******', 29546
        elif servertype == 'local':
            debug(1, 'servertype is local')
            database, user, password, port = 'postgres', '******', '******', 5432
        elif servertype == 'amnon':
            debug(1, 'servertype is amnon')
            database, user, password, port = 'dbbact', '******', '******', 5432
        elif servertype == 'openu':
            debug(1, 'servertype is openu')
            database, user, password, port = 'scdb', '******', '******', 5432
        else:
            debug(6, 'unknown server type %s' % servertype)
            print('unknown server type %s' % servertype)

        connect_args = dict(database=database, user=user, password=password, port=port)
        if servertype == 'openu':
            debug(1, 'connecting database=%s, user=%s, port=%d' % (database, user, port))
        else:
            debug(1, 'connecting host=%s, database=%s, user=%s, port=%d' % (host, database, user, port))
            connect_args['host'] = host
        con = psycopg2.connect(**connect_args)

        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute('SET search_path to %s' % schema)
        debug(1, 'connected to database')
        return (con, cur)
    except psycopg2.DatabaseError as e:
        print('Cannot connect to database. Error %s' % e)
        raise SystemError('Cannot connect to database. Error %s' % e)
def GetSequenceId(con, cur, sequence, idprimer=None, no_shorter=False, no_longer=False):
    """
    Get sequence ids for a sequence

    The candidates are all database sequences sharing the first SEED_SEQ_LEN characters
    with the (lower-cased) query. A candidate matches if it is identical to the query over
    the compared length (the shorter of the two, or the query length when no_shorter is set).

    input:
    con,cur : database connection and cursor
    sequence : str (ACGT sequences)
        if made only of digits, it is treated as a greengenes id and the result of
        GetSequenceIdFromGG() is returned
    idprimer : int (optional)
        if supplied, only sequences from this idPrimer are returned
    no_shorter : bool (optional)
        False (default) to enable shorter db sequences matching sequence, True to require
        the database sequence to be at least as long as the query sequence
    no_longer : bool (optional)
        False (default) to enable longer db sequences matching sequence, True to require
        the database sequence to be not longer than the query sequence

    output:
    errmsg : str
        "" if ok, error msg if error encountered
    sid : list of int
        the ids of the matching sequences (empty list if not found)
        Note: can be more than one as we also look for short subsequences / long supersequences
    """
    # check if the sequence is made only of digits assume it is a greengenes id
    if sequence.isdigit():
        debug(1, 'getting id for ggid %s' % sequence)
        return GetSequenceIdFromGG(con, cur, int(sequence))

    sid = []
    cseq = sequence.lower()
    if len(cseq) < SEED_SEQ_LEN:
        errmsg = 'sequence too short (<%d) for sequence %s' % (SEED_SEQ_LEN, cseq)
        debug(4, errmsg)
        return errmsg, sid

    # look for all sequences matching the seed. The primer is fetched in the same query
    # so no additional query per candidate is needed
    cseedseq = cseq[:SEED_SEQ_LEN]
    cur.execute('SELECT id,sequence,idPrimer FROM SequencesTable WHERE seedsequence=%s', [cseedseq])
    if cur.rowcount == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid

    cseqlen = len(cseq)
    for cres in cur.fetchall():
        resid = cres[0]
        resseq = cres[1]
        resprimer = cres[2]
        if no_shorter and len(resseq) < cseqlen:
            continue
        if no_longer and len(resseq) > cseqlen:
            continue
        comparelen = min(len(resseq), cseqlen)
        if cseq[:comparelen] != resseq[:comparelen]:
            continue
        # each matching id is added once: always when no primer was requested, otherwise
        # only when the primer matches (the original could add the same id twice)
        if idprimer is None or resprimer == idprimer:
            sid.append(resid)

    if len(sid) == 0:
        errmsg = 'sequence %s not found' % sequence
        debug(1, errmsg)
        return errmsg, sid
    return '', sid
def main_func_tax():
    '''
    Assign taxonomy to the database sequences that have no taxonomy yet

    Repeatedly asks dbsequences.GetSequenceWithNoTaxonomyID() for a sequence id, writes the
    sequence to "rdp_classifier_2.12/input" with createSeqFile(), runs the classifier jar
    located under "rdp_classifier_2.12/" on it and reads the tab separated result from
    "rdp_classifier_2.12/output". For every field of the result which is a rank name
    (rootrank ... species), the field before it is used as the taxonomy value and the field
    after it as the confidence; values with confidence >= 0.9 are stored in the column
    "tax<rank>" using dbsequences.AddSequenceTax().

    Progress is written with saveStringToFile() to "tax_log_<start time>" and
    "tax_summary_log_<start time>".

    The function returns when no sequence without taxonomy is left, and stops when a file
    named "stop_tax" exists (as reported by isFileExist), when a sequence cannot be read,
    or when storing a taxonomy value fails.
    '''
    SetDebugLevel(0)
    time_format = "%Y-%m-%d--%H:%M:%S"
    date_time_str = datetime.datetime.now().strftime(time_format)

    # connect to the db
    con, cur = db_access.connect_db()
    debug(2, 'Started')
    if 'OPENU_FLAG' in os.environ:
        debug(2, 'Openu')
    else:
        debug(2, 'normal')

    rank_list = ["rootrank", "life", "domain", "kingdom", "phylum",
                 "class", "order", "family", "genus", "species"]

    count_success = 0
    count_failure = 0
    count_seq_success = 0
    count_seq_failure = 0
    tax_log = ""
    rdp_exe_location = "rdp_classifier_2.12/"
    input_file_name = "%sinput" % rdp_exe_location
    output_file_name = "%soutput" % rdp_exe_location

    while isFileExist("stop_tax") == False:
        removeFile(input_file_name)
        removeFile(output_file_name)

        err, seq_id = dbsequences.GetSequenceWithNoTaxonomyID(con, cur)
        if err or seq_id == -1:
            # no sequence without taxonomy is left. Instead of sleeping we return, since
            # one master script runs all the maintenance scripts
            debug(2, "go to sleep")
            tax_log += "sleep start " + datetime.datetime.now().strftime(time_format) + "\n"
            saveStringToFile("tax_summary_log_sleep_" + date_time_str,
                             "sleep started " + datetime.datetime.now().strftime(time_format))
            return

        tax_log += "sequence id = " + str(seq_id) + "\n"
        err, seq_str = dbsequences.GetSequenceStrByID(con, cur, seq_id)
        if err:
            tax_log += "Fatal Error, could not find sequence " + "\n"
            break

        # get the taxononmy for specific sequence
        # java -Xmx1g -jar dist/classifier.jar classify -o output_filename example.fasta
        createSeqFile(input_file_name, seq_str)
        os.system("java -Xmx1g -jar %sdist/classifier.jar classify -o %s %s" % (rdp_exe_location, output_file_name, input_file_name))
        tex_res = readResultFromFile(output_file_name)

        tax_log += "the data:\n"
        # has_failure is kept for the whole sequence. The original reset it for every
        # field, so a failed update was forgotten and the script never stopped on error
        has_failure = False
        for line in tex_res:
            tax_log += line + "\n"
            data = line.split('\t')

            # search for the rank names
            # NOTE(review): prev is never updated, so the log shows "<rank> = " without a
            # value - presumably the taxonomy value was meant; confirm before changing
            prev = ""
            size_of_list = len(data)
            for list_index in range(size_of_list):
                curr_val = data[list_index].replace("\"", "").replace("\n", "")
                if curr_val not in rank_list:
                    continue
                tax_log += curr_val + " = " + prev
                # a rank needs a value before it and a confidence after it. The original
                # used `&` here, which binds tighter than the comparisons, so the upper
                # bound was never tested and data[list_index + 1] could raise IndexError
                if 0 < list_index < (size_of_list - 1):
                    prev_val = data[list_index - 1].replace("\"", "").replace("\n", "")
                    next_val = data[list_index + 1].replace("\"", "").replace("\n", "")
                    if float(next_val) >= 0.9:
                        # Add to DB
                        if dbsequences.AddSequenceTax(con, cur, seq_id, "tax" + curr_val, prev_val) == True:
                            tax_log += " SUCCESS" + "\n"
                            count_seq_success += 1
                        else:
                            tax_log += " FAILED" + "\n"
                            count_seq_failure += 1
                            has_failure = True
                    else:
                        tax_log += " FAILED (low probablility)" + "\n"
                else:
                    tax_log += " FAILED (bad index)" + "\n"

        if has_failure:
            count_failure += 1
        else:
            count_success += 1

        summary_str = "count_success = %s\ncount_failure = %s\ncount_seq_success = %s\ncount_seq_failure = %s\n" % (
            count_success, count_failure, count_seq_success, count_seq_failure)
        saveStringToFile("tax_summary_log_" + date_time_str, summary_str)
        saveStringToFile("tax_log_" + date_time_str, tax_log)

        debug(2, 'found sequence %s' % seq_str)
        debug(2, 'return %s' % tex_res)

        # stop the script in case of error
        if count_failure > 0:
            break
def add_term_info(servertype='develop', overwrite=False, add_pairs=True, add_single=True, add_parents=True, max_annotation_terms=15):
    '''Rebuild TermInfoTable with the experiment/annotation counts of every ontology term used in the annotations.

    All existing rows of TermInfoTable are deleted and the table is refilled from
    AnnotationsTable / AnnotationListTable. For each entry the following is stored:
        TotalExperiments: number of different experiments the term (or term pair) appears in
        TotalAnnotations: number of annotation entries the term (or term pair) appears in
        TermType: 'single' or 'pair'
    A term attached to an annotation with the 'low' detail type is stored with a '-' prefix.
    The names of a term pair are sorted and joined with '+'.

    Parameters
    ----------
    servertype : str (optional)
        database to connect to ('main' or 'develop' or 'local'), passed to connect_db()
    overwrite : bool (optional)
        not read by this function - TermInfoTable is always cleared before filling it
    add_pairs : bool (optional)
        True to count and store term pairs from each annotation
    add_single : bool (optional)
        True to store the counts of each single term
    add_parents : bool (optional)
        not read by this function
    max_annotation_terms : int (optional)
        pairs are generated only for annotations with at most this number of different terms
    '''
    con, cur = connect_db(servertype=servertype)

    # start from an empty table
    cur.execute('DELETE FROM TermInfoTable')

    # ids of the detail types that mark a term as "lower in" (stored as negative term ids below)
    lowertypes = set()
    cur.execute('SELECT id FROM AnnotationDetailsTypesTable WHERE description=%s', ['low'])
    lowertypes.add(cur.fetchone()[0])

    # keys are a signed term id (int) or a sorted tuple of two signed term ids (pair)
    term_id_experiments = defaultdict(set)
    term_id_annotations = defaultdict(int)
    all_term_ids = set()

    cur.execute('SELECT id, idexp from AnnotationsTable')
    annotations = cur.fetchall()
    debug(6, 'Getting term info from %d annotations' % len(annotations))

    for position, (annotation_id, experiment_id) in enumerate(row[:2] for row in annotations):
        annotation_terms = set()
        if position % 100 == 0:
            debug(4, 'processed %d annotations' % position)

        cur.execute('SELECT idontology, idannotationdetail FROM AnnotationListTable WHERE idannotation=%s', [annotation_id])
        for detail_row in cur.fetchall():
            ontology_id = detail_row[0]
            all_term_ids.add(ontology_id)
            # "lower in" terms get a negative id, turned into a '-' name prefix when storing
            signed_id = -ontology_id if detail_row[1] in lowertypes else ontology_id
            term_id_experiments[signed_id].add(experiment_id)
            term_id_annotations[signed_id] += 1
            annotation_terms.add(signed_id)

        if add_pairs and len(annotation_terms) <= max_annotation_terms:
            for pair in tessa(list(annotation_terms)):
                pair_key = tuple(sorted(pair))
                term_id_experiments[pair_key].add(experiment_id)
                term_id_annotations[pair_key] += 1

    # map each encountered term id to its description
    term_id_to_name = {}
    for ontology_id in all_term_ids:
        cur.execute('SELECT description FROM OntologyTable WHERE id=%s LIMIT 1', [ontology_id])
        name_row = cur.fetchone()
        term_id_to_name[ontology_id] = name_row[0]

    def signed_name(signed_id):
        # name of a signed term id: positive ids as is, the rest with a '-' prefix
        if signed_id > 0:
            return term_id_to_name[signed_id]
        return '-' + term_id_to_name[-signed_id]

    debug(6, 'found %d terms' % len(term_id_experiments))
    insert_statement = 'INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)'
    num_single = 0
    num_pairs = 0
    for key, experiments in term_id_experiments.items():
        if add_single and isinstance(key, int):
            cur.execute(insert_statement, [signed_name(key), len(experiments), term_id_annotations[key], 'single'])
            num_single += 1
        if add_pairs and isinstance(key, tuple):
            pair_term = '+'.join(sorted([signed_name(member) for member in key]))
            cur.execute(insert_statement, [pair_term, len(experiments), term_id_annotations[key], 'pair'])
            num_pairs += 1

    debug(6, 'updated %d single, %d pairs' % (num_single, num_pairs))
    debug(6, 'commiting')
    con.commit()
    debug(6, 'done')