def delete_term(ctx, term):
    '''Delete a term from the dbbact ontology

    The term is deleted only if it is not the parent of any other term and does not
    appear in any annotation. The user is asked to confirm interactively before
    anything is deleted.

    Parameters
    ----------
    ctx:
        context object. ctx.obj is read for the keys 'con' (database connection),
        'cur' (database cursor) and 'log_file'
    term: str
        the term to delete (converted to lower case before lookup)

    Raises
    ------
    ValueError
        if the term is a parent of other terms, appears in annotations,
        or the user does not answer 'y'/'yes'
    '''
    con = ctx.obj['con']
    cur = ctx.obj['cur']
    log_file = ctx.obj['log_file']

    term = term.lower()
    debug(3, 'delete-term for term %s' % term)
    # create_if_not_exist=False: we only want the id of an existing term
    term_id = _add_dbbact_term(con, cur, term, create_if_not_exist=False, only_dbbact=True)

    # check if it is a parent of someone
    cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [term_id])
    if cur.rowcount > 0:
        raise ValueError('The term %s is a parent of %d terms. Cannot delete' % (term, cur.rowcount))

    # check if it appears in annotations
    cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology = %s', [term_id])
    if cur.rowcount > 0:
        raise ValueError('The term %s appears in %d annotations. Cannot delete' % (term, cur.rowcount))

    res = input('Delete %s (%s): Are you sure (y/n)?' % (term, term_id))
    if res.lower() not in ('y', 'yes'):
        raise ValueError('Delete aborted')

    # delete all the entries where it is a child
    cur.execute('DELETE FROM ontologytreestructuretable WHERE ontologyid=%s', [term_id])
    # and delete the term itself
    cur.execute('DELETE FROM ontologytable WHERE id=%s', [term_id])
    con.commit()
    _write_log(log_file, 'delete_term for term: %s (id: %s)' % (term, term_id))
def delete_annotation(con, cur, annotationid, userid=0, delete=False, commit=False):
    '''Delete a single annotation, or only report it when delete is False

    Parameters
    ----------
    con, cur:
        database connection and cursor
    annotationid: int
        id of the annotation to delete
    userid: int, optional
        passed through to DeleteAnnotation
    delete: bool, optional
        if True, call DeleteAnnotation. If False, only the debug message is emitted
    commit: bool, optional
        if True, commit the connection before returning
    '''
    debug(3, 'delete annotation %d' % annotationid)
    # DeleteAnnotation is always asked not to commit; committing is decided below
    outcome = DeleteAnnotation(con, cur, annotationid=annotationid, userid=userid, commit=False) if delete else None
    if outcome:
        # a truthy return value is reported at level 5 (presumably an error message)
        debug(5, outcome)
    if commit:
        con.commit()
def add_term(ctx, term):
    '''Add a new term to the dbbact ontology

    Parameters
    ----------
    ctx:
        context object. ctx.obj supplies 'con', 'cur' and 'log_file'
    term: str
        the term to add (lower-cased before it is passed to _add_dbbact_term)
    '''
    settings = ctx.obj
    con, cur, log_file = settings['con'], settings['cur'], settings['log_file']

    lowered = term.lower()
    debug(3, 'add-term for term %s' % lowered)
    new_id = _add_dbbact_term(con, cur, lowered)
    con.commit()
    # the log entry is written only after the commit succeeded
    _write_log(log_file, 'add_term for term: %s (id: %s)' % (lowered, new_id))
def add_term_to_annotation(ctx, old_term, new_term, experiments, add_if_not_exist):
    '''Add another term to annotations containing a given term

    Every AnnotationListTable entry that references old_term gets a sibling entry
    for new_term with the same annotation id and annotation detail id.

    Parameters
    ----------
    ctx:
        context object. ctx.obj supplies 'con', 'cur' and 'log_file'
    old_term: str
        the term to look for in existing annotations
    new_term: str
        the term to add to these annotations
    experiments: collection of experiment ids or None
        if not None and not empty, only annotations whose idexp is in this
        collection are updated
    add_if_not_exist: bool
        passed to _add_dbbact_term as create_if_not_exist for new_term

    Raises
    ------
    ValueError
        if old_term does not exist or no annotation contains it
    '''
    con = ctx.obj['con']
    cur = ctx.obj['cur']
    log_file = ctx.obj['log_file']

    old_term = old_term.lower()
    new_term = new_term.lower()
    # the new term is the one being added; the old term selects the annotations
    debug(3, 'add term %s to annotations with term %s' % (new_term, old_term))

    # not sure if multiple provides None or [], so let's make it None
    if experiments is not None:
        experiments = set(experiments) if len(experiments) > 0 else None

    old_term_id = _get_term_id(con, cur, old_term, only_dbbact=False)
    if old_term_id is None:
        raise ValueError('Term %s does not exist' % old_term)
    new_term_id = _add_dbbact_term(con, cur, new_term, create_if_not_exist=add_if_not_exist, only_dbbact=False)

    # get all annotations with the old term
    cur.execute('SELECT idannotation,idannotationdetail FROM AnnotationListTable WHERE idontology=%s', [old_term_id])
    if cur.rowcount == 0:
        raise ValueError('No annotations found containing term %s' % old_term)
    debug(3, 'found %d annotations with the term %s' % (cur.rowcount, old_term))
    annotations = cur.fetchall()

    num_added = 0
    num_skipped = 0
    for cannotation in annotations:
        cannotation_id = cannotation['idannotation']
        cannotation_detail = cannotation['idannotationdetail']
        if experiments is not None:
            cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [cannotation_id])
            if cur.rowcount == 0:
                debug(7, 'experiment ID %s not found! skipping' % cannotation_id)
                num_skipped += 1
                continue
            res = cur.fetchone()
            if res['idexp'] not in experiments:
                # annotation belongs to an experiment we were not asked to update
                num_skipped += 1
                continue
        cur.execute('INSERT INTO AnnotationListTable (idannotation, idannotationdetail, idontology) VALUES (%s, %s, %s)', [cannotation_id, cannotation_detail, new_term_id])
        num_added += 1

    debug(3, 'added new term to %d annotations (%d annotations skipped)' % (num_added, num_skipped))
    _write_log(log_file, 'add_term_to_annotation for old_term: %s (id: %s) to new_term: %s (id: %s)' % (old_term, old_term_id, new_term, new_term_id))
    con.commit()
    debug(3, 'done')
def main(argv):
    '''Command line entry point: delete annotations by annotation id and/or experiment id

    The annotation ids are collected from all annotations of the experiments given
    in --expids plus the ids given in --annotationids. Each annotation is passed to
    delete_annotation(); it is actually deleted only when --delete is supplied.
    When --noseq is supplied, annotations containing at least one sequence starting
    with that string are left untouched.

    Parameters
    ----------
    argv: list of str
        the command line arguments (without the program name)
    '''
    parser = argparse.ArgumentParser(description='delete_annotations version %s\ndelete sequences not in any annotation' % __version__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--port', help='postgres port', default=5432, type=int)
    parser.add_argument('--host', help='postgres host', default=None)
    parser.add_argument('--database', help='postgres database', default='dbbact')
    parser.add_argument('--user', help='postgres user', default='dbbact')
    # NOTE(review): a database password is hard-coded as the default here - consider reading it from the environment
    parser.add_argument('--password', help='postgres password', default='magNiv')
    parser.add_argument('--annotationids', help='list of annotation ids to delete (space separated)', nargs='+', type=int)
    parser.add_argument('--expids', help='list of experiment ids to delete (space separated)', nargs='+', type=int)
    parser.add_argument('--delete', help='delete the sequences', action='store_true')
    parser.add_argument('--noseq', help='delete only annotations where all sequences do not start with noseq (i.e. acgt to not delete v4)')
    parser.add_argument('--log-level', help='output level (1 verbose, 10 error)', type=int, default=3)
    args = parser.parse_args(argv)

    SetDebugLevel(args.log_level)

    con, cur = db_access.connect_db(database=args.database, user=args.user, password=args.password, port=args.port, host=args.host)

    annotationids = []
    # fill the annotations from each experiment
    if args.expids:
        for cexpid in args.expids:
            cur.execute("SELECT id from AnnotationsTable WHERE idexp=%s", [cexpid])
            annotationids.extend(cres[0] for cres in cur)
        debug(3, 'found %d annotations for the experiments' % len(annotationids))
    # and add the annotation ids supplied
    if args.annotationids is not None:
        annotationids.extend(args.annotationids)

    noseq = args.noseq
    # a single secondary cursor is reused for all sequence lookups (needed only for the noseq test)
    seq_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor) if noseq is not None else None

    for cannotationid in annotationids:
        # test if all sequences of the annotation don't start with sequence noseq
        if noseq is not None:
            badseqs = 0
            cur.execute("SELECT seqid FROM SequencesAnnotationTable WHERE annotationid=%s", [cannotationid])
            for cres in cur:
                cseqid = cres[0]
                seq_cur.execute("SELECT sequence FROM SequencesTable WHERE id=%s", [cseqid])
                seq_row = seq_cur.fetchone()
                if seq_row is None:
                    # dangling reference: the sequence id is not in SequencesTable
                    debug(5, 'sequence id %s of annotation %d not found in SequencesTable' % (cseqid, cannotationid))
                    continue
                if seq_row[0].startswith(noseq):
                    badseqs += 1
            if badseqs > 0:
                debug(5, "Annotation %d contains %d sequences starting with the noseq sequenece %s. not deleting" % (cannotationid, badseqs, noseq))
                continue

        # get the user that created the annotation
        cur.execute("SELECT iduser FROM AnnotationsTable WHERE id=%s LIMIT 1", [cannotationid])
        res = cur.fetchone()
        if res is None:
            # e.g. a wrong id supplied in --annotationids
            debug(5, 'annotation %d not found in AnnotationsTable. skipping' % cannotationid)
            continue
        cuserid = res['iduser']
        # and delete
        delete_annotation(con, cur, annotationid=cannotationid, userid=cuserid, delete=args.delete)

    if seq_cur is not None:
        seq_cur.close()
    debug(3, 'committing')
    con.commit()
    debug(3, 'done. please run delete_unused_seqs.py to remove unused sequences')
def _get_term_id(con, cur, term, fail_if_not_there=True, only_dbbact=True):
    '''Get the ontologytable id of a term given its description or term_id

    The term is first looked up as a term_id; only if no row matches is it looked
    up as a description.

    Parameters
    ----------
    con, cur
    term: str
        the term description or term_id (dbbact:XXXX) to look for
    fail_if_not_there: bool, optional
        if True, raise if no matching term exists. If False, return None instead
    only_dbbact: bool, optional
        if True, consider only rows whose term_id starts with 'dbbact:'.
        If False, every matching row is considered

    Returns
    -------
    id: int or None
        the id of the single matching term (None if not found and fail_if_not_there is False)

    Raises
    ------
    ValueError
        if no match is found (and fail_if_not_there is True) or more than one match is found
    '''
    cur.execute("SELECT * FROM ontologytable WHERE term_id=%s", [term])
    if cur.rowcount == 0:
        cur.execute("SELECT * FROM ontologytable WHERE description=%s", [term])
    rows = cur.fetchall()

    # ids of all rows that pass the (optional) dbbact-only filter
    matching_ids = [row['id'] for row in rows if not only_dbbact or row['term_id'].startswith('dbbact:')]

    if not matching_ids:
        if not fail_if_not_there:
            debug(2, 'term %s not found' % term)
            return None
        if only_dbbact:
            raise ValueError('Term %s not found in dbbact ontology. Found in %d non-dbbact' % (term, len(rows)))
        raise ValueError('Term %s not found' % term)

    if len(matching_ids) > 1:
        raise ValueError('Term %s has >1 (%d) dbBact matches' % (term, len(matching_ids)))

    found_id = matching_ids[0]
    debug(2, 'term found with 1 instance in ontologytable. id=%d' % found_id)
    return found_id
def update_old_primer_seqs(con, cur, old_primer, new_primer, commit=True):
    '''Update all sequences with primer old_primer to new primer new_primer

    Each sequence is updated using update_sequence_primer(). A failure for one
    sequence is reported via debug() and does not stop the remaining updates.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    old_primer:
        the idprimer value of the sequences to update
    new_primer: int or str
        the primer to set, passed to update_sequence_primer()
    commit: bool, optional
        if True, commit once after all the sequences were processed
    '''
    cur.execute("SELECT sequence FROM SequencesTable WHERE idprimer=%s", [old_primer])
    debug(3, 'found %d sequences with old primer %s' % (cur.rowcount, old_primer))
    # read everything first since update_sequence_primer() reuses the same cursor
    seqs = [cres['sequence'] for cres in cur.fetchall()]

    num_failed = 0
    for cseq in seqs:
        err = update_sequence_primer(con, cur, sequence=cseq, primer=new_primer, commit=False)
        if err:
            # update_sequence_primer reports problems via its return value - do not drop them silently
            num_failed += 1
            debug(5, 'failed to update primer for sequence %s: %s' % (cseq, err))
    if num_failed > 0:
        debug(5, 'primer update failed for %d out of %d sequences' % (num_failed, len(seqs)))

    if commit:
        con.commit()
def add_annotation_seq_count(con, cur):
    '''Fill the seqcount column of annotationstable

    For every annotation, the number of sequencesannotationtable rows referring to
    it is counted and stored in annotationstable.seqcount. Commits when done.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    '''
    # second cursor so the outer iteration over cur is not disturbed
    count_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

    debug(3, 'add_annotation_seq_count started')
    debug(2, 'processing annotations')
    # iterate over all annotations
    scanned = 0
    cur.execute('SELECT id FROM AnnotationsTable')
    for scanned, anno_row in enumerate(cur, start=1):
        anno_id = anno_row['id']
        count_cur.execute('SELECT COUNT(*) FROM sequencesannotationtable WHERE annotationid=%s', [anno_id])
        seq_total = count_cur.fetchone()[0]
        count_cur.execute('UPDATE annotationstable SET seqcount=%s WHERE id=%s', [seq_total, anno_id])
    debug(2, 'scanned %d annotations.' % scanned)
    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
def add_dbbact_ids(con, cur):
    '''Set the dbbact ontology id (term_id) for every term in OntologyTable that has an empty term_id

    The new term_id is 'dbbact:' followed by the row id. Intended as a one-time fix,
    since the term_id was not set automatically when new terms were added.
    Commits when done.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    '''
    debug(3, 'getting terms without ontology term_id')
    cur.execute("SELECT * FROM ontologytable WHERE term_id=''")
    missing_rows = cur.fetchall()
    debug(3, 'found %d terms' % len(missing_rows))

    for row_id in [row['id'] for row in missing_rows]:
        cur.execute('UPDATE ontologytable SET term_id=%s WHERE id=%s', ['dbbact:%s' % row_id, row_id])

    debug(3, 'committing')
    con.commit()
    debug(3, 'done')
def fix_na(con, cur, commit=False):
    '''Update the OntologyTreeStructureTable to fix the old na root term
    (which was undefined as contained many optional NAs)

    Every tree entry of a dbbact term (term_id starting with 'dbbact:') whose parent
    has the description 'na' is re-linked to the "dbbact root" term.

    Parameters
    ----------
    con, cur: dbbact psycopg2 database connection and cursor
    commit: bool, optional
        True to commit changes, False to just perform dry run

    Raises
    ------
    ValueError
        if the dbbact ontology or the "dbbact root" term are missing or do not have the expected ids
    '''
    # find the id of the dbbact ontology
    cur.execute('SELECT * FROM ontologynamestable WHERE description=%s', ['dbbact'])
    res = cur.fetchone()
    if res is None:
        raise ValueError('dbbact ontology not found in ontologynamestable')
    ontologynameid = res['id']
    if ontologynameid != 8:
        raise ValueError('strange dbbact ontologynameid: %s (instead of 8)' % ontologynameid)

    # find the dbbact root term id "dbbact root" (id 1811274)
    cur.execute('SELECT * from OntologyTable WHERE description=%s', ['dbbact root'])
    res = cur.fetchone()
    if res is None:
        raise ValueError('"dbbact root" term not found in OntologyTable')
    if res['term_id'] != 'dbbact:1811274':
        raise ValueError('"dbbact root" term_id is %s instead of dbbact:1811274' % res['term_id'])
    root_id = res['id']

    cur.execute('SELECT * FROM OntologyTable WHERE term_id LIKE %s', ['dbbact:%'])
    debug(3, 'Found %d dbbact terms' % cur.rowcount)
    res = cur.fetchall()

    # parent id -> True if the parent description is 'na'.
    # OntologyTable is not modified in the loop, so each parent needs to be queried only once
    parent_is_na = {}
    num_na_parents = 0
    for cres in res:
        cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [cres['id']])
        for ctres in cur.fetchall():
            parent_id = ctres['ontologyparentid']
            if parent_id not in parent_is_na:
                cur.execute('SELECT * FROM OntologyTable WHERE id=%s LIMIT 1', [parent_id])
                parent = cur.fetchone()
                # a parent id missing from OntologyTable is ignored
                parent_is_na[parent_id] = parent is not None and parent['description'] == 'na'
            if not parent_is_na[parent_id]:
                continue
            cur.execute('UPDATE OntologyTreeStructureTable SET ontologyparentid=%s, ontologynameid=%s WHERE uniqueid=%s', [root_id, ontologynameid, ctres['uniqueid']])
            num_na_parents += 1
    debug(4, 'updating %d dbbact terms roots' % num_na_parents)

    if commit:
        con.commit()
        debug(3, 'commited')
    debug(3, 'done')
def delete_unused_seqs(con, cur, delete=False):
    '''Delete (or just count) the sequences that do not appear in any annotation

    A sequence is unused if no SequencesAnnotationTable row refers to its id.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    delete: bool, optional
        if True, delete the unused sequences and commit.
        If False, only report how many sequences would be deleted
    '''
    debug(3, 'delete unused seqs started')

    if not delete:
        # dry run - just count the candidates
        cur.execute('SELECT * FROM SequencesTable WHERE NOT EXISTS(SELECT SequencesAnnotationTable.seqid FROM SequencesAnnotationTable WHERE SequencesAnnotationTable.seqid = SequencesTable.id)')
        num_unused = cur.rowcount
        print('NOT DELETING, but found %d sequences to delete' % num_unused)
        debug(3, 'NOT DELETING, but found %d sequences to delete' % num_unused)
        return

    cur.execute('DELETE FROM SequencesTable WHERE NOT EXISTS(SELECT SequencesAnnotationTable.seqid FROM SequencesAnnotationTable WHERE SequencesAnnotationTable.seqid = SequencesTable.id)')
    debug(3, 'deleted')
    con.commit()
def prepare_dbbact_calour_term_files(con, cur, outdir='./', include_synonyms=True):
    '''Prepare the ontology term pickle files needed for dbbact_calour new annotation term autocomplete.

    Two files are saved for each ontology (the ontology name is the part of the term_id
    before the ':', or 'dbbact' if the term_id does not contain ':'):
    ONTOLOGY.ontology.pickle: dict of {name(str): id(int)}
        name: the string displayed to the user. For a term it is "DESCRIPTION (TERM_ID)".
            For a synonym it is "SYNONYM (DESCRIPTION - TERM_ID)"
        id: the OntologyTable id of the term
    ONTOLOGY.ontology.ids.pickle: dict of {id(int): description(str)}
        id: the OntologyTable id of the term
        description: the dbbact term description

    Ontologies with fewer than 500 names are merged into a single ontology named 'other'.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    outdir: str, optional
        name of the output dir where to save the pickle files
    include_synonyms: bool, optional
        True to add also all entries from synonyms table
    '''
    debug(3, 'Counting all terms in dbBact')
    cur2 = con.cursor()
    try:
        cur2.execute('PREPARE find_syn(int) AS SELECT synonym FROM OntologySynonymTable WHERE idontology=$1')
        cur.execute('SELECT id, description, term_id FROM OntologyTable')
        num_terms_found = cur.rowcount
        debug(4, 'found %d terms' % num_terms_found)

        term_name_id = defaultdict(dict)
        term_id_term = defaultdict(dict)
        num_terms = 0
        while True:
            res = cur.fetchone()
            if res is None:
                break
            num_terms += 1
            if num_terms % 100000 == 0:
                debug(3, '%s (scanned %d/%d)' % (res, num_terms, num_terms_found))
            main_term = res['description']
            term_names = [main_term]
            ontology_name = 'dbbact'
            if ':' in res['term_id']:
                ontology_name = res['term_id'].split(':')[0]
            cterm_id = res['term_id']
            if cterm_id == '':
                cterm_id = 'dbbact:%d' % res['id']

            # also get all the synonyms for the term if needed
            if include_synonyms:
                cur2.execute('EXECUTE find_syn(%s)', [res['id']])
                if cur2.rowcount > 0:
                    term_names.extend(cres2[0] for cres2 in cur2)

            for cterm in term_names:
                if cterm != main_term:
                    # if a synonym, put the original term in the parenthesis
                    term_name_id[ontology_name]['%s (%s - %s)' % (cterm, main_term, cterm_id)] = res['id']
                else:
                    # not synonym, so no need to add the original term - just the ENVO:XXXXX etc.
                    term_name_id[ontology_name]['%s (%s)' % (cterm, cterm_id)] = res['id']
            term_id_term[ontology_name][res['id']] = res['description']

        # prepared statements live until the end of the session, so release it
        # (otherwise a second call on the same connection fails on the PREPARE)
        cur2.execute('DEALLOCATE find_syn')
    finally:
        cur2.close()

    # move small ontologies to 'other' ontology
    small_ontologies = []
    for contology in list(term_id_term.keys()):
        if contology == 'other':
            # never merge 'other' into itself (it would then be deleted)
            continue
        if len(term_name_id[contology]) < 500:
            term_name_id['other'].update(term_name_id[contology])
            term_id_term['other'].update(term_id_term[contology])
            del term_name_id[contology]
            del term_id_term[contology]
            small_ontologies.append(contology)
    print('moved %d small ontologies into "other" ontology:\n%s' % (len(small_ontologies), small_ontologies))

    # and save
    for contology in term_id_term:
        with open(os.path.join(outdir, contology + '.ontology.pickle'), 'wb') as ofl:
            pickle.dump(term_name_id[contology], ofl)
        with open(os.path.join(outdir, contology + '.ontology.ids.pickle'), 'wb') as ofl:
            pickle.dump(term_id_term[contology], ofl)
def rename_term(ctx, old_term, new_term, experiments, add_if_not_exist, ignore_no_annotations, inplace):
    '''Replace a term with another term in all annotations.

    If inplace is True, only the description of the existing term is changed.
    Otherwise the annotations (all of them, or only those from the given experiments)
    are pointed to the new term. When all annotations are updated, terms that had the
    old term as parent are also re-linked to the new term.

    Parameters
    ----------
    ctx:
        context object. ctx.obj supplies 'con', 'cur' and 'log_file'
    old_term: str
        the term to replace
    new_term: str
        the term to replace it with
    experiments: collection of experiment ids or None
        if not None and not empty, update only annotations from these experiments
    add_if_not_exist: bool
        passed to _add_dbbact_term as create_if_not_exist for new_term
    ignore_no_annotations: bool
        if False, raise when no annotation contains old_term
    inplace: bool
        if True, rename the description of old_term instead of updating annotations

    Raises
    ------
    ValueError
        on an in place rename combined with experiments, a missing old term, an already
        existing new term (in place), or no annotations (unless ignore_no_annotations)
    '''
    settings = ctx.obj
    con, cur, log_file = settings['con'], settings['cur'], settings['log_file']

    old_term, new_term = old_term.lower(), new_term.lower()
    debug(3, 'rename term %s to term %s' % (old_term, new_term))

    # not sure if multiple provides None or [], so let's make it None
    if experiments is not None and len(experiments) == 0:
        experiments = None
    partial_update = experiments is not None
    if partial_update and inplace:
        raise ValueError('Cannot replcae in place in a subset of experiments.')

    old_term_id = _get_term_id(con, cur, old_term, only_dbbact=False)
    if old_term_id is None:
        raise ValueError('Term %s does not exist' % old_term)

    if inplace:
        # the new name must not collide with an existing description or term_id
        cur.execute('SELECT * FROM OntologyTable WHERE description=%s', [new_term])
        if cur.rowcount > 0:
            raise ValueError('new term %s already exists as description' % new_term)
        cur.execute('SELECT * FROM OntologyTable WHERE term_id=%s', [new_term])
        if cur.rowcount > 0:
            raise ValueError('new term %s already exists as term_id' % new_term)
        cur.execute('UPDATE OntologyTable SET description=%s WHERE id=%s', [new_term, old_term_id])
        _write_log(log_file, 'rename_term for old_term: %s (id: %s) to new_term: %s in place' % (old_term, old_term_id, new_term))
        con.commit()
        debug(3, 'done')
        return

    new_term_id = _add_dbbact_term(con, cur, new_term, create_if_not_exist=add_if_not_exist, only_dbbact=False)

    # get all annotations with the old term
    cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology=%s', [old_term_id])
    if cur.rowcount == 0 and not ignore_no_annotations:
        raise ValueError('No annotations found containing term %s' % old_term)
    debug(3, 'found %d annotations with the term %s' % (cur.rowcount, old_term))

    if not partial_update:
        # update to the new term in all annotations
        cur.execute('UPDATE AnnotationListTable SET idontology=%s WHERE idontology=%s', [new_term_id, old_term_id])

        # update the ontology parents table - only if we did not do a partial update
        cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [old_term_id])
        if cur.rowcount > 0:
            debug(3, 'Found %d terms with %s as parent term. Updating' % (cur.rowcount, old_term))
            for child_row in cur.fetchall():
                cur.execute('UPDATE OntologyTreeStructureTable SET ontologyparentid=%s WHERE uniqueid=%s', [new_term_id, child_row['uniqueid']])
    else:
        matched_annotations = 0
        unmatched_annotations = 0
        matched_exps = set()
        unmatched_exps = set()
        experiments = set(experiments)
        for anno_row in cur.fetchall():
            anno_id = anno_row['idannotation']
            cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [anno_id])
            if cur.rowcount == 0:
                debug(7, 'experiment ID %s not found! skipping' % anno_id)
                continue
            exp_id = cur.fetchone()['idexp']
            if exp_id not in experiments:
                unmatched_annotations += 1
                unmatched_exps.add(exp_id)
                continue
            matched_annotations += 1
            matched_exps.add(exp_id)
            cur.execute('UPDATE AnnotationListTable SET idontology=%s WHERE idontology=%s AND idannotation=%s', [new_term_id, old_term_id, anno_id])
        debug(3, 'found %d annotations (%d experiments) with a matching expid, %d (%d) without' % (matched_annotations, len(matched_exps), unmatched_annotations, len(unmatched_exps)))

    _write_log(log_file, 'rename_term for old_term: %s (id: %s) to new_term: %s (id: %s)' % (old_term, old_term_id, new_term, new_term_id))
    con.commit()
    debug(3, 'done')
def update_sequence_primer(con, cur, sequence, primer, commit=True):
    '''Update the primer region for the sequence.

    If the sequence already appears in dbBact with the requested primer region, the
    annotations of all other entries of the same sequence are moved to that entry
    and the other entries are deleted. Otherwise the first entry found is updated to
    the requested region and the remaining entries are merged into it.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    sequence: str
        the exact sequence to update (acgt)
    primer: int or str
        the primer region id (int) or name (str - i.e. 'v4') to update
    commit: bool, optional
        if True, commit after update

    Returns
    -------
    error (str) or ''
    '''
    debug(2, 'update_sequence_primer for sequence %s new region %s' % (sequence, primer))

    # setup the primer to be the id
    if not isinstance(primer, int):
        primer = GetIdFromName(con, cur, primer)

    # get the sequence id. Note we use idprimer=None since we don't want to look for the new region
    err, all_ids = GetSequenceId(con, cur, sequence=sequence, idprimer=None, no_shorter=True, no_longer=True, seq_translate_api=None)
    if err:
        return err
    debug(2, 'found %d total matches to the sequence' % len(all_ids))
    if len(all_ids) == 0:
        msg = 'trying to update sequence %s failed since it is not in SequencesTable' % sequence
        debug(4, msg)
        return msg

    # do we also have the same sequence with the correct primer?
    err, region_matches = GetSequenceId(con, cur, sequence=sequence, idprimer=primer, no_shorter=True, no_longer=True, seq_translate_api=None)
    # a 'primer mismatch' error is expected here and is not treated as a failure
    if err and err != 'primer mismatch':
        debug(5, err)
        return err

    if len(region_matches) != 0:
        debug(3, 'found good sequence id %s. transferring annotations to id' % region_matches)
        if len(region_matches) > 1:
            debug(4, 'strange. found %d exact matches including region' % len(region_matches))
        target_id = region_matches[0]
    else:
        # no region matches so choose the first, update it, and move all the others to it
        debug(1, 'could not find sequence with good region. chose seqid %d and updating it' % all_ids[0])
        target_id = all_ids[0]
        cur.execute('UPDATE SequencesTable SET idprimer=%s WHERE id=%s', [primer, target_id])

    # now transfer all annotations from the wrong region sequence to the ok (match) sequence
    # and delete the wrong region sequences
    for other_id in all_ids:
        if other_id != target_id:
            debug(4, 'moving seqid %d to ok sequence %d and deleting' % (other_id, target_id))
            cur.execute('UPDATE SequencesAnnotationTable SET seqid=%s WHERE seqid=%s', [target_id, other_id])
            cur.execute('DELETE FROM SequencesTable WHERE id=%s', [other_id])

    if commit:
        debug(3, 'committing')
        con.commit()
    debug(1, 'update finished')
    return ''
def add_parent(ctx, term, parent, add_if_not_exist, old_parent, only_dbbact):
    '''Link a dbBact ontology term to a dbBact parent term.
    If the parent term does not exist, dbBact creates it (when add_if_not_exist is set)

    Parameters
    ----------
    ctx:
        context object. ctx.obj supplies 'con', 'cur', 'commit' and 'log_file'
    term: str
        the term to add the parent to
    parent: str
        the parent term
    add_if_not_exist: bool
        passed to _add_dbbact_term as create_if_not_exist for the parent term
    old_parent: str
        what to do if the term already has parents (other than "dbbact root"):
        'replace': remove the (single) old parent and link to the new parent
        'insert': put the new parent between the term and its (single) old parent
        'ignore': keep the old parents and add the new parent
        'fail': raise ValueError
    only_dbbact: bool
        passed to _add_dbbact_term for the parent term

    Raises
    ------
    ValueError
        if the dbbact ontology id is not 8, or the existing parents cannot be handled
        according to old_parent
    '''
    con = ctx.obj['con']
    cur = ctx.obj['cur']
    commit = ctx.obj['commit']
    log_file = ctx.obj['log_file']

    term = term.lower()
    parent = parent.lower()
    debug(3, 'add parent %s to term %s' % (parent, term))
    term_id = _get_term_id(con, cur, term)
    parent_term_id = _add_dbbact_term(con, cur, parent, create_if_not_exist=add_if_not_exist, only_dbbact=only_dbbact)

    # to be safe, get the dbBact ontology number
    cur.execute('SELECT id FROM ontologynamestable WHERE description=%s', ['dbbact'])
    res = cur.fetchone()
    if res is None:
        raise ValueError('dbbact ontology not found in ontologynamestable')
    ontology_database_id = res[0]
    debug(2, 'dbBact database id=%s' % ontology_database_id)
    if ontology_database_id != 8:
        raise ValueError('dbbact id is not 8! it is %d' % ontology_database_id)

    # check if it had "dbbact root" (id 1811274) as parent - remove it
    cur.execute('DELETE FROM ontologytreestructuretable WHERE ontologynameid=8 AND ontologyparentid=1811274 AND ontologyid=%s', [term_id])

    cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [term_id])
    if cur.rowcount > 0:
        num_old_parents = cur.rowcount
        debug(3, 'old parents (%d) found for term' % num_old_parents)
        if old_parent == 'replace':
            if num_old_parents > 1:
                raise ValueError('More than 1 parent for term (%d). Cannot replace.' % num_old_parents)
            # remove the old parent
            res = cur.fetchone()
            cur.execute('DELETE FROM ontologytreestructuretable WHERE uniqueid=%s', [res['uniqueid']])
        elif old_parent == 'insert':
            if num_old_parents > 1:
                raise ValueError('More than 1 parent for term (%d). Cannot insert.' % num_old_parents)
            # add our parent term in the middle
            res = cur.fetchone()
            cur.execute('SELECT term_id FROM ontologytable WHERE id=%s', [res['ontologyparentid']])
            idres = cur.fetchone()
            if idres is None:
                raise ValueError('Old parent id %s not found in ontologytable. Cannot insert.' % res['ontologyparentid'])
            ctx.invoke(add_parent, term=parent, parent=idres['term_id'], add_if_not_exist=False, old_parent='fail')
            # and remove the old parent connection
            cur.execute('DELETE FROM ontologytreestructuretable WHERE uniqueid=%s', [res['uniqueid']])
        elif old_parent == 'ignore':
            # debug() takes the message level as first argument
            debug(3, 'term already has parents (%d). Ignoring and adding new parent' % num_old_parents)
        elif old_parent == 'fail':
            raise ValueError('Parents (%d) already exists for term. To override use the old-parent option' % num_old_parents)

    # add to the OntologyTreeStructureTable
    cur.execute('INSERT INTO ontologytreestructuretable (ontologyid, ontologyparentid, ontologynameid) VALUES (%s, %s, %s)', [term_id, parent_term_id, ontology_database_id])
    debug(3, 'Inserted into ontologytreestructuretable')
    if commit:
        _write_log(log_file, 'add_parent for term: %s (id: %s) parent: %s (id: %s)' % (term, term_id, parent, parent_term_id))
        con.commit()
    else:
        debug(5, 'dry run - not commiting')
    debug(3, 'done')
def term_info(ctx, term, partial, no_parent):
    '''Get information about a dbBact term

    Prints, for each matching term, its parents, children, the number of annotations
    containing it and the names of the experiments these annotations belong to.

    Parameters
    ----------
    ctx:
        context object. ctx.obj supplies 'cur' (the database cursor)
    term: str
        the term_id or description to look for (lower-cased before lookup)
    partial: bool
        if True (and the term is not found as a term_id), match all descriptions
        starting with term
    no_parent: bool
        if True, skip terms that have a parent other than "dbbact root" (dbbact:1811274)
    '''
    cur = ctx.obj['cur']

    term = term.lower()
    debug(3, 'term-info for term %s' % term)

    cur.execute('SELECT * FROM ontologytable WHERE term_id=%s', [term])
    if cur.rowcount == 0:
        if partial:
            cur.execute('SELECT * FROM ontologytable WHERE description LIKE %s', [term + '%'])
        else:
            cur.execute('SELECT * FROM ontologytable WHERE description=%s', [term])

    res = cur.fetchall()
    for cres in res:
        cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s', [cres['id']])
        has_real_parent = False
        all_parents = []
        for cparent in cur.fetchall():
            cur.execute('SELECT * FROM OntologyTable WHERE id=%s LIMIT 1', [cparent['ontologyparentid']])
            cinfo = cur.fetchone()
            if cinfo is None:
                # tree entry pointing to a term that is not in OntologyTable
                debug(5, 'parent id %s not found in OntologyTable' % cparent['ontologyparentid'])
                continue
            all_parents.append('%s (%s)' % (cinfo['description'], cinfo['term_id']))
            # "dbbact root" does not count as a real parent
            if cinfo['term_id'] != 'dbbact:1811274':
                has_real_parent = True
        if has_real_parent and no_parent:
            continue

        print('\n*******************')
        print('TERM: %s (TERM_ID: %s )' % (cres['description'], cres['term_id']))
        print(list(cres.items()))
        print('===================')
        print('PARENTS:')
        for cparent in all_parents:
            print(cparent)

        print('CHILDREN:')
        cur.execute('SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s', [cres['id']])
        for cchild in cur.fetchall():
            cur.execute('SELECT * FROM ontologytable WHERE id=%s LIMIT 1', [cchild['ontologyid']])
            cchilddet = cur.fetchone()
            if cchilddet is None:
                debug(5, 'child id %s not found in ontologytable' % cchild['ontologyid'])
                continue
            print(cchilddet['description'])

        print('ANNOTATIONS:')
        cur.execute('SELECT idannotation FROM AnnotationListTable WHERE idontology = %s', [cres['id']])
        annotation_ids = [cres2['idannotation'] for cres2 in cur.fetchall()]
        print('total %d annotations' % len(annotation_ids))

        # collect the experiment ids first so each experiment is queried only once
        exp_ids = set()
        for canno in annotation_ids:
            cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [canno])
            res2 = cur.fetchone()
            if res2 is None:
                debug(5, 'annotation id %s not found in AnnotationsTable' % canno)
                continue
            exp_ids.add(res2['idexp'])

        exp_names = set()
        for cexpid in exp_ids:
            cur.execute('SELECT * FROM ExperimentsTable WHERE expid=%s', [cexpid])
            for cexp in cur.fetchall():
                if cexp['type'] != 'name':
                    continue
                exp_names.add('%s (expid: %s)' % (cexp['value'], cexp['expid']))
        print('----------------')
        print('Experiments:')
        # sorted so the output is the same between runs
        for cname in sorted(exp_names):
            print(cname)
def update_term_info_old(con, cur):
    '''Rebuild TermInfoTable by scanning the annotations of each term (old, per-term implementation)

    For every term in OntologyTable two rows are inserted into TermInfoTable:
    one named as the term (annotations where the annotation detail type is not 2)
    and one named '-' + term (annotation detail type 2, i.e. "LOWER IN"), each with
    the number of distinct experiments and annotations. Commits when done.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    '''
    list_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
    anno_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
    insert_sql = 'INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)'
    progress_msg = 'processed term %d: %s. pos exps %d, pos anno %d, neg exps %d, neg anno %d'

    debug(3, 'update_term_info started')
    debug(2, 'dropping old TermInfoTable')
    cur.execute('DELETE FROM TermInfoTable')
    debug(2, 'processing terms')
    cur.execute('SELECT id,description FROM OntologyTable')
    for idx, term_row in enumerate(cur):
        pos_exps, neg_exps = set(), set()
        pos_annos, neg_annos = set(), set()
        term_id = term_row['id']
        term_name = term_row['description']

        # get all the annotations containing this term
        list_cur.execute('SELECT idannotation,idannotationdetail FROM AnnotationListTable WHERE idontology=%s', [term_id])
        for entry in list_cur:
            detail_type = entry['idannotationdetail']
            anno_id = entry['idannotation']
            # get more info about the annotation
            anno_cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [anno_id])
            exp_id = anno_cur.fetchone()['idexp']
            # if it's "LOWER IN cterm" it is neg
            exps, annos = (neg_exps, neg_annos) if detail_type == 2 else (pos_exps, pos_annos)
            exps.add(exp_id)
            annos.add(anno_id)

        list_cur.execute(insert_sql, [term_name, len(pos_exps), len(pos_annos), 'single'])
        list_cur.execute(insert_sql, ['-' + term_name, len(neg_exps), len(neg_annos), 'single'])

        stats = (idx, term_name, len(pos_exps), len(pos_annos), len(neg_exps), len(neg_annos))
        if idx % 1000 == 0:
            debug(2, progress_msg % stats)
        if term_name == 'small village':
            debug(2, progress_msg % stats)

    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
def _add_term_info_counts(cur, term, num_exps, num_anno):
    '''Insert the counts for term into TermInfoTable, adding them to an existing row of the same name'''
    # if the term was already added (so same term name with 2 different term_ids
    # (from 2 ontologies) in different annotations) we want to agglomerate the count
    cur.execute('SELECT TotalExperiments, TotalAnnotations FROM TermInfoTable WHERE term=%s LIMIT 1', [term])
    if cur.rowcount > 0:
        res = cur.fetchone()
        debug(2, 'already found %s' % term)
        num_exps += res[0]
        num_anno += res[1]
        cur.execute('DELETE FROM TermInfoTable WHERE term=%s', [term])
    cur.execute('INSERT INTO TermInfoTable (term, TotalExperiments, TotalAnnotations,TermType) VALUES (%s, %s, %s, %s)', [term, num_exps, num_anno, 'single'])


def update_term_info(con, cur):
    '''Rebuild TermInfoTable by scanning all annotations

    For every term appearing in an annotation, the number of distinct experiments and
    annotations is stored in TermInfoTable. Annotation entries with annotation detail
    type 2 ("LOWER IN") are stored under the name '-' + term, all others under the term
    name. A row is written only if the term has at least one entry of that kind.
    Commits when done.

    Parameters
    ----------
    con, cur:
        database connection and cursor
    '''
    cur2 = con.cursor(cursor_factory=psycopg2.extras.DictCursor)

    debug(3, 'update_term_info started')
    debug(2, 'dropping old TermInfoTable')
    cur.execute('DELETE FROM TermInfoTable')
    debug(2, 'processing annotations')
    term_pos_exps = defaultdict(set)
    term_neg_exps = defaultdict(set)
    term_pos_anno = defaultdict(set)
    term_neg_anno = defaultdict(set)
    all_term_ids = set()
    # iterate all annotations / annotationsdetails
    cur.execute('SELECT id, idexp FROM AnnotationsTable')
    for idx, cres in enumerate(cur):
        cannoid = cres['id']
        cexp = cres['idexp']
        if idx % 1000 == 0:
            debug(2, 'processing annotation %d' % cannoid)
        cur2.execute('SELECT idontology, idannotationdetail FROM AnnotationListTable WHERE idannotation=%s', [cannoid])
        for cdres in cur2:
            ctype = cdres['idannotationdetail']
            ctermid = cdres['idontology']
            all_term_ids.add(ctermid)
            # if LOWER IN
            if ctype == 2:
                term_neg_exps[ctermid].add(cexp)
                term_neg_anno[ctermid].add(cannoid)
            else:
                term_pos_exps[ctermid].add(cexp)
                term_pos_anno[ctermid].add(cannoid)

    debug(3, 'Found %d terms' % len(all_term_ids))
    debug(2, 'adding stats to TermInfoTable')
    for ctermid in all_term_ids:
        cur2.execute('SELECT description FROM OntologyTable WHERE id=%s LIMIT 1', [ctermid])
        if cur2.rowcount == 0:
            debug(5, 'no term name in OntologyTable for termid %d. skipping' % ctermid)
            continue
        cterm = cur2.fetchone()[0]
        # NOTE: test membership BEFORE indexing - indexing a defaultdict inserts the
        # missing key, which would make the membership test always succeed
        if ctermid in term_pos_exps:
            _add_term_info_counts(cur2, cterm, len(term_pos_exps[ctermid]), len(term_pos_anno[ctermid]))
        if ctermid in term_neg_exps:
            _add_term_info_counts(cur2, '-' + cterm, len(term_neg_exps[ctermid]), len(term_neg_anno[ctermid]))

    debug(2, 'committing')
    con.commit()
    debug(3, 'done')
def update_obsolete_terms(con, cur, ontofilename, ontology_name=None, commit=True):
    '''Replace obsolete terms as indicated by "replaced_by" in the new ontology.

    This is done by updating the annotations - the old term is replaced by the new term.
    The replaced_by field of the obsolete term in OntologyTable is also set to the new term id.
    NOTE: if ontology_name is supplied, a term is replaced only if it does not participate
    in any other ontology (in the tree structure)

    Parameters
    ----------
    con, cur:
        dbbact psycopg2 database connection and cursor
    ontofilename : str
        name of the .obo ontology file to read the obsolete terms from
    ontology_name : str or None, optional
        if not None, update only terms that appear only in this ontology tree (i.e. 'silva')
    commit : bool, optional
        True to commit the changes to the database

    Raises
    ------
    ValueError
        if ontology_name is supplied but not found in OntologyNamesTable
    '''
    # we need 2 phases since in dbbact we store the name, whereas the replaced_by stores the id
    if ontology_name is not None:
        cur.execute(
            'SELECT id FROM OntologyNamesTable WHERE description=%s LIMIT 1',
            [ontology_name])
        if cur.rowcount == 0:
            raise ValueError(
                'ontology %s not found in OntologyNamesTable. stopping' % ontology_name)
        ontology_name_id = cur.fetchone()[0]
    else:
        ontology_name_id = None

    debug(3, 'phase 1: getting obsolete terms')
    # phase1 - get the required ids
    # maps the (lower case) id of the replacing term to the names of the obsolete terms it replaces
    ids_to_get = defaultdict(list)
    num_obsolete = 0
    num_to_replace = 0
    # keep the file open for the whole loop since the parser may read it lazily
    with open(ontofilename) as obo_file:
        for citem in oboparse.Parser(obo_file):
            tags = citem.tags
            # just obsolete terms
            if "is_obsolete" not in tags:
                continue
            if tags["is_obsolete"][0].lower() != 'true':
                continue
            num_obsolete += 1
            # and we need the replaced_by field
            if "replaced_by" not in tags:
                continue
            replaced_id = tags['replaced_by'][0].lower()
            if replaced_id == 'false':
                continue
            if "name" not in tags:
                continue
            orig_name = tags['name'][0].lower()
            # drop the first 'obsolete ' marker from the term name
            orig_name = re.sub('obsolete ', '', orig_name, count=1)
            ids_to_get[replaced_id].append(orig_name)
            num_to_replace += 1
    debug(3, 'found %d obsolete terms. %d to replace, with %d new term ids' % (num_obsolete, num_to_replace, len(ids_to_get)))

    debug(3, 'phase2: replacing original terms in annotations')
    # phase2: go over all terms, and if in list, replace these new values instead of the old ones
    with open(ontofilename) as obo_file:
        for citem in oboparse.Parser(obo_file):
            tags = citem.tags
            cid = tags["id"][0].lower()
            if cid not in ids_to_get:
                continue
            if 'name' not in tags:
                debug(4, 'need to replace with term %s but no name supplied' % cid)
                continue
            cname = tags['name'][0]
            cur.execute(
                'SELECT id FROM OntologyTable WHERE description=%s LIMIT 1', [cname])
            if cur.rowcount == 0:
                debug(6, 'new term %s not found in ontology table' % cname)
                continue
            contoid = cur.fetchone()[0]
            for cobsolete_term in ids_to_get[cid]:
                cur.execute(
                    'SELECT id FROM OntologyTable WHERE description=%s LIMIT 1',
                    [cobsolete_term])
                if cur.rowcount == 0:
                    debug(6, 'obsolete term %s for new term %s not found in ontology table' % (cobsolete_term, cname))
                    continue
                cobsolete_id = cur.fetchone()[0]
                if ontology_name_id is not None:
                    # make sure the obsolete term does not participate in other ontologies
                    cur.execute(
                        'SELECT ontologynameid FROM OntologyTreeStructureTable WHERE ontologyid=%s AND ontologynameid!=%s',
                        [cobsolete_id, ontology_name_id])
                    if cur.rowcount > 0:
                        debug(6, 'obsolete term %s participates in other ontologies. skipping' % cobsolete_term)
                        continue
                cur.execute(
                    'UPDATE OntologyTable SET replaced_by=%s WHERE id=%s',
                    [contoid, cobsolete_id])
                debug(3, 'for term %s (%d) replace with term %s (%d)' % (cobsolete_term, cobsolete_id, cname, contoid))
                cur.execute(
                    'SELECT idannotation, idannotationdetail FROM AnnotationListTable WHERE idontology=%s',
                    [cobsolete_id])
                debug(3, 'got %d annotations with this term' % cur.rowcount)
                res = cur.fetchall()
                for cres in res:
                    cidannotation = cres['idannotation']
                    cidannotationdetail = cres['idannotationdetail']
                    # add the new term to the annotation (unless already there) and then remove the old one
                    cur.execute(
                        'SELECT * FROM AnnotationListTable WHERE idannotation=%s AND idannotationdetail=%s AND idontology=%s',
                        [cidannotation, cidannotationdetail, contoid])
                    if cur.rowcount == 0:
                        cur.execute(
                            'INSERT INTO AnnotationListTable (idannotation, idannotationdetail, idontology) VALUES (%s, %s, %s)',
                            [cidannotation, cidannotationdetail, contoid])
                    else:
                        debug(5, 'entry already exists for annotation %d' % cidannotation)
                    cur.execute(
                        'DELETE FROM AnnotationListTable WHERE idannotation=%s AND idannotationdetail=%s AND idontology=%s',
                        [cidannotation, cidannotationdetail, cobsolete_id])
                debug(3, 'did it for term %s replace with term %s' % (cobsolete_term, cname))
    if commit:
        con.commit()
    debug(3, 'done')
def add_seq_counts(con, cur):
    '''Update the total_annotations and total_experiments fields of SequencesTable

    For each sequence in SequencesAnnotationTable, collect the distinct annotations it
    appears in and the distinct experiments these annotations belong to, and store
    the two counts in SequencesTable. The changes are committed at the end.

    Parameters
    ----------
    con, cur:
        dbbact psycopg2 database connection and cursor
    '''
    # separate cursor for the per-annotation lookups while cur is being iterated
    lookup_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
    experiments_by_seq = defaultdict(set)
    annotations_by_seq = defaultdict(set)
    debug(3, 'add_seq_counts started')
    debug(2, 'processing sequences')
    cur.execute('SELECT seqid,annotationid FROM SequencesAnnotationTable')
    for row in cur:
        seq_id, anno_id = row['seqid'], row['annotationid']
        lookup_cur.execute('SELECT idexp FROM AnnotationsTable WHERE id=%s LIMIT 1', [anno_id])
        exp_row = lookup_cur.fetchone()
        if lookup_cur.rowcount == 0:
            debug(5, 'sequence %s annotationid %s does not exist in annotationstable' % (seq_id, anno_id))
            continue
        experiments_by_seq[seq_id].add(exp_row[0])
        seen_annotations = annotations_by_seq[seq_id]
        if anno_id in seen_annotations:
            debug(5, 'sequence %s already associated with annotation %s' % (seq_id, anno_id))
        seen_annotations.add(anno_id)

    debug(2, 'found data for %d sequences' % len(experiments_by_seq))
    debug(2, 'adding total_annotations, total_experiments to SequencesTable')
    update_sql = 'UPDATE SequencesTable SET total_annotations=%s, total_experiments=%s WHERE id=%s'
    for seq_id, anno_ids in annotations_by_seq.items():
        cur.execute(update_sql, [len(anno_ids), len(experiments_by_seq[seq_id]), seq_id])
    con.commit()
    debug(3, 'done')
def delete_unused_terms(con, cur, commit=True):
    '''Remove terms from OntologyTable that nothing refers to

    A term is considered unused if it does not appear in any annotation
    (AnnotationListTable) and is neither a child nor a parent in
    OntologyTreeStructureTable. The synonyms of an unused term are deleted with it.

    Parameters
    ----------
    con, cur:
        dbbact psycopg2 database connection and cursor
    commit: bool, optional
        True to commit the changes to the database.
        False to issue the deletes on the cursor without committing them
    '''
    # a term is kept as soon as one of these queries returns a row (checked in this order):
    # used in an annotation / child in the ontology tree / parent in the ontology tree
    usage_queries = (
        'SELECT * FROM AnnotationListTable WHERE idontology=%s',
        'SELECT * FROM OntologyTreeStructureTable WHERE ontologyid=%s',
        'SELECT * FROM OntologyTreeStructureTable WHERE ontologyparentid=%s',
    )
    debug(3, 'deleting unused terms')
    cur.execute('SELECT id, description FROM OntologyTable')
    all_terms = cur.fetchall()
    debug(3, 'found %d terms' % len(all_terms))

    num_deleted = 0
    for term_row in all_terms:
        term_id = term_row['id']
        for query in usage_queries:
            cur.execute(query, [term_id])
            if cur.rowcount > 0:
                break
        else:
            # no query matched, so the term is not used - delete its synonyms and then the term
            cur.execute('DELETE FROM OntologySynonymTable WHERE idontology=%s', [term_id])
            cur.execute('DELETE FROM OntologyTable WHERE id=%s', [term_id])
            num_deleted += 1

    debug(3, 'found %d unused terms to delete' % num_deleted)
    if not commit:
        debug(4, 'not committing changes. nothing was deleted')
    else:
        con.commit()
        debug(3, 'committed')
    debug(3, 'done')
def add_primer_to_annotations(con, cur, update_all=False, commit=True):
    '''Update the primerID field in the AnnotationsTable according to the sequences in the annotation

    An annotation gets the primer id shared by all of its sequences. Annotations with no
    sequences, with sequences from more than one primer region, or with a sequence that
    is missing from SequencesTable are skipped (their primerID is left unchanged).

    Parameters
    ----------
    con, cur:
        dbbact psycopg2 database connection and cursor
    update_all: bool, optional
        if True, update all annotations.
        If False, update only annotations with 'na' (primerID=0) in the primerId field)
    commit: bool, optional
        True to commit changes to database
    '''
    if update_all:
        cur.execute('SELECT id FROM AnnotationsTable')
    else:
        cur.execute('SELECT id FROM AnnotationsTable WHERE PrimerID=0')
    res = cur.fetchall()
    for cres in res:
        cid = cres['id']
        cur.execute(
            'SELECT seqID from SequencesAnnotationTable WHERE annotationID=%s', [cid])
        res2 = cur.fetchall()
        cprimerid = None
        for cres2 in res2:
            cseqid = cres2['seqid']
            cur.execute(
                'SELECT idPrimer from SequencesTable WHERE id=%s LIMIT 1', [cseqid])
            res3 = cur.fetchone()
            if res3 is None:
                # the sequence has no row in SequencesTable so its region is unknown.
                # treat the annotation as undetermined instead of failing on res3[...]
                debug(7, 'sequence %s of annotation %d not found in SequencesTable' % (cseqid, cid))
                cprimerid = None
                break
            if cprimerid is None:
                cprimerid = res3['idprimer']
            if res3['idprimer'] != cprimerid:
                debug(8, 'annotation %d contains sequences from two different regions' % cid)
                cprimerid = None
                break
        if cprimerid is None:
            debug(7, "didn't find primer region for annotation %d. skipping" % cid)
            continue
        debug(2, 'annotation %d primer region %d' % (cid, cprimerid))
        cur.execute('UPDATE AnnotationsTable SET primerID=%s WHERE id=%s', [cprimerid, cid])
    # report the number of annotations examined (the last loop index is one less than that)
    debug(3, 'found %d annotations' % len(res))
    if commit:
        debug(3, 'committing changes to database')
        con.commit()
    debug(3, 'finished')