def load_data():
    """Load EC numbers and their URLs from an input file into the database.

    The input path (``file_to_load``) and the log path (``log_file``) are not
    parameters; they are names resolved outside this function (presumably
    module-level settings -- confirm against the rest of the script).

    Each accepted line is split into an EC number (first token) and a
    description (remaining tokens), handed to ``load_ec`` and then to
    ``load_ec_url``.  The session is committed once, after both files have
    been closed.

    Raises:
        AttributeError: if the 'BRENDA' or 'ExPASy' Source row is missing
            (``one_or_none()`` returns ``None``).
    """
    nex_session = get_nex_session()

    brenda = nex_session.query(Source).filter_by(format_name='BRENDA').one_or_none()
    b_source_id = brenda.source_id
    expasy = nex_session.query(Source).filter_by(format_name='ExPASy').one_or_none()
    e_source_id = expasy.source_id

    # Context managers guarantee both handles are closed even when one of the
    # loader calls raises part-way through the file.
    with open(log_file, "w") as fw, open(file_to_load) as f:
        for line in f:
            line = line.strip()
            # Skip anything too short or whose second character is not ".".
            if len(line) < 8 or line[1] != ".":
                continue
            line = line.replace(". ", ".")
            # split() with no separator collapses runs of whitespace, so the
            # first token is the EC number and the rest is the description.
            pieces = line.split()
            ec = pieces.pop(0)
            desc = ' '.join(pieces)
            ec_id = load_ec(nex_session, fw, ec, desc, e_source_id)
            load_ec_url(nex_session, fw, ec_id, ec, e_source_id, b_source_id)

    # nex_session.rollback()
    nex_session.commit()
def load_data(data_file, log_file): nex_session = get_nex_session() sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none() source_id = sgd.source_id pmid_to_reference_id = dict([ (x.pmid, x.dbentity_id) for x in nex_session.query(Referencedbentity).all() ]) name_to_locus_id = dict([(x.systematic_name, x.dbentity_id) for x in nex_session.query(Locusdbentity).all()]) taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id) for x in nex_session.query(Taxonomy).all()]) eco_to_id = dict([(x.ecoid, x.eco_id) for x in nex_session.query(Eco).all()]) goid_to_id = dict([(x.goid, x.go_id) for x in nex_session.query(Go).all()]) key_to_annotation = {} for x in nex_session.query(Regulationannotation).all(): happens_during = x.happens_during if x.happens_during is not None else '' key = (x.target_id, x.regulator_id, x.taxonomy_id, x.reference_id, x.eco_id, x.regulator_type, x.regulation_type, x.annotation_type, happens_during) key_to_annotation[key] = x strain_to_taxid = get_strain_taxid_mapping() fw = open(log_file, "w") loaded = {} f = open(data_file) for line in f: if line.startswith('Regulator'): continue pieces = line.strip().split("\t") regulator_id = name_to_locus_id.get(pieces[0].strip()) if regulator_id is None: print "The regulator name: ", pieces[0], " is not in the database." continue target_id = name_to_locus_id.get(pieces[3].strip()) if target_id is None: print "The target name: ", pieces[3], " is not in the database." continue strain = pieces[5].strip() if strain == 'CEN.PK': strain = 'CENPK' taxid = strain_to_taxid.get(strain) if taxid is None: print "The strain name: ", pieces[ 5], " is not in the mapping module." continue taxonomy_id = taxid_to_taxonomy_id.get(taxid) if taxonomy_id is None: print "The taxid: ", taxid, " is not in the database." 
continue happens_during = '' if pieces[8]: happens_during = goid_to_id.get(pieces[8].strip().split(' ')[0]) if happens_during is None: print "Unknown GOID: ", pieces[8].strip().split(' ')[0] continue reference_id = pmid_to_reference_id.get(int(pieces[10])) if reference_id is None: print "The pmid: ", pieces[10], " is not in the database" continue regulator_type = pieces[2].strip() direction = pieces[6].strip() regulation_type = pieces[7].strip() annotation_type = pieces[11].strip() created_by = pieces[12].strip() if regulator_type not in allowable_regulator_type: print "Unknown regulator_type: ", regulator_type continue if regulation_type not in allowable_regulation_type: print "Unknown regulation_type: ", regulation_type continue if direction and direction not in allowable_regulation_direction: print "Unknown regulation_direction: ", direction continue if annotation_type not in allowable_annotation_type: print "Unknown annotation_type: ", annotation_type if regulation_type == 'protein activity' and regulator_type in [ 'transcription factor', 'chromatin modifier' ]: print "regulator_type in (transcription factor, chromatin modifier) cannot be used with regulation_type = 'protein activity'. See line below:" print line continue if regulator_type == 'protein modifier' and regulation_type == 'regulation of transcription': print "regulator_type = 'protein modifier' cannot be used with regulation_type = 'regulation of transcription'. See line below:" print line continue eco_items = pieces[9].strip().split(',') for eco_item in eco_items: eco_id = eco_to_id.get(eco_item.strip().split(' ')[0]) if eco_id is None: print "The ECO code: ", pieces[9], " is not in the database." 
continue key = (target_id, regulator_id, taxonomy_id, reference_id, eco_id, regulator_type, regulation_type, annotation_type, happens_during) if key in loaded: print "Same row exists: ", loaded[key] print "Same row exists: ", line continue loaded[key] = line if key in key_to_annotation: x = key_to_annotation[key] direction_DB = x.direction if direction_DB is None: direction_DB = '' if direction_DB == direction: fw.write("IN database: " + line.strip() + " KEY=" + str(key) + " direction_in_db=" + str(x.direction) + "\n") continue ## update if x.direction is None: if direction: x.direction = direction elif x.direction != direction: x.direction = direction nex_session.add(x) nex_session.commit() fw.write("The direction has been updated for key=" + str(key) + "\n") else: insert_a_row(nex_session, fw, source_id, target_id, regulator_id, eco_id, reference_id, taxonomy_id, regulator_type, regulation_type, annotation_type, direction, happens_during, created_by)
def load_data(): nex_session = get_nex_session() bud_id_to_reference_id = dict([(x.bud_id, x.dbentity_id) for x in nex_session.query(Referencedbentity).all()]) name_to_locus_id = dict([(x.systematic_name, x.dbentity_id) for x in nex_session.query(Locusdbentity).all()]) bud_id_to_colleague_id = dict([(x.bud_id, x.colleague_id) for x in nex_session.query(Colleague).all()]) key_to_colleague_locus_id = dict([((x.colleague_id, x.locus_id), x.colleague_locus_id) for x in nex_session.query(ColleagueLocus).all()]) sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none() sgd_source_id = sgd.source_id direct = nex_session.query(Source).filter_by(display_name='Direct submission').one_or_none() direct_source_id = direct.source_id fw = open(log_file, "w") f = open(file_to_load) for line in f: pieces = line.strip().split("\t") gene_name = pieces[0].strip() name_desc = pieces[2].strip() if pieces[1] == 'ORF': continue locus_id = name_to_locus_id.get(pieces[1].strip()) if locus_id is None: print "The ORF name: ", pieces[1], " is not in the database." continue colleague_id = bud_id_to_colleague_id.get(int(pieces[3])) if colleague_id is None: print "The colleague bud_id:", pieces[3], " is not in the database." continue reference_id = None if pieces[4]: if int(pieces[4]) in bud_id_to_reference_id: reference_id = bud_id_to_reference_id.get(int(pieces[4])) else: print "The reference bud_id:", pieces[4], " is not in the database." continue else: print "NO reference_no provided." 
print line continue [reserved_date, expired_date] = reformat_date(pieces[6]) print gene_name, locus_id, colleague_id, reference_id, reserved_date, expired_date, name_desc # update LOCUSDBENTITY nex_session.query(Locusdbentity).filter_by(dbentity_id=locus_id).update({"gene_name": gene_name, "name_description": name_desc}) fw.write("Update LOCUSDBENTITY row for "+pieces[1]+": gene_name="+gene_name+", name_desc="+name_desc+"\n") # update DBENTITY nex_session.query(Dbentity).filter_by(dbentity_id=locus_id).update({"display_name": gene_name}) fw.write("Update DBENTITY row for "+pieces[1]+": display_name="+gene_name+"\n") add_locus_reference(nex_session, fw, locus_id, reference_id, sgd_source_id) if (colleague_id, locus_id) not in key_to_colleague_locus_id: add_colleague_locus(nex_session, fw, locus_id, colleague_id, direct_source_id) add_reservedname(nex_session, fw, locus_id, gene_name, reference_id, colleague_id, reserved_date, expired_date, direct_source_id) f.close() fw.close() # nex_session.rollback() nex_session.commit()
def _iter_locus_ids(cell, name_to_locus_id):
    """Yield the locus id for each space-separated gene name in ``cell``.

    Unknown names are reported on stdout and skipped.  This is a generator so
    that the "not in the database" messages stay interleaved with whatever
    the caller does per locus, exactly as in a plain loop.
    """
    for name in cell.strip().split(" "):
        name = name.strip()
        locus_id = name_to_locus_id.get(name)
        if locus_id is None:
            print("The gene name: ", name, " is not in the database.")
            continue
        yield locus_id


def load_references(infile, logfile):
    """Load papers, curation tags and literature topics from a triage file.

    ``infile`` is tab-delimited.  A row whose first column is 'PMID', 'pmid',
    'pubmed' or empty is taken as the header; its column names become the
    curation tag / topic names.  Columns read from each data row (0-based):

        0  PMID                  3  '1' = discard the paper
        1  created_by            15 date_created
        2  '1' = add to DB only (nothing further is done for the row)
        4, 5, 7, 8, 9, 13, 14    curation tag columns
        6 .. 12                  literature topic columns
                                 (7, 8, 9 are all stored as "Primary Literature")

    In a tag/topic column, '1' means a paper-level entry (no locus); any other
    non-empty value is a space-separated list of gene names.  A topic named
    'Omics' is always stored at paper level.

    ``load_papers`` is called first with the log handle and the input path;
    the PMID lookup is rebuilt afterwards (presumably because ``load_papers``
    adds references -- confirm against its definition).

    Args:
        infile: path of the tab-delimited input file.
        logfile: path of the log file to (over)write.

    Raises:
        AttributeError: if the 'SGD' Source row or the Taxonomy row for the
            externally defined ``taxon`` is missing.
        ValueError: if the PMID column is not an integer.
        IndexError: if a data row has fewer than 16 columns, or a tag/topic
            column has no corresponding header entry.
    """
    nex_session = get_nex_session()

    # Both systematic names and gene names resolve to the locus id.
    name_to_locus_id = {}
    for x in nex_session.query(Locusdbentity).all():
        name_to_locus_id[x.systematic_name] = x.dbentity_id
        if x.gene_name:
            name_to_locus_id[x.gene_name] = x.dbentity_id

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id
    tax = nex_session.query(Taxonomy).filter_by(taxid=taxon).one_or_none()
    taxonomy_id = tax.taxonomy_id

    # Context managers close the log and the input even when a row raises.
    with open(logfile, "w") as fw:
        pmid_to_reference_id = {x.pmid: x.dbentity_id
                                for x in nex_session.query(Referencedbentity).all()}

        load_papers(fw, infile, pmid_to_reference_id)

        # Rebuilt after load_papers has run.
        pmid_to_reference_id = {x.pmid: x.dbentity_id
                                for x in nex_session.query(Referencedbentity).all()}
        pmid_to_refdeleted_id = {x.pmid: x.referencedeleted_id
                                 for x in nex_session.query(Referencedeleted).all()}

        # Keys already stored, so nothing is inserted twice; a falsy locus id
        # is normalised to None to match paper-level keys built below.
        key_in_annotation = set()
        for x in nex_session.query(Literatureannotation).all():
            key_in_annotation.add((x.reference_id, x.dbentity_id or None, x.topic))
        key_in_curation = set()
        for x in nex_session.query(CurationReference).all():
            key_in_curation.add((x.reference_id, x.locus_id or None, x.curation_tag))

        with open(infile) as f:
            header = []
            for line in f:
                line = line.replace("Homology Disease", "Homology/Disease")
                pieces = line.strip().split("\t")

                # A completely blank line would otherwise match the header
                # test below and replace the real header with [''], breaking
                # header[i] for every later row.
                if pieces == ['']:
                    continue
                if pieces[0] in ['PMID', 'pmid', 'pubmed', '']:
                    header = pieces
                    continue

                pmid = int(pieces[0])
                created_by = pieces[1]
                date_created = pieces[15]

                if pieces[2] == '1':
                    # add to DB only - reference has been loaded so skip this one
                    print("Add to DB only: ", pieces[0])
                    continue

                if pieces[3] == '1':
                    print("Discard this paper")
                    if pmid in pmid_to_refdeleted_id:
                        print("The row for PMID: ", pmid, " is in the REFERENCEDELETED table.")
                        continue
                    insert_referencedeleted(nex_session, fw, pmid, created_by, date_created)
                    continue

                reference_id = pmid_to_reference_id.get(pmid)
                if reference_id is None:
                    print("The pmid: ", pmid, " is not in the database.")
                    continue

                # curation tags
                for i in [4, 5, 7, 8, 9, 13, 14]:
                    if pieces[i] == "":
                        continue
                    curation_tag = header[i].strip()
                    # '1' marks a paper-level tag; otherwise the cell lists genes.
                    if pieces[i] == '1':
                        locus_ids = [None]
                    else:
                        locus_ids = _iter_locus_ids(pieces[i], name_to_locus_id)
                    for locus_id in locus_ids:
                        key = (reference_id, locus_id, curation_tag)
                        if key in key_in_curation:
                            print("The row for ", key, " is already in the CURATION_REFERENCE table.")
                            continue
                        insert_curation_reference(nex_session, fw, reference_id,
                                                  locus_id, curation_tag,
                                                  created_by, date_created,
                                                  source_id)
                        key_in_curation.add(key)

                # literature topics
                for i in [6, 7, 8, 9, 10, 11, 12]:
                    if pieces[i] == "":
                        continue
                    topic = header[i].strip()
                    if i in [7, 8, 9]:
                        topic = "Primary Literature"
                    # 'Omics' is always paper-level, whatever the cell holds.
                    if pieces[i] == '1' or topic == 'Omics':
                        locus_ids = [None]
                    else:
                        locus_ids = _iter_locus_ids(pieces[i], name_to_locus_id)
                    for locus_id in locus_ids:
                        key = (reference_id, locus_id, topic)
                        if key in key_in_annotation:
                            print("The row for ", key, " is already in the LITERATUREANNOTATION table.")
                            continue
                        insert_literatureannotation(nex_session, fw, reference_id,
                                                    taxonomy_id, locus_id, topic,
                                                    created_by, date_created,
                                                    source_id)
                        key_in_annotation.add(key)

            # nex_session.rollback()
            nex_session.commit()