def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} ) assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'}, {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'}, {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'}, {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'}] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. 
See logfile {} for details.".format(logfile) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] with open(TISSUE2UBERON_FILE, 'r') as ifh: tiss2uid = ast.literal_eval(ifh.read()) if not args['--quiet']: print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE) # # Protein Level Expressions # line_ct = slmf.wcl(PROTEIN_QUAL_FILE) if not args['--quiet']: print "\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 rs2pids = defaultdict(list) notfnd = set() nouid = set() dba_err_ct = 0 pmark = {} exp_ct = 0 with open(PROTEIN_QUAL_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line ct += 1 for row in tsvreader: ct += 1 pbar.update(ct) #rs = re.sub('\.\d+$', '', row[0]) # get rid of version rs = row[0] if rs in rs2pids: # we've already found it pids = rs2pids[rs] elif rs in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False) if not targets: notfnd.add(rs) continue pids = [] for t in targets: pids.append(t['components']['protein'][0]['id']) rs2pids[rs] = pids # save this mapping so we only lookup each target once tissue = row[1] if row[3] == 'NA': init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4],} else: init = {'etype': 'HPM Protein','tissue': tissue, 'qual_value': row[4], 'number_value': row[3]} # Add Uberon ID, if we can find one if tissue in tiss2uid: uberon_id = tiss2uid[tissue] else: uberon_id = dba.get_uberon_id({'name': tissue}) if uberon_id: init['uberon_id'] = uberon_id else: nouid.add(tissue) for pid in pids: init['protein_id'] = pid rv = dba.ins_expression(init) if not rv: dba_err_ct += 1 continue exp_ct += 1 pmark[pid] = True pbar.finish() print "Processed {} lines.".format(ct) print " Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids)) if notfnd: print "No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile) if nouid: print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile) line_ct = slmf.wcl(PROTEIN_TAU_FILE) if not args['--quiet']: print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 dba_err_ct = 0 pmark = {} skip_ct = 0 ti_ct = 0 with open(PROTEIN_TAU_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line ct += 1 for row in tsvreader: ct += 1 pbar.update(ct) #rs = re.sub('\.\d+$', '', row[0]) # get rid of version rs = row[0] tau = row[1] if rs not in rs2pids: skip_ct += 1 continue for pid in rs2pids[rs]: rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index', 'number_value': tau}) if not rv: dba_err_ct += 1 continue ti_ct += 1 pmark[pid] = True pbar.finish() print "Processed {} lines.".format(ct) print " Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark)) if skip_ct > 0: print " Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format(dba_err_ct, logfile) # # Gene Level Expressions # line_ct = slmf.wcl(GENE_QUAL_FILE) if not args['--quiet']: print "\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 sym2pids = defaultdict(list) notfnd = set() nouid = set() dba_err_ct = 0 pmark = {} exp_ct = 0 with open(GENE_QUAL_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line ct += 1 for row in tsvreader: ct += 1 pbar.update(ct) sym = re.sub('\.\d+$', '', row[0]) # get rid of version if sym in sym2pids: pids = sym2pids[sym] elif sym in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets({'sym': sym}, False) if not targets: notfnd.add(sym) continue pids = [] for t in targets: pids.append(t['components']['protein'][0]['id']) sym2pids[sym] = pids # save this mapping so we only lookup each target once tissue = row[1] if row[3] == 'NA': init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4],} else: init = {'etype': 'HPM Gene','tissue': tissue, 'qual_value': row[4], 'number_value': row[3]} # Add Uberon ID, if we can find one if tissue in tiss2uid: uberon_id = tiss2uid[tissue] else: uberon_id = dba.get_uberon_id({'name': tissue}) if uberon_id: init['uberon_id'] = uberon_id else: nouid.add(tissue) for pid in pids: init['protein_id'] = pid rv = dba.ins_expression(init) if not rv: dba_err_ct += 1 continue exp_ct += 1 pmark[pid] = True pbar.finish() print "Processed {} lines.".format(ct) print " Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids)) if notfnd: print " No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile) if nouid: print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile) line_ct = slmf.wcl(GENE_TAU_FILE) if not args['--quiet']: print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 dba_err_ct = 0 pmark = {} skip_ct = 0 ti_ct = 0 with open(GENE_TAU_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line ct += 1 for row in tsvreader: ct += 1 pbar.update(ct) sym = re.sub('\.\d+$', '', row[0]) # get rid of version tau = row[1] if sym not in sym2pids: skip_ct += 1 continue for pid in sym2pids[sym]: rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index', 'number_value': tau}) if not rv: dba_err_ct += 1 continue ti_ct += 1 pmark[pid] = True pbar.finish() print "Processed {} lines.".format(ct) print " Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark)) if skip_ct > 0: print " Skipped {} rows with symbols not in map from expression file".format(skip_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
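# Note: slmf.wcl() is used throughout these loaders to size the progress bars. Its
# implementation lives in the shared slmf utility module and is not part of this section;
# a minimal line-count sketch of what such a helper could look like is shown here for
# reference only (the name wcl_sketch is hypothetical, not the loader's actual function).
def wcl_sketch(fname):
  """Return the number of lines in file fname."""
  with open(fname) as f:
    return sum(1 for _ in f)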
def tinx(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # The results of parsing the input mentions files will be the following dictionaries: pid2pmids = { } # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein # Including the UniProt accession in the key is just for convenience when # checking the output. It is not used for anything. doid2pmids = {} # DOID => set of all PMIDs that mention the disease pmid_disease_ct = { } # PMID => count of diseases mentioned in a given paper pmid_protein_ct = { } # PMID => count of proteins mentioned in a given paper # First parse the Disease Ontology OBO file to get DO names and defs dofile = DO_DOWNLOAD_DIR + DO_OBO print "\nParsing Disease Ontology file {}".format(dofile) do_parser = obo.Parser(open(dofile)) do = {} for stanza in do_parser: do[stanza.tags['id'][0].value] = stanza.tags print " Got {} Disease Ontology terms".format(len(do)) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] fn = JL_DOWNLOAD_DIR + PROTEIN_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in protein file {}".format(line_ct, fn) with open(fn, 'rU') as tsvf: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 skip_ct = 0 notfnd = set() for line in tsvf: ct += 1 pbar.update(ct) if not line.startswith('ENSP'): skip_ct += 1 continue data = line.rstrip().split('\t') ensp = data[0] pmids = set([int(pmid) for pmid in data[1].split()]) targets = dba.find_targets({'stringid': ensp}) if not targets: # if we don't find a target by stringid, which is the more reliable and # prefered way, try by Ensembl xref targets = dba.find_targets_by_xref({ 'xtype': 'Ensembl', 'value': ensp }) if not targets: notfnd.add(ensp) continue for t in targets: p = t['components']['protein'][0] k = "%s,%s" % (p['id'], p['uniprot']) if k in pid2pmids: pid2pmids[k] = pid2pmids[k].union(pmids) else: pid2pmids[k] = set(pmids) for pmid in pmids: if pmid in pmid_protein_ct: pmid_protein_ct[pmid] += 1.0 else: pmid_protein_ct[pmid] = 1.0 pbar.finish() for ensp in notfnd: logger.warn("No target found for {}".format(ensp)) print "{} lines processed.".format(ct) print " Skipped {} non-ENSP lines".format(skip_ct) print " Saved {} protein to PMIDs mappings".format(len(pid2pmids)) print " Saved {} PMID to protein count mappings".format( len(pmid_protein_ct)) if notfnd: print " No target found for {} ENSPs. 
See logfile {} for details.".format( len(notfnd), logfile) fn = JL_DOWNLOAD_DIR + DISEASE_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsvf: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 skip_ct = 0 notfnd = set() for line in tsvf: ct += 1 pbar.update(ct) if not line.startswith('DOID:'): skip_ct += 1 continue data = line.rstrip().split('\t') doid = data[0] pmids = set([int(pmid) for pmid in data[1].split()]) if doid not in do: logger.warn("%s not found in DO" % doid) notfnd.add(doid) continue if doid in doid2pmids: doid2pmids[doid] = doid2pmids[doid].union(pmids) else: doid2pmids[doid] = set(pmids) for pmid in pmids: if pmid in pmid_disease_ct: pmid_disease_ct[pmid] += 1.0 else: pmid_disease_ct[pmid] = 1.0 pbar.finish() print "{} lines processed.".format(ct) print " Skipped {} non-DOID lines".format(skip_ct) print " Saved {} DOID to PMIDs mappings".format(len(doid2pmids)) print " Saved {} PMID to disease count mappings".format( len(pmid_disease_ct)) if notfnd: print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format( len(notfnd), logfile) if not args['--quiet']: print "\nComputing protein novely scores" # To calculate novelty scores, each paper (PMID) is assigned a # fractional target (FT) score of one divided by the number of targets # mentioned in it. The novelty score of a given protein is one divided # by the sum of the FT scores for all the papers mentioning that # protein. ct = 0 with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf: pnovf.write("Protein ID,UniProt,Novelty\n") for k in pid2pmids.keys(): ct += 1 ft_score_sum = 0.0 for pmid in pid2pmids[k]: ft_score_sum += 1.0 / pmid_protein_ct[pmid] novelty = 1.0 / ft_score_sum pnovf.write("%s,%.8f\n" % (k, novelty)) print " Wrote {} novelty scores to file {}".format( ct, PROTEIN_NOVELTY_FILE) if not args['--quiet']: print "\nComputing disease novely scores" # Exactly as for proteins, but using disease mentions ct = 0 with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf: dnovf.write("DOID,Novelty\n") for doid in doid2pmids.keys(): ct += 1 ft_score_sum = 0.0 for pmid in doid2pmids[doid]: ft_score_sum += 1.0 / pmid_disease_ct[pmid] novelty = 1.0 / ft_score_sum dnovf.write("%s,%.8f\n" % (doid, novelty)) print " Wrote {} novelty scores to file {}".format( ct, DISEASE_NOVELTY_FILE) if not args['--quiet']: print "\nComputing importance scores" # To calculate importance scores, each paper is assigned a fractional # disease-target (FDT) score of one divided by the product of the # number of targets mentioned and the number of diseases # mentioned. The importance score for a given disease-target pair is # the sum of the FDT scores for all papers mentioning that disease and # protein. ct = 0 with open(IMPORTANCE_FILE, 'wb') as impf: impf.write("DOID,Protein ID,UniProt,Score\n") for k, ppmids in pid2pmids.items(): for doid, dpmids in doid2pmids.items(): pd_pmids = ppmids.intersection(dpmids) fdt_score_sum = 0.0 for pmid in pd_pmids: fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid]) if fdt_score_sum > 0: ct += 1 impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum)) print " Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE) if not args['--quiet']: print "\nComputing PubMed rankings" # PMIDs are ranked for a given disease-target pair based on a score # calculated by multiplying the number of targets mentioned and the # number of diseases mentioned in that paper. 
Lower scores have a lower # rank (higher priority). If the scores do not discriminate, PMIDs are # reverse sorted by value with the assumption that larger PMIDs are # newer and of higher priority. ct = 0 with open(PMID_RANKING_FILE, 'wb') as pmrf: pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n") for k, ppmids in pid2pmids.items(): for doid, dpmids in doid2pmids.items(): pd_pmids = ppmids.intersection(dpmids) scores = [ ] # scores are tuples of (PMID, protein_mentions*disease_mentions) for pmid in pd_pmids: scores.append( (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid])) if len(scores) > 0: scores.sort(cmp_pmids_scores) for i, t in enumerate(scores): ct += 1 pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i)) print " Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
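# cmp_pmids_scores() is passed to scores.sort() by tinx() above but is not defined in this
# section. Based on the comments there -- rank ascending by protein_mentions*disease_mentions,
# breaking ties so that larger (newer) PMIDs get higher priority -- a plausible Python 2
# comparator is sketched below; the real implementation may differ.
def cmp_pmids_scores_sketch(a, b):
  # a and b are (pmid, score) tuples
  if a[1] > b[1]:
    return 1
  if a[1] < b[1]:
    return -1
  # equal scores: reverse-sort by PMID so larger (newer) PMIDs come first
  if a[0] > b[0]:
    return -1
  if a[0] < b[0]:
    return 1
  return 0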
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) # DBAdaptor uses same logger as load() dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver']) if not args['--quiet']: print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % ( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) start_time = time.time() # Dataset dataset_id = dba.ins_dataset({ 'name': 'ChEMBL', 'source': 'ChEMBL MySQL database {}'.format(CHEMBL_DB), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/chembl/' }) if not dataset_id: print "WARNING: Error inserting dataset See logfile %s for details." % logfile dataset_id2 = dba.ins_dataset({ 'name': 'ChEMBL Info', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'First reference year and selective compound info are generated by loader app.' }) if not dataset_id2: print "WARNING: Error inserting dataset See logfile %s for details." % logfile # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'chembl_activity' }, { 'dataset_id': dataset_id2, 'table_name': 'tdl_info', 'where_clause': "itype = 'ChEMBL First Reference Year'", 'comment': "Derived from filtered ChEMBL activities." }, { 'dataset_id': dataset_id2, 'table_name': 'tdl_info', 'where_clause': "itype = 'ChEMBL Selective Compound'", 'comment': "Derived from filtered ChEMBL activities." }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. See logfile %s for details." % logfile sys.exit(1) # ChEMBL MySQL connection f = open('/home/smathias/.dbirc', 'r') pw = f.readline().strip() chembldb = mysql.connect(host='localhost', port=3306, db=CHEMBL_DB, user='******', passwd=pw) # First get mapping of UniProt accessions to ChEMBL IDs up2chembl = {} f = DOWNLOAD_DIR + UNIPROT2CHEMBL_FILE line_ct = slmf.wcl(f) if not args['--quiet']: print "\nProcessing %d input lines in file %s" % (line_ct, f) with open(f, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): continue if row[0] in up2chembl: up2chembl[row[0]].append(row[1]) else: up2chembl[row[0]] = [row[1]] if not args['--quiet']: print "%d input lines processed."
% ct #print "Saved %d keys in up2chembl dict" % len(up2chembl.keys()) upct = len(up2chembl) if not args['--quiet']: print "\nProcessing %d UniProt to ChEMBL ID(s) mappings" % upct pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=upct).start() ct = 0 notfnd = [] fnd_ct = 0 err_ct = 0 dba_err_ct = 0 nic_ct = 0 nga_ct = 0 tdl_ct = 0 ca_ct = 0 csti_ct = 0 ccti_ct = 0 cyti_ct = 0 t2acts = {} c2acts = {} for up in up2chembl.keys(): ct += 1 pbar.update(ct) targets = dba.find_targets({'uniprot': up}, include_annotations=True) if not targets: notfnd.append(up) continue t = targets[0] tid = t['id'] logger.info("Loading ChEMBL data for target %d - %s/%s" % (t['id'], t['components']['protein'][0]['sym'], up)) chembl_acts = [] for ctid in up2chembl[up]: with closing(chembldb.cursor(mysql.cursors.DictCursor)) as curs: # Query 1 curs.execute(SQLq1, (ctid, )) for d in curs: if d['year']: d['reference'] = "%s, (%d) %s:%s:%s" % ( d['journal'], d['year'], d['volume'], d['issue'], d['first_page']) else: d['reference'] = "%s, %s:%s:%s" % ( d['journal'], d['volume'], d['issue'], d['first_page']) for k in ['journal', 'volume', 'issue', 'first_page']: del (d[k]) chembl_acts.append(d) # Query 2 with closing(chembldb.cursor(mysql.cursors.DictCursor)) as curs: curs.execute(SQLq2, (ctid, )) for d in curs: d['reference'] = None chembl_acts.append(d) if t['fam'] == 'GPCR': cutoff = 7.0 # 100nM elif t['fam'] == 'IC': cutoff = 5.0 # 10uM elif t['fam'] == 'Kinase': cutoff = 7.52288 # 30nM elif t['fam'] == 'NR': cutoff = 7.0 # 100nM else: cutoff = 6.0 # 1uM for non-IDG Family targets logger.info("Target %d (%s) filter cutoff: %f " % (tid, t['name'], len(chembl_acts))) filtered_acts = [ a for a in chembl_acts if a['pchembl_value'] >= cutoff ] logger.info("%d ChEMBL acts => %d filtered acts" % (len(chembl_acts), len(filtered_acts))) if not filtered_acts: nga_ct += 1 continue logger.info(" Got %d filtered activities" % len(filtered_acts)) # # if we get here, target is Tchem # # sort all activities by std_val, so best activity is in sorted_by_stdval[-1] decorated = [(a['pchembl_value'], a) for a in filtered_acts] decorated.sort() sorted_by_stdval = [a for (key, a) in decorated] # sort filtered activities by reference year, so oldest activity is in sorted_by_year[0] decorated = [(a['year'], a) for a in filtered_acts if 'year' in a] decorated.sort() sorted_by_year = [a for (key, a) in decorated] # Save chembl_activities # The best activity for a given target will be the one with MAX(chembl_activity.id) for a in sorted_by_stdval: if 'pubmed_id' in a: pmid = a['pubmed_id'] else: pmid = None try: rv = dba.ins_cmpd_activity({ 'target_id': tid, 'catype': 'ChEMBL', 'cmpd_id_in_src': a['chembl_id'], 'cmpd_name_in_src': a['compound_name'], 'smiles': a['canonical_smiles'], 'reference': a['reference'], 'act_value': a['pchembl_value'], 'act_type': a['standard_type'], 'pubmed_ids': pmid }) except: # some names have weird hex characters and cause errors... 
rv = dba.ins_cmpd_activity({ 'target_id': tid, 'catype': 'ChEMBL', 'cmpd_id_in_src': a['chembl_id'], 'cmpd_name_in_src': '?', 'smiles': a['canonical_smiles'], 'reference': a['reference'], 'act_value': a['pchembl_value'], 'act_type': a['standard_type'], 'pubmed_ids': pmid }) if rv: ca_ct += 1 else: dba_err_ct += 1 # Save First ChEMBL Reference Year tdl_info, if there is one if len(sorted_by_year) > 0: oldest = sorted_by_year[0] rv = dba.ins_tdl_info({ 'target_id': tid, 'itype': 'ChEMBL First Reference Year', 'integer_value': sorted_by_year[0]['year'] }) if rv: cyti_ct += 1 else: dba_err_ct += 1 # Save mappings for selective compound calculations t2acts[tid] = copy.copy(sorted_by_stdval) for a in chembl_acts: ac = copy.copy(a) smi = ac['canonical_smiles'] del (ac['canonical_smiles']) ac['tid'] = tid ac['tname'] = t['components']['protein'][0]['name'] if smi in c2acts: c2acts[smi].append(ac) else: c2acts[smi] = [ac] pbar.finish() print "%d UniProt accessions processed." % ct if nic_ct > 0: print " %d targets not found in ChEMBL" % nic_ct print " %d targets have no qualifying TCRD activities in ChEMBL" % nga_ct print "Inserted %d new cmpd_activity rows" % ca_ct print "Inserted %d new ChEMBL First Reference Year tdl_infos" % cyti_ct if err_ct > 0: print "%d ERRORS" % err_ct if dba_err_ct > 0: print "WARNING: %d database errors occured. See logfile %s for details." % ( dba_err_ct, logfile) # Selective compound calculations if not args['--quiet']: print "\nRunning selective compound analysis..." #pickle.dump(t2acts, open('T2ChEMBLActs.p', 'wb')) #print "%d target to activities mappings saved to pickle T2ChEMBLActs.p" % len(t2acts.keys()) #pickle.dump(c2acts, open('C2AllChEMBLActs.p', 'wb')) #print "%d compound to activity mappings saved to pickle C2AllChEMBLActs.p" % len(c2acts.keys()) # filter c2acts for compounds with multiple activities c2macts = {} for c, acts in c2acts.items(): if len(acts) > 1: c2macts[c] = list(acts) # then sort the activity lists by pchembl_value c2smacts = {} for c, acts in c2macts.items(): decorated = [(a['pchembl_value'], a) for a in acts] decorated.sort() c2smacts[c] = [a for (key, a) in decorated] #pickle.dump(c2smacts, open('C2ChEMBLActs.p', 'wb')) #print "%d compound to activities mappings saved to pickle C2ChEMBLActs.p" % len(c2smacts.keys()) selective = [] for smi in c2smacts.keys(): i = 1 while i <= len(c2smacts[smi]) - 1: if c2smacts[smi][i]['tid'] == c2smacts[smi][i - 1]['tid']: i += 1 continue diff = c2smacts[smi][i]['pchembl_value'] - c2smacts[smi][ i - 1]['pchembl_value'] if diff >= 2: selective.append(smi) break i += 1 #pickle.dump(selective, open(SC_PFILE, 'wb')) #print "%d selective compounds saved to %s" % (len(selective), SC_PFILE) if not args['--quiet']: print " Found %d selective compounds" % len(selective) cscti_ct = 0 for tid, acts in t2acts.items(): for a in acts: if a['canonical_smiles'] in selective: # Save ChEMBL Selective Compound tdl_info val = "%s|%s" % (a['chembl_id'], a['canonical_smiles']) rv = dba.ins_tdl_info({ 'target_id': tid, 'itype': 'ChEMBL Selective Compound', 'string_value': val }) if rv: cscti_ct += 1 else: dba_err_ct += 1 break if not args['--quiet']: print "Inserted %d new ChEMBL Selective Compound tdl_infos" % cscti_ct
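# For reference: the pChEMBL cutoffs applied in the ChEMBL loader above are -log10(molar
# activity), so the family-specific thresholds correspond to the concentrations noted in
# its comments. A tiny illustrative converter (the name pchembl_to_nM is not part of the
# loader itself):
def pchembl_to_nM(pchembl):
  """Convert a pChEMBL value to an activity concentration in nM."""
  return 10 ** (9 - pchembl)
# pchembl_to_nM(7.0) -> 100.0 nM (GPCR/NR), pchembl_to_nM(7.52288) -> ~30 nM (Kinase),
# pchembl_to_nM(5.0) -> 10000.0 nM = 10 uM (IC), pchembl_to_nM(6.0) -> 1000.0 nM = 1 uM (others)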
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'eRAM Disease Associations', 'source': 'Data scraped from eRAM web pages.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.unimd.org/eram/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'eRAM'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) s = shelve.open(ERAM_SHELF_FILE) dis_ct = len(s['disease_names']) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} disease names in shelf file {}".format( dis_ct, ERAM_SHELF_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=dis_ct).start() ct = 0 pmark = {} skip_ct = 0 dnerr1_ct = 0 dnerr2_ct = 0 notfnd = set() dis_ct = 0 dba_err_ct = 0 for dname in s['disease_names']: ct += 1 try: dname = str(dname) except: dnerr2_ct += 1 logger.warn("UnicodeEncodeError for disease name '{}'".format( dname.encode('ascii', 'ignore'))) continue if dname not in s: dnerr1_ct += 1 logger.warn("Disease name '{}' not in shelf".format(dname)) continue if 'currated_genes' not in s[dname]: skip_ct += 1 continue for cg in s[dname]['currated_genes']: sym = cg['sym'] geneid = cg['geneid'] k = "%s|%s" % (sym, geneid) if k in notfnd: continue targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets({'geneid': geneid}) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue for t in targets: p = t['components']['protein'][0] pmark[t['id']] = True for doid in s[dname]['doids']: rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': 'eRAM', 'name': dname, 'did': doid, 'source': cg['sources'] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if skip_ct > 0: print "Skipped {} diseases with no currated genes. See logfile {} for details.".format( skip_ct, logfile) if dnerr1_ct > 0: print "{} disease names not found in shelf. See logfile {} for details.".format( dnerr1_ct, logfile) if dnerr2_ct > 0: print "{} disease names cannot be decoded to strs. See logfile {} for details.".format( dnerr2_ct, logfile) if notfnd: print "No target found for {} symbols/geneids. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
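# The eRAM shelf opened by the loader above is expected to have (at least) the following
# shape, inferred from the keys the loader accesses; this is descriptive only:
#   s['disease_names'] -> iterable of disease name strings
#   s[<disease name>]  -> {'currated_genes': [{'sym': ..., 'geneid': ..., 'sources': ...}, ...],
#                          'doids': [<DOID string>, ...]}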
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'JensenLab PubMed Text-mining Scores', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': BASE_URL }) if not dataset_id: print "WARNING: Error inserting dataset See logfile %s for details." % logfile # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'pmscore' }, { 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'JensenLab PubMed Score'" }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. See logfile %s for details." % logfile sys.exit(1) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] ensp2pids = {} pmscores = {} # protein.id => sum(all scores) pms_ct = 0 upd_ct = 0 notfnd = {} dba_err_ct = 0 infile = DOWNLOAD_DIR + FILENAME line_ct = slmf.wcl(infile) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format(line_ct, infile) with open(infile, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: # sym year score ct += 1 pbar.update(ct) if not row[0].startswith('ENSP'): continue ensp = row[0] if ensp in ensp2pids: # we've already found it pids = ensp2pids[ensp] elif ensp in notfnd: # we've already not found it continue else: targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets_by_xref({ 'xtype': 'STRING', 'value': '9606.' + ensp }) if not targets: notfnd[ensp] = True logger.warn("No target found for {}".format(ensp)) continue pids = [] for target in targets: pids.append(target['components']['protein'][0]['id']) ensp2pids[ ensp] = pids # save this mapping so we only lookup each target once for pid in pids: rv = dba.ins_pmscore({ 'protein_id': pid, 'year': row[1], 'score': row[2] }) if rv: pms_ct += 1 else: dba_err_ct += 1 if pid in pmscores: pmscores[pid] += float(row[2]) else: pmscores[pid] = float(row[2]) pbar.finish() print "{} input lines processed.".format(ct) print " Inserted {} new pmscore rows for {} targets".format( pms_ct, len(pmscores)) if len(notfnd) > 0: print "No target found for {} STRING IDs. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) print "\nLoading {} JensenLab PubMed Score tdl_infos".format( len(pmscores.keys())) ct = 0 ti_ct = 0 dba_err_ct = 0 for pid, score in pmscores.items(): ct += 1 rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'JensenLab PubMed Score', 'number_value': score }) if rv: ti_ct += 1 else: dba_err_ct += 1 print "{} processed".format(ct) print " Inserted {} new JensenLab PubMed Score tdl_info rows".format( ti_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
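# The 'JensenLab PubMed Score' tdl_info written above is simply the sum of a protein's
# yearly pmscore values. Illustrative (made-up) example:
#   input rows for protein_id 1: (year 2013, score 1.25) and (year 2014, score 0.50)
#   => two pmscore rows are inserted and pmscores[1] == 1.75 is stored as the tdl_info.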
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'GWAS Catalog', 'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' % os.path.basename(INFILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gwas/home' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'}) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) line_ct = slmf.wcl(INFILE) line_ct -= 1 if not args['--quiet']: print '\nProcessing {} lines from input file {}'.format( line_ct, INFILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() outlist = [] with open(INFILE, 'rU') as tsvfile: tsvreader = csv.reader(tsvfile, delimiter='\t') header = tsvreader.next() # skip header line ct = 0 notfnd = set() pmark = {} gwas_ct = 0 dba_err_ct = 0 # 0: DATE ADDED TO CATALOG # 1: PUBMEDID # 2: FIRST AUTHOR # 3: DATE # 4: JOURNAL # 5: LINK # 6: STUDY # 7: DISEASE/TRAIT # 8: INITIAL SAMPLE SIZE # 9: REPLICATION SAMPLE SIZE # 10: REGION # 11: CHR_ID # 12: CHR_POS # 13: REPORTED GENE(S) # 14: MAPPED_GENE # 15: UPSTREAM_GENE_ID # 16: DOWNSTREAM_GENE_ID # 17: SNP_GENE_IDS # 18: UPSTREAM_GENE_DISTANCE # 19: DOWNSTREAM_GENE_DISTANCE # 20: STRONGEST SNP-RISK ALLELE # 21: SNPS # 22: MERGED # 23: SNP_ID_CURRENT # 24: CONTEXT # 25: INTERGENIC # 26: RISK ALLELE FREQUENCY # 27: P-VALUE # 28: PVALUE_MLOG # 29: P-VALUE (TEXT) # 30: OR or BETA # 31: 95% CI (TEXT) # 32: PLATFORM [SNPS PASSING QC] # 33: CNV # 34: MAPPED_TRAIT # 35: MAPPED_TRAIT_URI # 36: STUDY ACCESSION # 37: GENOTYPING TECHNOLOGY symregex = re.compile(r' ?[-,;] ?') for row in tsvreader: ct += 1 if len(row) < 14: continue symstr = row[14] if symstr == 'NR': continue symlist = symregex.split(symstr) for sym in symlist: if sym in notfnd: continue targets = dba.find_targets({'sym': sym}) if not targets: notfnd.add(sym) logger.warn("No target found for symbol {}".format(sym)) continue for t in targets: p = t['components']['protein'][0] try: pval = float(row[27]) except: pval = None try: orbeta = float(row[30]) except: orbeta = None if row[25]: ig = int(row[25]) else: ig = None rv = dba.ins_gwas({ 'protein_id': p['id'], 'disease_trait': row[7], 'snps': row[21], 'pmid': row[1], 'study': row[6], 'context': row[24], 'intergenic': ig, 'p_value': pval, 'or_beta': orbeta, 'cnv': row[33], 'mapped_trait': row[34], 'mapped_trait_uri': row[35] }) if not rv: dba_err_ct += 1 continue pmark[p['id']] = True gwas_ct += 1 
pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new gwas rows for {} proteins".format( gwas_ct, len(pmark.keys())) if notfnd: print "No target found for {} symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
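# The MAPPED_GENE column (row[14]) parsed by the GWAS Catalog loader above can hold
# several symbols separated by '-', ',' or ';' with optional spaces. For example, the
# symregex used there splits made-up values like these:
#   re.compile(r' ?[-,;] ?').split('PSMB9, TAP1')   -> ['PSMB9', 'TAP1']
#   re.compile(r' ?[-,;] ?').split('CDK5; CDK5R1')  -> ['CDK5', 'CDK5R1']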
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Jensen Lab DISEASES', 'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://diseases.jensenlab.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype LIKE 'JensenLab %'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) # Knowledge channel fn = DOWNLOAD_DIR + FILE_K line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Knowledge ' + row[4] for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True init = { 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6] } rv = dba.ins_disease(init) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) # Experiment channel fn = DOWNLOAD_DIR + FILE_E line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 skip_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 if row[6] == '0': # skip zero confidence rows skip_ct += 1 continue ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Experiment ' + row[4] for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if skip_ct > 0: print "Skipped {} zero confidence rows".format(skip_ct) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) # Text Mining channel fn = DOWNLOAD_DIR + FILE_T line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Text Mining' for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'zscore': row[4], 'conf': row[5] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset( {'name': 'LINCS L1000 XRefs', 'source': 'File %s'%os.path.basename(L1000_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://support.lincscloud.org/hc/en-us/articles/202092616-The-Landmark-Genes'} ) assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d"%dataset_id}) assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] line_ct = slmf.wcl(L1000_FILE) if not args['--quiet']: print "\nProcessing {} rows in file {}".format(line_ct, L1000_FILE) with open(L1000_FILE, 'rU') as csvfile: csvreader = csv.reader(csvfile) ct = 0 pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 pmark = {} xref_ct = 0 notfnd = set() dba_err_ct = 0 for row in csvreader: ct += 1 pbar.update(ct) l1000 = row[0] sym = row[1] geneid = row[2] targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets({'geneid': geneid}) if not targets: notfnd.add("%s|%s"%(sym,geneid)) continue target = targets[0] pid = target['components']['protein'][0]['id'] rv = dba.ins_xref({'protein_id': pid, 'xtype': 'L1000 ID', 'dataset_id': dataset_id, 'value': l1000}) if rv: xref_ct += 1 pmark[pid] = True else: dba_err_ct += 1 pbar.finish() for k in notfnd: logger.warn("No target found for {}".format(k)) print "{} rows processed.".format(ct) print " Inserted {} new L1000 ID xref rows for {} proteins.".format(xref_ct, len(pmark)) if len(notfnd) > 0: print "No target found for {} symbols/geneids. See logfile {} for details.".format(len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'HomoloGene', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/homologene', 'comments': 'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{'dataset_id': dataset_id, 'table_name': 'homology'}] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] infile = DOWNLOAD_DIR + FILENAME line_ct = slmf.wcl(infile) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format(line_ct, infile) with open(infile, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 skip_ct = 0 hom_ct = 0 nf_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 pbar.update(ct) # homologene_group_id tax_id ncbi_gene_id symbol protein_gi ref_seq taxid = int(row[1]) if taxid not in TAXIDS: skip_ct += 1 continue if taxid == 9606: targets = dba.find_targets({'geneid': row[2]}) if not targets: nf_ct += 1 logger.warn("No target found for {}".format(row)) continue for t in targets: p = t['components']['protein'][0] rv = dba.ins_homologene({ 'protein_id': p['id'], 'groupid': row[0], 'taxid': taxid }) if rv: hom_ct += 1 else: dba_err_ct += 1 else: nhproteins = dba.find_nhproteins({'geneid': row[2]}) if not nhproteins: nf_ct += 1 logger.warn("No nhprotein found for {}".format(row)) continue for nhp in nhproteins: rv = dba.ins_homologene({ 'nhprotein_id': nhp['id'], 'groupid': row[0], 'taxid': taxid }) if rv: hom_ct += 1 else: dba_err_ct += 1 pbar.finish() print "Processed {} lines.".format(ct) print "Loaded {} new homologene rows".format(hom_ct) print " Skipped {} non-Human/Mouse/Rat lines".format(skip_ct) if nf_ct > 0: print "WARNNING: No target/nhprotein found for {} lines. See logfile {} for details.".format( nf_ct, logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
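# TAXIDS is referenced by the HomoloGene loader above but not defined in this section.
# Given the dataset comment ("Only Human, Mouse and Rat members of HomoloGene groups are
# loaded"), it is presumably a collection like the following (assumption):
#   TAXIDS = [9606, 10090, 10116]  # Human, Mouse, Rat NCBI Taxonomy IDs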
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset exp_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Expression', 'source': 'File Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full', 'comments': 'Qualitative expression values are generated by the loading app.'} ) assert exp_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) cpt_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Compartments', 'source': 'File Table S6 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full'} ) assert cpt_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance provs = [ {'dataset_id': exp_dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HCA RNA'", 'comment': 'TPM and qualitative expression values are derived from file Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1'}, {'dataset_id': cpt_dataset_id, 'table_name': 'compartment', 'where_clause': "ctype = 'Human Cell Atlas'"} ] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. 
See logfile {} for details.".format(logfile) if not args['--quiet']: print "\nCalculating expression level percentiles" pctiles = calc_pctiles() pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] # # Expressions # line_ct = slmf.wcl(RNA_FILE) if not args['--quiet']: print "\nProcessing {} lines from HCA file {}".format(line_ct, RNA_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 k2pids = defaultdict(list) notfnd = set() dba_err_ct = 0 pmark = {} exp_ct = 0 with open(RNA_FILE, 'rU') as csvfile: csvreader = csv.reader(csvfile) header = csvreader.next() for row in csvreader: ct += 1 pbar.update(ct) sym = row[1] ensg = row[0] k = "%s|%s"%(sym,ensg) if k in k2pids: # we've already found it pids = k2pids[k] elif k in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets({'sym': sym}, False) if not targets: targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False) if not targets: notfnd.add(k) continue pids = [] for t in targets: pids.append(t['components']['protein'][0]['id']) k2pids[k] = pids for pid in pids: cell_lines = [c.replace(' (TPM)', '') for c in header[2:]] for (i,cl) in enumerate(cell_lines): tpm_idx = i + 2 # add two because row has ENSG and Gene at beginning tpm = float(row[tpm_idx]) qv = calc_qual_value( tpm, pctiles[cl] ) rv = dba.ins_expression( {'protein_id': pid, 'etype': 'HCA RNA', 'tissue': 'Cell Line '+cl, 'qual_value': qv, 'number_value': tpm} ) if not rv: dba_err_ct += 1 continue exp_ct += 1 pmark[pid] = True pbar.finish() for k in notfnd: logger.warn("No target found for {}".format(k)) print "Processed {} lines.".format(ct) print " Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark)) if notfnd: print " No target found for {} Symbols/ENSGs. See logfile {} for details".format(len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format(dba_err_ct, logfile) # # Compartments # line_ct = slmf.wcl(LOC_FILE) if not args['--quiet']: print "\nProcessing {} lines from HCA file {}".format(line_ct, LOC_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 k2pids = defaultdict(list) notfnd = set() dba_err_ct = 0 pmark = {} cpt_ct = 0 with open(LOC_FILE, 'rU') as csvfile: csvreader = csv.reader(csvfile) header = csvreader.next() for row in csvreader: ct += 1 pbar.update(ct) uniprot = row[2] sym = row[1] k = "%s|%s"%(uniprot,sym) if k in k2pids: # we've already found it pids = k2pids[k] elif k in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets({'uniprot': uniprot}, False) if not targets: targets = dba.find_targets({'sym': sym}, False) if not targets: notfnd.add(k) continue pids = [] for t in targets: pids.append(t['components']['protein'][0]['id']) k2pids[k] = pids for pid in pids: compartments = [c for c in header[3:-5]] for (i,c) in enumerate(compartments): val_idx = i + 3 # add three because row has ENSG,Gene,Uniprot at beginning val = int(row[val_idx]) if val == 0: continue rel = row[-5] if rel == 'Uncertain': continue rv = dba.ins_compartment( {'protein_id': pid, 'ctype': 'Human Cell Atlas', 'go_id': COMPARTMENTS[c][1], 'go_term': COMPARTMENTS[c][0], 'reliability': rel} ) if not rv: dba_err_ct += 1 continue cpt_ct += 1 pmark[pid] = True pbar.finish() print "Processed {} lines.".format(ct) print " Inserted {} new compartment rows for {} protein.s".format(cpt_ct, len(pmark)) if notfnd: print " No target found for {} UniProts/Symbols. See logfile {} for details".format(len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
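# calc_pctiles() and calc_qual_value() are used by the Human Cell Atlas loader above but
# are not defined in this section. The dataset comment says qualitative expression values
# are generated by the loading app; a minimal sketch of such a percentile-binning helper
# is given below. The bin boundaries and labels here are assumptions, not the loader's
# actual rules, and the name calc_qual_value_sketch is hypothetical.
def calc_qual_value_sketch(tpm, pctiles):
  """Map a TPM value to a qualitative level using per-cell-line percentile cutoffs."""
  lo, hi = pctiles  # assumed to be (lower percentile, upper percentile) for one cell line
  if tpm == 0:
    return 'Not detected'
  elif tpm < lo:
    return 'Low'
  elif tpm < hi:
    return 'Medium'
  else:
    return 'High'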
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Drug Target Ontology IDs and Classifications', 'source': 'Files %s from Schurer Group' % (", ".join(SRC_FILES)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://drugtargetontology.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'dtoid' }, { 'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'dtoclass' }] #{'dataset_id': dataset_id, 'table_name': 'dto'} ] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] line_ct = slmf.wcl(MAPPING_FILE) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, MAPPING_FILE) logger.info("Processing {} input lines in file {}".format( line_ct, MAPPING_FILE)) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() up2dto = {} up2pid = {} ct = 0 with open(MAPPING_FILE, 'rU') as csvfile: csvreader = csv.reader(csvfile) header = csvreader.next() # skip header line ct += 1 upd_ct = 0 notfnd = set() dba_err_ct = 0 for row in csvreader: ct += 1 dtoid = row[0] up = row[1] logger.info("Searching for UniProt: {}".format(up)) targets = dba.find_targets({'uniprot': up}) if not targets: notfnd.add(up) continue t = targets[0] pid = t['components']['protein'][0]['id'] rv = dba.upd_protein(pid, 'dtoid', dtoid) if rv: upd_ct += 1 up2dto[up] = dtoid up2pid[up] = pid else: dba_err_ct += 1 pbar.update(ct) pbar.finish() for up in notfnd: logger.warn("No target found for UniProt: {}".format(up)) print "{} lines processed.".format(ct) print " Updated {} protein.dtoid values".format(upd_ct) print "Got {} UniProt to DTO mappings for TCRD targets".format(len(up2dto)) print "Got {} UniProt to Protein ID mappings for TCRD targets".format( len(up2pid)) if notfnd: print "WARNING: No target found for {} UniProts. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) # Classifications line_ct = slmf.wcl(CLASS_FILE) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, CLASS_FILE) logger.info("Processing {} input lines in file {}".format( line_ct, CLASS_FILE)) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 dto_mark = {} with open(CLASS_FILE) as csvfile: csvreader = csv.reader(csvfile) header = csvreader.next() # skip header line ct += 1 upd_ct = 0 notfnd = set() dba_err_ct = 0 for row in csvreader: ct += 1 up = row[0] dto_class = row[1] if up not in up2pid: notfnd.add(up) continue pid = up2pid[up] rv = dba.upd_protein(pid, 'dtoclass', dto_class) if rv: upd_ct += 1 else: dba_err_ct += 1 # if dto_class in dto_mark: # # we've already loaded this term/tree # continue # term_tree = extract_tree(row) # rv = dba.ins_dto({'id': dtoid, 'name': dtoname, 'parent': leaf_term_parent_id}) # if rv: # dto_mark[dtoid] = True # else: # dba_err_ct += 1 pbar.update(ct) pbar.finish() for up in notfnd: logger.warn("UniProt {} not in map.".format(up)) print "{} lines processed.".format(ct) print " Updated {} protein.dtoclass values".format(upd_ct) if notfnd: print "WARNING: Got {} unmapped UniProts. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
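# Example only: the load() functions above and below all take a docopt-style option dict.
# A hand-built stand-in for ad-hoc testing might look like this (hypothetical values; the
# real CLI parsing lives elsewhere in each script and may define additional options):
EXAMPLE_ARGS = {
    '--dbhost': 'localhost',   # assumed default; real deployments may differ
    '--dbname': 'tcrd',
    '--logfile': None,         # None falls back to the module-level LOGFILE
    '--loglevel': '20',        # parsed with int(); 20 == logging.INFO
    '--debug': False,
    '--quiet': True,
}
# load(EXAMPLE_ARGS)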
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'IMPC Mouse Clones', 'source': "File %s obtained directly from Terry Meehan/Alba Gomez at EBI." % os.path.basename(IMPC_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.mousephenotype.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'IMPC Clones'" }, { 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'IMPC Status'" }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] line_ct = slmf.wcl(IMPC_FILE) if not args['--quiet']: print "\nProcessing {} rows from input file {}".format( line_ct, IMPC_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 skip_ct = 0 notfnd = set() ti1_ct = 0 ti2_ct = 0 dba_err_ct = 0 with open(IMPC_FILE, 'rU') as csvfile: csvreader = csv.DictReader(csvfile) for d in csvreader: # Gene,MGI Accession,Public IDG,Public CMG Tier1,Public CMG Tier 2,Number of notifications,Status,# Clones,Non-Assigned Plans,Assigned plans,Aborted MIs,MIs in Progress,GLT Mice,Private ct += 1 sym = d['Gene'].upper() targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets_by_xref({ 'xtype': 'MGI ID', 'value': d['MGI Accession'] }) if not targets: k = "%s,%s" % (d['Gene'], d['MGI Accession']) notfnd.add(k) continue if not d['Status'] and not d['# Clones']: skip_ct += 1 continue tids = list() for t in targets: pid = t['components']['protein'][0]['id'] if not d['Status']: status = '?' else: status = d['Status'] rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'IMPC Status', 'string_value': status }) if rv: ti1_ct += 1 else: dba_err_ct += 1 if not d['# Clones']: continue rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'IMPC Clones', 'string_value': d['# Clones'] }) if rv: ti2_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() for k in notfnd: logger.warn("No target found for: {}".format(k)) if not args['--quiet']: print "{} rows processed.".format(ct) print "Inserted {} new 'IMPC Status' tdl_info rows".format(ti1_ct) print "Inserted {} new 'IMPC Clones' tdl_info rows".format(ti2_ct) print "Skipped {} rows with no relevant info".format(skip_ct) if notfnd: print "No target found for {} rows. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Transcription Factor Flags', 'source': BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://humantfs.ccbr.utoronto.ca/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Is Transcription Factor'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0} ifn = DOWNLOAD_DIR + FILENAME line_ct = slmf.wcl(ifn) if not args['--quiet']: print "\nProcessing {} lines in input file {}".format(line_ct, ifn) with open(ifn, 'rU') as ifh: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() csvreader = csv.reader(ifh) header = csvreader.next() # skip header line ct = 0 ti_ct = 0 skip_ct = 0 notfnd = set() dba_err_ct = 0 for row in csvreader: # 0 Ensembl ID # 1 HGNC symbol # 2 DBD # 3 Is TF? # 4 TF assessment # 5 Binding mode,Motif status # 6 Final Notes # 7 Final Comments # 8 Interpro ID(s) # 9 EntrezGene ID # 10 EntrezGene Description # 11 PDB ID # 12 TF tested by HT-SELEX? # 13 TF tested by PBM? # 14 Conditional Binding Requirements # 15 Original Comments # 16 Vaquerizas 2009 classification # 17 CisBP considers it a TF? # 18 TFCat classification # 19 Is a GO TF? # 20 Initial assessment # 21 Curator 1 # 22 Curator 2 # 23 TFclass considers ct += 1 if row[3] != 'Yes': skip_ct += 1 continue sym = row[1] targets = dba.find_targets({'sym': sym}) if not targets: gid = row[9] if gid != 'None' and not gid.startswith('IPR'): targets = dba.find_targets({'geneid': gid}) if not targets: ensg = row[0] targets = dba.find_targets_by_xref({ 'xtype': 'Ensembl', 'value': ensg }) if not targets: k = "%s|%s|%s" % (sym, gid, ensg) notfnd.add(k) continue t = targets[0] TDLs[t['tdl']] += 1 pid = t['components']['protein'][0]['id'] rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'Is Transcription Factor', 'boolean_value': 1 }) if rv: ti_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() for k in notfnd: logger.warn("No target found for {}".format(k)) print "\n{} lines processed.".format(ct) print " Inserted {} new 'Is Transcription Factor' tdl_infos".format(ti_ct) print " Skipped {} non-TF lines".format(skip_ct) if notfnd: print "No target found for {} symbols/geneids/ENSGs. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']: print "%s: %d" % (tdl, TDLs[tdl])
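# The transcription-factor loader above (like several others) falls back through successive
# lookups: symbol, then NCBI Gene ID, then an Ensembl xref. A minimal, generic sketch of that
# fallback chain (hypothetical helper; the real code inlines the pattern):
def find_targets_with_fallback(dba, queries):
    """queries is a list of (method_name, query_dict) tuples tried in order; returns the
    first non-empty target list, or None if nothing matches."""
    for method_name, q in queries:
        targets = getattr(dba, method_name)(q)
        if targets:
            return targets
    return None

# Usage mirroring the loader above:
# targets = find_targets_with_fallback(dba, [
#     ('find_targets', {'sym': sym}),
#     ('find_targets', {'geneid': gid}),
#     ('find_targets_by_xref', {'xtype': 'Ensembl', 'value': ensg})])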
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Cell Surface Protein Atlas', 'source': 'Worksheet B in S1_File.xlsx from http://wlab.ethz.ch/cspa/#downloads, converted to CSV', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://wlab.ethz.ch/cspa' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'Cell Surface Protein Atlas'", 'comment': 'Only high confidence values are loaded.' }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] line_ct = slmf.wcl(INFILE) if not args['--quiet']: print "\nProcessing {} lines from CSPA file {}".format(line_ct, INFILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 k2pids = defaultdict(list) notfnd = set() skip_ct = 0 dba_err_ct = 0 pmark = {} exp_ct = 0 with open(INFILE, 'rU') as csvfile: csvreader = csv.reader(csvfile) header = csvreader.next() for row in csvreader: ct += 1 pbar.update(ct) if row[2] != '1 - high confidence': skip_ct += 1 continue uniprot = row[1] geneid = row[4] k = "%s|%s" % (uniprot, geneid) if k in k2pids: # we've already found it pids = k2pids[k] elif k in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets({'uniprot': uniprot}, False) if not targets: targets = dba.find_targets({'geneid': geneid}, False) if not targets: notfnd.add(k) continue pids = [] for t in targets: pids.append(t['components']['protein'][0]['id']) for pid in pids: cell_lines = [ c for c in header[6:-1] ] # there's a blank field at the end of the header line for (i, cl) in enumerate(cell_lines): val_idx = i + 6 # add six because row has other values at beginning if not row[val_idx]: continue rv = dba.ins_expression({ 'protein_id': pid, 'etype': 'Cell Surface Protein Atlas', 'tissue': 'Cell Line ' + cl, 'boolean_value': True }) if not rv: dba_err_ct += 1 continue exp_ct += 1 pmark[pid] = True pbar.finish() for k in notfnd: logger.warn("No target found for {}".format(k)) print "Processed {} CSPA lines.".format(ct) print " Inserted {} new expression rows for {} proteins.".format( exp_ct, len(pmark)) print " Skipped {} non-high confidence rows".format(skip_ct) if notfnd: print " No target found for {} UniProts/GeneIDs. See logfile {} for details".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
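# The CSPA loader above (and the HCA, HPM and LINCS loaders) resolves the same UniProt/GeneID
# key repeatedly, so it caches key -> protein id lists and remembers misses. A minimal, generic
# sketch of that memoisation pattern (hypothetical helper; the real code inlines it):
def resolve_once(key, lookup_fn, cache, notfnd):
    """Return cached protein ids for key, calling lookup_fn(key) at most once per key.
    cache is a dict of key -> [protein ids]; notfnd is a set of keys known to have no target."""
    if key in cache:
        return cache[key]
    if key in notfnd:
        return None
    pids = lookup_fn(key)
    if not pids:
        notfnd.add(key)
        return None
    cache[key] = pids
    return pids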
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'HGNC', 'source': 'Custom download file from https://www.genenames.org/download/custom/', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.genenames.org/', 'comments': 'File downloaded with the following column data: HGNC ID Approved symbol Approved name Status UniProt ID NCBI Gene ID Mouse genome database ID' }) if not dataset_id: print "WARNING: Error inserting dataset See logfile {} for details.".format( logfile) sys.exit(1) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'sym', 'comment': "This is only updated with HGNC data if data from UniProt is absent." }, { 'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'geneid', 'comment': "This is only updated with HGNC data if data from UniProt is absent." }, { 'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. 
See logfile {} for details.".format( logfile) sys.exit(1) line_ct = slmf.wcl(HGNC_TSV_FILE) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 tmark = {} hgnc_ct = 0 mgi_ct = 0 sym_ct = 0 symdiscr_ct = 0 geneid_ct = 0 geneiddiscr_ct = 0 nf_ct = 0 db_err_ct = 0 with open(HGNC_TSV_FILE, 'rU') as ifh: tsvreader = csv.reader(ifh, delimiter='\t') header = tsvreader.next() # skip header line ct += 1 for row in tsvreader: # 0: HGNC ID # 1: Approved symbol # 2: Approved name # 3: Status # 4: UniProt ID # 5: NCBI Gene ID # 6: Mouse genome database ID ct += 1 pbar.update(ct) sym = row[1] geneid = row[5] up = row[4] targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets({'geneid': geneid}) if not targets: targets = dba.find_targets({'uniprot': up}) if not targets: nf_ct += 1 #logger.warn("No target found for {}|{}|{}".format(sym, geneid, up)) continue for t in targets: p = t['components']['protein'][0] pid = p['id'] tmark[pid] = True # HGNC xref rv = dba.ins_xref({ 'protein_id': pid, 'xtype': 'HGNC', 'dataset_id': dataset_id, 'value': row[0] }) if rv: hgnc_ct += 1 else: db_err_ct += 1 # MGI xref rv = dba.ins_xref({ 'protein_id': pid, 'xtype': 'MGI ID', 'dataset_id': dataset_id, 'value': row[6] }) if rv: mgi_ct += 1 else: db_err_ct += 1 # Add missing syms if p['sym'] == None: rv = dba.upd_protein(pid, 'sym', sym) if rv: logger.info( "Inserted new sym {} for protein {}, {}".format( sym, pid, p['uniprot'])) sym_ct += 1 else: db_err_ct += 1 else: # Check for symbol discrepancies if p['sym'] != sym: logger.warn("Symbol discrepancy: UniProt=%s, HGNC=%s" % (p['sym'], sym)) symdiscr_ct += 1 if geneid: # Add missing geneids if p['geneid'] == None: rv = dba.upd_protein(pid, 'geneid', geneid) if rv: logger.info( "Inserted new geneid {} for protein {}, {}". format(geneid, pid, p['uniprot'])) geneid_ct += 1 else: db_err_ct += 1 else: # Check for geneid discrepancies if p['geneid'] != int(geneid): logger.warn( "GeneID discrepancy: UniProt={}, HGNC={}". format(p['geneid'], geneid)) geneiddiscr_ct += 1 pbar.finish() print "Processed {} lines - {} targets annotated.".format(ct, len(tmark)) print "No target found for {} lines.".format(nf_ct) print " Inserted {} HGNC ID xrefs".format(hgnc_ct) print " Inserted {} MGI ID xrefs".format(mgi_ct) if sym_ct > 0: print " Added {} new HGNC symbols".format(sym_ct) if symdiscr_ct > 0: print "WARNING: {} discrepant HGNC symbols. See logfile {} for details".format( symdiscr_ct, logfile) if geneid_ct > 0: print " Added {} new NCBI Gene IDs".format(geneid_ct) if geneiddiscr_ct > 0: print "WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format( geneiddiscr_ct, logfile) if db_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( db_err_ct, logfile)
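# Sketch (hypothetical helper) of the HGNC reconciliation rule used above: a protein field is
# only filled in when UniProt left it empty, and a discrepancy is logged but never overwritten.
# Assumes the dba.upd_protein(pid, column, value) call shown above; comparison is done on
# string forms, which is a simplification of the int() handling for geneid.
def reconcile_field(dba, logger, protein, field, hgnc_value):
    """Return 'updated', 'db_error', 'discrepant' or 'ok' for one protein field."""
    current = protein.get(field)
    if current is None:
        if dba.upd_protein(protein['id'], field, hgnc_value):
            return 'updated'
        return 'db_error'
    if str(current) != str(hgnc_value):
        logger.warn("%s discrepancy for protein %s: UniProt=%s, HGNC=%s"
                    % (field, protein['id'], current, hgnc_value))
        return 'discrepant'
    return 'ok'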
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # the following maps Monarch's tcrdmatches_full.subject to TCRD's ortholog.id # ie. 'MGI:1347010' => 156650 ortho2id = dba.get_orthologs_dbid2id() if not args['--quiet']: print "\nGot {} orthologs from TCRD".format(len(ortho2id)) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Monarch Ortholog Disease Associations', 'source': 'UMiami Monarch MySQL database on AWS server.', 'app': PROGRAM, 'app_version': __version__, 'comments': "Monarch database contact: John Turner <*****@*****.**>" }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'ortholog_disease', 'comment': "" }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) # if not args['--quiet']: # print "\nConnecting to UMiami Monarch database." # monarchdb = mysql.connect(host=MONARCH_DB_HOST, port=MONARCH_DB_PORT, db=MONARCH_DB_NAME, # user=MONARCH_DB_USER, passwd=MONARCH_DB_PW) # assert monarchdb, "ERROR connecting to Monarch database." 
# monarch_odas = [] # with closing(monarchdb.cursor(mysql.cursors.DictCursor)) as curs: # curs.execute(SQLq) # for d in curs: # monarch_odas.append(d) # if not args['--quiet']: # print " Got {} ortholog disease records from Monarch database.".format(len(monarch_odas)) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] line_ct = slmf.wcl(FILENAME) logger.info("Processing {} lines in file {}".format(line_ct, FILENAME)) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, FILENAME) pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() with open(FILENAME, 'rU') as ifh: csvreader = csv.reader(ifh) ct = 0 od_ct = 0 notfnd = set() ortho_notfnd = set() pmark = {} dba_err_ct = 0 for row in csvreader: # HGNC Sym, UniProt, name, did, score, Ortholog TaxID, Ortholog Species, Ortholog DBID, Ortholog GeneID, Ortholog Symbol ct += 1 up = row[1] sym = row[0] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'sym': sym}) if not targets: k = "%s|%s" % (up, sym) notfnd.add(k) logger.warn("No target found for {}".format(k)) continue ortholog = dba.get_ortholog({'symbol': row[9], 'taxid': row[5]}) if not ortholog: ortholog = dba.get_ortholog({ 'geneid': row[8], 'taxid': row[5] }) if not ortholog: k = "%s|%s|%s" % (row[9], row[8], row[5]) ortho_notfnd.add(k) logger.warn("No ortholog found for {}".format(k)) continue for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True rv = dba.ins_ortholog_disease({ 'protein_id': p['id'], 'dtype': 'Monarch', 'ortholog_id': ortholog['id'], 'name': row[2], 'did': row[3], 'score': row[4] }) if not rv: dba_err_ct += 1 continue od_ct += 1 pbar.update(ct) pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print " Inserted {} new ortholog_disease rows for {} proteins.".format( od_ct, len(pmark)) if notfnd: print "WARNING: No target found for {} UniProts/symbols. See logfile {} for details.".format( len(notfnd), logfile) if ortho_notfnd: print "WARNING: No ortholog found for {} symbols/geneids. See logfile {} for details.".format( len(ortho_notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging when debug is 0 fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'JAX/MGI Mouse/Human Orthology Phenotypes', 'source': 'File %s from ftp.informatics.jax.org' % PT_FILE, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.informatics.jax.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'JAX/MGI Human Ortholog Phenotyp'" }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) if not args['--quiet']: print "\nParsing Mammalian Phenotype Ontology file {}".format( DOWNLOAD_DIR + MPO_OWL_FILE) mpo = parse_mp_owl(MPO_OWL_FILE) if not args['--quiet']: print "Got {} MP terms".format(len(mpo)) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] fn = DOWNLOAD_DIR + PT_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines from input file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pt_ct = 0 skip_ct = 0 pmark = {} notfnd = set() dba_err_ct = 0 for row in tsvreader: ct += 1 if not row[6] or row[6] == '': skip_ct += 1 continue sym = row[0] geneid = row[1] k = "%s|%s" % (sym, geneid) if k in notfnd: continue targets = dba.find_targets({'sym': sym}, idg=False) if not targets: targets = dba.find_targets({'geneid': geneid}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue for t in targets: pid = t['components']['protein'][0]['id'] pmark[pid] = True for mpid in row[6].split(): rv = dba.ins_phenotype({ 'protein_id': pid, 'ptype': 'JAX/MGI Human Ortholog Phenotype', 'term_id': mpid, 'term_name': mpo[mpid]['name'] }) if rv: pt_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Loaded {} new phenotype rows for {} proteins".format( pt_ct, len(pmark.keys())) print " Skipped {} lines with no MP terms".format(skip_ct) if notfnd: print "No target found for {} gene symbols/ids. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
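# Minimal sketch of the per-row fan-out in the JAX/MGI loader above: row[6] of the report holds
# a space-separated list of MP term IDs, and `mpo` (built by parse_mp_owl) maps each ID to a
# dict with at least a 'name' key. Hypothetical helper for illustration only; the loader above
# assumes mpo is complete, whereas this sketch skips unknown IDs.
def row_to_phenotypes(row, mpo):
    """Yield one phenotype dict per MP ID in row[6]; protein_id is filled in by the caller."""
    for mpid in row[6].split():
        term = mpo.get(mpid)
        if not term:
            continue
        yield {'ptype': 'JAX/MGI Human Ortholog Phenotype',
               'term_id': mpid,
               'term_name': term['name']}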
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'PANTHER protein classes', 'source': 'File %s from ftp://ftp.pantherdb.org//sequence_classifications/current_release/PANTHER_Sequence_Classification_files/, and files %s and %s from http://data.pantherdb.org/PANTHER14.1/ontology/' % (os.path.basename(P2PC_FILE), os.path.basename(CLASS_FILE), os.path.basename(RELN_FILE)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.pantherdb.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'panther_class' }, { 'dataset_id': dataset_id, 'table_name': 'p2pc' }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) relns = {} line_ct = slmf.wcl(RELN_FILE) if not args['--quiet']: print "\nProcessing {} lines in relationships file {}".format( line_ct, RELN_FILE) with open(RELN_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 pcid = row[0] parentid = row[2] if pcid in relns: relns[pcid].append(parentid) else: relns[pcid] = [parentid] print "{} input lines processed.".format(ct) print " Got {} PANTHER Class relationships".format(len(relns)) pc2dbid = {} line_ct = slmf.wcl(CLASS_FILE) if not args['--quiet']: print "\nProcessing {} lines in class file {}".format( line_ct, CLASS_FILE) with open(CLASS_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pc_ct = 0 pcmark = {} dba_err_ct = 0 for row in tsvreader: ct += 1 pc = row[0] init = {'pcid': pc, 'name': row[2]} if row[3]: init['desc'] = row[3] if pc in relns: init['parent_pcids'] = "|".join(relns[pc]) # there are duplicates in this file too, so only insert if we haven't if pc not in pcmark: rv = dba.ins_panther_class(init) if rv: pc_ct += 1 else: dba_err_ct += 1 pc2dbid[pc] = rv pcmark[pc] = True print "{} lines processed.".format(ct) print " Inserted {} new panther_class rows".format(pc_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile)
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    line_ct = slmf.wcl(P2PC_FILE)
    regex = re.compile(r'#(PC\d{5})')
    if not args['--quiet']:
        print "\nProcessing {} lines in classification file {}".format(line_ct, P2PC_FILE)
    with open(P2PC_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        pmark = {}
        p2pc_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            [sp, hgnc, up] = row[0].split('|')
            up = up.replace('UniProtKB=', '')
            hgnc = hgnc.replace('HGNC=', '')
            if not row[8]:
                skip_ct += 1
                continue
            #print "[DEBUG] searching by uniprot", up
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                #print "[DEBUG] searching by HGNC xref", hgnc
                targets = dba.find_targets_by_xref({'xtype': 'HGNC', 'value': hgnc})
                if not targets:
                    k = "%s|%s" % (up, hgnc)
                    notfnd.add(k)
                    continue
            t = targets[0]
            pid = t['components']['protein'][0]['id']
            pmark[pid] = True
            #print "[DEBUG] PCs:", row[8]
            for pc in regex.findall(row[8]):
                #print "[DEBUG] ", pc
                pcid = pc2dbid[pc]
                rv = dba.ins_p2pc({'protein_id': pid, 'panther_class_id': pcid})
                if rv:
                    p2pc_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print "{} lines processed.".format(ct)
    print " Inserted {} new p2pc rows for {} distinct proteins".format(p2pc_ct, len(pmark))
    print " Skipped {} rows without PCs".format(skip_ct)
    if notfnd:
        print "No target found for {} UniProt/HGNCs. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
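# Sketch of the two PANTHER parsing steps used above, for illustration only: PC identifiers are
# pulled out of the classification column (row[8] above) with the regex r'#(PC\d{5})', and each
# class's parents (from the relationship file) are stored as a '|'-joined string.
import re

PC_REGEX = re.compile(r'#(PC\d{5})')

def extract_pcids(class_field):
    """Return the PANTHER class IDs embedded in a classification field like '...#PC00012;...'."""
    return PC_REGEX.findall(class_field)

def parent_pcids_string(pcid, relns):
    """relns maps a PC ID to a list of parent PC IDs; returns the '|'-joined form stored in TCRD."""
    return "|".join(relns.get(pcid, []))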
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) # DBAdaptor uses same logger as main() dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Drug Central', 'source': "Drug Central files download files: %s" % ", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://drugcentral.org/' }) if not dataset_id: print "WARNING: Error inserting dataset. See logfile {} for details.".format( logfile) sys.exit(1) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'drug_activity' }, { 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'DrugCentral Indication'" }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. See logfile {} for details.".format( logfile) sys.exit(1) # First get mapping of DrugCentral names to ids name2id = {} line_ct = slmf.wcl(NAME_ID_FILE) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format( line_ct, NAME_ID_FILE) with open(NAME_ID_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): continue name2id[row[0]] = row[1].replace("\n", '') print "{} input lines processed.".format(ct) print "Saved {} keys in infos map".format(len(name2id)) # Next get drug info fields infos = {} line_ct = slmf.wcl(DRUGINFO_FILE) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format( line_ct, DRUGINFO_FILE) with open(DRUGINFO_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): continue infos[row[0]] = row[1].replace("\n", '') print "{} input lines processed.".format(ct) print "Saved {} keys in infos map".format(len(infos)) # # MOA activities # drug2tids = defaultdict(list) line_ct = slmf.wcl(TCLIN_FILE) line_ct -= 1 if not args['--quiet']: print "\nProcessing {} lines from DrugDB MOA activities file {}".format( line_ct, TCLIN_FILE) with open(TCLIN_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id ct = 0 da_ct = 0 err_ct = 0 notfnd = [] dba_err_ct = 0 for row in tsvreader: ct += 1 up = row[0] sp = row[1] drug = row[2] if drug not in name2id: err_ct += 1 logger.warn("No DrugCentral id found for {}".format(drug)) continue dcid = name2id[drug] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'name': sp}) if not targets: notfnd.append(up) continue tid = targets[0]['id'] drug2tids[drug].append(tid) init = { 'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 1, 'source': row[5] 
} if row[3]: init['act_value'] = row[3] if row[4]: init['act_type'] = row[4] if row[5]: init['action_type'] = row[5] if row[6]: init['source'] = row[6] if row[7]: init['reference'] = row[7] if row[8]: init['smiles'] = row[8] if row[9]: init['cmpd_chemblid'] = row[9] if drug in infos: init['nlm_drug_info'] = infos[drug] rv = dba.ins_drug_activity(init) if rv: da_ct += 1 else: dba_err_ct += 1 print "{} DrugCentral Tclin rows processed.".format(ct) print " Inserted {} new drug_activity rows".format(da_ct) if len(notfnd) > 0: print "WARNNING: {} Uniprot/Swissprot Accessions NOT FOUND in TCRD:".format( len(notfnd)) for up in notfnd: print up if err_ct > 0: print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format( err_ct, logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) # # Non-MOA activities # line_ct = slmf.wcl(TCHEM_FILE) line_ct -= 1 if not args['--quiet']: print "\nProcessing {} lines from Non-MOA activities file {}".format( line_ct, TCHEM_FILE) with open(TCHEM_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id ct = 0 da_ct = 0 err_ct = 0 notfnd = [] dba_err_ct = 0 for row in tsvreader: ct += 1 up = row[0] sp = row[1] drug = row[2] if drug not in name2id: err_ct += 1 logger.warn("No DrugCentral id found for {}".format(drug)) continue dcid = name2id[drug] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'name': sp}) if not targets: notfnd.append(up) continue tid = targets[0]['id'] drug2tids[drug].append(tid) init = { 'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 0, 'source': row[5] } if row[3]: init['act_value'] = row[3] if row[4]: init['act_type'] = row[4] if row[5]: init['action_type'] = row[5] if row[6]: init['source'] = row[6] if row[7]: init['reference'] = row[7] if row[8]: init['smiles'] = row[8] if row[9]: init['chemblid'] = row[9] if drug in infos: init['nlm_drug_info'] = infos[drug] rv = dba.ins_drug_activity(init) if rv: da_ct += 1 else: dba_err_ct += 1 print "{} DrugCentral Tchem rows processed.".format(ct) print " Inserted {} new drug_activity rows".format(da_ct) if len(notfnd) > 0: print "WARNNING: {} DrugDB Uniprot Accessions NOT FOUND in TCRD:".format( len(notfnd)) for up in notfnd: print up if err_ct > 0: print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format( err_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile)
    #
    # Indications (diseases)
    #
    line_ct = slmf.wcl(DRUGIND_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from indications file {}".format(line_ct, DRUGIND_FILE)
    with open(DRUGIND_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # DRUG_ID  DRUG_NAME  INDICATION_FDB  UMLS_CUI  SNOMEDCT_CUI  DOID
        ct = 0
        t2d_ct = 0
        notfnd = {}
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            drug = row[1]
            if drug not in drug2tids:
                notfnd[drug] = True
                continue
            init = {'dtype': 'DrugCentral Indication', 'name': row[2], 'drug_name': drug}
            if row[5] != '':
                init['did'] = row[5]
            for tid in drug2tids[drug]:
                # NB: Using target_id as protein_id works for now, but will not if/when we have multiple protein targets
                init['protein_id'] = tid
                rv = dba.ins_disease(init)
                if rv:
                    t2d_ct += 1
                else:
                    dba_err_ct += 1
    print "{} DrugCentral indication rows processed.".format(ct)
    print " Inserted {} new disease rows".format(t2d_ct)
    if notfnd:
        print "WARNING: {} drugs NOT FOUND in activity files:".format(len(notfnd))
        for drug in notfnd:
            print drug
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
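# Simplified sketch of the optional-column handling in the DrugCentral activity loaders above:
# columns 3-9 of the activity TSVs are copied into the drug_activity dict only when non-empty.
# Field names follow the MOA (Tclin) block above; the helper name and signature are illustrative.
OPTIONAL_ACT_FIELDS = [(3, 'act_value'), (4, 'act_type'), (5, 'action_type'),
                       (6, 'source'), (7, 'reference'), (8, 'smiles'), (9, 'cmpd_chemblid')]

def build_drug_activity(row, tid, dcid, has_moa, nlm_info=None):
    """Assemble one drug_activity dict from an activity-file row, skipping empty columns."""
    init = {'target_id': tid, 'drug': row[2], 'dcid': dcid, 'has_moa': has_moa}
    for idx, field in OPTIONAL_ACT_FIELDS:
        if row[idx]:
            init[field] = row[idx]
    if nlm_info:
        init['nlm_drug_info'] = nlm_info
    return init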
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Guide to Pharmacology', 'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.guidetopharmacology.org/' }) if not dataset_id: print "WARNING: Error inserting dataset See logfile %s for details." % logfile sys.exit(1) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'cmpd_activity', 'where_clause': "ctype = 'Guide to Pharmacology'" }) if not rv: print "WARNING: Error inserting provenance. See logfile %s for details." % logfile sys.exit(1) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] fn = DOWNLOAD_DIR + L_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in input file {}".format(line_ct, fn) ligands = {} skip_ct = 0 with open(fn, 'rU') as ifh: csvreader = csv.reader(ifh) header = csvreader.next() # skip header line ct = 1 for row in csvreader: # These are the fields in version 2019.2 # 0 Ligand id The GtP ligand identifier # 1 Name The name of the ligand # 2 Species (Peptides) The species which endogenously express a particular peptide ligand sequence # 3 Type The type of chemical # 4 Approved The drug is or has in the past been approved for human clinical use by a regulatory agency # 5 Withdrawn The drug is no longer approved for its original clinical use in one or more countries # 6 Labelled The ligand has been labelled with a chemical group such as a fluorscent tag or unstable isotope # 7 Radioactive Ligand has been labelled with radioactive isotope # 8 PubChem SID The PubChem Substance identifier assigned when we deposited the ligand in PubChem # 9 PubChem CID Our curated PubChem Compound database link # 10 UniProt id (Peptides) The UniProtKB/SwissProt Accession for peptide sequences # 11 IUPAC name The IUPAC chemical name # 12 INN The International Non-proprietary Name assigned by the WHO # 13 Synonyms Commonly used synonyms from the literature # 14 SMILES Specification of the chemical structure in canonical, isomeric SMILES format # 15 InChIKey A hashed version of the full InChI designed for easy web searches of chemical compounds # 16 InChI A textual identifier for the chemical structure ct += 1 ligand_id = int(row[0]) ligand_type = row[3] if ligand_type == 'Antibody' or ligand_type == 'Peptide': skip_ct += 1 continue ligands[ligand_id] = { 'name': row[1], 'pubchem_cid': row[9], 'smiles': row[14] } if not args['--quiet']: print " Got info for {} ligands".format(len(ligands)) print " Skipped {} antibodies/peptides".format(skip_ct) # this dict will map uniprot|sym from 
interactions file to TCRD target(s) # so we only have to find target(s) once for each pair. k2ts = defaultdict(list) fn = DOWNLOAD_DIR + I_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in input file {}".format(line_ct, fn) with open(fn, 'rU') as ifh: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() csvreader = csv.reader(ifh) header = csvreader.next() # skip header line ct = 1 tmark = {} ca_ct = 0 ap_ct = 0 md_ct = 0 ba_ct = 0 notfnd = set() dba_err_ct = 0 for row in csvreader: # NB. these do NOT match the file descriptions on the site. This is directly from the header # 0 target # 1 target_id # 2 target_gene_symbol # 3 target_uniprot # 4 target_ensembl_gene_id # 5 target_ligand # 6 target_ligand_id # 7 target_ligand_gene_symbol # 8 target_ligand_ensembl_gene_id # 9 target_ligand_uniprot # 10 target_ligand_pubchem_sid # 11 target_species # 12 ligand # 13 ligand_id # 14 ligand_gene_symbol # 15 ligand_species # 16 ligand_pubchem_sid # 17 type # 18 action # 19 action_comment # 20 selectivity # 21 endogenous # 22 primary_target # 23 concentration_range # 24 affinity_units # 25 affinity_high # 26 affinity_median # 27 affinity_low # 28 original_affinity_units # 29 original_affinity_low_nm # 30 original_affinity_median_nm # 31 original_affinity_high_nm # 32 original_affinity_relation # 33 assay_description # 34 receptor_site # 35 ligand_context # 36 pubmed_id ct += 1 pbar.update(ct) lid = int(row[13]) if lid not in ligands: ap_ct += 1 continue if row[26] == '': # no activity value md_ct += 1 continue if '|' in row[3]: skip_ct += 1 continue val = "%.8f" % float(row[26]) act_type = row[28] up = row[3] sym = row[2] k = "%s|%s" % (up, sym) if k == '|': md_ct += 1 continue if k in k2ts: # already found target(s) ts = k2ts[k] elif k in notfnd: # already didn't find target(s) continue else: # lookup target(s) targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'sym': sym}) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue ts = [] for t in targets: ts.append({'id': t['id'], 'fam': t['fam']}) k2ts[k] = ts if row[36] and row[36] != '': pmids = row[36] else: pmids = None if ligands[lid]['pubchem_cid'] == '': pccid = None else: pccid = ligands[lid]['pubchem_cid'] for t in ts: if t['fam'] == 'GPCR': cutoff = 7.0 # 100nM elif t['fam'] == 'IC': cutoff = 5.0 # 10uM elif t['fam'] == 'Kinase': cutoff = 7.52288 # 30nM elif t['fam'] == 'NR': cutoff = 7.0 # 100nM else: cutoff = 6.0 # 1uM for non-IDG Family targets if val >= cutoff: # target is Tchem, save activity tmark[t['id']] = True rv = dba.ins_cmpd_activity({ 'target_id': t['id'], 'catype': 'Guide to Pharmacology', 'cmpd_id_in_src': lid, 'cmpd_name_in_src': ligands[lid]['name'], 'smiles': ligands[lid]['smiles'], 'act_value': val, 'act_type': act_type, 'pubmed_ids': pmids, 'cmpd_pubchem_cid': pccid }) if not rv: dba_err_ct += 1 continue ca_ct += 1 else: ba_ct += 1 pbar.finish() print "{} rows processed.".format(ct) print " Inserted {} new cmpd_activity rows for {} targets".format( ca_ct, len(tmark)) print " Skipped {} with below cutoff activity values".format(ba_ct) print " Skipped {} activities with multiple targets".format(skip_ct) print " Skipped {} antibody/peptide activities".format(ap_ct) print " Skipped {} activities with missing data".format(md_ct) if notfnd: print "No target found for {} uniprots/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile)
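# Sketch of the per-family activity cutoffs used in the Guide to Pharmacology loader above, with
# the comparison done on floats (the affinity_median column is -log10 molar, so 7.0 corresponds
# to 100 nM and 7.52288 to 30 nM). The cutoff values and 6.0 fallback mirror the code above;
# the helper itself is illustrative only.
FAMILY_CUTOFFS = {'GPCR': 7.0, 'IC': 5.0, 'Kinase': 7.52288, 'NR': 7.0}

def passes_cutoff(affinity_median, fam, default_cutoff=6.0):
    """Return True if the numeric -log10 activity meets the family-specific threshold."""
    return float(affinity_median) >= FAMILY_CUTOFFS.get(fam, default_cutoff)

# e.g. passes_cutoff('7.30000000', 'GPCR') -> True ; passes_cutoff('6.5', 'Kinase') -> False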
def calc_and_load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset( {'name': 'KEGG Distances', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Directed graphs are produced from KEGG pathway KGML files and all shortest path lengths are then calculated and stored.'} ) assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_distance'}) assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile) kgmls = get_kgmls(KGML_DIR) if not args['--quiet']: print "\nProcessing {} KGML files in {}".format(len(kgmls), KGML_DIR) logger.info("Processing {} KGML files in {}".format(len(kgmls), KGML_DIR)) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] pbar = ProgressBar(widgets=pbar_widgets, maxval=len(kgmls)).start() # All pathways shortest path lengths # (node1, node2) => distance all_pws_spls = {} ct = 0 err_ct = 0 for kgml in kgmls: logger.info(" Working on {}".format(kgml)) ct += 1 try: dig = kg.kgml_file_to_digraph(kgml) except: err_ct += 1 logger.error("Error parsing file: {}".format(kgml)) continue aspls = nx.all_pairs_shortest_path_length(dig) dct = 0 for source in aspls: for target in aspls[source]: if source == target: continue st = (source, target) if st in all_pws_spls: if aspls[source][target] < all_pws_spls[st]: all_pws_spls[st] = aspls[source][target] dct += 1 else: all_pws_spls[st] = aspls[source][target] dct += 1 logger.info(" {} has {} non-zero shortest path lengths".format(kgml, dct)) pbar.update(ct) pbar.finish() logger.info("Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls))) if not args['--quiet']: print " Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls)) if err_ct > 0: print "WARNNING: {} parsing errors occurred. 
See logfile {} for details.".format(err_ct, logfile) logger.info("Processing {} KEGG Distances".format(len(all_pws_spls))) if not args['--quiet']: print "\nProcessing {} KEGG Distances".format(len(all_pws_spls)) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] pbar = ProgressBar(widgets=pbar_widgets, maxval=len(all_pws_spls)).start() gid2pids = defaultdict(list) # So we only find each target once, # save protein.geneid => protein.id(s) notfnd = set() ct = 0 skip_ct = 0 kd_ct = 0 dba_err_ct = 0 for st,dist in all_pws_spls.items(): ct += 1 geneid1 = re.sub(r'^hsa:', '', st[0]) geneid2 = re.sub(r'^hsa:', '', st[1]) if geneid1 in gid2pids: pids1 = gid2pids[geneid1] elif geneid1 in notfnd: skip_ct += 1 continue else: targets = dba.find_targets({'geneid': geneid1}) if not targets: skip_ct += 1 notfnd.add(geneid1) # add to notfnd so we don't try looking it up again logger.warn("No target found for KEGG Gene ID {}".format(geneid1)) continue pids1 = [] for t in targets: pid = t['components']['protein'][0]['id'] pids1.append(pid) gid2pids[geneid1].append(pid) if geneid2 in gid2pids: pids2 = gid2pids[geneid2] elif geneid2 in notfnd: skip_ct += 1 continue else: targets = dba.find_targets({'geneid': geneid2}) if not targets: skip_ct += 1 notfnd.add(geneid2) # add to notfnd so we don't try looking it up again logger.warn("No target found for KEGG Gene ID {}".format(geneid2)) continue pids2 = [] for t in targets: pid = t['components']['protein'][0]['id'] pids2.append(pid) gid2pids[geneid2].append(pid) for pid1 in pids1: for pid2 in pids2: rv = dba.ins_kegg_distance({'pid1': pid1, 'pid2': pid2, 'distance': dist}) if rv: kd_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() print "{} KEGG Distances processed.".format(ct) print " Inserted {} new kegg_distance rows".format(kd_ct) if skip_ct > 0: print " {} KEGG IDs not found in TCRD - Skipped {} rows. See logfile {} for details.".format(len(notfnd), skip_ct, logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
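# Minimal stand-alone sketch of the KEGG distance calculation above: build one directed graph per
# pathway, take all-pairs shortest path lengths with networkx, and keep the minimum distance seen
# for each ordered (source, target) pair across pathways. kg.kgml_file_to_digraph() is the
# project's own KGML parser; the toy graph below only illustrates the merge step.
import networkx as nx

def merge_shortest_paths(digraphs):
    """Return {(source, target): min distance over all graphs}, skipping zero-length self pairs."""
    best = {}
    for dig in digraphs:
        for source, dists in dict(nx.all_pairs_shortest_path_length(dig)).items():
            for target, d in dists.items():
                if source == target:
                    continue
                key = (source, target)
                if key not in best or d < best[key]:
                    best[key] = d
    return best

# merge_shortest_paths([nx.DiGraph([('hsa:1', 'hsa:2'), ('hsa:2', 'hsa:3')])])
# -> {('hsa:1', 'hsa:2'): 1, ('hsa:1', 'hsa:3'): 2, ('hsa:2', 'hsa:3'): 1}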
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) # DBAdaptor uses same logger as main() dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'EBI Patent Counts', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.surechembl.org/search/', 'comments': 'Patents from SureChEMBL were tagged using the JensenLab tagger.' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'patent_count' }, { 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'EBI Total Patent Count'" }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] patent_cts = {} notfnd = set() pc_ct = 0 dba_err_ct = 0 fname = DOWNLOAD_DIR + FILENAME line_ct = slmf.wcl(fname) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fname) with open(fname, 'rU') as csvfile: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() csvreader = csv.reader(csvfile) header = csvreader.next() # skip header line ct = 0 for row in csvreader: ct += 1 up = row[0] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets_by_alias({ 'type': 'UniProt', 'value': up }) if not targets: notfnd.add(up) continue pid = targets[0]['components']['protein'][0]['id'] rv = dba.ins_patent_count({ 'protein_id': pid, 'year': row[2], 'count': row[3] }) if rv: pc_ct += 1 else: dba_err_ct += 1 if pid in patent_cts: patent_cts[pid] += int(row[3]) else: patent_cts[pid] = int(row[3]) pbar.update(ct) pbar.finish() for up in notfnd: logger.warn("No target found for {}".format(up)) print "{} lines processed.".format(ct) print "Inserted {} new patent_count rows for {} proteins".format( pc_ct, len(patent_cts)) if notfnd: print "No target found for {} UniProts. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) if not args['--quiet']: print "\nLoading {} Patent Count tdl_infos".format(len(patent_cts)) ct = 0 ti_ct = 0 dba_err_ct = 0 for pid, count in patent_cts.items(): ct += 1 rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'EBI Total Patent Count', 'integer_value': count }) if rv: ti_ct += 1 else: dba_err_ct += 1 print " {} processed".format(ct) print " Inserted {} new EBI Total Patent Count tdl_info rows".format( ti_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile)
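# Sketch of the two-stage EBI patent load above: per-year patent_count rows are inserted as they
# are read, while a running total per protein is accumulated and written afterwards as a single
# 'EBI Total Patent Count' tdl_info. Minimal illustration of the tally (hypothetical helper):
def tally_patent_counts(rows):
    """rows yields (protein_id, year, count); returns {protein_id: total_count}."""
    totals = {}
    for pid, _year, count in rows:
        totals[pid] = totals.get(pid, 0) + int(count)
    return totals

# tally_patent_counts([(1, 2016, '3'), (1, 2017, '5'), (2, 2017, '1')]) -> {1: 8, 2: 1}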
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'LINCS', 'source': "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.", 'app': PROGRAM, 'app_version': __version__, 'url': 'http://lincsproject.org/LINCS/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'}) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) line_ct = slmf.wcl(INPUT_FILE) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 gid2pids = {} notfnd = set() dba_err_ct = 0 pmark = {} lincs_ct = 0 with open(INPUT_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') for row in tsvreader: # 0: level5_lm.pr_gene_id # 1: level5_lm.zscore # 2: perturbagen.dc_id # 3: perturbagen.canonical_smiles # 4: signature.cell_id ct += 1 gid = row[0] if gid in gid2pids: # we've already found it pids = gid2pids[gid] elif gid in notfnd: # we've already not found it continue else: # look it up targets = dba.find_targets({'geneid': gid}, False) if not targets: notfnd.add(gid) continue pids = [] for t in targets: pid = t['components']['protein'][0]['id'] pids.append(pid) gid2pids[ gid] = pids # save this mapping so we only lookup each target once for pid in pids: rv = dba.ins_lincs({ 'protein_id': pid, 'cellid': row[4], 'zscore': row[1], 'pert_dcid': row[2], 'pert_smiles': row[3] }) if not rv: dba_err_ct += 1 continue pmark[pid] = True lincs_ct += 1 pbar.update(ct) pbar.finish() for gid in notfnd: logger.warn("No target found for {}".format(gid)) print "{} lines processed.".format(ct) print "Loaded {} new lincs rows for {} proteins.".format( lincs_ct, len(pmark)) if notfnd: print "No target found for {} geneids. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  #omim2doid = pickle.load( open(OMIM2DOID_PFILE, 'r') )
  #mesh2doid = pickle.load( open(MESH2DOID_PFILE, 'r') )
  conn = conn_tcrd({})
  mesh2doid = get_db2do_map(conn, 'MESH')
  omim2doid = get_db2do_map(conn, 'OMIM')

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'CTD Disease Associations', 'source': 'File %s from %s.'%(INPUT_FILE, BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://ctdbase.org/', 'comments': "Only disease associations with direct evidence are loaded into TCRD."} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'CTD'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  infile = (DOWNLOAD_DIR + INPUT_FILE).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    k2pids = {}
    pmark = {}
    notfnd = set()
    skip_ct = 0
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      # 0: GeneSymbol
      # 1: GeneID
      # 2: DiseaseName
      # 3: DiseaseID (MeSH or OMIM identifier)
      # 4: DirectEvidence ('|'-delimited list)
      # 5: InferenceChemicalName
      # 6: InferenceScore
      # 7: OmimIDs ('|'-delimited list)
      # 8: PubMedIDs ('|'-delimited list)
      ct += 1
      if row[0].startswith('#'):
        continue
      if not row[4]:
        # only load associations with direct evidence
        skip_ct += 1
        continue
      sym = row[0]
      geneid = row[1]
      k = "%s|%s"%(sym, geneid)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'sym': sym})
        if not targets:
          targets = dba.find_targets({'geneid': geneid})
        if not targets:
          notfnd.add(geneid)
          logger.warn("No target found for {}".format(k))
          continue
        pids = []
        for t in targets:
          p = t['components']['protein'][0]
          pmark[p['id']] = True
          pids.append(p['id'])
        k2pids[k] = pids # save this mapping so we only lookup each target once
      # Try to map MeSH and OMIM IDs to DOIDs
      if row[3].startswith('MESH:'):
        mesh = row[3].replace('MESH:', '')
        if mesh in mesh2doid:
          dids = mesh2doid[mesh]
        else:
          dids = [row[3]]
      elif row[3].startswith('OMIM:'):
        omim = row[3].replace('OMIM:', '')
        if omim in omim2doid:
          dids = omim2doid[omim]
        else:
          dids = [row[3]]
      else:
        dids = [row[3]]
      for pid in pids:
        for did in dids:
          rv = dba.ins_disease( {'protein_id': pid, 'dtype': 'CTD', 'name': row[2], 'did': did, 'evidence': row[4]} )
          if not rv:
            dba_err_ct += 1
            continue
          dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark))
  if skip_ct > 0:
    print "Skipped {} rows with no direct evidence.".format(skip_ct)
  if notfnd:
    print "No target found for {} symbols/geneids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Expression Atlas', 'source': 'IDG-KMC generated data at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gxa/', 'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'Expression Atlas'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INPUT_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  with open(INPUT_FILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 0
    k2pids = {}
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      # 0: "Gene ID"
      # 1: "DOID"
      # 2: "Gene Name"
      # 3: "log2foldchange"
      # 4: "p-value"
      # 5: "disease"
      # 6: "experiment_id"
      # 7: "contrast_id"
      ct += 1
      sym = row[2]
      ensg = row[0]
      k = "%s|%s"%(sym, ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'sym': sym}, idg=False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'ENSG', 'value': ensg})
        if not targets:
          notfnd.add(k)
          logger.warn("No target found for {}".format(k))
          continue
        pids = []
        for t in targets:
          p = t['components']['protein'][0]
          pmark[p['id']] = True
          pids.append(p['id'])
        k2pids[k] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_disease( {'protein_id': pid, 'dtype': 'Expression Atlas', 'name': row[5], 'did': row[1], 'log2foldchange': "%.3f"%float(row[3]), 'pvalue': row[4]} )
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} symbols/ensgs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database %s (schema ver %s; data ver %s)"%(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'IDG Eligible Targets List', 'source': 'IDG generated data in file %s.'%IDG_LIST_FILE, 'app': PROGRAM, 'app_version': __version__, 'comments': 'IDG Flags and Families set from list of targets on GitHub.', 'url': 'https://github.com/druggablegenome/IDGTargets'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'idg', 'where_clause': 'column_name == "idg"'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'fam', 'where_clause': 'idg == 1'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'famext', 'where_clause': 'idg == 1'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(IDG_LIST_FILE)
  print '\nProcessing {} lines in list file {}'.format(line_ct, IDG_LIST_FILE)
  logger.info("Processing {} lines in list file {}".format(line_ct, IDG_LIST_FILE))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  notfnd = []
  multfnd = []
  ct = 0
  idg_ct = 0
  fam_ct = 0
  famext_ct = 0
  dba_err_ct = 0
  with open(IDG_LIST_FILE, 'rU') as ifh:
    csvreader = csv.reader(ifh)
    #header = csvreader.next() # skip header line
    #ct += 1
    for row in csvreader:
      ct += 1
      sym = row[0]
      fam = row[2]
      targets = dba.find_targets({'sym': sym}, idg=False, include_annotations=False)
      if not targets:
        notfnd.append(sym)
        continue
      if len(targets) > 1:
        multfnd.append(sym)
      for t in targets:
        rv = dba.upd_target(t['id'], 'idg', 1)
        if rv:
          idg_ct += 1
        else:
          dba_err_ct += 1
        rv = dba.upd_target(t['id'], 'fam', fam)
        if rv:
          fam_ct += 1
        else:
          dba_err_ct += 1
        if row[3]:
          famext = row[3]
          rv = dba.upd_target(t['id'], 'famext', famext)
          if rv:
            famext_ct += 1
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "{} targets updated with IDG flags".format(idg_ct)
  print "{} targets updated with fams".format(fam_ct)
  print " {} targets updated with famexts".format(famext_ct)
  if notfnd:
    print "No target found for {} symbols: {}".format(len(notfnd), ", ".join(notfnd))
  if multfnd:
    print "Multiple targets found for {} symbols: {}".format(len(multfnd), ", ".join(multfnd))
  if dba_err_ct > 0:
    print "WARNING: {} database errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'OMIM', 'source': 'Files %s downloaded from omim.org'%", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://omim.org/', 'comments': 'Confirmed OMIM phenotypes and OMIM Phenotype Series info'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'omim'},
            {'dataset_id': dataset_id, 'table_name': 'omim_ps'},
            {'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'OMIM'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  # OMIMs and Phenotypic Series
  fname = DOWNLOAD_DIR + TITLES_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s'%(line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    omim_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Prefix ???
      # 1: Mim Number
      # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
      # 3: Included Title(s); symbols
      title = row[2].partition(';')[0]
      rv = dba.ins_omim({'mim': row[1], 'title': title})
      if not rv:
        dba_err_ct += 1
        continue
      omim_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim rows".format(omim_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fname = DOWNLOAD_DIR + PS_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s'%(line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    ps_ct = 0
    err_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Phenotypic Series Number
      # 1: Mim Number
      # 2: Phenotype
      if len(row) == 2:
        init = {'omim_ps_id': row[0], 'title': row[1]}
      elif len(row) == 3:
        init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
      else:
        err_ct += 1
        logger.warn("Parsing error for row {}".format(row))
        continue
      rv = dba.ins_omim_ps(init)
      if not rv:
        dba_err_ct += 1
        continue
      ps_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim_ps rows".format(ps_ct)
  if err_ct > 0:
    print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Phenotypes
  fname = DOWNLOAD_DIR + GENEMAP_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s'%(line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    tmark = {}
    skip_ct = 0
    notfnd_ct = 0
    prov_ct = 0
    dds_ct = 0
    pt_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0 - Sort ???
      # 1 - Month
      # 2 - Day
      # 3 - Year
      # 4 - Cytogenetic location
      # 5 - Gene Symbol(s)
      # 6 - Confidence
      # 7 - Gene Name
      # 8 - MIM Number
      # 9 - Mapping Method
      # 10 - Comments
      # 11 - Phenotypes
      # 12 - Mouse Gene Symbol
      pts = row[11]
      if pts.startswith('?'):
        prov_ct += 1
        continue
      if '(4)' in pts:
        dds_ct += 1
      trait = "MIM Number: %s"%row[8]
      if row[11]:
        trait += "; Phenotype: %s"%pts
      found = False
      syms = row[5].split(', ')
      logger.info("Checking for OMIM syms: {}".format(syms))
      for sym in syms:
        targets = dba.find_targets({'sym': sym})
        if targets:
          found = True
          for t in targets:
            p = t['components']['protein'][0]
            logger.info("  Symbol {} found target {}: {}, {}".format(sym, t['id'], p['name'], p['description']))
            rv = dba.ins_phenotype({'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait})
            if not rv:
              dba_err_ct += 1
              continue
            tmark[t['id']] = True
            pt_ct += 1
      if not found:
        notfnd_ct += 1
        logger.warn("No target found for row {}".format(row))
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print " Skipped {} provisional phenotype rows.".format(prov_ct)
  print " Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
  print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark))
  if notfnd_ct > 0:
    print "No target found for {} good lines. See logfile {} for details.".format(notfnd_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'DisGeNET Disease Associations', 'source': 'File %s from %s.'%(INPUT_FILE, BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.disgenet.org/web/DisGeNET/menu'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'DisGeNET'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  infile = (DOWNLOAD_DIR + INPUT_FILE).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  with open(infile, 'rU') as f:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    k2pids = {}
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for line in f:
      # 0: geneId
      # 1: geneSymbol
      # 2: DSI
      # 3: DPI
      # 4: diseaseId
      # 5: diseaseName
      # 6: diseaseType
      # 7: diseaseClass
      # 8: diseaseSemanticType
      # 9: score
      # 10: EI
      # 11: YearInitial
      # 12: YearFinal
      # 13: NofPmids
      # 14: NofSnps
      # 15: source
      ct += 1
      if line.startswith('#'):
        continue
      if line.startswith('geneId'):
        # header row
        continue
      data = line.split('\t')
      geneid = data[0].strip()
      sym = data[1]
      k = "%s|%s"%(sym, geneid)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'sym': sym})
        if not targets:
          targets = dba.find_targets({'geneid': geneid})
        if not targets:
          notfnd.add(k)
          logger.warn("No target found for {}".format(k))
          continue
        pids = []
        for t in targets:
          p = t['components']['protein'][0]
          pmark[p['id']] = True
          pids.append(p['id'])
        k2pids[k] = pids # save this mapping so we only lookup each target once
      pmid_ct = data[13].strip()
      snp_ct = data[14].strip()
      if pmid_ct != '0':
        if snp_ct != '0':
          ev = "%s PubMed IDs; %s SNPs"%(pmid_ct, snp_ct)
        else:
          ev = "%s PubMed IDs"%pmid_ct
      else:
        ev = "%s SNPs"%snp_ct
      for pid in pids:
        rv = dba.ins_disease( {'protein_id': pid, 'dtype': 'DisGeNET', 'name': data[5], 'did': data[4], 'score': data[9], 'source': data[15].strip(), 'evidence': ev} )
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} symbols/geneids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
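# All of the loaders above expose the same load(args) entry point and read the same
# handful of command-line options. A minimal driver sketch follows, assuming
# docopt-style argument parsing; the usage text, file name, and defaults here are
# illustrative assumptions, not taken from any specific loader script.
from docopt import docopt

USAGE = """
Usage:
  loader.py [--dbhost=<host>] [--dbname=<name>] [--logfile=<file>]
            [--loglevel=<n>] [--debug] [--quiet]
"""

if __name__ == '__main__':
  args = docopt(USAGE)
  # docopt leaves unset value options as None; fall back to assumed defaults
  args['--dbhost'] = args['--dbhost'] or 'localhost'
  args['--dbname'] = args['--dbname'] or 'tcrd'
  args['--loglevel'] = args['--loglevel'] or '30'  # logging.WARNING
  load(args)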