# Assumed module-level context (each load() below comes from a separate TCRD
# loader script): the scripts in this family import logging, os, csv, re,
# pickle, ast and slmf, plus DBAdaptor and the progressbar classes
# (ProgressBar, Percentage, Bar, ETA), and define per-script constants such as
# PROGRAM, __version__, LOGFILE, INFILE1/INFILE2, BASE_URL, FILENAME,
# PICKLE_FILE, TISSUE2UBERON_FILE and ETYPE.

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/' % (os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ct = 0
  skip_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid   uniprot_ac|uniprot_id   string_id   identity   bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} UniProt/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # string_protein_id   alias   source
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    # Look up the STRING ID by UniProt, then name, then KEGG gene ID, then HGNC ID
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid for protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
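# Illustration (not part of the loader above): the INFILE1 pass keeps, for each
# UniProt accession or name, the ENSP with the highest bit score. A minimal
# sketch of that rule as a hypothetical helper, with made-up example values:
def save_best(aliasmap, key, ensp, bitscore):
  if key not in aliasmap or bitscore > aliasmap[key][1]:
    aliasmap[key] = (ensp, bitscore)

am = {}
save_best(am, 'P04637', 'ENSP00000269305', 998.0)
save_best(am, 'P04637', 'ENSP00000003084', 120.0) # lower bit score: ignored
assert am['P04637'] == ('ENSP00000269305', 998.0)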
def load(infile, args, logger):
  # logfile is used in error messages below
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'PubChem CIDs', 'source': 'File %s%s' % (BASE_URL, FILENAME), 'app': PROGRAM, 'app_version': __version__})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'cmpd_activity', 'column_name': 'pubchem_cid', 'comment': "Loaded from UniChem file mapping ChEMBL IDs to PubChem CIDs."},
           {'dataset_id': dataset_id, 'table_name': 'drug_activity', 'column_name': 'pubchem_cid', 'comment': "Loaded from UniChem file mapping ChEMBL IDs to PubChem CIDs."}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  chembl2pc = {}
  with open(infile, 'rU') as tsv:
    tsv.readline() # skip header line
    for line in tsv:
      data = line.split('\t')
      chembl2pc[data[0]] = int(data[1])
  if not args['--quiet']:
    print "Got {} ChEMBL to PubChem mappings".format(len(chembl2pc))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  chembl_activities = dba.get_cmpd_activities(catype='ChEMBL')
  if not args['--quiet']:
    print "\nLoading PubChem CIDs for {} ChEMBL activities".format(len(chembl_activities))
  logger.info("Loading PubChem CIDs for {} ChEMBL activities".format(len(chembl_activities)))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(chembl_activities)).start()
  ct = 0
  pcid_ct = 0
  notfnd = set()
  dba_err_ct = 0
  for ca in chembl_activities:
    ct += 1
    if ca['cmpd_id_in_src'] not in chembl2pc:
      notfnd.add(ca['cmpd_id_in_src'])
      logger.warn("{} not found".format(ca['cmpd_id_in_src']))
      continue
    pccid = chembl2pc[ca['cmpd_id_in_src']]
    rv = dba.do_update({'table': 'cmpd_activity', 'id': ca['id'], 'col': 'cmpd_pubchem_cid', 'val': pccid})
    if rv:
      pcid_ct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} ChEMBL activities processed.".format(ct)
  print "  Inserted {} new PubChem CIDs".format(pcid_ct)
  if len(notfnd) > 0:
    print "  {} ChEMBL IDs not found. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  drug_activities = dba.get_drug_activities()
  if not args['--quiet']:
    print "\nLoading PubChem CIDs for {} drug activities".format(len(drug_activities))
  logger.info("Loading PubChem CIDs for {} drug activities".format(len(drug_activities)))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(drug_activities)).start()
  ct = 0
  pcid_ct = 0
  skip_ct = 0
  notfnd = set()
  dba_err_ct = 0
  for da in drug_activities:
    ct += 1
    if not da['cmpd_chemblid']:
      skip_ct += 1
      continue
    if da['cmpd_chemblid'] not in chembl2pc:
      notfnd.add(da['cmpd_chemblid'])
      logger.warn("{} not found".format(da['cmpd_chemblid']))
      continue
    pccid = chembl2pc[da['cmpd_chemblid']]
    rv = dba.do_update({'table': 'drug_activity', 'id': da['id'], 'col': 'cmpd_pubchem_cid', 'val': pccid})
    if rv:
      pcid_ct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} drug activities processed.".format(ct)
  print "  Inserted {} new PubChem CIDs".format(pcid_ct)
  print "  Skipped {} drug activities with no ChEMBL ID".format(skip_ct)
  if len(notfnd) > 0:
    print "  {} ChEMBL IDs not found. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
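# Standalone sketch of the mapping-file parse above. The two-column,
# tab-separated layout with one header row is inferred from the parsing code;
# the header text and sample rows are illustrative assumptions about the
# UniChem ChEMBL-to-PubChem file, not taken from the real file.
def parse_unichem(lines):
  chembl2pc = {}
  for line in lines[1:]: # skip header line
    chemblid, cid = line.rstrip('\n').split('\t')
    chembl2pc[chemblid] = int(cid)
  return chembl2pc

sample = ["From src:'1'\tTo src:'22'\n", "CHEMBL25\t2244\n", "CHEMBL521\t3672\n"]
assert parse_unichem(sample) == {'CHEMBL25': 2244, 'CHEMBL521': 3672}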
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'GeneRIF Years', 'source': 'PubMed records via NCBI E-Utils', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/pubmed'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'generif', 'column_name': 'years'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pubmed2date = pickle.load(open(PICKLE_FILE, 'rb'))
  if not args['--quiet']:
    print "\nGot {} PubMed date mappings from file {}".format(len(pubmed2date), PICKLE_FILE)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  generifs = dba.get_generifs()
  if not args['--quiet']:
    print "\nProcessing {} GeneRIFs".format(len(generifs))
  logger.info("Processing {} GeneRIFs".format(len(generifs)))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start()
  yrre = re.compile(r'^(\d{4})')
  ct = 0
  yr_ct = 0
  skip_ct = 0
  net_err_ct = 0 # no E-Utils calls are made below (dates come from the pickle file), so this stays zero
  dba_err_ct = 0
  for generif in generifs:
    ct += 1
    logger.debug("Processing GeneRIF: {}".format(generif))
    # GeneRIFs with multiple refs often have duplicates, so fix that
    if "|" in generif['pubmed_ids']:
      pmids = set(generif['pubmed_ids'].split("|"))
      pmids = list(pmids)
      rv = dba.do_update({'table': 'generif', 'id': generif['id'], 'col': 'pubmed_ids', 'val': "|".join(pmids)})
      if not rv:
        dba_err_ct += 1
    else:
      pmids = [generif['pubmed_ids']]
    years = list()
    for pmid in pmids:
      if pmid in pubmed2date:
        m = yrre.match(pubmed2date[pmid])
        if m:
          years.append(m.group(1))
        else:
          years.append('')
      else:
        years.append('')
    # See if we got any years...
    if any(years):
      # if so, do the update
      rv = dba.do_update({'table': 'generif', 'id': generif['id'], 'col': 'years', 'val': "|".join(years)})
      if rv:
        yr_ct += 1
      else:
        dba_err_ct += 1
    else:
      # if not, skip
      skip_ct += 1
    pbar.update(ct)
  pbar.finish()
  if not args['--quiet']:
    print "{} GeneRIFs processed.".format(ct)
    print "  Updated {} generifs with years".format(yr_ct)
    print "  Skipped {} generifs with no years.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  if net_err_ct > 0:
    print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(net_err_ct, logfile)
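# Small illustration of the year-extraction step above: values in pubmed2date
# are assumed (per the yrre regex) to be date strings beginning with a
# four-digit year. The sample strings below are made up.
import re
yrre = re.compile(r'^(\d{4})')
for datestr in ['2004 Jun 15', '1999-11-02', 'n.d.']:
  m = yrre.match(datestr)
  print m.group(1) if m else '' # -> '2004', '1999', '' (blanks keep years aligned with pmids)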
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  exp_ct = dba.get_expression_count(etype=ETYPE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  if not args['--quiet']:
    print "\nProcessing {} {} expression rows".format(exp_ct, ETYPE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=exp_ct).start()
  ct = 0
  nouid = set()
  upd_ct = 0
  dba_err_ct = 0
  for exp in dba.get_expressions(etype=ETYPE):
    ct += 1
    # Resolve a Uberon ID: first by ontology ID, then by tissue name, then via
    # the manual mapping file
    uberon_id = None
    if exp['oid']:
      uberon_id = dba.get_uberon_id({'oid': exp['oid']})
    if not uberon_id:
      uberon_id = dba.get_uberon_id({'name': exp['tissue']})
    if not uberon_id and exp['tissue'] in tiss2uid:
      uberon_id = tiss2uid[exp['tissue']]
    if not uberon_id:
      nouid.add(exp['tissue'])
      continue
    rv = dba.do_update({'table': 'expression', 'id': exp['id'], 'col': 'uberon_id', 'val': uberon_id})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} {} expression rows processed.".format(ct, ETYPE)
  print "  Updated {} with Uberon IDs".format(upd_ct)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
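# Minimal sketch of the three-step Uberon resolution used above, with the two
# DBAdaptor lookups abstracted into a caller-supplied function; the names here
# are hypothetical, not part of the loader.
def resolve_uberon(exp, tiss2uid, get_uberon_id):
  uberon_id = None
  if exp.get('oid'):
    uberon_id = get_uberon_id({'oid': exp['oid']}) # 1) by ontology ID
  if not uberon_id:
    uberon_id = get_uberon_id({'name': exp['tissue']}) # 2) by tissue name
  if not uberon_id:
    uberon_id = tiss2uid.get(exp['tissue']) # 3) manual mapping file, last resort
  return uberon_id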