def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Expression Atlas', 'source': 'IDG-KMC generated data at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gxa/', 'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'Expression Atlas'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INPUT_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  with open(INPUT_FILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 0
    k2pids = {}
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      # 0: "Gene ID"
      # 1: "DOID"
      # 2: "Gene Name"
      # 3: "log2foldchange"
      # 4: "p-value"
      # 5: "disease"
      # 6: "experiment_id"
      # 7: "contrast_id"
      ct += 1
      sym = row[2]
      ensg = row[0]
      k = "%s|%s" % (sym, ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'sym': sym}, idg=False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'ENSG', 'value': ensg})
        if not targets:
          notfnd.add(k)
          logger.warn("No target found for {}".format(k))
          continue
        pids = []
        for t in targets:
          p = t['components']['protein'][0]
          pmark[p['id']] = True
          pids.append(p['id'])
        k2pids[k] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_disease({'protein_id': pid, 'dtype': 'Expression Atlas', 'name': row[5], 'did': row[1], 'log2foldchange': "%.3f" % float(row[3]), 'pvalue': row[4]})
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} symbols/ensgs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
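
# --- Illustrative sketch (not part of the loaders) ---
# Every loader in this file assumes a docopt-style `args` dict and the
# slmf.wcl() line-count helper from the shared utility module. The shapes
# below are assumptions for illustration only; the real slmf module and the
# CLI option definitions live elsewhere in the repo.
def wcl_sketch(fname):
  # count the lines in a file, as slmf.wcl() is assumed to do
  with open(fname) as f:
    return sum(1 for _ in f)

# hypothetical example of the docopt-style args dict consumed by load():
# args = {'--dbhost': 'localhost', '--dbname': 'tcrd', '--loglevel': '20',
#         '--logfile': None, '--debug': False, '--quiet': False}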
def tinx(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {} # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {} # PMID => count of diseases mentioned in a given paper
  pmid_protein_ct = {} # PMID => count of proteins mentioned in a given paper

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print "\nParsing Disease Ontology file {}".format(dofile)
  do_parser = obo.Parser(open(dofile))
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print "  Got {} Disease Ontology terms".format(len(do))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not targets:
        notfnd.add(ensp)
        continue
      for t in targets:
        p = t['components']['protein'][0]
        k = "%s,%s" % (p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  pbar.finish()
  for ensp in notfnd:
    logger.warn("No target found for {}".format(ensp))
  print "{} lines processed.".format(ct)
  print "  Skipped {} non-ENSP lines".format(skip_ct)
  print "  Saved {} protein to PMIDs mappings".format(len(pid2pmids))
  print "  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct))
  if notfnd:
    print "  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile)

  fn = JL_DOWNLOAD_DIR + DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn("%s not found in DO" % doid)
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Skipped {} non-DOID lines".format(skip_ct)
  print "  Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
  print "  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct))
  if notfnd:
    print "WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile)

  if not args['--quiet']:
    print "\nComputing protein novelty scores"
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write("%s,%.8f\n" % (k, novelty))
  print "  Wrote {} novelty scores to file {}".format(ct, PROTEIN_NOVELTY_FILE)

  if not args['--quiet']:
    print "\nComputing disease novelty scores"
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write("%s,%.8f\n" % (doid, novelty))
  print "  Wrote {} novelty scores to file {}".format(ct, DISEASE_NOVELTY_FILE)

  if not args['--quiet']:
    print "\nComputing importance scores"
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'wb') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
        if fdt_score_sum > 0:
          ct += 1
          impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
  print "  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

  if not args['--quiet']:
    print "\nComputing PubMed rankings"
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'wb') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        # scores are tuples of (PMID, protein_mentions*disease_mentions)
        scores = []
        for pmid in pd_pmids:
          scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
        if len(scores) > 0:
          scores.sort(cmp_pmids_scores)
          for i, t in enumerate(scores):
            ct += 1
            pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
  print "  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
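
# --- Illustrative sketch (not part of tinx) ---
# tinx() sorts its (PMID, score) tuples with a cmp-style comparator named
# cmp_pmids_scores, which is defined elsewhere in this module. A minimal
# Python 2 comparator implementing the ranking rule described in the comments
# above (ascending score; ties broken by larger, i.e. presumably newer, PMID
# first) might look like this; treat it as an assumption, not the original.
def cmp_pmids_scores_sketch(a, b):
  # a and b are (pmid, score) tuples
  if a[1] == b[1]:
    return cmp(b[0], a[0])  # equal scores: larger (newer) PMID ranks first
  return cmp(a[1], b[1])    # otherwise ascending by score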
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as described in Yanai, I. et al., Bioinformatics 21(5): 650-659 (2005)'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as described in Yanai, I. et al., Bioinformatics 21(5): 650-659 (2005)'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  #
  # Protein Level Expressions
  #
  line_ct = slmf.wcl(PROTEIN_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  rs2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(PROTEIN_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      if rs in rs2pids:
        # we've already found it
        pids = rs2pids[rs]
      elif rs in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False)
        if not targets:
          notfnd.add(rs)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        rs2pids[rs] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4]}
      else:
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids))
  if notfnd:
    print "No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(PROTEIN_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(PROTEIN_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      tau = row[1]
      if rs not in rs2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index', 'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Gene Level Expressions
  #
  line_ct = slmf.wcl(GENE_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  sym2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GENE_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      if sym in sym2pids:
        pids = sym2pids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(sym)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        sym2pids[sym] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4]}
      else:
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids))
  if notfnd:
    print "  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(GENE_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(GENE_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      tau = row[1]
      if sym not in sym2pids:
        skip_ct += 1
        continue
      for pid in sym2pids[sym]: # use the symbol-to-protein map built above (original code used rs2pids[rs], a bug)
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index', 'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with symbols not in map from expression file".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
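
# --- Illustrative sketch (not part of the HPM loader) ---
# The HPM (and GTEx) Tissue Specificity Index values loaded above are Tau
# scores as described in Yanai et al., Bioinformatics 21(5): 650-659 (2005).
# The loader only reads pre-computed values from the input files; a minimal
# illustration of the formula, Tau = sum(1 - x_i/x_max) / (N - 1), over one
# gene's per-tissue expression values is:
def tau_sketch(expr_values):
  # expr_values: list of per-tissue expression values for one gene/protein
  mx = float(max(expr_values))
  if mx == 0:
    return 0.0
  n = len(expr_values)
  return sum(1.0 - (x / mx) for x in expr_values) / (n - 1)

# e.g. tau_sketch([0.0, 0.0, 8.0]) -> 1.0 (tissue-specific),
#      tau_sketch([5.0, 5.0, 5.0]) -> 0.0 (ubiquitously expressed)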
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'JensenLab PubMed Text-mining Scores', 'source': 'File %s' % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': BASE_URL} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'pmscore'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'JensenLab PubMed Score'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ensp2pids = {}
  pmscores = {} # protein.id => sum(all scores)
  pms_ct = 0
  upd_ct = 0
  notfnd = {}
  dba_err_ct = 0
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      # ENSP  year  score
      ct += 1
      pbar.update(ct)
      if not row[0].startswith('ENSP'):
        continue
      ensp = row[0]
      if ensp in ensp2pids:
        # we've already found it
        pids = ensp2pids[ensp]
      elif ensp in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'stringid': ensp})
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'STRING', 'value': '9606.' + ensp})
        if not targets:
          notfnd[ensp] = True
          logger.warn("No target found for {}".format(ensp))
          continue
        pids = []
        for target in targets:
          pids.append(target['components']['protein'][0]['id'])
        ensp2pids[ensp] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_pmscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
        if rv:
          pms_ct += 1
        else:
          dba_err_ct += 1
        if pid in pmscores:
          pmscores[pid] += float(row[2])
        else:
          pmscores[pid] = float(row[2])
  pbar.finish()
  print "{} input lines processed.".format(ct)
  print "  Inserted {} new pmscore rows for {} targets".format(pms_ct, len(pmscores))
  if len(notfnd) > 0:
    print "No target found for {} STRING IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  print "\nLoading {} JensenLab PubMed Score tdl_infos".format(len(pmscores.keys()))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in pmscores.items():
    ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'JensenLab PubMed Score', 'number_value': score})
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print "{} processed".format(ct)
  print "  Inserted {} new JensenLab PubMed Score tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
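
# --- Illustrative note (not part of the loader) ---
# The 'JensenLab PubMed Score' tdl_info written above is simply the sum of a
# protein's yearly pmscore values. With hypothetical input rows for one ENSP:
#   ENSP00000000412  2016  1.25
#   ENSP00000000412  2017  3.50
# the loader inserts two pmscore rows and, for each protein mapped to that
# ENSP, one tdl_info row with number_value = 4.75 (1.25 + 3.50).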
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'PANTHER protein classes', 'source': 'File %s from ftp://ftp.pantherdb.org//sequence_classifications/current_release/PANTHER_Sequence_Classification_files/, and files %s and %s from http://data.pantherdb.org/PANTHER14.1/ontology/' % (os.path.basename(P2PC_FILE), os.path.basename(CLASS_FILE), os.path.basename(RELN_FILE)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.pantherdb.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'panther_class'},
            {'dataset_id': dataset_id, 'table_name': 'p2pc'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  relns = {}
  line_ct = slmf.wcl(RELN_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in relationships file {}".format(line_ct, RELN_FILE)
  with open(RELN_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      ct += 1
      pcid = row[0]
      parentid = row[2]
      if pcid in relns:
        relns[pcid].append(parentid)
      else:
        relns[pcid] = [parentid]
  print "{} input lines processed.".format(ct)
  print "  Got {} PANTHER Class relationships".format(len(relns))

  pc2dbid = {}
  line_ct = slmf.wcl(CLASS_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in class file {}".format(line_ct, CLASS_FILE)
  with open(CLASS_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pc_ct = 0
    pcmark = {}
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pc = row[0]
      init = {'pcid': pc, 'name': row[2]}
      if row[3]:
        init['desc'] = row[3]
      if pc in relns:
        init['parent_pcids'] = "|".join(relns[pc])
      # there are duplicates in this file too, so only insert if we haven't
      if pc not in pcmark:
        rv = dba.ins_panther_class(init)
        if rv:
          pc_ct += 1
        else:
          dba_err_ct += 1
        pc2dbid[pc] = rv
        pcmark[pc] = True
  print "{} lines processed.".format(ct)
  print "  Inserted {} new panther_class rows".format(pc_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(P2PC_FILE)
  regex = re.compile(r'#(PC\d{5})')
  if not args['--quiet']:
    print "\nProcessing {} lines in classification file {}".format(line_ct, P2PC_FILE)
  with open(P2PC_FILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    pmark = {}
    p2pc_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      [sp, hgnc, up] = row[0].split('|')
      up = up.replace('UniProtKB=', '')
      hgnc = hgnc.replace('HGNC=', '')
      if not row[8]:
        skip_ct += 1
        continue
      #print "[DEBUG] searching by uniprot", up
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        #print "[DEBUG] searching by HGNC xref", hgnc
        targets = dba.find_targets_by_xref({'xtype': 'HGNC', 'value': hgnc})
      if not targets:
        k = "%s|%s" % (up, hgnc)
        notfnd.add(k)
        continue
      t = targets[0]
      pid = t['components']['protein'][0]['id']
      pmark[pid] = True
      #print "[DEBUG] PCs:", row[8]
      for pc in regex.findall(row[8]):
        #print "[DEBUG] ", pc
        pcid = pc2dbid[pc]
        rv = dba.ins_p2pc({'protein_id': pid, 'panther_class_id': pcid})
        if rv:
          p2pc_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  print "{} lines processed.".format(ct)
  print "  Inserted {} new p2pc rows for {} distinct proteins".format(p2pc_ct, len(pmark))
  print "  Skipped {} rows without PCs".format(skip_ct)
  if notfnd:
    print "No target found for {} UniProt/HGNCs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
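
# --- Illustrative note (not part of the loader) ---
# Column 9 (row[8]) of the PANTHER sequence classification file holds
# protein-class annotations with embedded PC accessions. With a hypothetical
# value, the regex used above pulls out just the PC IDs:
#   >>> regex = re.compile(r'#(PC\d{5})')
#   >>> regex.findall('G-protein coupled receptor#PC00021;signaling molecule#PC00207')
#   ['PC00021', 'PC00207']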
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'GTEx', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM from GTEx files.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.gtexportal.org/home/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'GTEx'", 'comment': 'Pre-processing code can be found here: https://github.com/unmtransinfo/expression-profiles'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'GTEx Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from GTEx files. The score is the Tau value as described in Yanai, I. et al., Bioinformatics 21(5): 650-659 (2005)'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  line_ct = slmf.wcl(GTEX_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in GTEx file {}".format(line_ct, GTEX_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  ensg2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GTEX_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # ENSG  SMTSD  SEX  TPM  TAU  TAU_BYSEX  TPM_RANK  TPM_RANK_BYSEX  TPM_LEVEL  TPM_LEVEL_BYSEX  TPM_F  TPM_M  log2foldchange
      ct += 1
      pbar.update(ct)
      ensg = re.sub('\.\d+$', '', row[0]) # get rid of version, if present
      if ensg in ensg2pids:
        # we've already found it
        pids = ensg2pids[ensg]
      elif ensg in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False)
        if not targets:
          notfnd.add(ensg)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        ensg2pids[ensg] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      init = {'tissue': tissue, 'gender': row[2], 'tpm': row[3], 'tpm_rank': row[6], 'tpm_rank_bysex': row[7], 'tpm_level': row[8], 'tpm_level_bysex': row[9], 'tau': row[4], 'tau_bysex': row[5]}
      if row[10]:
        init['tpm_f'] = row[10]
      if row[11]:
        init['tpm_m'] = row[11]
      if row[12]:
        init['log2foldchange'] = row[12]
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_gtex(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for ensg in notfnd:
    logger.warn("No target found for {}".format(ensg))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "Processed {} lines".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} ENSGs)".format(exp_ct, len(pmark), len(ensg2pids))
  if notfnd:
    print "  No target found for {} ENSGs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
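
# --- Illustrative note (not part of the loader) ---
# TISSUE2UBERON_FILE is read with ast.literal_eval(), so it is expected to be
# a Python dict literal mapping tissue names to Uberon IDs. A hypothetical
# two-entry file would look like:
#   {'Whole Blood': 'UBERON:0000178',
#    'Liver': 'UBERON:0002107'}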
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  exp_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Expression', 'source': 'File Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full', 'comments': 'Qualitative expression values are generated by the loading app.'} )
  assert exp_dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  cpt_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Compartments', 'source': 'File Table S6 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full'} )
  assert cpt_dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': exp_dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HCA RNA'", 'comment': 'TPM and qualitative expression values are derived from file Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1'},
            {'dataset_id': cpt_dataset_id, 'table_name': 'compartment', 'where_clause': "ctype = 'Human Cell Atlas'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if not args['--quiet']:
    print "\nCalculating expression level percentiles"
  pctiles = calc_pctiles()

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  #
  # Expressions
  #
  line_ct = slmf.wcl(RNA_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, RNA_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(RNA_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      sym = row[1]
      ensg = row[0]
      k = "%s|%s" % (sym, ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        cell_lines = [c.replace(' (TPM)', '') for c in header[2:]]
        for (i, cl) in enumerate(cell_lines):
          tpm_idx = i + 2 # add two because row has ENSG and Gene at beginning
          tpm = float(row[tpm_idx])
          qv = calc_qual_value(tpm, pctiles[cl])
          rv = dba.ins_expression( {'protein_id': pid, 'etype': 'HCA RNA', 'tissue': 'Cell Line ' + cl, 'qual_value': qv, 'number_value': tpm} )
          if not rv:
            dba_err_ct += 1
            continue
          exp_ct += 1
          pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark))
  if notfnd:
    print "  No target found for {} Symbols/ENSGs. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Compartments
  #
  line_ct = slmf.wcl(LOC_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, LOC_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  cpt_ct = 0
  with open(LOC_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      uniprot = row[2]
      sym = row[1]
      k = "%s|%s" % (uniprot, sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'uniprot': uniprot}, False)
        if not targets:
          targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        compartments = [c for c in header[3:-5]]
        for (i, c) in enumerate(compartments):
          val_idx = i + 3 # add three because row has ENSG,Gene,Uniprot at beginning
          val = int(row[val_idx])
          if val == 0:
            continue
          rel = row[-5]
          if rel == 'Uncertain':
            continue
          rv = dba.ins_compartment( {'protein_id': pid, 'ctype': 'Human Cell Atlas', 'go_id': COMPARTMENTS[c][1], 'go_term': COMPARTMENTS[c][0], 'reliability': rel} )
          if not rv:
            dba_err_ct += 1
            continue
          cpt_ct += 1
          pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new compartment rows for {} proteins.".format(cpt_ct, len(pmark))
  if notfnd:
    print "  No target found for {} UniProts/Symbols. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
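
# --- Illustrative sketch (not part of the loader) ---
# calc_pctiles() and calc_qual_value() are defined elsewhere in this module;
# the HCA loader above only relies on pctiles[cell_line] providing TPM cutoffs
# for one cell line and calc_qual_value() mapping a TPM to a qualitative
# level. A minimal sketch under those assumptions (the thresholds and labels
# here are illustrative, not the module's actual values):
def calc_qual_value_sketch(tpm, pctiles):
  (lower, upper) = pctiles # assumed (low, high) percentile cutoffs for one cell line
  if tpm == 0:
    return 'Not detected'
  elif tpm < lower:
    return 'Low'
  elif tpm <= upper:
    return 'Medium'
  else:
    return 'High'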
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'IMPC Mouse Clones', 'source': "File %s obtained directly from Terry Meehan/Alba Gomez at EBI." % os.path.basename(IMPC_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.mousephenotype.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'IMPC Clones'"},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'IMPC Status'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(IMPC_FILE)
  if not args['--quiet']:
    print "\nProcessing {} rows from input file {}".format(line_ct, IMPC_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  skip_ct = 0
  notfnd = set()
  ti1_ct = 0
  ti2_ct = 0
  dba_err_ct = 0
  with open(IMPC_FILE, 'rU') as csvfile:
    csvreader = csv.DictReader(csvfile)
    for d in csvreader:
      # Gene,MGI Accession,Public IDG,Public CMG Tier1,Public CMG Tier 2,Number of notifications,Status,# Clones,Non-Assigned Plans,Assigned plans,Aborted MIs,MIs in Progress,GLT Mice,Private
      ct += 1
      sym = d['Gene'].upper()
      targets = dba.find_targets({'sym': sym})
      if not targets:
        targets = dba.find_targets_by_xref({'xtype': 'MGI ID', 'value': d['MGI Accession']})
      if not targets:
        k = "%s,%s" % (d['Gene'], d['MGI Accession'])
        notfnd.add(k)
        continue
      if not d['Status'] and not d['# Clones']:
        skip_ct += 1
        continue
      tids = list()
      for t in targets:
        pid = t['components']['protein'][0]['id']
        if not d['Status']:
          status = '?'
        else:
          status = d['Status']
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'IMPC Status', 'string_value': status})
        if rv:
          ti1_ct += 1
        else:
          dba_err_ct += 1
        if not d['# Clones']:
          continue
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'IMPC Clones', 'string_value': d['# Clones']})
        if rv:
          ti2_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for: {}".format(k))
  if not args['--quiet']:
    print "{} rows processed.".format(ct)
    print "Inserted {} new 'IMPC Status' tdl_info rows".format(ti1_ct)
    print "Inserted {} new 'IMPC Clones' tdl_info rows".format(ti2_ct)
    print "Skipped {} rows with no relevant info".format(skip_ct)
    if notfnd:
      print "No target found for {} rows. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Transcription Factor Flags', 'source': BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://humantfs.ccbr.utoronto.ca/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Is Transcription Factor'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}
  ifn = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(ifn)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, ifn)
  with open(ifn, 'rU') as ifh:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(ifh)
    header = csvreader.next() # skip header line
    ct = 0
    ti_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      # 0: Ensembl ID
      # 1: HGNC symbol
      # 2: DBD
      # 3: Is TF?
      # 4: TF assessment
      # 5: Binding mode,Motif status
      # 6: Final Notes
      # 7: Final Comments
      # 8: Interpro ID(s)
      # 9: EntrezGene ID
      # 10: EntrezGene Description
      # 11: PDB ID
      # 12: TF tested by HT-SELEX?
      # 13: TF tested by PBM?
      # 14: Conditional Binding Requirements
      # 15: Original Comments
      # 16: Vaquerizas 2009 classification
      # 17: CisBP considers it a TF?
      # 18: TFCat classification
      # 19: Is a GO TF?
      # 20: Initial assessment
      # 21: Curator 1
      # 22: Curator 2
      # 23: TFclass considers
      ct += 1
      if row[3] != 'Yes':
        skip_ct += 1
        continue
      sym = row[1]
      targets = dba.find_targets({'sym': sym})
      if not targets:
        gid = row[9]
        if gid != 'None' and not gid.startswith('IPR'):
          targets = dba.find_targets({'geneid': gid})
      if not targets:
        ensg = row[0]
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg})
      if not targets:
        k = "%s|%s|%s" % (sym, gid, ensg)
        notfnd.add(k)
        continue
      t = targets[0]
      TDLs[t['tdl']] += 1
      pid = t['components']['protein'][0]['id']
      rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Is Transcription Factor', 'boolean_value': 1})
      if rv:
        ti_ct += 1
      else:
        dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  print "\n{} lines processed.".format(ct)
  print "  Inserted {} new 'Is Transcription Factor' tdl_infos".format(ti_ct)
  print "  Skipped {} non-TF lines".format(skip_ct)
  if notfnd:
    print "No target found for {} symbols/geneids/ENSGs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
    print "%s: %d" % (tdl, TDLs[tdl])