def load_tinx(curs):
  chunk_size = 50000
  delim = '\t'
  print('\nLoading tinx tables...')
  for table in ['tinx_novelty', 'tinx_disease', 'tinx_importance', 'tinx_articlerank']:
    print(f" Loading {table}: ", end='')
    fn = INFILES[table]
    st = time.time()
    first_chunk = True
    row_ct = 0
    for values in slmf.file_chunker(fn, chunk_size, delim):
      if first_chunk:
        values.pop(0) # get rid of the header
        first_chunk = False
      row_ct += len(values)
      curs.executemany(INS_SQL[table], [tuple(vals) for vals in values])
    ets = slmf.secs2str(time.time() - st)
    print(f"OK - ({row_ct} rows). Elapsed time: {ets}")
  print("Done.")
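# Note: slmf is the project's shared utility module; its helpers are not defined in
# this file. Minimal sketches of the two used above, with behavior inferred from the
# call sites (the real implementations may differ):
def file_chunker(fn, chunk_size, delim):
  # Yield lists of up to chunk_size rows, each row split on delim, so callers can
  # bulk-insert with cursor.executemany() without holding the whole file in memory.
  chunk = []
  with open(fn, 'r') as ifh:
    for line in ifh:
      chunk.append(line.rstrip('\n').split(delim))
      if len(chunk) == chunk_size:
        yield chunk
        chunk = []
  if chunk:
    yield chunk

def secs2str(t):
  # Format a duration in seconds as H:MM:SS.mmm.
  return "%d:%02d:%02d.%03d" % (t // 3600, (t // 60) % 60, t % 60, (t % 1) * 1000)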
def load_pubmed(curs, logger, logfile):
  st = time.time()
  fn = INFILES['pubmed']
  line_ct = slmf.wcl(fn)
  print(f'\nLoading TIN-X pubmeds from {fn}...')
  ct = 0
  pm_ct = 0
  dup_ct = 0
  err_ct = 0
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0: # skip header
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      try:
        curs.execute(INS_SQL['pubmed'], tuple(row))
        pm_ct += 1
      except Error as e:
        if f"Duplicate entry '{row[0]}'" in e.msg:
          # this should not happen under "production" runs, but it's here for testing/debugging
          dup_ct += 1
          continue
        else:
          err_ct += 1
          logger.error(f"``{e}`` for line {ct}. Data: {row}")
          continue
  ets = slmf.secs2str(time.time() - st)
  print(f"\n Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
  if err_ct:
    print(f" WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
  if dup_ct:
    print(f" Skipped {dup_ct} existing pubmeds.")
  print("Done.")
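# Two more assumed slmf helpers used above, sketched from their call sites: wcl
# behaves like `wc -l`, and update_progress redraws a one-line console progress bar.
import sys

def wcl(fname):
  # Count lines in a file (the name mirrors the `wc -l` command).
  with open(fname) as f:
    return sum(1 for _ in f)

def update_progress(progress):
  # Render a progress bar for a fraction in [0, 1], overwriting the current line.
  bar_len = 50
  filled = int(round(bar_len * progress))
  sys.stdout.write("\rProgress: [{}] {:.1f}%".format('#' * filled + '-' * (bar_len - filled), progress * 100))
  sys.stdout.flush()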
def tinx_pubmed(args, dba, tinx_pmids, logger):
  st = time.time()
  tcrd_pmids = set(dba.get_pmids())
  new_pmids = tinx_pmids - tcrd_pmids
  new_pmids = [str(i) for i in new_pmids]
  new_pmid_ct = len(new_pmids)
  if not args['--quiet']:
    print(f"Fetching pubmed data for {new_pmid_ct} new TIN-X PMIDs")
  logger.info(f"Fetching pubmed data for {new_pmid_ct} new TIN-X PMIDs")
  ct = 0
  net_err_ct = 0
  chunk_ct = 0
  fn = f"{TINX_OUTDIR}TINX_Pubmed.tsv"
  with open(fn, 'w') as ofh:
    ofh.write("PubMedID\tTitle\tJournal\tDate\tAuthors\tAbstract\n")
    ct += 1
    for chunk in slmf.chunker(new_pmids, 200):
      chunk_ct += 1
      logger.info(f"Processing PMID chunk {chunk_ct}")
      pmas = tpm.fetch_pubmeds(chunk)
      if not pmas:
        logger.error("Bad E-Utils response for PMID chunk {}: {}".format(chunk_ct, ','.join(chunk)))
        net_err_ct += 1
        continue
      for pma in pmas:
        pmid, title, journal, date, authors, abstract = tpm.parse_pubmed_article(pma)
        if abstract:
          ofh.write(f"{pmid}\t{title}\t{journal}\t{date}\t{authors}\t{abstract}\n")
        else:
          ofh.write(f"{pmid}\t{title}\t{journal}\t{date}\t{authors}\t''\n")
        ct += 1
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"{ct} lines written to file {fn}. Elapsed time: {ets}")
  if net_err_ct > 0:
    print(f"WARNING: {net_err_ct} Network/E-Utils errors occurred.")
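# slmf.chunker slices a list into fixed-size batches; batches of 200 PMIDs keep each
# E-Utils request to a manageable size. A minimal sketch of the assumed helper:
def chunker(seq, size):
  # Yield successive size-length slices of seq; the final slice may be shorter.
  return (seq[pos:pos + size] for pos in range(0, len(seq), size))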
def tinx(args, dba, do, logger, logfile):
  tinx = TINX({'TINX_PROTEIN_FILE': JL_DOWNLOAD_DIR+TINX_PROTEIN_FILE,
               'TINX_DISEASE_FILE': JL_DOWNLOAD_DIR+TINX_DISEASE_FILE,
               'logfile': logfile, 'OUTDIR': TINX_OUTDIR}, dba, do)
  st = time.time()
  (ct1, ct2) = tinx.parse_protein_mentions()
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Protein mappings: {ct1} protein to PMIDs; {ct2} PMID to protein counts. Elapsed time: {ets}")
  st = time.time()
  (ct1, ct2) = tinx.parse_disease_mentions()
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Disease mappings: {ct1} disease to PMIDs; {ct2} PMID to disease counts. Elapsed time: {ets}")
  st = time.time()
  (ct, fn) = tinx.compute_protein_novelty()
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Wrote {ct} lines to file {fn}. Elapsed time: {ets}")
  st = time.time()
  (ct, fn) = tinx.compute_disease_novelty()
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Wrote {ct} lines to file {fn}. Elapsed time: {ets}")
  st = time.time()
  (ct, fn) = tinx.compute_importances()
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Wrote {ct} lines to file {fn}. Elapsed time: {ets}")
  st = time.time()
  (ct, tinx_pmids, fn) = tinx.compute_pubmed_rankings()
  tinx_pmid_ct = len(tinx_pmids)
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"Wrote {ct} lines ({tinx_pmid_ct} total TIN-X PMIDs) to file {fn}. Elapsed time: {ets}")
  return tinx_pmids
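# Every step in tinx() repeats the same time-then-report stanza. A hypothetical
# helper (not in the original code) that would collapse each stanza to one call:
def timed_step(func, quiet, msg_fmt):
  # Run func(), time it, and print msg_fmt filled with its results plus {ets}.
  st = time.time()
  result = func()
  ets = slmf.secs2str(time.time() - st)
  if not quiet:
    vals = result if isinstance(result, tuple) else (result,)
    print(msg_fmt.format(*vals, ets=ets))
  return result
# e.g.: (ct, fn) = timed_step(tinx.compute_importances, args['--quiet'],
#                             "Wrote {} lines to file {}. Elapsed time: {ets}")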
                  'table_name': 'alias',
                  'where_clause': f"dataset_id = {dataset_id}"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  load_human(args, dba, dataset_id, eco_map, logger, logfile)

  # Mouse and Rat proteins
  # Dataset and Provenance
  # As for human, we need the dataset id for xrefs and aliases
  dataset_id = dba.ins_dataset({'name': 'UniProt Mouse and Rat Proteins',
                                'source': f"Mouse and Rat from UniProt XML file {UP_RODENT_FILE} from {UP_BASE_URL}",
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.uniprot.org'})
  assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'nhprotein'})
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  load_mouse_rat(args, dba, dataset_id, logger, logfile)

  elapsed = time.time() - start_time
  print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  for cfgd in CONFIG:
    name = cfgd['name']
    #download(name)
    parsed_ont = cfgd['parse_function'](cfgd['DOWNLOAD_DIR']+cfgd['FILENAME'])
    cfgd['load_function'](dba, logger, logfile, parsed_ont, cfgd)

  elapsed = time.time() - start_time
  print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
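# The loop above requires only that each CONFIG entry carry 'name', 'DOWNLOAD_DIR',
# 'FILENAME', and a parse/load function pair (keys inferred from the loop's accesses).
# A hypothetical entry showing the expected shape; the name, paths, and lambda
# stand-ins are illustrative only, not part of the real CONFIG:
EXAMPLE_CFGD = {
  'name': 'Example Ontology',
  'DOWNLOAD_DIR': '../data/ExampleOntology/',
  'FILENAME': 'example.obo',
  'parse_function': lambda path: open(path).read(),               # real entries use an ontology parser
  'load_function': lambda dba, logger, logfile, ont, cfgd: None,  # real entries insert into TCRD
}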
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  st = time.time()
  cnx = None
  try:
    cnx = mysql.connector.connect(host=args['--dbhost'], database=args['--dbname'],
                                  user='******', password=slmf.get_pw('/home/smathias/.dbirc'),
                                  autocommit=True)
    if cnx.is_connected():
      if not args['--quiet']:
        print("Connected to TCRD database {}".format(args['--dbname']))
      curs = cnx.cursor()
      del_dataset(curs)
      drop_tables(curs)
      create_tables(curs)
      load_tinx(curs)
      load_pubmed(curs, logger, logfile)
      load_dataset(curs)
      curs.close()
  except Error as e:
    print(f"ERROR: {e}")
  finally:
    if cnx and cnx.is_connected():
      cnx.commit()
      cnx.close()
  ets = slmf.secs2str(time.time() - st)
  print(f"\n{PROGRAM}: Done. Total time: {ets}\n")
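# slmf.get_pw keeps the database password out of the source by reading it from a
# dotfile. A minimal sketch, assuming the file holds just the password on its first
# line (the real .dbirc format may differ):
def get_pw(pwfile):
  # Return the first line of pwfile, stripped of surrounding whitespace.
  with open(pwfile, 'r') as f:
    return f.readline().strip()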