Exemplo n.º 1
0
def load(args):
    """Load NCBI GI xrefs from the UniProt ID Mapping file into TCRD.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then inserts one 'NCBI GI' xref row for
    every GI listed on a row whose UniProt accession maps to a target.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost'
            and '--dbname'.

    Exits the process with status 1 when the dataset or provenance row
    cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print("\nProcessing {} rows in file {}".format(line_ct, infile))
    ct = 0
    tmark = set()  # ids of targets that received at least one lookup hit
    xref_ct = 0
    nogi_ct = 0  # rows without a GI column value
    notfnd_ct = 0  # rows whose UniProt accession matches no target
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for line in tsv:
            ct += 1
            # Update first so that skipped rows still advance the bar.
            pbar.update(ct)
            data = line.rstrip('\r\n').split('\t')
            if len(data) < 5 or not data[4]:  # no gi
                nogi_ct += 1
                continue
            targets = dba.find_targets({'uniprot': data[0]})
            if not targets:
                # Counted separately: these rows do have GIs, so they must
                # not inflate the "no GI" total.
                notfnd_ct += 1
                continue
            target = targets[0]
            tmark.add(target['id'])
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("\n{} rows processed".format(ct))
    print("  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(tmark)))
    print("  Skipped {} rows with no GI".format(nogi_ct))
    if notfnd_ct:
        print("  Skipped {} rows with no matching TCRD target".format(notfnd_ct))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 2
0
def load(args):
  """Load Reactome protein-protein interactions into the TCRD ppi table.

  Configures file logging, connects to the database, registers the dataset
  and its provenance, then inserts one 'Reactome' ppi row for each distinct
  (interactor 1, interactor 2, interaction type) combination whose two
  UniProt accessions both resolve to targets.

  Args:
    args: Mapping of command-line options. Keys read here are
      '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost' and
      '--dbname'.

  Exits the process with status 1 when the dataset or provenance row
  cannot be inserted.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] or LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                                    datefmt='%Y-%m-%d %H:%M:%S'))
  logger.addHandler(fh)

  dba = DBAdaptor({'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__})
  dbi = dba.get_dbinfo()
  connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(connected_msg)
  if not args['--quiet']:
    print("\n" + connected_msg)

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Reactome Protein-Protein Interactions',
                                'source': "File %s" % (BASE_URL + FILENAME),
                                'app': PROGRAM,
                                'app_version': __version__,
                                'url': 'http://www.reactome.org/'})
  if not dataset_id:
    print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi',
                           'where_clause': "ppitype = 'Reactome'"})
  if not rv:
    print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
    sys.exit(1)

  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print("\nProcessing {} lines from Reactome PPI file {}".format(line_ct, infile))
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()

  skip_ct = 0
  same12_ct = 0
  dup_ct = 0
  ppis = set()  # keys of the PPIs inserted so far
  ppi_ct = 0
  up2pid = {}  # UniProt accession => protein id, so each is looked up once
  notfnd = set()  # UniProt accessions known to have no target
  dba_err_ct = 0

  def pid_for(up):
    """Return the protein id for UniProt accession up, or None if unknown."""
    if up in up2pid:
      return up2pid[up]
    if up in notfnd:
      return None
    t = find_target(dba, up)
    if not t:
      notfnd.add(up)
      return None
    pid = t['components']['protein'][0]['id']
    up2pid[up] = pid
    return pid

  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader, None) # skip header line
    ct = 1 # the header line counts towards the progress total
    for row in tsvreader:
      # 0: Interactor 1 uniprot id
      # 1: Interactor 1 Ensembl gene id
      # 2: Interactor 1 Entrez Gene id
      # 3: Interactor 2 uniprot id
      # 4: Interactor 2 Ensembl gene id
      # 5: Interactor 2 Entrez Gene id
      # 6: Interaction type
      # 7: Interaction context Pubmed references
      ct += 1
      pbar.update(ct)
      # Every row lacking two usable UniProt interactors is counted as
      # skipped, including rows rejected by the 'uniprotkb:' prefix test.
      if (len(row) < 7 or not row[0].startswith('uniprotkb:')
          or not row[3].startswith('uniprotkb:')):
        skip_ct += 1
        continue
      up1 = row[0].replace('uniprotkb:', '')
      up2 = row[3].replace('uniprotkb:', '')
      if not up1 or not up2:
        skip_ct += 1
        continue
      pid1 = pid_for(up1)
      if pid1 is None:
        continue
      pid2 = pid_for(up2)
      if pid2 is None:
        continue
      int_type = row[6]
      # The key must contain the interaction type's value (not the literal
      # text 'int_type'); otherwise the same pair with a different
      # interaction type is wrongly treated as a duplicate.
      ppik = "|".join((up1, up2, int_type))
      if ppik in ppis:
        dup_ct += 1
        continue
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi({'ppitype': 'Reactome', 'interaction_type': int_type,
                        'protein1_id': pid1, 'protein1_str': up1,
                        'protein2_id': pid2, 'protein2_str': up2})
      if rv:
        ppi_ct += 1
        ppis.add(ppik)
      else:
        dba_err_ct += 1
  pbar.finish()
  for up in notfnd:
    logger.warning("No target found for: {}".format(up))
  print("{} Reactome PPI rows processed.".format(ct))
  print("  Inserted {} ({}) new ppi rows".format(ppi_ct, len(ppis)))
  if skip_ct:
    print("  Skipped {} rows without two UniProt interactors".format(skip_ct))
  if dup_ct:
    print("  Skipped {} duplicate PPIs".format(dup_ct))
  if same12_ct:
    print("  Skipped {} PPIs involving the same protein".format(same12_ct))
  if notfnd:
    print("  No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Exemplo n.º 3
0
def load(args):
  """Load MLP assay info rows into TCRD.

  Works in three stages:
    1. Build (or read from the T2AID_PICKLE cache) a mapping of target id
       to the PubChem assay ids linked to it, resolving each human row of
       AIDGI_FILE by NCBI GI xref first and by an EUtils gene symbol
       lookup second.
    2. Read per-assay details from ASSAYS_FILE.
    3. Insert one mlp_assay_info row per (target, assay) pair.

  Args:
    args: Mapping of command-line options. Keys read here are
      '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost' and
      '--dbname'.

  Raises:
    AssertionError: If the dataset or provenance row cannot be inserted.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] or LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                                    datefmt='%Y-%m-%d %H:%M:%S'))
  logger.addHandler(fh)

  dba = DBAdaptor({'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__})
  dbi = dba.get_dbinfo()
  connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(connected_msg)
  if not args['--quiet']:
    print("\n" + connected_msg)

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'MLP Assay Info',
                                'source': 'IDG-KMC generated data by Jeremy Yang at UNM.',
                                'app': PROGRAM,
                                'app_version': __version__,
                                'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."})
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance: must reference the dataset just inserted, not a fixed id.
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  if os.path.isfile(T2AID_PICKLE):
    # NOTE(review): pickle must only be used on trusted files; this one is
    # the cache written by the else-branch below.
    with open(T2AID_PICKLE, 'rb') as pf:
      t2aid = pickle.load(pf)
    act = sum(len(aids) for aids in t2aid.values())
    if not args['--quiet']:
      print("\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act))
  else:
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print("\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE))
    ct = 0
    skip_ct = 0
    fndgi_ct = 0
    fndpl_ct = 0
    notfnd = set()
    assay_ct = 0
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      for row in csv.reader(csvfile):
        # aid, tgt_gi, tgt_species, tgt_name
        ct += 1
        # Update first so that skipped rows still advance the bar.
        pbar.update(ct)
        if len(row) < 3 or row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          # Fall back to the gene symbol reported by EUtils for this GI.
          r = requests.get(EFETCH_PROTEIN_URL + gi)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              targets = dba.find_targets({'sym': grl.text})
          if targets:
            fndpl_ct += 1
          else:
            notfnd.add(gi)
            logger.warning("No target found for GI {}".format(gi))
            continue
        tid = targets[0]['id']
        t2aid.setdefault(tid, []).append(row[0])
        assay_ct += 1
    pbar.finish()
    with open(T2AID_PICKLE, 'wb') as pf:
      pickle.dump(t2aid, pf)
    print("\n{} rows processed.".format(ct))
    print("  {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid)))
    print("  Skipped {} non-huamn assay rows".format(skip_ct))
    print("    {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct))
    print("  No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile))

  assay_info = {}  # assay id => remaining columns of its ASSAYS_FILE row
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print("\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE))
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    for row in csv.reader(csvfile):
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1
      pbar.update(ct)
      if not row:
        continue
      assay_info[row[0]] = row[1:]
  pbar.finish()
  print("Got assay info for {} assays.".format(len(assay_info)))

  tct = len(t2aid)
  if not args['--quiet']:
    print("\nLoading MLP Assay Info for {} targets".format(tct))
  # The bar tracks the targets iterated below, so it is sized by their
  # count rather than by the assay file's line count; max() keeps the bar
  # length non-zero when there is nothing to load.
  pbar = ProgressBar(widgets=pbar_widgets, maxval=max(tct, 1)).start()
  ct = 0
  mai_ct = 0
  noinfo_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      ainfo = assay_info.get(aid)
      if ainfo is None:
        noinfo_ct += 1
        logger.warning("No assay info found for AID {}".format(aid))
        continue
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid,
                                   'assay_name': ainfo[1], 'method': ainfo[0],
                                   'active_sids': ainfo[5], 'inactive_sids': ainfo[6],
                                   'iconclusive_sids': ainfo[7], 'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print("\n{} targets processed.".format(ct))
  print("  Inserted {} new mlp_assay_info rows".format(mai_ct))
  if noinfo_ct:
    print("  Skipped {} assays with no assay info. See logfile {} for details.".format(noinfo_ct, logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Exemplo n.º 4
0
def load(args):
    """Load Reactome pathway memberships into the TCRD pathway table.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then inserts one 'Reactome' pathway row
    for every protein of every target matching a gene symbol listed on a
    pathway line.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost'
            and '--dbname'.

    Raises:
        AssertionError: If the dataset or provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Reactome Pathways',
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.reactome.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        'where_clause': "pwtype = 'Reactome'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.zip', '')
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} input line from Reactome Pathways file {}".format(
            line_ct, infile))
    ct = 0
    sym2pids = {}  # gene symbol => protein ids, so each is looked up once
    pmark = set()  # protein ids that received at least one pathway row
    notfnd = set()  # gene symbols known to match no target
    pw_ct = 0
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        # Example line:
        # Apoptosis       R-HSA-109581    Reactome Pathway        ACIN1   ADD1    AKT1    AKT2   ...
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in tsvreader:
            ct += 1
            pbar.update(ct)
            if len(row) < 2:
                # Blank or truncated line: no pathway name/id to load.
                continue
            pwname = row[0]
            pwid = row[1]
            url = 'http://www.reactome.org/content/detail/' + pwid
            for sym in row[3:]:
                if not sym:
                    continue
                if sym in sym2pids:
                    pids = sym2pids[sym]
                elif sym in notfnd:
                    continue
                else:
                    targets = dba.find_targets({'sym': sym})
                    if not targets:
                        notfnd.add(sym)
                        continue
                    pids = [t['components']['protein'][0]['id']
                            for t in targets]
                    sym2pids[sym] = pids
                for pid in pids:
                    rv = dba.ins_pathway({
                        'protein_id': pid,
                        'pwtype': 'Reactome',
                        'name': pwname,
                        'id_in_source': pwid,
                        'url': url
                    })
                    if rv:
                        pw_ct += 1
                        pmark.add(pid)
                    else:
                        dba_err_ct += 1
    pbar.finish()
    # Log the symbols that were NOT found; sym2pids holds the ones that
    # were resolved successfully.
    for sym in notfnd:
        logger.warning("No target found for {}".format(sym))
    print("Processed {} Reactome Pathways.".format(ct))
    print("  Inserted {} pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    if notfnd:
        print("  No target found for {} Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 5
0
def load(args):
    """Load PubTator text-mining scores into TCRD.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, inserts one ptscore row per
    (protein, year, score) found in the input file, then inserts one
    'PubTator Score' tdl_info row per protein holding the sum of its
    scores.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost'
            and '--dbname'.

    Raises:
        AssertionError: If the dataset or a provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # The tdl_info rows inserted below and the provenance where_clause
    # describing them must name the same itype.
    tdl_itype = 'PubTator Score'

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'PubTator Text-mining Scores',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/',
        'comments': 'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'ptscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = '%s'" % tdl_itype
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ptscores = {}  # protein.id => sum(all scores)
    pts_ct = 0
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, infile))
    ct = 0
    geneid2pid = {}  # NCBI Gene ID => protein ids, looked up only once
    notfnd = set()  # NCBI Gene IDs known to match no target
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            # NCBI Gene ID  year  score
            ct += 1
            pbar.update(ct)
            if len(row) < 3:
                # Blank or truncated line: no year/score to load.
                continue
            year = row[1]
            score = row[2]
            # Gene IDs may be separated by either ',' or ';'.
            for geneid in row[0].replace(',', ';').split(';'):
                if not geneid or '(tax:' in geneid:
                    continue
                if geneid in geneid2pid:
                    # we've already found it
                    pids = geneid2pid[geneid]
                elif geneid in notfnd:
                    # we've already not found it
                    continue
                else:
                    targets = dba.find_targets({'geneid': geneid})
                    if not targets:
                        notfnd.add(geneid)
                        logger.warning("No target found for {}".format(geneid))
                        continue
                    pids = [t['components']['protein'][0]['id']
                            for t in targets]
                    geneid2pid[geneid] = pids
                for pid in pids:
                    rv = dba.ins_ptscore({
                        'protein_id': pid,
                        'year': year,
                        'score': score
                    })
                    if rv:
                        pts_ct += 1
                    else:
                        dba_err_ct += 1
                    if pid in ptscores:
                        ptscores[pid] += float(score)
                    else:
                        ptscores[pid] = float(score)
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("  Inserted {} new ptscore rows for {} targets.".format(
        pts_ct, len(ptscores)))
    if notfnd:
        print("No target found for {} NCBI Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    print("\nLoading {} PubTator Score tdl_infos".format(len(ptscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, total in ptscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': tdl_itype,
            'number_value': total
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print("{} processed".format(ct))
    print("Inserted {} new PubTator PubMed Score tdl_info rows".format(ti_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 6
0
def load(args):
    """Load Pathway Commons pathway memberships into the TCRD pathway table.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then inserts one
    'PathwayCommons: <datasource>' pathway row for every UniProt accession
    on a record that resolves to a target. Records whose datasource is
    'kegg', 'wikipathways' or 'reactome' are skipped.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost'
            and '--dbname'.

    Raises:
        AssertionError: If the dataset or provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Pathway Commons',
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.pathwaycommons.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        # SQL LIKE pattern ('%' wildcard) matching the
        # 'PathwayCommons: <datasource>' pwtype values inserted below.
        'where_clause': "pwtype LIKE 'PathwayCommons: %'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} records from PathwayCommons file {}".format(
            line_ct, infile))
    # Compiled once; both are applied to the description column of each record.
    src_re = re.compile(r'datasource: (\w+)')
    name_re = re.compile(r'name: (.+?);')
    skipped_sources = ('kegg', 'wikipathways', 'reactome')
    ct = 0
    skip_ct = 0
    malformed_ct = 0  # records whose description lacks a datasource or name
    up2pid = {}  # UniProt accession => protein id, looked up only once
    pmark = set()  # protein ids that received at least one pathway row
    notfnd = set()  # UniProt accessions known to match no target
    pw_ct = 0
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        # Example line:
        # http://identifiers.org/kegg.pathway/hsa00010    name: Glycolysis / Gluconeogenesis; datasource: kegg; organism: 9606; idtype: uniprot  A8K7J7  B4DDQ8  B4DNK4  E9PCR7  P04406  ...
        # However, note that pathway commons URLs in file give 404.
        # E.g. URL from this line:
        # http://pathwaycommons.org/pc2/Pathway_0136871cbdf9a3ecc09529f1878171df  name: VEGFR1 specific signals; datasource: pid; organism: 9606; idtype: uniprot    O14786  O15530  O60462  ...
        # needs to be converted to:
        # http://apps.pathwaycommons.org/pathways?uri=http%3A%2F%2Fpathwaycommons.org%2Fpc2%2FPathway_0136871cbdf9a3ecc09529f1878171df
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in tsvreader:
            ct += 1
            pbar.update(ct)
            src_m = src_re.search(row[1]) if len(row) > 1 else None
            if src_m is None:
                malformed_ct += 1
                logger.warning("No datasource found in record {}".format(ct))
                continue
            src = src_m.group(1)
            if src in skipped_sources:
                skip_ct += 1
                continue
            name_m = name_re.search(row[1])
            if name_m is None:
                malformed_ct += 1
                logger.warning("No pathway name found in record {}".format(ct))
                continue
            pwtype = 'PathwayCommons: ' + src
            name = name_m.group(1)
            url = PCAPP_BASE_URL + urllib.quote(row[0], safe='')
            for up in row[2:]:
                if up in up2pid:
                    pid = up2pid[up]
                elif up in notfnd:
                    continue
                else:
                    targets = dba.find_targets({'uniprot': up})
                    if not targets:
                        notfnd.add(up)
                        continue
                    pid = targets[0]['components']['protein'][0]['id']
                    up2pid[up] = pid
                rv = dba.ins_pathway({
                    'protein_id': pid,
                    'pwtype': pwtype,
                    'name': name,
                    'url': url
                })
                if rv:
                    pw_ct += 1
                    pmark.add(pid)
                else:
                    dba_err_ct += 1
    pbar.finish()
    for up in notfnd:
        logger.warning("No target found for {}".format(up))
    print("Processed {} Pathway Commons records.".format(ct))
    print("  Inserted {} new pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    print("  Skipped {} records from 'kegg', 'wikipathways', 'reactome'".format(
        skip_ct))
    if malformed_ct:
        print("  Skipped {} malformed records. See logfile {} for details.".format(
            malformed_ct, logfile))
    if notfnd:
        print("  No target found for {} UniProt accessions. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 7
0
def load(args):
    """Load LocSigDB localization signals into the TCRD locsig table.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then inserts one locsig row for every
    UniProt accession on a human row that resolves to a target.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--quiet', '--dbhost'
            and '--dbname'.

    Raises:
        AssertionError: If the dataset or provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'LocSigDB',
        'source': 'File %s from %s' % (FILENAME, BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://genome.unmc.edu/LocSigDB/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'locsig'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    fn = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}".format(line_ct, fn))
    ct = 0
    up2pid = {}  # UniProt accession => protein id, looked up only once
    notfnd = set()  # UniProt accessions known to match no target
    ls_ct = 0
    skip_ct = 0
    pmark = set()  # protein ids that received at least one locsig row
    dba_err_ct = 0
    with open(fn, 'rU') as f:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for line in f:
            ct += 1
            # Update first so that skipped rows still advance the bar.
            pbar.update(ct)
            # Columns used: 0 signal, 2 location, 3 pmids, 4 ';'-separated
            # UniProt accessions, 5 species.
            data = line.split(',')
            if len(data) < 6 or 'Homo sapiens' not in data[5]:
                skip_ct += 1
                continue
            for up in data[4].split(';'):
                if not up:
                    continue
                if up in up2pid:
                    # we've already found it
                    pid = up2pid[up]
                elif up in notfnd:
                    # we've already not found it
                    continue
                else:
                    targets = dba.find_targets({'uniprot': up})
                    if not targets:
                        notfnd.add(up)
                        continue
                    pid = targets[0]['components']['protein'][0]['id']
                    up2pid[up] = pid
                rv = dba.ins_locsig({
                    'protein_id': pid,
                    'location': data[2],
                    'signal': data[0],
                    'pmids': data[3]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ls_ct += 1
                pmark.add(pid)
    pbar.finish()
    for up in notfnd:
        logger.warning("No target found for {}".format(up))
    print("{} lines processed.".format(ct))
    print("  Inserted {} new locsig rows for {} proteins".format(
        ls_ct, len(pmark)))
    print("  Skipped {} non-human rows".format(skip_ct))
    if notfnd:
        print("No target found for {} UniProts. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 8
0
def main():
    """Dump IDG-family target identifiers from TCRD into a pickle file.

    Parses the command line, connects to the TCRD database given there and
    walks the targets returned by ``dba.get_targets(idg=True, ...)``.  For
    each target the first protein component's symbol, gene id and UniProt
    accession are appended to a list keyed by the target's ``idgfam``
    value.  The resulting dict is pickled to the ``--outfile`` path.

    Raises:
        KeyError: if a target's ``idgfam`` is not one of the five families
            pre-seeded in ``idgs``.
    """
    argparser = argparse.ArgumentParser(
        description=
        "Create a pickle file containing IDG family geneids/uniprots")
    group = argparser.add_mutually_exclusive_group()
    group.add_argument("-v",
                       "--verbose",
                       action='count',
                       default=0,
                       help="Set output verbosity level")
    group.add_argument("-q", "--quiet", action="store_true")
    argparser.add_argument('-dh',
                           '--dbhost',
                           help='Database host.',
                           default=DBHOST)
    argparser.add_argument('-db',
                           '--dbname',
                           help='Database name.',
                           default=DBNAME)
    # Required so a missing path is reported up front instead of failing in
    # open() only after every target has already been read from the database.
    argparser.add_argument('-o',
                           '--outfile',
                           required=True,
                           help='Output pickle file.')
    args = argparser.parse_args()

    dba = DBAdaptor({'dbhost': args.dbhost, 'dbname': args.dbname})
    dbi = dba.get_dbinfo()
    tct = dba.get_target_count(idg=True)
    if not args.quiet:
        print("\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c")))
        print("  Connected to TCRD database %s (schema ver: %s, data ver: %s)" % (
            args.dbname, dbi['schema_ver'], dbi['data_ver']))
        print("  Dumping TCRD IDG Families for %d targets" % tct)
    start_time = time.time()
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    idgs = {'GPCR': [], 'oGPCR': [], 'Kinase': [], 'IC': [], 'NR': []}
    ct = 0
    for t in dba.get_targets(idg=True, include_annotations=False):
        ct += 1
        p = t['components']['protein'][0]
        idgs[t['idgfam']].append({
            'sym': p['sym'],
            'geneid': p['geneid'],
            'uniprot': p['uniprot']
        })
        pbar.update(ct)
    pbar.finish()

    elapsed = time.time() - start_time
    print("%d TCRD targets processed. Elapsed time: %s" % (ct,
                                                           secs2str(elapsed)))
    print("Saving info for following IDG Family counts to pickle file %s" % args.outfile)
    for idgfam in idgs:
        print("  %s: %d" % (idgfam, len(idgs[idgfam])))
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(args.outfile, 'wb') as ofh:
        pickle.dump(idgs, ofh)

    print("\n%s: Done.\n" % PROGRAM)
Exemplo n.º 9
0
def load(args):
    """Load WikiPathways annotations into the TCRD ``pathway`` table.

    Registers a 'WikiPathways' dataset and its provenance, then reads the
    tab-separated pathways file row by row.  Column 0 is a '%'-delimited
    descriptor whose first field is the pathway name, column 1 is the
    pathway URL (its last path segment is used as the id in source) and
    the remaining columns are gene ids.  Each gene id is resolved with
    ``dba.find_targets({'geneid': ...})`` and one pathway row is inserted
    for every protein found.

    Args:
        args: docopt-style mapping; reads '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: if inserting the dataset or provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected)
    if not args['--quiet']:
        print("\n" + connected)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'WikiPathways',
        # Parenthesised for clarity; yields the same string as before.
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.wikipathways.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        'where_clause': "pwtype = 'WikiPathways'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    fn = DOWNLOAD_DIR + PATHWAYS_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} input lines from WikiPathways file {}".format(
            line_ct, fn))
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        # Example line:
        # Apoptosis Modulation and Signaling%WikiPathways_20160516%WP1772%H**o sapiens    http://www.wikipathways.org/instance/WP1772_r85184       843     3725    842 ...
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        gid2pids = {}  # gene id -> protein ids; each gene is looked up once
        pmark = set()
        notfnd = set()
        pw_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            name = row[0].split('%')[0]
            wpid = row[1].split('/')[-1]
            for gid in row[2:]:
                if gid in notfnd:
                    continue
                if gid in gid2pids:
                    pids = gid2pids[gid]
                else:
                    targets = dba.find_targets({'geneid': gid})
                    if not targets:
                        notfnd.add(gid)
                        continue
                    # Build the id list only from a fresh lookup.  Previously
                    # this ran on cache hits too, appending ids from a stale
                    # `targets` list to the cached entry.
                    pids = [t['components']['protein'][0]['id']
                            for t in targets]
                    gid2pids[gid] = pids
                for pid in pids:
                    rv = dba.ins_pathway({
                        'protein_id': pid,
                        'pwtype': 'WikiPathways',
                        'name': name,
                        'id_in_source': wpid,
                        'url': row[1]
                    })
                    if rv:
                        pw_ct += 1
                        pmark.add(pid)
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    # Log the genes that were NOT found (the original iterated the found ones).
    for gid in notfnd:
        logger.warning("No target found for {}".format(gid))
    print("Processed {} WikiPathways.".format(ct))
    print("  Inserted {} pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    if notfnd:
        print("  No target found for {} Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 10
0
def load():
    """Load TechDev worklist spreadsheets into TCRD.

    Parses the command line with docopt, registers the
    'TechDev Worklist Info' dataset and its provenance, then processes
    every CSV file in ``INPUTFILES``.  Each file starts with labelled
    header rows ('TechDev ID:', 'Grant Number:', ...) that are collected
    into one ``techdev_contact`` record; the record is inserted when the
    'tcrd_target_id' column-header row is reached.  Each later row with a
    non-empty column 6 and a comment in column 7 becomes one
    ``techdev_info`` record for the target matching the symbol in column 1.

    Exits the process with status 1 if the dataset or a provenance row
    cannot be inserted.
    """
    args = docopt(__doc__, version=__version__)
    debug = int(args['--debug'])
    if debug:
        print("\n[*DEBUG*] ARGS:\n%s\n" % repr(args))

    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = "%s.log" % PROGRAM
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not debug:
        logger.propagate = False  # turns off console logging when debug is 0
    fh = logging.FileHandler(logfile)
    fh.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("Connected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'TechDev Worklist Info',
        'source':
        'Files from TechDev Groups',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'Loading app uses data from spreadsheets submitted by the TechDev groups listing targets being investigated.'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'techdev_contact',
        'comment': ""
    }, {
        'dataset_id': dataset_id,
        'table_name': 'techdev_info',
        'comment': ""
    }]
    for prov in provs:
        if not dba.ins_provenance(prov):
            print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    # Header-row label -> key in the techdev_contact record.  'TechDev ID:'
    # is handled separately because its value is converted to int.
    contact_fields = {
        'Grant Number:': 'grant_number',
        'Submitter name:': 'contact_name',
        'Contact email:': 'contact_email',
        'Submission date:': 'date',
    }

    for filename in INPUTFILES.values():
        line_ct = wcl(filename)
        if not args['--quiet']:
            print('\nProcessing %d lines from input file: %s' % (line_ct,
                                                                 filename))
        with open(filename, 'rU') as csvfile:
            csvreader = csv.reader(csvfile)
            pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
            ct = 0
            contact = {}
            contact_inserted = False
            skip_ct = 0
            err_ct = 0
            info_ct = 0
            notfnd = []
            dba_err_ct = 0
            for row in csvreader:
                ct += 1
                # Advance for every row; the original skipped the update on
                # all `continue` paths, so the bar lagged behind.
                pbar.update(ct)
                if not row:
                    # csv.reader yields [] for a blank line; row[0] would raise.
                    continue
                label = row[0]
                if label == 'TechDev ID:':
                    techdev_id = int(row[1])
                    contact['id'] = techdev_id
                    continue
                if label in contact_fields:
                    contact[contact_fields[label]] = row[1]
                    continue
                if label == 'tcrd_target_id':
                    # Column-header row: all contact fields have been seen.
                    contact['pi'] = PIS[techdev_id]
                    contact_id = dba.ins_techdev_contact(contact)
                    if not contact_id:
                        logger.error("DBA error inserting techdev_contact.")
                        print("Exiting due to DBA error inserting techdev_contact. See logfile %s for details." % logfile)
                        break
                    contact_inserted = True
                    continue
                if not row[6]:
                    skip_ct += 1
                    continue
                sym = row[1]
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.append(sym)
                    continue
                pid = targets[0]['components']['protein'][0]['id']
                if not row[7]:
                    err_ct += 1
                    continue
                init = {
                    'contact_id': contact_id,
                    'protein_id': pid,
                    'comment': row[7]
                }
                # Optional columns are only sent when they hold a value.
                if row[8]:
                    init['publication_pcmid'] = row[8]
                if row[9]:
                    init['publication_pmid'] = row[9]
                if row[11]:
                    init['resource_url'] = row[11]
                if row[10]:
                    init['data_url'] = row[10]
                if dba.ins_techdev_info(init):
                    info_ct += 1
                else:
                    dba_err_ct += 1
        pbar.finish()
        if not args['--quiet']:
            print("%d lines processed." % ct)
            print("  Skipped %d lines not under investigation" % skip_ct)
            if err_ct > 0:
                print("  WARNING: %d lines did not have a comment!" % err_ct)
            if notfnd:
                # The count argument was missing, so a literal '%d' was printed.
                print("  WARNING: %d symbols did not find a target!" % len(notfnd))
                for sym in notfnd:
                    print("    %s" % sym)
            # Only claim the contact row when the insert actually succeeded.
            if contact_inserted:
                print("  Inserted 1 new techdev_contact row")
            print("  Inserted %d new techdev_info rows" % info_ct)
            if dba_err_ct > 0:
                print("WARNING: %d DB errors occurred. See logfile %s for details." % (
                    dba_err_ct, logfile))
Exemplo n.º 11
0
def load(args):
  """Load NIH grant text-mining results into TCRD.

  Registers the 'NIH Grant Textmining Info' dataset and its provenance,
  then for each year 2000-2017 reads the pickled target->appids and
  appid->targets mappings from ``TAGGING_RESULTS_DIR``.  For every
  (target, grant) pair one ``grant`` row is inserted whose cost is the
  grant's total cost divided by the number of targets tagged in it.
  Finally one 'NIHRePORTER 2000-2017 R01 Count' tdl_info row is inserted
  per target that has at least one R01 grant.

  Args:
    args: docopt-style mapping; reads '--loglevel', '--logfile',
      '--debug', '--dbhost', '--dbname' and '--quiet'.

  Raises:
    AssertionError: if inserting the dataset or a provenance row fails.
  """
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  connected = "Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(connected)
  if not args['--quiet']:
    print("\n" + connected)

  if not args['--quiet']:
    print("\nLoading project info from pickle file %s" % PROJECTS_P)
  # Loaded unconditionally: it used to sit inside the --quiet check, so a
  # quiet run failed with NameError at the first `projects[...]` lookup.
  # NOTE: pickle.load must only ever be used on trusted, locally generated files.
  with open(PROJECTS_P, 'rb') as pf:
    projects = pickle.load(pf)

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'NIH Grant Textmining Info', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "Grant info is generated from textmining results of running Lars Jensen's tagger software on project info downloaded from NIHExporter."} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'grant'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': 'itype is "NIHRePORTER 2000-2017 R01 Count"'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  if not args['--quiet']:
    print("\nLoading tagging results in %s" % TAGGING_RESULTS_DIR)
  r01cts = {}      # target id -> number of R01 grants across all years
  dba_err_ct = 0   # accumulated over all years; it used to be reset per year
  for year in [str(yr) for yr in range(2000, 2018)]: # 2000-2017
    pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
    with open(pfile, 'rb') as pf:
      target2appids = pickle.load(pf)
    tct = len(target2appids)
    if not args['--quiet']:
      print("\nProcessing tagging results for {}: {} targets".format(year, tct))
    pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
    with open(pfile, 'rb') as pf:
      appid2targets = pickle.load(pf)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    t2g_ct = 0
    for tid,appids in target2appids.items():
      ct += 1
      pbar.update(ct)
      for appid in appids:
        if appid not in appid2targets:
          # need to do this check because of projects removed with > 10 targets tagged
          continue
        app_target_ct = len(appid2targets[appid]) # number of targets tagged in this grant
        ginfo = projects[year][appid]
        # gcost is total grant dollars
        if ginfo['TOTAL_COST']:
          gcost = float(ginfo['TOTAL_COST'])
        elif ginfo['TOTAL_COST_SUB_PROJECT']:
          gcost = float(ginfo['TOTAL_COST_SUB_PROJECT'])
        else:
          continue
        # grant_target_cost is dollars per target for this grant
        grant_target_cost = gcost/app_target_ct
        rv = dba.ins_grant( {'target_id': tid, 'appid': appid, 'year': year,
                             'full_project_num': ginfo['FULL_PROJECT_NUM'],
                             'activity': ginfo['ACTIVITY'],
                             'funding_ics': ginfo['FUNDING_ICs'],
                             'cost': "%.2f"%grant_target_cost } )
        if not rv:
          dba_err_ct += 1
          continue
        t2g_ct += 1
        # track R01s
        if ginfo['ACTIVITY'] == 'R01':
          r01cts[tid] = r01cts.get(tid, 0) + 1
    pbar.finish()
    print("Processed {} target tagging records.".format(ct))
    print("  Inserted {} new target2grant rows".format(t2g_ct))

  # Now load 'NIHRePORTER 2000-2017 R01 Count' tdl_infos
  # (the banner previously said 2010-2017, contradicting the itype)
  print("\nLoading 'NIHRePORTER 2000-2017 R01 Count' tdl_infos for {} targets".format(len(r01cts)))
  ti_ct = 0
  for tid in r01cts:
    rv = dba.ins_tdl_info( {'target_id': tid, 'itype': 'NIHRePORTER 2000-2017 R01 Count',
                            'integer_value': r01cts[tid]} )
    if not rv:
      dba_err_ct += 1
      continue
    ti_ct += 1
  print("  Inserted {} new tdl_info rows".format(ti_ct))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Exemplo n.º 12
0
def _load_uniprot_entry(up, dba, logger, dataset_id, xtypes, e2e):
  """Fetch one UniProt accession, convert it and insert it as a target.

  Returns an (outcome, value) pair, outcome being one of:
    'retry'     - get_uniprot() gave no status; value is None
    'api_error' - status other than 200; value is that status
    'xml_error' - uniprotxml2target() returned nothing; value is None
    'db_error'  - dba.ins_target() returned nothing; value is None
    'loaded'    - success; value is the inserted target id
  """
  (status, headers, upxml) = get_uniprot(up)
  # Code	Description
  # 200	The request was processed successfully.
  # 300 Obsolete.
  # 400	Bad request. There is a problem with your input.
  # 404	Not found. The resource you requested doesn't exist.
  # 410	Gone. The resource you requested was removed.
  # 500	Internal server error. Most likely a temporary problem, but if the problem persists please contact us.
  # 503	Service not available. The server is being updated, try again later.
  if not status:
    logger.warning("Failed getting accession %s" % up)
    return ('retry', None)
  if status != 200:
    logger.error("Bad UniProt API response for %s: %s" % (up, status))
    return ('api_error', status)
  target = uniprotxml2target(up, upxml, dataset_id, xtypes, e2e)
  if not target:
    logger.error("XML Error for %s" % up)
    return ('xml_error', None)
  tid = dba.ins_target(target)
  if not tid:
    return ('db_error', None)
  logger.debug("Target insert id: %s" % tid)
  return ('loaded', tid)


def load(args):
  """Load human UniProt entries into TCRD as targets/proteins.

  Registers the 'UniProt' dataset and its provenance rows, reads the
  accessions from the first column of ``UPHUMAN_FILE`` (after a header
  row) and, for each one, fetches the entry through ``get_uniprot``,
  converts it with ``uniprotxml2target`` and inserts it with
  ``dba.ins_target``.  Progress is kept in a shelf (``SHELF_FILE``) under
  the keys 'ups', 'loaded', 'retries' and 'errors'.  Accessions whose
  fetch returned no status are retried in further passes until none remain.

  Args:
    args: docopt-style mapping; reads '--loglevel', '--logfile',
      '--debug', '--dbhost', '--dbname' and '--quiet'.

  Exits the process with status 1 if the dataset or a provenance row
  cannot be inserted.
  """
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  # Read the flag from args like the sibling loaders do; the bare name
  # `debug` is not defined inside this function.
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'UniProt', 'source': 'Web API at %s'%BASEURL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.uniprot.org/uniprot'} )
  if not dataset_id:
    print("WARNING: Error inserting dataset. See logfile %s for details." % logfile)
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'ttype'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'name'},
            {'dataset_id': dataset_id, 'table_name': 'protein'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'UniProt Function'"},
            {'dataset_id': dataset_id, 'table_name': 'goa'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'UniProt Tissue'"},
            {'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "type = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'feature'},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d"%dataset_id},
            {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d"%dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
      sys.exit(1)

  start_time = time.time()
  xtypes = dba.get_xref_types()
  # see EvidenceOntology.ipynb for where this comes from
  e2e = {'ECO:0000250': 'ISS', 'ECO:0000269': 'EXP', 'ECO:0000270': 'IEP', 'ECO:0000303': 'NAS',
         'ECO:0000304': 'TAS', 'ECO:0000305': 'IC' ,'ECO:0000314': 'IDA','ECO:0000315': 'IMP',
         'ECO:0000316': 'IGI','ECO:0000318': 'IBA', 'ECO:0000353': 'IPI', 'ECO:0000501': 'IEA'}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  s = shelve.open(SHELF_FILE, writeback=True)
  try:
    # try/finally so the shelf is closed (and, with writeback=True, its
    # cached state written out) even when the run aborts with an exception.
    s['ups'] = []
    s['loaded'] = {}
    s['retries'] = {}
    s['errors'] = {}

    line_ct = wcl(UPHUMAN_FILE)
    line_ct -= 1 # file has header row
    if not args['--quiet']:
      print("\nProcessing %d records in UniProt file %s" % (line_ct, UPHUMAN_FILE))
    with open(UPHUMAN_FILE, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      next(tsvreader) # skip header line
      for row in tsvreader:
        s['ups'].append(row[0])

    print("\nLoading data for %d proteins" % len(s['ups']))
    logger.info("Loading data for %d proteins" % len(s['ups']))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['ups'])).start()
    ct = 0
    xml_err_ct = 0
    dba_err_ct = 0
    for i,up in enumerate(s['ups']):
      ct += 1
      logger.info("Processing UniProt entry %d: %s" % (i, up))
      outcome, value = _load_uniprot_entry(up, dba, logger, dataset_id, xtypes, e2e)
      if outcome == 'retry':
        s['retries'][up] = True
      elif outcome == 'api_error':
        s['errors'][up] = value
      elif outcome == 'xml_error':
        xml_err_ct += 1
      elif outcome == 'db_error':
        dba_err_ct += 1
      else:
        s['loaded'][up] = value
      # Pause after every request; the original skipped the pause on
      # failure paths, so failed requests were fired back to back.
      time.sleep(0.5)
      pbar.update(ct)
    pbar.finish()
    print("Processed %d UniProt records." % ct)
    print("  Total loaded targets/proteins: %d" % len(s['loaded']))
    if len(s['retries']) > 0:
      print("  Total targets/proteins remaining for retries: %d " % len(s['retries']))
    if len(s['errors']) > 0:
      print("WARNING: %d API errors occurred. See logfile %s for details." % (len(s['errors']), logfile))
    if xml_err_ct > 0:
      print("WARNING: %d XML parsing errors occurred." % xml_err_ct)
    if dba_err_ct > 0:
      print("WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile))

    loop = 1
    # NOTE(review): accessions whose fetch keeps returning no status are
    # retried without an upper bound, as in the original; consider a cap.
    while len(s['retries']) > 0:
      print("\nRetry loop %d: Trying to load data for %d proteins" % (loop, len(s['retries'])))
      logger.info("Retry loop %d: Trying to load data for %d proteins" % (loop, len(s['retries'])))
      pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['retries'])).start()
      ct = 0
      tct = 0
      xml_err_ct = 0
      dba_err_ct = 0
      # Iterate over a snapshot of the keys because entries are deleted below.
      for up in list(s['retries']):
        ct += 1
        logger.info("Processing UniProt entry %s" % up)
        outcome, value = _load_uniprot_entry(up, dba, logger, dataset_id, xtypes, e2e)
        if outcome != 'retry':
          # Any definitive outcome leaves the retry set.  Previously only a
          # successful insert did, so an entry ending in an API, XML or DB
          # error kept this while-loop running forever.
          del s['retries'][up]
        if outcome == 'api_error':
          s['errors'][up] = value
        elif outcome == 'xml_error':
          xml_err_ct += 1
        elif outcome == 'db_error':
          dba_err_ct += 1
        elif outcome == 'loaded':
          tct += 1
          s['loaded'][up] = value
        time.sleep(0.5)
        pbar.update(ct)
      loop += 1
      pbar.finish()
      print("Processed %d UniProt records." % ct)
      print("  Loaded %d new targets/proteins" % tct)
      print("  Total loaded targets/proteins: %d" % len(s['loaded']))
      if len(s['retries']) > 0:
        print("  Total targets/proteins remaining for next loop: %d " % len(s['retries']))
      if len(s['errors']) > 0:
        print("WARNING: %d API errors occurred. See logfile %s for details." % (len(s['errors']), logfile))
      if xml_err_ct > 0:
        print("WARNING: %d XML parsing errors occurred." % xml_err_ct)
      if dba_err_ct > 0:
        print("WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile))
  finally:
    s.close()
Exemplo n.º 13
0
  # Script entry-point body (the enclosing header is not part of this
  # snippet): parse the command line, set up file logging, connect to TCRD
  # and dispatch on the --command option.
  args = docopt(__doc__, version=__version__)
  if args['--debug']:
    print "\n[*DEBUG*] ARGS:\n%s\n"%repr(args)
  
  # Log to LOGFILE; records only propagate to parent loggers in debug mode.
  loglevel = int(args['--loglevel'])
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(LOGFILE)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # The adaptor is given this module's logger name via 'logger_name'.
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dispatch: 'map' runs pickle_sym2pid(), 'load' runs load(); any other
  # --command value does nothing and only the elapsed time is reported.
  start_time = time.time()
  if args['--command'] == 'map':
    pickle_sym2pid(args, dba, logger)
  elif args['--command'] == 'load':
    load(args, dba, logger)
  elapsed = time.time() - start_time
  print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed))

# ct = 0
# sym2pid = {}
Exemplo n.º 14
0
def load(args):
    """Register the KEGG Pathways dataset and request the pathway/gene links.

    Sets up file logging, connects to TCRD, inserts the 'KEGG Pathways'
    dataset and its provenance row, then requests
    ``<KEGG_BASE_URL>/link/hsa/pathway``, making up to three attempts.

    Args:
        args: docopt-style mapping; reads '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: if inserting the dataset or provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        # keep records away from the console handlers of parent loggers
        logger.propagate = False
    file_handler = logging.FileHandler(logfile)
    file_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    banner = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(banner)
    if not args['--quiet']:
        print("\n" + banner)

    # Dataset
    dataset_spec = {
        'name': 'KEGG Pathways',
        'source': 'API at %s' % KEGG_BASE_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.genome.jp/kegg/pathway.html'
    }
    dataset_id = dba.ins_dataset(dataset_spec)
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provenance_spec = {
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        'where_clause': "pwtype = 'KEGG'"
    }
    rv = dba.ins_provenance(provenance_spec)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    print("\nMapping KEGG pathways to gene lists")
    kpw2geneids = {}
    url = "%s/link/hsa/pathway" % KEGG_BASE_URL
    r = None
    attempts = 0
    # Stop at the first request that does not raise, or after three failures.
    while attempts < 3:
        try:
            r = requests.get(url)
        except Exception as e:
            attempts += 1
        else:
            break
Exemplo n.º 15
0
def load(args):
    """Flag transcription factors listed in the AnimalTFDB file in TCRD.

    Registers the 'AnimalTFDB' dataset and its provenance, then reads the
    tab-separated ``INFILE``.  Each row is matched to a TCRD target by
    symbol (column 3), then by gene id (column 2), then by an 'Ensembl'
    xref (column 1).  For the first match an 'Is Transcription Factor'
    tdl_info row is inserted and the target's TDL is tallied.  Rows with
    no match are written to the logfile.

    Args:
        args: docopt-style mapping; reads '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: if inserting the dataset or provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    # Read the flag from args like the sibling loaders do; the bare name
    # `debug` is not defined inside this function.
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected)
    if not args['--quiet']:
        print("\n" + connected)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'AnimalTFDB',
        'source':
        'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.bioguo.org/AnimalTFDB/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Is Transcription Factor'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}\n".format(
            line_ct, INFILE))
    with open(INFILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        ti_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            # Advance for every row, including those that find no target.
            pbar.update(ct)
            # Lookup fallbacks: symbol, then gene id, then Ensembl xref.
            targets = dba.find_targets({'sym': row[3]})
            if not targets:
                targets = dba.find_targets({'geneid': row[2]})
            if not targets:
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': row[1]
                })
            if not targets:
                notfnd.append(row)
                continue
            t = targets[0]
            TDLs[t['tdl']] += 1
            pid = t['components']['protein'][0]['id']
            rv = dba.ins_tdl_info({
                'protein_id': pid,
                'itype': 'Is Transcription Factor',
                'boolean_value': 1
            })
            if rv:
                ti_ct += 1
            else:
                dba_err_ct += 1
    pbar.finish()
    # Record unmatched rows; previously only their count was reported and
    # the rows themselves were lost.
    for row in notfnd:
        logger.warning("No target found for row: {}".format(row))
    print("\n{} lines processed.".format(ct))
    print("  Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct))
    if notfnd:
        print("No target found for {} rows. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: %d DB errors occurred. See logfile %s for details." % (
            dba_err_ct, logfile))
    for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
        print("{}: {}".format(tdl, TDLs[tdl]))
Exemplo n.º 16
0
def _target_ids_for_matches(dba, matches, ensp2tid, ensps_notfnd):
    """Resolve tagger matches to the ids of the targets they refer to.

    The ENSP identifier of each match is read from ``m[2][0][1]``. The same
    tag can match several times, so identifiers are de-duplicated before any
    lookup. Lookups are memoized across calls through the two caller-owned
    caches, which are updated in place.

    Args:
        dba: DBAdaptor used for the target lookups.
        matches: Truthy result of ``Tagger.get_matches``.
        ensp2tid: dict mapping ENSP -> target id for ENSPs already resolved.
        ensps_notfnd: set of ENSPs already known to have no target.

    Returns:
        set of target ids found for the matches (possibly empty).
    """
    tids = set()
    for ensp in set(m[2][0][1] for m in matches):
        if ensp in ensp2tid:
            tids.add(ensp2tid[ensp])
            continue
        if ensp in ensps_notfnd:
            continue
        targets = dba.find_targets({'stringid': ensp}, idg=False)
        if not targets:
            targets = dba.find_targets_by_xref(
                {'xtype': 'Ensembl', 'value': ensp}, idg=False)
        if not targets:
            ensps_notfnd.add(ensp)
            continue
        tid = targets[0]['id']
        ensp2tid[ensp] = tid  # save this so we don't look up the targets again
        tids.add(tid)
    return tids


def main(args):
    """Tag project titles/abstracts with targets and pickle the results.

    For every year from 2000 through 2017, each project in the pickled
    project info is run through the Tagger (title, and abstract when
    present). Matched tags are mapped to TCRD target ids and two mappings
    are written per year to TAGGING_RESULTS_DIR: appid -> target ids and
    target id -> appids. Projects with no cost or a cost below $10k are
    ignored, and projects mapping to more than 10 targets are dropped from
    the appid -> targets mapping.

    Args:
        args: Option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--dbhost', '--dbname' and '--quiet'.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    # Honor the command line options rather than the module defaults.
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    if not args['--quiet']:
        print("\nLoading project info from pickle file {}".format(PROJECTS_P))
    # The project info must be loaded whether or not --quiet was given.
    # NOTE(review): pickle is only safe on trusted input; PROJECTS_P is
    # presumably produced by this project's own tooling - confirm.
    with open(PROJECTS_P, 'rb') as projects_file:
        projects = pickle.load(projects_file)

    if not args['--quiet']:
        print("\nCreating Tagger...")
    tgr = Tagger()
    tgr.load_names(ENTITIES_FILE, NAMES_FILE)
    tgr.load_global(GLOBAL_FILE)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    for year in [str(yr) for yr in range(2000, 2018)]:  # 2000-2017
        pct = len(projects[year])
        print("\nTagging {} projects from {}".format(pct, year))
        logger.info("Tagging {} projects from {}".format(pct, year))
        pbar = ProgressBar(widgets=pbar_widgets, maxval=pct).start()
        ct = 0
        ttag_ct = 0
        abstag_ct = 0
        skip_ct = 0
        ttagsnotfnd = set()
        ttag2targetid = {}
        appid2targets = defaultdict(set)
        target2appids = defaultdict(set)
        for appid in projects[year].keys():
            ct += 1
            logger.debug("  Processing appid {}".format(appid))
            ginfo = projects[year][appid]
            # if there's no $$, we're not interested
            if ginfo['TOTAL_COST']:
                gcost = int(ginfo['TOTAL_COST'])
            elif ginfo['TOTAL_COST_SUB_PROJECT']:
                gcost = int(ginfo['TOTAL_COST_SUB_PROJECT'])
            else:
                continue
            # also, if there's less than $10k we're not interested
            if gcost < 10000:
                skip_ct += 1
                continue
            tids = set()
            #
            # tag titles
            #
            matches = tgr.get_matches(ginfo['PROJECT_TITLE'], appid, [9606])
            if matches:
                ttag_ct += 1
                tids |= _target_ids_for_matches(dba, matches, ttag2targetid,
                                                ttagsnotfnd)
            #
            # tag abstracts
            #
            if 'ABSTRACT' in ginfo:
                matches = tgr.get_matches(ginfo['ABSTRACT'], appid, [9606])
                if matches:
                    abstag_ct += 1
                    tids |= _target_ids_for_matches(dba, matches,
                                                    ttag2targetid, ttagsnotfnd)
            # Only touch the defaultdicts when there is something to record,
            # so projects without targets do not get empty entries.
            for tid in tids:
                appid2targets[appid].add(tid)
                target2appids[tid].add(appid)
            pbar.update(ct)
        pbar.finish()

        # Iterate over a snapshot because entries are deleted in the loop.
        # NOTE(review): target2appids still references the removed appids -
        # confirm whether that is intended.
        del_ct = 0
        for appid, tidset in list(appid2targets.items()):
            if len(tidset) > 10:
                del_ct += 1
                del appid2targets[appid]

        logger.info("{} projects processed.".format(ct))
        logger.info("  Removed {} projects with > 10 targets".format(del_ct))
        logger.info(
            "  Skipped {} projects with funds less than $10k:".format(skip_ct))
        logger.info("  {} titles have tagging result(s)".format(ttag_ct))
        logger.info("  {} abstracts have tagging result(s)".format(abstag_ct))
        logger.info("{} total tags map to {}/{} distinct targets".format(
            len(ttag2targetid), len(set(ttag2targetid.values())),
            len(target2appids)))
        logger.info("{} project applications map to target(s)".format(
            len(appid2targets)))
        if ttagsnotfnd:
            logger.info("  No target found for {} tags".format(
                len(ttagsnotfnd)))
        pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
        with open(pfile, 'wb') as out:
            pickle.dump(appid2targets, out)
        logger.info("Tagging results saved to pickle {} for {}".format(
            pfile, year))
        pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
        with open(pfile, 'wb') as out:
            pickle.dump(target2appids, out)
        logger.info("Tagging results saved to pickle {} for {}".format(
            pfile, year))
        print("{} projects processed. See logfile {} for details.".format(
            ct, logfile))
Exemplo n.º 17
0
def _bioplex_main_row(row):
  """Return (key1, key2, p_wrong, p_ni, p_int) for a main BioPlex file row.

  Columns: GeneA GeneB UniprotA UniprotB SymbolA SymbolB pW pNI pInt
  Keys have the form "uniprot|symbol|geneid".
  """
  k1 = "%s|%s|%s" % (row[2], row[4], row[0])
  k2 = "%s|%s|%s" % (row[3], row[5], row[1])
  return k1, k2, row[6], row[7], row[8]


def _bioplex_update_row(row):
  """Return (key1, key2, p_wrong, p_ni, p_int) for a BioPlex update file row.

  Columns: plate_num well_num db_protein_id symbol gene_id bait_symbol
  bait_geneid pWrongID pNoInt pInt
  Update files carry no UniProt column, so keys have the form
  "|symbol|geneid"; protein 1 is the bait.
  """
  k1 = "|%s|%s" % (row[5], row[6])
  k2 = "|%s|%s" % (row[3], row[4])
  return k1, k2, row[7], row[8], row[9]


def _bioplex_protein_id(dba, key, k2pid, notfnd):
  """Return the protein id for key, or None when no target is found.

  Results are memoized in k2pid (hits) and notfnd (misses), both of which
  are updated in place.
  """
  if key in k2pid:
    return k2pid[key]
  if key in notfnd:
    return None
  target = find_target(dba, key)
  if not target:
    notfnd.add(key)
    return None
  pid = target['components']['protein'][0]['id']
  k2pid[key] = pid
  return pid


def _load_bioplex_file(dba, logger, logfile, path, line_ct, pbar_widgets, parse_row):
  """Insert the interactions of one BioPlex TSV file and print a summary.

  Args:
    dba: DBAdaptor used for target lookups and ppi inserts.
    logger: Logger that receives one warning per key with no target.
    logfile: Log file path, only used in the printed messages.
    path: TSV file to read; its first line is skipped as a header.
    line_ct: Number of data lines, used as the progress bar maximum.
    pbar_widgets: Widgets for the ProgressBar.
    parse_row: Callable turning a row into (key1, key2, p_wrong, p_ni, p_int).
  """
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(path, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader) # skip header line
    ct = 0
    ppi_ct = 0
    same12_ct = 0
    k2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k1, k2, pw, pni, pint = parse_row(row)
      # A row is skipped as soon as either protein cannot be resolved;
      # protein 2 is not looked up when protein 1 is missing.
      pid1 = _bioplex_protein_id(dba, k1, k2pid, notfnd)
      if pid1 is None:
        continue
      pid2 = _bioplex_protein_id(dba, k2, k2pid, notfnd)
      if pid2 is None:
        continue
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi( {'ppitype': 'BioPlex','p_int': pint, 'p_ni': pni, 'p_wrong': pw,
                         'protein1_id': pid1, 'protein1_str': k1,
                         'protein2_id': pid2, 'protein2_str': k2} )
      if rv:
        ppi_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warning("No target found for: {}".format(k))
  print("{} BioPlex PPI rows processed.".format(ct))
  print("  Inserted {} new ppi rows".format(ppi_ct))
  if same12_ct:
    print("  Skipped {} PPIs involving the same protein".format(same12_ct))
  if notfnd:
    print("  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))


def load(args):
  """Load BioPlex protein-protein interactions into the TCRD ppi table.

  Registers the dataset and its provenance, then inserts the interactions
  from BIOPLEX_FILE followed by those from each file in UPD_FILES[1:].
  Target lookups are cached per file; rows whose proteins cannot be mapped,
  or that pair a protein with itself, are skipped and counted.

  Args:
    args: Option mapping; the keys read here are '--loglevel', '--logfile',
      '--debug', '--dbhost', '--dbname' and '--quiet'.

  Raises:
    AssertionError: If inserting the dataset or the provenance fails.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] if args['--logfile'] else LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'BioPlex Protein-Protein Interactions', 'source': "Files %s from http://wren.hms.harvard.edu/bioplex/downloadInteractions.php"%", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://wren.hms.harvard.edu/bioplex/index.php'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'BioPlex'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  f = BIOPLEX_FILE
  line_ct = slmf.wcl(f) - 1 # do not count the header line
  if not args['--quiet']:
    print("\nProcessing {} lines from BioPlex PPI file {}".format(line_ct, f))
  _load_bioplex_file(dba, logger, logfile, f, line_ct, pbar_widgets, _bioplex_main_row)

  # NOTE(review): the first entry of UPD_FILES is deliberately skipped here,
  # as in the original code - confirm why it is excluded.
  for f in UPD_FILES[1:]:
    line_ct = slmf.wcl(f) - 1 # do not count the header line
    if not args['--quiet']:
      print("\nProcessing {} lines from BioPlex PPI update file {}".format(line_ct, f))
    _load_bioplex_file(dba, logger, logfile, f, line_ct, pbar_widgets, _bioplex_update_row)
Exemplo n.º 18
0
def load(args):
  """Load Drugable Epigenome classes into TCRD as tdl_info rows.

  Registers the dataset and its provenance, then walks FILE_LIST (a mapping
  of group name -> {domain: file name}). For every data row of every CSV
  file the target is looked up by symbol (column 0), then gene id
  (column 3), then UniProt (column 2); for each target found a
  'Drugable Epigenome Class' tdl_info is inserted for its first protein.

  Args:
    args: Option mapping; the keys read here are '--loglevel', '--logfile',
      '--debug', '--dbhost', '--dbname' and '--quiet'.

  Raises:
    AssertionError: If inserting the dataset or the provenance fails.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] if args['--logfile'] else LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Drugable Epigenome Domains', 'source': 'Files from http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Drugable Epigenome Class'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  total_ti_ct = 0
  notfnd = set()
  for group, dom2file in FILE_LIST.items():
    if not args['--quiet']:
      print("\nProcessing Epigenetic {}s".format(group))
    for dom, fname in dom2file.items():
      f = INPUT_DIR + fname
      line_ct = slmf.wcl(f)
      if not args['--quiet']:
        print('Processing {} lines from {} input file {}'.format(line_ct, dom, f))
      with open(f, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader) # skip header line
        ct = 0
        not_fnd_ct = 0
        tct = 0
        ti_ct = 0
        dba_err_ct = 0
        for row in csvreader:
          ct += 1
          targets = dba.find_targets({'sym': row[0]})
          if not targets:
            targets = dba.find_targets({'geneid': row[3]})
          if not targets:
            targets = dba.find_targets({'uniprot': row[2]})
          if not targets:
            # Use a dedicated name for the miss key: reusing the group
            # loop variable would corrupt every string_value built below
            # for the remaining rows and files.
            nf_key = "%s|%s|%s"%(row[0],row[3],row[2])
            notfnd.add(nf_key)
            logger.warning("No target found for: {}".format(nf_key))
            not_fnd_ct += 1
            continue
          tct += 1
          t = targets[0]
          p = t['components']['protein'][0]
          # Five-column rows carry no extra detail; otherwise columns 4 and
          # 5 are appended to the class description.
          if len(row) == 5:
            val = "Epigenetic %s - %s" % (group, dom)
          else:
            val = "Epigenetic %s - %s %s: %s" % (group, dom, row[4], row[5])
          rv = dba.ins_tdl_info({'protein_id': p['id'], 'itype': 'Drugable Epigenome Class', 'string_value': val})
          if not rv:
            dba_err_ct += 1
            continue
          ti_ct += 1
        if not args['--quiet']:
          print("  {} lines processed. Found {}, skipped {}".format(ct, tct, not_fnd_ct))
          print("  Inserted {} new tdl_info rows".format(ti_ct))
        if dba_err_ct > 0:
          print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
        total_ti_ct += ti_ct
  if not args['--quiet']:
    print("\nInserted a total of {} new Drugable Epigenome Class tdl_infos".format(total_ti_ct))
    if len(notfnd) > 0:
      print("  No target found for {} sym/geneid/uniprots. See logfile {} for details.".format(len(notfnd), logfile))