Exemplo n.º 1
0
def load(args):
    """Load HGNC xrefs, symbols and NCBI Gene IDs into TCRD from HGNC_TSV_FILE.

    Registers an 'HGNC' dataset plus its provenance rows, then walks the
    tab-separated file. Each row is matched to targets by symbol, then by
    UniProt accession, then by NCBI Gene ID (first lookup that returns
    anything wins). For every matched target the function inserts an HGNC
    xref, inserts an MGI xref when the row has one, and fills in the
    protein's ``sym`` / ``geneid`` when they are unset. Values that disagree
    with what is already stored are only logged and counted.

    Args:
        args: option mapping; the keys read are '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Side effects:
        Attaches a file handler to the module logger, writes to the database
        through DBAdaptor, prints a summary to stdout, and calls
        ``sys.exit(1)`` if the dataset or a provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'HGNC',
        'source':
        'Custom download file from https://www.genenames.org/download/custom/',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.genenames.org/',
        'comments':
        'File downloaded with the following column data: HGNC ID Approved symbol Approved name   Status  UniProt ID NCBI Gene ID    Mouse genome database ID'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile))
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'sym',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'geneid',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile))
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}  # protein ids that received at least one annotation
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        next(tsvreader)  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: NCBI Gene ID
            # 5: UniProt ID
            # 6: Mouse genome database ID
            # NOTE(review): the dataset 'comments' string above lists
            # UniProt ID before NCBI Gene ID, while these indices read the
            # Gene ID from column 4 and UniProt from column 5 - confirm
            # against the actual download file.
            ct += 1
            pbar.update(ct)
            sym = row[1]
            geneid = int(row[4]) if row[4] != '' else None
            up = row[5] if row[5] != '' else None
            # Fall back from symbol to UniProt accession to Gene ID.
            targets = dba.find_targets({'sym': sym})
            if not targets and up:
                targets = dba.find_targets({'uniprot': up})
            if not targets and geneid:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                nf_ct += 1
                logger.warning("No target found for {}|{}|{}".format(
                    sym, up, geneid))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                tmark[pid] = True
                # HGNC xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'HGNC',
                    'dataset_id': dataset_id,
                    'value': row[0]
                })
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                if row[6] != '':
                    rv = dba.ins_xref({
                        'protein_id': pid,
                        'xtype': 'MGI ID',
                        'dataset_id': dataset_id,
                        'value': row[6]
                    })
                    if rv:
                        mgi_ct += 1
                    else:
                        db_err_ct += 1
                # Add missing syms
                if p['sym'] is None:
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info(
                            "Inserted new sym {} for protein {}, {}".format(
                                sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                elif p['sym'] != sym:
                    # Existing symbol wins; the mismatch is only reported.
                    logger.warning("Symbol discrepancy: UniProt=%s, HGNC=%s" %
                                   (p['sym'], sym))
                    symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] is None:
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info(
                                "Inserted new geneid {} for protein {}, {}".
                                format(geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    elif p['geneid'] != geneid:
                        # Existing Gene ID wins; the mismatch is only reported.
                        logger.warning(
                            "GeneID discrepancy: UniProt={}, HGNC={}".
                            format(p['geneid'], geneid))
                        geneiddiscr_ct += 1
    pbar.finish()
    print("Processed {} lines - {} targets annotated.".format(ct, len(tmark)))
    print("No target found for {} lines.".format(nf_ct))
    print("  Inserted {} HGNC ID xrefs".format(hgnc_ct))
    print("  Inserted {} MGI ID xrefs".format(mgi_ct))
    if sym_ct > 0:
        print("  Added {} new HGNC symbols".format(sym_ct))
    if symdiscr_ct > 0:
        print("WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile))
    if geneid_ct > 0:
        print("  Added {} new NCBI Gene IDs".format(geneid_ct))
    if geneiddiscr_ct > 0:
        print("WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile))
    if db_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile))
Exemplo n.º 2
0
def load(args):
    """Set IDG flags and families on TCRD targets listed in IDG_LIST_FILE.

    Registers the 'IDG Eligible Targets List' dataset and its provenance
    rows, then reads the CSV list. Column 0 is used as the gene symbol,
    column 2 as the family and column 3 (when non-empty) as the extended
    family. Every target found for a symbol gets ``idg`` set to 1 and its
    ``fam`` / ``famext`` updated. Symbols with no match, and symbols that
    match more than one target, are collected and reported at the end.

    Args:
        args: option mapping; the keys read are '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: if the dataset or a provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IDG Eligible Targets List',
        'source':
        'IDG generated data in file %s.' % IDG_LIST_FILE,
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'IDG Flags and Families set from list of targets on GitHub.',
        'url':
        'https://github.com/druggablegenome/IDGTargets'
    })
    # Explicit raise instead of `assert`: asserts are removed under
    # `python -O`, which would let the load continue without a dataset id.
    if not dataset_id:
        raise AssertionError(
            "Error inserting dataset See logfile {} for details.".format(
                logfile))
    # Provenance
    # Each dict carries a single 'where_clause'. The fam/famext entries used
    # to repeat the key; a dict literal keeps only the last value, so
    # 'idg == 1' is what was always stored for them.
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'idg',
        'where_clause': 'column_name == "idg"'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'fam',
        'where_clause': 'idg == 1'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'famext',
        'where_clause': 'idg == 1'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            raise AssertionError(
                "Error inserting provenance. See logfile {} for details.".format(
                    logfile))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print('\nProcessing {} lines in list file {}'.format(
        line_ct, IDG_LIST_FILE))
    logger.info("Processing {} lines in list file {}".format(
        line_ct, IDG_LIST_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    notfnd = []   # symbols with no matching target
    multfnd = []  # symbols matching more than one target
    ct = 0
    idg_ct = 0
    fam_ct = 0
    famext_ct = 0
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        # No header line is skipped: the first row is processed as data.
        for row in csvreader:
            ct += 1
            # Update before any `continue` so unmatched rows still advance
            # the progress bar.
            pbar.update(ct)
            sym = row[0]
            fam = row[2]
            targets = dba.find_targets({'sym': sym},
                                       idg=False,
                                       include_annotations=False)
            if not targets:
                notfnd.append(sym)
                continue
            if len(targets) > 1:
                # Recorded for the summary, but all matches are still updated.
                multfnd.append(sym)
            for t in targets:
                rv = dba.upd_target(t['id'], 'idg', 1)
                if rv:
                    idg_ct += 1
                else:
                    dba_err_ct += 1
                rv = dba.upd_target(t['id'], 'fam', fam)
                if rv:
                    fam_ct += 1
                else:
                    dba_err_ct += 1
                if row[3]:
                    rv = dba.upd_target(t['id'], 'famext', row[3])
                    if rv:
                        famext_ct += 1
                    else:
                        dba_err_ct += 1
    pbar.finish()
    print("{} lines processed".format(ct))
    print("{} targets updated with IDG flags".format(idg_ct))
    print("{} targets updated with fams".format(fam_ct))
    print("  {} targets updated with famexts".format(famext_ct))
    if notfnd:
        print("No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 3
0
def load(args):
    """Load NCBI GI xrefs into TCRD from the UniProt ID mapping file.

    Registers the 'NCBI GI Numbers' dataset and its provenance row, then
    reads the tab-separated mapping file (DOWNLOAD_DIR + FILENAME with the
    '.gz' suffix removed). For each line with a GI column (field 5,
    index 4) whose UniProt accession (field 1) matches a target, one
    'NCBI GI' xref is inserted per '; '-separated GI value on the first
    matching target's first protein.

    Args:
        args: option mapping; the keys read are '--loglevel', '--logfile',
            '--debug', '--dbhost', '--dbname' and '--quiet'.

    Side effects:
        Attaches a file handler to the module logger, writes to the database
        through DBAdaptor, prints a summary to stdout, and calls
        ``sys.exit(1)`` if the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'NCBI GI Numbers',
        'source':
        'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.uniprot.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print("\nProcessing {} rows in file {}".format(line_ct, infile))
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        tmark = {}  # target ids that received at least one GI xref attempt
        xref_ct = 0
        nogi_ct = 0    # rows without a GI value
        notfnd_ct = 0  # rows whose UniProt accession matched no target
        dba_err_ct = 0
        for line in tsv:
            data = line.split('\t')
            ct += 1
            # Update before any `continue` so skipped rows still advance
            # the progress bar.
            pbar.update(ct)
            # A blank or truncated line has no GI column at all; treat it
            # like an empty GI field instead of raising IndexError.
            if len(data) < 5 or not data[4]:  # no gi
                nogi_ct += 1
                continue
            up = data[0]
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd_ct += 1
                continue
            target = targets[0]
            tmark[target['id']] = True
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("\n{} rows processed".format(ct))
    print("  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(tmark)))
    print("  Skipped {} rows with no GI".format(nogi_ct))
    # Reported separately: these rows had a GI but no matching target, so
    # counting them as "no GI" would misstate why they were skipped.
    print("  Skipped {} rows with no target found".format(notfnd_ct))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile))
Exemplo n.º 4
0
def load(args):
    """Load IDG family designations into TCRD from the CSV file INFILE.

    Registers the 'IDG Families' dataset and a provenance row, then reads
    the CSV after skipping its header line. Column 2 is used as the UniProt
    accession, column 3 as the family and column 4 as the extended family
    (all stripped). Rows without a family, with no matching target, or with
    more than one matching target are skipped and tallied. Targets that
    already have a ``fam`` value are left untouched; the rest get ``fam``
    and, when present, ``famext`` updated.

    Args:
        args: option mapping; the keys read are '--dbhost', '--dbname',
            '--quiet' and, as a fallback for the log file path only,
            '--logfile'.

    Side effects:
        Writes to the database through DBAdaptor, prints a summary to
        stdout, and calls ``sys.exit(1)`` if the dataset or provenance row
        cannot be inserted.
    """
    # The messages below name a log file, but no `logfile` is bound inside
    # this function. Prefer a module-level `logfile` if the script defines
    # one (the name the original code referenced), then the '--logfile'
    # option, then a module-level LOGFILE, so an error path can never die
    # with a NameError.
    # NOTE(review): confirm which of these the surrounding script provides.
    module_names = globals()
    log_path = (module_names.get('logfile') or args.get('--logfile')
                or module_names.get('LOGFILE'))

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IDG Families',
        'source':
        'IDG-KMC generated data from file %s' % os.path.basename(INFILE),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        "Target family designations generated by IDG-KMC groups at UNM and UMiami."
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % log_path)
        sys.exit(1)
    # Provenance
    # NOTE(review): provenance is recorded for column 'tiofam', while the
    # loop below updates 'fam' and 'famext' - confirm this is intended.
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'tiofam'
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % log_path)
        sys.exit(1)

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, INFILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    idg_ct = 0    # targets skipped because they already have a fam
    upd_ct1 = 0   # successful fam updates
    upd_ct2 = 0   # successful famext updates
    null_ct = 0   # rows with an empty family column
    notfnd = []
    mulfnd = []
    dba_err_ct = 0
    with open(INFILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header line
        ct += 1
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            up = row[2].strip()
            fam = row[3].strip()
            famext = row[4].strip()
            if not fam:
                null_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd.append(up)
                continue
            if len(targets) > 1:
                mulfnd.append(up)
                continue
            t = targets[0]
            # only update fam for non-IDG targets
            # IDG target fams are set by load-IDGList.py
            if t['fam']:
                idg_ct += 1
                continue
            rv = dba.upd_target(t['id'], 'fam', fam)
            if not rv:
                print("ERROR updating target.fam: %d to %s" % (t['id'], fam))
                # Count the failure so the end-of-run warning can fire.
                dba_err_ct += 1
            else:
                upd_ct1 += 1
            if famext:
                rv = dba.upd_target(t['id'], 'famext', famext)
                if not rv:
                    print("ERROR updating target.famext: %d to %s" % (t['id'],
                                                                      famext))
                    dba_err_ct += 1
                else:
                    upd_ct2 += 1
    pbar.finish()
    print("{} rows processed.".format(ct))
    print("{} IDG family designations loaded into TCRD.".format(upd_ct1))
    print("{} IDG extended family designations loaded into TCRD.".format(
        upd_ct2))
    print("Skipped {} IDG2 targets.".format(idg_ct))
    if notfnd:
        print("[WARNING] No target found for {} UniProt accessions: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if mulfnd:
        print("[WARNING] Multiple targets found for {} UniProt accessions: {}".format(
            len(mulfnd), ", ".join(mulfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, log_path))