Example #1
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IMPC Phenotypes',
        'source':
        "Files %s and %s from ftp://ftp.ebi.ac.uk/pub/databases/impc/release-9.2/csv/"
        % (GENO_PHENO_FILE, STAT_RES_FILE),
        'app':
        PROGRAM,
        'app_version':
        __version__
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'IMPC'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    line_ct = slmf.wcl(GENO_PHENO_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines from input file {}".format(
            line_ct, GENO_PHENO_FILE)
    with open(GENO_PHENO_FILE, 'rU') as csvfile:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct = 0
        pt_ct = 0
        pmark = {}
        notfnd = set()
        skip_ct = 0
        dba_err_ct = 0
        # 0: marker_accession_id
        # 1: marker_symbol
        # 2: phenotyping_center
        # 3: colony_id
        # 4: sex
        # 5: zygosity
        # 6: allele_accession_id
        # 7: allele_symbol
        # 8: allele_name
        # 9: strain_accession_id
        # 10: strain_name
        # 11: project_name
        # 12: project_fullname
        # 13: pipeline_name
        # 14: pipeline_stable_id
        # 15: procedure_stable_id
        # 16: procedure_name
        # 17: parameter_stable_id
        # 18: parameter_name
        # 19: top_level_mp_term_id
        # 20: top_level_mp_term_name
        # 21: mp_term_id
        # 22: mp_term_name
        # 23: p_value
        # 24: percentage_change
        # 25: effect_size
        # 26: statistical_method
        # 27: resource_name
        for row in csvreader:
            ct += 1
            sym = row[1]
            if sym in notfnd:
                continue
            if not row[21] and not row[22]:
                # skip data with neither a term_id or term_name (IMPC has some of these)
                skip_ct += 1
                continue
            nhps = dba.find_nhproteins({'sym': sym}, species='Mus musculus')
            if not nhps:
                notfnd.add(sym)
                logger.warn("No nhprotein found for symbol {}".format(sym))
                continue
            pval = None
            if row[23] and row[23] != '':
                try:
                    pval = float(row[23])
                except:
                    logger.warn(
                        "Problem converting p_value {} for row {}".format(
                            row[23], ct))
            for nhp in nhps:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhp['id'],
                    'ptype': 'IMPC',
                    'top_level_term_id': row[19],
                    'top_level_term_name': row[20],
                    'term_id': row[21],
                    'term_name': row[22],
                    'p_value': pval,
                    'percentage_change': row[24],
                    'effect_size': row[25],
                    'procedure_name': row[16],
                    'parameter_name': row[18],
                    'statistical_method': row[26],
                    'sex': row[4],
                    'gp_assoc': 1
                })
                if rv:
                    pmark[nhp['id']] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(pmark.keys()))
    if notfnd:
        print "No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if skip_ct > 0:
        print "Skipped {} lines with no term_id or term_name.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    line_ct = slmf.wcl(STAT_RES_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines from input file {}".format(
            line_ct, STAT_RES_FILE)
    with open(STAT_RES_FILE, 'rU') as csvfile:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct = 0
        pt_ct = 0
        pmark = {}
        notfnd = set()
        skip_ct = 0
        pv_ct = 0
        dba_err_ct = 0
        # 0: phenotyping_center
        # 1: intercept_estimate
        # 2: procedure_id
        # 3: mutant_biological_model_id
        # 4: rotated_residuals_test
        # 5: weight_effect_p_value
        # 6: male_mutant_count
        # 7: pipeline_stable_key
        # 8: female_ko_effect_p_value
        # 9: pipeline_stable_id
        # 10: parameter_stable_key
        # 11: data_type
        # 12: parameter_stable_id
        # 13: interaction_significant
        # 14: strain_accession_id
        # 15: control_selection_method
        # 16: parameter_name
        # 17: allele_name
        # 18: phenotyping_center_id
        # 19: weight_effect_stderr_estimate
        # 20: weight_effect_parameter_estimate
        # 21: procedure_stable_id
        # 22: status
        # 23: sex_effect_parameter_estimate
        # 24: female_ko_effect_stderr_estimate
        # 25: female_percentage_change
        # 26: group_2_residuals_normality_test
        # 27: marker_accession_id
        # 28: mp_term_name
        # 29: group_1_residuals_normality_test
        # 30: genotype_effect_p_value
        # 31: dependent_variable
        # 32: resource_name
        # 33: project_id
        # 34: procedure_name
        # 35: doc_id
        # 36: top_level_mp_term_id
        # 37: allele_accession_id
        # 38: blups_test
        # 39: null_test_p_value
        # 40: p_value
        # 41: marker_symbol
        # 42: control_biological_model_id
        # 43: pipeline_name
        # 44: sex
        # 45: interaction_effect_p_value
        # 46: colony_id
        # 47: project_name
        # 48: female_ko_parameter_estimate
        # 49: female_mutant_count
        # 50: organisation_id
        # 51: external_db_id
        # 52: female_control_count
        # 53: intermediate_mp_term_id
        # 54: db_id
        # 55: male_ko_effect_p_value
        # 56: top_level_mp_term_name
        # 57: metadata_group
        # 58: sex_effect_stderr_estimate
        # 59: zygosity
        # 60: male_percentage_change
        # 61: sex_effect_p_value
        # 62: mp_term_id
        # 63: male_ko_effect_stderr_estimate
        # 64: additional_information
        # 65: statistical_method
        # 66: _version_
        # 67: intercept_estimate_stderr_estimate
        # 68: male_control_count
        # 69: intermediate_mp_term_name
        # 70: strain_name
        # 71: classification_tag
        # 72: effect_size
        # 73: procedure_stable_key
        # 74: allele_symbol
        # 75: resource_id
        # 76: group_2_genotype
        # 77: variance_significant
        # 78: pipeline_id
        # 79: group_1_genotype
        # 80: male_ko_parameter_estimate
        # 81: genotype_effect_parameter_estimate
        # 82: categories
        # 83: parameter_id
        # 84: batch_significant
        # 85: genotype_effect_stderr_estimate
        # 86: resource_fullname
        for row in csvreader:
            ct += 1
            sym = row[41]
            if sym in notfnd:
                continue
            if not row[62] and not row[28]:
                # skip lines with neither a term_id or term_name
                skip_ct += 1
                continue
            if not row[40]:
                # skip lines with no p-value
                skip_ct += 1
                continue
            pval = None
            if row[40] and row[40] != '':
                try:
                    pval = float(row[40])
                except:
                    logger.warn(
                        "Problem converting p_value {} for row {}".format(
                            row[40], ct))
            if not pval:
                skip_ct += 1
                continue
            if pval > 0.05:
                pv_ct += 1
                continue
            nhps = dba.find_nhproteins({'sym': sym}, species='Mus musculus')
            if not nhps:
                notfnd.add(sym)
                logger.warn("No nhprotein found for symbol {}".format(sym))
                continue
            for nhp in nhps:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhp['id'],
                    'ptype': 'IMPC',
                    'top_level_term_id': row[36],
                    'top_level_term_name': row[56],
                    'term_id': row[62],
                    'term_name': row[28],
                    'p_value': pval,
                    'effect_size': row[72],
                    'procedure_name': row[34],
                    'parameter_name': row[16],
                    'statistical_method': row[65],
                    'sex': row[44],
                    'gp_assoc': 0
                })
                if rv:
                    pmark[nhp['id']] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(pmark))
    if notfnd:
        print "No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if skip_ct > 0:
        print "Skipped {} lines with no term_id/term_name or no p-value.".format(
            skip_ct)
    if pv_ct > 0:
        print "Skipped {} lines with p-value > 0.05.".format(pv_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #2
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'HomoloGene',
        'source':
        'File %s' % BASE_URL + FILENAME,
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.ncbi.nlm.nih.gov/homologene',
        'comments':
        'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'homology'}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        hom_ct = 0
        nf_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            pbar.update(ct)
            # homologene_group_id    tax_id    ncbi_gene_id    symbol    protein_gi    ref_seq
            taxid = int(row[1])
            if taxid not in TAXIDS:
                skip_ct += 1
                continue
            if taxid == 9606:
                targets = dba.find_targets({'geneid': row[2]})
                if not targets:
                    nf_ct += 1
                    logger.warn("No target found for {}".format(row))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    rv = dba.ins_homologene({
                        'protein_id': p['id'],
                        'groupid': row[0],
                        'taxid': taxid
                    })
                    if rv:
                        hom_ct += 1
                    else:
                        dba_err_ct += 1
            else:
                nhproteins = dba.find_nhproteins({'geneid': row[2]})
                if not nhproteins:
                    nf_ct += 1
                    logger.warn("No nhprotein found for {}".format(row))
                    continue
                for nhp in nhproteins:
                    rv = dba.ins_homologene({
                        'nhprotein_id': nhp['id'],
                        'groupid': row[0],
                        'taxid': taxid
                    })
                    if rv:
                        hom_ct += 1
                    else:
                        dba_err_ct += 1
    pbar.finish()
    print "Processed {} lines.".format(ct)
    print "Loaded {} new homologene rows".format(hom_ct)
    print "  Skipped {} non-Human/Mouse/Rat lines".format(skip_ct)
    if nf_ct > 0:
        print "WARNNING: No target/nhprotein found for {} lines. See logfile {} for details.".format(
            nf_ct, logfile)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)