def _parse_p_value(raw, row_num, logger):
    """Convert a raw CSV p-value field to a float.

    Args:
        raw: The field text as read from the CSV row.
        row_num: Data-row number, used only in the log message.
        logger: Logger that receives a warning when conversion fails.

    Returns:
        The parsed float, or None when the field is empty or is not a
        valid number.
    """
    if not raw:
        return None
    try:
        return float(raw)
    except ValueError:
        logger.warning(
            "Problem converting p_value {} for row {}".format(raw, row_num))
        return None


def load(args):
    """Load IMPC phenotypes into TCRD from the two IMPC CSV files.

    The function sets up file logging, connects to the database, registers
    the dataset and its provenance, then makes two passes:

    1. GENO_PHENO_FILE: every row that has an MP term id or name is
       inserted with gp_assoc = 1.
    2. STAT_RES_FILE: rows that have an MP term id or name and a parseable
       p-value <= 0.05 are inserted with gp_assoc = 0.

    In both passes the marker symbol is resolved with
    dba.find_nhproteins(..., species='Mus musculus') and one phenotype row
    is inserted per nhprotein returned.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or a provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    connected_msg = (
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IMPC Phenotypes',
        'source': "Files %s and %s from ftp://ftp.ebi.ac.uk/pub/databases/impc/release-9.2/csv/" % (GENO_PHENO_FILE, STAT_RES_FILE),
        'app': PROGRAM,
        'app_version': __version__
    })
    # Raised explicitly (instead of a bare assert) so the check is still
    # performed when Python runs with -O; the exception type is unchanged.
    if not dataset_id:
        raise AssertionError(
            "Error inserting dataset See logfile {} for details.".format(
                logfile))
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'IMPC'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            raise AssertionError(
                "Error inserting provenance. See logfile {} for details.".format(
                    logfile))

    # Symbol -> nhprotein rows already fetched. Shared by both passes so a
    # resolved symbol is only looked up in the database once.
    nhp_cache = {}

    #
    # Pass 1: genotype-phenotype associations
    #
    line_ct = slmf.wcl(GENO_PHENO_FILE)
    pbar_widgets = [
        'Progress: ', Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ', ETA()
    ]
    if not args['--quiet']:
        print("\nProcessing {} lines from input file {}".format(
            line_ct, GENO_PHENO_FILE))
    # 'rU' (universal newlines) is kept as in the original Python 2 code.
    with open(GENO_PHENO_FILE, 'rU') as csvfile:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header line
        ct = 0
        pt_ct = 0
        loaded_ids = set()  # nhprotein ids with at least one phenotype
        notfnd = set()
        skip_ct = 0
        dba_err_ct = 0
        # 0: marker_accession_id
        # 1: marker_symbol
        # 2: phenotyping_center
        # 3: colony_id
        # 4: sex
        # 5: zygosity
        # 6: allele_accession_id
        # 7: allele_symbol
        # 8: allele_name
        # 9: strain_accession_id
        # 10: strain_name
        # 11: project_name
        # 12: project_fullname
        # 13: pipeline_name
        # 14: pipeline_stable_id
        # 15: procedure_stable_id
        # 16: procedure_name
        # 17: parameter_stable_id
        # 18: parameter_name
        # 19: top_level_mp_term_id
        # 20: top_level_mp_term_name
        # 21: mp_term_id
        # 22: mp_term_name
        # 23: p_value
        # 24: percentage_change
        # 25: effect_size
        # 26: statistical_method
        # 27: resource_name
        for row in csvreader:
            ct += 1
            # Updated first so the bar also advances for skipped rows.
            pbar.update(ct)
            sym = row[1]
            if sym in notfnd:
                continue
            if not row[21] and not row[22]:
                # skip data with neither a term_id or term_name (IMPC has
                # some of these)
                skip_ct += 1
                continue
            nhps = nhp_cache.get(sym)
            if nhps is None:
                nhps = dba.find_nhproteins({'sym': sym},
                                           species='Mus musculus')
                if not nhps:
                    notfnd.add(sym)
                    logger.warning(
                        "No nhprotein found for symbol {}".format(sym))
                    continue
                nhp_cache[sym] = nhps
            # A missing or unparseable p-value is stored as None here.
            pval = _parse_p_value(row[23], ct, logger)
            for nhp in nhps:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhp['id'],
                    'ptype': 'IMPC',
                    'top_level_term_id': row[19],
                    'top_level_term_name': row[20],
                    'term_id': row[21],
                    'term_name': row[22],
                    'p_value': pval,
                    'percentage_change': row[24],
                    'effect_size': row[25],
                    'procedure_name': row[16],
                    'parameter_name': row[18],
                    'statistical_method': row[26],
                    'sex': row[4],
                    'gp_assoc': 1
                })
                if rv:
                    loaded_ids.add(nhp['id'])
                    pt_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(loaded_ids)))
    if notfnd:
        print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if skip_ct > 0:
        print("Skipped {} lines with no term_id or term_name.".format(skip_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    #
    # Pass 2: statistical results
    #
    line_ct = slmf.wcl(STAT_RES_FILE)
    pbar_widgets = [
        'Progress: ', Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ', ETA()
    ]
    if not args['--quiet']:
        print("\nProcessing {} lines from input file {}".format(
            line_ct, STAT_RES_FILE))
    with open(STAT_RES_FILE, 'rU') as csvfile:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header line
        ct = 0
        pt_ct = 0
        loaded_ids = set()
        notfnd = set()
        skip_ct = 0
        pv_ct = 0
        dba_err_ct = 0
        # 0: phenotyping_center
        # 1: intercept_estimate
        # 2: procedure_id
        # 3: mutant_biological_model_id
        # 4: rotated_residuals_test
        # 5: weight_effect_p_value
        # 6: male_mutant_count
        # 7: pipeline_stable_key
        # 8: female_ko_effect_p_value
        # 9: pipeline_stable_id
        # 10: parameter_stable_key
        # 11: data_type
        # 12: parameter_stable_id
        # 13: interaction_significant
        # 14: strain_accession_id
        # 15: control_selection_method
        # 16: parameter_name
        # 17: allele_name
        # 18: phenotyping_center_id
        # 19: weight_effect_stderr_estimate
        # 20: weight_effect_parameter_estimate
        # 21: procedure_stable_id
        # 22: status
        # 23: sex_effect_parameter_estimate
        # 24: female_ko_effect_stderr_estimate
        # 25: female_percentage_change
        # 26: group_2_residuals_normality_test
        # 27: marker_accession_id
        # 28: mp_term_name
        # 29: group_1_residuals_normality_test
        # 30: genotype_effect_p_value
        # 31: dependent_variable
        # 32: resource_name
        # 33: project_id
        # 34: procedure_name
        # 35: doc_id
        # 36: top_level_mp_term_id
        # 37: allele_accession_id
        # 38: blups_test
        # 39: null_test_p_value
        # 40: p_value
        # 41: marker_symbol
        # 42: control_biological_model_id
        # 43: pipeline_name
        # 44: sex
        # 45: interaction_effect_p_value
        # 46: colony_id
        # 47: project_name
        # 48: female_ko_parameter_estimate
        # 49: female_mutant_count
        # 50: organisation_id
        # 51: external_db_id
        # 52: female_control_count
        # 53: intermediate_mp_term_id
        # 54: db_id
        # 55: male_ko_effect_p_value
        # 56: top_level_mp_term_name
        # 57: metadata_group
        # 58: sex_effect_stderr_estimate
        # 59: zygosity
        # 60: male_percentage_change
        # 61: sex_effect_p_value
        # 62: mp_term_id
        # 63: male_ko_effect_stderr_estimate
        # 64: additional_information
        # 65: statistical_method
        # 66: _version_
        # 67: intercept_estimate_stderr_estimate
        # 68: male_control_count
        # 69: intermediate_mp_term_name
        # 70: strain_name
        # 71: classification_tag
        # 72: effect_size
        # 73: procedure_stable_key
        # 74: allele_symbol
        # 75: resource_id
        # 76: group_2_genotype
        # 77: variance_significant
        # 78: pipeline_id
        # 79: group_1_genotype
        # 80: male_ko_parameter_estimate
        # 81: genotype_effect_parameter_estimate
        # 82: categories
        # 83: parameter_id
        # 84: batch_significant
        # 85: genotype_effect_stderr_estimate
        # 86: resource_fullname
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            sym = row[41]
            if sym in notfnd:
                continue
            if not row[62] and not row[28]:
                # skip lines with neither a term_id or term_name
                skip_ct += 1
                continue
            pval = _parse_p_value(row[40], ct, logger)
            # Compare against None rather than using truthiness: a p-value
            # of exactly 0.0 is a real value and must not be treated as
            # missing.
            if pval is None:
                # skip lines with no (or an unparseable) p-value
                skip_ct += 1
                continue
            if pval > 0.05:
                pv_ct += 1
                continue
            nhps = nhp_cache.get(sym)
            if nhps is None:
                nhps = dba.find_nhproteins({'sym': sym},
                                           species='Mus musculus')
                if not nhps:
                    notfnd.add(sym)
                    logger.warning(
                        "No nhprotein found for symbol {}".format(sym))
                    continue
                nhp_cache[sym] = nhps
            for nhp in nhps:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhp['id'],
                    'ptype': 'IMPC',
                    'top_level_term_id': row[36],
                    'top_level_term_name': row[56],
                    'term_id': row[62],
                    'term_name': row[28],
                    'p_value': pval,
                    'effect_size': row[72],
                    'procedure_name': row[34],
                    'parameter_name': row[16],
                    'statistical_method': row[65],
                    'sex': row[44],
                    'gp_assoc': 0
                })
                if rv:
                    loaded_ids.add(nhp['id'])
                    pt_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(loaded_ids)))
    if notfnd:
        print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if skip_ct > 0:
        print("Skipped {} lines with no term_id/term_name or no p-value.".format(
            skip_ct))
    if pv_ct > 0:
        print("Skipped {} lines with p-value > 0.05.".format(pv_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
def load(args):
    """Load HomoloGene group memberships into TCRD.

    The function sets up file logging, connects to the database, registers
    the dataset and its provenance, then reads the tab-separated file
    DOWNLOAD_DIR + FILENAME. Rows whose tax_id is not in TAXIDS are
    skipped. For tax_id 9606 the gene id is resolved with
    dba.find_targets and a homologene row is inserted per target, keyed by
    'protein_id'. For any other accepted tax_id the gene id is resolved
    with dba.find_nhproteins and rows are keyed by 'nhprotein_id'.

    Args:
        args: Mapping of command-line options. Keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or a provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    connected_msg = (
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'HomoloGene',
        # Parenthesized so the intent does not depend on % binding tighter
        # than +; the resulting string is the same as before.
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/homologene',
        'comments': 'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.'
    })
    # Raised explicitly (instead of a bare assert) so the check is still
    # performed when Python runs with -O; the exception type is unchanged.
    if not dataset_id:
        raise AssertionError(
            "Error inserting dataset See logfile {} for details.".format(
                logfile))
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'homology'}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            raise AssertionError(
                "Error inserting provenance. See logfile {} for details.".format(
                    logfile))

    pbar_widgets = [
        'Progress: ', Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ', ETA()
    ]
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} input lines in file {}".format(line_ct, infile))
    # 'rU' (universal newlines) is kept as in the original Python 2 code.
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        hom_ct = 0
        nf_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            pbar.update(ct)
            # homologene_group_id tax_id ncbi_gene_id symbol protein_gi ref_seq
            taxid = int(row[1])
            if taxid not in TAXIDS:
                skip_ct += 1
                continue
            if taxid == 9606:
                # 9606 is the NCBI taxonomy id for human: link to proteins
                # of TCRD targets.
                targets = dba.find_targets({'geneid': row[2]})
                if not targets:
                    nf_ct += 1
                    logger.warning("No target found for {}".format(row))
                    continue
                id_key = 'protein_id'
                member_ids = [
                    t['components']['protein'][0]['id'] for t in targets
                ]
            else:
                nhproteins = dba.find_nhproteins({'geneid': row[2]})
                if not nhproteins:
                    nf_ct += 1
                    logger.warning("No nhprotein found for {}".format(row))
                    continue
                id_key = 'nhprotein_id'
                member_ids = [nhp['id'] for nhp in nhproteins]
            # One homologene row per resolved protein / nhprotein.
            for member_id in member_ids:
                rv = dba.ins_homologene({
                    id_key: member_id,
                    'groupid': row[0],
                    'taxid': taxid
                })
                if rv:
                    hom_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("Processed {} lines.".format(ct))
    print("Loaded {} new homologene rows".format(hom_ct))
    print("  Skipped {} non-Human/Mouse/Rat lines".format(skip_ct))
    if nf_ct > 0:
        print("WARNING: No target/nhprotein found for {} lines. See logfile {} for details.".format(
            nf_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))