def pheno_general_stats(obj, model):
    """Build a plain-text summary of one phenotype's cross-validation object.

    The report always contains the phenotype name, the number of unique
    sample IDs and the class counts/percentages.  If the object passed the
    phenotype check (``Y_check``) it additionally lists, for the full data
    set and for every repetition/fold: the EIGENSTRAT ``k``/``lambda``
    values, the number of selected features, the model feature IDs and the
    folds without prediction results.  Otherwise ``Check not passed`` is
    reported instead.

    Relies on the module-level ``sep_str`` separator and on helpers
    (``load_cv_obj``, ``read_id_list``, ``read_k``, ``read_lambda``) defined
    outside this block.

    Args:
        obj: Handle passed to ``load_cv_obj`` (presumably the path of a
            pickled CV object -- confirm against the caller).
        model: Key selecting the model entry (``combi.txt``, ``pred.csv``)
            of the full data set and of every fold.

    Returns:
        str: The report, opened and closed by a ``sep_str`` line.
    """

    def unique_ids(id_file):
        # IDs are de-duplicated before counting/joining
        return set(read_id_list(id_file))

    cv = load_cv_obj(obj)
    parts = ["%s\nPehnotype: %s\n" % (sep_str, cv.Y_name)]

    # number of samples (after sample intersection)
    parts.append("\tNumber of samples without missing pheno: %d\n" %
                 len(unique_ids(cv.samples)))

    # pheno. stats (after sample intersection)
    class_num = cv.Y_stat['class_num']
    class_pct = cv.Y_stat['class_pct']
    parts.append("\t0:%d,1:%d | 0:%.2f,1:%.2f\n" %
                 (class_num['0'], class_num['1'], class_pct['0'],
                  class_pct['1']))

    # pheno check failed: nothing else to report
    if not cv.Y_check:
        parts.append('Check not passed\n')
        parts.append("%s\n" % sep_str)
        return "".join(parts)

    # k and lambda
    parts.append("EIGENSTRAT\n\tFull: k=%d, lambda=%.5f\n" %
                 (read_k(cv.full['eig']['eig.k']),
                  read_lambda(cv.full['eig']['eig.lambda'])))
    for rep, folds in cv.cv.items():
        parts.append("\tRep %d" % (rep))
        for fold, entry in folds.items():
            fold_k = read_k(entry['eig']['eig.k'])
            fold_lambda = read_lambda(entry['eig']['eig.lambda'])
            parts.append(" | fold %d: k=%d, lambda=%.5f" %
                         (fold, fold_k, fold_lambda))
        parts.append("\n")

    # CV selected features
    parts.append("Selected features:\n\tFull: %d\n" %
                 len(unique_ids(cv.full['features_sel'])))
    for rep, folds in cv.cv.items():
        parts.append("\tRep %d" % (rep))
        for fold, entry in folds.items():
            parts.append(" | fold %d: %d" %
                         (fold, len(unique_ids(entry['features_sel']))))
        parts.append("\n")

    # CV model
    parts.append("Models\n\tFull: %s\n" %
                 ';'.join(unique_ids(cv.full[model]["combi.txt"])))
    for rep, folds in cv.cv.items():
        for fold, entry in folds.items():
            parts.append("\tRep %d, fold %d: %s\n" %
                         (rep, fold,
                          ';'.join(unique_ids(entry[model]["combi.txt"]))))

    # CV eval. reps: folds whose prediction file is absent or empty
    parts.append('Mssing fold pred. results:')
    for rep, folds in cv.cv.items():
        for fold, entry in folds.items():
            pred_file = entry[model]['pred.csv']
            has_results = (os.path.isfile(pred_file)
                           and os.stat(pred_file).st_size != 0)
            if not has_results:
                parts.append(
                    "Rep %d, fold %d: no model prediction results\n" %
                    (rep, fold))
    parts.append('\n')

    parts.append("%s\n" % sep_str)
    return "".join(parts)
def bin_to_eig_pheno(pheno_file,
                     pheno_name,
                     pheno_ofile,
                     sample_file=None,
                     miss='NA'):
    """Write one phenotype column as ``sampleID<TAB>U<TAB>label`` lines.

    The input is a tab-separated table whose first line lists the phenotype
    names only; every following line starts with the sample ID, so the data
    column of a phenotype is its header position + 1.  Values are mapped
    ``'1'`` -> ``Case``, ``'0'`` -> ``Control`` and ``miss`` -> ``Ignore``.

    Args:
        pheno_file: Path of the phenotype table to read.
        pheno_name: Header name of the phenotype to extract.
        pheno_ofile: Path of the output file (overwritten).
        sample_file: Optional ID list (read with ``read_id_list``); when
            given, only samples listed there are written.
        miss: Token marking a missing phenotype value.

    Raises:
        AssertionError: On the first data row if ``pheno_name`` was not
            found in the header.
        SystemExit: If a value is neither ``'1'``, ``'0'`` nor ``miss``.
    """
    keep = None if sample_file is None else set(read_id_list(sample_file))

    # Later keys win, so '1' and '0' take precedence over ``miss``
    labels = {miss: 'Ignore', '0': 'Control', '1': 'Case'}

    column = 0
    with open(pheno_file, "r") as src, open(pheno_ofile, "w") as dst:
        rows = (raw.rstrip('\n').split('\t') for raw in src)

        # Header: locate the phenotype (first match)
        names = next(rows, None)
        if names is not None:
            for position, name in enumerate(names, 1):
                if name == pheno_name:
                    column = position  # shifted by 1: 1st field is sampleID
                    break

        for fields in rows:
            assert column > 0
            sample_id = fields[0]
            if keep is not None and sample_id not in keep:
                continue
            value = fields[column]
            label = labels.get(value)
            if label is None:
                sys.exit('Unknown phenotype: %s: %s: %s' %
                         (pheno_name, sample_id, value))
            dst.write("\t".join([sample_id, 'U', label]) + "\n")
def cv_pheno_stats(cv_obj):
    """Load the phenotype class statistics of a CV object from its stat file.

    Reads the tab-separated file ``cv_obj.Y_stat['file']`` (rows labelled
    ``miss``, ``0`` or ``1`` followed by a count) and fills
    ``cv_obj.Y_stat`` with ``miss``, ``class_num``, ``total`` and
    ``class_pct`` (percentages relative to the non-missing samples).
    Marks ``cv_obj.done['y_stats']`` when finished.

    Args:
        cv_obj: CV object providing ``done``, ``Y_stat``, ``samples`` and
            ``Y_name``.

    Returns:
        str: A status message (skipped or computed).

    Raises:
        SystemExit: If the stat file contains an unknown row label.
        AssertionError: If the total differs from the number of unique IDs
            in ``cv_obj.samples`` or if any phenotype value is missing.
    """
    # already done
    if cv_obj.done['y_stats']:
        return "SKIP: Pheno stat.s already computed"

    stat = cv_obj.Y_stat
    stat['class_num'] = {'0': 0, '1': 0}
    stat['class_pct'] = {'0': 0, '1': 0}

    with open(stat['file']) as infile:
        for raw in infile:
            fields = raw.rstrip("\n").split('\t')
            label = fields[0]
            if label == 'miss':
                stat['miss'] = int(fields[1])
            elif label in ('0', '1'):
                stat['class_num'][label] = int(fields[1])
            else:
                sys.exit("In Y stat.s file: unknown value %s" % label)

    stat['total'] = (stat['miss'] + stat['class_num']['0'] +
                     stat['class_num']['1'])

    # The stat file must account for every sample of the CV object
    samples = set(read_id_list(cv_obj.samples))
    assert len(samples) == stat['total'], \
        "Have %d samples in %s but only %d as total from Y stat.s file %s" % (
            len(samples), cv_obj.samples, stat['total'], stat['file'])
    assert stat['miss'] == 0, "Missing pheno for %s" % cv_obj.Y_name

    observed = stat['total'] - stat['miss']
    if observed > 0:
        stat['class_pct'] = {
            cls: 100.0 * float(stat['class_num'][cls]) / float(observed)
            for cls in ('0', '1')
        }

    cv_obj.done['y_stats'] = True
    return "Computed pheno stat.s"
def cv_check_samples(in_samples_f, ex_samples_f, cv_obj):
    """Verify the sample lists of a CV object and report their sizes.

    Checks that the full sample list shares no ID with the excluded list
    and is contained in the included list, and that in every
    repetition/fold the train and test lists are disjoint and together
    equal the full list.  Sets ``cv_obj.done['sample_check']`` on success.

    Args:
        in_samples_f: ID list file of the included samples.
        ex_samples_f: ID list file of the excluded samples.
        cv_obj: CV object providing ``done``, ``full``, ``cv`` and
            ``Y_name``.

    Returns:
        str: One line per checked sample list (or a skip message if the
        check was already done).

    Raises:
        AssertionError: If any of the checks above fails.
    """
    if cv_obj.done['sample_check']:
        return "SKIP: Samples already checked"

    included = set(read_id_list(in_samples_f))
    excluded = set(read_id_list(ex_samples_f))
    report = []

    # CV full: nothing excluded, everything included
    full_ids = set(read_id_list(cv_obj.full['samples']))
    assert full_ids.isdisjoint(excluded), \
        "Intersection with excluded: Assertion error in %s" % full_ids
    assert full_ids.issubset(included), \
        "Set diff. with included: Assertion error in %s" % full_ids
    report.append("CV %s (full): Sample list %s: %d entries\n" %
                  (cv_obj.Y_name, cv_obj.full['samples'], len(full_ids)))

    # CV folds: train/test must partition the full sample set
    for rep, folds in cv_obj.cv.items():
        for fold, entry in folds.items():
            train_ids = set(read_id_list(entry['samples_train']))
            test_ids = set(read_id_list(entry['samples_test']))
            assert train_ids.isdisjoint(test_ids), \
                "Assertion error in %s and %s" % (train_ids, test_ids)
            assert full_ids == (train_ids | test_ids), \
                "Assertion error in %s and %s" % (train_ids, test_ids)
            # number of samples
            for list_file, ids in ((entry['samples_train'], train_ids),
                                   (entry['samples_test'], test_ids)):
                report.append(
                    "CV %s (rep %d, fold %d): Sample list %s: %d entries\n" %
                    (cv_obj.Y_name, rep, fold, list_file, len(ids)))

    cv_obj.done['sample_check'] = True
    return "".join(report)
def cv_eig_convert(cv_obj, rep=None, fold=None, skip=False):
    """Create the EIGENSTRAT input files for the full set or one CV fold.

    Without ``rep``/``fold`` the entry ``cv_obj.full`` (all samples) is
    used, otherwise ``cv_obj.cv[rep][fold]`` with its training samples.
    Unless ``skip`` is set, the genotype/SNP and phenotype files are written
    through ``bin_to_eig_geno_snp`` and ``bin_to_eig_pheno``.  The IDs that
    are in ``features`` but not in ``features_pr`` are written to the
    ``eig.snps.rm`` file, one ``rs<ID>`` per line.

    Args:
        cv_obj: CV object providing ``full``, ``cv``, ``X_bin``, ``Y_file``
            and ``Y_name``.
        rep: Repetition key, or ``None`` for the full data set.
        fold: Fold key, or ``None`` for the full data set.
        skip: If true, do not (re)create the genotype/SNP/phenotype files.
    """
    whole_set = rep is None or fold is None
    entry = cv_obj.full if whole_set else cv_obj.cv[rep][fold]

    eig_files = entry['eig']
    geno_out = eig_files['eig.geno']
    snps_out = eig_files['eig.snps']
    removed_out = eig_files['eig.snps.rm']
    pheno_out = eig_files['eig.pheno']

    sample_list = entry['samples'] if whole_set else entry['samples_train']
    feature_list = entry['features']
    pruned_list = entry['features_pr']

    if not skip:
        bin_to_eig_geno_snp(geno_file=cv_obj.X_bin,
                            geno_ofile=geno_out,
                            snp_ofile=snps_out,
                            sample_file=sample_list,
                            feature_file=feature_list,
                            miss='NA')
        bin_to_eig_pheno(pheno_file=cv_obj.Y_file,
                         pheno_name=cv_obj.Y_name,
                         pheno_ofile=pheno_out,
                         sample_file=sample_list,
                         miss='NA')

    # NOTE(review): the removal list is assumed to be written regardless of
    # ``skip``; confirm against the intended nesting.
    dropped = set(read_id_list(feature_list)) - set(
        read_id_list(pruned_list))
    with open(removed_out, 'w') as ofile:
        for feature_id in dropped:
            ofile.write("rs%s\n" % feature_id)
def add_model_annot(obj, annot, odir, obname, model, f_type):
    """Write the annotated model features of one CV object to a table.

    For a CV object that passed the phenotype check, has a non-empty
    ``<model>_total_perf.csv`` in its output directory and an existing
    model file (``combi.txt``), a tab-separated file
    ``<obname>_<Y_name_str>_<model>.csv`` is written to ``odir``.  It holds
    one row per model feature with its annotation fields followed by the
    performance values taken from the ``mean`` row of the performance
    table.  If any of the prerequisites is missing nothing is written.

    Args:
        obj: Handle passed to ``load_cv_obj``.
        annot: Mapping ``feature ID -> {annotation field -> value}``.
        odir: Output directory.
        obname: Prefix of the output file name.
        model: Model key/name.
        f_type: Feature type, ``'gff'`` or ``'vcf'``; selects the annotation
            fields (``gene_annot_fields`` / ``vcf_annot_fields``).

    Returns:
        None.

    Raises:
        ValueError: If ``f_type`` is unsupported, a performance measure is
            missing from the table header, or the table has no ``mean`` row.
            These are reported before the output file is created.
    """
    fields = None
    if f_type == 'gff':
        fields = gene_annot_fields
    elif f_type == 'vcf':
        fields = vcf_annot_fields

    obj_ = load_cv_obj(obj)
    if not obj_.Y_check:
        return
    model_res = path.join(obj_.odir, "%s_total_perf.csv" % model)
    if not path.isfile(model_res) or os.stat(model_res).st_size == 0:
        return  # no CV perf.
    if not path.isfile(obj_.full[model]['combi.txt']):
        return  # no/empty model

    if fields is None:
        raise ValueError("Unsupported feature type: %r" % (f_type, ))

    ofile = path.join(odir,
                      "%s_%s_%s.csv" % (obname, obj_.Y_name_str, model))
    perf_list = [
        'ERR', 'ACC', 'B_ACC', 'SENS', 'SPEC', 'PREC', 'NPV', 'FPR', 'FNR',
        'Fmeasure', 'gm_RS', 'gm_RP', 'AUC_ROC', 'AUC_PR'
    ]

    # read perf: column index of every measure, then the values of 'mean'
    perf_cols = {}
    perf_values = None
    with open(model_res) as ifile:
        header = True
        for line in ifile:
            line = line.rstrip('\n')
            if not line:
                continue
            cells = line.split('\t')
            if header:
                header = False
                perf_cols = {
                    name: i
                    for i, name in enumerate(cells) if name in perf_list
                }
                missing = [m for m in perf_list if m not in perf_cols]
                if missing:
                    raise ValueError("%s: missing performance column(s): %s" %
                                     (model_res, ', '.join(missing)))
                continue
            if cells[0] == 'mean':
                # if there are several 'mean' rows the last one is used
                perf_values = [cells[perf_cols[m]] for m in perf_list]
    if perf_values is None:
        raise ValueError("%s: no 'mean' row found" % model_res)

    # read features
    features = set(read_id_list(obj_.full[model]['combi.txt']))

    # write file; the performance columns are identical for every feature
    perf_str = '\t'.join(perf_values)
    with open(ofile, 'w') as of:
        of.write("Feature\t%s\t%s\n" %
                 ('\t'.join(fields), '\t'.join(perf_list)))
        for feature in features:
            of.write("%s\t%s\t%s\n" %
                     (feature, '\t'.join([annot[feature][f]
                                          for f in fields]), perf_str))
def cv_create_folds(cv_obj):
    """Create the stratified train/test sample lists for every repetition.

    For each repetition in ``cv_obj.cv`` a shuffled ``StratifiedKFold``
    split of the non-missing phenotype values is computed and the sample
    IDs of every fold are written to the fold's ``samples_train`` /
    ``samples_test`` files (folds are numbered from 1).  Sets
    ``cv_obj.done['cv_folds']`` afterwards.

    Args:
        cv_obj: CV object providing ``done``, ``Y_check``, ``Y_name``,
            ``Y_file``, ``samples``, ``folds`` and ``cv``.

    Returns:
        str: Class statistics per repetition/fold and train/test part, or a
        message if the folds already exist or the phenotype check failed.
    """
    if cv_obj.done['cv_folds']:
        return "SKIP: Folds already created"
    # did not pass the phenotype check
    if not cv_obj.Y_check:
        return "CV folds: %s did not pass pheno. check" % cv_obj.Y_name

    def class_summary(sample_ids):
        # class label/count pairs of the given samples as one string
        stats = pheno_class_stats(pheno_file=cv_obj.Y_file,
                                  pheno_name=cv_obj.Y_name,
                                  samples=set(sample_ids))
        return ' ; '.join(
            [' - '.join([label, "%d" % num]) for label, num in stats.items()])

    # Samples to use
    wanted = set(read_id_list(cv_obj.samples))
    # Y as array and sample IDs
    y, y_samples = pheno_as_list(pheno_file=cv_obj.Y_file,
                                 pheno_name=cv_obj.Y_name,
                                 ignore_miss=True,
                                 samples=wanted)
    y = numpy.array(y)
    y_samples = numpy.array(y_samples)  # arrays allow index-list selection

    report = []
    for rep in cv_obj.cv.keys():
        # Create folds
        splits = cross_validation.StratifiedKFold(y=y,
                                                  n_folds=cv_obj.folds,
                                                  shuffle=True,
                                                  random_state=None)
        # Save folds as lists of samples in train/test
        for fold, (train_index, test_index) in enumerate(splits, 1):
            train_ids = y_samples[train_index].tolist()
            test_ids = y_samples[test_index].tolist()
            with open(cv_obj.cv[rep][fold]['samples_train'], 'w') as o_file:
                o_file.write("\n".join(train_ids) + "\n")
            with open(cv_obj.cv[rep][fold]['samples_test'], 'w') as o_file:
                o_file.write("\n".join(test_ids) + "\n")
            # Stats
            report.append("Rep %d, fold %d, train: %s\n" %
                          (rep, fold, class_summary(train_ids)))
            report.append("Rep %d, fold %d, test: %s\n" %
                          (rep, fold, class_summary(test_ids)))

    cv_obj.done['cv_folds'] = True
    return "".join(report)
def bin_to_plink_pheno(pheno_file,
                       pheno_name,
                       pheno_ofile,
                       family="Fam",
                       sample_file=None,
                       miss='NA',
                       verbose=True):
    """Write one phenotype column as ``family<TAB>sampleID<TAB>value`` lines.

    The input is a tab-separated table whose first line lists the phenotype
    names only; every following line starts with the sample ID, so the data
    column of a phenotype is its header position + 1.  Values equal to
    ``miss`` are written as ``-9``; all other values are copied unchanged.

    Args:
        pheno_file: Path of the phenotype table to read.
        pheno_name: Header name of the phenotype to extract.
        pheno_ofile: Path of the output file (overwritten).
        family: Value of the first output column.
        sample_file: Optional ID list (read with ``read_id_list``); when
            given, only samples listed there are written.
        miss: Token marking a missing phenotype value.
        verbose: If true, progress messages are written to stdout.

    Raises:
        SystemExit: If ``pheno_name`` is not found in the header.  Without
            this check the sample ID column would be written as phenotype.
    """
    if verbose:
        sys.stdout.write("Bin. pheno mat. to PLINK alt. pheno")
    samples = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
        if verbose:
            sys.stdout.write("Sample list contains %d unique IDs\n" %
                             len(samples))
    # Pheno
    header = True
    pheno_col = 0
    with open(pheno_file, "r") as pheno, open(pheno_ofile, "w") as pheno_o:
        if verbose:
            sys.stdout.write("\tOutput: %s in %s\n" %
                             (pheno_name, pheno_ofile))
        for line in pheno:
            fields = line.rstrip('\n').split('\t')
            if header:
                header = False
                for i, name in enumerate(fields):
                    if name == pheno_name:
                        pheno_col = i + 1  # +1 because 1st field is sampleID
                        break
                if pheno_col == 0:
                    # column 0 holds the sample IDs, never a phenotype
                    sys.exit('Phenotype %s not found in header of %s' %
                             (pheno_name, pheno_file))
                continue
            sampleID = fields[0]
            if samples is not None and sampleID not in samples:
                continue
            samplePh = fields[pheno_col]
            if samplePh == miss:
                samplePh = '-9'
            pheno_o.write("\t".join([family, sampleID, samplePh]) + "\n")
def bin_to_eig_geno_snp(geno_file,
                        geno_ofile,
                        snp_ofile,
                        sample_file=None,
                        feature_file=None,
                        miss='NA'):
    """Convert a binary feature matrix into EIGENSTRAT SNP and genotype files.

    The input is a tab-separated matrix whose first line lists the sample
    IDs only; every following line starts with the feature ID followed by
    one value per sample.  Two files are written:

    * ``snp_ofile``: one line per kept feature with the columns
      ``rs<ID>, 1, 0.0, <running number>`` and two empty fields;
    * ``geno_ofile``: one ``rs<ID><TAB>sampleID<TAB>value`` line per kept
      feature/sample pair whose value is not ``miss``.

    Args:
        geno_file: Path of the feature matrix to read.
        geno_ofile: Path of the genotype output file (overwritten).
        snp_ofile: Path of the SNP output file (overwritten).
        sample_file: Optional ID list restricting the samples.
        feature_file: Optional ID list restricting the features.
        miss: Token marking a missing value (such entries are skipped).

    Raises:
        AssertionError: If not every listed feature was found in the matrix
            or a kept row does not contain every listed sample.
    """
    samples = features = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
    if feature_file is not None:
        features = set(read_id_list(feature_file))

    # SNP
    count = 1
    found_features = 0
    with open(geno_file, "r") as geno, open(snp_ofile, "w") as snp_o:
        next(geno, None)  # skip header (sample IDs)
        for line in geno:
            snpID = line.rstrip('\n').split('\t')[0]
            if features is not None and snpID not in features:
                continue
            found_features += 1
            # EIGENSTRAT expects "rs<snpID>"
            snp_o.write(
                "\t".join(["rs" + snpID, "1", "0.0",
                           str(count), "", ""]) + "\n")
            count += 1
    if features is not None and found_features != len(features):
        raise AssertionError("%s: expected %d, found %d" %
                             (snp_ofile, len(features), found_features))

    # Geno
    sampleIDs = None
    found_features = 0
    with open(geno_file, "r") as geno, open(geno_ofile, "w") as geno_o:
        for line in geno:
            fields = line.rstrip('\n').split('\t')
            if sampleIDs is None:
                sampleIDs = fields  # header
                continue
            snpID = fields[0]
            if features is not None and snpID not in features:
                continue
            found_features += 1
            found_samples = 0
            for i in range(1, len(fields)):
                sampleID = sampleIDs[i - 1]
                if samples is not None and sampleID not in samples:
                    continue
                found_samples += 1
                sampleAl = fields[i]
                if sampleAl == miss:
                    continue  # missing values are not written
                geno_o.write("\t".join(["rs" + snpID, sampleID, sampleAl]) +
                             "\n")
            # every kept row must provide a value for every listed sample
            if samples is not None and found_samples != len(samples):
                raise AssertionError(
                    "%s: expected %d, found %d" %
                    (geno_file, len(samples), found_samples))
    if features is not None and found_features != len(features):
        raise AssertionError("%s: expected %d, found %d" %
                             (geno_file, len(features), found_features))
def bin_to_plink_ped_map(geno_file,
                         map_ofile,
                         ped_ofile,
                         family="Fam",
                         sample_file=None,
                         feature_file=None,
                         feature_cl_file=None,
                         miss='NA',
                         verbose=True):
    """Convert a binary feature matrix into PLINK MAP and PED files.

    The input is a tab-separated matrix whose first line lists the sample
    IDs only; every following line starts with the feature ID followed by
    one value per sample.

    * MAP: one line per kept feature with chromosome (``"0"`` or the entry
      of the feature in the ``feature_cl_file`` mapping), feature ID,
      distance ``"0"`` and a running number as position.
    * PED: one line per kept sample with ``family``, sample ID, four
      ``"0"`` columns and one genotype per kept feature, where ``'1'`` is
      written as ``A A``, ``'0'`` as ``G G`` and ``miss`` as ``0 0``; any
      other value is copied unchanged.

    Args:
        geno_file: Path of the feature matrix to read.
        map_ofile: Path of the MAP output file (overwritten).
        ped_ofile: Path of the PED output file (overwritten).
        family: Value of the first PED column.
        sample_file: Optional ID list restricting the samples.
        feature_file: Optional ID list restricting the features.
        feature_cl_file: Optional file read with ``read_feature_cl``; its
            result is indexed by feature ID to get the chromosome column.
        miss: Token marking a missing value.
        verbose: If true, progress messages are written to stdout.
    """
    if verbose:
        sys.stdout.write("Bin. feature mat. to PLINK PED/MAP")
    samples = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
        if verbose:
            sys.stdout.write("Sample list contains %d unique IDs\n" %
                             len(samples))
    features = None
    if feature_file is not None:
        features = set(read_id_list(feature_file))
        if verbose:
            sys.stdout.write("Feature list contains %d unique IDs\n" %
                             len(features))
    feature_cl = None
    if feature_cl_file is not None:
        feature_cl = read_feature_cl(feature_cl_file)

    # MAP
    count = 1
    with open(geno_file, "r") as geno, open(map_ofile, "w") as map_o:
        if verbose:
            sys.stdout.write("\tOutput: %s\n" % map_ofile)
        next(geno, None)  # skip header (sample IDs)
        for line in geno:
            snpID = line.rstrip('\n').split('\t')[0]
            if features is not None and snpID not in features:
                continue
            snpChr = "0" if feature_cl is None else feature_cl[snpID]
            map_o.write("\t".join([snpChr, snpID, "0", str(count)]) + "\n")
            count += 1

    # PED
    # Later keys win, so '1' and '0' take precedence over ``miss``
    allele_codes = {miss: "0 0", "0": "G G", "1": "A A"}
    sample_IDs = None
    genotypes = {}  # kept sample ID -> genotypes in feature order
    with open(geno_file, "r") as geno:
        if verbose:
            sys.stdout.write("\tOutput: %s\n" % ped_ofile)
        for line in geno:
            fields = line.rstrip('\n').split('\t')
            if sample_IDs is None:
                # header: decide once which samples are kept
                sample_IDs = fields
                genotypes = {
                    sampleID: []
                    for sampleID in sample_IDs
                    if samples is None or sampleID in samples
                }
                continue
            snpID = fields[0]
            if features is not None and snpID not in features:
                continue
            for i in range(1, len(fields)):
                sampleID = sample_IDs[i - 1]
                if sampleID not in genotypes:
                    continue
                value = fields[i]
                genotypes[sampleID].append(allele_codes.get(value, value))

    with open(ped_ofile, "w") as ped_o:
        for sampleID, calls in genotypes.items():
            ped_o.write("\t".join(
                [family, sampleID, "0", "0", "0", "0", "\t".join(calls)]) +
                        "\n")
# NOTE All pheno. CV objects
objs = glob.glob("%s/obj/*.pkl" % args.idir)

#--------------------------------------------------#
# NOTE Collect information of each object
# sep_str is read as a global by pheno_general_stats
sep_str = "#%s#" % ('-' * 50)
with open(path.join(odir, "%s_stats.txt" % args.obname), 'w') as ofile:
    for obj in objs:
        info = pheno_general_stats(obj, args.model)
        ofile.write(info)

#--------------------------------------------------#
# NOTE Feature annotation
annot = None
# All genes
obj_ = load_cv_obj(
    objs[0])  # load only first object (all should have same feature list)
features = set(read_id_list(obj_.features))
sys.stdout.write("There are %d features\n" % len(features))
# Annotation from table
if args.f_type == 'gff':
    annot = read_annot_tab(annot_tab=args.f_source)
elif args.f_type == 'vcf':
    annot = read_annot_tab(annot_tab=args.f_source, id_prefix='var_')
# Checked before len(): annot stays None for any other feature type
assert annot is not None
assert len(annot) == len(features)
sys.stdout.write('Annotations were collected\n')

#--------------------------------------------------#
# NOTE Add annotation to results
pool_iter = itertools.product(objs, [annot], [odir], [args.obname],
                              [args.model], [args.f_type])
# NOTE(review): add_gwas_annot is not defined in the visible part of this
# file, whereas add_model_annot takes exactly these six arguments -- confirm
# which function is meant here.
# The context manager releases the worker processes once starmap returned.
with Pool(args.cores) as pool:
    annots = pool.starmap(add_gwas_annot, pool_iter)