def boxcoxTransformation(data=None):
    """Box-Cox transform a strictly positive vector, keeping its shape.

    If any element of ``data`` is <= 0 the input is returned untouched.
    Otherwise a 1-D array, a single-column array (n, 1) or a single-row
    array (1, n) is transformed with ``boxcox`` and returned in the same
    layout. Arrays matching none of these cases are returned unchanged.
    The fitted lambda is discarded.
    """
    # Box-Cox is only applied when every value is positive.
    if sp.where(data.flatten() <= 0.0)[0].shape[0] > 0:
        return data

    if data.ndim == 1:
        return boxcox(data)[0]

    if data.shape[1] == 1:
        transformed = boxcox(data[:, 0])[0]
        column = sp.zeros((transformed.shape[0], 1))
        column[:, 0] = transformed
        return column

    if data.shape[0] == 1:
        transformed = boxcox(data[0, :])[0]
        row = sp.zeros((1, transformed.shape[0]))
        row[0, :] = transformed
        return row

    return data
def boxcoxTransformation(data=None):
    """Return the Box-Cox transform of ``data`` in its original layout.

    Input holding any value <= 0 is handed back as-is. A 1-D array, an
    (n, 1) column or a (1, n) row is passed through ``boxcox`` and returned
    with the same shape; anything matching none of these cases is returned
    unchanged. The lambda estimated by ``boxcox`` is not reported.
    """
    nonpositive = sp.where(data.flatten() <= 0.0)[0]
    if nonpositive.shape[0] > 0:
        # Leave the data alone rather than transform non-positive values.
        return data

    if data.ndim == 1:
        return boxcox(data)[0]

    if data.shape[1] == 1:
        values = boxcox(data[:, 0])[0]
        result = sp.zeros((values.shape[0], 1))
        result[:, 0] = values
        return result

    if data.shape[0] == 1:
        values = boxcox(data[0, :])[0]
        result = sp.zeros((1, values.shape[0]))
        result[0, :] = values
        return result

    return data
def treatdata(self, data):
    """
    Apply the Box-Cox transform to `data` when `self.use_bc` is set;
    otherwise hand `data` back unchanged. The lambda fitted by `boxcox`
    is discarded.
    """
    if not self.use_bc:
        return data
    transformed, _lmbda = boxcox(data)
    return transformed
def doBoxCoxTransformation(self):
    """Box-Cox transform the first `self.n_t` columns of `self.Y` in place.

    Within each column only the finite entries are passed to `MS.boxcox`
    and overwritten with the result; non-finite entries are left as they are.
    """
    for col in range(self.n_t):
        column = self.Y[:, col]
        finite = SP.isfinite(column)
        transformed = MS.boxcox(column[finite])[0]
        self.Y[finite, col] = transformed
def doBoxCoxTransformation(self):
    """Replace the finite values of each column of `self.Y` by their Box-Cox transform.

    Columns 0 .. `self.n_t` - 1 are processed one at a time and updated in
    place; entries that are not finite are skipped and stay untouched.
    """
    for t in range(self.n_t):
        values = self.Y[:, t]
        is_finite = SP.isfinite(values)
        bc_values = MS.boxcox(values[is_finite])[0]
        self.Y[is_finite, t] = bc_values
def _applyTransform(values, transform):
    """Return `values` under the named transform; unknown names leave them unchanged."""
    if transform == 'sqrt':
        return sp.sqrt(values)
    if transform == "boxcox":
        return boxcox(values)[0]
    if transform == "log":
        return sp.log(values)
    if transform == "log10":
        return sp.log10(values)
    return values


def plotHistogram(y=None, phenotype_name=None, transform='sqrt', outdir=None):
    """Plot original vs. transformed phenotype histograms and report the better one.

    NaN entries of `y` are dropped first. The left panel shows the raw
    values, the right panel the transformed values; both titles carry the
    Shapiro-Wilk p-value. The figure is saved as "<phenotype_name>.pdf" in
    `outdir` and then closed.

    Parameters:
        y: array of phenotype values (assumed 1-D -- TODO confirm).
        phenotype_name: used for the plot titles (underscores shown as
            spaces) and for the output file name.
        transform: 'sqrt', 'boxcox', 'log' or 'log10', or 'all' to choose
            the candidate with the highest Shapiro p-value. Any other name
            leaves the data untransformed. If `y` contains a zero the
            transform is forced to 'sqrt'.
        outdir: directory the PDF is written to.

    Returns:
        "original" if the untransformed data has the strictly larger
        Shapiro p-value, otherwise the name of the transform that was used.
    """
    y = y[sp.where(~sp.isnan(y))]
    # y is not modified until the final transform, so check for zeros once.
    has_zero = sp.where(y == 0)[0].shape[0] > 0
    title_name = phenotype_name.replace("_", " ")

    fig = pl.figure(figsize=(12, 6))
    try:
        pl.subplot(121)
        p_value = shapiro(y)[1]
        pl.hist(y, bins=30, color=color_t[1], label="Original")
        pl.title(title_name + ", Shapiro: %.2e" % (p_value))
        leg = pl.legend(fancybox=True)
        leg.get_frame().set_alpha(0.2)
        remove_border()

        pl.subplot(122)
        if transform == "all":
            transformations = sp.array(['boxcox', 'sqrt', 'log', 'log10'])
            if has_zero:
                # No candidate is evaluated on data with zeros; all score 0.
                p_vals = [0.0] * len(transformations)
            else:
                p_vals = [shapiro(_applyTransform(y, t))[1]
                          for t in transformations]
            transform = transformations[sp.argmax(sp.array(p_vals))]

        if has_zero:
            # Single-argument call form prints the same under Python 2 and 3.
            print("IMPORTANT: y contains 0 -> transformation changed to SQRT")
            transform = "sqrt"

        y = _applyTransform(y, transform)
        p_value_t = shapiro(y)[1]
        pl.hist(y, bins=30, color=color_t[4], label=transform)
        pl.title(title_name + ", Shapiro: %.2e" % (p_value_t))
        leg = pl.legend(fancybox=True)
        leg.get_frame().set_alpha(0.2)
        remove_border()
        pl.subplots_adjust(left=0.03, bottom=0.05, right=0.99, top=0.94,
                           wspace=0.07, hspace=0.34)
        pl.savefig(os.path.join(outdir, phenotype_name + ".pdf"))
    finally:
        # Figures stay registered until closed; release this one so repeated
        # calls (one per phenotype) do not accumulate open figures.
        pl.close(fig)

    if p_value > p_value_t:
        return "original"
    return transform
# Command-line driver fragment: sys.argv[1] is passed to read_data(), sys.argv[2]
# is the output directory and sys.argv[3] (when four arguments are given) is
# the requested transform.
# For every phenotype column it calls plotHistogram(), applies the transform
# that call selected ('sqrt', 'boxcox', 'log' or 'log10') to the non-NaN
# entries of that column and prefixes the phenotype name accordingly; a
# result of "original" leaves the column and its name untouched.
# It then opens "transformed_phenotypes.txt" in the output directory and
# writes a "FID IID <phenotype names>" header followed by the start of one
# row per sample.
# NOTE(review): the statements are collapsed onto one line, so it cannot be
# determined which of them are nested under `if len(sys.argv)==4`.
# NOTE(review): the fragment appears truncated -- the per-sample values, the
# line terminator and the close of `f` are not visible here; confirm against
# the full script.
if len(sys.argv)==4: transform = sys.argv[3] [y,phenotype_names,sample_ids,fid] = read_data(sys.argv[1]) output_dir = sys.argv[2] for i,phenotype in enumerate(phenotype_names): selected_transform = plotHistogram(y=y[:,i],phenotype_name=phenotype,transform=transform,outdir=output_dir) ind = sp.where(~sp.isnan(y[:,i]))[0] if selected_transform=='sqrt': phenotype_names[i] = "sqrt_" + phenotype y[ind,i] = sp.sqrt(y[ind,i]) elif selected_transform=="boxcox": phenotype_names[i] = "boxcox_" + phenotype tmp = y[ind,i] [y[ind,i],b_lambda] = boxcox(tmp) elif selected_transform=="log": phenotype_names[i] = "log_" + phenotype y[ind,i] = sp.log(y[ind,i]) elif selected_transform=="log10": phenotype_names[i] = "log10_" + phenotype y[ind,i] = sp.log10(y[ind,i]) f = open(os.path.join(output_dir, "transformed_phenotypes.txt"),'w') f.write("FID IID ") string = "" for phenotype in phenotype_names: string += phenotype + " " f.write(string[:-1] + "\n") for i in range(fid.shape[0]): f.write(fid[i] + " " + sample_ids[i] + " ")
def _applyTransform(values, transform):
    """Return `values` under the named transform; unknown names leave them unchanged."""
    if transform == 'sqrt':
        return sp.sqrt(values)
    if transform == "boxcox":
        return boxcox(values)[0]
    if transform == "log":
        return sp.log(values)
    if transform == "log10":
        return sp.log10(values)
    return values


def plotHistogram(y=None, phenotype_name=None, transform='sqrt', outdir=None):
    """Plot original vs. transformed phenotype histograms and report the better one.

    NaN entries of `y` are dropped first. The left panel shows the raw
    values, the right panel the transformed values; both titles carry the
    Shapiro-Wilk p-value. The figure is saved as "<phenotype_name>.pdf" in
    `outdir` and then closed.

    Parameters:
        y: array of phenotype values (assumed 1-D -- TODO confirm).
        phenotype_name: used for the plot titles (underscores shown as
            spaces) and for the output file name.
        transform: 'sqrt', 'boxcox', 'log' or 'log10', or 'all' to choose
            the candidate with the highest Shapiro p-value. Any other name
            leaves the data untransformed. If `y` contains a zero the
            transform is forced to 'sqrt'.
        outdir: directory the PDF is written to.

    Returns:
        "original" if the untransformed data has the strictly larger
        Shapiro p-value, otherwise the name of the transform that was used.
    """
    y = y[sp.where(~sp.isnan(y))]
    # y is not modified until the final transform, so check for zeros once.
    has_zero = sp.where(y == 0)[0].shape[0] > 0
    title_name = phenotype_name.replace("_", " ")

    fig = pl.figure(figsize=(12, 6))
    try:
        pl.subplot(121)
        p_value = shapiro(y)[1]
        pl.hist(y, bins=30, color=color_t[1], label="Original")
        pl.title(title_name + ", Shapiro: %.2e" % (p_value))
        leg = pl.legend(fancybox=True)
        leg.get_frame().set_alpha(0.2)
        remove_border()

        pl.subplot(122)
        if transform == "all":
            transformations = sp.array(['boxcox', 'sqrt', 'log', 'log10'])
            if has_zero:
                # No candidate is evaluated on data with zeros; all score 0.
                p_vals = [0.0] * len(transformations)
            else:
                p_vals = [shapiro(_applyTransform(y, t))[1]
                          for t in transformations]
            transform = transformations[sp.argmax(sp.array(p_vals))]

        if has_zero:
            # Single-argument call form prints the same under Python 2 and 3.
            print("IMPORTANT: y contains 0 -> transformation changed to SQRT")
            transform = "sqrt"

        y = _applyTransform(y, transform)
        p_value_t = shapiro(y)[1]
        pl.hist(y, bins=30, color=color_t[4], label=transform)
        pl.title(title_name + ", Shapiro: %.2e" % (p_value_t))
        leg = pl.legend(fancybox=True)
        leg.get_frame().set_alpha(0.2)
        remove_border()
        pl.subplots_adjust(left=0.03, bottom=0.05, right=0.99, top=0.94,
                           wspace=0.07, hspace=0.34)
        pl.savefig(os.path.join(outdir, phenotype_name + ".pdf"))
    finally:
        # Figures stay registered until closed; release this one so repeated
        # calls (one per phenotype) do not accumulate open figures.
        pl.close(fig)

    if p_value > p_value_t:
        return "original"
    return transform
# Command-line driver fragment: sys.argv[1] is passed to read_data() and
# sys.argv[2] is the output directory.
# For every phenotype column it calls plotHistogram(), applies the transform
# that call selected ('sqrt', 'boxcox', 'log' or 'log10') to the non-NaN
# entries of that column and prefixes the phenotype name accordingly; a
# result of "original" leaves the column and its name untouched.
# It then opens "transformed_phenotypes.txt" in the output directory and
# writes a "FID IID <phenotype names>" header followed by the start of one
# row per sample.
# NOTE(review): `transform` is not assigned in the visible text; presumably
# it is set earlier in the script -- confirm.
# NOTE(review): the fragment appears truncated -- the per-sample values, the
# line terminator and the close of `f` are not visible here; confirm against
# the full script.
[y, phenotype_names, sample_ids, fid] = read_data(sys.argv[1]) output_dir = sys.argv[2] for i, phenotype in enumerate(phenotype_names): selected_transform = plotHistogram(y=y[:, i], phenotype_name=phenotype, transform=transform, outdir=output_dir) ind = sp.where(~sp.isnan(y[:, i]))[0] if selected_transform == 'sqrt': phenotype_names[i] = "sqrt_" + phenotype y[ind, i] = sp.sqrt(y[ind, i]) elif selected_transform == "boxcox": phenotype_names[i] = "boxcox_" + phenotype tmp = y[ind, i] [y[ind, i], b_lambda] = boxcox(tmp) elif selected_transform == "log": phenotype_names[i] = "log_" + phenotype y[ind, i] = sp.log(y[ind, i]) elif selected_transform == "log10": phenotype_names[i] = "log10_" + phenotype y[ind, i] = sp.log10(y[ind, i]) f = open(os.path.join(output_dir, "transformed_phenotypes.txt"), 'w') f.write("FID IID ") string = "" for phenotype in phenotype_names: string += phenotype + " " f.write(string[:-1] + "\n") for i in range(fid.shape[0]): f.write(fid[i] + " " + sample_ids[i] + " ")
# train(data, dist_choices, FORCE_APPART=False, bc_transform=True)
#
# Fits a mixture of the distributions named in `dist_choices` to `data` and
# returns an `m_modal` object carrying the fitted components (`dists`), their
# weights normalised to sum to one (`weights`) and the `use_bc` flag.
#
# Steps visible in the code:
#   1. When `bc_transform` is true the data is Box-Cox transformed first.
#   2. Initial centres: the mean for a single component; the max and min of
#      the data for two components when FORCE_APPART is set; otherwise the
#      centroids returned by kmeans().
#   3. A normalised histogram of the (transformed) data is built as the
#      empirical pdf; the bin count is len(t_data) / get_bin_divisor(...).
#   4. An initial parameter vector and matching bounds are assembled per
#      component. Supported names are 'norm', 'uniform' and 'skewnorm'; any
#      other name raises KeyError.
#   5. `score` is minimised with fmin_tnc (approximated gradient, bounded) and
#      the result is unpacked with unpack().
#
# NOTE(review): the body is collapsed onto two physical lines, so the original
# nesting cannot be recovered from this text -- e.g. whether `centroids.sort()`
# belongs to the kmeans branch only, and whether each `bounds += ...` sits
# after or inside the preceding `except:`. Confirm against the original file.
# NOTE(review): the bare `except:` falls back to `cent[0]`; presumably this
# covers centroids that are arrays rather than scalars -- verify, since it
# also hides any other error raised by the concatenate call.
# NOTE(review): Python 2 syntax is used (`raise KeyError, ...`, `izip`, integer
# `/` for the bin count); `N` is presumably numpy, and the `new=` / `normed=`
# histogram keywords look like an old NumPy API -- confirm the target version.
def train(data, dist_choices, FORCE_APPART = False, bc_transform = True): """ Given a vector of data and list of distribution types the trainer will find the best fit for the mixture distribution. """ param_vec = N.array([]) bounds = [] if bc_transform: t_data, llm = boxcox(data) else: t_data = data #make initial guesses based on kmeans num_clust = len(dist_choices) init_weight = float(1)/float(num_clust) if num_clust == 1: centroids = [N.mean(t_data)] elif (num_clust == 2) & FORCE_APPART: c1 = N.max(t_data) c2 = N.min(t_data) centroids = N.array([c1, c2]) else: (centroids, distortion) = kmeans(t_data, num_clust) centroids.sort() min_val = N.min(t_data) max_val = N.max(t_data) #create an "emperical" pdf bin_divisor = get_bin_divisor(len(t_data)) n, bins = N.histogram(t_data, new=True, bins = len(t_data)/bin_divisor, normed = True) if len(dist_choices) > 1: max_width = (bins[1]-bins[0])*bin_divisor else: max_width = None #create param_vec and bounds vector for this_dist, cent in izip(dist_choices, centroids): if this_dist == 'norm': try: param_vec = N.concatenate((param_vec, N.array([cent,1,init_weight]))) except: param_vec = N.concatenate((param_vec, N.array([cent[0],1,init_weight]))) bounds += [(min_val, max_val), (0, max_width), (0,1)] elif this_dist == 'uniform': param_vec = N.concatenate((param_vec, N.array([min_val,max_val,init_weight]))) bounds += [(min_val, max_val), (min_val, max_val), (0,1)] elif this_dist == 'skewnorm': try: param_vec = N.concatenate((param_vec, N.array([cent,1,1,init_weight]))) except: param_vec = N.concatenate((param_vec, N.array([cent[0],1,1,init_weight]))) bounds += [(min_val, max_val), (0, max_width), (0, max_width), (0,1)] else: raise KeyError, 'Unknown distribution %s' % this_dist #do the actual training param_val, like, d = fmin_tnc(score, param_vec, args = (dist_choices, bins[1:], n), approx_grad = True, bounds = bounds, messages = 0) #make the trained distribution t_dist = m_modal() t_dist.use_bc = bc_transform t_dist.dists, t_w 
= unpack(param_val, dist_choices) #save normalized weights t_dist.weights = N.array(t_w)/N.array(t_w).sum() #TF, pval = t_dist.pval(data) return t_dist