def R_run_loess(x, y, span=0.75):
    """
    Fit y as a function of x with R's loess and subtract the fitted trend.

    Takes two numpy vectors. Both are modified in place: every entry
    flagged by utils.where_null is overwritten with R's NA before the fit.

    Returns (corrected_y, correction_factor): correction_factor is the loess
    prediction evaluated at x, corrected_y is y minus that prediction.
    """
    # Ensure that Inf/-Inf values are substituted
    x[utils.where_null(x)] = robj.NA_Real
    # Blank y wherever x is flagged (evaluated after x was overwritten,
    # exactly as the original code did).
    y[utils.where_null(x)] = robj.NA_Real
    # y's own null-like entries have to be substituted too; previously only
    # the mask derived from x was applied to y.
    y[utils.where_null(y)] = robj.NA_Real

    data = robj.DataFrame({"x": x, "y": y})
    loess_fit = r.loess("y ~ x", data=data, span=span, family="symmetric")

    correction_factor = np.array(list(r.predict(loess_fit, x)))
    corrected_y = np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
def run_loess(x, y, span=0.75):
    """
    Fit y as a function of x with R's loess and subtract the fitted trend.

    Takes two numpy vectors. Both are modified in place: every entry
    flagged by where_na_like is overwritten with R's NA before the fit.

    Returns (corrected_y, correction_factor): correction_factor is the loess
    prediction evaluated at x, corrected_y is y minus that prediction.
    """
    # Ensure that Inf/-Inf values are substituted
    x[where_na_like(x)] = robj.NA_Real
    # Blank y wherever x is flagged (evaluated after x was overwritten,
    # exactly as the original code did).
    y[where_na_like(x)] = robj.NA_Real
    # y's own NA-like entries have to be substituted too; previously only
    # the mask derived from x was applied to y.
    y[where_na_like(y)] = robj.NA_Real

    data = robj.DataFrame({"x": x, "y": y})
    loess_fit = r.loess("y ~ x", data=data, span=span, family="symmetric")

    correction_factor = np.array(list(r.predict(loess_fit, x)))
    corrected_y = np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
def rloess_smooth(xdata, ydata, xfit, span=0.07, deg=2):
    """
    Smooth (xdata, ydata) with R's loess via rpy2 and evaluate it at xfit.

    Input:
        xdata, ydata   data points used for the fit
        xfit           points at which the fitted curve is evaluated
        span           forwarded to loess as ``span``
        deg            forwarded to loess as ``deg``
    Output:
        numpy array holding the loess prediction at each element of xfit
    """
    frame = ro.DataFrame({"x": xdata, "y": ydata})
    model = r.loess('y ~ x', data=frame, span=span, deg=deg)
    predicted = r.predict(model, xfit)
    return np.array(list(predicted))
def meanVar(_files, _gff_file, _output):
    """
    Count reads per gene in each SAM input, then simulate negative-binomial
    counts from a loess fit of variance against mean.

    Reads the module-level names MODE, GENE, TX, EXON and NREPS and calls
    countSam once per input.

    Input:
        _files     SAM inputs, one per library (at least two); each item is
                   handed to HTSeq.SAM_Reader
        _gff_file  iterable of annotation features (type, name, iv, attr)
        _output    path prefix; '<prefix>.rawcounts' and '<prefix>.nbcounts'
                   are written
    Output:
        dict mapping gene id to its list of NREPS simulated counts
        (the raw counts are replaced in the same dict)

    Exits with status 1 when fewer than two inputs are given.
    """
    NFILE = len(_files)
    # A variance needs at least two libraries; "< 2" also rejects an empty
    # list, which the former "== 1" test let through.
    if NFILE < 2:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    _dict_counts = dict()  # gene id -> per-library read counts
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)

    if MODE == "all-genes":
        for feature in _gff_file:
            if feature.type in GENE:
                _dict_counts[feature.name] = [0] * NFILE
                _genes[feature.iv] += feature.name
            if feature.type in TX:
                gene_id = feature.attr["geneID"]
                # Only the first transcript of a gene that had no gene line
                # contributes its interval.
                if gene_id not in _dict_counts:
                    _dict_counts[gene_id] = [0] * NFILE
                    _genes[feature.iv] += gene_id
    elif MODE == "AS-genes":
        # Keep only genes for which more than one transcript id was seen.
        transcript = set()
        cur_line = None
        last_gene_id = None
        for feature in _gff_file:
            if feature.type in GENE:
                # A new gene line closes the previous gene.
                if cur_line is not None and len(transcript) > 1:
                    _dict_counts[cur_line.name] = [0] * NFILE
                    _genes[cur_line.iv] += cur_line.name
                cur_line = feature
                transcript.clear()
            if feature.type in TX:
                if "geneID" in feature.attr:
                    key = "geneID"
                elif "Parent" in feature.attr:
                    key = "Parent"
                else:
                    sys.stderr.write("transcript line does not have Parent or geneID field\n")
                    # Nothing to group this transcript by; skip it instead
                    # of failing on a lookup with a None key.
                    continue
                gene_id = feature.attr[key]
                if last_gene_id == gene_id:
                    transcript.add(feature.attr["ID"])
                else:
                    # NOTE(review): this registers the *new* gene id with the
                    # new transcript's interval when the previous group had
                    # several transcripts. It only fires for annotations
                    # without gene lines; kept as-is, confirm the intent.
                    if len(transcript) > 1:
                        if gene_id not in _dict_counts:
                            _dict_counts[gene_id] = [0] * NFILE
                            _genes[feature.iv] += gene_id
                    transcript.clear()
                    transcript.add(feature.attr["ID"])
                    last_gene_id = gene_id
            if feature.type in EXON:
                transcript.add(feature.attr["Parent"])
        # The last gene is never followed by another gene line, so it has to
        # be flushed here; it used to be dropped silently.
        if cur_line is not None and len(transcript) > 1:
            _dict_counts[cur_line.name] = [0] * NFILE
            _genes[cur_line.iv] += cur_line.name

    # Same output as the former print statement (note the double space).
    print("num of genes to simulate:  %d" % len(_dict_counts))

    # Both outputs are opened up front, as before, and closed on any exit.
    with open(_output + '.rawcounts', 'w') as _file_raw_count, \
            open(_output + '.nbcounts', 'w') as _file_nb_count:
        # Read through the input list and call countSam for each input.
        for idx, f in enumerate(_files):
            sam_file = HTSeq.SAM_Reader(f)
            _dict_counts = countSam(sam_file, _genes, _dict_counts, idx)
            # Inputs may be open file objects or plain paths; only the
            # former can (and must) be closed.
            close = getattr(f, "close", None)
            if callable(close):
                close()
            sys.stderr.write("library %d has generated.\n" % (idx + 1))

        # One fixed gene order is shared by every step below.
        gene_ids = sorted(_dict_counts)

        # Raw counts, one tab-separated row per gene.
        for key in gene_ids:
            _file_raw_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

        # Per-gene mean and variance across libraries.
        list_mean = [np.mean(np.array(_dict_counts[key])) for key in gene_ids]
        list_var = [np.var(np.array(_dict_counts[key])) for key in gene_ids]

        # Loess estimate of variance as a function of mean (via rpy2),
        # fitted on genes with mean > 0 and evaluated for every gene.
        a = robjects.FloatVector(list_mean)
        b = robjects.FloatVector(list_var)
        df = robjects.DataFrame({"mean": a, "var": b})
        non0_df = df.rx(df.rx2("mean").ro > 0, True)
        loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
        var_pred = r.predict(loess_fit, a)

        # Replace the raw counts with simulated negative-binomial draws.
        for count_idx, key in enumerate(gene_ids):
            mean = list_mean[count_idx]
            excess = var_pred[count_idx] - mean
            # "excess > 0" is False for NaN as well, so a missing prediction
            # or var <= mean yields n = 0 instead of crashing in int() or in
            # the division.
            if excess > 0:
                n = int(math.pow(mean, 2) / excess)  # n: number of failures
            else:
                n = 0
            if n <= 0:
                _dict_counts[key] = [0] * NREPS
            else:
                p = n / float(n + mean)  # p: prob of success
                _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()

        for key in gene_ids:
            _file_nb_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

    return _dict_counts
def AFS(order, a=6, q=0.95, d=0.25):
    """
    Remove the blaze from one spectral order with the AFS algorithm.

    Input:
        order  two-column pandas DataFrame: wavelength, then intensity.
               It is not modified; the function works on a private copy.
        a      the alpha-shape radius is (wavelength range) / a
        q      quantile used to pick the top points inside each window
        d      span passed to both loess fits
    Output:
        array of the rescaled intensity divided by the final loess estimate,
        one value per row of ``order``
    """
    # Private copy with a fresh 0..n-1 index: the code below mixes label and
    # positional access, and the caller's frame must not be renamed, rescaled
    # or extended (a second call on the same frame used to fail).
    order = order.reset_index(drop=True)
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    # ref is a pandas series recording wavelength.
    ref = order["wv"]
    # u is the parameter u in step 1 of the AFS algorithm; it scales the
    # intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u
    # alpha is 1/a of the wavelength range of the whole order.
    alpha = (ref.max() - ref.min()) / a

    # All sample points (lambda_i, y_i).
    points = list(zip(order["wv"].values, order["intens"].values))
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    def find_vertices(polygon):
        # Map every exterior vertex of a shapely polygon back to the index
        # of the pixel with the same wavelength.
        return [ref[ref == coord[0]].index[0]
                for coord in polygon.exterior.coords]

    # One loop per polygon of the alpha shape (usually there is only one).
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        loops = [find_vertices(alpha_shape)]
    else:
        # Multi-part geometries expose their parts through ``.geoms``;
        # iterating the geometry itself no longer works in Shapely 2.
        parts = getattr(alpha_shape, "geoms", alpha_shape)
        loops = [find_vertices(polygon) for polygon in parts]

    # Wa collects the indices of the points in W_alpha. The leading 0 is a
    # placeholder that is removed again after sorting.
    Wa = [0]
    for loop in loops:
        # Drop the closing vertex and the last pixel.
        as_k = [i for i in loop[:-1] if i < n - 1]
        max_k = max(as_k)
        min_k = min(as_k)
        if not (as_k[0] == min_k and as_k[-1] == max_k):
            # Keep only the run from the smallest to the largest index,
            # wrapping around the end of the list when necessary.
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if index_min < index_max:
                as_k = as_k[index_min:index_max + 1]
            else:
                as_k = as_k[index_min:] + as_k[:index_max + 1]
        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # tilde(AS_alpha): linear interpolation between neighbouring W_alpha
    # points. Plain arrays are written instead of chained Series assignment,
    # which does not update the frame under pandas Copy-on-Write.
    wv = order["wv"].values
    hull = order["intens"].values.astype(float)  # astype returns a copy
    upper = 0  # position in Wa of the first index greater than i
    for i in range(n - 1):
        # i only grows, so the cursor only moves forward (no rescan of Wa).
        while upper < len(Wa) and Wa[upper] <= i:
            upper += 1
        if upper == len(Wa):
            break  # no W_alpha point to the right: keep the remaining pixels
        left = Wa[upper - 1]
        right = Wa[upper]
        hull[i] = hull[left] + (hull[right] - hull[left]) * (
            (wv[i] - wv[left]) / (wv[right] - wv[left]))

    # Step 3: second-order local polynomial (R loess) on tilde(AS_alpha).
    x = robjects.FloatVector(list(wv))
    y = robjects.FloatVector(list(hull))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)

    # order["select"] records hat(y^(1)).
    order["select"] = order["intens"].values / B1

    # Each row of window is a pair of neighbouring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # Step 4: in each window keep the points at or above the q quantile of
    # select; their indices form S_alpha,q.
    index = []
    for w_start, w_end in window:
        temp = order.loc[w_start:w_end]
        keep = temp["select"] >= np.quantile(temp["select"], q)
        index.extend(temp[keep].index)
    index = np.unique(index)  # unique and sorted

    # Run loess for the last time, on the selected points only.
    x_2 = robjects.FloatVector(list(order.iloc[index]["wv"].values))
    y_2 = robjects.FloatVector(list(order.iloc[index]["intens"].values))
    df2 = robjects.DataFrame({"x_2": x_2, "y_2": y_2})
    loess_fit2 = r.loess("y_2 ~ x_2", data=df2, degree=2, span=d,
                         surface="direct")
    y_final = r.predict(loess_fit2, x)

    # Return the blaze-removed spectrum.
    return order["intens"].values / y_final
def ALSFS(order, led, a=6, q=0.95, d=0.25):
    """
    Remove the blaze from one spectral order with the ALSFS algorithm,
    using a lab source spectrum as the blaze template.

    Input:
        order  two-column pandas DataFrame: wavelength, then intensity
        led    pandas DataFrame with columns "wv" and "intens", accessed by
               the same row positions as ``order``
        a      the alpha-shape radius is (wavelength range) / a
        q      quantile used to pick the top points inside each window
        d      span passed to the loess fit
    Output:
        array of the rescaled intensity divided by the fitted linear
        transformation of ``led``, one value per row of ``order``

    Neither input frame is modified, and no global pandas option is changed:
    the explicit copies below remove the need to silence chained-assignment
    warnings process-wide.
    """
    # Private copies; the fresh 0..n-1 index on ``order`` matters because the
    # code below mixes label and positional access.
    order = order.reset_index(drop=True)
    led = led.copy()
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    ref = order["wv"]
    # u is the parameter u in step 1 of the AFS algorithm; it scales the
    # intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u
    # alpha is 1/a of the wavelength range of the whole order.
    alpha = (ref.max() - ref.min()) / a

    # All sample points (lambda_i, y_i).
    points = list(zip(order["wv"].values, order["intens"].values))
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    def find_vertices(polygon):
        # Map every exterior vertex of a shapely polygon back to the index
        # of the pixel with the same wavelength.
        return [ref[ref == coord[0]].index[0]
                for coord in polygon.exterior.coords]

    # One loop per polygon of the alpha shape (usually there is only one).
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        loops = [find_vertices(alpha_shape)]
    else:
        # Multi-part geometries expose their parts through ``.geoms``;
        # iterating the geometry itself no longer works in Shapely 2.
        parts = getattr(alpha_shape, "geoms", alpha_shape)
        loops = [find_vertices(polygon) for polygon in parts]

    # Wa collects the indices of the points in W_alpha. The leading 0 is a
    # placeholder that is removed again after sorting.
    Wa = [0]
    for loop in loops:
        # Drop the closing vertex and the last pixel.
        as_k = [i for i in loop[:-1] if i < n - 1]
        max_k = max(as_k)
        min_k = min(as_k)
        if not (as_k[0] == min_k and as_k[-1] == max_k):
            # Keep only the run from the smallest to the largest index,
            # wrapping around the end of the list when necessary.
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if index_min < index_max:
                as_k = as_k[index_min:index_max + 1]
            else:
                as_k = as_k[index_min:] + as_k[:index_max + 1]
        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # tilde(AS_alpha): linear interpolation between neighbouring W_alpha
    # points. Plain arrays are written instead of chained Series assignment,
    # which does not update the frame under pandas Copy-on-Write.
    wv = order["wv"].values
    hull = order["intens"].values.astype(float)  # astype returns a copy
    upper = 0  # position in Wa of the first index greater than i
    for i in range(n - 1):
        # i only grows, so the cursor only moves forward (no rescan of Wa).
        while upper < len(Wa) and Wa[upper] <= i:
            upper += 1
        if upper == len(Wa):
            break  # no W_alpha point to the right: keep the remaining pixels
        left = Wa[upper - 1]
        right = Wa[upper]
        hull[i] = hull[left] + (hull[right] - hull[left]) * (
            (wv[i] - wv[left]) / (wv[right] - wv[left]))

    # Second-order local polynomial (R loess) on tilde(AS_alpha).
    x = robjects.FloatVector(list(wv))
    y = robjects.FloatVector(list(hull))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)

    # order["select"] records hat(y^(1)).
    order["select"] = order["intens"].values / B1
    # Q_2q-1 in step 3 of the ALSFS algorithm.
    Q = np.quantile(order["select"], 1 - (1 - q) * 2)

    # Each row of window is a pair of neighbouring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # In each window keep the points whose select value reaches both the
    # window's q quantile and the global threshold Q.
    index = []
    for w_start, w_end in window:
        temp = order.loc[w_start:w_end]
        temp_q = max(np.quantile(temp["select"], q), Q)
        index.extend(temp[temp["select"] >= temp_q].index)
    index = np.unique(index)  # unique and sorted

    # Step 5: fit the three parameters of a linear transformation of the
    # lab spectrum with minimize(); the final estimate is B2.
    m = len(index)
    # Bring the lab intensities to the same peak height as the order.
    led["intens"] = led["intens"] / np.max(led["intens"].values) * np.max(
        order["intens"].values)
    # Explicit copy so that adding a column never touches ``led``.
    Xnew = led.iloc[index].copy()
    Xnew["constants"] = np.ones(m)
    Xnew = Xnew.reindex(columns=["constants", "intens", "wv"])
    order_new = order.iloc[index]
    beta = np.array([0, 1, 0])  # start from B2 == led intensity
    v1 = order_new["intens"].values
    m1 = Xnew.values

    # Objective: squared deviation of intensity / template from one.
    def f(beta):
        return np.sum(
            np.square(np.divide(v1, np.matmul(m1, beta)) - np.ones(m)))

    op_result = minimize(f, beta)
    param = op_result.x
    B2 = (param[1] * led["intens"].values
          + param[2] * led["wv"].values
          + param[0])
    return order["intens"].values / B2
def meanVar(_files, _gff_file, _output):
    """
    Count reads per gene in each BAM input, then simulate negative-binomial
    counts from a loess fit of variance against mean.

    Only genes for which more than one distinct exon "Parent" value was seen
    are kept. Reads the module-level names GENE, EXON and NREPS and calls
    countbam once per input.

    Input:
        _files     BAM inputs, one per library (at least two); each item is
                   handed to HTSeq.BAM_Reader
        _gff_file  iterable of annotation features (type, name, iv, attr)
        _output    path prefix; '<prefix>.rawcounts' and '<prefix>.nbcounts'
                   are written
    Output:
        dict mapping gene id to its list of NREPS simulated counts
        (the raw counts are replaced in the same dict)

    Exits with status 1 when fewer than two inputs are given.
    """
    NFILE = len(_files)
    # A variance needs at least two libraries; "< 2" also rejects an empty
    # list, which the former "== 1" test let through.
    if NFILE < 2:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    ## Dictionary of gene counts
    _dict_counts = dict()
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    count = 0
    transcript = set()
    cur_line = None

    for feature in _gff_file:
        if feature.type in GENE:
            # A new gene line closes the previous gene.
            if cur_line is not None and len(transcript) > 1:
                _dict_counts[cur_line.name] = [0] * NFILE
                _genes[cur_line.iv] += cur_line.name
                count += 1
            cur_line = feature
            transcript.clear()
        if feature.type in EXON:
            transcript.add(feature.attr["Parent"])
    # Flush the last gene once the annotation is exhausted. This replaces
    # the former "lines == num_lines" test, which relied on an external line
    # count and ran before the final exon had been recorded.
    if cur_line is not None and len(transcript) > 1:
        _dict_counts[cur_line.name] = [0] * NFILE
        _genes[cur_line.iv] += cur_line.name
        count += 1

    # Same output as the former print statement.
    print("Number of genes %d" % count)

    # Both outputs are opened up front, as before, and closed on any exit.
    with open(_output + '.rawcounts', 'w') as _file_raw_count, \
            open(_output + '.nbcounts', 'w') as _file_nb_count:
        ## Read through the input list and call countbam for each input file
        for idx, f in enumerate(_files):
            bam_file = HTSeq.BAM_Reader(f)
            _dict_counts = countbam(bam_file, _genes, _dict_counts, idx)
            sys.stderr.write("Library %d has generated.\n" % (idx + 1))

        # One fixed gene order is shared by every step below.
        gene_ids = sorted(_dict_counts)

        ## Raw counts, one tab-separated row per gene
        for key in gene_ids:
            _file_raw_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

        ## Per-gene mean and variance across libraries
        list_mean = [np.mean(np.array(_dict_counts[key])) for key in gene_ids]
        list_var = [np.var(np.array(_dict_counts[key])) for key in gene_ids]

        ## Loess estimate of variance as a function of mean (via rpy2),
        ## fitted on genes with mean > 0 and evaluated for every gene
        a = robjects.FloatVector(list_mean)
        b = robjects.FloatVector(list_var)
        df = robjects.DataFrame({"mean": a, "var": b})
        non0_df = df.rx(df.rx2("mean").ro > 0, True)
        loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
        var_pred = r.predict(loess_fit, a)

        # Replace the raw counts with simulated negative-binomial draws.
        for count_idx, key in enumerate(gene_ids):
            mean = list_mean[count_idx]
            excess = var_pred[count_idx] - mean
            # "excess > 0" is False for NaN as well, so a missing prediction
            # or var <= mean yields n = 0 instead of crashing in int() or in
            # the division.
            if excess > 0:
                n = int(math.pow(mean, 2) / excess)  # n: number of failures
            else:
                n = 0
            if n <= 0:
                _dict_counts[key] = [0] * NREPS
            else:
                p = n / float(n + mean)  # p: prob of success
                _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()

        for key in gene_ids:
            _file_nb_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

    return _dict_counts