示例#1
0
def R_run_loess(x, y, span=0.75):
    """
    Fit an R loess curve of y on x and subtract the fitted trend from y.

    Both inputs are modified in place before fitting: the positions
    reported by ``utils.where_null(x)`` are overwritten with R's NA value,
    first in ``x`` and then in ``y``.

    Args:
        x: numpy vector of predictor values (mutated).
        y: numpy vector of response values (mutated).
        span: smoothing span forwarded to R's ``loess``.

    Returns:
        Tuple ``(corrected_y, correction_factor)`` of numpy arrays, where
        ``correction_factor`` is the loess prediction at ``x`` and
        ``corrected_y`` is ``y`` minus that prediction.
    """
    # Replace entries flagged by the helper (Inf/-Inf according to the
    # original author) with NA so that R ignores them.
    null_positions = utils.where_null(x)
    x[null_positions] = robj.NA_Real
    # NOTE(review): the mask for y is recomputed from the already patched x,
    # not from y itself -- confirm that this is intended.
    y[utils.where_null(x)] = robj.NA_Real

    frame = robj.DataFrame({"x": x, "y": y})
    fitted_model = r.loess(
        "y ~ x", data=frame, span=span, family="symmetric")

    correction_factor = np.array(list(r.predict(fitted_model, x)))
    return np.array(list(y)) - correction_factor, correction_factor
示例#2
0
def run_loess(x, y, span=0.75):
    """
    Fit an R loess curve of y on x and subtract the fitted trend from y.

    Both inputs are modified in place before fitting: the positions
    reported by ``where_na_like(x)`` are overwritten with R's NA value,
    first in ``x`` and then in ``y``.

    Args:
        x: numpy vector of predictor values (mutated).
        y: numpy vector of response values (mutated).
        span: smoothing span forwarded to R's ``loess``.

    Returns:
        Tuple ``(corrected_y, correction_factor)`` of numpy arrays, where
        ``correction_factor`` is the loess prediction at ``x`` and
        ``corrected_y`` is ``y`` minus that prediction.
    """
    # Replace entries flagged by the helper (Inf/-Inf according to the
    # original author) with NA so that R ignores them.
    na_positions = where_na_like(x)
    x[na_positions] = robj.NA_Real
    # NOTE(review): the mask for y is recomputed from the already patched x,
    # not from y itself -- confirm that this is intended.
    y[where_na_like(x)] = robj.NA_Real

    frame = robj.DataFrame({"x": x, "y": y})
    fitted_model = r.loess(
        "y ~ x", data=frame, span=span, family="symmetric")

    correction_factor = np.array(list(r.predict(fitted_model, x)))
    return np.array(list(y)) - correction_factor, correction_factor
def rloess_smooth(xdata, ydata, xfit, span=0.07, deg=2):
    """
    Smooth (xdata, ydata) with R's loess through rpy2 and evaluate at xfit.

    Args:
        xdata: x coordinates of the observations.
        ydata: y coordinates of the observations.
        xfit: x values at which the fitted curve is evaluated.
        span: span passed to R's ``loess``.
        deg: degree of the local polynomial, passed to R as ``degree``.

    Returns:
        numpy array holding the fitted y values at ``xfit``.

    NOTE(review): the inputs are handed to rpy2 unchanged, so this
    presumably relies on rpy2's numpy conversion being active -- confirm.
    """
    rdf = ro.DataFrame({"x": xdata, "y": ydata})
    # R's formal argument is ``degree``; name it in full instead of relying
    # on R's partial argument matching of ``deg``.
    loess_fit = r.loess('y ~ x', data=rdf, span=span, degree=deg)
    yfit = np.array(list(r.predict(loess_fit, xfit)))
    return yfit
示例#4
0
def meanVar(_files, _gff_file, _output):
    """Count reads per gene, fit a mean/variance trend and simulate counts.

    The annotation is scanned according to the module-level ``MODE``
    ("all-genes" or "AS-genes") to decide which genes are tracked. Every
    input in ``_files`` is then counted with ``countSam``, the raw counts
    are written to ``<_output>.rawcounts``, a loess curve of variance on
    mean is fitted in R, and ``NREPS`` negative-binomial counts per gene
    are drawn and written to ``<_output>.nbcounts``.

    Args:
        _files: sequence of inputs handed to ``HTSeq.SAM_Reader``; an input
            that has a ``close`` method is closed after it was counted.
        _gff_file: iterable of annotation features exposing ``type``,
            ``name``, ``iv`` and ``attr``.
        _output: path prefix of the two output files.

    Returns:
        dict mapping each gene id to its list of ``NREPS`` simulated counts.

    Exits the process with status 1 when fewer than two inputs are given.
    """
    NFILE = len(_files)
    # The variance needs at least two samples; this also rejects an empty
    # list, which the previous ``== 1`` test let through.
    if NFILE < 2:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    _dict_counts = dict()  # gene id -> one count per input file
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)

    if MODE == "all-genes":
        for feature in _gff_file:
            if feature.type in GENE:
                _dict_counts[feature.name] = [0] * NFILE
                _genes[feature.iv] += feature.name
            if feature.type in TX:
                gene_id = feature.attr["geneID"]
                if gene_id not in _dict_counts:
                    _dict_counts[gene_id] = [0] * NFILE
                    _genes[feature.iv] += gene_id
    elif MODE == "AS-genes":
        # Only genes for which more than one transcript id was seen are kept.
        transcript = set()
        cur_line = None
        last_gene_id = None
        for feature in _gff_file:
            if feature.type in GENE:
                # A new gene line closes the previous gene.
                if cur_line is not None and len(transcript) > 1:
                    _dict_counts[cur_line.name] = [0] * NFILE
                    _genes[cur_line.iv] += cur_line.name
                cur_line = feature
                transcript.clear()
            if feature.type in TX:
                if "geneID" in feature.attr:
                    key = "geneID"
                elif "Parent" in feature.attr:
                    key = "Parent"
                else:
                    key = None
                    sys.stderr.write(
                        "transcript line does not have Parent or geneID field\n")
                # Without a gene reference the transcript cannot be assigned;
                # skip it instead of failing on ``feature.attr[None]``.
                if key is not None:
                    gene_id = feature.attr[key]
                    if last_gene_id == gene_id:
                        transcript.add(feature.attr["ID"])
                    else:
                        # NOTE(review): this registers the gene of the
                        # *current* transcript when the *previous* gene had
                        # more than one transcript; kept as in the original
                        # -- confirm the intent.
                        if len(transcript) > 1 and gene_id not in _dict_counts:
                            _dict_counts[gene_id] = [0] * NFILE
                            _genes[feature.iv] += gene_id
                        transcript.clear()
                        transcript.add(feature.attr["ID"])
                        last_gene_id = gene_id
            if feature.type in EXON:
                transcript.add(feature.attr["Parent"])
        # The last gene is not followed by another gene line, so it has to be
        # closed here; it used to be dropped silently.
        if cur_line is not None and len(transcript) > 1:
            _dict_counts[cur_line.name] = [0] * NFILE
            _genes[cur_line.iv] += cur_line.name

    # Same bytes as the former Python 2 print statement (two spaces).
    sys.stdout.write("num of genes to simulate:  %d\n" % len(_dict_counts))

    # Both outputs are opened before the expensive counting so that an
    # unwritable path fails early; ``with`` closes them on every exit path.
    with open(_output + '.nbcounts', 'w') as _file_nb_count:
        with open(_output + '.rawcounts', 'w') as _file_raw_count:
            for idx, f in enumerate(_files):
                sam_file = HTSeq.SAM_Reader(f)
                _dict_counts = countSam(sam_file, _genes, _dict_counts, idx)
                # Inputs may be open handles or plain paths; close handles only.
                if hasattr(f, "close"):
                    f.close()
                sys.stderr.write("library %d has generated.\n" % (idx + 1))

            # One fixed ordering keeps the files, the mean/variance vectors
            # and the simulation loop aligned.
            gene_ids = sorted(_dict_counts)
            for key in gene_ids:
                _file_raw_count.write(
                    key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

        # Per-gene mean and variance across the input libraries.
        list_mean = [np.mean(np.array(_dict_counts[key])) for key in gene_ids]
        list_var = [np.var(np.array(_dict_counts[key])) for key in gene_ids]

        # Loess fit of variance on mean, restricted to genes with mean > 0.
        a = robjects.FloatVector(list_mean)
        b = robjects.FloatVector(list_var)
        df = robjects.DataFrame({"mean": a, "var": b})
        non0_df = df.rx(df.rx2("mean").ro > 0, True)
        loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
        var_pred = r.predict(loess_fit, a)

        # Replace the observed counts by simulated ones.
        for count_idx, key in enumerate(gene_ids):
            mean = list_mean[count_idx]
            excess = var_pred[count_idx] - mean
            # ``excess > 0`` is False for NaN as well (e.g. an NA prediction),
            # which previously crashed in int(); a zero excess used to divide
            # by zero. Both now yield all-zero counts, like n <= 0.
            if excess > 0:
                n = int(math.pow(mean, 2) / excess)  # n: number of failures
            else:
                n = 0
            if n <= 0:
                _dict_counts[key] = [0] * NREPS
            else:
                p = n / float(n + mean)  # p: prob of success
                _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()

        for key in gene_ids:
            _file_nb_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")
    return _dict_counts
示例#5
0
def AFS(order, a=6, q=0.95, d=0.25):
    """Estimate the blaze function of one spectral order and divide it out.

    Steps, as implemented below:
      1. Rescale the intensities by ``u`` (wavelength range / 10 / maximum
         intensity).
      2. Build the alpha shape of the (wavelength, intensity) points with
         ``alpha = wavelength range / a``.
      3. Collect the boundary vertices ``W_alpha`` and interpolate linearly
         between them to obtain an envelope.
      4. Smooth the envelope with a degree-2 loess of span ``d``.
      5. In every window between neighbouring vertices keep the points
         whose ratio to the smoothed envelope reaches the window's
         ``q`` quantile.
      6. Fit a second loess through the kept points and divide the
         (rescaled) intensities by it.

    Args:
        order: pandas DataFrame with two columns, wavelength and intensity.
            It is modified in place: the columns are renamed to ``wv`` and
            ``intens``, ``intens`` is rescaled and a ``select`` column is
            added. Row labels are assumed to be 0..n-1 -- TODO confirm
            with callers.
        a: divisor of the wavelength range that defines ``alpha``.
        q: quantile used when selecting points in each window.
        d: span of both loess fits.

    Returns:
        The rescaled intensities divided by the final loess prediction.
    """
    order.columns = ["wv", "intens"]
    # n is the number of pixels.
    n = order.shape[0]
    # ref is the wavelength column; find_vertices searches it.
    ref = order["wv"]
    # Step 1: scale the intensity vector by u.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u

    alpha = (order["wv"].max() - order["wv"].min()) / a

    # Step 2: alpha shape of all sample points (lambda_i, y_i).
    points = list(zip(order["wv"].values, order["intens"].values))
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    def find_vertices(polygon):
        # Map every exterior coordinate to the label of the first row whose
        # wavelength equals the coordinate's x value.
        return [ref[ref == xy[0]].index[0] for xy in polygon.exterior.coords]

    # A single polygon gives one loop; a multi-part geometry gives one loop
    # per part. Multi-part geometries are not iterable in Shapely >= 2.0,
    # so go through ``.geoms`` when it exists.
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        polygons = [alpha_shape]
    else:
        polygons = list(getattr(alpha_shape, "geoms", alpha_shape))
    loops = [find_vertices(polygon) for polygon in polygons]

    # Step 3a: W_alpha. From each loop keep the run of vertices going from
    # the smallest to the largest index (presumably the upper boundary).
    Wa = []
    for loop in loops:
        # The exterior ring repeats its first point at the end; drop it,
        # and drop the last pixel.
        as_k = [i for i in loop[:-1] if i < n - 1]
        min_k = min(as_k)
        max_k = max(as_k)
        if not (as_k[0] == min_k and as_k[-1] == max_k):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if index_min < index_max:
                as_k = as_k[index_min:index_max + 1]
            else:
                as_k = as_k[index_min:] + as_k[:index_max + 1]
        Wa.extend(as_k)
    Wa.sort()

    # Step 3b: envelope tilde(AS_alpha). Work on a private copy of the
    # intensities: element-wise chained assignment into a DataFrame column
    # does not write through under pandas Copy-on-Write.
    wv = order["wv"].values
    env = order["intens"].values.copy()
    pos = 0  # position in Wa of the first vertex lying right of pixel i
    for i in range(n - 1):
        while pos < len(Wa) and Wa[pos] <= i:
            pos += 1
        if pos == len(Wa):
            # No vertex to the right: the remaining pixels stay untouched.
            break
        left = Wa[pos - 1]
        right = Wa[pos]
        env[i] = env[left] + (env[right] - env[left]) * (
            (wv[i] - wv[left]) / (wv[right] - wv[left]))

    # Step 4: second-order local polynomial (R loess) through the envelope.
    x = robjects.FloatVector(list(wv))
    y = robjects.FloatVector(list(env))
    df = robjects.DataFrame({"x": x, "y": y})
    # ``surface`` is not a formal argument of loess itself; it is forwarded
    # through ``...``.
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)
    # order["select"] holds hat(y^(1)): intensity over smoothed envelope.
    order["select"] = order["intens"].values / B1

    # Step 5: S_alpha,q -- in each window spanned by neighbouring vertices
    # keep the rows at or above the window's q quantile of "select".
    selected = []
    for left, right in zip(Wa[:-1], Wa[1:]):
        segment = order.loc[left:right]
        threshold = np.quantile(segment["select"], q)
        selected.extend(segment[segment["select"] >= threshold].index)
    # np.unique also sorts.
    index = np.unique(selected)

    # Step 6: final loess through the selected points.
    x_2 = robjects.FloatVector(list(order.iloc[index]["wv"].values))
    y_2 = robjects.FloatVector(list(order.iloc[index]["intens"].values))
    df2 = robjects.DataFrame({"x_2": x_2, "y_2": y_2})
    loess_fit2 = r.loess("y_2 ~ x_2",
                         data=df2,
                         degree=2,
                         span=d,
                         surface="direct")
    y_final = r.predict(loess_fit2, x)
    # Blaze-removed spectrum.
    return order["intens"].values / y_final
示例#6
0
def ALSFS(order, led, a=6, q=0.95, d=0.25):
    """Remove the blaze function of one order using a lamp (LED) spectrum.

    Steps, as implemented below:
      1. Rescale the intensities by ``u`` (wavelength range / 10 / maximum
         intensity).
      2. Build the alpha shape of the (wavelength, intensity) points with
         ``alpha = wavelength range / a``.
      3. Collect the boundary vertices ``W_alpha``, interpolate linearly
         between them and smooth the envelope with a degree-2 loess of
         span ``d``.
      4. In every window between neighbouring vertices keep the points
         whose ratio to the smoothed envelope reaches both the window's
         ``q`` quantile and the global ``1 - 2 * (1 - q)`` quantile.
      5. Fit ``b0 + b1 * led_intens + b2 * led_wv`` to the kept points by
         minimising the squared relative deviation and divide the
         (rescaled) intensities by the fitted blaze.

    Args:
        order: pandas DataFrame with two columns, wavelength and intensity.
            It is modified in place: the columns are renamed to ``wv`` and
            ``intens``, ``intens`` is rescaled and a ``select`` column is
            added. Row labels are assumed to be 0..n-1 -- TODO confirm
            with callers.
        led: pandas DataFrame with ``wv`` and ``intens`` columns; its
            ``intens`` column is rescaled in place. Rows are presumably
            aligned with ``order`` -- verify against the caller.
        a: divisor of the wavelength range that defines ``alpha``.
        q: quantile used when selecting points.
        d: span of the loess fit.

    Returns:
        The rescaled intensities divided by the fitted blaze function.
    """
    # NOTE: process-wide pandas setting, left as in the original; it stays
    # in effect after this function returns.
    pd.options.mode.chained_assignment = None

    order.columns = ["wv", "intens"]
    # n is the number of pixels.
    n = order.shape[0]
    # ref is the wavelength column; find_vertices searches it.
    ref = order["wv"]
    # Step 1: scale the intensity vector by u.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u

    alpha = (order["wv"].max() - order["wv"].min()) / a

    # Step 2: alpha shape of all sample points (lambda_i, y_i).
    points = list(zip(order["wv"].values, order["intens"].values))
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    def find_vertices(polygon):
        # Map every exterior coordinate to the label of the first row whose
        # wavelength equals the coordinate's x value.
        return [ref[ref == xy[0]].index[0] for xy in polygon.exterior.coords]

    # A single polygon gives one loop; a multi-part geometry gives one loop
    # per part. Multi-part geometries are not iterable in Shapely >= 2.0,
    # so go through ``.geoms`` when it exists.
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        polygons = [alpha_shape]
    else:
        polygons = list(getattr(alpha_shape, "geoms", alpha_shape))
    loops = [find_vertices(polygon) for polygon in polygons]

    # W_alpha: from each loop keep the run of vertices going from the
    # smallest to the largest index (presumably the upper boundary).
    Wa = []
    for loop in loops:
        # The exterior ring repeats its first point at the end; drop it,
        # and drop the last pixel.
        as_k = [i for i in loop[:-1] if i < n - 1]
        min_k = min(as_k)
        max_k = max(as_k)
        if not (as_k[0] == min_k and as_k[-1] == max_k):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if index_min < index_max:
                as_k = as_k[index_min:index_max + 1]
            else:
                as_k = as_k[index_min:] + as_k[:index_max + 1]
        Wa.extend(as_k)
    Wa.sort()

    # Envelope tilde(AS_alpha). Work on a private copy of the intensities:
    # element-wise chained assignment into a DataFrame column does not
    # write through under pandas Copy-on-Write.
    wv = order["wv"].values
    env = order["intens"].values.copy()
    pos = 0  # position in Wa of the first vertex lying right of pixel i
    for i in range(n - 1):
        while pos < len(Wa) and Wa[pos] <= i:
            pos += 1
        if pos == len(Wa):
            # No vertex to the right: the remaining pixels stay untouched.
            break
        left = Wa[pos - 1]
        right = Wa[pos]
        env[i] = env[left] + (env[right] - env[left]) * (
            (wv[i] - wv[left]) / (wv[right] - wv[left]))

    # Second-order local polynomial (R loess) through the envelope.
    x = robjects.FloatVector(list(wv))
    y = robjects.FloatVector(list(env))
    df = robjects.DataFrame({"x": x, "y": y})
    # ``surface`` is not a formal argument of loess itself; it is forwarded
    # through ``...``.
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)
    # order["select"] holds hat(y^(1)): intensity over smoothed envelope.
    order["select"] = order["intens"].values / B1

    # Q_{2q-1}: global lower bound for the per-window thresholds.
    Q = np.quantile(order["select"], 1 - (1 - q) * 2)

    # In each window spanned by neighbouring vertices keep the rows at or
    # above max(window q quantile, Q).
    selected = []
    for left, right in zip(Wa[:-1], Wa[1:]):
        segment = order.loc[left:right]
        threshold = max(np.quantile(segment["select"], q), Q)
        selected.extend(segment[segment["select"] >= threshold].index)
    # np.unique also sorts.
    index = np.unique(selected)

    # Linear transformation of the lamp spectrum fitted on the kept points.
    m = len(index)
    # Bring the lamp intensities to the scale of the order.
    led["intens"] = led["intens"] / np.max(led["intens"].values) * np.max(
        order["intens"].values)
    # Explicit copy so the added column never targets a slice of ``led``.
    Xnew = led.iloc[index].copy()
    Xnew["constants"] = np.ones(m)
    Xnew = Xnew.reindex(columns=["constants", "intens", "wv"])
    v1 = order.iloc[index]["intens"].values
    m1 = Xnew.values
    start = np.array([0, 1, 0])

    def objective(coef):
        # Sum of squared relative deviations of the data from the model.
        return np.sum(
            np.square(np.divide(v1, np.matmul(m1, coef)) - np.ones(m)))

    param = minimize(objective, start).x
    B2 = (param[1] * led["intens"].values + param[2] * led["wv"].values
          + param[0])

    return order["intens"].values / B2
示例#7
0
def meanVar(_files, _gff_file, _output):
    """Count reads per gene, fit a mean/variance trend and simulate counts.

    Genes for which more than one distinct exon ``Parent`` was seen are
    collected from the annotation. Every input in ``_files`` is counted
    with ``countbam``, the raw counts are written to
    ``<_output>.rawcounts``, a loess curve of variance on mean is fitted
    in R, and ``NREPS`` negative-binomial counts per gene are drawn and
    written to ``<_output>.nbcounts``.

    Args:
        _files: sequence of inputs handed to ``HTSeq.BAM_Reader``.
        _gff_file: iterable of annotation features exposing ``type``,
            ``name``, ``iv`` and ``attr``.
        _output: path prefix of the two output files.

    Returns:
        dict mapping each gene id to its list of ``NREPS`` simulated counts.

    Exits the process with status 1 when fewer than two inputs are given.
    """
    NFILE = len(_files)
    # The variance needs at least two samples; this also rejects an empty
    # list, which the previous ``== 1`` test let through.
    if NFILE < 2:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    _dict_counts = dict()  # gene id -> one count per input file
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    count = 0
    transcript = set()
    cur_line = None
    for feature in _gff_file:
        if feature.type in GENE:
            # A new gene line closes the previous gene.
            if cur_line is not None and len(transcript) > 1:
                _dict_counts[cur_line.name] = [0] * NFILE
                _genes[cur_line.iv] += cur_line.name
                count += 1
            cur_line = feature
            transcript.clear()
        if feature.type in EXON:
            transcript.add(feature.attr["Parent"])
    # Close the last gene after the loop. The former ``lines == num_lines``
    # test ran before the final exon was added and depended on a global
    # line count.
    if cur_line is not None and len(transcript) > 1:
        _dict_counts[cur_line.name] = [0] * NFILE
        _genes[cur_line.iv] += cur_line.name
        count += 1
    # Same bytes as the former Python 2 print statement.
    sys.stdout.write("Number of genes %d\n" % count)

    # Both outputs are opened before the expensive counting so that an
    # unwritable path fails early; ``with`` closes them on every exit path.
    with open(_output + '.nbcounts', 'w') as _file_nb_count:
        with open(_output + '.rawcounts', 'w') as _file_raw_count:
            for idx, f in enumerate(_files):
                bam_file = HTSeq.BAM_Reader(f)
                _dict_counts = countbam(bam_file, _genes, _dict_counts, idx)
                sys.stderr.write("Library %d has generated.\n" % (idx + 1))

            # One fixed ordering keeps the files, the mean/variance vectors
            # and the simulation loop aligned.
            gene_ids = sorted(_dict_counts)
            for key in gene_ids:
                _file_raw_count.write(
                    key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")

        # Per-gene mean and variance across the input libraries.
        list_mean = [np.mean(np.array(_dict_counts[key])) for key in gene_ids]
        list_var = [np.var(np.array(_dict_counts[key])) for key in gene_ids]

        # Loess fit of variance on mean, restricted to genes with mean > 0.
        a = robjects.FloatVector(list_mean)
        b = robjects.FloatVector(list_var)
        df = robjects.DataFrame({"mean": a, "var": b})
        non0_df = df.rx(df.rx2("mean").ro > 0, True)
        loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
        var_pred = r.predict(loess_fit, a)

        # Replace the observed counts by simulated ones.
        for count_idx, key in enumerate(gene_ids):
            mean = list_mean[count_idx]
            excess = var_pred[count_idx] - mean
            # ``excess > 0`` is False for NaN as well (e.g. an NA prediction),
            # which previously crashed in int(); a zero excess used to divide
            # by zero. Both now yield all-zero counts, like n <= 0.
            if excess > 0:
                n = int(math.pow(mean, 2) / excess)  # n: number of failures
            else:
                n = 0
            if n <= 0:
                _dict_counts[key] = [0] * NREPS
            else:
                p = n / float(n + mean)  # p: prob of success
                _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()

        for key in gene_ids:
            _file_nb_count.write(
                key + "\t" + "\t".join(map(str, _dict_counts[key])) + "\n")
    return _dict_counts