Example #1
    def predict(self, X=None):
        """
        Predicts values for the given input features
        
        Args:
            n_trees (int):              Number of trees to train within the forest.
            max_features (int or None): The number of features to consider when looking for the best split. 
                                        Translated to mtry for the underlying R call.
                                        If None, then max_features=n_features.
            maxsurrogates (int or None):number of surrogate splits to evaluate. Note the currently only surrogate splits
                                        in ordered covariables are implemented.
                                        If None, all possible surrogates will be considered.
            debug (bool):               If true, display the types of the R dataframe as converted within
                                        the fit method.
        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if self.r_model is None:
            raise Exception('Model must be trained first.')

        if X is None:
            r_rtn = r.predict(self.r_model, OOB=True)

        else:
            r_X = pandas2ri.py2ri(X)
            r_rtn = r.predict(self.r_model, newdata=r_X)

        if self.forest_type_is_classification:
            # r_rtn is an R factor: map its 1-based level codes back to labels
            rtn = np.asarray([r_rtn.levels[code - 1] for code in r_rtn])
        else:
            rtn = np.asarray(list(r_rtn))

        return rtn
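The argument list in the original docstring (n_trees, max_features, max_surrogates, debug) describes the estimator's constructor rather than predict, so it has been moved out of the method documentation above. A hypothetical usage sketch follows; the class name CForest is an assumption, while fit() is the training method the original docstring refers to:

# Hypothetical usage; the wrapper class name (CForest) is assumed, but the
# constructor arguments follow the docstring that originally sat on predict().
import pandas as pd

X_train = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [0.1, 0.2, 0.3, 0.4]})
y_train = pd.Series([0, 1, 0, 1])

model = CForest(n_trees=100, max_features=None, debug=False)
model.fit(X_train, y_train)    # fit() is referenced by the original docstring
print(model.predict(X_train))  # predictions for new data
print(model.predict())         # X=None -> out-of-bag predictions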
Example #2
def method_spline(rvar, train, test):
    """ B-splines with interaction """
    print("Splines")
    formula = rvar + ' ~ bs(OverallRank, df=6) + treat + '\
              'treat:bs(OverallRank, df=6) - 1'
    if rvar == 'Tuition':
        formula = formula + ' + year'
    model = r.lm(formula, data=train)
    #print(r.summary(model).rx2('coefficients'))
    print(r.summary(model).rx2('r.squared'))
    #print(r.summary(model))
    analytics(rvar, 'Training', train[rvar],
              np.array(r.predict(model)))
    if rvar != "UndergraduatemedianGPA":
        analytics(rvar, 'Testing', test[rvar],
                  np.array(r.predict(model, newdata=test)))
    print()
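One caveat: the bs() term in the formula is evaluated on the R side and lives in R's splines package, which is not attached by default, so it must be loaded before r.lm sees the formula (the same applies to spline_est in Example #12). A minimal sketch:

# bs() comes from R's splines package; attach it before calling r.lm.
from rpy2.robjects.packages import importr
splines = importr("splines")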
Example #3
def R_run_loess(x, y, span=0.75):
    """
    Predict y as function of x. Takes two numpy vectors.
    """
    # Ensure that null/Inf values are replaced with R's NA before fitting
    x[utils.where_null(x)] = robj.NA_Real
    y[utils.where_null(y)] = robj.NA_Real
    data = robj.DataFrame({"x": x, "y": y})
    loess_fit = r.loess("y ~ x", data=data, span=span, family="symmetric")
    correction_factor = np.array(list(r.predict(loess_fit, x)))
    corrected_y = \
        np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
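A hypothetical usage sketch, assuming rpy2's numpy converter is active so that numpy arrays pass directly into robj.DataFrame and r.predict:

# Hypothetical usage of R_run_loess on synthetic data.
import numpy as np
from rpy2.robjects import numpy2ri
numpy2ri.activate()  # older rpy2 style of enabling numpy conversion globally

x = np.linspace(0.0, 10.0, 200)
y = np.sin(x) + 0.1 * x
corrected_y, trend = R_run_loess(x, y, span=0.5)
print(corrected_y[:5], trend[:5])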
Example #4
def get_team_salary():
    conference = request.args.get("conference")
    division = request.args.get("division")
    qb = int(request.args.get("qb"))
    my_test = r['data.frame'](CONF=conference, DIV=division, QB=qb)
    print(my_test)
    salary = r.predict(my_model, my_test)
    print(salary)
    return jsonify({
        'conference': conference,
        'division': division,
        'qb': qb,
        'salary': salary[0]
    })
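This function reads as a Flask view: request and jsonify come from Flask, and my_model is presumably the fitted lm from the training script in Example #8. A minimal scaffolding sketch; the /salary route and app setup here are assumptions:

# Assumed Flask scaffolding for the view above; the /salary path is hypothetical.
from flask import Flask, request, jsonify

app = Flask(__name__)
app.add_url_rule("/salary", "get_team_salary", get_team_salary)

if __name__ == "__main__":
    app.run(debug=True)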
Example #5
def run_loess(x, y, span=0.75):
    """
    Predict y as function of x. Takes two numpy vectors.
    """
    # Ensure that NA-like values are replaced with R's NA before fitting
    x[where_na_like(x)] = robj.NA_Real
    y[where_na_like(y)] = robj.NA_Real
    data = robj.DataFrame({"x": x, "y": y})
    loess_fit = r.loess("y ~ x", data=data, span=span,
                        family="symmetric")
    correction_factor = np.array(list(r.predict(loess_fit, x)))
    corrected_y = \
        np.array(list(y)) - correction_factor
    return corrected_y, correction_factor
Example #6
def rloess_smooth(xdata, ydata, xfit, span=0.07, deg=2):
    """
    Use rpy2 to call R's loess function from Pthon

    Input: x, y numpy array of data points
           xfit, numpy array of data at which yfit it to be computed
           lspan, span of the fitting
           ldeg, deg of the polynomial used for fitting

    Output: x, yfit numpy array of x and smoothed y
    """
    # x = ro.FloatVector(list(xdata))
    # y = ro.FloatVector(list(ydata))
    # x_fit = ro.FloatVector(list(xfit))
    rdf = ro.DataFrame({"x": xdata, "y": ydata})
    loess_fit = r.loess('y ~ x', data=rdf, span=span, degree=deg)  # 'degree' is loess()'s argument name
    yfit = np.array(list(r.predict(loess_fit, xfit)))
    return yfit
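A hypothetical usage sketch for rloess_smooth, again assuming the numpy-to-R converter is active as in the earlier example:

# Hypothetical usage: smooth noisy samples onto a finer grid.
import numpy as np

xdata = np.linspace(0.0, 1.0, 100)
ydata = xdata ** 2 + np.random.normal(scale=0.01, size=xdata.size)
xfit = np.linspace(0.0, 1.0, 500)
yfit = rloess_smooth(xdata, ydata, xfit, span=0.2, deg=2)
print(yfit[:5])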
Example #7
def meanVar(_files, _gff_file , _output):


	NFILE=len(_files)
	if NFILE == 1:
		sys.stderr.write("Need at least two samples for each group.\n")
		sys.exit(1)
	#####

	_dict_counts = dict() ## dictionary of gene counts
	_genes = HTSeq.GenomicArrayOfSets("auto",stranded=False)
	idx=0
	if MODE == "all-genes":
		for feature in _gff_file:
			if feature.type in GENE:
				_dict_counts[ feature.name ] = [0]*NFILE
				_genes[feature.iv] += feature.name
			if feature.type in TX:
                                if feature.attr["geneID"] not in _dict_counts:
				    _dict_counts[feature.attr["geneID"]] = [0]*NFILE
				    _genes[feature.iv] += feature.attr["geneID"]
	if MODE == "AS-genes":
		## Bug: Does not report last gene in gff if it has at least two transcript
		transcript= set()
		cur_line = None
                last_gene_id = None
		for feature in _gff_file:
			if feature.type in GENE:
				if len(transcript) >1:
					_dict_counts[ cur_line.name ] = [0]*NFILE
					_genes[cur_line.iv] += cur_line.name
				cur_line = feature
				transcript.clear()
                        if feature.type in TX:
                            key = None
                            if "geneID" in feature.attr:
                                key = "geneID"
                            elif "Parent" in feature.attr:
                                key = "Parent"
                            else:
                                sys.stderr.write("transcript line does not have Parent or geneID field\n")

                            if last_gene_id == feature.attr[key]: 
                                transcript.add(feature.attr["ID"])
                            else:
                                if len(transcript) > 1:
                                    if feature.attr[key] not in _dict_counts:
					_dict_counts[feature.attr[key]] = [0]*NFILE
					_genes[feature.iv] +=  feature.attr[key]
                                transcript.clear()
                                transcript.add(feature.attr["ID"])
                                last_gene_id = feature.attr[key]
			if feature.type in EXON:
				transcript.add(feature.attr["Parent"])
        print "num of genes to simulate: ", len(_dict_counts) 
	_file_raw_count = open(_output+'.rawcounts','w')
	_file_nb_count = open(_output+'.nbcounts','w')
	## This loop reads through the input list and calls countSam for each input file
	for f in _files:
		sam_file=HTSeq.SAM_Reader(f)
		_dict_counts=countSam(sam_file, _genes,_dict_counts, idx)
		f.close()
		idx += 1
		sys.stderr.write("library %d has generated.\n" % idx)
	## Print raw counts in file specified by <out>
	for key, value in sorted(_dict_counts.iteritems()):
		_file_raw_count.write(key+"\t"+"\t".join(map(str,value))+"\n")
	_file_raw_count.close()
	## calculate group mean and variance
	list_mean = list()
	list_var = list()
	for key, value in sorted(_dict_counts.iteritems()):
		list_mean.append(np.mean(np.array(value)))
		list_var.append(np.var(np.array(value)))
	
	## Compute loess estimates
	## The following code uses the rpy2 module
	a = robjects.FloatVector(list_mean)
	b = robjects.FloatVector(list_var)
	df = robjects.DataFrame({"mean": a, "var": b})
	non0_df=df.rx(df.rx2("mean").ro > 0, True) ## subsetting if mean > 0
	loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)
	'''
	# goodness-of-fit test:
	variance = r.predict(loess_fit, 1000)
	print variance[0]
	print (1000*1000)/(variance[0]-1000)
	'''
	var_pred = r.predict(loess_fit, a)
	# This loop overwrites _dict_counts, recording the new count data
	count_idx = 0

	for key, value in sorted(_dict_counts.iteritems()):
		n = math.pow(list_mean[count_idx],2)/(var_pred[count_idx]-list_mean[count_idx])
		n = int(n) # n: number of failures
		if n<=0:
			_dict_counts[key] = [0]*NREPS
		else:
			p = n/float(n+list_mean[count_idx]) # p: prob of success
			_dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()
		count_idx += 1
	for key, value in sorted(_dict_counts.iteritems()):
		_file_nb_count.write(key+"\t"+"\t".join(map(str,value))+"\n")
	_file_nb_count.close()
	return _dict_counts
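The final loop converts each gene's sample mean mu and loess-predicted variance var into scipy's (n, p) negative-binomial parameterization via n = mu^2 / (var - mu) and p = n / (n + mu). A quick sanity check of that algebra with illustrative numbers:

# Check that scipy's nbinom(n, p) reproduces the target mean and variance.
from scipy.stats import nbinom

mu, var = 50.0, 200.0
n = mu ** 2 / (var - mu)  # n: number of failures
p = n / (n + mu)          # p: probability of success
print(nbinom.stats(n, p, moments="mv"))  # -> (approximately 50.0, 200.0)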
Example #8
# Import R packages
from rpy2.robjects import r
from rpy2.robjects.packages import importr
stats = importr("stats")
base = importr('base')

# Ask R to read the CSV data
team_data = r['read.csv']('data/nfl-teams.csv')
print (team_data)

# Do linear regression on categorical data
my_model = stats.lm("TOTAL ~ .", data=team_data)
print (base.summary(my_model))

# Get input from the user
conference, division, qb = input("Enter conference, division and QB salary ").split()
print (conference, division, qb)

my_test = r['data.frame'](CONF=conference, DIV=division, QB=int(qb))
print(my_test)

print (r.predict(my_model,my_test))
Example #9
def AFS(order, a=6, q=0.95, d=0.25):
    # Default value of q and d are 0.95 and 0.25.
    # Change the column names and format of the dataset.
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    # ref is a pandas series recording wavelength
    ref = order["wv"]
    # Variable u is the parameter u in step 1 of the AFS algorithm. It scales the intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u

    # Let alpha be 1/6 of the wavelength range of the whole order.
    alpha = (order["wv"].max() - order["wv"].min()) / a

    # This chunk of code detects loops in the boundary of the alpha shape.
    # Usually there is only one loop (polygon).
    # Variable loops is a list.
    # The indices of the k-th loop are recorded in the k-th element of loops.
    loops = []
    # Variable points is a list that represents all the sample points (lambda_i, y_i)
    points = [(order["wv"][i], order["intens"][i])
              for i in range(order.shape[0])]
    #tl=time()
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    #th=time()
    # print("alphashape function takes ", th-tl)

    # Input variables:
    # polygon: shapely polygon object
    # Return variable:
    # indices: a list recording the indices of the vertices in the polygon
    def find_vertices(polygon):
        coordinates = list(polygon.exterior.coords)
        return [
            ref[ref == coordinates[i][0]].index[0]
            for i in range(len(coordinates))
        ]

    # If alpha_shape is just a polygon, there is only one loop.
    # If alpha_shape is a multi-polygon, we iterate over it and find all the loops.
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        temp = find_vertices(alpha_shape)
        loops.append(temp)

    else:
        for polygon in alpha_shape:
            temp = find_vertices(polygon)
            loops.append(temp)

    # Use the loops to get the set W_alpha.
    # Variable Wa is a vector recording the indices of points in W_alpha.
    Wa = [0]
    for loop in loops:
        temp = loop[:-1]
        temp = [i for i in temp if (i < n - 1)]
        max_k = max(temp)
        min_k = min(temp)
        len_k = len(temp)
        as_k = temp
        if not (as_k[0] == min_k and as_k[len_k - 1] == max_k):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if (index_min < index_max):
                as_k = as_k[index_min:(index_max + 1)]
            else:
                as_k = as_k[index_min:] + as_k[0:(index_max + 1)]

        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # AS is an n by 2 matrix recording tilde(AS_alpha). Each row is the wavelength and intensity of one pixel.
    AS = order.copy()
    for i in range(n - 1):
        indices = [m for m, v in enumerate(Wa) if v > i]
        if (len(indices) != 0):
            index = indices[0]
            a = Wa[index - 1]
            b = Wa[index]
            AS["intens"][i] = AS["intens"][a] + (
                AS["intens"][b] - AS["intens"][a]) * (
                    (AS["wv"][i] - AS["wv"][a]) / (AS["wv"][b] - AS["wv"][a]))
        else:
            # AS=AS.drop(list(range(i, n)))
            break

    # Run a local polynomial on tilde(AS_alpha), as described in step 3 of the AFS algorithm.
    # Use R's loess() to fit a second-order local polynomial.
    # Variable B1 holds the predicted output at the input x values.
    x = AS["wv"].values
    y = AS["intens"].values
    # Convert x and y to R vectors
    x = robjects.FloatVector(list(x))
    y = robjects.FloatVector(list(y))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    B1 = r.predict(loess_fit, x)
    # Add a new column called select to the matrix order.
    # order["select"] records hat(y^(1)).
    select = order["intens"].values / B1

    order["select"] = select
    # Convert the indices in Wa into pairs of small windows.
    # Each row of the variable window is a pair of neighboring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # This chunk of code selects the top q quantile of points in each window.
    # The point indices are recorded in variable index, which is S_alpha, q in step 4
    # of the AFS algorithm.
    index = [0]
    for i in range(window.shape[0]):
        loc_window = window[i, ]
        temp = order.loc[loc_window[0]:loc_window[1]]
        index_i = temp[temp["select"] >= np.quantile(temp["select"], q)].index
        index = index + list(index_i)
    index = np.unique(index[1:])
    index = np.sort(index)

    # Run Loess for the last time
    x_2 = order.iloc[index]["wv"].values
    y_2 = order.iloc[index]["intens"].values
    x_2 = robjects.FloatVector(list(x_2))
    y_2 = robjects.FloatVector(list(y_2))
    df2 = robjects.DataFrame({"x_2": x_2, "y_2": y_2})
    loess_fit2 = r.loess("y_2 ~ x_2",
                         data=df2,
                         degree=2,
                         span=d,
                         surface="direct")
    y_final = r.predict(loess_fit2, x)
    # Return the blaze-removed spectrum.
    result = order["intens"].values / y_final
    return result
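The comment before the first loess call notes that the author had not found a way to specify "control" parameters. R's loess() does accept a control argument built by loess.control(), and that object can be constructed through rpy2; a sketch (untested against this code, argument values illustrative), which applies equally to the loess calls in ALSFS below:

# Sketch: build an R loess.control object and pass it to loess() explicitly,
# reusing the df defined inside AFS. Values here are illustrative assumptions.
from rpy2.robjects import r

ctrl = r["loess.control"](surface="direct", statistics="approximate")
loess_fit = r.loess("y ~ x", data=df, degree=2, span=0.25, control=ctrl)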
Example #10
# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r

e1071 = importr('e1071')
# np2r (a numpy-to-R conversion helper) and Xr (the R feature matrix for
# iris) are assumed to be defined in earlier notebook cells.
Yr = np2r(iris['Type'])
Yr = r.factor(Yr)
svm = e1071.svm(Xr, Yr)
yhat = r.predict(svm, Xr)
print r.table(yhat, Yr)

# <headingcell level=4>

# ggplot2 in python with Rpy2

# <markdowncell>

# Thanks to [Fei Yu](http://www.thefeiyu.com/) for this vignette.

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
Example #12
def spline_est(data, new_data):
    """ Estimate conditional b-splines for value function """
    model = r.lm('val ~ bs(OverallRank, df=4)', data=data)
    return r.predict(model, newdata=new_data)
Example #13
def ALSFS(order, led, a=6, q=0.95, d=0.25):

    pd.options.mode.chained_assignment = None
    # Default value of q and d are 0.95 and 0.25.
    # Change the column names and format of the dataset.
    order.columns = ["wv", "intens"]
    # n records the number of pixels.
    n = order.shape[0]
    ref = order["wv"]
    # Variable u is the parameter u in step 1 of the AFS algorithm. It scales the intensity vector.
    u = (ref.max() - ref.min()) / 10 / order["intens"].max()
    order["intens"] = order["intens"] * u

    # Let alpha be 1/6 of the wavelength range of the whole order.
    alpha = (order["wv"].max() - order["wv"].min()) / a

    # This chunk of code detects loops in the boundary of the alpha shape.
    # Usually there is only one loop (polygon).
    # Variable loops is a list.
    # The indices of the k-th loop are recorded in the k-th element of loops.
    loops = []
    # Variable points is a list that represents all the sample points (lambda_i, y_i)
    points = [(order["wv"][i], order["intens"][i])
              for i in range(order.shape[0])]
    #t1=time()
    alpha_shape = alphashape.alphashape(points, 1 / alpha)

    #t2=time()
    #print('alphashape function takes')
    #print(t2-t1)

    # Input variables:
    # polygon: shapely polygon object
    # Return variable:
    # indices: a list recording the indices of the vertices in the polygon
    def find_vertices(polygon):
        coordinates = list(polygon.exterior.coords)
        return [
            ref[ref == coordinates[i][0]].index[0]
            for i in range(len(coordinates))
        ]

    # If alpha_shape is just a polygon, there is only one loop.
    # If alpha_shape is a multi-polygon, we iterate over it and find all the loops.
    if isinstance(alpha_shape, shapely.geometry.polygon.Polygon):
        temp = find_vertices(alpha_shape)
        loops.append(temp)

    else:
        for polygon in alpha_shape:
            temp = find_vertices(polygon)
            loops.append(temp)

    # Use the loops to get the set W_alpha.
    # Variable Wa is a vector recording the indices of points in W_alpha.
    Wa = [0]
    for loop in loops:
        temp = loop[:-1]
        temp = [i for i in temp if (i < n - 1)]
        max_k = max(temp)
        min_k = min(temp)
        len_k = len(temp)
        as_k = temp
        if not (as_k[0] == min_k and as_k[len_k - 1] == max_k):
            index_max = as_k.index(max_k)
            index_min = as_k.index(min_k)
            if (index_min < index_max):
                as_k = as_k[index_min:(index_max + 1)]
            else:
                as_k = as_k[index_min:] + as_k[0:(index_max + 1)]

        Wa = Wa + as_k
    Wa.sort()
    Wa = Wa[1:]

    # AS is an n by 2 matrix recording tilde(AS_alpha). Each row is the wavelength and intensity of one pixel.
    AS = order.copy()
    for i in range(n - 1):
        indices = [m for m, v in enumerate(Wa) if v > i]
        if (len(indices) != 0):
            index = indices[0]
            a = Wa[index - 1]
            b = Wa[index]
            AS["intens"][i] = AS["intens"][a] + (
                AS["intens"][b] - AS["intens"][a]) * (
                    (AS["wv"][i] - AS["wv"][a]) / (AS["wv"][b] - AS["wv"][a]))
        else:
            # AS=AS.drop(list(range(i, n)))
            break

    # Run a local polynomial on tilde(AS_alpha), as described in step 3 of the AFS algorithm.
    # Use R's loess() to fit a second-order local polynomial.
    # Variable B1 holds the predicted output at the input x values.
    x = AS["wv"].values
    y = AS["intens"].values
    # Convert x and y to R vectors
    x = robjects.FloatVector(list(x))
    y = robjects.FloatVector(list(y))
    df = robjects.DataFrame({"x": x, "y": y})
    # run loess (haven't found a way to specify "control" parameters)
    loess_fit = r.loess("y ~ x", data=df, degree=2, span=d, surface="direct")
    #wv_vec= robjects.FloatVector(list(order["wv"]))
    B1 = r.predict(loess_fit, x)
    # Add a new column called select to the matrix order.
    # order["select"] records hat(y^(1)).
    select = order["intens"].values / B1
    order["select"] = select

    # Calculate Q_2q-1 in step 3 of the ALSFS algorithm.
    Q = np.quantile(order["select"], 1 - (1 - q) * 2)

    # Convert the indices in Wa into pairs of small windows.
    # Each row of the variable window is a pair of neighboring indices in Wa.
    window = np.column_stack((Wa[0:len(Wa) - 1], Wa[1:]))

    # This chunk of code selects the top q quantile of points in each window.
    # The point indices are recorded in variable index, which is S_alpha, q in step 4
    # of the AFS algorithm.
    index = [0]
    for i in range(window.shape[0]):
        loc_window = window[i, ]
        temp = order.loc[loc_window[0]:loc_window[1]]
        temp_q = max(np.quantile(temp["select"], q), Q)
        index_i = temp[temp["select"] >= temp_q].index
        index = index + list(index_i)
    index = np.unique(index[1:])
    index = np.sort(index)

    # The following chunk of code does step 5 of the ALSFS algorithm.
    # The function minimize() is used to optimize the three linear
    # transformation parameters.
    # The final estimate is in variable B2.
    m = len(index)
    led["intens"] = led["intens"] / np.max(led["intens"].values) * np.max(
        order["intens"].values)
    Xnew = led.iloc[index]
    Xnew["constants"] = np.ones(m)
    columnsTitles = ["constants", "intens", "wv"]
    Xnew = Xnew.reindex(columns=columnsTitles)
    order_new = order.iloc[index]
    beta = np.array([0, 1, 0])
    v1 = order_new["intens"].values
    m1 = Xnew.values

    # Define the function to be optimized
    def f(beta):
        return np.sum(
            np.square((np.divide(v1, np.matmul(m1, beta)) - np.ones(m))))

    op_result = minimize(f, beta)
    param = op_result.x
    B2 = param[1] * led["intens"].values + param[2] * led["wv"].values + param[
        0]

    return order["intens"].values / B2
Example #14
def meanVar(_files, _gff_file, _output):

    NFILE = len(_files)
    if NFILE == 1:
        sys.stderr.write("Need at least two samples for each group.\n")
        sys.exit(1)

    ## Dictionary of gene counts
    _dict_counts = dict()
    _genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    idx = 0
    count = 0
    transcript = set()
    cur_line = None
    lines = 0
    for feature in _gff_file:
        lines += 1
        if feature.type in GENE or lines == num_lines:
            if len(transcript) > 1:
                _dict_counts[cur_line.name] = [0] * NFILE
                _genes[cur_line.iv] += cur_line.name
                count += 1
            cur_line = feature
            transcript.clear()
        if feature.type in EXON:
            transcript.add(feature.attr["Parent"])
    print "Number of genes", count
    _file_raw_count = open(_output + '.rawcounts', 'w')
    _file_nb_count = open(_output + '.nbcounts', 'w')
    ## This loop reads through the input list and calls countbam for each input file
    for f in _files:
        bam_file = HTSeq.BAM_Reader(f)
        _dict_counts = countbam(bam_file, _genes, _dict_counts, idx)
        idx += 1
        sys.stderr.write("Library %d has generated.\n" % idx)
    ## Print raw counts in file specified by <out>
    for key, value in sorted(_dict_counts.iteritems()):
        _file_raw_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")
    _file_raw_count.close()
    ## Calculate group mean and variance
    list_mean = list()
    list_var = list()
    for key, value in sorted(_dict_counts.iteritems()):
        list_mean.append(np.mean(np.array(value)))
        list_var.append(np.var(np.array(value)))

    ## Compute loess estimates
    ## The following code uses the rpy2 module
    a = robjects.FloatVector(list_mean)
    b = robjects.FloatVector(list_var)
    df = robjects.DataFrame({"mean": a, "var": b})
    non0_df = df.rx(df.rx2("mean").ro > 0, True)  ## subsetting if mean > 0
    loess_fit = r.loess("var ~ mean", data=non0_df, degree=2)

    var_pred = r.predict(loess_fit, a)
    # This loop overwrites _dict_counts, recording the new count data
    count_idx = 0
    for key, value in sorted(_dict_counts.iteritems()):
        n = math.pow(list_mean[count_idx],
                     2) / (var_pred[count_idx] - list_mean[count_idx])
        n = int(n)  # n: number of failures
        if n <= 0:
            _dict_counts[key] = [0] * NREPS
        else:
            p = n / float(n + list_mean[count_idx])  # p: prob of success
            _dict_counts[key] = nbinom.rvs(n, p, size=NREPS).tolist()
        count_idx += 1
    for key, value in sorted(_dict_counts.iteritems()):
        _file_nb_count.write(key + "\t" + "\t".join(map(str, value)) + "\n")
    _file_nb_count.close()
    return _dict_counts
Example #15
import rpy2.robjects as ro
from rpy2.robjects import r
import json
from pandas.rpy.common import convert_to_r_dataframe
import pandas as pd

fitted_model = r.readRDS("models/model1.RDS")

# (in real life maybe the json comes from the front end):
json_to_predict = '[{"carat":0.23,"cut":"Ideal","color":"E","clarity":"SI2"}]'

# Method 1: convert pandas data frame to R data frame
to_predict_dict = json.loads(json_to_predict)
to_predict_pandas_df = pd.DataFrame(to_predict_dict)
to_predict_R_df = convert_to_r_dataframe(to_predict_pandas_df)

# Make predictions
[prediction] = r.predict(fitted_model, to_predict_R_df)
print("Prediction from pandas DF is %f" % prediction)


# Method 2: Send the JSON to R, convert to dataframe in R
jsonlite = ro.packages.importr("jsonlite")

# This gets you an R dataframe (a bit of a weird default for a fromJSON function, but alas):
to_predict_R_df_from_json = jsonlite.fromJSON(json_to_predict)

# Make predictions
[prediction] = r.predict(fitted_model, to_predict_R_df_from_json)
print("Prediction from JSON is %f" % prediction)