def bigdata_mse(request, input_dict, output_dict, widget):
    """Render the mean squared error (MSE) of discomll predictions.

    The computed score is cached in MEDIA_ROOT/discomll_measures/<tag>.txt
    so the (expensive) MapReduce measure job runs at most once per
    predictions tag.

    :param request: Django HttpRequest.
    :param input_dict: widget inputs; needs "dataset" and "predictions".
    :param output_dict: widget outputs (passed through to the template).
    :param widget: the widget instance being rendered.
    :return: rendered 'visualizations/display_string.html' response.
    """
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    # tag[0] carries a 6-character scheme prefix (e.g. "disco:") that is
    # stripped to build the cache file name -- TODO confirm prefix format.
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)

    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        # BUG FIX: the original wrote only the last iterated value to the
        # cache (and raised NameError when the iterator was empty); cache
        # every value so the cached display matches the fresh one.
        values = [str(v) for _, v in result_iterator(results)]
        score = "\n".join(values) + "\n" if values else ""
        input_dict["string"] = "Mean squared error\n" + score
        with open(destination, 'w') as f:
            f.write(score)
    else:  # mse results are cached
        # f.read() (not readlines()[0]) stays compatible with both old
        # single-value caches and the multi-value caches written above.
        with open(destination, 'r') as f:
            input_dict["string"] = "Mean squared error\n" + f.read()

    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def bigdata_ca(request, input_dict, output_dict, widget):
    """Render the classification accuracy of discomll predictions.

    The score is cached under MEDIA_ROOT/discomll_measures/ so the
    MapReduce measure job is executed at most once per predictions tag.

    :param request: Django HttpRequest.
    :param input_dict: widget inputs; needs "dataset" and "predictions".
    :param output_dict: widget outputs (passed through to the template).
    :param widget: the widget instance being rendered.
    :return: rendered 'visualizations/display_string.html' response.
    """
    from discomll.utils import accuracy
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    tag = input_dict["predictions"]
    # tag[0] carries a 6-character scheme prefix that is stripped here.
    destination = "%s/discomll_measures/%s.txt" % (MEDIA_ROOT, tag[0][6:])
    ensure_dir(destination)

    test_set = input_dict["dataset"]
    if test_set.params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    else:
        header = "Classification Accuracy \n"
        if os.path.isfile(destination):  # ca results are cached
            f = open(destination, 'r')
            input_dict["string"] = header + str(f.readlines()[0])
            f.close()
        else:  # first run: compute the score and cache it
            measure, acc = accuracy.measure(test_data=test_set,
                                            predictions=input_dict["predictions"],
                                            measure="ca")
            score = str(measure) + " " + str(acc) + "\n"
            input_dict["string"] = header + score
            f = open(destination, 'w')
            f.write(score)
            f.close()

    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def bigdata_mse(request, input_dict, output_dict, widget):
    """Render the mean squared error (MSE) of discomll predictions.

    Scores are cached in MEDIA_ROOT/discomll_measures/<tag>.txt so the
    MapReduce measure job runs only once per predictions tag.

    :param request: Django HttpRequest.
    :param input_dict: widget inputs; needs "dataset" and "predictions".
    :param output_dict: widget outputs (passed through to the template).
    :param widget: the widget instance being rendered.
    :return: rendered 'visualizations/display_string.html' response.
    """
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    # tag[0] has a 6-character scheme prefix (e.g. "disco:") which is
    # dropped when building the cache path -- TODO confirm prefix format.
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)

    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        # BUG FIX: the loop previously left only the final value in `v`,
        # so f.write(str(v)) cached a single value (and raised NameError
        # for an empty result set). Persist every value instead.
        values = [str(v) for _, v in result_iterator(results)]
        score = "\n".join(values) + "\n" if values else ""
        input_dict["string"] = "Mean squared error\n" + score
        with open(destination, 'w') as f:
            f.write(score)
    else:
        # Read the whole cache file: compatible with both legacy
        # single-value caches and the multi-value caches written above.
        with open(destination, 'r') as f:
            input_dict["string"] = "Mean squared error\n" + f.read()

    return render(request, 'visualizations/display_string.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def bigdata_ca(request, input_dict, output_dict, widget):
    """Render the classification accuracy of discomll predictions.

    Results are cached on disk under MEDIA_ROOT/discomll_measures/ so the
    measure job is only run once per predictions tag.

    :param request: Django HttpRequest.
    :param input_dict: widget inputs; needs "dataset" and "predictions".
    :param output_dict: widget outputs (passed through to the template).
    :param widget: the widget instance being rendered.
    :return: rendered 'visualizations/display_string.html' response.
    """
    from discomll.utils import accuracy
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    tag = input_dict["predictions"]
    # Strip the 6-character scheme prefix from the tag for the file name.
    cache_path = "/".join([MEDIA_ROOT, 'discomll_measures', tag[0][6:]]) + '.txt'
    ensure_dir(cache_path)

    header = "Classification Accuracy \n"
    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif os.path.isfile(cache_path):  # ca results are cached
        f = open(cache_path, 'r')
        input_dict["string"] = header + str(f.readlines()[0])
        f.close()
    else:  # compute the score once and cache it
        measure, acc = accuracy.measure(test_data=input_dict["dataset"],
                                        predictions=input_dict["predictions"],
                                        measure="ca")
        score = str(measure) + " " + str(acc) + "\n"
        input_dict["string"] = header + score
        f = open(cache_path, 'w')
        f.write(score)
        f.close()

    return render(request, 'visualizations/display_string.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
from disco.core import result_iterator from discomll import dataset from discomll.ensemble import distributed_random_forest from discomll.utils import model_view from discomll.utils import accuracy train = dataset.Data(data_tag=[ ["http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"]], id_index=0, X_indices=xrange(1, 10), X_meta="http://ropot.ijs.si/data/datasets_meta/breastcancer_meta.csv", y_index=10, delimiter=",") fit_model = distributed_random_forest.fit(train, trees_per_chunk=3, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5, class_majority=1, measure="info_gain", accuracy=1, separate_max=True, random_state=None, save_results=True) print model_view.output_model(fit_model) # predict training dataset predictions = distributed_random_forest.predict(train, fit_model) # output results for k, v in result_iterator(predictions): print k, v # measure accuracy ca = accuracy.measure(train, predictions) print ca
# define test dataset test = dataset.Data( data_tag=["test:breast_cancer_disc_test"], data_type="chunk", X_indices=xrange(1, 10), id_index=0, y_index=10, delimiter=",", y_map=["2", "4"], # define mapping parameter. "2" is mapped to 1, "4" is mapped to -1. missing_vals=["?"], ) # define missing value symbol # fit model on training dataset fit_model = linear_svm.fit(train) # output model model = model_view.output_model(fit_model) print model # start MR job to predict given test data predictions = linear_svm.predict(test, fit_model) # output results for k, v in result_iterator(predictions): print k, v[0] # measure accuracy ca = accuracy.measure(test, predictions) print ca
]], id_index=0, X_indices=xrange(1, 10), X_meta="http://ropot.ijs.si/data/datasets_meta/breastcancer_meta.csv", y_index=10, delimiter=",") fit_model = distributed_weighted_forest_rand.fit(train, trees_per_chunk=3, max_tree_nodes=50, min_samples_leaf=5, min_samples_split=10, class_majority=1, measure="info_gain", num_medoids=10, accuracy=1, separate_max=True, random_state=None, save_results=True) # predict training dataset predictions = distributed_weighted_forest_rand.predict(train, fit_model) # output results for k, v in result_iterator(predictions): print k, v[0] # measure accuracy ca = accuracy.measure(train, predictions) print ca
# define test dataset test = dataset.Data(data_tag=["test:breast_cancer_disc_test"], data_type="chunk", X_indices=xrange(1, 10), X_meta=["d" for i in xrange(1, 10)], id_index=0, y_index=10, delimiter=",", y_map=["2", "4"], # define mapping parameter. "2" is mapped to 1, "4" is mapped to -1. missing_vals=["?"]) # define missing value symbol # fit model on training dataset fit_model = naivebayes.fit(train) # output model model = model_view.output_model(fit_model) print model # start MR job to predict given test data predictions = naivebayes.predict(test, fit_model) # output results for k, v in result_iterator(predictions): print k, v[0] # measure accuracy ca = accuracy.measure(test, predictions) print ca