def main(args):
    """Plot average total queue length E[sum Q_ij] vs traffic intensity.

    For every Vs in VLIST, reads the first table from '<Vs>.csv' and draws
    two figures: the full range and a zoomed view of the high-traffic
    regime, each saved as .png/.eps/.svg and shown interactively.
    """
    for vs in VLIST:
        idx = VLIST.index(vs)
        plt.figure(idx + 1)
        series = read.readData(str(vs) + '.csv')[0]
        styles = ['solid', 'dashed', 'dashdot']
        for zoom in (False, True):
            # One curve per policy; the first policy is drawn thicker.
            for k in range(len(PLIST)):
                plt.plot(series[0],
                         series[k + 1],
                         label=PLIST[k],
                         linewidth=4 if k == 0 else 2,
                         linestyle=styles[k],
                         markersize=12)
            # Universal lower bound: rho^2 / (2 * (1 - rho)).
            bound = [float(rho) ** 2 / (2 * (1 - float(rho)))
                     for rho in series[0]]
            plt.plot(series[0],
                     bound,
                     label="Universal lower bound",
                     linewidth=3,
                     linestyle='dotted',
                     markersize=12)
            plt.xlabel('Traffic Intensity')
            plt.ylabel(r'$ E\left[\sum_{i,j} Q_{i,j}\right]$')
            plt.title(r'$ E\left[\sum_{i,j} Q_{i,j}\right]$ vs Traffic Intensity')
            if zoom:
                # Zoom in the plot to the high traffic-intensity level.
                plt.xlim(0.9, 1.0)
            plt.grid(True)
            plt.legend()
            prefix = "Avgq_zoomin_Vs " if zoom else "Avgq_Vs "
            for ext in (".png", ".eps", ".svg"):
                plt.savefig(prefix + str(VLIST[idx]) + ext)
            plt.show()
def main(args):
    """Plot the (1 - 2*lambda)-scaled average queue length vs traffic intensity.

    For every Vs in VLIST, reads the second table from '<Vs>.csv', draws one
    curve per policy plus the universal lower bound rho^2/2, and saves the
    figure as .png/.eps/.svg.
    """
    for vs in VLIST:
        idx = VLIST.index(vs)
        plt.figure(idx + 1)
        series = read.readData(str(vs) + '.csv')[1]
        styles = ['solid', 'dashed', 'dashdot']
        # One curve per policy; the first policy is drawn thicker.
        for k in range(len(PLIST)):
            plt.plot(series[0],
                     series[k + 1],
                     label=PLIST[k],
                     linewidth=4 if k == 0 else 2,
                     linestyle=styles[k],
                     markersize=12)
        # Scaled universal lower bound: rho^2 / 2.
        bound = [float(rho) ** 2 / 2 for rho in series[0]]
        plt.plot(series[0],
                 bound,
                 label="Universal lower bound",
                 linewidth=3,
                 linestyle='dotted',
                 markersize=12)
        plt.xlabel('Traffic Intensity')
        plt.ylabel(r'$(1-2\lambda) E[\sum_{i,j} Q_{i,j}]$')
        plt.title(r'$(1-2\lambda) E[\sum_{i,j} Q_{i,j}]$ vs Traffic Intensity')
        plt.grid(True)
        plt.legend()
        for ext in (".png", ".eps", ".svg"):
            plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[idx]) + ext)
        plt.show()
def check_feature_rate():
    """Train a random forest and report how many features were ever used.

    Loads the surgery-pain questionnaire dataset, fits a RandomForest with
    missing-value branches, counts how often each feature index appears
    across all trees, flags features never selected by any tree, and prints
    the per-feature usage counts plus the OOB accuracy (1 - oob_error_).
    """
    import math
    import randomForest as rf
    from collections import Counter

    # Preprocessing configuration passed to read.readData.
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False
    data_path = 'DorCirurgiaCategNA.csv'
    class_questionnaire = 'Q92510'
    class_name = 'Q92510_snDorPos'

    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)  #skip_class_questionnaire=False)

    X = data[:, 0:-1]
    y = np.array(data[:, -1])

    # Forest hyper-parameters.
    ntrees = 5001
    replace = False
    mtry = math.sqrt
    max_depth = None
    missing_branch = True
    seed = np.random.randint(0, 10000)

    clf1 = rf.RandomForest(ntrees=ntrees,
                           oob_error=True,
                           random_state=seed,
                           mtry=mtry,
                           missing_branch=missing_branch,
                           prob_answer=False,
                           max_depth=max_depth,
                           replace=replace,
                           balance=True)
    clf1.fit(X, y)

    # Count, over every tree, how often each feature index was used in a
    # split. Counter replaces the original manual dict bookkeeping.
    attributes_used = Counter(attribute
                              for tree in clf1.forest
                              for attribute in tree.feature_indices)

    if len(attributes_used) != X.shape[1]:
        # Some features were never selected by any tree; report the
        # fraction of unused features.
        print(len(attributes_used.keys()))
        print(X.shape[1])
        print('not equal!!! %r' % (1 - len(attributes_used.keys()) / X.shape[1]))

    print({original_attributes[a]: b for a, b in attributes_used.items()})
    print(1 - clf1.oob_error_)  # OOB accuracy
def plot_missing_rate(): data_path = 'RotEOmbroCirurgiaCategNAReduzido.csv' #'Dados/risk_factors_cervical_cancer.csv' class_name = 'Q92510_opcForca[RotEOmbro]' #class_name = 'Q92510_snDorPos' class_questionnaire = 'Q92510' missing_input = 'none' #'mean' transform = False scale = True use_text = False dummy = False use_feature_selection = False data, original_attributes, categories = read.readData( data_path=data_path, class_name=class_name, class_questionnaire=class_questionnaire, missing_input=missing_input, dummy=dummy, transform_numeric=transform, use_text=use_text, skip_class_questionnaire=True) X = data print(X.shape) features_missing = [0, 0, 0, 0, 0] m = 0 for j in range((X.shape[1])): cj = 0 for i in range((X.shape[0])): if (utils.isnan(X[i][j])): cj += 1 if (cj / X.shape[0] == 0): print(original_attributes[j]) features_missing[0] += 1 elif (cj / X.shape[0] <= 0.25): features_missing[1] += 1 elif (cj / X.shape[0] <= 0.5): features_missing[2] += 1 elif (cj / X.shape[0] <= 0.75): features_missing[3] += 1 elif (cj / X.shape[0] < 1): features_missing[4] += 1 m += cj / X.shape[0] print(m / X.shape[1]) exit() print(features_missing) plt.pie( features_missing[::-1], labels=['0%', '0.05% a 25%', '26% a 50%', '51% a 75%', '76% a 98%'][::-1], colors=colors, startangle=90, radius=1, autopct=lambda p: '{:.0f}'.format(p * sum(features_missing) / 100)) plt.show()
def main(args):
    """Plot E[sum Q_ij] vs traffic intensity with LaTeX-rendered labels.

    For every Vs in VLIST, reads the first table from '<index>th Vs.csv'
    and draws a full-range figure and a zoomed (high-traffic) figure,
    each saved as a .png.
    """
    for vs in VLIST:
        idx = VLIST.index(vs)
        plt.figure(idx + 1)
        series = read.readData(str(VLIST.index(vs)) + 'th Vs' + '.csv')[0]
        styles = ['solid', 'dashed', 'dashdot', ':']
        markers = ['.', ',', 'o', '^']
        for zoom in (False, True):
            # One curve per policy; the first policy is drawn thicker.
            for k in range(len(PLIST)):
                plt.rc('text', usetex=True)
                plt.rc('font', family='serif')
                plt.plot(series[0],
                         series[k + 1],
                         label=LABELS[k],
                         linewidth=4 if k == 0 else 2,
                         linestyle=styles[k],
                         marker=markers[k],
                         markersize=12)
            # Universal lower bound: rho^2 / (2 * (1 - rho)).
            bound = [float(rho) ** 2 / (2 * (1 - float(rho)))
                     for rho in series[0]]
            plt.plot(series[0],
                     bound,
                     label="Universal lower bound",
                     linewidth=3,
                     linestyle='dotted',
                     markersize=12)
            plt.xlabel(r'\text{Traffic Intensity}', fontsize=16)
            plt.ylabel(r'$E\left[\sum_{i,j} Q_{i,j}\right]$', fontsize=16)
            plt.title(r'$E\left[\sum_{i,j} Q_{i,j}\right]$'
                      r' \text{ vs Traffic Intensity}',
                      fontsize=16)
            plt.subplots_adjust(left=0.12, right=0.97, top=0.9, bottom=0.1)
            if zoom:
                # Zoom in the plot to the high traffic-intensity level.
                plt.xlim(0.9, 1.0)
            plt.grid(True)
            plt.legend(fontsize=16)
            prefix = "Avgq_zoomin_Vs " if zoom else "Avgq_Vs "
            plt.savefig(prefix + str(VLIST[idx]) + ".png")
            plt.show()
def classify(pacient_filename, model_filename, class_name):
    """Classify one patient record with a previously pickled model.

    Parameters
    ----------
    pacient_filename : str
        CSV with the patient's data, readable by read.readData.
    model_filename : str
        Path to the pickled classifier ('.pickle' is appended if missing).
    class_name : str
        Name of the target column; also used in the output JSON filename.

    Prints the most probable outcome with its probability and writes a
    JSON report via transform_to_JSON. Exits the process if the model
    file does not exist.
    """
    if not model_filename.endswith('.pickle'):
        model_filename = model_filename + '.pickle'
    try:
        with open(model_filename, 'rb') as handle:
            # NOTE: pickle.load is unsafe on untrusted files — only load
            # models produced by this project.
            clf = pickle.load(handle)
    except FileNotFoundError:
        print('Could not find file %r.\n' % model_filename)
        exit()
    data = read.readData(data_path=pacient_filename, class_name=class_name)
    X = data[data.columns[:-1]]
    # predict(prob=True) returns per-class scores; pick the argmax.
    classdict = (clf.predict(X, prob=True))[0]
    outcome = max(classdict, key=classdict.get)
    # FIX: user-facing message previously misspelled "probability".
    print(f"Outcome {outcome} with {classdict[outcome]/sum(classdict.values())*100}% of probability.")
    transform_to_JSON(clf,
                      clf.feature_contribution(X),
                      out='classification_' + class_name + '.json',
                      diffsur=False,
                      addline=classdict)
def main(args):
    """Scatter the three queue-length coordinates from each '<Vs>.csv' in 3D.

    For every Vs in VLIST, reads the first table (skipping its header row),
    converts the first three rows to floats, and saves a 3D scatter plot.
    """
    for vs in VLIST:
        idx = VLIST.index(vs)
        fig = plt.figure(idx + 1)
        rows = read.readData(str(vs) + '.csv')[0][1:]
        ax = fig.add_subplot(111, projection='3d')
        # Coordinates arrive as strings; convert each axis to floats.
        xs, ys, zs = ([float(v) for v in axis] for axis in rows[:3])
        ax.scatter(xs, ys, zs, c='b', marker='o')
        ax.set_xlabel(LABELS[0])
        ax.set_ylabel(LABELS[1])
        ax.set_zlabel(LABELS[2])
        plt.grid(True)
        # Kept from the original even though no artist carries a label.
        plt.legend(fontsize=16)
        fig.savefig("Queue Lengths 3D " + str(VLIST[idx]) + ".png")
        plt.show()
def main(args):
    """Plot (1 - rho)-scaled average queue length vs traffic intensity.

    For every Vs in VLIST, reads the second table from '<Vs>.csv', draws
    one LaTeX-labelled curve per policy plus the universal lower bound
    rho^2/2, and saves the figure as a .png.
    """
    for vs in VLIST:
        idx = VLIST.index(vs)
        plt.figure(idx + 1)
        series = read.readData(str(vs) + '.csv')[1]
        styles = ['solid', 'dashed', 'dashdot', 'dotted']
        markers = ['.', ',', 'o', '^']
        # One curve per policy; the first policy is drawn thicker.
        for k in range(len(PLIST)):
            plt.rc('text', usetex=True)
            plt.rc('font', family='serif')
            plt.plot(series[0],
                     series[k + 1],
                     label=LABELS[k],
                     linewidth=4 if k == 0 else 2,
                     linestyle=styles[k],
                     marker=markers[k],
                     markersize=5)
        # Scaled universal lower bound: rho^2 / 2.
        bound = [float(rho) ** 2 / 2 for rho in series[0]]
        plt.plot(series[0],
                 bound,
                 label="Universal lower bound",
                 linewidth=3,
                 linestyle='dotted',
                 markersize=5)
        plt.xlabel(r'\text{Traffic Intensity }' r'$(\rho)$', fontsize=16)
        plt.ylabel(r'$(1-\rho) E\left[\sum_{i,j} Q_{i,j}\right]$', fontsize=16)
        plt.title(r'$(1-\rho)E\left[\sum_{i,j} Q_{i,j}\right]$'
                  r' \text{ vs Traffic Intensity}',
                  fontsize=16)
        plt.subplots_adjust(left=0.12, right=0.97, top=0.9, bottom=0.1)
        plt.grid(True)
        plt.legend(fontsize=16)
        plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[idx]) + ".png")
        plt.show()
# clear: sanitize every entry via sanitize.st
def clearData(data):
    """Return a new list with sanitize.st applied to every item."""
    return [sanitize.st(entry) for entry in data]


# clear2: same contract as clearData, written as an explicit loop
def clearData2(data):
    """Return a new list with sanitize.st applied to every item."""
    cleaned = []
    for entry in data:
        cleaned.append(sanitize.st(entry))
    return cleaned


# get: three smallest values (duplicates kept)
def getData(data):
    """Return the three smallest items of *data*."""
    return sorted(data)[0:3]


# get@set: three smallest distinct values
def getData2(data):
    """Return the three smallest distinct items of *data*."""
    return sorted(set(data))[0:3]


james = getData(clearData(read.readData('data/james.txt')))
sarah = getData(clearData2(read.readData('data/sarah.txt')))
julie = getData2(clearData2(read.readData('data/julie.txt')))
print(james)
print(sarah)
print(julie)
import shelve
import read

# Interactively collect user records, persist them in a shelve database,
# read them back via read.readData, then clear and close the store.
user = shelve.open("testUser")
while True:
    name = input("Enter the name: ")
    if not name:
        break  # an empty name ends the input loop
    age = input("Enter the age: ")
    country = input("Enter the country of origin: ")
    user[name] = (age, country)  # shelve pickles the (age, country) tuple
# NOTE(review): the shelf object itself is passed to read.readData —
# presumably that function iterates it like a mapping; verify in read.py.
userData = read.readData(user)  #read data back from shelve using the function in read.py file
print(userData)
user.clear()  # NOTE(review): wipes the store after printing — confirm intended
user.close()
# Spam/ham text classification pipeline setup with Spark ML:
# tokenize -> remove stop words -> count-vectorize -> TF-IDF, plus message
# length as an extra feature, assembled for a Naive Bayes classifier.
from pyspark.sql import SparkSession  # FIX: was missing; SparkSession is used below
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from read import readData
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

spark = SparkSession.builder.appName('nlp').getOrCreate()

data = readData()
data.show()

# Add message length as an additional numeric feature.
data = data.withColumn('length', length(data['text']))
data.show()
data.groupby('class').mean().show()

# Text-processing stages.
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
# Encode the string label ('ham'/'spam') as a numeric column.
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
# Combine TF-IDF vector and message length into one feature vector.
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
missing_input = 'none' #'mean' transform = False scale = True use_text = False dummy = False use_feature_selection = False import random seed = random.randint(0, 10000) for data_path, class_name in data_paths: data = read.readData( data_path=data_path, class_name=class_name, class_questionnaire=class_questionnaire, missing_input=missing_input, dummy=dummy, transform_numeric=transform, use_text=use_text, skip_class_questionnaire=True) #skip_class_questionnaire=False) X = data[data.columns[:-1]] y = data[class_name] ntimes = 2 ntrees = 5 mtry = math.sqrt max_depth = None missing_branch = True #seed = 89444 replace = False
def main():
    """Compare sklearn's SGDRegressor against a manual SGD implementation.

    Loads GDP/Freedom -> Happiness data, splits 80/20, normalizes, then
    fits both a tool and a manual regressor twice — univariate (GDP only)
    and bivariate (GDP + Freedom) — and prints the four test-set MSEs.
    """
    currDir = os.getcwd()
    path = os.path.join(currDir, 'data.csv')
    # Renamed from input/output to avoid shadowing the builtins.
    inputs, outputs = readData(path, 'Economy..GDP.per.Capita.', 'Freedom',
                               'Happiness.Score')

    # split in 80/20 percent
    np.random.seed(5)
    indexes = [i for i in range(len(inputs))]
    trainSample = np.random.choice(indexes,
                                   int(0.8 * len(inputs)),
                                   replace=False)
    testSample = [i for i in indexes if i not in trainSample]
    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]

    # data normalization
    trainInputs, testInputs, trainOutputs, testOutputs = executeNormalization(
        trainInputs, testInputs, trainOutputs, testOutputs)

    # FIX: the univariate inputs previously replicated the FIRST sample
    # len(...) times ([trainInputs[0] for _ in range(...)]); the intent is
    # to take the first feature (GDP) of EVERY sample.
    GDPTrainInputs = [[sample[0]] for sample in trainInputs]
    GDPTestInputs = [[sample[0]] for sample in testInputs]

    print(" UNIVARIATE")
    print("SKLEARN REGRESSION")
    regressorSklearnUni = linear_model.SGDRegressor(alpha=0.005,
                                                    max_iter=1000,
                                                    average=len(trainInputs))
    regressorSklearnUni.fit(GDPTrainInputs, trainOutputs)
    w = [regressorSklearnUni.intercept_[0], regressorSklearnUni.coef_[0]]
    print("Learnt model is: f(x) = " + str(w[0]) + " + " + str(w[1]) + " * x")

    print("MANUAL REGRESSION")
    regressorMySGDRegression = MySGDRegression()
    regressorMySGDRegression.fit(GDPTrainInputs, trainOutputs)
    w = [
        regressorMySGDRegression.intercept_, regressorMySGDRegression.coef_[0]
    ]
    print("Learnt model is: f(x) = " + str(w[0]) + " + " + str(w[1]) + " * x")

    print("\n\n BIVARIATE")
    print("SKLEARN REGRESSION")
    toolRegression = tool_regression(trainInputs, trainOutputs)
    print("MANUAL REGRESSION")
    manual_regressor = manual_regression(trainInputs, trainOutputs)

    print("\n\n ERRORS")
    # FIX: the univariate errors previously evaluated the BIVARIATE models
    # on univariate inputs (feature-count mismatch); use the univariate
    # regressors trained above instead.
    print("1.TOOL UNIVARIATE ERROR: ",
          mean_squared_error(testOutputs,
                             regressorSklearnUni.predict(GDPTestInputs)))
    print("2.MANUAL UNIVARIATE ERROR: ",
          meanSquareError(regressorMySGDRegression, GDPTestInputs,
                          testOutputs))
    print("3.TOOL BIVARIATE ERROR: ",
          mean_squared_error(testOutputs, toolRegression.predict(testInputs)))
    print("4.MANUAL BIVARIATE ERROR: ",
          meanSquareError(manual_regressor, testInputs, testOutputs))
#test_size=0.2,random_state=9) # exercise_index = np.where(attributes == 'Exercício?')[0][0] # feature_index = exercise_index # not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,feature_index][a])] # Xs,ys,d = utils.split_categ(X[not_nan_rows],y[not_nan_rows],exercise_index,list(set(X[not_nan_rows,exercise_index]))) # print(utils.information_gain(y[not_nan_rows],ys)) # m.to_dot(attributes,out='out.dot') exit() data, original_attributes, categories = read.readData( data_path='../Dados/TestBaloonAdultAct.csv', class_name='inflated', dummy=dummy, transform_numeric=transform, use_text=use_text, missing_input='none') X = data[:, 0:-1] y = np.array(data[:, -1]) #import plot # plot.plot_randomforest_accuracy(X,y,original_attributes,ntrees=100,mtry=math.sqrt,replace=False,max_depth=None,missing_branch=False) # exit() seeds = [10, 25, 40, 50, 120, 35, 128, 90, 97, 100] import time dif = [] i = 0 for seed in seeds: starttime = time.time()
# Leave-one-out cross-validation setup for a decision tree on the
# shoulder-rotation questionnaire dataset.
from sklearn.model_selection import train_test_split
import decisionTree as dt
from sklearn.model_selection import KFold

# Dataset / preprocessing configuration for read.readData.
data_path = '../RotEOmbroCirurgiaCategNAReduzido.csv'
class_name = 'Q92510_opcForca[RotEOmbro]'  #'Q92510_snDorPos'
class_questionnaire = 'Q92510'
missing_input = 'none'
dummy = False
transform = False
use_text = False

data, original_attributes, categories = read.readData(
    data_path=data_path,
    class_name=class_name,
    class_questionnaire=class_questionnaire,
    missing_input=missing_input,
    dummy=dummy,
    transform_numeric=transform,
    use_text=use_text,
    skip_class_questionnaire=True)

sf = []
st = []
# Confusion-matrix counters (two variants of each cell).
vp, vp1, fp, fp1, fn, fn1, vn, vn1 = 0, 0, 0, 0, 0, 0, 0, 0

X = data[:, 0:-1]
y = np.array(data[:, -1])

# n_splits == number of samples -> leave-one-out cross-validation.
n_splits = X.shape[0]
# FIX: KFold(random_state=9) without shuffle=True has no effect and raises
# ValueError on scikit-learn >= 0.24; the unshuffled splits are
# deterministic anyway, so the argument is dropped.
sss = KFold(n_splits=n_splits)
for train_index, test_index in sss.split(X, y):
    X_train = X[train_index]
import read
import sanitize


# clear: sanitize each entry with a plain loop
def clearData(data):
    """Return a new list with sanitize.st applied to every item."""
    cleaned = []
    for entry in data:
        cleaned.append(sanitize.st(entry))
    return cleaned


# clear2: comprehension variant of clearData
def clearData2(data):
    """Return a new list with sanitize.st applied to every item."""
    return [sanitize.st(entry) for entry in data]


# get: three smallest values (duplicates kept)
def getData(data):
    """Return the three smallest items of *data*."""
    return sorted(data)[:3]


# get@set: three smallest distinct values
def getData2(data):
    """Return the three smallest distinct items of *data*."""
    return sorted(set(data))[:3]


james = getData(clearData(read.readData('data/james.txt')))
sarah = getData(clearData2(read.readData('data/sarah.txt')))
julie = getData2(clearData2(read.readData('data/julie.txt')))
print(james)
print(sarah)
print(julie)