def main(): plt.close("all") print "Welcome to my data science project" pickleFileName = 'billData.pickle' if REBUILD_DATA or not os.path.isfile(pickleFileName): print 'Loading data from json files' ld.loadData(pickleFileName) else: print 'Loading previously saved data' pickleFile = open(pickleFileName, 'rb') bills = pickle.load(pickleFile) (X,y,z) = getFeatures(bills) print 'Got data with: ' + str(len(X[:,0])) + ' observations' getPassedPercent(y) # try kfolds crossValidate(X,y) # look at feature distributions labels = ['Sponsor party', 'Number of cosponsors', '% Democratic Cosponsors', '% Republican Cosponsors', 'Numerical Category', 'Party Control', 'President Party'] getFeatureDistributions(X, labels) # train, leaving out most recent congress (113), and get confusion matrix cm = trainAndTest(X,y,z) targetNames = ['fail', 'pass'] plotConfusionMatrix(cm, targetNames)
def load_data():
    # custom mask
    loadImg = loadData('../../pairImg/images', '../../pairImg/newMasks',
                       '../../pairImg/train.csv', 0.8)
    # binary mask
    # loadImg = loadData('../../pairImg/images', '../../pairImg/masks', '../../pairImg/train.csv', 0.8)
    x_trainN = loadImg.trainN
    x_testN = loadImg.testN
    x_trainP = loadImg.trainP
    x_testP = loadImg.testP
    '''
    s = x_test[:10]
    for i in range(len(s)):
        img2avg = s[i]
        plt.imshow(img2avg)
        plt.savefig('vae_mlp/img' + str(i) + '.png')
    '''
    y_trainN = np.ones(800)
    y_testN = np.ones(200)
    y_trainP = np.ones(800)
    y_testP = np.ones(200)
    return (x_trainN, y_trainN), (x_testN, y_testN), (x_trainP, y_trainP), (x_testP, y_testP)
def main():
    # ex1data1.
    train_data = loadData('data/ex1data2.txt')
    # evaluateModels(train_data)
    tetas, mean, std_dev, predicted_y = gradient_descent_linear_regression(train_data, 0.5)
    test_data = loadData('data/testdata.txt')
    test_data = normalize_test_data(test_data, mean, std_dev)
    test_data = add_bias_term_in_data(test_data)
    print("Start predicting new instances")
    for i in range(len(test_data)):
        predicted_value = predict_instance(test_data[i], tetas)
        print("value of instance %(key1)s is %(key2)s" % {
            'key1': i + 1,
            'key2': predicted_value
        })
def main(): #dataset = "twoDcurve" dataset = "mnist" #dataset = "cifar10" #dataset = "imageNet" filename1 = "2442_original_as_7_with_confidence_0.999989330769.png" filename2 = "2442_7_modified_into_3_with_confidence_0.509171962738.png" imageNetPath1 = "%s/%s" % (directory_pic_string, filename1) imageNetPath2 = "%s/%s" % (directory_pic_string, filename2) image1 = NN.readImage(imageNetPath1) image2 = NN.readImage(imageNetPath2) k, euclideanDistance = diffImage(image1, image2) print "%s input elements are changed." % (k) print("The Euclidean distance is %s" % (euclideanDistance)) model = loadData() (class1, confidence1) = NN.predictWithImage(model, image1) classStr1 = dataBasics.LABELS(int(class1)) print "the class for the first image is %s (%s) with confidence %s" % ( class1, classStr1, confidence1) (class2, confidence2) = NN.predictWithImage(model, image2) classStr2 = dataBasics.LABELS(int(class2)) print "the class for the first image is %s (%s) with confidence %s" % ( class2, classStr2, confidence2) return 0
def main():
    ds = loadData.loadData()
    iterator = ds.make_one_shot_iterator()
    batch_images, batch_labels = iterator.get_next()

    L1 = network.conv1(batch_images)
    L1 = network.pool1(L1)
    L1out = network.bn1(L1)
    L2 = network.conv2(L1out)
    L2 = network.bn2(L2)
    L2out = network.pool2(L2)
    L3out = network.fc1(L2out)
    L4out = network.fc2(L3out)
    L5out = network.final(L4out)

    count = 0
    start_time = time.time()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            count += 1
            tom = sess.run(L5out)
            print(tom.shape)
            if count == 1:
                break
    print("Duration: {:,.0f} (minutes)".format((time.time() - start_time) / 60))
def main():
    model = loadData()
    plt.axis('off')

    if whichMode == "train":
        return

    if trainingModel == "autoencoder":
        (model, autoencoder) = model
        if startLayer == -1:
            autoencoder = model
    else:
        autoencoder = model

    # finding adversarial examples from original model
    dc = dataCollection("%s_%s_%s" % (startIndexOfImage, dataProcessingBatchNum, manipulations[0]))
    succNum = 0
    for i in range(dataProcessingBatchNum):
        for player_mode in twoPlayer_mode:
            dc.addComment("%s -- %s\n" % (i, player_mode))
            re = handleOne(model, autoencoder, startIndexOfImage + i, manipulations[0], dc, player_mode)
            if re == True:
                succNum += 1
    dc.addSuccPercent(succNum / float(dataProcessingBatchNum))
    dc.provideDetails()
    dc.summarise()
    dc.close()
def tuneThreshold():
    """ Explore different values of threshold to see which one fits best """
    thresholds = np.linspace(0.4, 0.6, 10)
    bestAcc = 0.0
    bestModel = None

    X_tr, y_tr, w_tr = loadData()
    m, n = X_tr.shape

    for th in thresholds:
        model = LogisticRegression(features=['PRI_tau_eta', 'PRI_lep_eta',
                                             'DER_deltar_tau_lep', 'PRI_met_sumet',
                                             'DER_mass_transverse_met_lep'],
                                   threshold=th)
        model.train(X_tr, y_tr, w_tr)
        p, r = model.predict(X_tr)

        # calculate some accuracy on the same train set
        acc = 100.0 * (p.flatten() == y_tr.flatten()).sum() / m
        print "%s %s%%" % (th, acc)

        if acc > bestAcc:
            bestAcc = acc
            bestModel = model

    # save the best model
    bestModel.save('data/logisticRegression%.2f.txt' % acc)
def weightAnalysis():
    # Analyse how many ratings each user has given
    [[m_row, m_col, m_val], implicit] = loadData()
    length = len(m_row)
    i = 0
    rateNum = np.zeros(10)          # rateNum[i] counts users who gave fewer than i*1000 ratings
    rateNumLess1000 = np.zeros(10)  # for users with fewer than 1000 ratings, bucket in steps of 100
    rateNumLess10 = 0               # users with fewer than 10 ratings
    rateNumEqual1 = 0               # users with exactly one rating
    while i < length:  # iterate over all users
        if m_val[i] == 0:  # implicit feedback, not counted
            i += 1
            continue
        rateCnt = 0  # count this user's ratings
        for j in range(i, length):  # iterate over this user's ratings
            if m_row[i] != m_row[j]:  # finished with this user
                break
            if m_val[j] == 0:  # implicit feedback, not counted
                continue
            rateCnt += 1
        if rateCnt == 1:
            rateNumEqual1 += 1
        if rateCnt <= 10:
            rateNumLess10 += 1
        if rateCnt <= 1000:
            rateNumLess1000[int(rateCnt / 100)] += 1
        else:
            rateNum[int(rateCnt / 1000)] += 1
        for j in range(i, length):  # jump to the next user
            if m_row[i] != m_row[j]:
                i = j
                break
            if j == length - 1:
                i = length
    return [rateNumEqual1, rateNumLess10, rateNumLess1000, rateNum]
def main():
    # ex1data1.
    train_data = loadData('data/ex1data1.txt')
    # evaluateModels(train_data)
    plotDatain2D(train_data, [])
    predicted_y, tetas = gradient_descent_linear_regression(train_data, 0.01)
    actual_y = [row[-1] for row in train_data]
    print("MAE %(key1)s" % {'key1': compute_MAE(actual_y, predicted_y)})
    print("MSE %(key1)s" % {'key1': compute_MSE(actual_y, predicted_y)})
    print("RMSE %(key1)s" % {'key1': compute_RMSE(actual_y, predicted_y)})
def main():
    # ex1data1.
    train_data = loadData('data/ex1data2.txt')
    # evaluateModels(train_data)
    tetas = training(train_data)
    predicted_y = predict_test_data(train_data, tetas)
    actual_y = [row[-1] for row in train_data]
    print("MAE %(key1)s" % {'key1': compute_MAE(actual_y, predicted_y)})
    print("MSE %(key1)s" % {'key1': compute_MSE(actual_y, predicted_y)})
    print("RMSE %(key1)s" % {'key1': compute_RMSE(actual_y, predicted_y)})
def main():
    '''
    When this file is called, one training data file path should be given as argument.
    '''
    print('Number of arguments:', len(sys.argv), 'arguments.')
    print('Argument List:', str(sys.argv))
    if len(sys.argv) == 1:
        print("Please give the training data file.")
    else:
        file_path = sys.argv[1]
        nclass, dimension, labels, images = ld.loadData(file_path)
        u, delta_2, p = estimators(nclass, dimension, labels, images)
        usps_d_param(nclass, dimension, u, delta_2, p)
def main():
    print('Number of arguments:', len(sys.argv), 'arguments.')
    print('Argument List:', str(sys.argv))
    if len(sys.argv) == 1:
        print("Please give the parameters and testing files.")
    elif len(sys.argv) == 2:
        print("Please give one more file.")
    else:
        d, u, delta_2, p = ld.getParameters(sys.argv[1])
        nclass, dimension, real_labels, testing_images = ld.loadData(sys.argv[2])
        testing_labels, error_rate, con_matrix = classifer(nclass, dimension, u, delta_2, p,
                                                           real_labels, testing_images)
        usps_d_error(error_rate)
        usps_d_cm(con_matrix, nclass)
def test(epochs, denoising=False, batch_size=32, num_hidden=100):
    train_x, train_y, test_x, test_y, valid_x, valid_y = loadData.loadData()
    auto_encoder = AutoEncoder(train_x, valid_x, test_x, num_hidden=num_hidden)
    auto_encoder.train(epochs, learning_rate=0.1, batch_size=batch_size, denoising=denoising)
    # hp.visualize_parameter(auto_encoder.W, name="DenoisingAutoEncoderWn30k5")
    hp.plotCrossEntropyError(auto_encoder.train_entropy, auto_encoder.validation_entropy, name="k5")
    hp.save_param(auto_encoder.W, "WAuto" + str(num_hidden) + str(denoising))
    hp.save_param(auto_encoder.b, "bAuto" + str(num_hidden) + str(denoising))
    hp.save_param(auto_encoder.c, "cAuto" + str(num_hidden) + str(denoising))
def build_model(stock):
    print "Running...this may take a few minutes."
    print "Initializing regression on 5-year historical data..."
    ld.fetchData(str(stock))
    price_data, vol_data = ld.loadData(CSV)

    total = 0
    plot_total = []
    prev_action = 'sell'
    count = 100
    while prev_action == 'sell':
        prev_action = tr.train(MID_TERM, count, price_data, vol_data)
        prev_amt = price_data[count]
        count += 1
    plot_total.append((price_data[count - 1], prev_action))

    for i in range(count, len(price_data) - MID_TERM, 30):
        action = tr.train(MID_TERM, i, price_data, vol_data)
        if action == 'buy':
            if action == prev_action:
                total += price_data[i] - prev_amt
                plot_total.append((price_data[i], 'hold'))
            else:
                plot_total.append((price_data[i], action))
            prev_amt = price_data[i]
            prev_action = action
        elif action == 'sell':
            if action != prev_action:
                total += price_data[i] - prev_amt
                plot_total.append((price_data[i], action))
            else:
                plot_total.append((price_data[i], 'hold'))
            prev_action = action
        # print "Action at time " + str(i) + " : " + action

    print "*********************"
    print "Total return: ", total
    print "Best action now: ", qt.getBestStock(MID_TERM, len(price_data))

    count = 0
    plt.clf()
    plt.plot()
    for point in plot_total:
        t, a = point
        plt.plot(count, t, get_color(a), zorder=1)
        count += 1
    plt.plot(range(len(plot_total)), [x[0] for x in plot_total], 'b--', zorder=2, label='Price')
    plt.suptitle(stock)
    plt.legend(loc='upper left')
    plt.show()
def bagging_ELM(name, numberofHiddenNeurons, Type='W1', C=64, ActivationFunction='sig'):
    train, test = loadData(name)
    shapeOfAnswer = []
    numOfBaseClasser = 10
    trainStr = ELMDataStruct(train)
    testStr = ELMDataStruct(test)

    beginTrainTime = time()
    for i in range(numOfBaseClasser):
        print('Begin %d th train' % (i + 1))
        baggingTrain = dataBagging(trainStr)
        baggingTrainStr = ELMDataStruct(baggingTrain)
        answer = WELM(numberofHiddenNeurons, baggingTrainStr, testStr, Type,
                      ActivationFunction, C, baseclasser=True)
        if i == 0:
            answerMatrix = answer
            shapeOfAnswer = shape(answer)
        else:
            answerMatrix = column_stack((answerMatrix, answer))
    outputAnswer = zeros((shapeOfAnswer))
    endTrainTime = time()
    trainTime = endTrainTime - beginTrainTime
    # matrix2CSV_Once(answerMatrix, [])

    for j in range(shapeOfAnswer[0]):
        voteAnswer = 1
        maxVoteNum = 0
        for k in range(trainStr.numOfClass):
            voteNum = sum(answerMatrix[j, :] == (k + 1))
            if voteNum > maxVoteNum:
                maxVoteNum = voteNum
                voteAnswer = k + 1
        outputAnswer[j] = voteAnswer
    # print(outputAnswer)
    # input()

    acc = accuracy(answer, testStr.y)
    print('-' * 20, 'Bagging result', '-' * 20)
    print('Bagging trainTime:', trainTime)
    gmean, Rn = G_mean(answer, testStr.y, testStr.numOfClass)
    print('-' * 20, 'Bagging result', '-' * 20)
    return acc, gmean, Rn, trainTime
def test(epoch, k, batch_size=32, num_hidden=100):
    train_x, train_y, test_x, test_y, valid_x, valid_y = loadData.loadData()
    rbm = RBM(train_x, input_validation=valid_x, input_test=test_x,
              batch_size=batch_size, n_hidden=num_hidden)
    rbm.train(epoch, k=k)
    # hp.visualize_parameter(rbm.W, name="Wn30k5")
    hp.plotCrossEntropyError(rbm.train_entropy, rbm.validation_entropy, name="k5")
    hp.save_param(rbm.W, "W" + str(num_hidden))
    hp.save_param(rbm.b, "b" + str(num_hidden))
    hp.save_param(rbm.c, "c" + str(num_hidden))
def _build_inver_table(self):
    """
    Build the inverted table (user -> set of rated movies).
    :return:
    """
    doc = loadData(self.fin, sep=',')
    count = 0
    for d in doc:
        print(count)
        count += 1
        UserID = d[0]
        if UserID not in self.userItems:
            self.userItems[UserID] = set()
        MovieID = d[1]
        self.userItems[UserID].add(MovieID)
def main():
    # ex1data1.
    train_data = loadData('data2/ex2data2.txt')
    legends = ['y = 1', 'y = 0']
    titles = ["Microchip Test 1", "Microchip Test 2", "Scatter Plot of training data"]
    plotTrainingData(train_data, [], titles, legends)
    feature_vactor = expand_features(train_data, 6)
    tetas = gradient_descent_logistic_regression(train_data, feature_vactor, 1, 1)
    tetas = gradient_descent_logistic_regression(train_data, feature_vactor, 1, 0)
    tetas = gradient_descent_logistic_regression(train_data, feature_vactor, 1, 100)
def main():
    model = loadData()

    if whichMode == "train":
        return

    if trainingModel == "autoencoder":
        (model, autoencoder) = model
        if startLayer == -1:
            autoencoder = model
    else:
        autoencoder = model

    # initialise a dataCollection instance
    phase = "firstRound"
    # finding adversarial examples from original model
    handleOne(model, autoencoder, phase, startIndexOfImage, dataProcessingBatchNum,
              firstRound_manipulations[0])
def main():
    model = loadData()
    dc = dataCollection()

    # handle a set of inputs starting from an index
    succNum = 0
    for whichIndex in range(startIndexOfImage, startIndexOfImage + dataProcessingBatchNum):
        print "\n\nprocessing input of index %s in the dataset: " % (str(whichIndex))
        succ = handleOne(model, dc, whichIndex)
        if succ == True:
            succNum += 1
    dc.addSuccPercent(succNum / float(dataProcessingBatchNum))
    dc.provideDetails()
    dc.summarise()
    dc.close()
def main():
    bits = 28

    # load data
    X_tr, y_tr, w_tr = loadData()
    plotDistribution(X_tr, y_tr, w_tr)

    # select some features for plotting
    sel_features = [features.index('PRI_tau_eta'),
                    features.index('PRI_lep_eta'),
                    features.index('DER_deltar_tau_lep'),
                    features.index('PRI_met_sumet'),
                    features.index('DER_mass_transverse_met_lep')]

    # and make all 2D combinations possible
    for f1, f2 in itertools.combinations(sel_features, 2):
        plot2DFeatures(X_tr, y_tr, w_tr, f1, f2, th=0.0)
def main():
    # ex1data1.
    train_data = loadData('data2/ex2data1.txt')
    legends = ['Admitted', 'Not Admitted']
    titles = ["Exam 1 Score", "Exam 2 Score", "Scatter Plot of training data"]
    plotTrainingData(train_data, [], titles, legends)
    # train_data = [[0,0,0],[0,1,1],[1,0,1],[1,1,1]]
    tetas, mean, std_dev = gradient_descent_logistic_regression(train_data, 0.8)
    test_data = [[45, 85, 1]]
    test_data = normalize_test_data(test_data, mean, std_dev)
    test_data = add_bias_term_in_data(test_data)
    for i in range(len(test_data)):
        predicted_value = predict_instance(test_data[i], tetas)
        print("Probability of Test Example is %(key2)s" % {
            'key1': i + 1,
            'key2': predicted_value
        })
def main():
    with tf.Session() as sess:
        model = loadData()

        if whichMode == "train":
            return

        if trainingModel == "autoencoder":
            (model, autoencoder) = model
            if startLayer == -1:
                autoencoder = model
        else:
            autoencoder = model

        images = []
        labels = []
        for i in range(dataProcessingBatchNum):
            imageIndex = startIndexOfImage + i
            image = NN.getImage(model, imageIndex)
            (originalClass, originalConfident) = NN.predictWithImage(model, image)
            if dataset == "imageNet":
                label = np.zeros(NN.nb_classes)
                label[originalClass] = 1
            else:
                label = NN.getLabel(model, imageIndex)

            # keep information for the original image
            origClassStr = dataBasics.LABELS(int(originalClass))
            path0 = "%s/%s_original_as_%s_with_confidence_%s.png" % (
                directory_pic_string, imageIndex, origClassStr, originalConfident)
            dataBasics.save(-1, np.squeeze(image), path0)
            print(np.max(image), np.min(image), image.shape)
            images.append(image - 0.5)
            labels.append(label)

        end_vars = tf.global_variables()
        test_attack(sess, model, np.array(images), np.array(labels))
    return
def trainWithRealData():
    """ Test with the real-deal """
    X_tr, y_tr, w_tr = loadData()
    m, n = X_tr.shape

    model = LogisticRegression(features=['PRI_tau_eta', 'PRI_lep_eta',
                                         'DER_deltar_tau_lep', 'PRI_met_sumet',
                                         'DER_mass_transverse_met_lep'])
    # tune parameters later.
    model.train(X_tr, y_tr, w_tr)
    p, r = model.predict(X_tr)

    # calculate some accuracy on the same train set
    acc = 100.0 * (p.flatten() == y_tr.flatten()).sum() / m
    print "%s%%" % acc

    # save the model
    model.save('data/logisticRegression%.2f.txt' % acc)
def __init__(self, num_class=14, limit=None):
    self.num_class = num_class
    self.iteration = 0

    # data
    self.data = loadData.loadData(limit=limit).getDataArray()
    print "number of training data:", len(self.data)

    # model parameters
    # for each pixel in each class, there are:
    #   1. pi representing probability of the class
    #   2. alpha representing probability of being foreground
    #   3. mu and sigma representing gaussian of observation vs fg/bg data
    self.param_pi = np.array([1.0 / self.num_class] * self.num_class)
    self.param_alpha = np.zeros((self.num_class, len(self.data[0]))) + 0.5
    self.param_mu = np.random.random((self.num_class, len(self.data[0])))
    self.param_sigma = np.ones((self.num_class, len(self.data[0]))) * 10

    # visualizer
    self.visualizer = visualizer.visualizer()
def main():
    pd.set_option('display.max_columns', None)
    plt.rcParams['figure.figsize'] = (20, 20)
    ensenyament = "G1042"
    path = 'recommenderItemBL/' + ensenyament
    datas = loadData(ensenyament)
    primer = datas[0]
    segon = datas[1]
    lbl2 = datas[2]
    primer.fillna(value=5.0, inplace=True)
    segon.fillna(value=5.0, inplace=True)
    preds_eval1 = evalRecommender(primer, segon)
    #===========================================================================
    # evalRecommender2(primer, segon)
    #===========================================================================
    plotScatter(preds_eval1, "whitegrid", ensenyament, path, lbl2, primer, segon)
def biasAnalysis():
    # Analyse how many users show a rating bias
    [[m_row, m_col, m_val], implicit] = loadData()
    length = len(m_row)
    posCnt, negCnt = 0, 0  # users biased towards positive / negative ratings
    minNum = 5             # a user needs at least minNum ratings to be considered biased
    i = 0
    while i < length:  # iterate over all users
        if m_val[i] == 0:  # implicit feedback, not counted
            i += 1
            continue
        if m_val[i] >= 4:  # user tends to give positive ratings
            flag = True
            num = 1
            for j in range(i + 1, length):  # scan this user's remaining ratings
                if m_row[i] != m_row[j]:    # finished with this user
                    break
                if m_val[j] < 4 and m_val[j] > 0:  # this user is not biased towards positive ratings
                    flag = False
                    break
                num += 1
            if flag and num >= minNum:
                posCnt += 1
        elif m_val[i] <= 2 and m_val[i] > 0:  # user tends to give negative ratings
            flag = True
            num = 1
            for j in range(i + 1, length):  # scan this user's remaining ratings
                if m_row[i] != m_row[j]:    # finished with this user
                    break
                if m_val[j] > 2:  # this user is not biased towards negative ratings
                    flag = False
                    break
                num += 1
            if flag and num >= minNum:
                negCnt += 1
        for j in range(i, length):  # jump to the next user
            if m_row[i] != m_row[j]:
                i = j
                break
            if j == length - 1:
                i = length
    return [m_row[-1], posCnt, negCnt]
def main():
    model = loadData()
    dc = dataCollection()

    # handle a set of inputs starting from an index
    if dataProcessing == "batch":
        for whichIndex in range(startIndexOfImage, startIndexOfImage + dataProcessingBatchNum):
            print "\n\nprocessing input of index %s in the dataset: " % (str(whichIndex))
            if task == "safety_check":
                handleOne(model, dc, whichIndex)
    # handle a single input
    else:
        print "\n\nprocessing input of index %s in the dataset: " % (str(startIndexOfImage))
        if task == "safety_check":
            handleOne(model, dc, startIndexOfImage)

    if dataProcessing == "batch":
        dc.provideDetails()
        dc.summarise()
        dc.close()
def loadData(params, newRun):
    # load or restore data
    print("start loading data, time: {}".format(time.ctime()))
    if params['loadBlob'] is not None:
        img_train, l_train, f_train, \
            img_val, l_val, f_val = ld.restoreData(params['loadBlob'])
    elif newRun:
        img_train, l_train, f_train, \
            img_val, l_val, f_val = ld.loadData(params['out_dir'], params)
    else:
        img_train, l_train, f_train, \
            img_val, l_val, f_val = ld.restoreData(params['out_dir'])
    print("end loading data, time: {}".format(time.ctime()))

    print("Train images shape", img_train.shape, l_train.shape)
    print("Train images min/max", img_train.min(), img_train.max())
    print("Train images data type ", img_train.dtype)
    for i in range(len(img_val)):
        print("Val images shape", img_val[i].shape, l_val[i].shape)
        print("Val images data type ", img_val[i].dtype)

    return img_train, l_train, f_train, img_val, l_val, f_val
def shortenData():
    workbook = load_workbook("./Data/dataGathering/tokenizedReducedData.xlsx")
    sheet = workbook.active
    data, labels = ld.loadData()
    numericalData = ld.loadNumericalTags()
    tags0 = ld.getTags(0)
    tags1 = ld.getTags(1)
    tags2 = ld.getTags(2)
    tags3 = ld.getTags(3)
    tags4 = ld.getTags(4)
    tags5 = ld.getTags(5)
    shortenedLabels, shortenedNumericalData, shortenedTags0, shortenedTags1, \
        shortenedTags2, shortenedTags3, shortenedTags4, shortenedTags5 = [], [], [], [], [], [], [], []
    for i in range(0, len(labels)):
        if (labels[i] == "n" or labels[i] == "l" or labels[i] == "o"):
            shortenedLabels.append(labels[i])
            shortenedNumericalData.append(numericalData[i])
            shortenedTags0.append(tags0[i])
            shortenedTags1.append(tags1[i])
            shortenedTags2.append(tags2[i])
            shortenedTags3.append(tags3[i])
            shortenedTags4.append(tags4[i])
            shortenedTags5.append(tags5[i])
    for i in range(0, len(shortenedLabels)):
        sheet.cell(row=i + 2, column=1).value = shortenedLabels[i]
        sheet.cell(row=i + 2, column=2).value = shortenedNumericalData[i]
        sheet.cell(row=i + 2, column=3).value = shortenedTags0[i]
        sheet.cell(row=i + 2, column=4).value = shortenedTags1[i]
        sheet.cell(row=i + 2, column=5).value = shortenedTags2[i]
        sheet.cell(row=i + 2, column=6).value = shortenedTags3[i]
        sheet.cell(row=i + 2, column=7).value = shortenedTags4[i]
        sheet.cell(row=i + 2, column=8).value = shortenedTags5[i]
    workbook.save("./Data/dataGathering/tokenizedReducedData.xlsx")
def run(sample):
    # for missing value imputation
    # load data
    trainData, target = loadData('data', 'SalePrice')

    # get neighborhood geographical coordinates and valence bins
    if not os.path.isfile('neighborhood.json'):
        neighborhoods = prepNeighbors(trainData, target,
                                      bins=[0, 100000, 150000, 200000, 250000, 300000, np.inf])
    else:
        with open('neighborhood.json', 'r') as f:
            neighborhoods = json.load(f)
    missingVal = np.nan

    # for price prediction
    # load model
    modelFile = 'RandomCVModel.rfmdl'
    if os.path.isfile(modelFile):
        model = pickle.load(open(modelFile, 'rb'))
    else:
        raise IOError(
            'file {} could not be found.\n Specify directory and make sure file exists'.format(modelFile))

    price, imputedData = makePrediction(trainData, neighborhoods, sample, model, missingVal)
    imputedData.to_csv('processed_user_input.csv', sep=',', index=False, header=True)
    return (price, imputedData)
def plusTest():
    # Reading data
    [m, implicit] = loadData()
    # Reading finished

    # Split data
    dimMax = 10**3
    m1 = dataSplit(m, [dimMax, dimMax])
    [m1_train, m1_test] = getTrainTest(m1)
    m1 = dataSplit(m, [2 * dimMax, 2 * dimMax])
    [m2_train, m2_test] = getTrainTest(m1)
    m1 = dataSplit(m, [3 * dimMax, 3 * dimMax])
    [m3_train, m3_test] = getTrainTest(m1)
    m1_train[3] = dimMax
    m1_train[4] = dimMax
    m2_train[3] = 2 * dimMax
    m2_train[4] = 2 * dimMax
    m3_train[3] = 3 * dimMax
    m3_train[4] = 3 * dimMax
    implicit1 = implicitSplit(implicit, [dimMax, dimMax])
    implicit2 = implicitSplit(implicit, [2 * dimMax, 2 * dimMax])
    implicit3 = implicitSplit(implicit, [3 * dimMax, 3 * dimMax])

    [p1, q1, sigma1, b_user1, b_item1, y] = SVDplus(m1_train, 0.0001, 10, 0.1, implicit1)
    [p2, q2, sigma2, b_user2, b_item2, y] = SVDplus(m2_train, 0.0001, 10, 0.01, implicit2)
    [p3, q3, sigma3, b_user3, b_item3, y] = SVDplus(m3_train, 0.0001, 10, 0.01, implicit3)

    RMSE_SVDplus = [0, 0, 0]
    RMSE_SVDplus[0] = computeRMSEplus(m1_test, p1, q1, sigma1, b_user1, b_item1, y, implicit1)
    RMSE_SVDplus[1] = computeRMSEplus(m2_test, p2, q2, sigma2, b_user2, b_item2, y, implicit2)
    RMSE_SVDplus[2] = computeRMSEplus(m3_test, p3, q3, sigma3, b_user3, b_item3, y, implicit3)
    print(RMSE_SVDplus)
def newTest():
    # Reading data
    [m, implicit] = loadData()
    # Reading finished

    # Split data
    dimMax = 10**3
    m1 = dataSplit(m, [dimMax, dimMax])
    [m1_train, m1_test] = getTrainTest(m1)
    m1 = dataSplit(m, [2 * dimMax, 2 * dimMax])
    [m2_train, m2_test] = getTrainTest(m1)
    m1 = dataSplit(m, [3 * dimMax, 3 * dimMax])
    [m3_train, m3_test] = getTrainTest(m1)
    implicit1 = implicitSplit(implicit, [dimMax, dimMax])
    implicit2 = implicitSplit(implicit, [2 * dimMax, 2 * dimMax])
    implicit3 = implicitSplit(implicit, [3 * dimMax, 3 * dimMax])
    # Split finished

    # Funk-SVD  ###### funkSVD(m, a, maxK, eps)
    [p1, q1] = funkSVD(m1_train, 0.1, 50, 1e-5)
    [p2, q2] = funkSVD(m2_train, 0.1, 50, 1e-5)
    [p3, q3] = funkSVD(m3_train, 0.1, 50, 1e-5)
    RMSE_funkSVD = [0, 0, 0]
    RMSE_funkSVD[0] = computeRMSE(m1_test, p1, q1)
    RMSE_funkSVD[1] = computeRMSE(m2_test, p2, q2)
    RMSE_funkSVD[2] = computeRMSE(m3_test, p3, q3)
    print(RMSE_funkSVD)

    # rfunk-SVD  ###### rfunkSVD(m, a, maxK, lamb, eps)
    [p1, q1] = rfunkSVD(m1_train, 1.0, 50, 0.25, 1e-5)
    [p2, q2] = rfunkSVD(m2_train, 1.0, 50, 0.25, 1e-5)
    [p3, q3] = rfunkSVD(m3_train, 1.0, 50, 0.25, 1e-5)
    RMSE_rfunkSVD = [0, 0, 0]
    RMSE_rfunkSVD[0] = computeRMSE(m1_test, p1, q1)
    RMSE_rfunkSVD[1] = computeRMSE(m2_test, p2, q2)
    RMSE_rfunkSVD[2] = computeRMSE(m3_test, p3, q3)
    print(RMSE_rfunkSVD)
def main(self):
    try:
        l = loading.loadData()
        table_1, region, dates = l.activeCases()
        table_2, groups, dates, datesInDays = l.totalCases()
        print(colored("Loading completed.", 'blue'))

        p = plotting.plotData()
        for i in range(0, len(region)):
            p.plotActiveCases(table_1, region, dates, i)
        for i in range(0, len(groups)):
            p.plotTotalCases(table_2, groups, dates, datesInDays, i)
            if (groups[i] == 'Total number of cases'):
                p.plotEvolutionOfSigmoidParameter(table_2.iloc[i][1:], datesInDays)
        print(colored("Fitting completed.", 'blue'))
        print(colored("Plotting completed.", 'blue'))
    except Exception as e:
        print(colored("The following exception was caught: " + str(e), 'red'))
        print(colored(str(exc_tb.tb_frame.f_code.co_filename) + " at line " +
                      str(exc_tb.tb_lineno), 'red'))
from labelDictionnary import labelDictionnary
from training import training
import numpy as np
from loadData import loadData
# assumed import, following the one-function-per-module pattern used above
from dimensialityReduction import dimensialityReduction

with open('twidf_window4_directed_weighted', "r") as File:
    X = np.loadtxt(File, delimiter=',')

path = '../data/r8_train_stemmed.txt'
trainData = True
data = loadData(path, trainData)
labels = data['labels']
(dictionnaryOfClasses, labelsInNumbers) = labelDictionnary(labels)

lsi = True
numberOfComponents = 100
(reducedMatrix, Y) = dimensialityReduction(X, labelsInNumbers, lsi, numberOfComponents)

svm = True
scores = training(reducedMatrix, Y, svm)
        if r5_avg > best_r5_avg:
            best_r5_avg = r5_avg
            updateParameters(alpha, beta, gamma, delta, bestParams5)
        if r10_avg > best_r10_avg:
            best_r10_avg = r10_avg
            updateParameters(alpha, beta, gamma, delta, bestParams10)
        r5_tuple = (r5_avg, r5pop_avg, r5syn_avg, r5synpop_avg)
        r10_tuple = (r10_avg, r10pop_avg, r10syn_avg, r10synpop_avg)
        print '(%f, %f, %f, %f): r5 = %s, r10 = %s' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple)
        if outfile:
            outfile.write('%f,%f,%f,%f,%s,%s\n' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple))
    return bestParams5, best_r5_avg, bestParams10, best_r10_avg


## Comment out this entire block if not running from Python shell
ld.loadData(True)  # This function must be run. Be careful if this is commented out.
setQuestionModelModifications(ld.questions)
folds = ld.getCVFolds()
print 'Generating word vectors'
frequentWords, wordToIndex = wordvectors.getFrequentWords(ld.questions)
wordVecs = wordvectors.getWordVectors(ld.questions, wordToIndex)
## End block

counter = 0
recall_test_scores = [0.0, 0.0]
for fold in folds[0:5]:
    resetModels()
    counter += 1
    print 'Starting Fold %d' % counter
    trainQuestions = fold[0]
# Machine Learning | K-Means Clustering
# Jimmy Wallace

import loadData as ld
import random
import matplotlib.pyplot as plt
from tkinter import *
from tkinter.ttk import *

dataList = ld.loadData()
data = []


class DataPoints(object):

    def __init__(self, vector):
        self.vector = vector

    def getDim(self):
        return len(self.vector)

    def getVector(self, vector):
        return self.vector

    def getDistance(self, cluster, method):
        dim = self.getDim()
        temp = []
        for i in range(dim):
            x = (cluster[i] - self.vector[i])**2
            temp.append(x)
        if method == 'e':
def get(self):
    cities = model.City.all()
    loadData.clearData(self.response, cities)
    loadData.loadData(self.response)
    self.response.out.write('data reloaded')
feature_dim = train_data.shape[1]
label_dim = train_label.shape[1]

train_data = normalizeData(train_data)
test_data = normalizeData(test_data)

elm = ELM(feature_dim, feature_dim * 10, label_dim, 'lite', 'dec')
elm.trainModel(train_data, train_label)
elm.save(r"D:\workspace\Data\Data Synthesis\synthesis\synthesis\weights\elm1")
elm.testModel(test_data, test_label)

"""
# Train on real data
train_data, train_label, test_data, test_label = loadData('REAL\greyscale', percent=1)
feature_dim = train_data.shape[1]
label_dim = train_label.shape[1]

train_data = normalizeData(train_data)
test_data = normalizeData(test_data)

elm = ELM(feature_dim, feature_dim * 10, label_dim, 'lite', 'dec')
elm.trainModel(train_data, train_label)
elm.save(r"D:\workspace\Data\Data Synthesis\synthesis\synthesis\weights\elmReal")
elm.testModel(test_data, test_label)
"""

# Train on synthetic data and fine-tune on real data
#-*- coding=utf-8 -*-
__author__ = "Xingwei He"

from keras.utils import np_utils

import Alexnet
import loadData

nb_classes = 10

# load data
X_train, Y_train, X_test, Y_test = loadData.loadData()

# normalize the data
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)

Alex = Alexnet.Alexnet(X_train, Y_train, X_test, Y_test)
Alex.fit()
def sklClassify():
    verbose = True
    trainingData, trainingTargets, validationData, validationTargets, testData, testTargets = \
        ld.loadData('movements_day1-3.dat')
    clf = SVC()

    newTargets = []
    for i in range(len(trainingTargets)):
        newTargets.append(np.argmax(trainingTargets[i]))
    newTestTargets = []
    for i in range(len(testTargets)):
        newTestTargets.append(np.argmax(testTargets[i]))

    clf.fit(trainingData, newTargets)

    A = np.zeros((len(testTargets[0]), len(testTargets[0])))
    for index in range(len(testTargets)):
        A[newTestTargets[index], clf.predict(testData[index])[0]] += 1

    total = sum(sum(A))
    correct = 0
    for index in range(len(testTargets[0])):
        correct += A[index, index]

    m1 = 1
    for index in range(len(testTargets[0])):
        denominator = sum(A[:, index])
        if (denominator == 0):
            if (verbose):
                print "P( correct | yMax =", index, ") = NO DATA"
            m1 = -1
        else:
            v = A[index, index] / denominator
            if (v < m1):
                m1 = v
            if (verbose):
                print "P( correct | yMax =", index, ") =", v

    m2 = 1
    for index in range(len(testTargets[0])):
        denominator = sum(A[index, :])
        if (denominator == 0):
            if (verbose):
                print "P( correct | target=", index, ") = NO DATA"
            m2 = -1
        else:
            v = A[index, index] / denominator
            if (v < m2):
                m2 = v
            if (verbose):
                print "P( correct | target=", index, ") =", v

    if (verbose):
        print A
        print correct, " correct of ", total
        print "P( correct ) = ", correct / total
        print "min( P( correct | yMax ) ) = ", m1
        print "min( P( correct | target ) ) = ", m2
    return correct / total, m1, m2
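# The two minima computed in sklClassify are, in effect, the worst per-class
# precision (column-normalised diagonal of A, conditioned on the prediction yMax)
# and the worst per-class recall (row-normalised diagonal, conditioned on the true
# target). A minimal NumPy sketch of the same quantities, assuming a confusion
# matrix with no empty rows or columns; worst_precision_recall is a hypothetical
# helper name, not part of the original code.
import numpy as np

def worst_precision_recall(A):
    # A[i, j] counts samples of true class i predicted as class j.
    diag = np.diag(A)
    precision = diag / A.sum(axis=0)  # P(correct | predicted class), per column
    recall = diag / A.sum(axis=1)     # P(correct | true class), per row
    return precision.min(), recall.min()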
    bleached_list = list(zip(*bleached_list))[::-1]
    print("removed " + str(len(cleaned_list) - len(bleached_list)) + " columns.")
    return bleached_list


def printL(l):
    s = ""
    for i in range(len(l)):
        for j in range(len(l[0])):
            if (l[i][j] == 0):
                s += "0"
            else:
                s += "X"
        s += "\n"
    return s


# Normalises the values of the dataset
# mini/maxi = minimal/maximal value we want to obtain for each value
def normaliser_bdd(dataset, mini, maxi):
    if (mini == maxi or maxi < mini):
        raise ValueError("invalid mini/maxi values")
    normalized_list = list(dataset)
    # for each entry
    for i in range(len(normalized_list)):
        for j in range(len(normalized_list[i])):
            # normalisation formula
            normalized_list[i][j] = (normalized_list[i][j] - mini) / (maxi - mini)
    return normalized_list


a = loadD.loadData("Starting_Kit/sample_data/cifar10_train.data")
nettoyer_bdd(a.getData())
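# normaliser_bdd applies plain min-max scaling, value' = (value - mini) / (maxi - mini),
# with bounds supplied by the caller. A minimal usage sketch of that same formula on a
# toy list, assuming hypothetical bounds mini=0 and maxi=255 (raw pixel range); this is
# illustrative only and not part of the original script.
sample = [[0, 128, 255], [64, 32, 16]]
normalised = [[(v - 0) / (255 - 0) for v in row] for row in sample]
print(normalised)  # [[0.0, 0.501..., 1.0], [0.250..., 0.125..., 0.062...]]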
# main program started here.
maxLoad = 140
terminateTime = 12 * 60.0
filterAngle = 30.0
siteSearchRange = 1000  # only search sites within 5km
maxDepth = 10           # search depth
stopCount = 10000       # for each search, stop when we already got enough results
debug = 1

startTime = time.time()
(locations, orders) = loadData('../original_data')

if debug == 1:
    xgap = 1200
    ygap = 1200
    shift = 8000
    locations = locations[(locations.x < xgap + shift) & (locations.x > -xgap + shift) &
                          (locations.y < ygap + shift) & (locations.y > -ygap + shift)]
    orders = orders[(orders.ox < xgap + shift) & (orders.ox > -xgap + shift) &
                    (orders.oy < ygap + shift) & (orders.oy > -ygap + shift) &
                    (orders.dx < xgap + shift) & (orders.dx > -xgap + shift) &
                    (orders.dy < ygap + shift) & (orders.dy > -ygap + shift)]

sites = locations[locations['location_type'] == 'sites']
shops = locations[locations['location_type'] == 'shops']
spots = locations[locations['location_type'] == 'spots']
numOfSites = len(sites)
numOfOrders = len(orders)
normalOrders = orders[orders['order_type'] == 0]
            lines += line
    with open(path + filename, 'wt') as fout:  # write back in text mode, without blank lines
        fout.write(lines)


def array2CSV_Once(matrix, indexName, path='D:\桌面\ELM' + '\\', filename='test.csv'):
    # in Python 2, file can be used instead of open
    with open(path + filename, "wt") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        # write the column names first
        writer.writerow(indexName)
        # use writerows to write multiple rows at once
        writer.writerows(matrix)
    with open(path + filename, 'rt') as fin:  # re-read the csv and drop the blank lines
        lines = ''
        for line in fin:
            if line != '\n':
                lines += line
    with open(path + filename, 'wt') as fout:  # write back in text mode, without blank lines
        fout.write(lines)


# test
from loadData import loadData
from numpy import mat

train = loadData()[0].A
array2CSV_Once(train, [])
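# The blank rows that array2CSV_Once strips out in a second pass come from letting
# text mode translate line endings on Windows. A sketch of the alternative the csv
# docs describe for Python 3: open the file with newline='' so the writer controls
# line endings and no re-read/re-write pass is needed. write_csv_no_blank_rows is a
# hypothetical helper name, not part of the original code.
import csv

def write_csv_no_blank_rows(rows, header, filepath):
    # newline='' hands line-ending handling to the csv module (Python 3 behaviour),
    # which avoids the extra blank row after each record on Windows.
    with open(filepath, 'w', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(header)
        writer.writerows(rows)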
# Counts number of games a team played, total
def addPlays(game):
    teams = [game["Home Team"]["Team"], game["Vis Team"]["Team"]]
    for team in teams:
        if team in numGames:
            numGames[team] += 1
        else:
            numGames[team] = 1


def printScores(num):
    ranked = rankScores(teamScores)
    for i in range(num):
        print("{}: {}, {:.5f}".format(i + 1, ranked[i][0], ranked[i][1]))


games = loadData('wagstatscfb2014.csv')
teamScores = {}
numGames = {}

for game in games:
    addPlays(game)

    home = game["Home Team"]
    away = game["Vis Team"]
    weight = 1.0 / game["Week"]
    homeTeam = home["Team"]
    awayTeam = away["Team"]
    homeTotal = 0