def dualProblem(inputFiles):
    """Cross-validate the dual-form regression on every input data set.

    For each file the last column is used as the target and the remaining
    columns as features.  Every fold calls ``gaussian_function``,
    ``computeAlpha`` and ``computeTheta`` on the training part, predicts with
    ``np.dot(X, theta)`` and scores the predictions with ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    errors = []
    start_time = time.time()
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        # The target is the last *column*.  The previous ``data[:-1]`` dropped
        # the last row instead, which left Y two-dimensional and one sample
        # shorter than X.
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            # NOTE(review): both helpers receive 0.05; presumably kernel width
            # and regularisation strength respectively -- confirm.
            G = gaussian_function(X[train], 0.05)
            print("Done with G!")
            alpha = computeAlpha(G, Y[train], 0.05)
            theta = computeTheta(alpha, X[train])
            Y_hat = np.dot(X[train], theta)
            Y_hat_test = np.dot(X[test], theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        # Average the accumulated fold errors.
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    # Elapsed time is "now - start"; the reversed subtraction reported a
    # negative duration.
    time_taken = time.time() - start_time
    print("Time Taken for all data sets: %s" % str(time_taken))
    return np.asarray(errors)
def newtonRaphson(inputFiles):
    """Cross-validate an iteratively fitted degree-2 polynomial regression.

    For each file the last column is the target.  Every fold expands the
    training features with ``PolynomialFeatures(2)`` and repeatedly calls
    ``recalculateTheta`` until the mean change of the coefficient vector is
    at most 1e-15 in magnitude, then scores train and test predictions with
    ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    pol = PolynomialFeatures(2)
    errors = []
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Start from a defined point.  ``np.empty`` returned whatever was
            # in memory, so runs were not reproducible and a stray NaN made
            # the loop below exit immediately with a NaN theta.
            theta = np.zeros(Z.shape[1], dtype='float')
            meanDiff = 1.0
            # NOTE(review): the stopping test uses the *signed* mean of the
            # update, so opposite-signed changes can cancel and stop the
            # iteration early.  Left unchanged because a stricter criterion
            # at 1e-15 may never be met in floating point -- revisit together
            # with an iteration cap.
            while abs(meanDiff) > 1.0e-15:
                theta_new = recalculateTheta(theta, Z, Y[train])
                meanDiff = np.mean(np.subtract(theta_new, theta))
                theta = theta_new
            Z_test = pol.fit_transform(X[test])
            Y_hat_test = np.dot(Z_test, theta)
            Y_hat = np.dot(Z, theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        # Average the accumulated fold errors.
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    return np.asarray(errors)
def polyRegressionKFold(inputFiles, deg=2):
    """Cross-validate a closed-form polynomial regression of degree ``deg``.

    Each data set is sorted by its first column, split into features (all but
    the last column) and target (last column), and evaluated with a shuffled
    10-fold split.  Per-file mean train/test MSE and the total elapsed time
    are printed.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        deg: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        ``np.ndarray`` holding one ``[mean test error, deg]`` row per file.
    """
    print("***************************")
    print("Degree: %s" % deg)
    start_time = time.time()
    errors = []
    # The expander depends only on ``deg``; build it once, not once per fold.
    pol = PolynomialFeatures(deg)
    for File in inputFiles:
        print("___________________________")
        print("Data Set: %s" % File)
        data = tools.readData(File)
        data = data[np.argsort(data[:, 0])]
        X = data[:, :-1]
        # Last column is the target; ``-1`` avoids peeking at row 1 just to
        # count columns (which failed for a single-row data set).
        Y = data[:, -1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            Z_test = pol.fit_transform(X[test])
            theta = regress(Z, Y[train])
            Y_hat = np.dot(Z, theta)
            Y_hat_test = np.dot(Z_test, theta)
            TrainError += mean_squared_error(Y[train], Y_hat)
            TestError += mean_squared_error(Y[test], Y_hat_test)
        TestError /= len(kf)
        TrainError /= len(kf)
        errors.append([TestError, deg])
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
    # Elapsed time is "now - start"; the reversed subtraction reported a
    # negative duration.
    time_taken = time.time() - start_time
    print("Time Taken for primal: %s" % str(time_taken))
    return np.asarray(errors)
def dualProblem(inputFiles):
    """Cross-validate the dual-form regression on every input data set.

    For each file the last column is used as the target and the remaining
    columns as features.  Every fold calls ``gaussian_function``,
    ``computeAlpha`` and ``computeTheta`` on the training part, predicts with
    ``np.dot(X, theta)`` and scores the predictions with ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    errors = []
    start_time = time.time()
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        # The target is the last *column*.  The previous ``data[:-1]`` dropped
        # the last row instead, which left Y two-dimensional and one sample
        # shorter than X.
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            # NOTE(review): both helpers receive 0.05; presumably kernel width
            # and regularisation strength respectively -- confirm.
            G = gaussian_function(X[train], 0.05)
            print("Done with G!")
            alpha = computeAlpha(G, Y[train], 0.05)
            theta = computeTheta(alpha, X[train])
            Y_hat = np.dot(X[train], theta)
            Y_hat_test = np.dot(X[test], theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        # Average the accumulated fold errors.
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    # Elapsed time is "now - start"; the reversed subtraction reported a
    # negative duration.
    time_taken = time.time() - start_time
    print("Time Taken for all data sets: %s" % str(time_taken))
    return np.asarray(errors)
def newtonRaphson(inputFiles):
    """Cross-validate an iteratively fitted degree-2 polynomial regression.

    For each file the last column is the target.  Every fold expands the
    training features with ``PolynomialFeatures(2)`` and repeatedly calls
    ``recalculateTheta`` until the mean change of the coefficient vector is
    at most 1e-15 in magnitude, then scores train and test predictions with
    ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    pol = PolynomialFeatures(2)
    errors = []
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Start from a defined point.  ``np.empty`` returned whatever was
            # in memory, so runs were not reproducible and a stray NaN made
            # the loop below exit immediately with a NaN theta.
            theta = np.zeros(Z.shape[1], dtype='float')
            meanDiff = 1.0
            # NOTE(review): the stopping test uses the *signed* mean of the
            # update, so opposite-signed changes can cancel and stop the
            # iteration early.  Left unchanged because a stricter criterion
            # at 1e-15 may never be met in floating point -- revisit together
            # with an iteration cap.
            while abs(meanDiff) > 1.0e-15:
                theta_new = recalculateTheta(theta, Z, Y[train])
                meanDiff = np.mean(np.subtract(theta_new, theta))
                theta = theta_new
            Z_test = pol.fit_transform(X[test])
            Y_hat_test = np.dot(Z_test, theta)
            Y_hat = np.dot(Z, theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        # Average the accumulated fold errors.
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    return np.asarray(errors)
def linearRegressionKFold(inputFiles, i=1):
    """Cross-validate a single-variable polynomial fit of degree ``i``.

    Column 0 of each data set is the input and column 1 the target.  A
    shuffled 10-fold split is evaluated, the mean train/test errors are
    printed, and ``py_linearRegression`` is then called on the full data set.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        i: polynomial degree handed to ``tools.createZ``.

    Returns:
        Mean test error of the last processed file, or ``None`` when
        ``inputFiles`` is empty.
    """
    print("\nSingle Variable, Degree: %s" % i)
    print("###########################")
    # Defined up front so an empty ``inputFiles`` returns None instead of
    # raising on an unbound local.
    TestError = None
    for File in inputFiles:
        print("===========================")
        print("Data Set %s" % File)
        data = tools.readData(File)
        X = data[:, 0]
        Y = data[:, 1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = tools.createZ(X[train], i)
            theta = regress(Z, Y[train])
            Y_hat = YHat(theta, X[train])
            Y_hat_test = YHat(theta, X[test])
            # Score the *predictions*.  The coefficient vector ``theta`` was
            # previously passed here, leaving both Y_hat values unused and
            # the reported errors meaningless.
            TrainError = TrainError + tools.findError(Y_hat, Y[train])
            TestError = TestError + tools.findError(Y_hat_test, Y[test])
        TestError = TestError / len(kf)
        TrainError = TrainError / len(kf)
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
        py_linearRegression(X, Y)
    return TestError
def polyRegressionKFold(inputFiles, deg=2):
    """Cross-validate a closed-form polynomial regression of degree ``deg``.

    Each data set is sorted by its first column, split into features (all but
    the last column) and target (last column), and evaluated with a shuffled
    10-fold split.  Per-file mean train/test MSE and the total elapsed time
    are printed.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        deg: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        ``np.ndarray`` holding one ``[mean test error, deg]`` row per file.
    """
    print("***************************")
    print("Degree: %s" % deg)
    start_time = time.time()
    errors = []
    # The expander depends only on ``deg``; build it once, not once per fold.
    pol = PolynomialFeatures(deg)
    for File in inputFiles:
        print("___________________________")
        print("Data Set: %s" % File)
        data = tools.readData(File)
        data = data[np.argsort(data[:, 0])]
        X = data[:, :-1]
        # Last column is the target; ``-1`` avoids peeking at row 1 just to
        # count columns (which failed for a single-row data set).
        Y = data[:, -1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            Z_test = pol.fit_transform(X[test])
            theta = regress(Z, Y[train])
            Y_hat = np.dot(Z, theta)
            Y_hat_test = np.dot(Z_test, theta)
            TrainError += mean_squared_error(Y[train], Y_hat)
            TestError += mean_squared_error(Y[test], Y_hat_test)
        TestError /= len(kf)
        TrainError /= len(kf)
        errors.append([TestError, deg])
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
    # Elapsed time is "now - start"; the reversed subtraction reported a
    # negative duration.
    time_taken = time.time() - start_time
    print("Time Taken for primal: %s" % str(time_taken))
    return np.asarray(errors)
def linearRegression(inputFiles, i=1, quarters=4, dataReduction=False):
    """Plot single-variable polynomial fits, one subplot per data set.

    Column 0 of each data set is the input and column 1 the target.  Each
    subplot shows the raw points (green) and the fit on all rows (red).  With
    ``dataReduction`` set, a second fit trained only on the first
    ``quarters`` quarters of the rows is drawn in blue.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``; subplots are laid out on a 2x2 grid.
        i: polynomial degree handed to ``tools.createZ``.
        quarters: how many quarters of the rows the reduced fit may use.
        dataReduction: draw the reduced-data fit instead of fitting the
            scikit-learn reference model.

    Returns:
        None.  The figure is displayed with ``plt.show()``.
    """
    regr = linear_model.LinearRegression(fit_intercept=False)
    for k, File in enumerate(inputFiles, 1):
        data = tools.readData(File)
        # Keep the sorted rows.  The result of this indexing used to be
        # discarded, so the fitted curve was drawn through x values in file
        # order.
        data = data[np.argsort(data[:, 0])]
        # Floor division keeps ``limit`` an integer slice bound.
        limit = quarters * (len(data) // 4)
        X = data[:, 0]
        Y = data[:, 1]
        Z = tools.createZ(X, i)
        theta = regress(Z, Y)
        Y_hat = YHat(theta, X)
        plt.subplot(2, 2, k)
        plt.scatter(X, Y, color="green")
        plt.plot(X, Y_hat, color="red", lw=3, label="Original Method")
        if dataReduction:
            # Refit on the leading part of the rows, predict over all of them.
            Z = tools.createZ(data[0:limit, 0], i)
            theta = regress(Z, data[0:limit, 1])
            Y_hat_small = YHat(theta, X)
            plt.plot(X, Y_hat_small, color="blue", lw=1,
                     label="Reduced Data Set")
            plt.title("Reduced Data %sn/4" % quarters)
        else:
            # The reference model is fitted but its prediction is currently
            # not drawn.
            regr.fit(Z, Y)
    plt.suptitle("Single Variable Degree: %s" % i)
    plt.show()
def plotData(inputFiles):
    """Scatter-plot the first two columns of every data set on a 2x2 grid.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``; the n-th file goes into subplot n.

    Returns:
        None.  The figure is displayed with ``plt.show()``.
    """
    for position, path in enumerate(inputFiles, 1):
        points = tools.readData(path)
        plt.subplot(2, 2, position)
        plt.scatter(points[:, 0], points[:, 1], color="black")
    plt.show()
def polyRegression(inputFiles):
    """Cross-validate a closed-form degree-2 polynomial regression.

    Rows of each data set are ordered by their first column; the last column
    is the target and the rest are features.  An unshuffled 10-fold split is
    scored with ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    expander = PolynomialFeatures(2)
    results = []
    for path in inputFiles:
        table = tools.readData(path)
        table = table[np.argsort(table[:, 0])]
        features, targets = table[:, :-1], table[:, -1]
        folds = KFold(len(targets), n_folds=10)
        fit_total = 0
        held_out_total = 0
        for fit_idx, held_idx in folds:
            design = expander.fit_transform(features[fit_idx])
            weights = regress(design, targets[fit_idx])
            fitted = np.dot(design, weights)
            design_held = expander.fit_transform(features[held_idx])
            predicted = np.dot(design_held, weights)
            fit_total += tools.findError(fitted, targets[fit_idx])
            held_out_total += tools.findError(predicted, targets[held_idx])
        # Turn the accumulated sums into per-fold means.
        fold_count = len(folds)
        results.append([fit_total / fold_count, held_out_total / fold_count])
    return np.asarray(results)
def polyRegression(inputFiles):
    """Cross-validate a closed-form degree-2 polynomial regression.

    Rows of each data set are ordered by their first column; the last column
    is the target and the rest are features.  An unshuffled 10-fold split is
    scored with ``tools.findError``.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``np.ndarray`` holding one ``[mean train error, mean test error]``
        row per input file.
    """
    expander = PolynomialFeatures(2)
    results = []
    for path in inputFiles:
        table = tools.readData(path)
        table = table[np.argsort(table[:, 0])]
        features, targets = table[:, :-1], table[:, -1]
        folds = KFold(len(targets), n_folds=10)
        fit_total = 0
        held_out_total = 0
        for fit_idx, held_idx in folds:
            design = expander.fit_transform(features[fit_idx])
            weights = regress(design, targets[fit_idx])
            fitted = np.dot(design, weights)
            design_held = expander.fit_transform(features[held_idx])
            predicted = np.dot(design_held, weights)
            fit_total += tools.findError(fitted, targets[fit_idx])
            held_out_total += tools.findError(predicted, targets[held_idx])
        # Turn the accumulated sums into per-fold means.
        fold_count = len(folds)
        results.append([fit_total / fold_count, held_out_total / fold_count])
    return np.asarray(results)
return data, errors if __name__ == "__main__": parser = OptionParser() parser.add_option("-l", "--lfile", dest="learnFile", help="Learning data (CSV file name)") parser.add_option("-t", "--tfile", dest="testFile", help="Testing data (CSV file name)") parser.add_option("-o", "--ofile", dest="outFile", help="Output file name to store testing data classification") (options, args) = parser.parse_args() learnFile = options.learnFile # e. g. vertebral_learn.csv testFile = options.testFile # e. g. vertebral_test.csv outFile = options.outFile # e. g. res learnData = readData(learnFile) c45= C45(learnData) tree = c45.constructTree() tree = c45.pruneTree(tree) printTree(tree) testData = readData(testFile) classifiedLearnData, learnErrors = classifyData(tree, learnData) classifiedTestData, testErrors = classifyData(tree, testData) writeData(classifiedTestData, outFile) print "Learning data error: %d/%d (%f)" % (learnErrors, len(learnData), float(learnErrors)/len(learnData)) print "Testing data error: %d/%d (%f)" % (testErrors, len(testData), float(testErrors)/len(testData))
def drawTree(dataFile, treeFile, colors=colors, save_to=''):
    '''Draw a 2D scatter plot of the data over the tree's decision regions.

    ``dataFile``: file name passed to ``readData``, or already loaded records
        of the form (att1, att2, class).
    ``treeFile``: file name passed to ``loadTree``, or a tree object that
        provides ``getClass([x, y])``.
    ``colors``: indexable by ``int(class)``; element 0 of an entry colours the
        data points, element 1 the background, element 2 labels the class in
        the title.
    ``save_to``: when non-empty the figure is saved to this path and the axes
        are cleared; otherwise the figure is shown.

    Returns 1 when the data does not have exactly two attributes, otherwise
    None.
    '''
    data = readData(dataFile) if isinstance(dataFile, str) else dataFile
    tree = loadTree(treeFile) if isinstance(treeFile, str) else treeFile

    attribute_count = len(data[0]) - 1
    if attribute_count != 2:
        # The check also rejects too *few* attributes, so the message no
        # longer claims there are too many.
        print("Unsupported attribute count (%d), could draw only 2D plots..."
              % attribute_count)
        return 1

    point_cons = 400  # number of grid steps per axis

    def padded_range(values):
        # Widen [min, max] by two grid steps on each side, measured on the
        # data range itself.  The earlier ``min + max`` based margin shrank
        # the window for negative coordinates, and ``not min_x`` mistook a
        # legitimate 0 for "no value yet".
        low = float(min(values))
        high = float(max(values))
        # All points may share one coordinate; fall back to a unit span so
        # the grid step below stays non-zero.
        span = (high - low) or 1.0
        margin = 2 * span / point_cons
        return low - margin, high + margin

    # Group the data points by class.
    point_dict = dict()
    for record in data:
        xs, ys = point_dict.setdefault(record[-1], [[], []])
        xs.append(record[0])
        ys.append(record[1])

    min_x, max_x = padded_range([record[0] for record in data])
    min_y, max_y = padded_range([record[1] for record in data])

    # Classify every grid cell to paint the decision regions.
    grid_ys = arange(min_y, max_y, (max_y - min_y) / point_cons)
    tree_dict = dict()
    for x in arange(min_x, max_x, (max_x - min_x) / point_cons):
        for y in grid_ys:
            xs, ys = tree_dict.setdefault(tree.getClass([x, y]), [[], []])
            xs.append(x)
            ys.append(y)
    for cls in tree_dict:
        plt.scatter(tree_dict[cls][0], tree_dict[cls][1], 2,
                    c=colors[int(cls)][1], marker='s', linewidth=0)

    # Draw data points on top of the background.
    for cls in point_dict:
        plt.scatter(point_dict[cls][0], point_dict[cls][1], 33,
                    c=colors[int(cls)][0], marker='o')

    plt.axis([min_x, max_x, min_y, max_y])
    # ``int(cls)`` matches the two lookups above.
    # NOTE(review): assumes classDistribution keys are the same class values
    # found in ``data`` -- confirm.
    plt.title(', '.join([colors[int(cls)][2] + ': ' + str(count)
                         for (cls, count) in classDistribution(data).items()]))

    if save_to:
        plt.savefig(save_to)
        plt.cla()
    else:
        plt.show()
parser.add_option("-t", "--tfile", dest="testFile", help="Testing data (CSV file name)") parser.add_option( "-o", "--ofile", dest="outFile", help="Output file name to store testing data classification") (options, args) = parser.parse_args() learnFile = options.learnFile # e. g. vertebral_learn.csv testFile = options.testFile # e. g. vertebral_test.csv outFile = options.outFile # e. g. res learnData = readData(learnFile) c45 = C45(learnData) tree = c45.constructTree() tree = c45.pruneTree(tree) printTree(tree) testData = readData(testFile) classifiedLearnData, learnErrors = classifyData(tree, learnData) classifiedTestData, testErrors = classifyData(tree, testData) writeData(classifiedTestData, outFile) print "Learning data error: %d/%d (%f)" % ( learnErrors, len(learnData), float(learnErrors) / len(learnData)) print "Testing data error: %d/%d (%f)" % (
import tools
import time

# Endless polling loop: call tools.readData(), then wait 600 seconds
# (ten minutes) before the next call.
while True:
    tools.readData()
    time.sleep(600)
def drawTree(dataFile, treeFile, colors=colors, save_to=''):
    '''Draw a 2D scatter plot of the data over the tree's decision regions.

    ``dataFile``: file name passed to ``readData``, or already loaded records
        of the form (att1, att2, class).
    ``treeFile``: file name passed to ``loadTree``, or a tree object that
        provides ``getClass([x, y])``.
    ``colors``: indexable by ``int(class)``; element 0 of an entry colours the
        data points, element 1 the background, element 2 labels the class in
        the title.
    ``save_to``: when non-empty the figure is saved to this path and the axes
        are cleared; otherwise the figure is shown.

    Returns 1 when the data does not have exactly two attributes, otherwise
    None.
    '''
    data = readData(dataFile) if isinstance(dataFile, str) else dataFile
    tree = loadTree(treeFile) if isinstance(treeFile, str) else treeFile

    attribute_count = len(data[0]) - 1
    if attribute_count != 2:
        # The check also rejects too *few* attributes, so the message no
        # longer claims there are too many.
        print("Unsupported attribute count (%d), could draw only 2D plots..."
              % attribute_count)
        return 1

    point_cons = 400  # number of grid steps per axis

    def padded_range(values):
        # Widen [min, max] by two grid steps on each side, measured on the
        # data range itself.  The earlier ``min + max`` based margin shrank
        # the window for negative coordinates, and ``not min_x`` mistook a
        # legitimate 0 for "no value yet".
        low = float(min(values))
        high = float(max(values))
        # All points may share one coordinate; fall back to a unit span so
        # the grid step below stays non-zero.
        span = (high - low) or 1.0
        margin = 2 * span / point_cons
        return low - margin, high + margin

    # Group the data points by class.
    point_dict = dict()
    for record in data:
        xs, ys = point_dict.setdefault(record[-1], [[], []])
        xs.append(record[0])
        ys.append(record[1])

    min_x, max_x = padded_range([record[0] for record in data])
    min_y, max_y = padded_range([record[1] for record in data])

    # Classify every grid cell to paint the decision regions.
    grid_ys = arange(min_y, max_y, (max_y - min_y) / point_cons)
    tree_dict = dict()
    for x in arange(min_x, max_x, (max_x - min_x) / point_cons):
        for y in grid_ys:
            xs, ys = tree_dict.setdefault(tree.getClass([x, y]), [[], []])
            xs.append(x)
            ys.append(y)
    for cls in tree_dict:
        plt.scatter(tree_dict[cls][0], tree_dict[cls][1], 2,
                    c=colors[int(cls)][1], marker='s', linewidth=0)

    # Draw data points on top of the background.
    for cls in point_dict:
        plt.scatter(point_dict[cls][0], point_dict[cls][1], 33,
                    c=colors[int(cls)][0], marker='o')

    plt.axis([min_x, max_x, min_y, max_y])
    # ``int(cls)`` matches the two lookups above.
    # NOTE(review): assumes classDistribution keys are the same class values
    # found in ``data`` -- confirm.
    plt.title(', '.join([colors[int(cls)][2] + ': ' + str(count)
                         for (cls, count) in classDistribution(data).items()]))

    if save_to:
        plt.savefig(save_to)
        plt.cla()
    else:
        plt.show()