Exemplo n.º 1
0
def dualProblem(inputFiles):
    """Run 10-fold cross-validation of the dual-problem regression per data set.

    Each file is loaded with ``tools.readData``; every column except the last
    is used as features and the last column as the target. For each fold the
    model is fitted on the training rows via ``gaussian_function``,
    ``computeAlpha`` and ``computeTheta``; predictions are ``X . theta`` and
    the train/test errors from ``tools.findError`` are averaged over the folds.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, in input order.

    Side effects:
        Prints a progress line per fold and the total elapsed time.
    """
    errors = []
    start_time = time.time()
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        # The target is the last *column*; ``data[:-1]`` would instead drop the
        # last row and keep every column.
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            # NOTE(review): the two 0.05 literals are presumably the kernel
            # width and the regularisation strength -- confirm with the helpers.
            G = gaussian_function(X[train], 0.05)
            print("Done with G!")
            alpha = computeAlpha(G, Y[train], 0.05)
            theta = computeTheta(alpha, X[train])
            Y_hat = np.dot(X[train], theta)
            Y_hat_test = np.dot(X[test], theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    # Elapsed time is "now - start"; the reverse order yields a negative value.
    time_taken = time.time() - start_time
    print("Time Taken for all data sets: %s" % str(time_taken))
    return np.asarray(errors)
Exemplo n.º 2
0
def newtonRaphson(inputFiles):
    """Cross-validate an iteratively fitted degree-2 polynomial regression.

    Each file is loaded with ``tools.readData`` (last column = target, the
    rest = features) and split into 10 folds. Within a fold the training
    features are expanded with ``PolynomialFeatures(2)`` and ``theta`` is
    refined by repeated calls to ``recalculateTheta`` until the mean absolute
    change of the coefficients is no larger than ``1e-15`` or the iteration
    cap is reached.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, each averaged over the folds.
    """
    pol = PolynomialFeatures(2)
    tolerance = 1.0e-15
    # Safety net: a tolerance this tight may never be met in floating point.
    max_iterations = 1000
    errors = []
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Start from a well-defined point; np.empty returns whatever bytes
            # happen to be in memory, making the fit non-reproducible.
            theta = np.zeros(Z.shape[1], dtype='float')
            for _ in range(max_iterations):
                theta_new = recalculateTheta(theta, Z, Y[train])
                # Use the absolute change: signed differences can cancel out
                # and fake convergence while coefficients are still moving.
                step = np.mean(np.abs(np.subtract(theta_new, theta)))
                theta = theta_new
                # "not >" (rather than "<=") also stops on NaN, as before.
                if not step > tolerance:
                    break
            # Reuse the expansion fitted on the training fold for the test fold.
            Z_test = pol.transform(X[test])
            Y_hat_test = np.dot(Z_test, theta)
            Y_hat = np.dot(Z, theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    return np.asarray(errors)
Exemplo n.º 3
0
def polyRegressionKFold(inputFiles, deg=2):
    """Cross-validate polynomial regression of degree ``deg`` on each data set.

    Each file is loaded with ``tools.readData``, its rows are sorted by the
    first column, and a shuffled 10-fold split is evaluated. Per fold the
    features are expanded with ``PolynomialFeatures(deg)``, ``regress`` fits
    ``theta`` on the training rows, and the mean squared error on the train
    and test rows is accumulated.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        deg: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        ``numpy`` array with one ``[TestError, deg]`` row per input file.

    Side effects:
        Prints the degree, per-file test/train errors and the elapsed time.
    """
    print("***************************")
    print("Degree: %s" % deg)
    start_time = time.time()
    errors = []
    pol = PolynomialFeatures(deg)
    for File in inputFiles:
        print("___________________________")
        print("Data Set: %s" % File)
        data = tools.readData(File)
        data = data[np.argsort(data[:, 0])]
        X = data[:, :-1]
        # Last column is the target; -1 also works for single-row data.
        Y = data[:, -1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Apply the expansion fitted on the training fold to the test fold.
            Z_test = pol.transform(X[test])
            theta = regress(Z, Y[train])
            Y_hat = np.dot(Z, theta)
            Y_hat_test = np.dot(Z_test, theta)
            TrainError += mean_squared_error(Y[train], Y_hat)
            TestError += mean_squared_error(Y[test], Y_hat_test)
        TestError /= len(kf)
        TrainError /= len(kf)
        errors.append([TestError, deg])
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
    # Elapsed time is "now - start"; the reverse order yields a negative value.
    time_taken = time.time() - start_time
    print("Time Taken for primal: %s" % str(time_taken))
    return np.asarray(errors)
Exemplo n.º 4
0
def dualProblem(inputFiles):
    """Run 10-fold cross-validation of the dual-problem regression per data set.

    Each file is loaded with ``tools.readData``; every column except the last
    is used as features and the last column as the target. For each fold the
    model is fitted on the training rows via ``gaussian_function``,
    ``computeAlpha`` and ``computeTheta``; predictions are ``X . theta`` and
    the train/test errors from ``tools.findError`` are averaged over the folds.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, in input order.

    Side effects:
        Prints a progress line per fold and the total elapsed time.
    """
    errors = []
    start_time = time.time()
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        # The target is the last *column*; ``data[:-1]`` would instead drop the
        # last row and keep every column.
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            # NOTE(review): the two 0.05 literals are presumably the kernel
            # width and the regularisation strength -- confirm with the helpers.
            G = gaussian_function(X[train], 0.05)
            print("Done with G!")
            alpha = computeAlpha(G, Y[train], 0.05)
            theta = computeTheta(alpha, X[train])
            Y_hat = np.dot(X[train], theta)
            Y_hat_test = np.dot(X[test], theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    # Elapsed time is "now - start"; the reverse order yields a negative value.
    time_taken = time.time() - start_time
    print("Time Taken for all data sets: %s" % str(time_taken))
    return np.asarray(errors)
Exemplo n.º 5
0
def newtonRaphson(inputFiles):
    """Cross-validate an iteratively fitted degree-2 polynomial regression.

    Each file is loaded with ``tools.readData`` (last column = target, the
    rest = features) and split into 10 folds. Within a fold the training
    features are expanded with ``PolynomialFeatures(2)`` and ``theta`` is
    refined by repeated calls to ``recalculateTheta`` until the mean absolute
    change of the coefficients is no larger than ``1e-15`` or the iteration
    cap is reached.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, each averaged over the folds.
    """
    pol = PolynomialFeatures(2)
    tolerance = 1.0e-15
    # Safety net: a tolerance this tight may never be met in floating point.
    max_iterations = 1000
    errors = []
    for File in inputFiles:
        data = tools.readData(File)
        X = data[:, :-1]
        Y = data[:, -1]
        kf = KFold(len(Y), n_folds=10)
        trainError = 0
        testError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Start from a well-defined point; np.empty returns whatever bytes
            # happen to be in memory, making the fit non-reproducible.
            theta = np.zeros(Z.shape[1], dtype='float')
            for _ in range(max_iterations):
                theta_new = recalculateTheta(theta, Z, Y[train])
                # Use the absolute change: signed differences can cancel out
                # and fake convergence while coefficients are still moving.
                step = np.mean(np.abs(np.subtract(theta_new, theta)))
                theta = theta_new
                # "not >" (rather than "<=") also stops on NaN, as before.
                if not step > tolerance:
                    break
            # Reuse the expansion fitted on the training fold for the test fold.
            Z_test = pol.transform(X[test])
            Y_hat_test = np.dot(Z_test, theta)
            Y_hat = np.dot(Z, theta)
            trainError += tools.findError(Y_hat, Y[train])
            testError += tools.findError(Y_hat_test, Y[test])
        trainError = trainError / len(kf)
        testError = testError / len(kf)
        errors.append([trainError, testError])
    return np.asarray(errors)
Exemplo n.º 6
0
def linearRegressionKFold(inputFiles, i=1):
    """Cross-validate single-variable polynomial regression of degree ``i``.

    Each file is loaded with ``tools.readData``; column 0 is the input and
    column 1 the target. A shuffled 10-fold split is evaluated: per fold
    ``tools.createZ`` builds the design matrix, ``regress`` fits ``theta`` and
    ``YHat`` produces predictions whose error is measured with
    ``tools.findError``. After the folds, ``py_linearRegression(X, Y)`` is
    called on the full data set.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        i: polynomial degree passed to ``tools.createZ``.

    Returns:
        The fold-averaged test error of the *last* file processed, or
        ``None`` when ``inputFiles`` is empty.

    Side effects:
        Prints per-file test and train errors.
    """
    print("\nSingle Variable, Degree: %s" % i)
    print("###########################")

    # Defined up front so an empty ``inputFiles`` returns None instead of
    # raising on an unbound local.
    TestError = None
    for File in inputFiles:
        print("===========================")
        print("Data Set %s" % File)
        data = tools.readData(File)
        X = data[:, 0]
        Y = data[:, 1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = tools.createZ(X[train], i)
            theta = regress(Z, Y[train])
            Y_hat = YHat(theta, X[train])
            Y_hat_test = YHat(theta, X[test])
            # Score the predictions against the targets; passing ``theta``
            # here would compare coefficients with target values.
            TrainError = TrainError + tools.findError(Y_hat, Y[train])
            TestError = TestError + tools.findError(Y_hat_test, Y[test])
        TestError = TestError / len(kf)
        TrainError = TrainError / len(kf)
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
        py_linearRegression(X, Y)
    return TestError
Exemplo n.º 7
0
def polyRegressionKFold(inputFiles, deg=2):
    """Cross-validate polynomial regression of degree ``deg`` on each data set.

    Each file is loaded with ``tools.readData``, its rows are sorted by the
    first column, and a shuffled 10-fold split is evaluated. Per fold the
    features are expanded with ``PolynomialFeatures(deg)``, ``regress`` fits
    ``theta`` on the training rows, and the mean squared error on the train
    and test rows is accumulated.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        deg: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        ``numpy`` array with one ``[TestError, deg]`` row per input file.

    Side effects:
        Prints the degree, per-file test/train errors and the elapsed time.
    """
    print("***************************")
    print("Degree: %s" % deg)
    start_time = time.time()
    errors = []
    pol = PolynomialFeatures(deg)
    for File in inputFiles:
        print("___________________________")
        print("Data Set: %s" % File)
        data = tools.readData(File)
        data = data[np.argsort(data[:, 0])]
        X = data[:, :-1]
        # Last column is the target; -1 also works for single-row data.
        Y = data[:, -1]
        kf = KFold(len(data), n_folds=10, shuffle=True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            Z = pol.fit_transform(X[train])
            # Apply the expansion fitted on the training fold to the test fold.
            Z_test = pol.transform(X[test])
            theta = regress(Z, Y[train])
            Y_hat = np.dot(Z, theta)
            Y_hat_test = np.dot(Z_test, theta)
            TrainError += mean_squared_error(Y[train], Y_hat)
            TestError += mean_squared_error(Y[test], Y_hat_test)
        TestError /= len(kf)
        TrainError /= len(kf)
        errors.append([TestError, deg])
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)
    # Elapsed time is "now - start"; the reverse order yields a negative value.
    time_taken = time.time() - start_time
    print("Time Taken for primal: %s" % str(time_taken))
    return np.asarray(errors)
Exemplo n.º 8
0
def linearRegression(inputFiles, i=1, quarters=4, dataReduction=False):
    """Fit and plot single-variable polynomial regression for each data set.

    Each file is loaded with ``tools.readData`` and sorted by column 0 (the
    input); column 1 is the target. The data and the fitted curve are drawn
    in successive cells of a 2x2 subplot grid, so at most four files can be
    shown. With ``dataReduction`` set, a second model is fitted on only the
    first ``quarters``/4 of the sorted rows and drawn over the full range.

    Args:
        inputFiles: iterable of data-set identifiers accepted by
            ``tools.readData``.
        i: polynomial degree passed to ``tools.createZ``.
        quarters: number of quarters of the (sorted) rows used for the
            reduced fit.
        dataReduction: when true, also fit and plot the reduced-data model.

    Side effects:
        Draws on the current matplotlib figure and calls ``plt.show()``.
    """
    regr = linear_model.LinearRegression(fit_intercept=False)
    for k, File in enumerate(inputFiles, 1):
        data = tools.readData(File)
        # Indexing returns a sorted copy; it must be assigned back, otherwise
        # the line plots zig-zag and the "first quarters" slice is arbitrary.
        data = data[np.argsort(data[:, 0])]
        X = data[:, 0]
        Y = data[:, 1]
        # Floor division keeps the slice bound an integer on every Python.
        limit = quarters * (len(data) // 4)
        Z = tools.createZ(X, i)
        theta = regress(Z, Y)
        Y_hat = YHat(theta, X)
        plt.subplot(2, 2, k)
        plt.scatter(X, Y, color="green")
        plt.plot(X, Y_hat, color="red", lw=3, label="Original Method")
        if dataReduction:
            Z_small = tools.createZ(X[:limit], i)
            theta_small = regress(Z_small, Y[:limit])
            Y_hat_small = YHat(theta_small, X)
            plt.plot(X, Y_hat_small, color="blue", lw=1,
                     label="Reduced Data Set")
            plt.title("Reduced Data %sn/4" % quarters)
        else:
            # Reference fit with scikit-learn; its prediction is currently
            # not drawn.
            regr.fit(Z, Y)

    plt.suptitle("Single Variable Degree: %s" % i)
    plt.show()
Exemplo n.º 9
0
def plotData(inputFiles):
    """Scatter-plot the first two columns of every data set.

    Each file is read with ``tools.readData`` and drawn in black in the next
    cell of a 2x2 subplot grid (cells are numbered from 1 in input order);
    the figure is shown once all files have been drawn.
    """
    for cell, File in enumerate(inputFiles, 1):
        points = tools.readData(File)
        plt.subplot(2, 2, cell)
        plt.scatter(points[:, 0], points[:, 1], color="black")
    plt.show()
Exemplo n.º 10
0

        
Exemplo n.º 11
0
def polyRegression(inputFiles):
    """Cross-validate degree-2 polynomial regression on each data set.

    Every file is read with ``tools.readData`` and sorted by its first
    column; the last column is the target and the remaining columns are the
    features. For each of 10 folds the features are expanded with
    ``PolynomialFeatures(2)``, ``regress`` fits ``theta`` on the training
    rows, and ``tools.findError`` scores the train and test predictions.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, each value averaged over the folds.
    """
    expander = PolynomialFeatures(2)
    results = []
    for Files in inputFiles:
        table = tools.readData(Files)
        table = table[np.argsort(table[:, 0])]
        features, target = table[:, :-1], table[:, -1]
        folds = KFold(len(target), n_folds=10)
        train_scores = []
        test_scores = []
        for train, test in folds:
            design = expander.fit_transform(features[train])
            theta = regress(design, target[train])
            fitted = np.dot(design, theta)
            design_test = expander.fit_transform(features[test])
            predicted = np.dot(design_test, theta)
            train_scores.append(tools.findError(fitted, target[train]))
            test_scores.append(tools.findError(predicted, target[test]))
        # sum() starts from 0 and adds in fold order, like a running total.
        mean_test = sum(test_scores) / len(folds)
        mean_train = sum(train_scores) / len(folds)
        results.append([mean_train, mean_test])
    return np.asarray(results)
Exemplo n.º 12
0
def polyRegression(inputFiles):
    """Cross-validate degree-2 polynomial regression on each data set.

    Every file is read with ``tools.readData`` and sorted by its first
    column; the last column is the target and the remaining columns are the
    features. For each of 10 folds the features are expanded with
    ``PolynomialFeatures(2)``, ``regress`` fits ``theta`` on the training
    rows, and ``tools.findError`` scores the train and test predictions.

    Returns:
        ``numpy`` array with one ``[trainError, testError]`` row per input
        file, each value averaged over the folds.
    """
    expander = PolynomialFeatures(2)
    results = []
    for Files in inputFiles:
        table = tools.readData(Files)
        table = table[np.argsort(table[:, 0])]
        features, target = table[:, :-1], table[:, -1]
        folds = KFold(len(target), n_folds=10)
        train_scores = []
        test_scores = []
        for train, test in folds:
            design = expander.fit_transform(features[train])
            theta = regress(design, target[train])
            fitted = np.dot(design, theta)
            design_test = expander.fit_transform(features[test])
            predicted = np.dot(design_test, theta)
            train_scores.append(tools.findError(fitted, target[train]))
            test_scores.append(tools.findError(predicted, target[test]))
        # sum() starts from 0 and adds in fold order, like a running total.
        mean_test = sum(test_scores) / len(folds)
        mean_train = sum(train_scores) / len(folds)
        results.append([mean_train, mean_test])
    return np.asarray(results)
Exemplo n.º 13
0
    return data, errors

if __name__ == "__main__":
    # Script entry point: build a tree from the learning data with C45,
    # prune and print it, classify both data sets, write the classified
    # test data and report the error counts.
    parser = OptionParser()
    parser.add_option(
        "-l", "--lfile",
        dest="learnFile",
        help="Learning data (CSV file name)")
    parser.add_option(
        "-t", "--tfile",
        dest="testFile",
        help="Testing data (CSV file name)")
    parser.add_option(
        "-o", "--ofile",
        dest="outFile",
        help="Output file name to store testing data classification")
    options, args = parser.parse_args()

    # Example values: vertebral_learn.csv, vertebral_test.csv, res
    learnFile = options.learnFile
    testFile = options.testFile
    outFile = options.outFile

    learnData = readData(learnFile)
    c45 = C45(learnData)
    tree = c45.pruneTree(c45.constructTree())
    printTree(tree)

    testData = readData(testFile)

    classifiedLearnData, learnErrors = classifyData(tree, learnData)
    classifiedTestData, testErrors = classifyData(tree, testData)

    writeData(classifiedTestData, outFile)

    # Each report shows: errors / total records (error fraction).
    print("Learning data error: %d/%d (%f)" % (
        learnErrors, len(learnData), float(learnErrors) / len(learnData)))
    print("Testing data error: %d/%d (%f)" % (
        testErrors, len(testData), float(testErrors) / len(testData)))
Exemplo n.º 14
0
def drawTree(dataFile, treeFile, colors=colors, save_to=''):
    '''Draw a 2D plot of the data points over the tree's decision regions.

    ``dataFile``: a file name (loaded with ``readData``) or already-loaded
        records of the form (att1, att2, class).
    ``treeFile``: a file name (loaded with ``loadTree``) or a tree object
        providing ``getClass([x, y])``.
    ``colors``: indexable by ``int(class)``; entry ``[0]`` colours the data
        points, ``[1]`` the background and ``[2]`` is the name used in the
        title.
    ``save_to``: when non-empty the figure is saved to this path and the
        axes are cleared; otherwise the plot is shown.

    Returns 1 when the data is empty or does not have exactly two
    attributes, otherwise ``None``.
    '''
    data = readData(dataFile) if isinstance(dataFile, str) else dataFile
    tree = loadTree(treeFile) if isinstance(treeFile, str) else treeFile

    if len(data) == 0:
        print("No data to draw.")
        return 1

    attribute_count = len(data[0]) - 1
    if attribute_count != 2:
        print("Too many attributes (%d), could draw only 2D plots..." % attribute_count)
        return 1

    # Group the points by class and track the bounding box.
    point_dict = dict()
    min_x = max_x = min_y = max_y = None
    for record in data:
        vector, cls = record[:-1], record[-1]
        x, y = vector[0], vector[1]
        xs, ys = point_dict.setdefault(cls, [[], []])
        xs.append(x)
        ys.append(y)
        # Compare against None explicitly: a bound of 0 is a valid value and
        # must not be treated as "not set yet".
        if min_x is None or x < min_x:
            min_x = x
        if max_x is None or x > max_x:
            max_x = x
        if min_y is None or y < min_y:
            min_y = y
        if max_y is None or y > max_y:
            max_y = y

    # Pad the bounding box by two grid steps of the data span on each side.
    # The span (not min + max) is used so negative coordinates still widen
    # the box; a zero span falls back to 1.0 so the grid step stays positive.
    point_cons = 400
    pad_x = 2.0 * (float(max_x - min_x) or 1.0) / point_cons
    pad_y = 2.0 * (float(max_y - min_y) or 1.0) / point_cons
    min_x -= pad_x
    max_x += pad_x
    min_y -= pad_y
    max_y += pad_y

    # Classify every grid cell to paint the tree's decision regions.
    tree_dict = dict()
    grid_ys = arange(min_y, max_y, (max_y - min_y) / point_cons)
    for x in arange(min_x, max_x, (max_x - min_x) / point_cons):
        for y in grid_ys:
            cell_xs, cell_ys = tree_dict.setdefault(tree.getClass([x, y]),
                                                    [[], []])
            cell_xs.append(x)
            cell_ys.append(y)

    for cls in tree_dict:
        plt.scatter(tree_dict[cls][0],
                    tree_dict[cls][1],
                    2,
                    c=colors[int(cls)][1],
                    marker='s',
                    linewidth=0)

    # Draw data points:
    for cls in point_dict:
        plt.scatter(point_dict[cls][0],
                    point_dict[cls][1],
                    33,
                    c=colors[int(cls)][0],
                    marker='o')
    plt.axis([min_x, max_x, min_y, max_y])
    # Index colors with int(cls), consistent with the scatter calls above.
    plt.title(', '.join([
        colors[int(cls)][2] + ': ' + str(count)
        for (cls, count) in classDistribution(data).items()
    ]))
    if save_to:
        plt.savefig(save_to)
        plt.cla()
    else:
        plt.show()
Exemplo n.º 15
0
    parser.add_option("-t",
                      "--tfile",
                      dest="testFile",
                      help="Testing data (CSV file name)")
    parser.add_option(
        "-o",
        "--ofile",
        dest="outFile",
        help="Output file name to store testing data classification")
    (options, args) = parser.parse_args()

    learnFile = options.learnFile  # e. g. vertebral_learn.csv
    testFile = options.testFile  # e. g. vertebral_test.csv
    outFile = options.outFile  # e. g. res

    learnData = readData(learnFile)
    c45 = C45(learnData)
    tree = c45.constructTree()
    tree = c45.pruneTree(tree)
    printTree(tree)

    testData = readData(testFile)

    classifiedLearnData, learnErrors = classifyData(tree, learnData)
    classifiedTestData, testErrors = classifyData(tree, testData)

    writeData(classifiedTestData, outFile)

    print "Learning data error: %d/%d (%f)" % (
        learnErrors, len(learnData), float(learnErrors) / len(learnData))
    print "Testing data error: %d/%d (%f)" % (
Exemplo n.º 16
0
import tools
import time
# Poll forever: call tools.readData(), then pause ten minutes before the
# next call.
while True:
    tools.readData()
    time.sleep(10 * 60)
Exemplo n.º 17
0
def drawTree(dataFile, treeFile, colors=colors, save_to=''):
    '''Draw a 2D plot of the data points over the tree's decision regions.

    ``dataFile``: a file name (loaded with ``readData``) or already-loaded
        records of the form (att1, att2, class).
    ``treeFile``: a file name (loaded with ``loadTree``) or a tree object
        providing ``getClass([x, y])``.
    ``colors``: indexable by ``int(class)``; entry ``[0]`` colours the data
        points, ``[1]`` the background and ``[2]`` is the name used in the
        title.
    ``save_to``: when non-empty the figure is saved to this path and the
        axes are cleared; otherwise the plot is shown.

    Returns 1 when the data is empty or does not have exactly two
    attributes, otherwise ``None``.
    '''
    data = readData(dataFile) if isinstance(dataFile, str) else dataFile
    tree = loadTree(treeFile) if isinstance(treeFile, str) else treeFile

    if len(data) == 0:
        print("No data to draw.")
        return 1

    attribute_count = len(data[0]) - 1
    if attribute_count != 2:
        print("Too many attributes (%d), could draw only 2D plots..." % attribute_count)
        return 1

    # Group the points by class and track the bounding box.
    point_dict = dict()
    min_x = max_x = min_y = max_y = None
    for record in data:
        vector, cls = record[:-1], record[-1]
        x, y = vector[0], vector[1]
        xs, ys = point_dict.setdefault(cls, [[], []])
        xs.append(x)
        ys.append(y)
        # Compare against None explicitly: a bound of 0 is a valid value and
        # must not be treated as "not set yet".
        if min_x is None or x < min_x:
            min_x = x
        if max_x is None or x > max_x:
            max_x = x
        if min_y is None or y < min_y:
            min_y = y
        if max_y is None or y > max_y:
            max_y = y

    # Pad the bounding box by two grid steps of the data span on each side.
    # The span (not min + max) is used so negative coordinates still widen
    # the box; a zero span falls back to 1.0 so the grid step stays positive.
    point_cons = 400
    pad_x = 2.0 * (float(max_x - min_x) or 1.0) / point_cons
    pad_y = 2.0 * (float(max_y - min_y) or 1.0) / point_cons
    min_x -= pad_x
    max_x += pad_x
    min_y -= pad_y
    max_y += pad_y

    # Classify every grid cell to paint the tree's decision regions.
    tree_dict = dict()
    grid_ys = arange(min_y, max_y, (max_y - min_y) / point_cons)
    for x in arange(min_x, max_x, (max_x - min_x) / point_cons):
        for y in grid_ys:
            cell_xs, cell_ys = tree_dict.setdefault(tree.getClass([x, y]),
                                                    [[], []])
            cell_xs.append(x)
            cell_ys.append(y)

    for cls in tree_dict:
        plt.scatter(tree_dict[cls][0], tree_dict[cls][1], 2,
                    c=colors[int(cls)][1], marker='s', linewidth=0)

    # Draw data points:
    for cls in point_dict:
        plt.scatter(point_dict[cls][0], point_dict[cls][1], 33,
                    c=colors[int(cls)][0], marker='o')
    plt.axis([min_x, max_x, min_y, max_y])
    # Index colors with int(cls), consistent with the scatter calls above.
    plt.title(', '.join([colors[int(cls)][2] + ': ' + str(count)
                         for (cls, count) in classDistribution(data).items()]))
    if save_to:
        plt.savefig(save_to)
        plt.cla()
    else:
        plt.show()