Example #1
import collections
import util  # assumed course helper module: dotProduct, increment, evaluatePredictor

DEBUG = False  # assumption: module-level flag; set True to print training error each iteration

def learnPredictor(trainExamples, testExamples, featureExtractor):
    weights = collections.Counter()
    def loss(w, phi, y):
        # Hinge loss: max(1 - (w . phi) * y, 0)
        return max(1 - util.dotProduct(w, phi) * y, 0)
    
    eta = 0.1     # step size
    numIters = 3  # passes over the training data

    def sgradLoss(w, phi, y):
        # Subgradient of the hinge loss: zero when the margin is at least 1,
        # otherwise -y * phi (phi is mutated in place, which is safe here
        # because a fresh feature vector is extracted on every call)
        if loss(w, phi, y) == 0:
            return collections.Counter()
        for key in phi:
            phi[key] = -phi[key] * y
        return phi
    
    def predictor(x):
        if x is None:
            return -1
        # Classify by the sign of the score; labels are +1 / -1
        if util.dotProduct(featureExtractor(x), weights) > 0:
            return 1
        else:
            return -1

    for iteration in xrange(numIters):
        for x, y in trainExamples:
            if x is None:
                continue
            # Take a step opposite the subgradient of the hinge loss
            util.increment(weights, -eta, sgradLoss(weights, featureExtractor(x), y))
        
        if DEBUG:
            print util.evaluatePredictor(trainExamples, predictor) 
            #print util.evaluatePredictor(testExamples, predictor)
    
    return weights
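
# A minimal usage sketch with hypothetical toy data (assumes the util helpers
# above are importable); illustrative only, not part of the original example:
def wordFeatures(x):
    # Sparse bag-of-words feature vector
    phi = collections.Counter()
    for word in x.split():
        phi[word] += 1
    return phi

toyExamples = [('good movie', 1), ('bad movie', -1)]
toyWeights = learnPredictor(toyExamples, [], wordFeatures)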
Example #2
def trainAndEvaluate():
    """Trains a baseline predictor and prints its mean squared error.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train.csv')

    # Format the training data as a list of (input, output) tuples
    train_examples = []
    for row in train_array:
        # Build a list (not a one-shot generator) so the predictor can reuse it
        input_data = [row[j] for j in range(len(row) - 1)]
        output = row[80] / 1000.0  # output value (column 80), scaled down
        train_examples.append((input_data, output))

    # Define predictor functions for baseline and oracle
    baseline     = learnBaseline(train_array)
    oracle_train = learnOracle(train_examples)

    # Evaluate mean squared error of predictors
    baseline_error = evaluatePredictor(baseline, train_examples)
    oracle_error   = evaluatePredictor(oracle_train, train_examples)

    # Print the results
    print ""
    print "-------------------"
    print "BASELINE AND ORACLE"
    print "-------------------"
    print "Number of examples:    ", len(train_examples)
    print "Baseline (median) MSE: ", baseline_error
    print "Oracle MSE:            ", oracle_error
    print ""
Example #3
def trainAndTest():

    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')
    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    k_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[-1]  # last column holds the target value
        train_examples.append((feature_vector, output))
        k_examples.append(feature_vector)
    # Shuffle, then train a k-means model on each training split and
    # evaluate its error on the held-out validation segment
    random.shuffle(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        startTest = i * len(train_examples) / NUM_SPLITS
        endTest = (i + 1) * len(train_examples) / NUM_SPLITS
        currentTrainExamples = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)

        currentBoostedExamples = [(currentTrainExamples[ind][0],
                                   loss_list[ind])
                                  for ind in range(len(currentTrainExamples))]

        boostedRegPredictor = learnBoostedRegression(currentBoostedExamples,
                                                     500, 0.00000000001,
                                                     num_trees=NUM_B_TREES)

        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]
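        # Precomputing c.c exploits the expansion
        # ||x - c||^2 = x.x - 2*(c.x) + c.c, so only the cross term has to be
        # recomputed per query point below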

        def kmeanspredictor(x):
            # Assign x to its nearest centroid and predict that cluster's value
            assignment = 0
            min_dist = float('inf')
            for j in range(NUM_CLUSTERS):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[j], x) + pre_computed_centroid_dots[j]
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            return kmeanspredictor(x) + boostedRegPredictor(x)

        print "leaving out the", (
            i + 1
        ), "th segment of the data, the validation error for the regression is:", util.evaluatePredictor(
            boostedKPredictor, train_examples[startTest:endTest])
Example #4
def crossValidate(predictor, num_folds):
    """Performs k-fold cross validation on a specified predictor function and
    prints the results.

    Args:
        predictor (func): A predictor function.
        num_folds (int): Number of data folds for cross-validation.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')

    # Convert the training array into ([features], value) example tuples
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[-1]  # last column holds the target value
        train_examples.append((feature_vector, output))

    # Randomize the order of the example tuples to aid validation
    random.shuffle(train_examples)

    # Validation on each fold
    validation_set_size = len(train_examples) / num_folds
    for fold in range(num_folds):

        # Create training and validation sets
        validation_start = fold * validation_set_size
        validation_end = validation_start + validation_set_size
        validation_set = train_examples[validation_start:validation_end]
        training_set = train_examples[:validation_start] + train_examples[
            validation_end:]

        # Train a regression model on the training data and evaluate its mean
        # squared error with the validation set
        tuning_parameter = 1
        predictor_fn = predictor(training_set, 1, 0.01, tuning_parameter)
        regression_error = evaluatePredictor(predictor_fn, validation_set)

        # Print the results
        print ""
        print "----------"
        print "REGRESSION"
        print "----------"
        print "Lambda: ", tuning_parameter
        print "Number of examples: ", len(train_examples)
        print "Regression MSE:     ", regression_error
        print ""
Example #5
def r_squared(examples, predictor):
    # Residual sum of squares: mean squared error times the number of examples
    prediction_error = util.evaluatePredictor(predictor,
                                              examples) * len(examples)

    # Mean of the observed outputs
    outputs = [y for _, y in examples]
    mean = 1.0 * sum(outputs) / len(outputs)

    # Total sum of squares of the outputs about their mean
    variance = 0.0
    for y in outputs:
        variance += math.pow(y - mean, 2)

    print prediction_error / variance  # debug: residual / total ratio
    return 1 - (prediction_error / variance)
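
# Note: this is the coefficient of determination,
#   R^2 = 1 - SS_res / SS_tot,
# so a value near 1 means the predictor explains most of the output variance.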
Example #6
def trainAndEvaluate():
    """Trains a linear regression predictor and prints its mean squared error.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[-1]  # last column holds the target value
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    test = train_examples[:len(train_examples) / 10]
    train = train_examples[len(train_examples) / 10:]

    # Train a regression model on the training data and evaluate its mean
    # squared error with the test data
    for step in range(5, 21, 5):
        tuning_parameter = step / 10.0  # sweep lambda over 0.5, 1.0, 1.5, 2.0
        regressionPredictor = learnRegression(train, 500, 0.00000000001,
                                              tuning_parameter)
        regression_error = evaluatePredictor(regressionPredictor, test)

        # Print the results
        print ""
        print "----------"
        print "REGRESSION"
        print "----------"
        print "Lambda (lasso): ", tuning_parameter
        print "Number of examples: ", len(train_examples)
        print "Regression MSE:     ", regression_error
        print ""
Example #7
def trainAndEvaluate():
    """Trains a gradient-boosted linear regression predictor and prints its
    mean squared error.
    """
    # Import the training data as a numpy array
    train_array = csvAsArray('data/train_updated.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = getCsvHeaders('data/train_updated.csv')
    train_examples = []
    for row in train_array:
        feature_values = [row[j] for j in range(len(row) - 1)]
        feature_vector = featurize(feature_values, feature_names)
        output = row[-1]  # last column holds the target value
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)
    test = train_examples[:len(train_examples) / 10]
    train_examples = train_examples[len(train_examples) / 10:]

    # Train a regression model on the training data and evaluate its mean
    # squared error with the test data
    boostedRegressionPredictor = learnBoostedRegression(train_examples, 500,
                                                        0.000000001,
                                                        num_trees=5)
    regression_error = evaluatePredictor(boostedRegressionPredictor, test)

    # Print the results
    print ""
    print "------------------"
    print "BOOSTED REGRESSION"
    print "------------------"
    print "Number of examples: " + str(len(train_examples))
    print "Regression MSE:     " + str(regression_error)
    print ""
Example #8

from collections import defaultdict

def featureExtractor(x):
    # x is a window string, e.g. "took Mauritius into"
    phi = defaultdict(float)
    #phi[x] = 1
    tokens = x.split()
    left, entity, right = tokens[0], tokens[1:-1], tokens[-1]
    phi['entity is ' + ' '.join(entity)] = 1
    phi['left is ' + left] = 1
    phi['right is ' + right] = 1
    for word in entity:
        phi['entity contains ' + word] = 1
        phi['entity contains prefix ' + word[:4]] = 1
        phi['entity contains suffix ' + word[-4:]] = 1
    return phi
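
# For example (illustrative), featureExtractor("took Mauritius into") yields:
#   {'entity is Mauritius': 1, 'left is took': 1, 'right is into': 1,
#    'entity contains Mauritius': 1, 'entity contains prefix Maur': 1,
#    'entity contains suffix tius': 1}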


# Learn a predictor
weights = submission.learnPredictor(trainExamples, devExamples,
                                    featureExtractor, 30, 0.05)
util.outputWeights(weights, 'weights')
util.outputErrorAnalysis(devExamples, featureExtractor, weights,
                         'error-analysis')

# Test!!!
testExamples = util.readExamples('names.test')
predictor = lambda x: 1 if util.dotProduct(featureExtractor(x), weights) > 0 else -1
print 'test error =', util.evaluatePredictor(testExamples, predictor)