from collections import Counter

def pegasos_sw(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-4):
    W = Counter()
    s = 1
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)
    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            # Use float division: 1 / (t * lambda_reg) truncates to 0 in
            # Python 2 when lambda_reg is an int.
            step = 1.0 / (t * lambda_reg)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * s * util.dotProduct(W, review)
            if cond < 1:
                s = (1 + scale) * s
                util.increment(W, step * result / s, review)
            else:
                s = (1 + scale) * s
            objective += max(0, 1 - cond)
        objective = objective / m
        objective = objective + lambda_reg / 2.0 * (s ** 2) * util.dotProduct(W, W)
        epoch += 1
    return s, W
def loss(x, y, l, w):
    loss = (l * dotProduct(w, w)) / 2
    m = len(x)
    for i in range(m):
        loss = loss + (max(0, 1 - y[i] * dotProduct(w, x[i]))) / m
    return loss
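# A minimal usage sketch of loss() above (my addition; the toy values are
# illustrative, and dotProduct is assumed to be the sparse-dict dot product
# used throughout these snippets).
def _demo_loss():
    x = [{'good': 1}, {'bad': 1}]
    y = [1, -1]
    w = {'good': 2.0, 'bad': -0.5}
    # regularizer: 0.5 * (2.0**2 + (-0.5)**2) / 2 = 1.0625
    # hinge terms: max(0, 1 - 2.0) / 2 = 0 and max(0, 1 - 0.5) / 2 = 0.25
    return loss(x, y, 0.5, w)  # 1.3125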
from collections import Counter

def pegasos(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-6):
    w = Counter()
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)
    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            # Use float division: 1 / (t * lambda_reg) truncates to 0 in
            # Python 2 when lambda_reg is an int.
            step = 1.0 / (t * lambda_reg)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * util.dotProduct(w, review)
            if cond < 1:
                util.increment(w, scale, w)
                util.increment(w, step * result, review)
            else:
                util.increment(w, scale, w)
            objective += max(0, 1 - cond)
        objective = objective / m
        objective = objective + lambda_reg / 2.0 * util.dotProduct(w, w)
        epoch += 1
    return w
def getIntensity(self, pos):
    """Returns the appropriate intensity of the sound being played,
    assuming intensity falls off at 1/r^2."""
    # The camera doesn't have a position, so use the position of the object
    # followed by the first camera.
    camPos = glad.renderer.cameraList[0].objectFollowed.getPos()
    r = (pos - camPos)  # separation vector
    if r.isNullVector():
        # If the vector is null, the sound will be at max anyway.
        sin = 1
        cos = 1
    else:
        # Calculate angles to determine where the sound is coming from.
        cos = dotProduct(r.getNormalized(), Vector(-1, 0))
        sin = dotProduct(r.getNormalized(), Vector(0, 1))
    # Calculate intensity for the left and right channels. When the sound is
    # directly to one side, a fraction `a` of the intensity comes from that
    # side's speaker, which gives some sense of direction.
    k = 130000  # arbitrary constant used to scale sound intensity
    if r.isNullVector():
        intensity = k  # avoids a division-by-zero error
    else:
        intensity = k / r.getMagnitude() ** 2
    # `major` is the fraction of the intensity from the louder side.
    a = 0.68  # max fraction of the intensity coming from one side
    major = (a * 0.5) / ((0.5 * cos) ** 2 + (a * sin) ** 2) ** 0.5  # equation for an ellipse
    if r[0] <= 0:
        right = major
        left = 1 - major
    else:
        left = major
        right = 1 - major
    right *= intensity
    left *= intensity
    if right > 1:
        right = 1
    if left > 1:
        left = 1
    return left, right
def kmeanspredictor(x):
    assignment = 0
    min_dist = float('inf')  # +inf so the first computed distance always wins
    for j in range(NUM_CLUSTERS):
        cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
            centroids[j], x) + pre_computed_centroid_dots[j]
        if cur_dist < min_dist:
            assignment = j
            min_dist = cur_dist
    return centroid_vals[assignment]
def predictor(x):
    # Assign x to the nearest centroid and use that cluster's predictor.
    assignment = 0
    min_dist = float('inf')
    for k in range(len(centroids)):
        cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
            centroids[k], x) + pre_computed_centroid_dots[k]
        if cur_dist < min_dist:
            assignment = k
            min_dist = cur_dist
    # The original reset min_dist inside the loop and returned
    # predictor_list[i](x); using the assigned cluster's predictor is
    # presumably what was intended.
    return predictor_list[assignment](x)
def predictor(x):
    if x is None:
        return -1
    if util.dotProduct(featureExtractor(x), weights) > 0:
        return 1
    else:
        return 0
def pegasos_fast(x, y, l):
    w = dict()
    temp_w = dict()
    t = 2
    s = 1
    temp_loss = 0
    flag = True
    while flag:
        for j in range(len(x)):
            t = t + 1
            n = 1 / (l * t)
            s = (1 - n * l) * s
            if y[j] * (dotProduct(w, x[j])) < s:
                temp = x[j].copy()
                increment(temp, (n * y[j] - 1), temp)
                increment(w, (1 / s), temp)
        # Per-epoch convergence check on the true (unscaled) weights.
        temp_w = w.copy()
        increment(temp_w, s - 1, temp_w)
        loss_real = loss(x, y, l, temp_w)
        if abs(temp_loss - loss_real) < 10 ** -2:
            flag = False
        temp_loss = loss_real
    # Undo the scaling trick: w + (s - 1) * w = s * w.
    increment(w, s - 1, w)
    return w
def lassoLossGradient(features, weights, true_value, tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example. Training loss includes a lasso
    (L1) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the lasso regularization
            term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)
    # Lasso term: add the gradient of the lasso term, i.e. the gradient of
    # tuning_parameter * (1-norm of weights).
    for w in weights:
        gradient[w] = tuning_parameter * np.sign(weights[w])
    increment(gradient, scale, features)
    return gradient
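# A toy check of lassoLossGradient() above (my addition; the values are
# illustrative only).
def _demo_lasso_gradient():
    features = {'sqft': 2.0}
    weights = {'sqft': 1.0}
    # scale = 2 * (2.0 - 3) = -2; lasso part = 0.5 * sign(1.0) = 0.5
    # gradient['sqft'] = 0.5 + (-2) * 2.0 = -3.5
    return lassoLossGradient(features, weights, 3, 0.5)  # {'sqft': -3.5}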
def findExampleStatsFn(examples, weights, featureExtractor, examineFn):
    summ = 0
    correct = 0
    tot = 0
    summNeg = 0
    correctNeg = 0
    totNeg = 0
    for example in examples:
        prompt, response = example[0]
        if examineFn(prompt, response):
            phi = featureExtractor(example[0])
            score = dotProduct(weights, phi)
            if example[1] == 1:
                summ += score
                if score > 0:
                    correct += 1
                tot += 1
            if example[1] == -1:
                summNeg += score
                if score < 0:
                    correctNeg += 1
                totNeg += 1
    if tot > 0:
        print "Average Score (+): {0}".format(1.0 * summ / tot)
        print "Average Correct (+): {0}".format(1.0 * correct / tot)
    if totNeg > 0:
        print "Average Score (-): {0}".format(1.0 * summNeg / totNeg)
        print "Average Correct (-): {0}".format(1.0 * correctNeg / totNeg)
def SparseGradChecker(loss_func, gradient_loss_func, x, y_val, theta,
                      epsilon=0.01, tolerance=1e-4):
    """Question 3.2: Implement Generic Gradient Checker for Sparse Matrices.

    Check that the function gradient_loss_func returns the correct gradient
    for the given x, y_val, and theta.

    Let d be the number of features. Here we numerically estimate the
    gradient by approximating the directional derivative in each of the d
    coordinate directions:
    (e_1 = (1,0,0,...,0), e_2 = (0,1,0,...,0), ..., e_d = (0,...,0,1)).

    The approximation for the directional derivative of J at the point theta
    in the direction e_i is given by:
    ( J(theta + epsilon * e_i) - J(theta - epsilon * e_i) ) / (2*epsilon).

    We then look at the Euclidean distance between the gradient computed
    using this approximation and the gradient computed by
    gradient_loss_func(x, y_val, theta). If the Euclidean distance exceeds
    tolerance, we say the gradient is incorrect.

    Args:
        loss_func - A function that computes the loss for (x, y_val, theta).
        gradient_loss_func - A function that computes the gradient for
            (x, y_val, theta).
        x - A single row in the design matrix, represented by a dict/Counter
            object. (key length = num_features)
        y_val - the label for the corresponding x_row (-1 or 1)
        theta - the parameter vector, a dict/Counter object.
            (key length = num_features)
        epsilon - the epsilon used in the approximation
        tolerance - the tolerance error

    Return:
        A boolean value indicating whether the gradient is correct or not.
    """
    true_gradient = gradient_loss_func(x, y_val, theta)
    approx_grad = dict.fromkeys(theta.keys(), 0.0)
    for key in theta.iterkeys():
        # Compute the approximate directional derivative in the chosen
        # direction. Avoid copying theta, since copying is slow.
        theta_key_original = theta[key]
        theta[key] += epsilon
        plus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original - epsilon
        minus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original  # restore theta
        approx_grad[key] = (plus_loss - minus_loss) / (2 * epsilon)
    util.increment(approx_grad, -1, true_gradient)  # approx_grad - true_gradient
    error = math.sqrt(util.dotProduct(
        approx_grad, approx_grad))  # np.linalg.norm(approx_grad - true_gradient)
    if error > tolerance:
        print 'gradient doesn\'t match approximation. Error:', error
    return (error < tolerance)
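# A sketch (my addition, not from the original source) of wiring
# SparseGradChecker up to the Pegasos loss/subgradient pair used elsewhere in
# these snippets: the inline loss matches pegasos_loss(X, y, w, lamb), and
# PegasosSubgradientLoss is the subgradient function defined below.
def _demo_grad_check():
    lambda_reg = 0.1
    loss_func = lambda x, y_val, theta: (
        (lambda_reg / 2.0) * util.dotProduct(theta, theta)
        + max(0, 1 - y_val * util.dotProduct(theta, x)))
    grad_func = lambda x, y_val, theta: PegasosSubgradientLoss(
        x, y_val, theta, lambda_reg)
    x = {'movie': 1.0, 'great': 1.0}
    theta = {'movie': 0.3, 'great': -0.2}
    # The margin here is 0.1, safely away from the hinge kink at 1.0.
    return SparseGradChecker(loss_func, grad_func, x, 1, theta)  # True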
def regularizationLossGradient(features, weights, true_value, tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example. Training loss includes a ridge
    (L2) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the ridge regularization
            term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)
    # Regularization term: add the gradient of the regularization term to
    # the gradient (i.e. the gradient of
    # tuning_parameter * (2-norm of weights)^2).
    increment(gradient, tuning_parameter, weights)
    increment(gradient, scale, features)
    return gradient
def pegasos_grad(X, y, w, lamb):
    tmp = y * dotProduct(w, X)
    if 1 - tmp > 0:
        # Hinge active: the subgradient is lamb * w - y * X. (The original
        # added +y * X, which has the wrong sign for this loss.)
        an1 = increment({}, lamb, w)
        ans = increment(an1, -y, X)
    else:
        ans = increment({}, lamb, w)
    return ans
def PegasosSubgradientLoss(x, y_val, theta, lambda_reg):
    '''Question 3.2: The subgradient of the Pegasos loss function.'''
    margin = y_val * util.dotProduct(theta, x)
    subgrad = theta.copy()
    util.scale(subgrad, lambda_reg)
    if margin < 1:
        util.increment(subgrad, -y_val, x)
    return subgrad
def PercentageWrong(X, y, theta):
    '''Question 4.3: The percentage incorrect when using theta to predict y
    from X.'''
    num_wrong = 0
    for i, x in enumerate(X):
        estimate_sign = np.sign(util.dotProduct(theta, x))
        if estimate_sign != y[i]:
            num_wrong += 1
    return 1.0 * num_wrong / len(y)
def per_loss(x, y, w):
    cnt = 0
    total = len(y)
    for i in range(total):
        if np.sign(dotProduct(w, x[i])) != np.sign(y[i]):
            cnt = cnt + 1
    # Use float arithmetic: cnt / total is integer division in Python 2.
    error = 100.0 * cnt / total
    return error
def learnBoostedRegression(examples, num_iters, step_size, num_trees):
    """Learns a linear regression model using boosted trees.

    Args:
        examples: An array of training examples.
        num_iters (int): Number of training iterations.
        step_size (float): Stochastic gradient descent step size.
        num_trees (int): Number of gradient boosting trees.

    Returns:
        A predictor function that outputs a price (int) given a single input
        tuple.
    """
    list_weights = []
    objectives = [cur[1] for cur in examples]
    filename = "boostedtree_" + str(num_trees - 1) + "_" + str(cross_val_seg) + ".p"
    if num_trees > 1 and SAVE:
        (list_weights, num_trees_prev, num_iters_prev) = pickle.load(
            open(os.path.join("boostedtree_weights", filename), "rb"))
    for k in range(num_trees):
        if k >= len(list_weights):
            print ""
            print "TREE " + str(k + 1) + " OF " + str(num_trees)
            curWeights = defaultdict(int)
            for i in range(num_iters):
                for ind in range(len(examples)):
                    x = examples[ind][0]
                    gradient = regression.lassoLossGradient(
                        x, curWeights, objectives[ind], .5)
                    increment(curWeights, -step_size / (i + 1), gradient)
                if VERBOSE:
                    print "Training progress: " + str(
                        100.0 * (i + 1) / num_iters) + "%"
            list_weights.append(curWeights)
        else:
            curWeights = list_weights[k]
        # Update the residuals that the next tree will fit.
        for j in range(len(examples)):
            x, y = examples[j]
            objectives[j] = objectives[j] - dotProduct(x, curWeights)
        if VERBOSE:
            print "COMPLETE"
    if SAVE:
        filename = "boostedtree_" + str(num_trees) + "_" + str(
            cross_val_seg) + ".p"
        pickle.dump((list_weights, num_trees, num_iters),
                    open(os.path.join("boostedtree_weights", filename), "wb"))

    # Define the predictor function
    def predictor(x):
        return sum(dotProduct(x, curWeight) for curWeight in list_weights)

    return predictor
def trainAndTest():
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    k_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))
        k_examples.append(feature_vector)

    # Train a k-means model on the training data and evaluate its mean
    # squared error with the test data
    random.shuffle(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        startTest = i * len(train_examples) / NUM_SPLITS
        endTest = (i + 1) * len(train_examples) / NUM_SPLITS
        currentTrainExamples = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)
        currentBoostedExamples = [(currentTrainExamples[ind][0], loss_list[ind])
                                  for ind in range(len(currentTrainExamples))]
        boostedRegPredictor = learnBoostedRegression(
            currentBoostedExamples, 500, 0.00000000001, num_trees=NUM_B_TREES)
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]

        def kmeanspredictor(x):
            assignment = 0
            min_dist = float('inf')  # +inf so the first distance always wins
            for j in range(NUM_CLUSTERS):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[j], x) + pre_computed_centroid_dots[j]
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            return kmeanspredictor(x) + boostedRegPredictor(x)

        print "leaving out the", (i + 1), "th segment of the data,", \
            "the validation error for the regression is:", \
            util.evaluatePredictor(boostedKPredictor,
                                   train_examples[startTest:endTest])
def test3c1():
    weights = {}
    for _ in range(100):
        k = ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
        v = random.uniform(-1, 1)
        weights[k] = v
    data = submission.generateDataset(100, weights)
    for phi, y in data:
        grader.require_is_equal(util.dotProduct(phi, weights) >= 0, y == 1)
def gradLoss(phiX, w, y):
    # Returns the gradient of the hinge loss at this example, or 0 when the
    # margin condition is satisfied. Note: mutates phiX in place.
    score = util.dotProduct(w, phiX)
    margin = score * y
    if margin < 1:
        for name, feature in phiX.iteritems():
            phiX[name] = -1 * y * feature
        return phiX
    else:
        return 0
def PlotScoresAgainstAccuracy(X_training, y_training, X_testing, y_testing,
                              lambda_reg):
    '''Question 4.5. Divides the test set into buckets by score, and creates
    a bar chart showing the accuracy of each bucket.
    '''
    NUM_BUCKETS = 10
    theta = Pegasos(X_training, y_training, lambda_reg)
    # Calculate the score for each row in a list
    scores = [util.dotProduct(theta, x) for x in X_testing]
    low_score = min(scores)
    high_score = max(scores)
    # f(score) -> bucket
    score_to_bucket_func = lambda score: int(
        round((NUM_BUCKETS - 1) * (score - low_score) / (high_score - low_score)))
    # Make a list of empty lists with NUM_BUCKETS elements. Each entry is a
    # list of the indexes of X's rows that fall in the same score bucket.
    score_histogram = [[] for _ in range(NUM_BUCKETS)]
    for row_index, score in enumerate(scores):
        bucket = score_to_bucket_func(score)
        score_histogram[bucket].append(row_index)
    bucket_means = [0.0] * NUM_BUCKETS
    bucket_accuracy = [0.0] * NUM_BUCKETS
    for bucket, row_indices in enumerate(score_histogram):
        # Calculate the percentage-wrong loss for each bucket and plot it.
        bucket_scores = [scores[row_index] for row_index in row_indices]
        bucket_score_mean = abs(np.mean(bucket_scores))
        bucket_means[bucket] = bucket_score_mean
        bucket_score_std = np.std(bucket_scores)
        # print 'Bucket', bucket, 'ranges from', min(bucket_scores), 'to', max(bucket_scores)
        # print 'Bucket', bucket, 'mean:', bucket_score_mean
        # print 'Bucket', bucket, 'stdev:', bucket_score_std
        X_bucket = [X_testing[row_index] for row_index in row_indices]
        y_bucket = [y_testing[row_index] for row_index in row_indices]
        bucket_accuracy[bucket] = 100 * (
            1.0 - PercentageWrong(X_bucket, y_bucket, theta))
    fig, ax = plt.subplots()
    ax.set_xlabel('Mean Score for Bucket')
    ax.set_ylabel('Percentage Correct')
    ax.set_title('Pegasos Sentiment Analysis: Score vs. Accuracy')
    width = 0.4
    positions = range(0, len(bucket_accuracy))
    rects1 = ax.bar(positions, bucket_accuracy, width, color='b', alpha=0.8)
    plt.xticks(rotation=-45)
    ax.set_xticks([pos + width for pos in positions])
    ax.set_xticklabels(["%0.1f" % mean for mean in bucket_means])
    plt.show()
def find_center(ex_index, example, precomputed_x, precomputed_quantities,
                centroids):
    assign = 0
    # The original sentinel was "1,000", which is the tuple (1, 0), not a
    # number; +inf guarantees the first distance always wins.
    min_dist = float('inf')
    for i in range(K):
        cur_dist = precomputed_x[ex_index] - 2 * util.dotProduct(
            centroids[i], example) + precomputed_quantities[i]
        if cur_dist < min_dist:
            assign = i
            min_dist = cur_dist
    return assign  # the original had no return; this is presumably the intent
def getAnswerProbs(weights, questionData):
    proposedAnswers = questionData["proposedAnswers"]
    correctIndex = questionData["correctAnswerIndex"]
    answerScores = []
    for aIndex, proposed in enumerate(proposedAnswers):
        score = dotProduct(weights, featureExtractor(proposed))
        answerScores.append(score)
    return softmax(answerScores)
def Pegasos(X, y, lambda_reg, max_epochs=1000, check_gradient=False):
    '''Question 4.2. Finds the sparse weight vector that minimizes the SVM
    loss function on X and y.
    '''
    print 'Running Pegasos with regularization parameter', lambda_reg
    loss_func = lambda x, y_val, theta: PegasosLoss(x, y_val, theta, lambda_reg)
    gradient_loss_func = lambda x, y_val, theta: PegasosSubgradientLoss(
        x, y_val, theta, lambda_reg)
    # Initialize theta to have zero for every word mentioned in any review
    theta = {key: 0.0 for x in X for key in x.keys()}
    t = 2  # NOTE: This normally starts at zero, but that causes a divide-by-zero error.
    weight_scalar = 1.0
    for epoch in range(max_epochs):
        # print '--Epoch', epoch
        old_theta = theta.copy()
        for j, x in enumerate(X):
            t += 1
            eta = 1.0 / (t * lambda_reg)
            margin = y[j] * weight_scalar * util.dotProduct(theta, x)
            # NOTE that the gradient is not differentiable at 1.0, so we
            # don't check it near there.
            if check_gradient and abs(margin - 1.0) > 0.01:
                # SparseGradChecker returns True when the gradient matches
                # the approximation, so bail out when it returns False (the
                # original inverted this check).
                if not SparseGradChecker(loss_func, gradient_loss_func, x,
                                         y[j], theta):
                    print 'Computed gradient doesn\'t match approximations.'
                    sys.exit(1)
                grad = gradient_loss_func(x, y[j], theta)
                util.increment(theta, -eta, grad)
            else:
                weight_scalar *= 1.0 - 1.0 / t
                if margin < 1:
                    util.increment(theta, eta * y[j] / weight_scalar, x)
        util.increment(old_theta, -1, theta)
        util.scale(old_theta, weight_scalar)
        total_change = math.sqrt(util.dotProduct(old_theta, old_theta))
        # print '----Change from previous theta:', total_change
        if total_change < 0.01:
            break
    util.scale(theta, weight_scalar)
    return theta
def learn(self, trainExamples):
    numIters = 10
    step = 0.0001
    for i in range(numIters):
        for feature_vec, y in trainExamples:
            score = util.dotProduct(self.weights, feature_vec)
            dloss = {}
            if score * y > 1:
                continue
            util.increment(dloss, -y, feature_vec)
            util.increment(self.weights, -step, dloss)
def percent_error(X, y, w):
    correct = 0
    pos = 0
    total = len(y)
    for i in range(total):
        sign_value = np.sign(dotProduct(X[i], w))
        if y[i] == sign_value:
            correct += 1
        if sign_value > 0:
            pos += 1
    print pos
    return 1 - float(correct) / total
def chooseEval(examples, weights):
    correct = 0
    for i in range(len(examples)):
        prompt = examples[i][0][0]
        response1 = examples[i][0][1]
        randomInt = random.randint(0, len(examples) - 1)
        response2 = examples[randomInt][0][1]
        # Relies on random to break the loop
        while response1 == response2 \
                or (neg_restrict_bad and isBadTurn(response2)) \
                or response1[0].caller == response2[0].caller:
            randomInt = random.randint(0, len(examples) - 1)
            response2 = examples[randomInt][0][1]
        guess1 = (prompt, response1)
        phi1 = swda_feature_extractor(guess1)
        score1 = dotProduct(weights, phi1)
        guess2 = (prompt, response2)
        phi2 = swda_feature_extractor(guess2)
        score2 = dotProduct(weights, phi2)
        if score1 > score2:
            correct = correct + 1
        if score1 == score2:
            correct = correct + .5
    return 1.0 * correct / len(examples)
def test3c0():
    ans = 0.05
    grader.require_is_equal(
        ans,
        util.dotProduct({
            'Movie': 0.1,
            'is': 0.2,
            'good': 0.25
        }, {
            'Movie': 0.1,
            'is': 0.2,
            'very': -0.25,
            'bad': -0.25
        }))
def predict(self, stories):
    if self.args.verbose > 3:
        print 'Predicting'
    if self.args.verbose:
        print "weights are ", self.weights
    ANS_LETTERS = ['A', 'B', 'C', 'D']
    ans = []
    for story in stories:
        if self.args.verbose > 0:
            print story.name
            print formatForPrint(story.rawPassage), "\n"
            print story.rawQuestions, "\n"
            print story.rawAnswers, "\n"
        for qid in range(len(story.questions)):
            scores = []
            for aid in range(len(story.answers[qid])):
                score = max([(util.dotProduct(
                    self.weights,
                    self.extractFeatures(story, sid, qid, aid)), sid)
                    for sid in range(len(story.passageSentences))])
                scores.append((score, aid))
            # If the question contains "n't" or "not" and begins with
            # "what", "who", or "whose", select the minimum score.
            s = story.rawQuestions[qid][1].strip()
            if re.search('^(who|what|whose).*(n\'t|not)', s,
                         flags=re.IGNORECASE):
                answer = min(scores)[1]
            else:
                answer = max(scores)[1]
            ans.append(ANS_LETTERS[answer])
            if self.args.verbose > 0:
                if answer != story.correctAnswers[qid]:
                    print 'WRONG: %s: correct answer %s, predicted answer %s, scores %s' \
                        % (story.rawQuestions[qid][0],
                           ANS_LETTERS[story.correctAnswers[qid]],
                           ANS_LETTERS[answer], scores)
                else:
                    print 'RIGHT: %s: correct answer %s, predicted answer %s, scores %s' \
                        % (story.rawQuestions[qid][0],
                           ANS_LETTERS[story.correctAnswers[qid]],
                           ANS_LETTERS[answer], scores)
        if self.args.verbose > 0:
            print "\n"
    return ans
def pegasos_SGD(X, y, lamb, num_iter):
    w = {}
    t = 1
    s = 1
    for i in range(num_iter):
        for j in range(len(X)):
            t += 1
            alpha = 1.0 / (t * lamb)
            tmp = y[j] * s * dotProduct(X[j], w)
            g = l_de(tmp)
            s *= (1 - alpha * lamb)
            w = increment(w, -(alpha * y[j] * g / s), X[j])
        print "epoch " + str(i)
    # Undo the scaling trick: return s * w.
    return increment({}, s, w)
def show_incorrect_case():
    train_data, test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)
    lamb = 0.1
    w_o = pickle.load(open("weight", 'rb'))
    test_len = len(X_test)
    example = 0
    for i in range(test_len):
        w = w_o
        tmp = dotProduct(X_test[i], w)
        if np.sign(tmp) != y_test[i]:
            example += 1
            print "predicted_score: ", np.sign(tmp)
            print "true vote: ", y_test[i]
            dict_tmp = dotProduct_vector(X_test[i], w)
            sorted_dict = sorted(dict_tmp.items(), key=lambda x: abs(x[1]),
                                 reverse=True)
            print_list_wx = sorted_dict[:8]
            print_list_abs_wx = [(a, abs(b)) for a, b in print_list_wx]
            print_list_x = [X_test[i][x] for (x, k) in print_list_wx]
            print_list_w = [w[x] for (x, k) in print_list_wx]
            print "wx:"
            print print_list_wx
            print "\n"
            print "abs_wx"
            print print_list_abs_wx
            print "\n"
            print "x"
            print print_list_x
            print "\n"
            print "w"
            print print_list_w
        if example == 3:
            break
def score_interval():
    train_data, test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)
    lamb = 1e-1
    w = pegasos_SGD(X_train, y_train, lamb, 30)
    a = []
    test_len = len(X_test)
    for i in range(test_len):
        a.append(dotProduct(X_test[i], w))
    a.sort()
    print min(a)
    print max(a)
    plt.plot(a)
    plt.show()
def score_confidence():
    train_data, test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)
    lamb = 1e-1
    # w = pegasos_SGD(X_train, y_train, lamb, 30)
    # pickle.dump(w, open("weight", 'wb'))
    w = pickle.load(open("weight", 'rb'))
    a = [0] * 18  # correct count
    b = [0] * 18  # incorrect count
    c = [0] * 18  # total number
    test_len = len(X_test)
    for i in range(test_len):
        tmp = dotProduct(X_test[i], w)
        c[int(tmp + 9)] += 1
        if np.sign(tmp) == y_test[i]:
            a[int(tmp + 9)] += 1
        else:
            b[int(tmp + 9)] += 1
    # for j in range(18):
    #     if b[j] == 0:
    #         print "interval [%s,%s] has %s points and %s are correct, the ratio is %s " % (j - 9, j - 8, b[j], a[j], 0)
    #     else:
    #         print "interval [%s,%s] has %s points and %s are correct, the ratio is %4.2f " % (j - 9, j - 8, b[j], a[j], float(a[j]) / b[j])
    wide = 0.35
    p1 = plt.bar(np.arange(18), a, width=wide, color='g')
    p2 = plt.bar(np.arange(18), b, width=wide, color='r', bottom=a)
    plt.ylabel("Frequency")
    plt.xlabel("Score Intervals")
    # plt.title("Score Confidence")
    X_ticks = ["[%s,%s]" % (k - 9, k - 8) for k in np.arange(18)]
    plt.xticks(np.arange(18) + wide / 2, X_ticks, rotation=45)
    plt.legend((p1[0], p2[0]), ("Correct", "Incorrect"))
    plt.savefig("t45_score_confidence.png")
def guessEval(examples, weights):
    correct = 0
    for i in range(len(examples)):
        prompt = examples[i][0][0]
        maxScore = 0
        maxResponse = examples[0][0][1]
        for j in range(len(examples)):
            response = examples[j][0][1]
            if prompt[0].caller == response[0].caller:
                continue
            guess = (prompt, response)
            phi = swda_feature_extractor(guess)
            score = dotProduct(weights, phi)
            if score > maxScore:
                maxScore = score
                maxResponse = response
        if maxResponse == examples[i][0][1]:
            correct = correct + 1
    return 1.0 * correct / len(examples)
def fit(self, X):
    self.w_ = dict()
    t = 0
    for j in range(len(X)):
        t += 1
        # Use float division: 1 / (t * lambda_reg) truncates in Python 2.
        step_size = 1.0 / (t * self.lambda_reg)
        w_dot_x = util.dotProduct(self.w_, X[j])
        y = 1 if 1 in X[j] else -1
        if y * w_dot_x < 1:
            # Scale w by (1 - 1/t): increment adds scale * w, so the scale
            # must be -1/t. (The original passed 1 - 1/t, which nearly
            # doubles w instead of shrinking it.)
            util.increment(self.w_, -1.0 / t, self.w_)
            util.increment(self.w_, step_size * y, X[j])
        else:
            util.increment(self.w_, -1.0 / t, self.w_)
    return self.w_
def expectimax_value(self, game, action_list, depth=1):
    # Do all of the actions in action_list
    for action in action_list:
        game = game.players[self.turn_num].do_move(game, action)
    turn_num = (self.turn_num + 1) % 4
    total_action_list = []  # stores all actions made so they can be undone
    while depth > 0:
        opp = game.players[turn_num]
        opp_action_list = self.guess_opp_move(opp, game)
        # Record opponent actions with the opponent object so we can undo them
        if opp_action_list:
            for opp_action in opp_action_list:
                game = opp.do_move(game, opp_action)
                total_action_list.append((opp.turn_num, opp_action))
        turn_num = (turn_num + 1) % 4
        if turn_num == self.turn_num:
            depth -= 1
    # Find the value of the estimated future state
    expected_features = self.feature_extractor(game)
    expected_score = util.dotProduct(expected_features, self.weights)
    for i in range(len(game.players)):
        player = game.players[i]
        print "cities and settlements: ", player.turn_num, player.cities_and_settlements
    print "total actions list: ", total_action_list
    # Undo the opponents' moves
    for i in range(len(total_action_list) - 1, -1, -1):
        to_undo = total_action_list[i]
        opp_num, opp_action = to_undo
        print "Loop cities and settles: ", opp_num, game.players[opp_num].cities_and_settlements
        game = game.players[opp_num].undo_move(game, opp_action)
    # Undo the moves this player made
    for i in range(len(action_list) - 1, -1, -1):
        game = game.players[self.turn_num].undo_move(game, action_list[i])
    return expected_score
def pegasos(x, y, l):
    w = dict()
    t = 2
    temp_loss = 0
    cnt = 0
    flag = True
    for i in range(2):
        for j in range(len(x)):
            t = t + 1
            n = 1 / (l * t)
            if y[j] * (dotProduct(w, x[j])) < 1:
                cnt = cnt + 1
                temp = x[j].copy()
                increment(temp, (n * y[j] - 1), temp)
                increment(w, -n * l, w)
                increment(w, 1, temp)
            else:
                increment(w, -n * l, w)
    return w
def updateWeights(self, game):
    # Get the current score
    cur_features = self.feature_extractor(game)
    target = util.dotProduct(cur_features, self.weights)
    pred = self.prevScore
    # If there are no previous features, we can't update
    if self.prevFeatures:
        # Update the weights
        diff = pred - target
        for feature, val in self.prevFeatures.items():
            self.weights[feature] -= self.eta * diff * val
    self.prevFeatures = cur_features
    self.prevScore = target
def predict(weights, testSet, args):
    correct = 0
    incorrect = 0
    total = 0
    for data in testSet:
        data = json.loads(data)
        title = data['title']
        subreddit = data['subreddit']
        features = FeatureExtractor.extractFeatures(title, args)
        maxScore = float('-inf')
        prediction = ''
        for key in weights.keys():
            weightVector = weights[key]
            score = util.dotProduct(weightVector, features)
            if score > maxScore:
                prediction = key
                maxScore = score
        if prediction == subreddit:
            correct += 1
        else:
            if args.verbose:
                try:
                    print title
                    print "predicted: " + prediction.encode('utf-8')
                    print features
                    printRelevantWeights(weights, features)
                    print "-----------------"
                except UnicodeEncodeError:
                    print "error"
            incorrect += 1
        total += 1
    print 'accuracy ' + str(float(correct) / total)
    print 'wrong ' + str(float(incorrect) / total)
def learnPredictor(trainExamples, testExamples, featureExtractor):
    '''
    Given |trainExamples| and |testExamples| (each one is a list of (x, y)
    pairs) and a |featureExtractor| to apply to x, return the weight vector
    (sparse feature vector) learned using stochastic gradient descent over a
    fixed number of iterations.

    Note: call evaluatePredictor() on both trainExamples and testExamples
    to see how you're doing as you learn after each iteration.
    '''
    weights = {}  # feature => weight
    stepSize = 1
    numIters = 15
    for it in range(numIters):
        # Iterate through every training example and extract the features of x.
        for x, y in trainExamples:
            phi = featureExtractor(x)
            # If y * score < 1 (margin violated), take a hinge-loss gradient
            # step and update the weight for each feature.
            margin = y * util.dotProduct(weights, phi)
            indicator = 1 if (1 - margin) > 0 else 0
            scale = stepSize * indicator * y
            increment(weights, scale, phi)

    # This uses the given feature extractor to predict the classification of x.
    def predictor(x):
        phi = featureExtractor(x)
        score = dotProduct(phi, weights)
        return 1 if score > 0 else -1

    # Print out training and test error for every iteration:
    # print 'TRAINING ERROR:', util.evaluatePredictor(trainExamples, predictor)
    # print 'TEST ERROR:', util.evaluatePredictor(testExamples, predictor)
    return weights
def main():
    path = ""
    image_name = ""
    if len(sys.argv) > 1:
        path = sys.argv[1]
        if not os.path.exists(path):
            print "The path provided does not exist."
            return
        directories = path.split('/')
        file_name = directories[len(directories) - 1]
    else:
        print "Please supply a path to an image."
        return
    os.system("python segment.py " + path)
    segments = []
    for f in os.listdir(SEGMENTS_PATH):
        # Ways to identify segments of the given path
        if 'temp' in f and image_name in f:
            segments.append(os.path.join(SEGMENTS_PATH, f))
    # Read in the weights
    f = open('weights.out')
    weights = eval(f.readline())
    stop_sign_flag = False
    for segment in segments:
        score = util.dotProduct(weights,
                                seg_util.segmentFeatureExtractor(segment))
        if score >= 0:  # stop sign found
            stop_sign_flag = True
            break
    if stop_sign_flag:
        print "Stop sign detected!"
    else:
        print "No stop sign detected"
def ErrorAnalysis(X_training, y_training, X_testing, y_testing, lambda_reg):
    '''Question 5.1. Prints information about the top incorrect reviews,
    ordered by the magnitude of their score.
    '''
    theta = Pegasos(X_training, y_training, lambda_reg)
    scores = [util.dotProduct(theta, x) for x in X_testing]
    # (index, score) pairs, sorted by the score's absolute value in
    # descending order.
    score_indexes = sorted(
        enumerate(scores),
        reverse=True,
        key=lambda index_score_pair: abs(index_score_pair[1]))
    num_incorrect_examples = 0
    MAX_NUM_WRONG_EXAMPLES = 10
    # Print out the information about the incorrect examples, in order of
    # largest score.
    for row_index, score in score_indexes:
        if num_incorrect_examples >= MAX_NUM_WRONG_EXAMPLES:
            break
        y_testing_val = y_testing[row_index]
        if np.sign(score) != np.sign(y_testing_val):
            x_testing_row = X_testing[row_index]
            PrintReviewInfo(x_testing_row, y_testing_val, score, theta)
            num_incorrect_examples += 1
def hingeLoss(w, features, y):
    return max(0, 1 - dotProduct(w, features) * y)
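# Quick illustration of hingeLoss() above (my addition; toy numbers): the
# loss is zero once the margin y * dotProduct(w, features) reaches 1.
def _demo_hinge():
    w = {'good': 2.0}
    assert hingeLoss(w, {'good': 1}, 1) == 0        # margin 2.0
    assert hingeLoss(w, {'good': 1}, -1) == 3       # margin -2.0
    assert hingeLoss(w, {'good': 0.25}, 1) == 0.5   # margin 0.5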
def predictor(x):
    return dotProduct(x, weights)
def trainAndTest():
    """Performs K-means clustering, then clustered boosted regression."""
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    k_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = util.featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))
    random.shuffle(train_examples)

    for i in range(1, NUM_SPLITS, 2):
        startTest = i * len(train_examples) / NUM_SPLITS
        endTest = (i + 1) * len(train_examples) / NUM_SPLITS
        currentTrain = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        currentTest = train_examples[startTest:endTest]

        # Cluster the data using k-means
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans.kmeans(currentTrain, NUM_CENTROIDS, K_ITERS)

        # Make clusters (indexing by j; the original used the split index i)
        cluster_list = [[] for _ in range(len(centroids))]
        for j in range(len(currentTrain)):
            cluster_list[assign[j]].append(currentTrain[j])

        # Train a regression model on the training data (by cluster)
        # and evaluate its mean squared error with the train data
        regression_error = 0
        predictor_list = []
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[k], centroids[k])
            for k in range(len(centroids))
        ]
        for cluster_points in cluster_list:
            boostedRegressionPredictor = boostedtree.learnBoostedRegression(
                cluster_points, SGD_ITERS, ETA, 5, 0)
            predictor_list.append(boostedRegressionPredictor)

        def predictor(x):
            # Assign x to the nearest centroid and use that cluster's model
            # (the original reset min_dist inside the loop and indexed
            # predictor_list by the split index).
            assignment = 0
            min_dist = float('inf')
            for k in range(len(centroids)):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[k], x) + pre_computed_centroid_dots[k]
                if cur_dist < min_dist:
                    assignment = k
                    min_dist = cur_dist
            return predictor_list[assignment](x)

        regression_error = boostedtree.evaluatePredictor(predictor, currentTest)
        # regression_error /= len(train_examples)

        # Print the results
        print ""
        print "------------------"
        print "CLUSTERED REGRESSION WITH BOOSTING"
        print "------------------"
        print "Leaving out segment: " + str(i)
        print "Number of centroids: " + str(NUM_CENTROIDS)
        print "Number of examples: " + str(len(train_examples))
        print "Regression MSE: " + str(regression_error)
        print ""
    return predictor_list, centroids, regression_error
def test3c0():
    weights = {"hello": 1, "world": 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual((util.dotProduct(datapt[0], weights) >= 0),
                              (datapt[1] == 1))
def test2c():
    weights = {'hello': 1, 'world': 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual(util.dotProduct(datapt[0], weights) >= 0,
                              datapt[1] == 1)
def test3c_0():
    weights = {"hello": 1, "world": 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual((util.dotProduct(datapt[0], weights) >= 0),
                              (datapt[1] == 1))
# Compare data
# threeCards = {}
# for _ in range(0, 3):
#     thisChoice = random.choice(pCard.keys())
#     threeCards[thisChoice] = util.extract(thisChoice, pCard)
#     modelValue = util.dotProduct(weights, threeCards[thisChoice])
#     oracleValue =
correctCount = 0
totalCount = 0
for key, value in testData.iteritems():
    totalCount += 1
    compareResults = []
    myChoice = (0, 0)
    trueChoice = (0, 0)
    for i in range(0, 3):
        if value[i] > trueChoice[1]:
            trueChoice = (key[i], value[i])
        thisValue = []
        modelValue = util.dotProduct(weights, util.extract(key[i], pCard))
        if modelValue > myChoice[1]:
            myChoice = (key[i], modelValue)
        thisValue.append((key[i], value[i], modelValue))
        compareResults.append(thisValue)
    if trueChoice[0] == myChoice[0]:
        compareResults.append((trueChoice, True))
        correctCount += 1
    else:
        compareResults.append(False)
    print compareResults
print correctCount / float(totalCount)
def getTDScore(self, features):
    return util.dotProduct(self.weights_, features)
def pegasos_loss(X, y, w, lamb):
    ans = (lamb / 2.0) * dotProduct(w, w) + max(0, 1 - y * dotProduct(w, X))
    return ans
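# Worked toy value for pegasos_loss() above (my addition; illustrative
# only). pegasos_grad() earlier in these snippets returns the subgradient of
# this same per-example objective.
def _demo_pegasos_loss():
    X = {'fun': 1.0}
    w = {'fun': 0.4}
    # (1.0 / 2) * 0.16 + max(0, 1 - 0.4) = 0.08 + 0.6
    return pegasos_loss(X, 1, w, 1.0)  # 0.68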
def predict(self, x):
    score = util.dotProduct(self.weights, x)
    print "Learned Score:" + str(score)
    return math.copysign(1.0, score)
def main():
    # Load the shuffled data
    with open('data.pickle', 'rb') as f:
        review = pickle.load(f)
    # Split into training and test sets
    train, test = split(review)
    # Separate x and y values to get ready for training
    x_train = []
    x_test = []
    y_train = []
    y_test = []
    for i in train:
        y_train.append(i.pop())
        x_train.append(bag_of_words(i))
    for i in test:
        y_test.append(i.pop())
        x_test.append(bag_of_words(i))
    l = 0.5
    print("Pegasos fast")
    start_time = time.time()
    w1 = pegasos_fast(x_train, y_train, l)
    time1 = time.time() - start_time
    print("--- %s seconds ---" % (time1))
    # print(len(w1))
    # print("percentage error:", per_loss(x_test, y_test, w1))

    # Error analysis: collect the first two misclassified test examples.
    pos = []
    count = 0
    for i in range(len(y_test)):
        if np.sign(dotProduct(w1, x_test[i])) != np.sign(y_test[i]) and count < 2:
            count = count + 1
            pos.append(i)
    wrong1 = x_test[pos[0]].copy()
    new_wrong1 = wrong1.copy()
    abs_wrong1 = wrong1.copy()
    print("wrong 1, real:", y_test[pos[0]])
    wrong2 = x_test[pos[1]].copy()
    new_wrong2 = wrong2.copy()
    abs_wrong2 = wrong2.copy()
    print("wrong 2, real:", y_test[pos[1]])
    # Multiply each feature value by its weight
    for i in wrong2:
        wt = w1.get(i, 0)
        abs_wrong2[i] = abs(abs_wrong2[i] * wt)
        wrong2[i] = wrong2[i] * wt
    # Sort features by |w * x|, largest first. (The original sorted the keys
    # lexicographically, which doesn't correspond to the sorted values.)
    keys = sorted(abs_wrong2, key=abs_wrong2.get, reverse=True)
    # print("name:", " absolute product", " product", " x", " w")
    for k in keys:
        print(k, ",", abs_wrong2[k], ",", wrong2[k], ",", new_wrong2[k],
              ",", w1.get(k, 0), ",")
def loss(w, phi, y):
    return max(1 - util.dotProduct(w, phi) * y, 0)
def printExamples(examples, weights, featureExtractor, SCORE_THRESHOLD):
    # random.jumpahead(1)
    random.shuffle(examples)
    # SCORE_THRESHOLD = .5
    print "Finding Interesting Examples..."

    # Helper that prints one found example (factored out of the four
    # identical blocks in the original).
    def printExample(label, example, phi):
        print "FOUND: " + label
        print "Prompt"
        for utt in example[0][0]:
            print utt.text_words()
        print "Response"
        for utt in example[0][1]:
            print utt.text_words()
        for key in phi:
            if key in weights:
                print "{0}: {1}".format(key, weights[key])

    tpFound = fpFound = tnFound = fnFound = False
    for example in examples:
        prompt, response = example[0]
        if not examineCollab(prompt, response):
            continue
        phi = featureExtractor(example[0])
        score = dotProduct(weights, phi)
        if not tpFound and score > SCORE_THRESHOLD and example[1] == 1:
            printExample("True Positive", example, phi)
            tpFound = True
        if not fpFound and score > SCORE_THRESHOLD and example[1] == -1:
            printExample("False Positive", example, phi)
            fpFound = True
        if not tnFound and score < -SCORE_THRESHOLD and example[1] == -1:
            printExample("True Negative", example, phi)
            tnFound = True
        if not fnFound and score < -SCORE_THRESHOLD and example[1] == 1:
            printExample("False Negative", example, phi)
            fnFound = True
        if tpFound and fpFound and tnFound and fnFound:
            break