def quiz3_logistic_regression_l2_penalty(products, important_words, lg_class):
    print "\n**************************************"
    print "*  Logistic Regression: L2 penalty  *"
    print "**************************************\n"
    train_data, validation_data = products.random_split(.8, seed=2)
    feature_matrix_train, sentiment_train = np_utils.get_numpy_data(
        train_data, important_words, 'sentiment')
    feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data(
        validation_data, important_words, 'sentiment')
    table = get_table_with_logistic_model(lg_class, important_words,
                                          feature_matrix_train, sentiment_train)
    # Rank words by their coefficient in the unregularized model
    coefficients = list(table["coefficients [L2=0]"][1:])  # exclude intercept
    word_coefficient_tuples = [
        (word, coefficient)
        for word, coefficient in zip(important_words, coefficients)
    ]
    word_coefficient_tuples = sorted(
        word_coefficient_tuples, key=lambda x: x[1], reverse=True)
    # print word_coefficient_tuples[:5]
    positive_words = map(lambda x: x[0], word_coefficient_tuples[:5])
    negative_words = map(lambda x: x[0], word_coefficient_tuples[-5:])
    # print "Positive words: %s" % positive_words
    # print "Negative words: %s" % negative_words
    # quiz_word = ['love', 'disappointed', 'great', 'money', 'quality']
    print "\nQ1: feature_derivative_with_L2, regularize the intercept: NO"
    print "\nQ2: does L2 regularization increase or decrease the log likelihood ll(w): DECREASE"
    print "\nQ3: word not listed in either positive_words or negative_words: QUALITY"
    l2_penalty_list = [0, 4, 10, 1e2, 1e3, 1e5]
    output_file = '../graphs/Coefficient_vs_L2penalty.png'
    np_plot.make_coefficient_plot(table, positive_words, negative_words,
                                  l2_penalty_list, output_file)
    print "\nQ4: all coefficients consistently get smaller in size as the L2 penalty is increased -> TRUE"
    train_accuracy = create_accuracy_table(table, feature_matrix_train,
                                           sentiment_train)
    validation_accuracy = create_accuracy_table(table, feature_matrix_valid,
                                                sentiment_valid)
    print "\nComputing accuracy ....\n"
    for key in sorted(validation_accuracy.keys()):
        print "\tL2 penalty = %g" % key
        print "\ttrain accuracy = %s, validation_accuracy = %s" % (
            train_accuracy[key], validation_accuracy[key])
        print "\t--------------------------------------------------------------------------------"
    output_file2 = '../graphs/Classification_Accuracy_vs_L2penalty.png'
    np_plot.make_classsification_accuracy_plot(train_accuracy,
                                               validation_accuracy, output_file2)
    print "\nQ6: highest accuracy on the training data: L2 penalty = 0"
    print "\nQ7: highest accuracy on the validation data: L2 penalty = 4"
    print "\nQ8: does the highest accuracy on the training data imply the model is the best one: NO"
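
# create_accuracy_table is defined elsewhere in this repo; the sketch below is
# a hypothetical stand-in showing the behavior assumed above: classify each
# point by the sign of the score x_i . w and record the fraction of correct
# predictions per L2 penalty. The column-key format mirrors the
# "coefficients [L2=0]" key used above, but is an assumption.
def create_accuracy_table_sketch(table, feature_matrix, sentiment):
    import numpy as np
    accuracy = {}
    for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
        coefficients = np.array(table["coefficients [L2=%g]" % l2_penalty])
        scores = np.dot(feature_matrix, coefficients)
        predictions = np.where(scores > 0, +1, -1)  # sentiment labels are +1/-1
        accuracy[l2_penalty] = np.sum(predictions == sentiment) / float(
            len(sentiment))
    return accuracy
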
def get_normalized_datasets(train, test, validation, feature_list):
    features_train, output_train = np_utils.get_numpy_data(train, feature_list, 'price')
    features_test, output_test = np_utils.get_numpy_data(test, feature_list, 'price')
    features_valid, output_valid = np_utils.get_numpy_data(validation, feature_list, 'price')
    # Normalize the training set features (columns), then scale the test and
    # validation sets by the *training* norms so all three share one scale.
    features_train, norms = np_utils.normalize_features(features_train)
    features_test = features_test / norms
    features_valid = features_valid / norms
    return (features_train, features_test, features_valid,
            output_train, output_valid, output_test)
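
# normalize_features lives in np_utils; this sketch is an assumption about its
# behavior, not the module's actual code: divide each feature column by its
# 2-norm and return the norms so other datasets can be scaled identically.
def normalize_features_sketch(feature_matrix):
    import numpy as np
    norms = np.linalg.norm(feature_matrix, axis=0)  # one 2-norm per column
    return feature_matrix / norms, norms
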
def quiz_2_ridge_grandient_descent(sales):
    print "\n**********************************"
    print "*     Ridge Gradient Descent     *"
    print "**********************************\n"
    simple_features = ['sqft_living']
    my_output = 'price'
    train_data, test_data = sales.random_split(.8, seed=0)
    (simple_feature_matrix, output) = np_utils.get_numpy_data(
        train_data, simple_features, my_output)
    (simple_test_feature_matrix, test_output) = np_utils.get_numpy_data(
        test_data, simple_features, my_output)
    ridge = RidgeRegression()
    l2_no_reg, l2_high_reg = 0, 1e11
    initial_weights = np_utils.np.array([0., 0.])
    print "\nQ1 & Q2 coefficients with features: %s" % simple_features
    ridge_weights = compute_ridge_regression(
        ridge, simple_feature_matrix, output, [l2_no_reg, l2_high_reg],
        initial_weights)
    # print ridge_weights
    print "\nQ3: the line fit with no regularization (l2_penalty=0) is steeper"
    print "\nQ4: high regularization (l2_penalty=1e11)"
    compute_ridge_rss([ridge_weights[l2_high_reg]],
                      simple_test_feature_matrix, test_data)
    print "\t- Between 5e14 and 8e14"
    more_features = ['sqft_living', 'sqft_living15']
    initial_w_morefeatures = np_utils.np.array([0.0, 0.0, 0.0])
    (more_feature_matrix, output_more_features) = np_utils.get_numpy_data(
        train_data, more_features, my_output)
    (more_test_feature_matrix, test_output_more) = np_utils.get_numpy_data(
        test_data, more_features, my_output)
    print "\nQ5 & Q6 coefficients with features: %s" % more_features
    ridge_morefeatures = compute_ridge_regression(
        ridge, more_feature_matrix, output_more_features,
        [l2_no_reg, l2_high_reg], initial_w_morefeatures)
    print "\nQ7: using all-zero weights with features: %s" % more_features
    compute_ridge_rss([initial_w_morefeatures], more_test_feature_matrix,
                      test_data)
    print "\t- Between 1e15 and 3e15"
    num_of_house = 1  # try 5 to compare more houses
    print "\nQ8: which model makes better predictions for the 1st house:"
    for l2_penalty in [l2_no_reg, l2_high_reg]:
        print "L2:%s:" % l2_penalty
        current_predictions = np_utils.predict_output(
            more_test_feature_matrix, ridge_morefeatures[l2_penalty])
        for house_predict in range(num_of_house):
            pred, real = (current_predictions[house_predict],
                          test_data['price'][house_predict])
            print '\t\t(predict) %s vs %s (real) diff: %s' % (pred, real,
                                                              real - pred)
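
# The update inside RidgeRegression is assumed to use the standard
# L2-regularized derivative (sketch only; the repo's class may differ):
#   d/dw_j RSS(w) = 2 * sum(errors * feature_j) + 2 * l2_penalty * w_j,
# with the intercept left unregularized, consistent with Q1 above.
def feature_derivative_ridge_sketch(errors, feature, weight, l2_penalty,
                                    feature_is_constant):
    import numpy as np
    derivative = 2 * np.dot(errors, feature)
    if not feature_is_constant:  # do not regularize the intercept
        derivative += 2 * l2_penalty * weight
    return derivative
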
def quiz2_implementing_logistic_regression(products, important_words, lg_class):
    print "\n**************************************"
    print "* Implementing Logistic Regression  *"
    print "**************************************\n"
    # Set to 1 if the count of the word 'perfect' is >= 1
    products['contains_perfect'] = products['perfect'] >= 1
    print "\nQ1: # of reviews containing the word perfect: %s" % products[
        'contains_perfect'].sum()
    print "\nTransforming data to numpy arrays ...."
    feature_matrix, sentiment = np_utils.get_numpy_data(
        products, important_words, 'sentiment')
    # shape is (# of data points, # of features incl. the intercept)
    n_observations, n_features = feature_matrix.shape
    print "\nQ2: # of features in the feature_matrix: %s" % n_features
    # *******************
    #  Logistic model
    # *******************
    print "\nCreating logistic model ...."
    coefficients = predict_coefficients_logistic_model(feature_matrix,
                                                       sentiment, lg_class)
    print "\nQ4: as each iteration of gradient ascent passes, the log-likelihood: increases"
    predictions_yi, correctly_classified = compute_correct_score_predictions(
        feature_matrix, coefficients)
    print "\nQ5: # of reviews predicted to have positive sentiment: %s" % correctly_classified
    accuracy = compute_accuracy_of_the_model(predictions_yi, products)
    print "\nQ6: accuracy of the model on predictions: %s" % round(accuracy, 2)
    word_coefficient_tuples = get_word_coeff_tuples(important_words,
                                                    coefficients)
    top_words = map(lambda x: x[0], word_coefficient_tuples[:10])
    select = list({'love', 'easy', 'great', 'perfect', 'cheap'} - set(top_words))
    print "\nQ7: not present in the top 10 most positive words: %s" % select
    least_words = map(lambda x: x[0], word_coefficient_tuples[-10:])
    select_least = list({'need', 'work', 'disappointed', 'even'} - set(least_words))
    print "\nQ8: not present in the top 10 most negative words: %s" % select_least
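
# compute_accuracy_of_the_model is assumed to reduce to this sketch
# (hypothetical stand-in): accuracy = # correctly classified / # total,
# comparing the +1/-1 class predictions against the sentiment column.
def compute_accuracy_sketch(predictions, products):
    import numpy as np
    correct = np.sum(np.array(predictions) == np.array(products['sentiment']))
    return correct / float(len(products))
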
def more_features_with_lasso_coordinate(lasso, sales):
    train_data, test_data = sales.random_split(.8, seed=0)
    all_features = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated'
    ]
    feature_matrix_norm, train_output, train_norms = np_utils.get_normalized_data(
        train_data, all_features, 'price')
    initial_weights = np_utils.np.zeros(len(all_features) + 1)
    weights_info, nnz_features = {}, {}
    penalty_tolerance = [[1e7, 1.0], [1e8, 1.0], [1e4, 5e5]]
    penalty_str = {1e7: '1e7', 1e8: '1e8', 1e4: '1e4'}
    print "\nFeatures assigned (non-zero weights) for Q5, Q6, Q7:"
    for penalty, tolerance in penalty_tolerance:
        weights = lasso.lasso_cyclical_coordinate_descent(
            feature_matrix_norm, train_output, initial_weights, penalty,
            tolerance)
        # print weights
        # Rescale the learned weights so they apply to unnormalized features
        weights_normalized = weights / train_norms
        weights_info[penalty] = weights_normalized
        dict_weights = dict(
            zip(['constant'] + all_features, weights_normalized))
        # Non-zero (not just positive) weights count as selected features
        nnz_features[penalty] = filter(lambda x: dict_weights[x] != 0,
                                       dict_weights)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty],
                                         nnz_features[penalty])
    print "\nQ8: RSS of the three models on the TEST data:"
    test_feature_matrix, test_output = np_utils.get_numpy_data(
        test_data, all_features, 'price')
    for penalty, tolerance in penalty_tolerance:
        current_predictions = np_utils.predict_output(test_feature_matrix,
                                                      weights_info[penalty])
        RSS = reg.compute_RSS(current_predictions, test_output)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty], RSS)
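
# One coordinate step of lasso_cyclical_coordinate_descent is assumed to be
# the standard soft-thresholding update on normalized features (sketch only;
# the repo's lasso object may differ):
#   ro_i = sum(feature_i * (output - prediction + w_i * feature_i))
#   w_i  = ro_i + penalty/2  if ro_i < -penalty/2
#          ro_i - penalty/2  if ro_i >  penalty/2
#          0                 otherwise
# The intercept w_0 is set to ro_0 directly, i.e. it is not penalized.
def lasso_coordinate_step_sketch(i, feature_matrix, output, weights, l1_penalty):
    import numpy as np
    prediction = np.dot(feature_matrix, weights)
    ro_i = np.dot(feature_matrix[:, i],
                  output - prediction + weights[i] * feature_matrix[:, i])
    if i == 0:  # intercept: no L1 penalty
        return ro_i
    elif ro_i < -l1_penalty / 2.:
        return ro_i + l1_penalty / 2.
    elif ro_i > l1_penalty / 2.:
        return ro_i - l1_penalty / 2.
    return 0.
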
def get_predictions(dataset, features, output, weights):
    # 'output' is the name of the target column, e.g. 'price'
    feature_matrix, output_data = np_utils.get_numpy_data(dataset, features,
                                                          output)
    predictions = np_utils.predict_output(feature_matrix, weights)
    return predictions
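
# predict_output is assumed to be a plain matrix-vector product (sketch;
# the real np_utils helper may differ): one prediction per data row.
def predict_output_sketch(feature_matrix, weights):
    import numpy as np
    return np.dot(feature_matrix, weights)
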
def calculate_weights(gradient, dataset, features, output, parameters):
    initial_weights, step_size, tolerance = parameters
    feature_matrix, output_data = np_utils.get_numpy_data(dataset, features,
                                                          output)
    weights = gradient.regression_gradient_descent(
        feature_matrix, output_data, initial_weights, step_size, tolerance)
    return weights
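
# regression_gradient_descent is assumed to iterate the usual least-squares
# update until the gradient magnitude drops below tolerance (sketch only;
# the repo's gradient object may differ).
def regression_gradient_descent_sketch(feature_matrix, output, initial_weights,
                                       step_size, tolerance):
    import numpy as np
    weights = np.array(initial_weights, dtype=float)
    while True:
        errors = np.dot(feature_matrix, weights) - output
        gradient = 2 * np.dot(feature_matrix.T, errors)  # d RSS / d w
        weights -= step_size * gradient
        if np.sqrt(np.sum(gradient ** 2)) < tolerance:
            return weights
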
def main():
    try:
        print "\n**************************************"
        print "*          Online Learning           *"
        print "**************************************\n"
        products = gp.load_data('../../data_sets/amazon_baby_subset.gl/')
        important_words = gp.load_json_file(
            '../../data_sets/important_words.json')
        # Remove punctuation
        products['review_clean'] = products['review'].apply(
            gp.remove_punctuation)
        # Add one column per important word with its number of occurrences
        # in each review
        for word in important_words:
            products[word] = products['review_clean'].apply(
                lambda s: s.split().count(word))
        # print products[:10]
        train_data, validation_data = products.random_split(.9, seed=1)
        feature_matrix_train, sentiment_train = np_utils.get_numpy_data(
            train_data, important_words, 'sentiment')
        feature_matrix_valid, sentiment_valid = np_utils.get_numpy_data(
            validation_data, important_words, 'sentiment')
        print "\nQ1: how does stochastic gradient ascent affect the number of features: stays the same"
        print "\nQ2: llA(w) = (1/N) * ll(w) --> only adds the (1/N) factor"
        print "\nQ3: dli(w)/dwj is a --> scalar"
        print "\nQ4: dli(w)/dwj (minibatch) is a --> scalar"
        print "\nQ5: to match the full gradient, set B = N (size of train_data): %s" % len(
            train_data)
        print "\nQ6: logistic_regression_SG acts as standard gradient ascent when B = N (size of train_data): %s" % len(
            train_data)
        lg = cl_utils.LogisticRregStochastic()
        coefficients, log_likelihood = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=1,
            max_iter=10,
            verbose=False)
        print "\nQ7: with batch_size = 1, as each iteration passes, the average log likelihood in the batch: fluctuates"
        # print coefficients
        coefficients_batch, log_likelihood_batch = lg.logistic_regression_SG(
            feature_matrix_train,
            sentiment_train,
            initial_coefficients=np_utils.np.zeros(194),
            step_size=5e-1,
            batch_size=len(feature_matrix_train),
            max_iter=200,
            verbose=False)
        print "\nQ8: with batch_size = 47780, as each iteration passes, the average log likelihood in the batch: increases"
        # print coefficients_batch
        print "\nQ9: # of gradient updates performed at the end of two passes with batch_size = 100: (2 * 50000) / 100.0 = %s" % (
            (2 * 50000) / 100.0)
        # log_likelihood_metrics(lg, feature_matrix_train, sentiment_train)
        plot_stochastic_and_batch(lg, feature_matrix_train, sentiment_train,
                                  log_likelihood_batch)
        print "\nQ10: passes needed to achieve a similar log likelihood as stochastic gradient ascent: 150 passes or more"
        # effects_of_step_size(lg, feature_matrix_train, sentiment_train, train_data)
        print "\nQ11: worst step size is: 1e2"
        print "\nQ12: best step size is: 1e0"
    except Exception as details:
        print ">> Exit or Errors \n%s, %s" % (details, traceback.print_exc())
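
# One mini-batch update of logistic_regression_SG is assumed to look like this
# sketch (hypothetical stand-in for cl_utils.LogisticRregStochastic): step in
# the direction of the batch-averaged likelihood gradient,
#   (1/B) * sum(feature_j * (indicator(y = +1) - P(y = +1 | x, w))),
# which is why B = N recovers standard (full) gradient ascent in Q6.
def stochastic_gradient_step_sketch(feature_matrix, sentiment, coefficients,
                                    step_size, i, batch_size):
    import numpy as np
    batch = slice(i, i + batch_size)
    scores = np.dot(feature_matrix[batch], coefficients)
    predictions = 1. / (1. + np.exp(-scores))   # P(y = +1 | x, w)
    indicator = (sentiment[batch] == +1)
    errors = indicator - predictions
    gradient = np.dot(feature_matrix[batch].T, errors) / float(batch_size)
    return coefficients + step_size * gradient  # ascent, not descent
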