def main():
    """Train one ridge model per PRI_jet_num subset and write the submission.

    Loads the train and test CSV files, preprocesses them together, splits
    both sets with ``divide_data``, fits a ridge regression on every training
    subset (lambda picked by ``lambda_cv``), prints the accuracy on the
    training set and the label distribution predicted for the test set, and
    finally writes the test predictions to ``OUTPUT_PATH``.
    """
    y_train, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or
    # root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True,
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    # The first len(tX_train) rows are the training samples, the rest the test
    # samples (same order as in the vstack above).
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    y_tr_splitted = [y_train[subset_idx] for subset_idx in indices_tr]

    # Train: one set of weights per subset
    weights = []
    for i, (tx_part, y_part) in enumerate(zip(tX_tr_splitted, y_tr_splitted)):
        lambda_ = lambda_cv(tx_part, y_part)
        print(f"Class {i}, lambda: {lambda_}")
        # Only the first element returned by ridge_regression is kept.
        weights.append(ridge_regression(y_part, tx_part, lambda_)[0])

    # Predict: write every subset's labels back at its original positions
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i, w in enumerate(weights):
        y_pr_tr[indices_tr[i]] = predict_labels(w, tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(w, tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count each label explicitly: indexing the counts returned by np.unique
    # fails when only one label is predicted and mislabels the counts when any
    # other value (e.g. an unfilled 0) is present.
    n_negative = np.count_nonzero(y_pr_te == -1)
    n_positive = np.count_nonzero(y_pr_te == 1)
    print(
        f"Distribution on test data class -1: {n_negative}, class +1: {n_positive}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
# -*- coding: utf-8 -*-
#!/bin/python3.5
"""
Entry point of the project: running this file writes our final submission
into a "submission.csv" in the data folder.
"""
from als import ALS
from sgd import SGD
from helpers import create_csv_submission, load_data

if __name__ == '__main__':
    # Read the ratings that the model is trained on
    print("Loading dataset")
    ratings = load_data("data/data_train.csv")

    # Build the submission from the best prediction.
    # ALS alternative that was tried: ALS(ratings, None, 3, 0.2, 0.9)
    # The second value returned by SGD is not needed here.
    prediction, _ = SGD(ratings, None, 0.04, 9, 0.1, 0.016)

    create_csv_submission(prediction)
    print("Submission created at data/submission.csv")
# Build the model initial_w = np.random.randn(D) optimal_gamma, optimal_lambda_, measure_tr, measure_te = \ gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas[i], lambdas[i], seed = seed, batch_size = batch_size, metric = metric, model = model) print('CA_bs:', CA_baseline) print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n') # Update the expected training error exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0] exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0] # Build the model with the best hyperparameters w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size) # Get predictions y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset))) # Insert the ids and predictions to the ids and y_pred arrays ids = np.concatenate((ids, ids_test_subset)) y_pred = np.concatenate((y_pred, y_pred_test)) # Sort the ids and y_pred arrays ids, y_pred = sort_arr(ids, y_pred) # Create the submission CSV file create_csv_submission(ids, y_pred, sumbission_fname) print("Expected training accuracy / loss:", exp_measure_tr) print("Expected test accuracy / loss:", exp_measure_te)
# Key both RDDs by their first two fields so they can be joined on that pair
true_test = test2.map(lambda entry: ((entry[0], entry[1]), entry[2]))
pred_test = pred_test.map(lambda entry: ((entry[0], entry[1]), entry[2]))
true_pred = true_test.join(pred_test)

# After the join every record is (key, (true_value, predicted_value))
MSE_test = true_pred.map(lambda pair: (pair[1][0] - pair[1][1])**2).mean()
RMSE_test = np.sqrt(MSE_test)

print("Train rmse : ", RMSE_train)
print("Test rmse : ", RMSE_test)

# Generate the submission
testdata = sc.textFile("sample_submission.csv")
testheader = testdata.first()  # extract header
# NOTE(review): `r` is not defined in the visible code; it has to exist in the
# enclosing scope when the last map runs — confirm.
testdata = (
    testdata
    .filter(lambda line: line != testheader)
    .map(lambda line: line.split(','))
    .map(lambda fields: (row_col_spark(fields[0], r)))
)

predictions = model.predictAll(testdata).map(
    lambda rating: ((rating[0], rating[1]), rating[2]))
pred = predictions.collect()

# Copy the collected predictions into a sparse matrix
matrix_pred = sp.dok_matrix((10000, 1000), dtype=np.float32)
for (row_idx, col_idx), value in pred:
    matrix_pred[row_idx, col_idx] = value

# The sample submission provides the positions that have to be written
path_dataset2 = "sample_submission.csv"
sub_ex = load_data(path_dataset2)
create_csv_submission(list(zip(*sub_ex.nonzero())), matrix_pred,
                      'submission.csv')
# Convert collection of text documents to a matrix of token counts, reusing
# the vocabulary stored on disk.
# NOTE: pickle files must only be loaded from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('models/vocabulary.p', 'rb') as vocabulary_file:
    vocabulary_to_load = pickle.load(vocabulary_file)

vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 3),
    max_df=0.9261187281287935,
    min_df=4,
    vocabulary=vocabulary_to_load,
)
# NOTE(review): this is a private scikit-learn method — confirm it is still
# required with the installed version.
vectorizer._validate_vocabulary()
test_data_features = vectorizer.transform(test)

# Transform count matrix to a normalized tf-idf representation
with open('models/corpus_data_tfidf_fitted.p', 'rb') as tfidf_file:
    tfidf_transformer = pickle.load(tfidf_file)
test_data_features_tfidf = tfidf_transformer.transform(test_data_features)

# PREDICT LABELS
print("Predicting the labels...")
predicted_labels = clf_logreg.predict(test_data_features_tfidf)
predictions_list = [int(label) for label in predicted_labels]

# CREATE SUBMISSION FILE
print("Creating submission file in results/ folder")
helpers.create_csv_submission(ids, predictions_list, 'results/submission.csv')
print("Submission file successfully created!")
import pickle

from helpers import samples_csv_submission, create_csv_submission

# Restore the four objects stored in the model pickle, in the order in which
# they are unpacked here.
with open('model.pickle', 'rb') as model_file:
    item_features, user_features, bias_item, bias_user = pickle.load(model_file)

# The sample submission is read with the project helper and handed over,
# together with the restored model, to the submission writer.
SUBMISSION_SAMPLES_PATH = "./Data/sample_submission.csv"
samples_submission = samples_csv_submission(SUBMISSION_SAMPLES_PATH)

create_csv_submission(
    samples_submission,
    item_features,
    user_features,
    bias_item,
    bias_user,
    'submission_run.csv',
)
import preprocessing
import pipeline
import helpers

print('Loading the dataset...')
tweets, size_pos, size_neg = preprocessing.load_train_tweets(
    "./data/train_pos_full.txt",
    "./data/train_neg_full.txt",
)

# Targets for the training tweets, built from the two sizes returned above
pred = preprocessing.predictions(size_pos, size_neg)

# Test tweets together with their ids
x_test, ids_test = preprocessing.load_test_tweets('./data/test_data.txt')

print('Training classifier...')
# Best pipeline with TF-IDFVectorizer with gram = (1,4) and LinearSVC, which
# yielded the best results on crowdai.org
clf = pipeline.pipeline_model(1, 4)
clf.fit(tweets, pred)

# Predict the test tweets and write the submission csv file
test_predictions = clf.predict(x_test)
helpers.create_csv_submission(ids_test, test_predictions, "submission.csv")
def _scaled_document_vectors(corpus, num_of_dim, global_vectors):
    """Stack the document vector of every entry of `corpus` and scale the result."""
    document_vecs = np.concatenate([
        GM.buildDocumentVector(doc, num_of_dim, global_vectors)
        for doc in corpus
    ])
    return sk.preprocessing.scale(document_vecs)


def get_prediction(neural_net, global_vectors, full_corpus, total_training_tweets, nr_pos_tweets, kaggle_name, epochs, patience, split=0.8):
    """
    Trains a neural net on the training part of the corpus, writes a csv file
    with the kaggle predictions for the remaining part and returns them.

    Input:
        neural_net: Callable that receives the vector dimension and returns
                    the model to train.
        global_vectors: Global vectors created out of the gensim .txt files;
                        must expose `syn0`.
        full_corpus: Sliceable corpus. The first `total_training_tweets`
                     entries are the training tweets, the remaining entries
                     are the unseen test set.
        total_training_tweets: (int) Number of tweets that are training tweets.
        nr_pos_tweets: (int) Number of training tweets that are positive.
        kaggle_name: Name for the csv file, must end in '.csv'.
        epochs: Number of epochs passed to `model.fit`.
        patience: Patience passed to the early stopping callback.
        split: Value passed to `HL.split_data` to separate training and
               validation data (default 0.8).

    Output:
        prediction: list with the predictions (1 or -1), one per test tweet

        Side effects: the model checkpoint is stored in
        'neural_model_prediction.hdf5' and a .csv file with name
        'kaggle_name' is created.
    """
    # NOTE(review): `syn0` is a legacy gensim attribute — confirm it exists in
    # the installed version.
    num_of_dim = global_vectors.syn0.shape[1]

    # Separate train data and test data
    train_corpus = full_corpus[:total_training_tweets:]
    predict_corpus = full_corpus[total_training_tweets::]

    # Build a vector of all the words in a tweet
    train_document_vecs = _scaled_document_vectors(train_corpus, num_of_dim,
                                                   global_vectors)

    # NOTE(review): `nr_pos_tweets` is passed twice and `total_training_tweets`
    # is not used for the labels — verify against HL.create_labels.
    labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets, kaggle=False)

    train_document_vecs, labels = HL.shuffle_data(train_document_vecs, labels)
    train_x, val_x, train_y, val_y = HL.split_data(train_document_vecs,
                                                   labels, split)

    # NOTE(review): the test vectors are scaled on their own, independently of
    # the training vectors — confirm this is intended.
    test_document_vecs = _scaled_document_vectors(predict_corpus, num_of_dim,
                                                  global_vectors)

    model = neural_net(num_of_dim)

    # Defining callbacks to be used under fitting process
    early_stopping = early_stopping_callback(patience_=patience, verbose_=1)
    model_checkpoint = model_checkpoint_callback(
        "neural_model_prediction.hdf5", verbose_=1)

    model.fit(train_x,
              train_y,
              epochs=epochs,
              batch_size=1024,
              verbose=1,
              callbacks=[early_stopping, model_checkpoint],
              validation_data=(val_x, val_y))

    # Loading the best model found during training
    model = load_model('neural_model_prediction.hdf5')

    prediction = model.predict(test_document_vecs)
    # Convert the model output to the labels 1 / -1
    prediction = [1 if i > 0.5 else -1 for i in prediction]

    # Creating prediction: ids start at 1 and follow the number of predictions
    # (this was hard-coded to 10000 before), so ids and predictions always
    # have the same length.
    ids = list(range(1, len(prediction) + 1))
    HL.create_csv_submission(ids, prediction, kaggle_name)

    return prediction
# External libraries
import csv
import pickle
import time

import keras as K

# internal imports
import helpers as HL

# Loading pre-processed document vectors for test-set.
# NOTE: pickle files must only be loaded from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open("final_document_vectors.pkl", "rb") as vectors_file:
    test_document_vecs = pickle.load(vectors_file)

# Loading neural net model
model = K.models.load_model('final_model_for_kaggle.hdf5')

# Predicting on test set with neural net model
prediction = model.predict(test_document_vecs)

# Convert results to kaggle format ( -1, 1 )
prediction = [1 if i > 0.5 else -1 for i in prediction]

# CREATING SUBMISSION
# Ids start at 1 and follow the number of predictions (this was hard-coded to
# 10000 before), so ids and predictions always have the same length.
ids = list(range(1, len(prediction) + 1))
HL.create_csv_submission(ids, prediction, 'powerpuffz_kagglescore.csv')
print("Prediction created - powerpuffz_kagglescore.csv")
PRI_JET_NUM_INDEX) # We achieved our best results using Regularized Logistic Regression, # so we only load only those previously computed optimal params to generate the submission logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True) logistic_best_models = [] for (lambda_, deg, gamma), train_classes_split, train_data_split in \ zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits): data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=np.int(deg), cross_term=True, norm_first=False) initial_w = np.zeros((data_split.shape[1],)) w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_, initial_w, 500, gamma, 1) print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}') logistic_best_models.append((w, loss, columns_to_remove, mean, std)) # Calculate the predictions for each of the 4 subsets using the weights and then combine them results = None for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \ zip(logistic_best_models, logistic_best_params, test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits): test_data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=np.int(deg), columns_to_remove=col_to_rm, cross_term=True, norm_first=False, mean=mean, std=std) pred = predict_labels(w, test_data_split) out = np.stack((test_ids_split, pred), axis=-1) results = out if results is None else np.vstack((results, out)) # Create the submission create_csv_submission(results[:, 0], results[:, 1], 'results/logistic_submission.csv')