def train_original():
    fake, real = data_processing.load_data()
    data, keywords = data_processing.process_data(fake, real)
    training_set = data_processing.Headlines(data[0])
    validation_set = data_processing.Headlines(data[1])
    testing_set = data_processing.Headlines(data[2])
    print('Data Loaded')
    model = classifiers.ConvnetClassifier(len(keywords), data[0][0][0].shape[1]).cuda()
    loss_fn = torch.nn.CrossEntropyLoss().cuda()
    training_loss, validation_loss = train.train_classifier(
        model, loss_fn, training_set, validation_set, patience=3)
    plt.plot(training_loss)
    plt.plot(validation_loss)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(('Training Set', 'Validation Set'))
    plt.savefig('error_orig.png')
    plt.show()
    torch.save(model.state_dict(), 'model_orig.pkl')
    model.eval()
    print('Achieved {:%} accuracy on the training set.'.format(
        train.get_accuracy(model, training_set)))
    print('Achieved {:%} accuracy on the validation set.'.format(
        train.get_accuracy(model, validation_set)))
    print('Achieved {:%} accuracy on the testing set.'.format(
        train.get_accuracy(model, testing_set)))
def main():
    torch.manual_seed(24)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(device)
    data_dir = './clean_data/full'
    model_name = '3LayerModel'
    word_length = 8
    corpus, labels, vector_length, _, label_dict = process_data(
        data_dir, word_length=word_length)
    num_languages = len(label_dict.keys())
    for i in range(num_languages):
        count = len([l for l in labels if l == i])
        print(f'{label_dict[i]} word count = {count}')
    print(f'Vector length = {vector_length}')
    model = LanguageNet(vector_length, word_length, num_languages)
    model = model.to(device)
    # model = ConvLanguageNet(vector_length, word_length, num_languages)
    model.train()
    training_loop(model, corpus, labels, model_name='Linear', device=device)
    model = ConvLanguageNet(vector_length, word_length, num_languages)
    model = model.to(device)
    model.train()
    training_loop(model, corpus, labels, model_name='Convolution', device=device)
def main(filepath, modelpath):
    """Main function.

    Args:
        filepath (txt): filepath containing our dataset
        modelpath (h5): modelpath containing our model
    """
    df, eng_deu_lines = read_file(filepath)
    dfp = process_data(df)
    eng_deu = data_array(dfp)
    eng_tok = token(eng_deu[:, 0])
    eng_len_vocab = len(eng_tok.word_index) + 1
    deu_tok = token(eng_deu[:, 1])
    deu_len_vocab = len(deu_tok.word_index) + 1
    data_train, data_test = train_test_split(eng_deu, test_size=0.2, random_state=1)
    X_train = encoding(data_train[:, 1], 8, deu_tok)
    y_train = encoding(data_train[:, 0], 8, eng_tok)
    X_test = encoding(data_test[:, 1], 8, deu_tok)
    # English targets for the test split
    y_test = encoding(data_test[:, 0], 8, eng_tok)
    model = load_model(modelpath)
    preds = model.predict_classes(X_test.reshape((X_test.shape[0], X_test.shape[1])))
    df_preds = prediction(preds, eng_tok)
    return df_preds
def main(trainPath, testPath, submissionPath, processData=True,
         X_train=None, X_test=None, y_train=None, y_test=None,
         X_submission_df=None, X_submission=None):
    max_score = 0
    iter = 0
    if processData:
        X_train, X_test, y_train, y_test, X_submission_df, X_submission = process_data(
            trainPath, testPath)
    for description, model in models.items():
        print(description)
        print(model.fitModel(X_train, y_train))
        score = model.getTestScore(X_test, y_test)
        if score > max_score:
            max_score = score
        print(score)
        model.evaluateModel(X_submission, X_submission_df, submissionPath, iter)
        iter += 1
    return max_score
def largest_activations():
    fake, real = data_processing.load_data()
    _, keywords = data_processing.process_data(fake, real)
    model = classifiers.ConvnetClassifier(len(keywords), 40)
    model.load_state_dict(torch.load('model_orig.pkl'))
    weights = model.classifier[1].weight.data.numpy()
    print("Real sequences")
    most_real = np.argsort(weights[0])[-10:]
    for most in most_real:
        if most < 100:
            conv = model.features3[0].weight.data.numpy()[most]
        elif most < 200:
            conv = model.features4[0].weight.data.numpy()[most - 100]
        else:
            conv = model.features5[0].weight.data.numpy()[most - 200]
        print(*keywords[np.argmax(conv, 0)])
    print("Fake sequences")
    most_fake = np.argsort(weights[1])[-10:]
    for most in most_fake:
        if most < 100:
            conv = model.features3[0].weight.data.numpy()[most]
        elif most < 200:
            conv = model.features4[0].weight.data.numpy()[most - 100]
        else:
            conv = model.features5[0].weight.data.numpy()[most - 200]
        print(*keywords[np.argmax(conv, 0)])
def cross_validation(y, x, k_indices, k, lambda_, degree):
    # Dividing into subgroups
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    tx_te = x[te_indice]
    tx_tr = x[tr_indice]
    # Preprocessing data: cleaning, standardizing and adding a constant column
    tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)
    # Feature augmentation through polynomials
    tx_tr = build_poly(tx_tr, degree)
    tx_te = build_poly(tx_te, degree)
    # Printing the degree and lambda being tested
    print("Test: d = ", degree, "; l = ", lambda_)
    # Training with ridge regression
    w, loss = ridge_regression(y_tr, tx_tr, lambda_)
    # Computing the prediction vector
    y_pred = predict_labels(w, tx_te)
    # Computing accuracy on the test fold
    accuracy = compute_accuracy(y_te, y_pred)
    # Logging information
    print("Accuracy = ", accuracy, "; loss = ", loss, "\n")
    return loss, accuracy
def main():
    data = process_data('test.txt')
    bar_plot_LEO(data)
    total_line_and_scatter_plot_LEO(data)
    total_bar_stacked_LEO(data)
    orbit_plot()
    compare_years_by_orbit(data)
    compare_by_alt(data)
def main():
    df = dp.process_data('test.txt')
    df = probability_calc(df)
    df.to_csv('probability.csv')
    print('Probability Finished!')
def main():
    df = pd.read_csv("GamingStudy_data.csv")
    data = data_processing.process_data(df)
    avg_GAD_over_20(data)
    avg_GAD_under_20(data)
    avg_hours_work(data)
    hours_game_age(data)
    narcissism_gaming_hours(data)
    narcissism_over_20_hours(data)
    narcissism_mental_health(data, 'GAD_T')
    narcissism_mental_health(data, 'SPIN_T')
    narcissism_mental_health(data, 'SWL_T')
def predict(model, data):
    # df = pd.read_json(data)
    df = pd.DataFrame([data])
    X_top = dp.process_data(df)
    # with open('data/random_forest.pkl') as f:
    #     rf_top = pickle.load(f)
    # Using the unpickled model to predict on new data
    y_pred = model.predict_proba(X_top)
    # Adds a new column to df with the predicted probability of fraud
    df['fraud_prob'] = y_pred[:, 1]
    return df
def main(pd, gs):
    if pd:
        process_data('train.csv', 'test.csv')
    y_train_jets = []
    tx_train_jets = []
    ids_train_jets = []
    y_test_jets = []
    tx_test_jets = []
    ids_test_jets = []
    load_data_sets(y_train_jets, tx_train_jets, ids_train_jets,
                   y_test_jets, tx_test_jets, ids_test_jets)
    degree_best_jets = [6, 6, 6, 6]
    lambda_best_jets = [6e-05, 0.0023, 4.6e-09, 5.7e-05]
    if gs:
        perform_grid_search_with_cross_validation(degree_best_jets, lambda_best_jets,
                                                  y_train_jets, tx_train_jets)
    predictions = []
    ids_predicted = []
    learn(predictions, ids_predicted, y_train_jets, tx_train_jets, tx_test_jets,
          ids_test_jets, lambda_best_jets, degree_best_jets)
    combine_and_create_submission(predictions, ids_predicted, 'submit_E_M_D_best')
def create_classification_models(loop_features, loop_targets, feature_names, main_params):
    loop_features, loop_targets, new_feature_names, new_feature_indices = process_data(
        loop_features, loop_targets, feature_names, main_params["data_processing"], True)
    if np.shape(np.array(loop_features))[1] == 0:
        print("Canceling create_classification_models due to lack of features after data processing")
        return
    analyze_data(loop_features, loop_targets, new_feature_names,
                 main_params["data_analysis"], False, True)
    # test_ratio = 1 - main_params["sampling_params"]["train_ratio"]
    # x_train, x_test, y_train, y_test = train_test_split(loop_features, loop_targets, test_size=test_ratio)
    run_sk_classification(loop_features, loop_targets, main_params,
                          new_feature_indices, new_feature_names)
def test_shapes(self):
    proc = process_data()
    under_shape = proc.x_train_under.shape
    under_target_shape = (437, 3197)
    shrink_shape = proc.x_train_shrink.shape
    shrink_target_shape = (800, 3197)
    over_shape = proc.x_train_over.shape
    over_target_shape = (10100, 3197)
    # test if each resampled training set has the expected shape
    self.assertEqual(under_shape, under_target_shape)
    self.assertEqual(shrink_shape, shrink_target_shape)
    self.assertEqual(over_shape, over_target_shape)
def create_regression_models(loop_features, loop_targets, feature_names, main_params):
    loop_features, loop_targets, new_feature_names, new_feature_indices = process_data(
        loop_features, loop_targets, feature_names, main_params["data_processing"], False)
    if np.shape(np.array(loop_features))[1] == 0:
        print("Canceling create_regression_models due to lack of features after data processing")
        return
    analyze_data(loop_features, loop_targets, new_feature_names,
                 main_params["data_analysis"], False, False)
    x_train, x_test, y_train, y_test = utils.stratified_regressions_sampling(
        loop_features, loop_targets, main_params["sampling_params"])
    if main_params["data_processing"]["regression_smote"]:
        x_train, y_train = smote_oversampling_regression(x_train, y_train, new_feature_names)
    run_sk_regression(x_train, x_test, y_train, y_test, main_params,
                      new_feature_indices, new_feature_names)
def mean_absolute_percentage_error(y_true, y_pred):
    import numpy as np
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


ks = [300, 300, 300]  # every other

# %%
MAPES = []
for k_prof in ks:
    for k_hash in ks:
        X, y = process_data('Business Analytics/training_set.csv',
                            k_prof=k_prof, k_hash=k_hash, training=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        model = XGBRegressor(colsample_bytree=0.5, gamma=0.05, max_depth=4,
                             min_child_weight=4, n_estimators=1000, subsample=0.6)
        model.fit(X_train, y_train)
        print("MAPE Train Score ")
        print(mean_absolute_percentage_error(y_train, model.predict(X_train)))
    # create best_words dictionary whose keys are suggestions and values are word probabilities
    best_words = {s: probs.get(s, 0) for s in suggestions}
    # get the highest-probability entries from the best_words dictionary
    n_best = Counter(best_words).most_common(n)
    # n_best = [[s, probs[s]] for s in list(reversed(suggestions))]
    if verbose:
        print("entered word =", word, "\nsuggestions =", suggestions)
    return n_best


if __name__ == "__main__":
    # load corpus data
    word_l = data_processing.process_data("../input/shakespeare.txt")
    # create set of corpus words
    vocab = set(word_l)
    # get word frequencies
    word_count_dict = data_processing.get_count(word_l)
    # get probability of each word in the corpus
    probs = data_processing.get_prob(word_count_dict)
    my_word = 'dys'
    tmp_corrections = get_corrections(my_word, probs, vocab, 2, verbose=True)
    for i, word_prob in enumerate(tmp_corrections):
    return input_list_train, input_list_val, input_list_test


def logloss(y, pred):
    N = len(y)
    score = 0
    for i in range(N):
        score += y[i] * log(pred[i]) + (1 - y[i]) * log(1 - pred[i])
    return -score / N


## Network training ##

# Loading data
df_train = process_data()
print("First steps of the neural network... ")
cols = [c for c in df_train.columns if c not in ['is_churn', 'msno']]
# scikit-learn's train_test_split returns the splits as X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(df_train[cols], df_train['is_churn'],
                                                    test_size=0.30, random_state=242)
print("X_train = ", X_train)
print("y_train = ", y_train)
print("X_test = ", X_test)
print("y_test = ", y_test)
def main():
    df = process_data('test.txt')
    polynomial_fit_count(df)
    polynomial_fit_probability(df)
    get_orbit_tally(df, 'GEO')
from cnn import Net
import data_processing
from data_processing import process_data
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

train, test, val = process_data()
device = torch.device('cuda:0')
net = Net().to(device)


def train_model(net):
    X = torch.Tensor([i[0] for i in train]).view(-1, 98, 98).to(device)
    # Normalize data
    X = X / 255.0
    y = torch.Tensor([i[1] for i in train]).to(device)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    loss_function = nn.MSELoss()
    epochs = 2
    batch_size = 32
    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for i in range(0, len(X), batch_size):
            batch_X = X[i: i + batch_size].view(-1, 1, 98, 98).to(device)
            batch_y = y[i: i + batch_size].to(device)
#!/usr/bin/python
from FitnessCalc import FitnessCalc
from GeneticAlgorithm import GeneticAlgorithm
from data_processing import process_data

# initialize empty collection
summary = process_data('../resources/train.csv', 500, True)
# create GA instance
ga = GeneticAlgorithm()
# load data into fitness calculator
fc = FitnessCalc(summary)
# create initial population
ga.create_initial_population()
for generations in xrange(1, 51):
    max_fitness = 0.0
    print 'generation ' + str(generations)
    print 'max fitness: ' + str(max_fitness)
    # assign fitness levels
    total_fitness = 0.0
    for i, indiv in enumerate(ga.population):
        fitness = fc.calculate_fitness(indiv['chromosome'])
        total_fitness += fitness
        indiv['fitness'] = fitness
        print str(round(fitness, 2)) + ' ' + indiv['chromosome']
    print 'avg fitness: ' + str(float(total_fitness) / (i + 1))
from implementations import ridge_regression
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from data_processing import process_data, build_poly

print("Loading data\n")
# Loading data from csv files
y_tr, tx_tr, ids_tr = load_csv_data("data/train.csv")
y_te, tx_te, ids_te = load_csv_data("data/test.csv")

# Hyper-parameter definitions
degree = 7
lambda_ = 0.00025

# Preprocessing data: cleaning, standardizing and adding a constant column
tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

# Feature augmentation through polynomials
tx_tr = build_poly(tx_tr, degree)
tx_te = build_poly(tx_te, degree)

# Training with ridge regression
print("Training the model\n")
weights, _ = ridge_regression(y_tr, tx_tr, lambda_)

# Computing the prediction vector
y_pred = predict_labels(weights, tx_te)

# Creating the submission file
create_csv_submission(ids_te, y_pred, "prediction.csv")
        # Print progress
        print(f"Current iteration: {i+1}/{len(max_depth_array)}")

    # Plot
    plt.style.use('seaborn-whitegrid')
    plt.plot(max_depth_array, train_score, label="Train score")
    plt.plot(max_depth_array, valid_score, label="Validation score")
    plt.ylabel("Recall score", fontsize=14)
    plt.xlabel("Max depth", fontsize=14)
    plt.title("Overfitting decision tree on oversampled train and validation", fontsize=16)
    plt.legend()
    plt.savefig("../visuals/overfitting_dt_depth.pdf")
    plt.show()


if __name__ == "__main__":
    # getting training and testing data
    df = process_data(print_results=False)
    x_train_up = df.x_train_over        # Oversampled training data
    y_train_up = df.y_train_over        # Oversampled training target
    x_train_down = df.x_train_shrink    # Shrunk and oversampled training data
    y_train_down = df.y_train_shrink    # Shrunk and oversampled training target
    x_train = df.x_train                # Training data
    y_train = df.y_train                # Training targets
    x_test = df.x_test                  # Test data
    y_test = df.y_test                  # Test target

    tune_decision_tree()
            'out/fly/acoustic-guitars.csv'))
    items.append(
        SearchItem(
            'https://www.fly-music.ro/21-chitare-chitari-chitara-bass-electrice-electrica',
            'out/fly/electric-bass.csv'))
    items.append(
        SearchItem('https://www.fly-music.ro/144-sintetizatoare-sintetizator',
                   'out/fly/organs.csv'))
    items.append(
        SearchItem(
            'https://www.fly-music.ro/80-chitare-chitari-chitara-electro-acustice--acustica-seturi',
            'out/fly/electro-acoustic-guitars.csv'))
    items.append(
        SearchItem('https://www.fly-music.ro/239-clape-midi-claviaturi-midi',
                   'out/fly/midi.csv'))
    crawler = FlyMusicCrawler(driver)
    for i in items:
        crawler.crawl_and_save(i.url, i.out_file)
    driver.quit()


if __name__ == '__main__':
    # crawl_mcmusic()
    # crawl_flymusic()
    # pack('out/mc')
    # pack('out/fly')
    process_data('out/merged/', 'mc.csv', 'fly.csv')
def setup_class(cls):
    """This method is run once for each class before any tests are run."""
    cls.X_train, cls.X_test, cls.y_train, cls.y_test, cls.X_submission_df, cls.X_submission = process_data(
        "/home/spolezhaev/train", "/home/spolezhaev/test")
train_df = train_df.sample(
    frac=FRAC_DATA, random_state=RANDOM_STATE).reset_index(drop=True)
val_df = val_df.sample(
    frac=FRAC_DATA, random_state=RANDOM_STATE).reset_index(drop=True)
test_df = test_df.sample(
    frac=FRAC_DATA, random_state=RANDOM_STATE).reset_index(drop=True)
print(f"Using {int(FRAC_DATA*100)}% of the dataset.")

NB_SPECIES = len(set(train_df['label']))  # Number of classes
print("NB_SPECIES: ", NB_SPECIES)

print("Processing Training Data...")
trainloader = process_data(df=train_df,
                           batch_size=BATCH_SIZE,
                           sample_rate=SR,
                           audio_duration=AUDIO_DURATION,
                           random_state=RANDOM_STATE,
                           do_plot=False)
print("Processing Validation Data...")
validationloader = process_data(df=val_df,
                                batch_size=BATCH_SIZE,
                                sample_rate=SR,
                                audio_duration=AUDIO_DURATION,
                                random_state=RANDOM_STATE,
                                do_plot=False)
print("Processing Test Data...")
testloader = process_data(df=test_df,
                          batch_size=1,
                          sample_rate=SR,
if (len(sys.argv) < 2) or (len(sys.argv) > 3):
    print("Usage:")
    print("\tmain.py data_path [pretrained_model_path]")
    sys.exit()

DATA_PATH = sys.argv[1]
MODEL_PATH = None
if len(sys.argv) == 3:
    MODEL_PATH = sys.argv[2]

data = pd.read_csv(DATA_PATH, sep='\t', header=None, names=['en', 'ru'])

# Data preprocessing
# Choose the sentences of word-length less than 14, eliminating only 1% of the initial data
max_sentence_length = 14
data = process_data(data, max_sentence_length, SOS, EOS)
tokenizer_en = tokenize_data(data.en, vocab_size=2**15)
tokenizer_ru = tokenize_data(data.ru, vocab_size=2**15)
encoder_max_length = max(
    [len(tokenizer_en.encode(sentence)) for sentence in data.en]) + 5
decoder_max_length = max(
    [len(tokenizer_ru.encode(sentence)) for sentence in data.ru]) + 5

X_train, X_test, y_train, y_test = train_test_split(np.array(data.en),
                                                    np.array(data.ru),
                                                    test_size=0.15,
                                                    random_state=15)

# Model definition
    update = [int(x) for x in update]
    freq = dict(Counter(update))
    update = sorted(list(set(update)))
    index = range(int(update[0]), int(update[-1]) + 1)
    freq_list = []
    for i in index:
        if i in freq:
            freq_list.append(freq[i])
        else:
            freq_list.append(0)
    print index, freq_list
    return index, freq_list


if __name__ == '__main__':
    X, Y = process_data('./hw1_15_train.dat')
    # Q15
    _, index_record, _ = naive_cyclic_PLA(X, Y)
    print 'question 15: updates: %d, index that results in max updates: %d' % (
        sum(index_record), index_record.argsort()[::-1][0])
    # Q16
    print 'question 16'
    update = []
    for i in range(2000):
        _, index_record, _ = naive_cyclic_PLA(X, Y, random_ord=True)
        total_update = sum(index_record)
        update.append(int(total_update))
    # update, freq = data_reorder(update)
    # histogram(update, freq, 'question16.png')
    c_lst = np.logspace(-2, 2, 10)
    kernel_lst = ['linear', 'poly', 'sigmoid', 'rbf']
    gamma_lst = np.logspace(-3, 1, 10)
    for i in range(10):
        for j in range(10):
            print(c_lst[i])
            model = SVC(kernel='sigmoid', gamma=gamma_lst[j], C=c_lst[i], probability=True)
            evaluate_model(model, x_train_down, x_test, y_train_down, y_test,
                           'baseline_CM_SVM', 'Baseline confusion matrix: SVM ',
                           'svm_rp_name', 'svm_cm_name')


if __name__ == "__main__":
    # getting training and testing data
    df = process_data(print_results=False, plot=True)
    x_train_up = df.x_train_over        # Oversampled training data
    y_train_up = df.y_train_over        # Oversampled training target
    x_train_down = df.x_train_shrink    # Shrunk and oversampled training data
    y_train_down = df.y_train_shrink    # Shrunk and oversampled training target
    x_train = df.x_train                # Training data
    y_train = df.y_train                # Training targets
    x_test = df.x_test                  # Test data
    y_test = df.y_test                  # Test target

    # tune_SVM()
import pandas as pd
from data_processing import process_data, get_truth_cat
from plot import plot_fig
import math
import numpy as np
from baseline import FixedBaseline, ClinicalBaseline
from lin_ucb import Env, LinUCB, ThompsonSampler, LinOracle, SupervisedBandit

features, feature_df, dosage = process_data('./data/warfarin.csv')
true_cat = get_truth_cat(dosage)
env = Env(features, true_cat, dosage)

clinical_baseline = ClinicalBaseline()
fixed_baseline = FixedBaseline()
lin_ucb = LinUCB(3, len(features[0]), 0.1)
lin_thompson = ThompsonSampler(3, len(features[0]), 0.01)
lin_oracle = LinOracle(3, features, true_cat)
supervised_bandit = SupervisedBandit(len(features[0]))

algo = {}
algo["clinical_baseline"] = clinical_baseline
algo["fixed_baseline"] = fixed_baseline
algo["lin_ucb"] = lin_ucb
algo["lin_thompson"] = lin_thompson
algo["lin_oracle"] = lin_oracle
algo["supervised_bandit"] = supervised_bandit

reward_list = {}
regret_list = {}
for i in algo:
    reward_list[i] = []
    if i != "lin_oracle":
        regret_list[i] = []
import tensorflow as tf
import numpy as np
import data_processing
import config
import data_utils
import seq2seq_wrapper
from os import path

# load data and split into train and test sets
idx_headings, idx_descriptions = data_processing.process_data()
article_metadata = data_processing.unpickle_articles()
(x_train, x_test), (y_train, y_test), (x_valid, y_valid) = data_utils.split_data(
    idx_descriptions, idx_headings)

# define parameters
xseq_length = x_train.shape[-1]
yseq_length = y_train.shape[-1]
batch_size = config.batch_size
xvocab_size = len(article_metadata['idx2word'])
yvocab_size = xvocab_size
checkpoint_path = path.join(config.path_outputs, 'checkpoint')
print(checkpoint_path)

# define model
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_length,
                                yseq_len=yseq_length,
                                xvocab_size=xvocab_size,
                                yvocab_size=yvocab_size,
                                emb_dim=config.embedding_dim,
                                num_layers=3,
                                ckpt_path=checkpoint_path)
import numpy as np
from data_processing import process_data, generate_data, histogram
from Decision_stump import one_dimension_decision_stump, multi_dimension_decision_stump, check_accuracy, Out_of_sample_error

if __name__ == '__main__':
    # Q17, Q18
    E_in_list = []
    E_out_list = []
    for i in range(5000):
        X, Y = generate_data(10, 5)
        score, s, theta = one_dimension_decision_stump(X, Y)
        E_in_list.append((10 - float(score)) / 10)
        E_out_list.append(Out_of_sample_error(s, theta))
    histogram(E_in_list, 'question 17', 'in sample error', 'frequency')
    print "Question 17: average in sample error: %f" % (sum(E_in_list) / 5000)
    histogram(E_out_list, 'question 18', 'out of sample error', 'frequency')
    print "Question 18: average out of sample error: %f" % (sum(E_out_list) / 5000)

    # Q19
    X_train, Y_train = process_data('./hw2_train.dat')
    X_test, Y_test = process_data('./hw2_test.dat')
    best_record, s, theta, index = multi_dimension_decision_stump(X_train, Y_train)
    print "Question 19: index: %d, h = %d * sign(x - %f), in sample error: %f" % (
        index, s, theta, (len(Y_train) - float(best_record)) / len(Y_train))
    X_test_trans = np.transpose(X_test)
    accuracy = check_accuracy(s, theta, X_test_trans[index], Y_test)
    print "Question 20: out of sample error: %f" % (1 - accuracy)