def test_load_data(self):
    train_df, valid_df = load_train_data(self.audio_path, self.validation_list_path)
    self.assertEqual(train_df.shape[0], 57929)
    self.assertEqual(valid_df.shape[0], 6798)
    df = train_df.groupby('label').apply(sampling(2000))
    print(df.shape)
def _make_submission(params):
    test_paths = glob(params['test_path'])
    if params['sample']:
        print("Get small sample")
        test_paths = test_paths[:params['sample_size']]
    model = load_model(params['model_path'])
    train_data, validate_data = load_train_data(params['audio_path'], params['validation_list_path'])
    assert len(train_data) != 0
    assert len(validate_data) != 0
    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_data, wav_reader)
    sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )
    tests = test_generator(test_paths, params['batch_size_pred'], sound_chain)

    print("PREDICTING")
    predictions = model.predict_generator(tests, int(np.ceil(len(test_paths) / params['batch_size_pred'])))
    classes = np.argmax(predictions, axis=1)

    submission = {}
    print("SAVING")
    for i in range(len(test_paths)):
        fname, label = os.path.basename(test_paths[i]), id2name[classes[i]]
        submission[fname] = label
    return submission
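# Hypothetical follow-up sketch, not part of the original code: assuming
# _make_submission returns a {filename: label} dict, it could be written out in a
# simple "fname,label" CSV format like this (function and file names are assumptions).
def write_submission(submission, out_path='submission.csv'):
    import csv
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['fname', 'label'])  # header row
        for fname, label in sorted(submission.items()):
            writer.writerow([fname, label])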
def main_confusion_matrix(params):
    model = load_model(params['model_path'])
    train_df, valid_df = load_train_data(params['audio_path'], params['validation_list_path'])
    assert len(train_df) != 0
    assert len(valid_df) != 0
    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_df, wav_reader)
    sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )
    validate_gen = valid_generator(valid_df, params['batch_size'], sound_chain, with_y=False)
    # prediction steps must match the batch size used by the generator above
    predictions = model.predict_generator(validate_gen, int(np.ceil(valid_df.shape[0] / params['batch_size'])))
    classes = [id2name[i] for i in np.argmax(predictions, axis=1)]

    y_true = valid_df['label'].values
    labels = np.unique(valid_df['label'].values)
    cm = confusion_matrix(y_true, classes, labels=labels)
    df = pd.DataFrame(cm, columns=labels, index=labels)
    df.to_csv(os.path.join(params['output_path'], 'confusion.csv'), index_label='index')
    print(df)
    return df
def train_model(training_iterations, batch_size, train_data_file):
    accuracy_list, entropy_list = [], []
    x_data, y_data = load_train_data(train_data_file, train_with_only_known_age_data)
    # print("Length of input data is: ", len(x_data))
    start_time = time.time()
    saver = tf.train.Saver()
    for i in range(training_iterations):
        x_batch, y_batch = get_batch(x_data, y_data, batch_size)
        training_data = {x: x_batch, y_: y_batch}
        accrcy, entropy = sess.run([accuracy, cross_entropy], feed_dict=training_data)
        # Backpropagation
        sess.run(train_step, feed_dict=training_data)
        accuracy_list.append(accrcy)
        entropy_list.append(entropy)
        # Saving checkpoints to load the trained model later
        directory = "checkpoints/trained_model"
        if not os.path.exists(directory):
            os.makedirs(directory)
        saver.save(sess, directory, global_step=checkpoint_every)
        # Printing the training performance
        if i % 100 == 0:
            print("Accuracy after %s training steps is: %s" % (i, accrcy))
            print("")
    print("Training process is done in time: ", time.time() - start_time, "seconds.")
    return accuracy_list, entropy_list
def main():
    FNAME = "model_train_lgbm"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Load raw data
    train_raw = dl.load_train_data()
    # Load generated features
    train_features = load_combined_features(logger)
    train_column_names = list(train_features.columns.values)
    logger.info("Training set column names: " + str(train_column_names))
    # train_features = pd.concat([train_features, train_raw[config.NUMBER_FEATURES]], axis=1)
    logger.info('Final training data shape: %s' % str(train_features.shape))

    x_train, x_valid, y_train, y_valid = train_test_split(
        train_features, train_raw[config.TARGET_FEATURE],
        test_size=0.20, random_state=42)
    del train_raw
    del train_features
    gc.collect()

    lgtrain = lgb.Dataset(x_train, label=y_train,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)
    lgvalid = lgb.Dataset(x_valid, label=y_valid,
                          feature_name=train_column_names,
                          categorical_feature=config.ENCODED_CATEGORY_FEATURES)

    t0 = time()
    lightgbm_model = lgb.train(
        config.LGBM_PARAMS,
        lgtrain,
        config.LGBM_NUM_ROUNDS,
        valid_sets=lgvalid,
        verbose_eval=50,
        early_stopping_rounds=config.LGBM_EARLY_STOPPING_ROUNDS)
    logger.info('Training LightGBM model took: %s minutes' % round((time() - t0) / 60, 1))

    # Save model
    t0 = time()
    MODEL_FILE_NAME = "lightgbm_model"
    model_file = os.path.join(config.DATA_MODELS_DIR, MODEL_FILE_NAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % model_file)
    lightgbm_model.save_model(model_file, num_iteration=lightgbm_model.best_iteration)
    logger.info('Saving %s lightgbm model took: %s minutes' % (MODEL_FILE_NAME, round((time() - t0) / 60, 1)))

    generate_figure_importance(lightgbm_model, logger)
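# Hypothetical follow-up sketch, not part of the original script: the model file saved
# above could later be reloaded for inference with LightGBM's Booster API (variable
# names reuse those from main(); x_valid stands in for any feature frame).
reloaded_model = lgb.Booster(model_file=model_file)  # load the saved LightGBM model
valid_predictions = reloaded_model.predict(x_valid)  # predictions for the validation features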
def main_train(params, clf: Type[Classifier]):
    model = clf(L, LABELS)
    name = "{}--{}".format(model.name, int(datetime.now().timestamp()))
    print(params)
    chekpoints_path = os.path.join(params['output_path'], name + '_weights')
    os.makedirs(chekpoints_path, exist_ok=True)
    batch_size = params['batch_size']
    n = params['sample_size']

    train_data, validate_data = load_train_data(params['audio_path'], params['validation_list_path'])
    assert len(train_data) != 0
    assert len(validate_data) != 0
    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_data, wav_reader)

    train_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.AddNoiseWavProcessor(silence_data, L, L, 20),
        sp.ShiftWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )
    valid_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )

    if params['sample']:
        print("Get small sample")
        train_data, validate_data = get_sample_data(train_data, validate_data, n)

    train_gen = train_generator(train_data, batch_size, train_sound_chain, n=n)
    validate_gen = valid_generator(validate_data, batch_size, valid_sound_chain, True)

    model.train(
        train_gen,
        validate_gen,
        dict(epochs=params['epochs'],
             batch_size=batch_size,
             tensorboard_dir=os.path.join(params['tensorboard_root'], name),
             chekpoints_path=chekpoints_path,
             steps_per_epoch=n * len(LABELS) / batch_size,
             validation_steps=int(np.ceil(validate_data.shape[0] / batch_size))))
def load_data(train_data_path='./aclImdb/train/', test_data_path='./aclImdb/test/'):
    # Load data
    print("Load Data...")
    Xtr_text, Ytr, Xva_text, Yva = load_train_data(train_data_path, 0.1)
    Xte_text, Yte = load_test_data(test_data_path)
    # Combine training and validation data:
    Xtr_text = np.append(Xtr_text, Xva_text)
    Ytr = np.append(Ytr, Yva)
    print("Done loading data!\n")
    return Xtr_text, Ytr, Xte_text, Yte
def train_model(training_iterations, batch_size, train_data_file):
    accuracy_list, cross_entropy_list = [], []
    xs_data, ys_data = load_train_data(train_data_file, True)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(training_iterations):
            x_batch, y_batch = get_batch(xs_data, ys_data, batch_size)
            training_data = {x: x_batch, y_: y_batch}
            accrcy, s_cross = sess.run([accuracy, cross_entropy], feed_dict=training_data)
            # Backpropagation
            sess.run(train_step, feed_dict=training_data)
            accuracy_list.append(accrcy)
            cross_entropy_list.append(s_cross)
    return accuracy_list, cross_entropy_list
def test_train_generator(self):
    train_df, valid_df = load_train_data(self.audio_path, self.validation_list_path)
    wav_reader = SimpleWavFileReader(L)
    silence_data = get_silence(train_df, wav_reader)
    train_sound_chain = SoundChain(
        SimpleWavFileReader(L),
        sp.AdjustLenWavProcessor(silence_data, L, L),
        sp.EmphasisWavProcessor(silence_data, L, L, 0.97),
        sp.NormalizeWavProcessor(silence_data, L, L),
        sp.ReshapeWavProcessor(silence_data, L, L),
        sp.MinMaxWavProcessor(silence_data, L, L, (0, 1)),
    )
    n = 2
    gen = train_generator(train_df, 64, train_sound_chain, n)
    batch = next(gen)
    self.assertEqual(batch[0].shape, (len(LABELS) * n, L, 1))
    self.assertEqual(batch[1].shape, (len(LABELS) * n, len(LABELS)))
def load_train_data_with_dictionary(file_path, freq_threshold=0):
    dictionary = {"PADDING_TOKEN": 0, "UNKNOWN_TOKEN": 1}
    word_freq = collections.defaultdict(int)
    train_df, valid_df = data_loader.load_train_data(file_path)
    train_doc, valid_doc = [], []
    for idx, row in train_df.iterrows():
        doc = process_row(row)
        train_doc.append(doc)
        for token in doc.words:
            word_freq[token] += 1
    for word, freq in word_freq.items():
        if freq >= freq_threshold:
            dictionary[word] = len(dictionary)
    for idx, row in valid_df.iterrows():
        doc = process_row(row)
        valid_doc.append(doc)
    return train_doc, valid_doc, dictionary
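# Hypothetical usage sketch, not part of the original module: mapping one processed
# document's tokens through the returned dictionary, falling back to UNKNOWN_TOKEN for
# out-of-vocabulary words and right-padding with PADDING_TOKEN (max_len is an assumption).
def doc_to_ids(doc, dictionary, max_len=100):
    ids = [dictionary.get(token, dictionary["UNKNOWN_TOKEN"]) for token in doc.words]
    ids = ids[:max_len]
    ids += [dictionary["PADDING_TOKEN"]] * (max_len - len(ids))
    return ids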
def train():
    init_seed(111)
    split_num = 720
    train_num = 1600
    val_num = 100
    train_indices = torch.randperm(1700)[:train_num]
    val_indices = torch.randperm(1700)[:val_num]
    train_numerical = np.array([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14])
    val_numerical = np.array([6])
    # train_numerical = np.array([])
    # val_numerical = np.array([])
    train_datasets = [
        Subset(load_train_data('cmip', which_num=num), train_indices)
        for num in train_numerical
    ]
    train_datasets.append(load_train_data('soda', split_num=split_num))
    print('Training Samples: {}'.format(len(train_numerical) * train_num + split_num))
    valid_datasets = [
        Subset(load_val_data('cmip', which_num=num), val_indices)
        for num in val_numerical
    ]
    valid_datasets.append(load_val_data('soda', split_num=split_num + 60))
    print('Validation Samples: {}'.format(len(val_numerical) * val_num + 1200 - split_num))
    train_loaders = [
        DataLoader(train_dataset, batch_size=args['batch_size'])
        for train_dataset in train_datasets
    ]
    valid_loaders = [
        DataLoader(valid_dataset, batch_size=args['batch_size'])
        for valid_dataset in valid_datasets
    ]

    device = args['device']
    model = args['model_list'][args['model_name']]()
    print_model_parameters(model)
    if args['pretrain'] and os.path.exists(save_dir):
        model.load_state_dict(torch.load(save_dir, map_location=device))
        print('load model from:', save_dir)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args['learning_rate'])
    if args['lr_decay']:
        print('Applying learning rate decay.')
        lr_decay_steps = [int(i) for i in args['lr_decay_step']]
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer=optimizer,
            milestones=lr_decay_steps,
            gamma=args['lr_decay_rate'])
    else:
        lr_scheduler = None
    loss_fn = nn.MSELoss().to(device)
    # loss_fn = score_loss
    model.to(device)

    best_score = float('-inf')
    not_improved_count = 0
    for i in range(args['n_epochs']):
        model.train()
        loss_epoch = 0
        for train_loader in train_loaders:
            for step, (sst, t300, ua, va, label, month, sst_label) in enumerate(train_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                # month = month[:, :1].to(device).long()
                month = month.to(device).long()
                label = label.to(device).float()
                optimizer.zero_grad()
                preds = model(sst, t300, ua, va, month)
                loss = loss_fn(preds, label)
                loss.backward()
                loss_epoch += loss.item()
                if args['grad_norm']:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
                optimizer.step()
                del preds, loss
        if args['lr_decay']:
            lr_scheduler.step()

        model.eval()
        y_true, y_pred = [], []
        for valid_loader in valid_loaders:
            for step, (sst, t300, ua, va, label, month, sst_label) in enumerate(valid_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                month = month.to(device).long()
                label = label.to(device).float()
                preds = model(sst, t300, ua, va, month)
                y_pred.append(preds.detach())
                y_true.append(label.detach())
                del preds
        y_true = torch.cat(y_true, axis=0)
        y_pred = torch.cat(y_pred, axis=0)
        sco = eval_score(y_true.cpu().detach().numpy(), y_pred.cpu().detach().numpy())
        print('Epoch: {}, Train Loss: {}, Valid Score: {}'.format(i + 1, loss_epoch, sco))

        if sco > best_score:
            best_score = sco
            not_improved_count = 0
            best_state = True
        else:
            not_improved_count += 1
            best_state = False
        if not_improved_count == args['early_stop_patience']:
            print("Validation performance didn't improve for {} epochs. "
                  "Training stops.".format(args['early_stop_patience']))
            break
        if best_state:
            best_model = deepcopy(model.state_dict())
            torch.save(best_model, save_dir)
            # torch.save(model, '../user_data/ref.pkl')
            print('Model saved successfully:', save_dir)
import augmentation_methods as am
import data_loader as dl
import word_vectors as wv
import data_preprocessing as dp
import classifier as cl
import testing as t
import visualization as vis

if __name__ == "__main__":
    # get original data in tokenized form
    orig_corpus, y_train_orig = dl.load_train_data()
    test_corpus, y_test_orig = dl.load_test_data()

    # develop word vectors
    word_vectors = wv.get_word_vectors(orig_corpus)

    # augment corpora
    corpus_method_1, y_train_method_1 = am.method_1(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_2, y_train_method_2 = am.method_2(orig_corpus.copy(), y_train_orig.copy(), word_vectors)
    corpus_method_3, y_train_method_3 = am.method_3(orig_corpus.copy(), y_train_orig.copy(), word_vectors)

    # process data so it is in a form (tf-idf) that can be fed to classifiers
    X_orig, vectorizer = dp.process_corpus_orig(orig_corpus)
    X_method_1 = dp.process_corpus(corpus_method_1, vectorizer)
    X_method_2 = dp.process_corpus(corpus_method_2, vectorizer)
    X_method_3 = dp.process_corpus(corpus_method_3, vectorizer)
    X_test = dp.process_corpus(test_corpus, vectorizer)

    # train classifiers on the original corpus and all augmented corpora
    classifier_orig = cl.train_classifier_bayes(X_orig, y_train_orig)
    classifier_method_1 = cl.train_classifier_bayes(X_method_1, y_train_method_1)
import h5py
import numpy as np
from nltk.corpus import stopwords
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem.wordnet import WordNetLemmatizer
import data_loader
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
# %matplotlib inline
plt.set_cmap('RdYlBu')

import pre_processing

train, valid = data_loader.load_train_data('data/train.csv')
test = data_loader.load_test_data('data/test.csv', 'data/test_labels.csv')

list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
train_y = train[list_classes].values
valid_y = valid[list_classes].values
test_y = test[list_classes].values

train = train.fillna('')
valid = valid.fillna('')
test = test.fillna('')

"""## Data Exploration"""
def main():
    image_size = 50
    number_of_classes = 12
    # cached_files = os.listdir('cache/')
    # if no cached features and labels exist locally, create them and then cache them
    # if 'train_features.csv' not in cached_files or 'train_labels.csv' not in cached_files:
    features, labels, categories = load_train_data(train_data_path='./data/train/', image_size=image_size)
    # TODO create a fast caching system
    # np.savetxt('cache/train_features.csv', train_features, delimiter=',', fmt='%.4f')
    # np.savetxt('cache/train_labels.csv', train_labels, delimiter=',', fmt='%i')
    # # if cached features and labels are detected, load them into variables
    # else:
    #     train_features = np.genfromtxt('cache/train_features.csv', delimiter=',')
    #     print('training features loaded from cache')
    #     train_labels = np.genfromtxt('cache/train_labels.csv', delimiter=',')
    #     print('training labels loaded from cache')

    binary_training_labels = keras.utils.to_categorical(labels, num_classes=number_of_classes)
    train_features, train_labels, crosval_features, crosval_labels, test_features, test_labels = \
        split_data(features, binary_training_labels, train_fraction=0.9, crosval_fraction=0.0, test_fraction=0.1)

    reg_value = 0.02
    # building nn topology
    model = Sequential()
    model.add(Dense(units=2500, activation='relu', input_dim=image_size ** 2,
                    kernel_regularizer=regularizers.l2(reg_value)))
    model.add(Dense(units=300, activation='relu', kernel_regularizer=regularizers.l2(reg_value)))
    model.add(Dense(units=300, activation='relu', kernel_regularizer=regularizers.l2(reg_value)))
    model.add(Dense(units=number_of_classes, activation='sigmoid', kernel_regularizer=regularizers.l2(reg_value)))
    model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

    # training_epochs = 200
    # model.fit(train_features, train_labels, epochs=training_epochs, batch_size=100)
    epoch = 0
    # hold historical training and test accuracy
    train_accuracy = {}
    test_accuracy = {}
    try:
        while epoch < 2000:
            model.fit(train_features, train_labels, epochs=1, batch_size=128)
            test_accuracy[epoch] = model.evaluate(test_features, test_labels, batch_size=128)[1]
            train_accuracy[epoch] = model.evaluate(train_features, train_labels, batch_size=128)[1]
            # TODO add sequential model saving
            print('\nepoch = %i\n' % epoch)
            epoch += 1
    except KeyboardInterrupt:
        pass

    # plotting training and test accuracy histories
    plt.plot(train_accuracy.keys(), train_accuracy.values(), label='train')
    plt.plot(test_accuracy.keys(), test_accuracy.values(), label='test')
    axes = plt.gca()
    # axes.set_ylim([0.8, 0.90])
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()

    test_accuracy = model.evaluate(test_features, test_labels, batch_size=1000)[1]
    print('trained model accuracy on test set = %f' % test_accuracy)
    acc = 0
    acc_freq = []
    freq_list = []
    for freq, c in sorted(freq_counter.items()):
        if freq > 200:
            break
        freq_list.append(freq)
        acc += c
        acc_freq.append(acc / float(total_count))
    plt.plot(freq_list, acc_freq, label=name)
    plt.title("Token frequency distribution of train data")
    plt.ylabel("cutoff proportion")
    plt.xlabel("cutoff token frequency")
    plt.legend()
    plt.savefig("token_frequency_cutoff_f200_{}.png".format(name))
    plt.close()


if __name__ == '__main__':
    train_df, valid_df = data_loader.load_train_data("./resources/train.csv")
    label_distribution(train_df, name="train")
    label_distribution(valid_df, name="valid")
    sentence_length_distribution()
    token_frequency_distribution()
    label_correlation()
# In[15]:
import data_loader
import numpy as np
import pandas

# In[16]:
import warnings
warnings.filterwarnings('ignore')

# In[17]:
# load data and assign names
trdf, valdf = data_loader.load_train_data("data/adult.data", is_df=True)

## adding column labels https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/
trdf.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target"
]
valdf.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target"
]

# # Pipelines
else:
    print("ERROR: Invalid algorithm name", args.algo)

print("Algorithm:", args.algo, "| Scenario:", args.scenario)

param_object = ParameterGrid(tuned_parameters)

NUMBER_OF_FOLDS = 5
skf = StratifiedKFold(n_splits=NUMBER_OF_FOLDS)  # Splits the data into stratified folds

NGRAMS = [(1, 1), (2, 2), (3, 3), (1, 3)]  # Unigrams, Bigrams, Trigrams, UniBiTri_combined
MAX_FEATURES = [None, 1000, 100000]

X, Y = data_loader.load_train_data(scenario=args.scenario, N_WORDS=PASSAGE_LENGTH, exp=experiment)
print("X: {} | Y: {} | Distribution: {}".format(len(X), len(Y), Counter(Y)))
print("Y preview:", Y[:3])

results_path = ('/path/Augmentation-for-Literary-Data/results/' + args.algo + '-params-' +
                str(PASSAGE_LENGTH) + '/' + args.algo + '_' + str(NUMBER_OF_FOLDS) +
                'foldCV_Case_' + str(args.scenario) + '_exp' + experiment + '.tsv')  # name of output file
print("\n-------\nResults path:", results_path, "\n\n")

results_file = open(results_path, "w")
results_file.write(
    "Model\tF1-score\tAUROC\tWeighted F1\tPrecision\tRecall\tAccuracy\tAUPRC\tParameters\n")

run_experiments(X, Y)
def main():
    if len(sys.argv) != 2 or sys.argv[1] not in ['svm', 'nn']:
        print("Invalid command. Expected 'svm' or 'nn'.")
        return
    c_name = sys.argv[1]
    print('Running job: TF-IDF vectorization and ' + c_name.upper() + ' classifier.')

    train_data = data_loader.load_train_data().sample(frac=1, random_state=42).reset_index(drop=True)
    test_data = data_loader.load_test_data()

    if c_name == 'svm':
        classifier = LinearSVC(random_state=42)
        param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
            'classifier__C': [0.1, 1]
        }
    else:
        classifier = MLPClassifier((50, ),
                                   solver='lbfgs',
                                   learning_rate_init=1e-4,
                                   tol=1e-6,
                                   max_iter=200,
                                   random_state=42)
        param_grid = {'vectorizer__ngram_range': [(1, 1)]}

    pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', classifier)])
    cv_grid = GridSearchCV(pipe, n_jobs=2, cv=5, verbose=3, param_grid=param_grid)

    start_time = time.time()
    cv_grid.fit(train_data.text, train_data.sentiment)
    end_time = time.time()
    print('Total fit time: {}'.format(end_time - start_time))

    # Classification report
    pred = cv_grid.predict(train_data.text)
    cr = classification_report(train_data.sentiment, pred)
    print(cr)

    # Test predictions
    pred = cv_grid.predict(test_data.text)
    print('Predictions finished.')

    # Save predictions
    results = pd.DataFrame({'Id': test_data.index, 'Prediction': pred})
    results = results.set_index('Id')
    data_loader.save_submission(results, 'tfidf_' + c_name.upper() + '_submission.csv')
    print('Predictions saved.')

    # Save classification results
    cvr_path = path.join('pickles', 'tfidf_' + c_name.upper() + '_cross_validation_results')  # Cross validation results
    be_path = path.join('pickles', 'tfidf_' + c_name.upper() + '_best_estimator')  # Best estimator
    dump(cv_grid.cv_results_, open(cvr_path, 'wb'))
    dump(cv_grid.best_estimator_, open(be_path, 'wb'))
    print('Classification results saved.')
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier

# load data (provided method)
train_data, valid_data = data_loader.load_train_data('Data/adult.data', valid_rate=0.1, is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

# update fields
native_country_dict = {
    ' ?': '?',
    ' Cambodia': 'Africa',
    ' Canada': 'North America',
    ' China': 'Asia',
    ' Columbia': 'Latin America',
    ' Cuba': 'Latin America',
    ' Dominican-Republic': 'Latin America',
    ' Ecuador': 'Latin America',
    ' El-Salvador': 'Latin America',
    ' England': 'Europe',
parser.add_argument(
    '--cda',
    help='Use CDA for Data Augmentation',
    action="store_true")  # Use CDA; default: no Data Augmentation
args = parser.parse_args()

# Load training data:
if args.eda:  # with EDA
    X_train, Y_train = data_loader.load_train_data_with_EDA(scenario=args.scenario)
elif args.cda:  # with CDA
    pass
else:  # without any Data Augmentation
    X_train, Y_train = data_loader.load_train_data(args.scenario)
X_train = X_train.tolist()
Y_train = Y_train.tolist()  # convert to list
labels_train = labels_str_to_int(Y_train)  # convert labels to integers

# Test data:
X_test, Y_test, test_IDs = data_loader.load_test_data()
X_test = X_test.tolist()
Y_test = Y_test.tolist()
test_IDs = test_IDs.tolist()  # convert to list
labels_test = labels_str_to_int(Y_test)  # convert labels to integers
testIDs_idx = np.linspace(
    0, len(test_IDs), len(test_IDs), False
)  # can't create a tensor of strings, so create a corresponding list of indexes; we use that to index into test_IDs
print("testIDs indexes:", len(testIDs_idx))
# Load training data:
if args.eda and not args.cda:  # only EDA
    train_passages, Y_train = data_loader.load_train_data_with_EDA(scenario=args.scenario)
elif args.cda and not args.eda:  # only CDA
    train_passages, Y_train = data_loader.load_train_data_with_CDA(scenario=args.scenario)
elif args.eda and args.cda:  # both EDA and CDA
    train_passages, Y_train = data_loader.load_train_data_with_EDA_and_CDA(scenario=args.scenario)
else:  # without any Data Augmentation
    train_passages, Y_train = data_loader.load_train_data(
        scenario=args.scenario, N_WORDS=DOCUMENT_LENGTH, exp=experiment)
print("\nTrain Set ---- X: {} | Y: {} | Distribution: {}".format(
    len(train_passages), len(Y_train), Counter(Y_train)))
print("Y train preview:", Y_train[:3])

# Load test data (same for each scenario, with or without augmentation):
test_passages, Y_test, test_IDs = data_loader.load_test_data(N_WORDS=DOCUMENT_LENGTH)
print("Test Set ---- X: {} | Y: {} | Distribution: {} | Test IDs: {}, preview: {}".format(
    len(test_passages), len(Y_test), Counter(Y_test), len(test_IDs), test_IDs[:3]))
print("Y test preview:", Y_test[:3])
import model
import data_loader
import torch

train_loader, neg_loader, neg_len = data_loader.load_train_data()

g = model.Generator()
d = model.Discriminator()
g_optim = torch.optim.Adam(g.parameters(), lr=0.001, weight_decay=0)
d_optim = torch.optim.Adam(d.parameters(), lr=0.001, weight_decay=0)

model.train(g, d, train_loader, neg_loader, 100, g_optim, d_optim, neg_len)
def load_train_data_from_file(train_file_path):
    df, _ = load_train_data(train_file_path, 0)
    df.replace(' ?', np.nan, inplace=True)
    df = df.dropna()
    return df
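# Hypothetical usage sketch, not part of the original code (the path and the meaning of
# the last column are assumptions): splitting the cleaned frame into features and label.
adult_train = load_train_data_from_file('data/adult.data')
X = adult_train.iloc[:, :-1]  # all columns except the last
y = adult_train.iloc[:, -1]   # last column assumed to hold the target label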
def train():
    init_seed(1995)
    sample_num = 150
    indices = torch.randperm(1700)[:sample_num]
    train_numerical = np.array([1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 15]) - 1
    val_numerical = np.array([3, 6, 9, 12]) - 1
    train_datasets = [
        Subset(load_train_data('cmip', which_num=num), indices)
        for num in train_numerical
    ]
    print('Training Samples: {}'.format(len(train_numerical) * sample_num))
    valid_datasets = [
        Subset(load_val_data('cmip', which_num=num), indices)
        for num in val_numerical
    ]
    print('Validation Samples: {}'.format(len(val_numerical) * sample_num))
    train_loaders = [
        DataLoader(train_dataset, batch_size=args['batch_size'])
        for train_dataset in train_datasets
    ]
    valid_loaders = [
        DataLoader(valid_dataset, batch_size=args['batch_size'])
        for valid_dataset in valid_datasets
    ]

    device = args['device']
    model = args['model_list'][args['model_name']]()
    if args['pretrain'] and os.path.exists(save_dir):
        model.load_state_dict(torch.load(save_dir, map_location=device))
        print('load model from:', save_dir)
    optimizer = torch.optim.Adam(model.parameters(), lr=args['learning_rate'])
    loss_fn = nn.MSELoss()
    model.to(device)
    loss_fn.to(device)
    print_model_parameters(model)

    best_score = float('-inf')
    not_improved_count = 0
    for i in range(args['n_epochs']):
        model.train()
        loss_epoch = 0
        for cmip_num, train_loader in enumerate(train_loaders):
            loss_numodel = 0
            for step, (sst, t300, ua, va, label, sst_label) in enumerate(train_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                # sst_label = sst_label.to(device).float()
                optimizer.zero_grad()
                label = label.to(device).float()
                # output, preds = model(sst, t300, ua, va)
                preds = model(sst, t300, ua, va)
                loss1 = loss_fn(preds, label)
                # loss2 = loss_fn(output, sst_label)
                # loss = loss1 + loss2
                loss1.backward()
                loss_numodel += loss1.item()
                if args['grad_norm']:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
                optimizer.step()
                del preds, loss1
            loss_epoch += loss_numodel
            print('numerical model {} loss: {}'.format(cmip_num, loss_numodel))

        model.eval()
        y_true, y_pred = [], []
        for valid_loader in valid_loaders:
            for step, (sst, t300, ua, va, label, sst_label) in enumerate(valid_loader):
                sst = sst.to(device).float()
                t300 = t300.to(device).float()
                ua = ua.to(device).float()
                va = va.to(device).float()
                label = label.to(device).float()
                preds = model(sst, t300, ua, va)
                y_pred.append(preds.detach())
                y_true.append(label.detach())
                del preds
        y_true = torch.cat(y_true, axis=0)
        y_pred = torch.cat(y_pred, axis=0)
        sco = eval_score(y_true.cpu().numpy(), y_pred.cpu().numpy())
        print('Epoch: {}, Train Loss: {}, Valid Score: {}'.format(i + 1, loss_epoch, sco))

        if sco > best_score:
            best_score = sco
            not_improved_count = 0
            best_state = True
        else:
            not_improved_count += 1
            best_state = False
        if not_improved_count == args['early_stop_patience']:
            print("Validation performance didn't improve for {} epochs. "
                  "Training stops.".format(args['early_stop_patience']))
            break
        if best_state:
            best_model = deepcopy(model.state_dict())
            torch.save(best_model, save_dir)
            # torch.save(model, '../user_data/ref.pkl')
            print('Model saved successfully:', save_dir)
"""Trains the model.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf import numpy as np from data_loader import load_train_data, univariate_data from rnn_predictor import RNNPredictor raw_data = load_train_data() data, labels = univariate_data(raw_data["ConfirmedCases"], 0, None, 5, 0) val_data, val_labels = [list(data[20]), list(data[40]), list(data[63])], [labels[20], labels[40], labels[63]] train_data, train_labels = np.delete(data, [20, 40, 63], axis=0), np.delete(labels, [20, 40, 63], axis=0) val_data_set = tf.data.Dataset.from_tensor_slices((val_data, val_labels)) data_set = tf.data.Dataset.from_tensor_slices((train_data, train_labels)) data_set = data_set.shuffle(len(list(data_set.as_numpy_iterator())), reshuffle_each_iteration=True) data_set = data_set.repeat(5) predictor = RNNPredictor(256) epochs = range(10000) loss_object = tf.keras.losses.MeanSquaredError() optimizer = tf.keras.optimizers.Adam(lr=1e-2, clipvalue=1)
from inference import inference
from postprocess import get_rle_encoding

CW_DIR = os.getcwd()
TRAIN_DIRS = [
    os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'data', 'stage1_train'),
    os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'extra_data'),
    os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage1_test',
                 'DSB2018_stage1_test-master', 'stage1_test')
]
TEST_DIR = os.path.join(os.path.dirname(CW_DIR), 'data_bowl', 'stage2_test')
IMG_DIR_NAME = 'images'
MASK_DIR_NAME = 'masks'

train_df = read_train_data_properties(TRAIN_DIRS, IMG_DIR_NAME, MASK_DIR_NAME)
test_df = read_test_data_properties(TEST_DIR, IMG_DIR_NAME)
x_train, y_train, contour_train, no_contour_train = load_train_data(train_df)
y_train_full = np.array([
    np.concatenate((x, y, z), axis=2)
    for x, y, z in zip(y_train, contour_train, no_contour_train)
])
labels_train = get_train_labels(train_df)
x_test = load_test_data(test_df)

model_paths = train(train_df, y_train_full, labels_train)
y_prediction = inference(x_test, model_paths)
y_test_rle, y_test_ids = get_rle_encoding(test_df, y_prediction)

sub = pd.DataFrame()
sub['ImageId'] = y_test_ids
sub['EncodedPixels'] = pd.Series(y_test_rle).apply(