def train(config, Model):
    # Load the data and create X and Y matrices
    data = get_data(config)
    num_features = data.shape[1] - 1
    X = data[:, :num_features]
    Y = data[:, -1]

    # split the data into training and test set
    X_train, Y_train, X_test, Y_test = split_data(X, Y, 0.80, balance_dist=True)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)
    Y_train = to_categorical(Y_train)
    Y_test = to_categorical(Y_test)

    # instantiate the CNN model and train on the data
    model = Model(num_features, Y_train.shape[1])
    history = model.fit(X_train, Y_train, batch_size=128, epochs=100, verbose=2)

    # Evaluate the trained model on test data and print the accuracy
    score = model.model.evaluate(X_test, Y_test)
    print("\nTest accuracy: ", round(score[1] * 100, 2))
    print("Test loss: ", round(score[0], 2))

    return history
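# The train() function above assumes a split_data(X, Y, train_frac, balance_dist=...)
# helper that returns X_train, Y_train, X_test, Y_test. Below is a minimal,
# hypothetical sketch of such a helper (per-class stratified 80/20 split with numpy);
# the project's actual data module may differ.
import numpy as np

def split_data(X, Y, train_frac=0.80, balance_dist=False):
    """Split X/Y into train and test sets; optionally stratify per class."""
    rng = np.random.default_rng(0)
    train_idx, test_idx = [], []
    if balance_dist:
        # draw train_frac of the samples from every class separately
        for cls in np.unique(Y):
            idx = np.flatnonzero(Y == cls)
            rng.shuffle(idx)
            cut = int(len(idx) * train_frac)
            train_idx.extend(idx[:cut])
            test_idx.extend(idx[cut:])
    else:
        idx = rng.permutation(len(Y))
        cut = int(len(Y) * train_frac)
        train_idx, test_idx = idx[:cut], idx[cut:]
    train_idx, test_idx = np.asarray(train_idx), np.asarray(test_idx)
    return X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]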
def main():
    import config
    from model import load_model

    model = load_model()
    while not model:
        config.model_path = input('valid model: ')
        model = load_model()

    from data import load_data, split_data

    d = load_data(with_meta=True)
    d, _ = split_data(d)
    # from random import shuffle
    # shuffle(d)
    d = d[:config.hm_output_file]

    for i, (seq, meta) in enumerate(d):

        from model import respond_to
        _, seq = respond_to(model, [seq[:config.hm_extra_steps]],
                            training_run=False, extra_steps=config.hm_extra_steps)

        seq = seq.detach()
        if config.use_gpu:
            seq = seq.cpu()
        seq = seq.numpy()

        from data import data_to_audio, write

        seq = data_to_audio(seq, meta)
        write(f'{config.output_file}{i}.wav', config.sample_rate, seq)
def main():
    # Generate and split data
    # Try and play with arguments
    all_data = data.generate_data_gauss(numSamples=1000, noise=0.5)
    train_data, valid_data = data.split_data(all_data, val_factor=0.3)

    # Set show to True if you want to see the generated dataset
    data.plot_data(train_data, valid_data, show=False)

    # Directory to save summaries to
    # From your conda environment run
    #   tensorboard --logdir ../tf_playground/output
    # to see training details
    output = utils.get_output_dir()

    # Create model
    # Go to the model.py file to make changes to the model
    model = Model()

    # Let's train
    # Try changing the number of epochs and batch_size
    trainer = Trainer(train_data=train_data, valid_data=valid_data, model=model,
                      epochs=10, batch_size=2, output=output)
    trainer.train()
    trainer.save_final_accuracy()
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception('Problem with flags: %s' % unused_argv)

    # choose what level of logging you want
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    # user_rating has the elements in the following order:
    # user_id, item_id, rating, time, num_words, review
    user_rating, user_id_to_idx, item_id_to_idx = read_file(FLAGS.data_path)
    num_users = len(user_id_to_idx)
    num_items = len(item_id_to_idx)
    num_reviews = len(user_rating)
    print('Number of total users / items / reviews: %d / %d / %d' %
          (num_users, num_items, num_reviews))

    users_ratings = [ur for ur in user_rating]
    train_ratings, test_ratings, valid_ratings = split_data(users_ratings)

    # build vocabulary
    id_to_word, word_to_id = build_vocab(users_ratings, FLAGS.vocab_size)
    train_item_doc = token_to_id(train_ratings, word_to_id)
    valid_item_doc = token_to_id(valid_ratings, word_to_id)

    current_datetime = datetime.now()
    subfolder_timestamp = datetime.strftime(current_datetime, '%Y%m%d-%H%M%S')
    subfolder_dataname = os.path.basename(FLAGS.data_path)
    log_folder = os.path.join(FLAGS.log_root,
                              subfolder_dataname + '-' + subfolder_timestamp)

    # save vocab to output folder
    pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True)
    with open(os.path.join(log_folder, 'vocab.csv'), 'w') as f:
        for idx, token in id_to_word.items():
            f.write('%s,%s\n' % (idx, token))

    # Try offset model
    offset_model = offsetModel(train_ratings, valid_ratings, test_ratings)
    offset_model.train()

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = ['init_stddev', 'emb_dim', 'min_kappa', 'max_kappa', 'vocab_size',
                   'mu', 'max_iter_steps', 'num_iter_steps', 'threshold']
    hps_dict = {}
    for key, val in FLAGS.flag_values_dict().items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps = namedtuple('HParams', hps_dict.keys())(**hps_dict)

    hft_model = HFTModel(hps, train_ratings, valid_ratings, test_ratings,
                         train_item_doc, valid_item_doc,
                         num_users, num_items, num_reviews, log_folder)
    hft_model.build_graph()
    hft_model.train()
def test_precision(self):
    df = pd.read_pickle('../data/final/df_final.pkl')
    data = d.split_data(df, True)
    data_train = data[0]
    data_test = data[1]
    data_val = data[2]
    b = base.baseline(df, False)
    als_result = als_precision(data_train, data_val, b)
    # placeholder assertion: this test only checks that the pipeline runs without raising
    assert 1 == 1
def main(disp_text=True):

    if config.fresh_model:
        config.all_losses = []
        save_model(make_model())
        model = load_model()
        if disp_text: print('created model.', end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model())
            model = load_model()
            if disp_text: print('created model.', end=' ')
        else:
            if disp_text: print('loaded model.', end=' ')

    data = load_data()
    data, data_dev = split_data(data)

    data = [d for i, d in enumerate(data) if i in [8, 10, 13, 14]]

    print()
    seq_lens = [len(d) for d in data]
    print(f'seq lens: {seq_lens}')
    min_seq_len = min(seq_lens)
    print(f'min seq len: {min_seq_len}')
    if not config.max_seq_len or config.max_seq_len > min_seq_len:
        config.max_seq_len = min_seq_len
    data = [d[:config.max_seq_len] for d in data]

    # from random import choice
    # from torch import randn
    # data = [[randn(config.in_size) for _ in range(choice(range(config.max_seq_len//2,config.max_seq_len)))] for _ in range(10)]
    # data_dev = []
    # for d in data: print(len(d))

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
    elif config.batch_size < 1:
        config.batch_size = int(len(data) * config.batch_size)

    if disp_text: print(f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}')

    for ep in range(config.hm_epochs):
        for i, batch in enumerate(batchify_data(data)):
            train_on(model, batch)

    return model
def load_data(paths, report=sys.stdout, **kwargs):
    import data

    dtypes = [parse_type(path) for path in paths]
    if not all(t == dtypes[0] for t in dtypes[1:]):
        print("Error: all files must be of same type but were {}".format(dtypes), file=report)
        raise Exception()
    dtype = dtypes[0]
    labels = dtype == labeled

    Xt, Yt, Xv, Yv = data.split_data(read_files(paths, report=report, **kwargs),
                                     labels=labels, report=sys.stdout, **kwargs)

    if labels:
        labels = max(np.max(Yt), np.max(Yv)) + 1
    else:
        labels = 0

    return Xt, Yt, Xv, Yv, labels
def main(args):
    x, fx = get_data(args)
    device = torch.device("cuda" if args.cuda else "cpu")
    train_data, val_data = split_data(args, x, fx)
    if args.save_splits:
        save_splits(train_data, val_data)
    train_loader, val_loader = get_loaders(train_data, val_data)
    model = get_model(args)
    trainer = get_trainer(model, train_loader, val_loader, device, args)
    trainer.train()
def main():
    import config
    from model import load_model

    model = load_model(config.model_path + '_final')
    while not model:
        config.model_path = input('valid model: ')
        model = load_model()

    from data import load_data, split_data

    d = load_data()
    d, _ = split_data(d)
    # from random import shuffle
    # shuffle(d)
    # d = d[:config.hm_output_file]
    d = [d[8]]  # [8,10,13,14]]
    config.polyphony = True

    for i, seq in enumerate(d):

        from model import respond_to
        seq = respond_to(model, seq[:1])

        seq = [t.detach() for t in seq]
        if config.use_gpu:
            seq = [t.cpu() for t in seq]
        seq = [t.numpy() for t in seq]

        from data import note_reverse_dict, convert_to_midi

        seq_converted = []
        for timestep in seq:
            if config.act_fn == 't':
                timestep = (timestep + 1) / 2

            if config.polyphony:
                t_converted = ''
                for i, e in enumerate(timestep[0]):
                    if e > config.pick_threshold:
                        t_converted += note_reverse_dict[i % 12] + str(int(i / 12) + config.min_octave) \
                            if i != config.out_size - 1 else 'R'
                        t_converted += ','
                t_converted = t_converted[:-1] if len(t_converted) else 'R'
            else:
                i = timestep[0].argmax()
                t_converted = note_reverse_dict[i % 12] + str(int(i / 12) + config.min_octave)

            seq_converted.append(t_converted)

        convert_to_midi(seq_converted).show()
def get_stats():
    config = {
        'unknown_freq': 2,
        'gold_ratio': 0.1,
        'inc_option': 'auxiliary',
        'auxiliary_option': 'detection',
        'seed': 66
    }
    dir_path = '/path/to/working/dir'
    set_random_seed(config['seed'])
    train_file = dir_path + '/data/ontonotes.development.ner'

    print('load data')
    train_data = get_data(train_file)
    gold_data, inc_data = split_data(train_data, config)

    print('get vocabulary')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    print('word_to_ix', len(word_to_ix))

    print('process data')
    inc_input_ids, inc_sent_ids, inc_pos_ids, inc_ner_ids = process_data(
        inc_data, word_to_ix, pos_to_ix, ner_to_ix)
    inc_ner_ids = get_incidental_data(inc_sent_ids, inc_input_ids, inc_pos_ids,
                                      inc_ner_ids, config)

    inc_label_counter = Counter()
    for label in inc_ner_ids:
        # if label[0] == 'B' or label[0] == 'I':
        #     label = label[2:]
        inc_label_counter[label] += 1 / len(inc_ner_ids)
    print('inc label counter', inc_label_counter)

    inputs, sent_ids, pos_labels, ner_labels = inc_data
    word_seqs = generate_sent_seqs(inputs, sent_ids)
    pos_seqs = generate_sent_seqs(pos_labels, sent_ids)
    ner_seqs = generate_sent_seqs(ner_labels, sent_ids)

    inc_data = []
    sent_counter = Counter()
    for x in range(len(word_seqs)):
        inc_data.append((word_seqs[x], pos_seqs[x], ner_seqs[x]))
        sent_counter[len(word_seqs[x])] += 1 / len(word_seqs)
    print('average sent length', len(sent_ids) / len(word_seqs))
    print('sent length distribution', sent_counter.items())
def main(images_path, labels_path):
    keras.backend.clear_session()

    data_df = get_data(images_path, labels_path)
    raw_train, valid = split_data(data_df)

    model = create_model(num_classes=28, input_shape=input_shape)
    model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["acc", f1])
    # model.compile(loss=[_focal_loss(gamma=2,alpha=0.75)], optimizer=Adam(), metrics=["acc", f1])

    epochs = 50
    batch_size = 64
    checkpointer = ModelCheckpoint("../working/InceptionResNetV2.model",
                                   verbose=2, save_best_only=True)
    early_stopping = EarlyStopping(monitor="val_loss", patience=2)
    reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=1, factor=0.1)

    train_generator = DataGenerator.create_train(raw_train, batch_size,
                                                 DEFAULT_IMG_SIZE_WHC, augument=True)
    validation_generator = DataGenerator.create_train(valid, 100,
                                                      DEFAULT_IMG_SIZE_WHC, augument=False)

    train_steps = raw_train.shape[0] // batch_size
    valid_steps = valid.shape[0] // batch_size

    # train model
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=train_steps,
        validation_data=next(validation_generator),
        validation_steps=valid_steps,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpointer, reduce_lr],
    )
def main():
    try:
        feature1 = request.form["feature1"]
        feature2 = request.form["feature2"]
        classifier = request.form["classifier"]
    except KeyError:
        error = "Warning! Missing selections. Please select two features from the dataset, and one classifier!"
        return render_template('select.html', error=error)

    df = read_diabetes()
    x_train, x_test, y_train, y_test = split_data(df)
    x_train, x_test = select_features(x_train, x_test, [feature1, feature2])

    clf = eval(classifier + "()")
    clf.fit(x_train, y_train)

    plot_data = build_plot(clf, x_test, y_test)
    accuracy = clf.score(x_test, y_test)

    return render_template('plot.html', accuracy=accuracy, plot_url=plot_data)
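# Note: eval(classifier + "()") above executes whatever name arrives in the form.
# A safer pattern is an explicit whitelist that maps allowed names to classes.
# This is a hypothetical sketch; the specific classifiers listed are assumptions,
# not taken from this project's select.html.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

ALLOWED_CLASSIFIERS = {
    "LogisticRegression": LogisticRegression,
    "KNeighborsClassifier": KNeighborsClassifier,
    "DecisionTreeClassifier": DecisionTreeClassifier,
}

def make_classifier(name):
    """Instantiate a classifier only if the submitted name is whitelisted."""
    try:
        return ALLOWED_CLASSIFIERS[name]()
    except KeyError:
        raise ValueError(f"Unsupported classifier: {name!r}")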
def preproc_data():
    from data import split_data
    split_data('../data/hin-eng/hin.txt', '../data/hin-eng')
def CNN(X_train, X_test, y_train, y_test):
    X_train, X_test, y_train, y_test = reshape_data(X_train, X_test, y_train, y_test)

    model = Sequential()
    model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))
    model.add(Conv2D(32, kernel_size=3, activation='relu'))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=10, batch_size=100)

    plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    plt.title('Loss Function')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    plt.show()


if __name__ == "__main__":
    train_x, test_x, train_y, test_y = split_data()
    CNN(train_x, test_x, train_y, test_y)
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the current token
        total_p += math.log(p)  # natural log (base e)
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)
    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens, config.batch_size)
    # pass the batchified data directly for training
    train(model, batch_source=train_batch_source)
    # test
    evaluate(model, test_dataset, dict)
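# Perplexity recap (context for evaluate_iter above): for a sentence of N tokens
# with model probabilities p_1..p_N, perplexity = exp(-(1/N) * sum_i log(p_i)),
# which is exactly what the return line computes from the accumulated log-probabilities.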
def train(gpu: int, args: Namespace):
    """Implements the training loop for a PyTorch model.

    Args:
        gpu: the GPU device
        args: user defined arguments
    """
    # setup process groups
    rank = args.nr * args.gpus + gpu
    setup(rank, args)

    # define the model
    model = ResNext().architecture
    model.cuda(gpu)
    # Wrap the model
    model = DDP(model, device_ids=[gpu])

    # define loss function (criterion) and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), args.lr)

    # split data
    train_df = split_data(args.folds)

    for fold in range(args.folds):
        losses = []
        scores = []
        train_loader, valid_loader = get_data(args, train_df, fold, rank)

        if gpu == 0:
            print(f"Training started using fold {fold} for validation")

        # train
        model.train()
        for epoch in range(args.epochs):
            for i, (images, labels) in enumerate(train_loader):
                images = images.cuda(gpu)
                labels = labels.cuda(gpu)
                output = model(images)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if i % args.log_interval == 0 and gpu == 0:
                    print("Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                        epoch + 1, i, len(train_loader),
                        100. * i / len(train_loader), loss.item()))

        # evaluate
        model.eval()
        with torch.no_grad():
            for i, (images, labels) in enumerate(valid_loader):
                images = images.cuda(gpu)
                labels = labels.cuda(gpu)
                output = model(images)
                loss = criterion(output, labels).item()
                score = get_score(labels.detach().cpu(), output.detach().cpu())
                losses.append(loss)
                scores.append(score)
            if gpu == 0:
                print("Validation loss={:.4f}\tAUC score={:.4f}".format(
                    statistics.mean(losses), statistics.mean(scores)))

        # checkpoint model
        model = checkpoint(model, gpu, fold)

    if args.save_model and gpu == 0:
        torch.save(model.module.state_dict(), "model.pt")

    cleanup()
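# The train() function above relies on setup()/cleanup() helpers that are not shown.
# A minimal sketch of what they typically look like with torch.distributed follows;
# the MASTER_ADDR/MASTER_PORT values and the use of args.nodes for world size are
# assumptions, not taken from this project.
import os
import torch.distributed as dist

def setup(rank: int, args):
    """Join the NCCL process group so DDP can synchronize gradients across workers."""
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group(backend="nccl", rank=rank,
                            world_size=args.gpus * args.nodes)

def cleanup():
    """Tear down the process group at the end of training."""
    dist.destroy_process_group()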
from data import get_train_data, get_vocab, split_data, response_len, post_len, padding
import random
import os
from pprint import pprint
import numpy as np
import time

id2w, w2id, freq = get_vocab()

from emo_cls.classification import Classification
from seq2seq_attention_9emo import Seq2SeqAttentionMinDis, Seq2SeqAttentionMaxDis, Seq2SeqAttentionEmoContent
from seq2seq_attention_9emo import Seq2SeqAttentionHappy, Seq2SeqAttentionSad, Seq2SeqAttentionAnger, Seq2SeqAttentionDisgust
from seq2seq_attention_9emo import Seq2SeqAttentionLike  # ,Seq2SeqAttentionSurprise,Seq2SeqAttentionFear

train_datas, val_datas, test_datas = split_data()
keys = ['posts', 'postLen', 'resps', 'respLen', 'resp_tfidf']
train_datas = [train_datas[k] for k in keys]
val_datas = [val_datas[k] for k in keys]
print("train num:%s" % len(train_datas[0]))

seq_len = 20
batch_size = 128
D_step = 5
G_step = 1
is_debug = True

# Emotion Classifier
emo_clas = Classification(sequence_length=20, num_classes=6, l2_reg_lambda=0.1)
emo_clas.restore_last_session(base_path="./emo_cls")
import data
import utils
import log
import Model  # implied by the Model.Model reference below

log = log.log
log.struct_log(log)

pre_method = 'Rescaling'
train_method = 'RandomForest'
time_train = 300
time_test = 10
n_stock_select = 10
seed = 41

data = data.data
data.read_data(data)
data.split_data(data)
data.pre_process(data, pre_method)

model = Model.Model
model.read_data(model)
model.roll_train(model, train_method, time_train, time_test)

utils = utils.utils
utils.parameter(utils, n_stock_select, seed)
utils.struct_strategy(utils)
utils.merging_index(utils)
log.logger.info(utils.strategy)
utils.print_winrate(utils)
utils.plot_value(utils)
model_files = glob.glob('models/*.hdf5')
other_models = glob.glob('models/*/*-0.6*hdf5')
model_files.extend(other_models)

public_test_dict = {}
private_test_dict = {}
results = {}

for model_file in model_files:
    model = load_model(model_file, compile=False)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    input_shape = model.input_shape[1:4]

    if input_shape not in public_test_dict.keys():
        faces, emotions = load_emotion_data('data/fer2013/fer2013.csv', input_shape)
        train, test = split_data(faces, emotions, 0.2)
        public_test_dict[input_shape], private_test_dict[input_shape] = split_data(test[0], test[1], 0.5)

    start = time.time()
    public_test_result = model.evaluate(public_test_dict[input_shape][0],
                                        public_test_dict[input_shape][1])
    private_test_result = model.evaluate(private_test_dict[input_shape][0],
                                         private_test_dict[input_shape][1])
    duration = time.time() - start

    print(model_file)
    print('public test', public_test_result)
    print('private test', private_test_result)
    results[model_file] = {'public_acc': public_test_result[1],
                           'private_acc': private_test_result[1],
                           'time': duration}

print(results)

import json
json.dump(results, open('test.json', 'w'))
def main():
    global ADV_WEIGHT, TRANSFER_WEIGHT

    # set random seed
    np.random.seed(42)
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True

    # Parsing arguments
    parser = argparse.ArgumentParser(description='signer-independent project')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--mode', type=str, default='test')
    parser.add_argument('--gpu', type=int, required=True)
    parser.add_argument('--adv_weight', type=float, required=True)
    parser.add_argument('--transf_weight', type=float, required=True)
    parser.add_argument('--output', default='./output_cnn/')
    args = parser.parse_args()

    # set adversarial and transfer weights
    TRANSFER_WEIGHT = args.transf_weight
    ADV_WEIGHT = args.adv_weight

    # Make output directory if it does not exist
    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    # select gpu
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # dataset
    dataset = DATASETS_LIST[args.dataset](model=args.model)
    X_to_split = np.zeros((len(dataset), 1))
    print(len(dataset[0]))

    # evaluation protocol
    (IM_SIZE, MODE, SPLITS, n_signers, n_classes) = get_evaluation_protocol(args.dataset)

    # get data splitter
    dataSplitter = getSplitter(dataset, n_splits=SPLITS, mode=MODE, test_size=.10)

    results = []
    split = 0
    for split, (tr_indexes, test_indexes) in enumerate(dataSplitter):
        output_fn = os.path.join(args.output, 'split_' + str(split))
        if not os.path.isdir(output_fn):
            os.mkdir(output_fn)

        # split data
        (train_loader, valid_loader, test_loader) = split_data(dataset,
                                                               (tr_indexes, test_indexes),
                                                               BATCH_SIZE,
                                                               dataAug=True,
                                                               mode=MODE)

        # Initialize the model
        model = MODEL_LIST[args.model](input_shape=IM_SIZE,
                                       output_signers=n_signers,
                                       output_classes=n_classes,
                                       hasAdversial=True).to(device)
        print(model)

        # Train or test
        if args.mode == 'train':
            # Fit model
            model, train_history, valid_loader = fit(model=model,
                                                     data=(train_loader, valid_loader),
                                                     device=device,
                                                     output=output_fn,
                                                     n_signers=n_signers)
            # save train history
            res_fn = os.path.join(*(output_fn, '_history.pckl'))
            pickle.dump(train_history, open(res_fn, "wb"))

        elif args.mode == 'test':
            model.load_state_dict(
                torch.load(os.path.join(*(output_fn, 'cnn.pth'))))

            # load train history
            res_fn = os.path.join(*(output_fn, '_history.pckl'))
            train_history = pickle.load(open(res_fn, "rb"))
            plot_fn = os.path.join(*(output_fn, 'cnn_history.png'))
            plot_train_history(train_history, plot_fn=plot_fn)

            # Test results
            (_, test_loss, _, _, _, test_acc, test_acc_3, test_acc_5) = eval_model(
                model, test_loader, n_signers, device, debug=True)

            print('##!!!! Test loss: {:.5f} |'.format(test_loss.item()) +
                  ' Test Acc: {:.5f}'.format(test_acc))

            results.append((test_loss.item(), test_acc, test_acc_3, test_acc_5))

            # TSNE maps
            # tsne(model, test_loader, device,
            #      plot_fn=os.path.join(*(output_fn, 'tsne.png')))

    # save results
    print(results)
    res_fn = os.path.join(args.output, 'res.pckl')
    pickle.dump(results, open(res_fn, "wb"))
    results = pickle.load(open(res_fn, "rb"))

    # Compute average and std
    print(results)
    acc_array = np.array([i[1] for i in results])
    acc3_array = np.array([i[2] for i in results])
    acc5_array = np.array([i[3] for i in results])
    print('Average acc: ', np.mean(acc_array))
    print('Average acc3: ', np.mean(acc3_array))
    print('Average acc5: ', np.mean(acc5_array))
    print('Std acc: ', np.std(acc_array))
    plt.ylabel(test_x.iloc[:, 1].name)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    model_accuracy = trained_model.score(test_x, test_y)
    print("train_accuracy::", model_accuracy)
    return fig


if __name__ == '__main__':
    doctors = read_diabetes()
    train_x, test_x, train_y, test_y = split_data(doctors)
    features = ['glucose', 'mass']
    train_x, test_x = select_features(train_x, test_x, features)

    # Training Logistic regression model
    clf = train_logistic_regression(train_x, train_y)
    # clf = eval(classifier + "()")
    # clf = MLPClassifier()
    # clf.fit(train_x, train_y)

    fig = visualize_plot(clf, test_x, test_y)
    plt.savefig('./static/visualize_plot.png')
    plt.show()
def preproc_data():
    from data import split_data
    split_data('/content/itr/itr/data/hin-eng/hin.txt', '/content/itr/itr/data/hin-eng')
import data
import baseline
import numpy as np

x_data, y_data = data.get_keras_data()
x_data = [" ".join(x.split()[:100]) for x in x_data]
X_train, X_val, y_train, y_val = data.split_data(x_data, y_data)
X_train, X_val = data.tf_idf(X_train, X_val)

# get baseline accuracy
models = baseline.Base_models(X_train, y_train, X_val, y_val)
print(models.BinaryRe())
print(models.powerset())  # assumed to be Base_models methods, like BinaryRe() above
print(models.mlknn())
Muraro = data.read_dataset(_path + "../data/Muraro/data.h5")
Enge = data.read_dataset(_path + "../data/Enge/data.h5")
Segerstolpe = data.read_dataset(_path + "../data/Segerstolpe/data.h5")
Xin_2016 = data.read_dataset(_path + "../data/Xin_2016/data.h5")
Lawlor = data.read_dataset(_path + "../data/Lawlor/data.h5")

merge = {'Baron_human': Baron_human, 'Muraro': Muraro, 'Enge': Enge,
         'Segerstolpe': Segerstolpe, 'Xin_2016': Xin_2016, 'Lawlor': Lawlor}
mergedexpr, mergedl = data.merge_datasets(merge)

s = mergedexpr.sum(axis=1)
x = (mergedexpr.T / s).T
x = x * 10000
# x = x[:, :1000]

whole_set = dataset.Single(x, mergedl)
x, y, z, w = data.split_data(x, mergedl)
whole_set.print_info()

exit()

x = np.load("./data/train15720data.npy")
z = np.load("./data/train15720label.npy")
y = np.load("./data/test15720data.npy")
w = np.load("./data/test15720label.npy")
train_set = dataset.Single(x, z)
test_set = dataset.Single(y, w)
dl = DataLoader(train_set, batch_size=60, shuffle=True)
def main(disp_text=True):

    if config.fresh_model:
        config.all_losses = []
        save_model(make_model())
        model = load_model()
        if disp_text: print('created model.', end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model())
            model = load_model()
            if disp_text: print('created model.', end=' ')
        else:
            if disp_text: print('loaded model.', end=' ')

    data = load_data()
    data, data_dev = split_data(data)

    # from random import choice
    # from torch import randn
    # data = [[randn(config.in_size) for _ in range(choice(range(config.max_seq_len//2,config.max_seq_len)))] for _ in range(40)]
    # data_dev = []
    # for d in data: print(len(d))

    if config.max_seq_len:
        data = [d[:config.max_seq_len] for d in data]

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
        one_batch = True
    elif config.batch_size < 1:
        config.batch_size = int(len(data) * config.batch_size)
        one_batch = False
    else:
        one_batch = False

    if disp_text: print(f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}')

    data_losss, dev_losss = [], []

    if not one_batch:
        if not config.all_losses:
            config.all_losses.append(dev_loss(model, data))
        data_losss.append(config.all_losses[-1])
        if config.dev_ratio:
            dev_losss.append(dev_loss(model, data_dev))

    if data_losss or dev_losss:
        if disp_text: print(f'initial loss(es): {data_losss[-1] if data_losss else ""} {dev_losss[-1] if dev_losss else ""}')

    for ep in range(config.hm_epochs):

        loss = 0

        for i, batch in enumerate(batchify_data(data)):
            loss += respond_to(model, batch)
            sgd(model) if config.optimizer == 'sgd' else adaptive_sgd(model)

        loss /= len(data)

        if not one_batch:
            loss = dev_loss(model, data)
        data_losss.append(loss)
        config.all_losses.append(loss)
        if config.dev_ratio:
            dev_losss.append(dev_loss(model, data_dev))

        if disp_text: print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1] if config.dev_ratio else ""}, completed @ {now()}', flush=True)

        if config.ckp_per_ep and ((ep + 1) % config.ckp_per_ep == 0):
            save_model(model, config.model_path + f'_ckp{ep}')

    if one_batch:
        data_losss.append(dev_loss(model, data))

    if disp_text: print(f'training ended @ {now()} \nfinal losses: {data_losss[-1]}, {dev_losss[-1] if config.dev_ratio else ""}', flush=True)

    show(plot(data_losss))
    if config.dev_ratio:
        show(plot(dev_losss))
    if not config.fresh_model:
        show(plot(config.all_losses))

    return model, [data_losss, dev_losss]
    plt.show()


################################################################################
################################################################################
################################################################################

################################################################################
## MAIN ########################################################################
################################################################################

if __name__ == '__main__':
    X, Y = load_data_from_csv('../data/binary.csv', -1, float)
    X, Y = bootstrap_data(X, Y, 25)
    X = X[:, 2:]

    Xtr, Xte, Ytr, Yte = split_data(X, Y, .8)

    knn = KNNClassify(Xtr, Ytr)
    print(cols((X, knn.predict(X))))
    plot_classify_2D(knn, X, Y)

################################################################################
################################################################################
################################################################################
def main():

    if config.attention_only:
        from model2 import make_model_higher, respond_to
    else:
        from model import make_model_higher, respond_to

    if config.fresh_model:
        save_model(make_model_higher())
        model = load_model()
        print('created model.', end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model_higher())
            model = load_model()
            print('created model.', end=' ')
        else:
            print('loaded model.', end=' ')
    print(f'info: {config.creation_info}')

    data = load_data(frames=not config.attention_only)
    data, data_dev = split_data(data)

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
        one_batch = True
    elif config.batch_size < 1:
        config.batch_size = int(len(data) * config.batch_size)
        one_batch = False
    else:
        one_batch = False

    print(f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}')

    data_losss, dev_losss = [], []
    if config.batch_size != len(data):
        data_losss.append(dev_loss(model, data))
    if config.dev_ratio:
        dev_losss.append(dev_loss(model, data_dev))

    if data_losss or dev_losss:
        print(f'initial loss(es): {data_losss[-1] if data_losss else ""} {dev_losss[-1] if dev_losss else ""}')

    for ep in range(config.hm_epochs):

        loss = 0

        for i, batch in enumerate(batchify_data(data, do_shuffle=not one_batch)):

            # print(f'\tbatch {i}, started @ {now()}', flush=True)

            batch_size = sum(len(sequence) for sequence in batch)
            loss += respond_to(model, batch)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else \
                adaptive_sgd(model, batch_size=batch_size)

        # loss /= sum(len(sequence) for sequence in data)

        if not one_batch:
            loss = dev_loss(model, data)
        data_losss.append(loss)
        if config.dev_ratio:
            dev_losss.append(dev_loss(model, data_dev))

        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1] if config.dev_ratio else ""}, completed @ {now()}', flush=True)

        if config.ckp_per_ep and ((ep + 1) % config.ckp_per_ep == 0):
            save_model(model, config.model_path + f'_ckp{ep}')

    # data_losss.append(dev_loss(model, data))
    # if config.dev_ratio:
    #     dev_losss.append(dev_loss(model, data_dev))

    print(f'training ended @ {now()} \nfinal losses: {data_losss[-1]}, {dev_losss[-1] if config.dev_ratio else ""}', flush=True)

    show(plot(data_losss))
    if config.dev_ratio:
        show(plot(dev_losss))

    # if input(f'Save model as {config.model_path}? (y/n): ').lower() == 'y':
    #     save_model(load_model(), config.model_path + '_prev')
    #     save_model(model)

    return model, [data_losss, dev_losss]
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', help='the name of the input corpus file')
    parser.add_argument('--seeds', help='percentage of seeds', type=float, default=0.01)
    parser.add_argument('--epochs', help='number of epochs', type=int, default=40)
    parser.add_argument('--learning_rate', help='learning rate', type=float, default=1.0)
    parser.add_argument('--param_reg', help='the regularization factor of the parameters', type=float, default=0.001)
    parser.add_argument('--ent_reg', help='the factor of entropy regularization', type=float, default=0.0)
    args = parser.parse_args()

    lasagne.random.set_rng(np.random)
    np.random.seed(0)

    features, labels, label_set = data.read_content_citeseer(args.corpus)
    split = data.split_data(labels, args.seeds)
    maxf = get_maxf(features)

    trainx, trainy = constuct_dataset(features, labels, label_set, split[0], maxf)
    testx, testy = constuct_dataset(features, labels, label_set, split[1], maxf)
    allx, ally = constuct_dataset(features, labels, label_set, features.keys(), maxf)

    input_var = sparse.csr_matrix(name='x', dtype='float32')
    un_var = sparse.csr_matrix(name='ux', dtype='float32')
    target_var = T.imatrix('targets')
    ent_target = T.ivector('ent_targets')

    network, l_entropy = build_model(input_var, maxf + 1, trainy.shape[1],
                                     args.ent_reg > 0, un_var)

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean() + regularize_layer_params(network, l2) * args.param_reg
def prepare_data(self):
    from data import split_data
    split_data('/content/itr/hin.txt', '/content/itr/')
def main(FLAGS):
    # set seed
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    with tf.device('/cpu:0'), tf.name_scope('input'):
        # load data
        data, meta = load_data(FLAGS.dataset_root, FLAGS.dataset, is_training=True)
        train_data, val_data = split_data(data, FLAGS.validate_rate)
        batch_size = FLAGS.n_class_per_iter * FLAGS.n_img_per_class
        img_shape = train_data[0].shape[1:]

        # build DataSampler
        train_data_sampler = DataSampler(train_data, meta['n_class'],
                                         FLAGS.n_class_per_iter, FLAGS.n_img_per_class)
        val_data_sampler = DataSampler(val_data, meta['n_class'],
                                       FLAGS.n_class_per_iter, FLAGS.n_img_per_class)

        # build tf_dataset for training
        train_dataset = (tf.data.Dataset.from_generator(
            lambda: train_data_sampler, (tf.float32, tf.int32),
            ([batch_size, *img_shape], [batch_size]))
            .take(FLAGS.n_iter_per_epoch)
            .flat_map(lambda x, y: tf.data.Dataset.from_tensor_slices((x, y)))
            .map(preprocess_for_train, 8)
            .batch(batch_size)
            .prefetch(1))

        # build tf_dataset for val
        val_dataset = (tf.data.Dataset.from_generator(
            lambda: val_data_sampler, (tf.float32, tf.int32),
            ([batch_size, *img_shape], [batch_size]))
            .take(100)
            .flat_map(lambda x, y: tf.data.Dataset.from_tensor_slices((x, y)))
            .map(preprocess_for_eval, 8)
            .batch(batch_size)
            .prefetch(1))

        # clean up
        del data, train_data, val_data

        # construct data iterator
        data_iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)

        # construct iterator initializer for training and validation
        train_data_init = data_iterator.make_initializer(train_dataset)
        val_data_init = data_iterator.make_initializer(val_dataset)

        # get data from data iterator
        images, labels = data_iterator.get_next()
        tf.summary.image('images', images)

    # define useful scalars
    learning_rate = tf.placeholder(tf.float32, shape=(), name='learning_rate')
    tf.summary.scalar('lr', learning_rate)
    is_training = tf.placeholder(tf.bool, [], name='is_training')
    global_step = tf.train.create_global_step()

    # define optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)

    # build the net
    model = importlib.import_module('models.{}'.format(FLAGS.model))
    net = model.Net(n_feats=FLAGS.n_feats, weight_decay=FLAGS.weight_decay)
    if net.data_format == 'channels_first' or net.data_format == 'NCHW':
        images = tf.transpose(images, [0, 3, 1, 2])

    # get features
    features = net(images, is_training)
    tf.summary.histogram('features', features)
    # summary variable defined in net
    for w in net.global_variables:
        tf.summary.histogram(w.name, w)

    with tf.name_scope('losses'):
        # compute loss, if features is l2 normed, then 2 * cosine_distance will
        # equal squared l2 distance.
        distance = 2 * custom_ops.cosine_distance(features)
        # hard mining
        arch_idx, pos_idx, neg_idx = custom_ops.semi_hard_mining(
            distance, FLAGS.n_class_per_iter, FLAGS.n_img_per_class, FLAGS.threshold)

        # triplet loss
        N_pair_lefted = tf.shape(arch_idx)[0]

        def true_fn():
            pos_distance = tf.gather_nd(distance, tf.stack([arch_idx, pos_idx], 1))
            neg_distance = tf.gather_nd(distance, tf.stack([arch_idx, neg_idx], 1))
            return custom_ops.triplet_distance(pos_distance, neg_distance, FLAGS.threshold)

        loss = tf.cond(N_pair_lefted > 0, true_fn, lambda: 0.)
        pair_rate = N_pair_lefted / (FLAGS.n_class_per_iter * FLAGS.n_img_per_class**2)

        # compute l2 regularization
        l2_reg = tf.losses.get_regularization_loss()

    with tf.name_scope('metrics') as scope:
        mean_loss, mean_loss_update_op = tf.metrics.mean(loss, name='mean_loss')
        mean_pair_rate, mean_pair_rate_update_op = tf.metrics.mean(
            pair_rate, name='mean_pair_rate')
        reset_metrics = tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope))
        metrics_update_op = tf.group(mean_loss_update_op, mean_pair_rate_update_op)
        # collect the metric summaries separately, because they need to be
        # written after the metrics update
        metric_summary = [
            tf.summary.scalar('loss', mean_loss, collections=[]),
            tf.summary.scalar('pair_rate', mean_pair_rate, collections=[])
        ]

    # compute grad
    grads_and_vars = optimizer.compute_gradients(loss + l2_reg)
    # summary grads
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name + '/grad', g)

    # run train_op and update_op together
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(train_op, *update_ops)

    # build summary
    jpg_img_str = tf.placeholder(tf.string, shape=[], name='jpg_img_str')
    emb_summary_str = tf.summary.image(
        'emb', tf.expand_dims(tf.image.decode_image(jpg_img_str, 3), 0),
        collections=[])
    train_summary_str = tf.summary.merge_all()
    metric_summary_str = tf.summary.merge(metric_summary)

    # init op
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # prepare for the logdir
    if not tf.gfile.Exists(FLAGS.logdir):
        tf.gfile.MakeDirs(FLAGS.logdir)

    # saver
    saver = tf.train.Saver(max_to_keep=FLAGS.n_epoch)

    # summary writer
    train_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'train'),
                                         tf.get_default_graph())
    val_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'val'),
                                       tf.get_default_graph())

    # session
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=0)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # do initialization
    sess.run(init_op)

    # restore
    if FLAGS.restore:
        saver.restore(sess, FLAGS.restore)

    lr_boundaries = list(map(int, FLAGS.boundaries.split(',')))
    lr_values = list(map(float, FLAGS.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)

    time_meter = TimeMeter()

    # start to train
    for e in range(FLAGS.n_epoch):
        print('-' * 40)
        print('Epoch: {:d}'.format(e))

        # training loop
        try:
            i = 0
            sess.run([train_data_init, reset_metrics])
            while True:
                lr = lr_manager.get(e)
                fetch = [train_summary_str] if i % FLAGS.log_every == 0 else []
                time_meter.start()
                result = sess.run([train_op, metrics_update_op] + fetch,
                                  {learning_rate: lr, is_training: True})
                time_meter.stop()
                if i % FLAGS.log_every == 0:
                    # fetch summary str
                    t_summary = result[-1]
                    t_metric_summary = sess.run(metric_summary_str)
                    t_loss, t_pr = sess.run([mean_loss, mean_pair_rate])
                    sess.run(reset_metrics)
                    spd = batch_size / time_meter.get_and_reset()
                    print('Iter: {:d}, LR: {:g}, Loss: {:.4f}, PR: {:.2f}, Spd: {:.2f} i/s'
                          .format(i, lr, t_loss, t_pr, spd))
                    train_writer.add_summary(t_summary,
                                             global_step=sess.run(global_step))
                    train_writer.add_summary(t_metric_summary,
                                             global_step=sess.run(global_step))
                i += 1
        except tf.errors.OutOfRangeError:
            pass

        # save checkpoint
        saver.save(sess, '{}/{}'.format(FLAGS.logdir, FLAGS.model),
                   global_step=sess.run(global_step),
                   write_meta_graph=False)

        # val loop
        try:
            sess.run([val_data_init, reset_metrics])
            v_flist, v_llist = [], []
            v_iter = 0
            while True:
                v_feats, v_labels, _ = sess.run(
                    [features, labels, metrics_update_op], {is_training: False})
                if v_iter < FLAGS.n_iter_for_emb:
                    v_flist.append(v_feats)
                    v_llist.append(v_labels)
                v_iter += 1
        except tf.errors.OutOfRangeError:
            pass

        v_loss, v_pr = sess.run([mean_loss, mean_pair_rate])
        print('[VAL]Loss: {:.4f}, PR: {:.2f}'.format(v_loss, v_pr))
        v_jpg_str = feat2emb(np.concatenate(v_flist, axis=0),
                             np.concatenate(v_llist, axis=0),
                             TSNE_transform if int(FLAGS.n_feats) > 2 else None)
        val_writer.add_summary(sess.run(metric_summary_str),
                               global_step=sess.run(global_step))
        val_writer.add_summary(sess.run(emb_summary_str, {jpg_img_str: v_jpg_str}),
                               global_step=sess.run(global_step))

        print('-' * 40)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--job-dir", default="", help="Job directory for output plots") parser.add_argument("--data-path", default="diabetes.csv", help="Data directory for PIMA") parser.add_argument("--n-folds", type=int, default=10, help="Number of Folds for K-Fold Cross Validation") parser.add_argument("--n-trees", type=int, default=100, help="Number of Trees") parser.add_argument("--n-neighbors", type=int, default=10, help="Number of neighbors for critical set") parser.add_argument("--max-depth", type=int, default=10, help="Depth of search for random forest") parser.add_argument("--min-size", type=int, default=1, help="Minimum size for random forest") parser.add_argument("--n-features", type=int, default=2, help="Number of features for random forest") parser.add_argument("--sample-size", type=float, default=1.0, help="Sample size for random forest") parser.add_argument("--p-critical", type=float, default=0.5, help="Percentage of forest size using critical set") parser.add_argument("--seed", type=int, default=0, help="Random seed") args = parser.parse_args() # Set seeds for reproducibility np.random.seed(seed=args.seed) seed(args.seed) # Prep data for model training data = load_data(args.data_path) raw_data_train, raw_data_test = split_data(data) data_train, scaler, medians = pima_training_data_transformation( raw_data_train) # Evaluate algorithm on training data using K-Fold cross validation _ = evaluate_algorithm(data_train, biased_random_forest, args.n_folds, args.n_neighbors, args.p_critical, args.max_depth, args.min_size, args.sample_size, args.n_trees, args.n_features) # Train tree model on full training dataset trees = train_biased_random_forest(data_train, args.n_neighbors, args.max_depth, args.min_size, args.sample_size, args.n_trees, args.n_features, args.p_critical) # Evaluate model on test data # Prepare test data data_test = pima_test_data_transformation(raw_data_test, scaler, medians).to_numpy() test_set = list() for row in data_test: row_copy = list(row) test_set.append(row_copy) row_copy[-1] = None # Run inference on test set test_predictions, test_probs = test_random_forest(trees, test_set) test_actual = data_test[:, -1] # Evaluate test data performance print('Test Data Performance') fp_rates, tp_rates, recalls, precisions = display_metrics( test_actual, test_predictions, test_probs) # Plot final outname = "Test Data" save_prc_curve(recalls, precisions, name=outname) save_roc_curve(fp_rates, tp_rates, name=outname) # LIME df_features = data_train.iloc[:, :-1] feature_cols = df_features.columns data_features = df_features.values data_labels = data_train.iloc[:, -1].values explainer = lime.lime_tabular.LimeTabularExplainer( data_features, mode='classification', training_labels=data_labels, feature_names=feature_cols) model = BiasedRandomForestModel(trees) # ipdb is useful here for further exploration in LIME. This can also be moved to a follow-up notebook. # ipdb.set_trace() idx = 0 exp = explainer.explain_instance(data_features[idx], model.get_probs, num_features=7) exp.save_to_file('lime_rf_example0.html')
def main():
    load_dotenv('.env.general')
    config = load_config('config.yml')

    Path(config.logging.handlers.debug_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    Path(config.logging.handlers.info_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)

        mlflow.set_experiment(config.experiment.name)

        params = {}
        tags = {}
        metrics = {}
        artifacts = {}

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            if search_space is None:
                estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
                    estimator_params=estimator_params,
                    x_train_prep=x_train_prep,
                    y_train=y_train,
                    x_test_prep=x_test_prep,
                    y_test=y_test,
                    temp_dir=temp_dir)

                model = make_pipeline(preprocessor, estimator)

                params.update({f"estimator_{k}": v for k, v in estimator_params.items()})
                tags.update({f"estimator_{k}": v for k, v in estimator_tags.items()})
                metrics.update(estimator_metrics)
                artifacts.update(estimator_artifacts)
            else:
                def hyperopt_objective(search_params):
                    # This function is called for each set of hyper-parameters
                    # being tested by HyperOpt.
                    run_name = str(len(trials) - 1)

                    ho_params = {}
                    ho_tags = {}
                    ho_metrics = {}
                    ho_artifacts = {}

                    search_params = flatten_params(search_params)
                    search_params = prep_params(search_params)
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(search_params)

                    with mlflow.start_run(nested=True, run_name=run_name):
                        ho_estimator, ho_estimator_tags, ho_estimator_metrics, ho_estimator_artifacts = train_run(
                            estimator_params=ho_estimator_params,
                            x_train_prep=x_train_prep,
                            y_train=y_train,
                            x_test_prep=x_test_prep,
                            y_test=y_test,
                            temp_dir=temp_dir / run_name)

                        ho_model = make_pipeline(preprocessor, ho_estimator)

                        ho_params.update({f"estimator_{k}": v for k, v in ho_estimator_params.items()})
                        ho_tags.update({f"estimator_{k}": v for k, v in ho_estimator_tags.items()})
                        ho_metrics.update(ho_estimator_metrics)
                        ho_artifacts.update(ho_estimator_artifacts)

                        ho_tags['hyperopt'] = True

                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)

                    loss = 1 - ho_metrics[config.evaluation.primary_metric]
                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),
                     show_progressbar=False)

                model = trials.best_trial['result']['model']
                params = trials.best_trial['result']['params']
                tags = trials.best_trial['result']['tags']
                metrics = trials.best_trial['result']['metrics']
                artifacts = trials.best_trial['result']['artifacts']

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=Path(temp_dir) / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test, y_test), model, params, tags, metrics, artifacts
args = parser.parse_args()

# for TPU
os.environ["WANDB_API_KEY"] = "0"  # to silence warning
device = xm.xla_device()
print('Found TPU at: {}'.format(device))

# For reproducibility
np.random.seed(args.seed)

# Open train and test csv files using the pandas library
train_df = pd.read_csv(args.train_file)
test_df = pd.read_csv(args.test_file)

# Split the training dataset into two parts - the data we will train the model
# with and a validation set.
train_df, validation_df = data.split_data(train_df)

# Check the number of rows and columns in the subsets after the split
print("Train data shape after split: {} \n".format(train_df.shape))
print("Validation data shape after split: {} \n".format(validation_df.shape))

# Augment training data
train_df = data.augment_data(train_df, test_df,
                             use_xnli=args.load_xnli,
                             use_mnli=args.load_mnli,
                             use_bt=args.back_translate,
                             bt_filepath=args.bt_file)

# Define the tokenizer to preprocess the input data