def write_results(config: configure_finetuning.FinetuningConfig, results): """Write evaluation metrics to disk.""" utils.log("Writing results to", config.results_txt) utils.mkdir(config.results_txt.rsplit("/", 1)[0]) utils.write_pickle(results, config.results_pkl) with tf.io.gfile.GFile(config.results_txt, "a") as f: results_str = "" for trial_results in results: for task_name, task_results in trial_results.items(): if task_name == "time" or task_name == "global_step": continue results_str += task_name + ": " + " - ".join([ "{:}: {:.2f}".format(k, v) for k, v in task_results.items() ]) + "\n" # Neptune Metric Logging neptune.append_tag('ft') neptune.append_tag('tensorflow') neptune.set_property('task', task_name) for k, v in task_results.items(): neptune.log_metric(k, v) f.write(results_str) utils.write_pickle(results, config.results_pkl)
def main(argv): gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param, skip_unknown=True) op_config_str = gin.config._CONFIG use_neptune = "NEPTUNE_API_TOKEN" in os.environ if use_neptune: params = utils.get_gin_params_as_dict(gin.config._CONFIG) neptune.init(project_qualified_name="melindafkiss/sandbox") exp = neptune.create_experiment(params=params, name="exp") #ONLY WORKS FOR ONE GIN-CONFIG FILE with open(FLAGS.gin_file[0]) as ginf: param = ginf.readline() while param: param = param.replace('.','-').replace('=','-').replace(' ','').replace('\'','').replace('\n','').replace('@','') #neptune.append_tag(param) param = ginf.readline() #for tag in opts['tags'].split(','): # neptune.append_tag(tag) else: neptune.init('shared/onboarding', api_token='ANONYMOUS', backend=neptune.OfflineBackend()) er = ExperimentRunner(prefix=exp.id) er.train() params = utils.get_gin_params_as_dict(gin.config._OPERATIVE_CONFIG) for k, v in params.items(): neptune.set_property(k, v) neptune.stop() print('fin')
def main(): neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'), project_qualified_name=os.getenv('NEPTUNE_PROJECT')) application_table_path = os.path.join(RAW_DATA_DIRPATH, 'application_train.csv.zip') application_table = pd.read_csv(application_table_path, nrows=NROWS) index_table = application_table[['SK_ID_CURR', 'TARGET']] with neptune.create_experiment(name='validation schema', tags=['processed', 'validation'], upload_source_files=get_filepaths()): train_idx, valid_idx = train_test_split(index_table, test_size=TEST_SIZE, random_state=SEED) train_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH, 'train_idx.csv') train_idx.to_csv(train_idx_path, index=None) neptune.send_artifact(train_idx_path) neptune.set_property('train_split_version', md5_hash(train_idx_path)) valid_idx_path = os.path.join(INTERIM_FEATURES_DIRPATH, 'valid_idx.csv') valid_idx.to_csv(valid_idx_path, index=None) neptune.send_artifact(valid_idx_path) neptune.set_property('valid_split_version', md5_hash(valid_idx_path))
def resource_event(self, filename): if filename not in self.resources: md5 = get_digest(filename) self.resources[filename] = md5 neptune.set_property('resources', str(list(self.resources.keys()))) neptune.set_property(filename, self.resources[filename])
def resource_event(self, filename): if filename not in self.resources: new_prefix = self._create_new_prefix() self.resources[filename] = new_prefix md5 = get_digest(filename) neptune.set_property('{}data_path'.format(new_prefix), filename) neptune.set_property('{}data_version'.format(new_prefix), md5)
def main(): print('loading data') train_features_path = os.path.join( FEATURES_DATA_PATH, 'train_features_' + FEATURE_NAME + '.csv') print('... train') train = pd.read_csv(train_features_path, nrows=TRAINING_PARAMS['nrows']) idx_split = int( (1 - VALIDATION_PARAMS['validation_fraction']) * len(train)) train, valid = train[:idx_split], train[idx_split:] train = sample_negative_class( train, fraction=TRAINING_PARAMS['negative_sample_fraction'], seed=TRAINING_PARAMS['negative_sample_seed']) @skopt.utils.use_named_args(SPACE) def objective(**params): model_params = {**params, **STATIC_PARAMS} valid_preds = fit_predict(train, valid, None, model_params, TRAINING_PARAMS, fine_tuning=True) valid_auc = roc_auc_score(valid['isFraud'], valid_preds) return -1.0 * valid_auc experiment_params = { **STATIC_PARAMS, **TRAINING_PARAMS, **HPO_PARAMS, } with neptune.create_experiment(name='skopt forest sweep', params=experiment_params, tags=['skopt', 'forest', 'tune'], upload_source_files=get_filepaths()): print('logging data version') log_data_version(train_features_path, prefix='train_features_') results = skopt.forest_minimize(objective, SPACE, callback=[sk_utils.NeptuneMonitor()], **HPO_PARAMS) best_auc = -1.0 * results.fun best_params = results.x neptune.send_metric('valid_auc', best_auc) neptune.set_property('best_parameters', str(best_params)) sk_utils.send_best_parameters(results) sk_utils.send_plot_convergence(results, channel_name='diagnostics_hpo') sk_utils.send_plot_evaluations(results, channel_name='diagnostics_hpo') sk_utils.send_plot_objective(results, channel_name='diagnostics_hpo')
def validate(self): x = np.random.randn(10, 50) y = np.random.randn(10, 10) costs = [] with torch.no_grad(): for i in range(100): x = torch.tensor(x, device=device, dtype=torch.float) y = torch.tensor(y, device=device, dtype=torch.float) y_hat = self.lin(x) cost = torch.mean((y - y_hat) ** 2).item() costs.append(cost) total_cost = np.mean(costs) neptune.set_property( "validation", {"step": self.step, "epoch": self.epoch, "cost": total_cost}, )
def main(): print('started experimnent') with neptune.create_experiment( name='feature engineering', tags=['feature-extraction', FEATURE_NAME], upload_source_files=get_filepaths(), properties={'feature_version': FEATURE_NAME}): print('loading data') train = load_and_merge(RAW_DATA_PATH, 'train', NROWS)[ID_COLS + V1_COLS + ['isFraud']] test = load_and_merge(RAW_DATA_PATH, 'test', NROWS)[ID_COLS + V1_COLS] categorical_cols = set(V1_CAT_COLS) print('cleaning data') email_cols = ['P_emaildomain', 'R_emaildomain'] train, new_email_cols = clean_email(train, email_cols) test, _ = clean_email(test, email_cols) categorical_cols.update(new_email_cols) for col in email_cols: categorical_cols.remove(col) categorical_cols = list(categorical_cols) neptune.set_property('categorical_columns', str(categorical_cols)) print('encoding categoricals') encoder = OrdinalEncoder(cols=categorical_cols).fit( train[ID_COLS + categorical_cols]) train[ID_COLS + categorical_cols] = encoder.transform( train[ID_COLS + categorical_cols]) test[ID_COLS + categorical_cols] = encoder.transform( test[ID_COLS + categorical_cols]) train_features_path = os.path.join( FEATURES_DATA_PATH, 'train_features_{}.csv'.format(FEATURE_NAME)) print('saving train to {}'.format(train_features_path)) train.to_csv(train_features_path, index=None) log_data_version(train_features_path, prefix='train_features_') test_features_path = os.path.join( FEATURES_DATA_PATH, 'test_features_{}.csv'.format(FEATURE_NAME)) print('saving test to {}'.format(test_features_path)) test.to_csv(test_features_path, index=None) log_data_version(test_features_path, prefix='test_features_')
def add_params(self, params, step=None): ''' Adds parameters to experiment log Parameters ---------- params : Dict Key-Value pairs Returns ------- None. ''' if self.neptune: for key, value in params.items(): neptune.set_property(key, value) if step is not None: neptune.set_property('step', step) if self.comet: self.comet_experiment.log_parameters(params, step=step)
def train(self): x = np.random.randn(10, 50) y = np.random.randn(10, 10) for epoch in range(10): self.epoch = epoch for i in range(100): x = torch.tensor(x, device=device, dtype=torch.float) y = torch.tensor(y, device=device, dtype=torch.float) y_hat = self.lin(x) cost = torch.mean((y - y_hat) ** 2) self.opt.zero_grad() cost.backward() self.opt.step() neptune.send_metric("epoch_cost", self.epoch, cost.item()) self.step += 1 self.validate() neptune.set_property("epoch", self.epoch) neptune.set_property("cost", cost.item()) neptune.set_property("step", self.step)
def main(): neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'), project_qualified_name=os.getenv('NEPTUNE_PROJECT')) interim_feature_paths = [APPLICATION_FEATURES_PATH, BUREAU_FEATURES_PATH] with neptune.create_experiment( name='feature_extraction', tags=['processed', 'feature_extraction', 'joined_features'], upload_source_files=get_filepaths()): features = pd.read_csv(interim_feature_paths[0], usecols=['SK_ID_CURR'], nrows=NROWS) for path in interim_feature_paths: df = pd.read_csv(path, nrows=NROWS) features = features.merge(df, on='SK_ID_CURR') features.to_csv(PROCESSED_FEATURES_FILEPATH, index=None) neptune.set_property('features_version', md5_hash(PROCESSED_FEATURES_FILEPATH)) neptune.set_property('features_path', PROCESSED_FEATURES_FILEPATH)
def main(): neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'), project_qualified_name=os.getenv('NEPTUNE_PROJECT')) application_raw_path = os.path.join(RAW_DATA_DIRPATH, 'application_train.csv.zip') application_raw = pd.read_csv(application_raw_path, nrows=NROWS) with neptune.create_experiment( name='feature_extraction', tags=['interim', 'application', 'feature_extraction'], upload_source_files=get_filepaths()): application_features, (numeric_cols, categorical_cols) = extract(application_raw) application_features.to_csv(INTERIM_FEATURES_DIRPATH, index=None) neptune.set_property('numeric_features', str(numeric_cols)) neptune.set_property('categorical_features', str(categorical_cols)) neptune.set_property('features_version', md5_hash(INTERIM_FEATURES_DIRPATH)) neptune.set_property('features_path', INTERIM_FEATURES_DIRPATH)
} # create experiment with neptune.create_experiment( name='classification_example', tags=['classification', 'tf_2'], upload_source_files=['classification-example.py', 'requirements.txt'], params=PARAMS): # dataset fashion_mnist = keras.datasets.fashion_mnist (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() train_images = train_images / 255.0 test_images = test_images / 255.0 neptune.set_property('train_images_version', hashlib.md5(train_images).hexdigest()) neptune.set_property('train_labels_version', hashlib.md5(train_labels).hexdigest()) neptune.set_property('test_images_version', hashlib.md5(test_images).hexdigest()) neptune.set_property('test_labels_version', hashlib.md5(test_labels).hexdigest()) class_names = [ 'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot' ] neptune.set_property('class_names', class_names) for j, class_name in enumerate(class_names):
import hashlib # prepare dataset (x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data() x_train = x_train / 255.0 x_test = x_test / 255.0 class_names = [ 'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot' ] # log data version neptune.set_property('x_train_version', hashlib.md5(x_train).hexdigest()) neptune.set_property('y_train_version', hashlib.md5(y_train).hexdigest()) neptune.set_property('x_test_version', hashlib.md5(x_test).hexdigest()) neptune.set_property('y_test_version', hashlib.md5(y_test).hexdigest()) neptune.set_property('class_names', class_names) # Prepare model and log model architecture summary # prepare model model = tf.keras.Sequential([ tf.keras.layers.Flatten(input_shape=(28, 28)), tf.keras.layers.Dense(parameters['dense_units'], activation=parameters['activation']), tf.keras.layers.Dropout(parameters['dropout']), tf.keras.layers.Dense(parameters['dense_units'],
validation_steps=len(split_data['val'][0])//PARAMS['batch_size'] # split_datasets = { # k:base_dataset.BaseDataset \ # .from_dataframe( # pd.DataFrame({ # 'path':v[0], # 'family':v[1] # })) \ # for k,v in split_data.items() # } with neptune.create_experiment(name=experiment_name, params=PARAMS): neptune.set_property('num_classes',data.num_classes) neptune.set_property('class_distribution',data.metadata.class_distribution) ########################## train_data=get_data_loader(data=split_data['train'], data_subset_mode='train', batch_size=PARAMS['batch_size'], num_channels=PARAMS['num_channels'], infinite=True, seed=2836) validation_data=get_data_loader(data=split_data['val'], data_subset_mode='val', batch_size=PARAMS['batch_size'], num_channels=PARAMS['num_channels'], infinite=True, seed=2836) train_batch = next(iter(train_data)) train_images, train_labels = train_batch[0].numpy(), train_batch[1].numpy() print(train_images.min(), train_images.max()) plt.imshow(train_images[5,:,:,:].squeeze()) ########################## num_val_samples = len(split_data['val'][0]) cm_val_data_loader = iter(get_data_loader(data=split_data['val'], data_subset_mode='val', batch_size=num_val_samples, num_channels=PARAMS['num_channels'], infinite=True, seed=2836))# \ cm_val_imgs, cm_val_labels = next(cm_val_data_loader) cm_callback = ConfusionMatrixCallback(log_dir, cm_val_imgs, cm_val_labels, classes=data.classes, seed=PARAMS['seed'])
PARAMS['epoch'] = args.epoch PARAMS['hidden1'] = args.hidden1 PARAMS['hidden2'] = args.hidden2 PARAMS['batch_size'] = args.batch_size val_test_size = 0.1 if args.log: neptune.create_experiment(name='example_with_parameters', params=PARAMS, upload_stdout=True, upload_stderr=True, send_hardware_metrics=True, upload_source_files='**/*.py') neptune.set_property("val_test_size", val_test_size) if not args.real: run = RunDecagonToy() run.run(adj_path=None, path_to_split=f'data/split/toy/{PARAMS["batch_size"]}', val_test_size=val_test_size, batch_size=PARAMS['batch_size'], num_epochs=PARAMS['epoch'], dropout=PARAMS['dropout'], max_margin=PARAMS['max_margin'], print_progress_every=150, log=args.log, on_cpu=args.cpu, upload_saved=args.upload_saved) else:
params=experiment_params, tags=['skopt', 'gp'], upload_source_files=['search_gp.py', 'basic_sweep.py', 'utils.py']): results = skopt.gp_minimize(objective, SPACE, callback=[monitor], **HPO_PARAMS) best_auc = -1.0 * results.fun best_params = results.x # log metrics print('Best Validation AUC: {}'.format(best_auc)) print('Best Params: {}'.format(best_params)) neptune.send_metric('validation auc', best_auc) neptune.set_property('best_params', str(to_named_params(best_params))) # log results skopt.dump(results, 'artifacts/gp_results.pkl') joblib.dump(SPACE, 'artifacts/gp_space.pkl') neptune.send_artifact('artifacts/gp_results.pkl') neptune.send_artifact('artifacts/gp_space.pkl') # log diagnostic plots fig, ax = plt.subplots(figsize=(16, 12)) skopt.plots.plot_convergence(results, ax=ax) fig.savefig('plots/gp_convergence.png') neptune.send_image('diagnostics', 'plots/gp_convergence.png')
def training_pipeline(args): ############################################################################### # Environment setup ############################################################################### # Set the random seed manually for reproducibility. random.seed(args.seed) torch.manual_seed(args.seed) # Check if CUDA device is available and set training on CPUs or GPUs if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) device = torch.device(args.cuda_device if args.cuda else "cpu") ############################################################################### # Experiment tracking setup ############################################################################### neptune.init(project_qualified_name='karexar/GSW-dialect-classifier') args_dict = vars(args) neptune.create_experiment(params=args_dict) if hasattr(args, 'experiment_id'): neptune.append_tag(args.experiment_id) neptune.set_property('lm_algo', 'lstm') for key in args_dict.keys(): neptune.set_property(key, args_dict[key]) ############################################################################### # Load data ############################################################################### print('Loading data') data_manager = DataManager(args.data, device, args.batch_size, args.eval_batch_size) ############################################################################### # Build the model ############################################################################### print('Building model') num_tokens = data_manager.vocab_size num_labels = data_manager.num_labels embeddings_matrix = None if args.use_pretrained_embed: # Load pre-trained word embeddings model # and generate the embeddings weight matrix for the entire vocabulary assert args.embed_algo is not None print(f'Using {args.embed_algo} pre-trained word embeddings') if args.embed_algo == 'word2vec': pretrained_embeddings = Word2VecModel(args.model_path_embed, args.model_name_embed, load_from_disk=True) embeddings_matrix = pretrained_embeddings.get_vocabulary_embeddings( data_manager.idx2word, args.embed_size) elif args.embed_algo == 'glove': pretrained_embeddings = GloveModel(args.model_path_embed, args.model_name_embed, load_from_disk=True) embeddings_matrix = pretrained_embeddings.get_vocabulary_embeddings( data_manager.idx2word, args.embed_size) model = LSTM(num_tokens, args.embed_size, args.num_hidden, args.num_layers, args.dropout, num_labels, embeddings_matrix).to(device) print('Model architecture') print(model) criterion = nn.CrossEntropyLoss() ############################################################################### # Training code ############################################################################### print('Initialising model executor') model_executor = ModelExecutor(model, data_manager, device, criterion) if args.train_lstm: # Loop over epochs learning_rate = args.learning_rate best_val_accuracy = None last_val_accuracy = 0 model_optimiser = optim.SGD(model.parameters(), lr=learning_rate) # At any point you can hit Ctrl + C to break out of training early. 
try: print('Starting the training process') for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() _, _ = model_executor.train(epoch, args.batch_size, learning_rate, model_optimiser, args.clip, args.log_interval) val_loss, val_accuracy = model_executor.evaluate( data_manager.val_iter, args.eval_batch_size) # Log result in Neptune ML neptune.send_metric('valid_loss', epoch, val_loss) neptune.send_metric('valid_accuracy', epoch, val_accuracy) neptune.send_metric('learning_rate', epoch, learning_rate) if epoch % 3 == 0: learning_rate *= 0.9 # correct the learning rate after some number of epochs print('-' * 89) print( '| End of epoch {:3d} | Time: {:5.2f}s | Valid loss {:6.2f} | ' 'Valid accuracy {:8.2f}'.format( epoch, (time.time() - epoch_start_time), val_loss, val_accuracy)) print('-' * 89) # Save the model if the validation accuracy is the best we've seen so far. if not best_val_accuracy or val_accuracy > best_val_accuracy: model_executor.model.export_model(args.model_path_lstm) best_val_accuracy = val_accuracy if val_accuracy < last_val_accuracy: # Anneal the learning rate if no improvement has been seen in the validation dataset. learning_rate /= 2.0 for group in model_optimiser.param_groups: group['lr'] = learning_rate last_val_accuracy = val_accuracy except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') ############################################################################### # Evaluation code ############################################################################### test_loss = None test_accuracy = None if args.eval_lstm: print('Evaluating on the test set') # Load the best saved model. model_executor.load_pre_trained_model(args.model_path_lstm, device=device) # Run on test data. test_loss, test_accuracy = model_executor.evaluate( data_manager.test_iter, args.eval_batch_size) # Log result in Neptune ML neptune.send_metric('test_loss', test_loss) neptune.send_metric('test_accuracy', test_accuracy) print('-' * 89) print('| End of evaluation | Test loss {:6.2f}'.format(test_loss) + ' | Test accuracy {:8.2f}'.format(test_accuracy)) print('-' * 89) ############################################################################### # Stop the experiment tracking ############################################################################### neptune.stop() return test_loss, test_accuracy
def modify_properties(self): neptune.set_property("prop", "some text") neptune.set_property("prop_number", 42) neptune.set_property("nested/prop", 42) neptune.set_property("prop_to_del", 42) neptune.set_property("prop_list", [1, 2, 3]) with open(self.text_file_path, mode="r") as f: neptune.set_property("prop_IO", f) neptune.set_property("prop_datetime", datetime.now()) neptune.remove_property("prop_to_del") exp = neptune.get_experiment() properties = exp.get_properties() assert properties["prop"] == "some text" assert properties["prop_number"] == "42" assert properties["nested/prop"] == "42" assert properties["prop_list"] == "[1, 2, 3]" assert "prop_to_del" not in properties assert (properties["prop_IO"] == "<_io.TextIOWrapper name='alpha_integration_dev/data/text.txt'" " mode='r' encoding='UTF-8'>") print(f"Properties: {properties}")
def learning(self, model, criterion, train_dataset, val_dataset, optimizer=None): self.init_learning(model, criterion) # define train and val transform train_dataset.transform = self.state['train_transform'] train_dataset.target_transform = self._state('train_target_transform') val_dataset.transform = self.state['val_transform'] val_dataset.target_transform = self._state('val_target_transform') # data loading code train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.state['batch_size'], shuffle=True, num_workers=self.state['workers']) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=self.state['batch_size_test'], shuffle=False, num_workers=self.state['workers']) # optionally resume from a checkpoint if self._state('resume') is not None: if os.path.isfile(self.state['resume']): print("=> loading checkpoint '{}'".format(self.state['resume'])) checkpoint = torch.load(self.state['resume']) self.state['start_epoch'] = checkpoint['epoch'] self.state['best_score'] = checkpoint['best_score'] model.load_state_dict(checkpoint['state_dict']) print("=> loaded checkpoint '{}' (epoch {})" .format(self.state['evaluate'], checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(self.state['resume'])) if self.state['use_gpu']: train_loader.pin_memory = True val_loader.pin_memory = True cudnn.benchmark = False model = torch.nn.DataParallel(model, device_ids=self.state['device_ids']).cuda() criterion = criterion.cuda() if self.state['evaluate']: self.validate(val_loader, model, criterion) return # TODO define optimizer for epoch in range(self.state['start_epoch'], self.state['max_epochs']): self.state['epoch'] = epoch lr = self.adjust_learning_rate(optimizer) print('lr:',lr, '|', 'step:' ,self.state['epoch_step'],'|', 'decay: ', self.state['lr_decay']) # train for one epoch self.train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = self.validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > self.state['best_score'] self.state['best_score'] = max(prec1, self.state['best_score']) self.save_checkpoint({ 'epoch': epoch + 1, 'arch': self._state('arch'), 'state_dict': model.module.state_dict() if self.state['use_gpu'] else model.state_dict(), 'best_score': self.state['best_score'], }, is_best) print(' *** best={best:.3f}'.format(best=self.state['best_score'])) if self.state['neptune']: try: neptune.set_property('top', float(self.state['best_score'])) except: print("Neptune exception occurred") return self.state['best_score']
def log_property(self, name: str, value: Union[str, int, float]): if not self.disabled: neptune.set_property(name, str(value))
def main(**kwargs): import sys for k, v in kwargs.items(): sys.argv += [k, v] from pprint import pprint import argparse import datetime import json import os parser = argparse.ArgumentParser() parser.add_argument('--neptune_project_name', default='jacobarose/sandbox', type=str, help='Neptune.ai project name to log under') parser.add_argument('--experiment_name', default='pnas_minimal_example', type=str, help='Neptune.ai experiment name to log under') parser.add_argument('--config_path', default=r'/home/jacob/projects/pyleaves/pyleaves/configs/example_configs/pnas_resnet_config.json', type=str, help='JSON config file') parser.add_argument('-gpu', '--gpu_id', default='1', type=str, help='integer number of gpu to train on', dest='gpu_id') parser.add_argument('-tags', '--add-tags', default=[], type=str, nargs='*', help='Add arbitrary list of tags to apply to this run in neptune', dest='tags') parser.add_argument('-f', default=None) args = parser.parse_args() with open(args.config_path, 'r') as config_file: PARAMS = json.load(config_file) # print(gpu) # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) pprint(PARAMS) import tensorflow as tf import neptune # tf.debugging.set_log_device_placement(True) print(tf.__version__) import arrow import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import io from stuf import stuf from more_itertools import unzip from functools import partial # import tensorflow as tf # tf.compat.v1.enable_eager_execution() AUTOTUNE = tf.data.experimental.AUTOTUNE from pyleaves.leavesdb.tf_utils.tf_utils import set_random_seed, reset_keras_session import pyleaves from pyleaves.utils.img_utils import random_pad_image from pyleaves.utils.utils import ensure_dir_exists from pyleaves.datasets import leaves_dataset, fossil_dataset, pnas_dataset, base_dataset from pyleaves.models.vgg16 import VGG16, VGG16GrayScale from pyleaves.models import resnet, vgg16 from tensorflow.compat.v1.keras.callbacks import Callback, ModelCheckpoint, TensorBoard, LearningRateScheduler, EarlyStopping from tensorflow.keras import metrics from tensorflow.keras.preprocessing.image import load_img, img_to_array from tensorflow.keras import layers from tensorflow.keras import backend as K import tensorflow_datasets as tfds import neptune_tensorboard as neptune_tb seed = 346 # set_random_seed(seed) # reset_keras_session() def get_preprocessing_func(model_name): if model_name.startswith('resnet'): from tensorflow.keras.applications.resnet_v2 import preprocess_input elif model_name == 'vgg16': from tensorflow.keras.applications.vgg16 import preprocess_input elif model_name=='shallow': def preprocess_input(x): return x/255.0 # ((x/255.0)-0.5)*2.0 return preprocess_input #lambda x,y: (preprocess_input(x),y) def _load_img(image_path):#, img_size=(224,224)): img = tf.io.read_file(image_path) img = tf.image.decode_jpeg(img, channels=3) img = tf.image.convert_image_dtype(img, tf.float32) return img # return tf.compat.v1.image.resize_image_with_pad(img, *img_size) def _encode_label(label, num_classes=19): label = tf.cast(label, tf.int32) label = tf.one_hot(label, depth=num_classes) return label def _load_example(image_path, label, num_classes=19): img = _load_img(image_path) one_hot_label = _encode_label(label, num_classes=num_classes) return img, one_hot_label def _load_uint8_example(image_path, label, num_classes=19): img = tf.image.convert_image_dtype(_load_img(image_path)*255.0, dtype=tf.uint8) one_hot_label = 
_encode_label(label, num_classes=num_classes) return img, one_hot_label def rgb2gray_3channel(img, label): ''' Convert rgb image to grayscale, but keep num_channels=3 ''' img = tf.image.rgb_to_grayscale(img) img = tf.image.grayscale_to_rgb(img) return img, label def rgb2gray_1channel(img, label): ''' Convert rgb image to grayscale, num_channels from 3 to 1 ''' img = tf.image.rgb_to_grayscale(img) return img, label def log_data(logs): for k, v in logs.items(): neptune.log_metric(k, v) neptune_logger = tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: log_data(logs)) def focal_loss(gamma=2.0, alpha=4.0): gamma = float(gamma) alpha = float(alpha) def focal_loss_fixed(y_true, y_pred): """Focal loss for multi-classification FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t) Notice: y_pred is probability after softmax gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x) Focal Loss for Dense Object Detection https://arxiv.org/abs/1708.02002 Arguments: y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls] y_pred {tensor} -- model's output, shape of [batch_size, num_cls] Keyword Arguments: gamma {float} -- (default: {2.0}) alpha {float} -- (default: {4.0}) Returns: [tensor] -- loss. """ epsilon = 1.e-9 y_true = tf.convert_to_tensor(y_true, tf.float32) y_pred = tf.convert_to_tensor(y_pred, tf.float32) model_out = tf.add(y_pred, epsilon) ce = tf.multiply(y_true, -tf.log(model_out)) weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma)) fl = tf.multiply(alpha, tf.multiply(weight, ce)) reduced_fl = tf.reduce_max(fl, axis=1) return tf.reduce_mean(reduced_fl) return focal_loss_fixed def per_class_accuracy(y_true, y_pred): return tf.metrics.mean_per_class_accuracy(y_true, y_pred, num_classes=PARAMS['num_classes']) def build_model(model_params, optimizer, loss, METRICS): if model_params['name']=='vgg16': model_builder = vgg16.VGG16GrayScale(model_params) elif model_params['name'].startswith('resnet'): model_builder = resnet.ResNet(model_params) base = model_builder.build_base() model = model_builder.build_head(base) model.compile(optimizer=optimizer, loss=loss, metrics=METRICS) return model def build_shallow(input_shape=(224,224,3), num_classes=10, optimizer=None, loss=None, METRICS=None): model = tf.keras.models.Sequential() model.add(layers.Conv2D(64, (7, 7), activation='relu', input_shape=input_shape, kernel_initializer=tf.initializers.GlorotNormal())) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Conv2D(64, (7, 7), activation='relu', kernel_initializer=tf.initializers.GlorotNormal())) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Conv2D(64, (7, 7), activation='relu', kernel_initializer=tf.initializers.GlorotNormal())) model.add(layers.Flatten()) model.add(layers.Dense(64*2, activation='relu', kernel_initializer=tf.initializers.GlorotNormal())) model.add(layers.Dense(num_classes,activation='softmax', kernel_initializer=tf.initializers.GlorotNormal())) model.compile(optimizer=optimizer, loss=loss, metrics=METRICS) return model class ImageLogger: '''Tensorflow 2.0 version''' def __init__(self, log_dir: str, max_images: int, name: str): self.file_writer = tf.summary.create_file_writer(log_dir) self.log_dir = log_dir self.max_images = max_images self.name = name self._counter = tf.Variable(0, dtype=tf.int64) self.filepaths = [] def add_log(self, img, counter=None, name=None): ''' Intention is to generalize this to an abstract class for logging to any experiment management platform (e.g. 
neptune, mlflow, etc) Currently takes a filepath pointing to an image file and logs to current neptune experiment. ''' # scaled_images = (img - tf.math.reduce_min(img))/(tf.math.reduce_max(img) - tf.math.reduce_min(img)) # keep = 0 # scaled_images = tf.image.convert_image_dtype(tf.squeeze(scaled_images[keep,:,:,:]), dtype=tf.uint8) # scaled_images = tf.expand_dims(scaled_images, 0) # tf.summary.image(name=self.name, data=scaled_images, step=self._counter, max_outputs=self.max_images) scaled_img = (img - np.min(img))/(np.max(img) - np.min(img)) * 255.0 scaled_img = scaled_img.astype(np.uint32) neptune.log_image(log_name= name or self.name, x=counter, y=scaled_img) return scaled_img def __call__(self, images, labels): with self.file_writer.as_default(): scaled_images = (images - tf.math.reduce_min(images))/(tf.math.reduce_max(images) - tf.math.reduce_min(images)) keep = 0 scaled_images = tf.image.convert_image_dtype(tf.squeeze(scaled_images[keep,:,:,:]), dtype=tf.uint8) scaled_images = tf.expand_dims(scaled_images, 0) labels = tf.argmax(labels[[keep], :],axis=1) tf.summary.image(name=self.name, data=scaled_images, step=self._counter, max_outputs=self.max_images) filepath = os.path.join(self.log_dir,'sample_images',f'{self.name}-{self._counter}.jpg') scaled_images = tf.image.encode_jpeg(tf.squeeze(scaled_images)) tf.io.write_file(filename=tf.constant(filepath), contents=scaled_images) # self.add_log(scaled_images) self._counter.assign_add(1) return images, labels def _cond_apply(x, y, func, prob): """Conditionally apply func to x and y with probability prob. Parameters ---------- x : type Input to conditionally pass through func y : type Label func : type Function to conditionally be applied to x and y prob : type Probability of applying function, within range [0.0,1.0] Returns ------- x, y """ return tf.cond((tf.random.uniform([], 0, 1) >= (1.0 - prob)), lambda: func(x,y), lambda: (x,y)) class ImageAugmentor: """Short summary. Parameters ---------- augmentations : dict Maps a sequence of named augmentations to a scalar probability, according to which they'll be conditionally applied in order. resize_w_pad : tuple, default=None Description of parameter `resize_w_pad`. random_crop : tuple, default=None Description of parameter `random_crop`. random_jitter : dict First applies resize_w_pad, then random_crop. If user desires only 1 of these, set this to None. Should be a dict with 2 keys: 'resize':(height, width) 'crop_size':(crop_height,crop_width, channels) Only 1 of these 3 kwargs should be provided to any given augmentor: {'resize_w_pad', 'random_crop', 'random_jitter'} Example values for each: resize_w_pad=(224,224) random_crop=(224,224,3) random_jitter={'resize':(338,338), 'crop_size':(224,224, 3)} seed : int, default=None Random seed to apply to all augmentations Examples ------- Examples should be written in doctest format, and should illustrate how to use the function/class. 
>>> Attributes ---------- augmentations """ def __init__(self, name='', augmentations={'rotate':1.0, 'flip':1.0, 'color':1.0, 'rgb2gray_3channel':1.0}, resize_w_pad=None, random_crop=None, random_jitter={'resize':(338,338), 'crop_size':(224,224,3)}, log_dir=None, seed=None): self.name = name self.augmentations = augmentations self.seed = seed if resize_w_pad: self.target_h = resize_w_pad[0] self.target_w = resize_w_pad[1] # self.resize = self.resize_w_pad elif random_crop: self.crop_size = random_crop self.target_h = self.crop_size[0] self.target_w = self.crop_size[1] # self.resize = self.random_crop elif random_jitter: # self.target_h = tf.random.uniform([], random_jitter['crop_size'][0], random_jitter['resize'][0], dtype=tf.int32, seed=self.seed) # self.target_w = tf.random.uniform([], random_jitter['crop_size'][1], random_jitter['resize'][1], dtype=tf.int32, seed=self.seed) self.crop_size = random_jitter['crop_size'] # self.resize = self.random_jitter self.target_h = random_jitter['crop_size'][0] self.target_w = random_jitter['crop_size'][1] self.resize = self.resize_w_pad self.maps = {'rotate':self.rotate, 'flip':self.flip, 'color':self.color, 'rgb2gray_3channel':self.rgb2gray_3channel, 'rgb2gray_1channel':self.rgb2gray_1channel} self.log_dir = log_dir def rotate(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: """Rotation augmentation Args: x, tf.Tensor: Image label, tf.Tensor: arbitrary tensor, passes through unchanged Returns: Augmented image, label """ # Rotate 0, 90, 180, 270 degrees return tf.image.rot90(x, tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32,seed=self.seed)), label def flip(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: """Flip augmentation Args: x, tf.Tensor: Image to flip label, tf.Tensor: arbitrary tensor, passes through unchanged Returns: Augmented image, label """ x = tf.image.random_flip_left_right(x, seed=self.seed) x = tf.image.random_flip_up_down(x, seed=self.seed) return x, label def color(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: """Color augmentation Args: x, tf.Tensor: Image label, tf.Tensor: arbitrary tensor, passes through unchanged Returns: Augmented image, label """ x = tf.image.random_hue(x, 0.08, seed=self.seed) x = tf.image.random_saturation(x, 0.6, 1.6, seed=self.seed) x = tf.image.random_brightness(x, 0.05, seed=self.seed) x = tf.image.random_contrast(x, 0.7, 1.3, seed=self.seed) return x, label def rgb2gray_3channel(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: """Convert RGB image -> grayscale image, maintain number of channels = 3 Args: x, tf.Tensor: Image label, tf.Tensor: arbitrary tensor, passes through unchanged Returns: Augmented image, label """ x = tf.image.rgb_to_grayscale(x) x = tf.image.grayscale_to_rgb(x) return x, label def rgb2gray_1channel(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: """Convert RGB image -> grayscale image, reduce number of channels from 3 -> 1 Args: x, tf.Tensor: Image label, tf.Tensor: arbitrary tensor, passes through unchanged Returns: Augmented image, label """ x = tf.image.rgb_to_grayscale(x) return x, label def resize_w_pad(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: # TODO Finish this # random_pad_image(x,min_image_size=None,max_image_size=None,pad_color=None,seed=self.seed) return tf.image.resize_with_pad(x, target_height=self.target_h, target_width=self.target_w), label def random_crop(self, x: tf.Tensor, label: tf.Tensor) -> tf.Tensor: return tf.image.random_crop(x, size=self.crop_size), label @tf.function def random_jitter(self, x: tf.Tensor, 
label: tf.Tensor) -> tf.Tensor: x, label = self.resize_w_pad(x, label) x, label = self.random_crop(x, label) return x, label def apply_augmentations(self, dataset: tf.data.Dataset): """ Call this function to apply all of the augmentation in the order of specification provided to the constructor __init__() of ImageAugmentor. Args: dataset, tf.data.Dataset: must yield individual examples of form (x, y) Returns: Augmented dataset """ dataset = dataset.map(self.resize, num_parallel_calls=AUTOTUNE) for aug_name, aug_p in self.augmentations.items(): aug = self.maps[aug_name] dataset = dataset.map(lambda x,y: _cond_apply(x, y, aug, prob=aug_p), num_parallel_calls=AUTOTUNE) # dataset = dataset.map(lambda x,y: _cond_apply(x, y, func=aug, prob=aug_p), num_parallel_calls=AUTOTUNE) return dataset class ImageLoggerCallback(Callback): '''Tensorflow 2.0 version Callback that keeps track of a tf.data.Dataset and logs the correct batch to neptune based on the current batch. ''' def __init__(self, data :tf.data.Dataset, freq=1, max_images=-1, name='', encoder=None): self.data = data self.freq = freq self.max_images = max_images self.name = name self.encoder=encoder self.init_iterator() def init_iterator(self): self.data_iter = iter(self.data) self._batch = 0 self._count = 0 self.finished = False def yield_batch(self): batch_data = next(self.data_iter) self._batch += 1 self._count += batch_data[0].shape[0] return batch_data def add_log(self, img, counter=None, name=None): ''' Intention is to generalize this to an abstract class for logging to any experiment management platform (e.g. neptune, mlflow, etc) Currently takes a filepath pointing to an image file and logs to current neptune experiment. ''' scaled_img = (img - np.min(img))/(np.max(img) - np.min(img)) * 255.0 scaled_img = scaled_img.astype(np.uint32) neptune.log_image(log_name= name or self.name, x=counter, y=scaled_img) return scaled_img def on_train_batch_begin(self, batch, logs=None): if batch % self.freq or self.finished: return while batch >= self._batch: x, y = self.yield_batch() if self.max_images==-1: self.max_images=x.shape[0] if x.ndim==3: np.newaxis(x, axis=0) if x.shape[0]>self.max_images: x = x[:self.max_images,...] y = y[:self.max_images,...] 
x = x.numpy() y = np.argmax(y.numpy(),axis=1) if self.encoder: y = self.encoder.decode(y) for i in range(x.shape[0]): # self.add_log(x[i,...], counter=i, name = f'{self.name}-{y[i]}-batch_{str(self._batch).zfill(3)}') self.add_log(x[i,...], counter=self._count+i, name = f'{self.name}-{y[i]}') print(f'Batch {self._batch}: Logged {np.max([x.shape[0],self.max_images])} {self.name} images to neptune') def on_epoch_end(self, epoch, logs={}): self.finished = True class ConfusionMatrixCallback(Callback): '''Tensorflow 2.0 version''' def __init__(self, log_dir, imgs : dict, labels : dict, classes, freq=1, include_train=False, seed=None): self.file_writer = tf.summary.create_file_writer(log_dir) self.log_dir = log_dir self.seed = seed self._counter = 0 assert np.all(np.array(imgs.keys()) == np.array(labels.keys())) self.imgs = imgs for k,v in labels.items(): if v.ndim==2: labels[k] = tf.argmax(v,axis=-1) self.labels = labels self.num_samples = {k:l.numpy().shape[0] for k,l in labels.items()} self.classes = classes self.freq = freq self.include_train = include_train def log_confusion_matrix(self, model, imgs, labels, epoch, name='', norm_cm=False): pred_labels = model.predict_classes(imgs) # pred_labels = tf.argmax(pred_labels,axis=-1) pred_labels = pred_labels[:,None] con_mat = tf.math.confusion_matrix(labels=labels, predictions=pred_labels, num_classes=len(self.classes)).numpy() if norm_cm: con_mat = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2) con_mat_df = pd.DataFrame(con_mat, index = self.classes, columns = self.classes) figure = plt.figure(figsize=(12, 12)) sns.heatmap(con_mat_df, annot=True, cmap=plt.cm.Blues) plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') buf = io.BytesIO() plt.savefig(buf, format='png') buf.seek(0) image = tf.image.decode_png(buf.getvalue(), channels=4) image = tf.expand_dims(image, 0) with self.file_writer.as_default(): tf.summary.image(name=name+'_confusion_matrix', data=image, step=self._counter) neptune.log_image(log_name=name+'_confusion_matrix', x=self._counter, y=figure) plt.close(figure) self._counter += 1 return image def on_epoch_end(self, epoch, logs={}): if (not self.freq) or (epoch%self.freq != 0): return if self.include_train: cm_summary_image = self.log_confusion_matrix(self.model, self.imgs['train'], self.labels['train'], epoch=epoch, name='train') cm_summary_image = self.log_confusion_matrix(self.model, self.imgs['val'], self.labels['val'], epoch=epoch, name='val') #################################################################################### #################################################################################### #################################################################################### neptune.init(project_qualified_name=args.neptune_project_name) # neptune_tb.integrate_with_tensorflow() experiment_dir = '/media/data/jacob/sandbox_logs' experiment_name = args.experiment_name experiment_start_time = arrow.utcnow().format('YYYY-MM-DD_HH-mm-ss') log_dir =os.path.join(experiment_dir, experiment_name, 'log_dir',PARAMS['loss'], experiment_start_time) ensure_dir_exists(log_dir) print('Tensorboard log_dir: ', log_dir) # os.system(f'neptune tensorboard {log_dir} --project {args.neptune_project_name}') weights_best = os.path.join(log_dir, 'model_ckpt.h5') restore_best_weights=False histogram_freq=0 patience=25 num_epochs = PARAMS['num_epochs'] initial_epoch=0 src_db = pyleaves.DATABASE_PATH datasets = { 'PNAS': pnas_dataset.PNASDataset(src_db=src_db), 'Leaves': 
leaves_dataset.LeavesDataset(src_db=src_db), 'Fossil': fossil_dataset.FossilDataset(src_db=src_db) } # data = datasets[PARAMS['dataset_name']] data_config = stuf(threshold=PARAMS['data_threshold'], num_classes=PARAMS['num_classes'] , data_splits_meta={ 'train':PARAMS['train_size'], 'val':PARAMS['val_size'], 'test':PARAMS['test_size'] } ) preprocess_input = get_preprocessing_func(PARAMS['model_name']) preprocess_input(tf.zeros([4, 224, 224, 3])) load_example = partial(_load_uint8_example, num_classes=data_config.num_classes) # load_example = partial(_load_example, num_classes=data_config.num_classes) if PARAMS['num_channels']==3: color_aug = {'rgb2gray_3channel':1.0} elif PARAMS['num_channels']==1: color_aug = {'rgb2gray_1channel':1.0} resize_w_pad=None random_jitter=None if not PARAMS['random_jitter']['resize']: resize_w_pad = PARAMS['image_size'] else: random_jitter=PARAMS['random_jitter'] TRAIN_image_augmentor = ImageAugmentor(name='train', augmentations={**PARAMS["augmentations"], **color_aug},#'rotate':1.0,'flip':1.0,**color_aug}, resize_w_pad=resize_w_pad, random_crop=None, random_jitter=random_jitter, log_dir=log_dir, seed=None) VAL_image_augmentor = ImageAugmentor(name='val', augmentations={**color_aug}, resize_w_pad=PARAMS['image_size'], random_crop=None, random_jitter=None, log_dir=log_dir, seed=None) TEST_image_augmentor = ImageAugmentor(name='test', augmentations={**color_aug}, resize_w_pad=PARAMS['image_size'], random_crop=None, random_jitter=None, log_dir=log_dir, seed=None) def neptune_log_augmented_images(split_data, num_demo_samples=40, PARAMS=PARAMS): num_demo_samples = 40 cm_data_x = {'train':[],'val':[]} cm_data_y = {'train':[],'val':[]} cm_data_x['train'], cm_data_y['train'] = next(iter(get_data_loader(data=split_data['train'], data_subset_mode='train', batch_size=num_demo_samples, infinite=True, augment=False,seed=2836))) cm_data_x['val'], cm_data_y['val'] = next(iter(get_data_loader(data=split_data['val'], data_subset_mode='val', batch_size=num_demo_samples, infinite=True, augment=False, seed=2836))) for (k_x,v_x), (k_y, v_y) in zip(cm_data_x.items(), cm_data_y.items()): x = tf.data.Dataset.from_tensor_slices(v_x) y = tf.data.Dataset.from_tensor_slices(v_y) xy_data = tf.data.Dataset.zip((x, y)) v = xy_data.map(VAL_image_augmentor.resize, num_parallel_calls=AUTOTUNE) v_aug = TRAIN_image_augmentor.apply_augmentations(xy_data) v_x, v_y = [i.numpy() for i in next(iter(v.batch(10*num_demo_samples)))] v_x_aug, v_y_aug = [i.numpy() for i in next(iter(v_aug.batch(10*num_demo_samples)))] k = k_x for i in range(num_demo_samples): print(f'Neptune: logging {k}_{i}') print(f'{v_x[i].shape}, {v_x_aug[i].shape}') idx = np.random.randint(0,len(v_x)) if True: #'train' in k: TRAIN_image_augmentor.logger.add_log(v_x[idx],counter=i, name=k) TRAIN_image_augmentor.logger.add_log(v_x_aug[idx],counter=i, name=k+'_aug') def get_data_loader(data : tuple, data_subset_mode='train', batch_size=32, num_classes=None, infinite=True, augment=True, seed=2836): num_samples = len(data[0]) x = tf.data.Dataset.from_tensor_slices(data[0]) labels = tf.data.Dataset.from_tensor_slices(data[1]) data = tf.data.Dataset.zip((x, labels)) data = data.cache() if data_subset_mode == 'train': data = data.shuffle(buffer_size=num_samples) # data = data.map(lambda x,y: (tf.image.convert_image_dtype(load_img(x)*255.0,dtype=tf.uint8),y), num_parallel_calls=-1) # data = data.map(load_example, num_parallel_calls=AUTOTUNE) data = data.map(load_example, num_parallel_calls=AUTOTUNE) data = data.map(lambda x,y: 
(preprocess_input(x), y), num_parallel_calls=AUTOTUNE) if infinite: data = data.repeat() if data_subset_mode == 'train': data = data.shuffle(buffer_size=200, seed=seed) augmentor = TRAIN_image_augmentor elif data_subset_mode == 'val': augmentor = VAL_image_augmentor elif data_subset_mode == 'test': augmentor = TEST_image_augmentor if augment: data = augmentor.apply_augmentations(data) data = data.batch(batch_size, drop_remainder=True) return data.prefetch(AUTOTUNE) def get_tfds_data_loader(data : tf.data.Dataset, data_subset_mode='train', batch_size=32, num_samples=100, num_classes=19, infinite=True, augment=True, seed=2836): def encode_example(x, y): x = tf.image.convert_image_dtype(x, tf.float32) * 255.0 y = _encode_label(y, num_classes=num_classes) return x, y test_d = next(iter(data)) print(test_d[0].numpy().min()) print(test_d[0].numpy().max()) data = data.shuffle(buffer_size=num_samples) \ .cache() \ .map(encode_example, num_parallel_calls=AUTOTUNE) test_d = next(iter(data)) print(test_d[0].numpy().min()) print(test_d[0].numpy().max()) data = data.map(preprocess_input, num_parallel_calls=AUTOTUNE) test_d = next(iter(data)) print(test_d[0].numpy().min()) print(test_d[0].numpy().max()) if data_subset_mode == 'train': data = data.shuffle(buffer_size=100, seed=seed) augmentor = TRAIN_image_augmentor elif data_subset_mode == 'val': augmentor = VAL_image_augmentor elif data_subset_mode == 'test': augmentor = TEST_image_augmentor if augment: data = augmentor.apply_augmentations(data) test_d = next(iter(data)) print(test_d[0].numpy().min()) print(test_d[0].numpy().max()) data = data.batch(batch_size, drop_remainder=True) if infinite: data = data.repeat() return data.prefetch(AUTOTUNE) # y_true = [[0, 1, 0], [0, 0, 1]] # y_pred = [[0.05, 0.95, 0], [0.1, 0.8, 0.1]] def accuracy(y_true, y_pred): y_pred = tf.argmax(y_pred, axis=-1) y_true = tf.argmax(y_true, axis=-1) return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32)) def true_pos(y_true, y_pred): # y_true = K.ones_like(y_true) return K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) def false_pos(y_true, y_pred): # y_true = K.ones_like(y_true) true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) all_positives = K.sum(K.round(K.clip(y_true, 0, 1))) return all_positives - true_positives def true_neg(y_true, y_pred): # y_true = K.ones_like(y_true) return K.sum(1-K.round(K.clip(y_true * y_pred, 0, 1))) def recall(y_true, y_pred): # y_true = K.ones_like(y_true) true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) all_positives = K.sum(K.round(K.clip(y_true, 0, 1))) recall = true_positives / (all_positives + K.epsilon()) return recall def precision(y_true, y_pred): y_true = K.ones_like(y_true) true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) precision = true_positives / (predicted_positives + K.epsilon()) # tf.print(y_true, y_pred) return precision def f1_score(y_true, y_pred): m_precision = precision(y_true, y_pred) m_recall = recall(y_true, y_pred) # pdb.set_trace() return 2*((m_precision*m_recall)/(m_precision+m_recall+K.epsilon())) # def false_neg(y_true, y_pred): # y_true = K.ones_like(~y_true) # true_neg = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) # all_negative = K.sum(K.round(K.clip(y_true, 0, 1))) # return all_negatives - true_ # return K.mean(K.argmax(y_true,axis=1)*K.argmax(y_pred,axis=1)) # 'accuracy', # metrics.TrueNegatives(name='tn'), # metrics.FalseNegatives(name='fn'), METRICS = [ f1_score, metrics.TruePositives(name='tp'), 
    metrics.FalsePositives(name='fp'),
    metrics.CategoricalAccuracy(name='accuracy'),
    metrics.TopKCategoricalAccuracy(name='top_3_categorical_accuracy', k=3),
    metrics.TopKCategoricalAccuracy(name='top_5_categorical_accuracy', k=5)
]

PARAMS['sys.argv'] = ' '.join(sys.argv)

with neptune.create_experiment(name=experiment_name, params=PARAMS, upload_source_files=[__file__]):
    print('Logging experiment tags:')
    for tag in args.tags:
        print(tag)
        neptune.append_tag(tag)
    neptune.append_tag(PARAMS['dataset_name'])
    neptune.append_tag(PARAMS['model_name'])
    neptune.log_artifact(args.config_path)

    cm_data_x = {'train': [], 'val': []}
    cm_data_y = {'train': [], 'val': []}

    if PARAMS['dataset_name'] in tfds.list_builders():
        num_demo_samples = 40
        tfds_builder = tfds.builder(PARAMS['dataset_name'])
        tfds_builder.download_and_prepare()

        num_samples = tfds_builder.info.splits['train'].num_examples
        num_samples_dict = {'train': int(num_samples * PARAMS['train_size']),
                            'val': int(num_samples * PARAMS['val_size']),
                            'test': int(num_samples * PARAMS['test_size'])}
        classes = tfds_builder.info.features['label'].names
        num_classes = len(classes)

        train_slice = [0, int(PARAMS['train_size'] * 100)]
        val_slice = [int(PARAMS['train_size'] * 100),
                     int((PARAMS['train_size'] + PARAMS['val_size']) * 100)]
        test_slice = [100 - int(PARAMS['test_size'] * 100), 100]

        tfds_train_data = tfds.load(PARAMS['dataset_name'],
                                    split=f"train[{train_slice[0]}%:{train_slice[1]}%]",
                                    shuffle_files=True, as_supervised=True)
        tfds_validation_data = tfds.load(PARAMS['dataset_name'],
                                         split=f"train[{val_slice[0]}%:{val_slice[1]}%]",
                                         shuffle_files=True, as_supervised=True)
        tfds_test_data = tfds.load(PARAMS['dataset_name'],
                                   split=f"train[{test_slice[0]}%:{test_slice[1]}%]",
                                   shuffle_files=True, as_supervised=True)

        train_data = get_tfds_data_loader(data=tfds_train_data, data_subset_mode='train',
                                          batch_size=PARAMS['batch_size'],
                                          num_samples=num_samples_dict['train'],
                                          num_classes=num_classes,
                                          infinite=True, augment=True, seed=2836)
        validation_data = get_tfds_data_loader(data=tfds_validation_data, data_subset_mode='val',
                                               batch_size=PARAMS['batch_size'],
                                               num_samples=num_samples_dict['val'],
                                               num_classes=num_classes,
                                               infinite=True, augment=True, seed=2837)
        test_data = get_tfds_data_loader(data=tfds_test_data, data_subset_mode='test',
                                         batch_size=PARAMS['batch_size'],
                                         num_samples=num_samples_dict['test'],
                                         num_classes=num_classes,
                                         infinite=True, augment=True, seed=2838)

        split_data = {
            'train': get_tfds_data_loader(data=tfds_train_data, data_subset_mode='train',
                                          batch_size=num_demo_samples,
                                          num_samples=num_samples_dict['train'],
                                          num_classes=num_classes,
                                          infinite=True, augment=True, seed=2836),
            'val': get_tfds_data_loader(data=tfds_validation_data, data_subset_mode='val',
                                        batch_size=num_demo_samples,
                                        num_samples=num_samples_dict['val'],
                                        num_classes=num_classes,
                                        infinite=True, augment=True, seed=2837),
            'test': get_tfds_data_loader(data=tfds_test_data, data_subset_mode='test',
                                         batch_size=num_demo_samples,
                                         num_samples=num_samples_dict['test'],
                                         num_classes=num_classes,
                                         infinite=True, augment=True, seed=2838)
        }

        steps_per_epoch = num_samples_dict['train'] // PARAMS['batch_size']
        validation_steps = num_samples_dict['val'] // PARAMS['batch_size']

        cm_data_x['train'], cm_data_y['train'] = next(iter(split_data['train']))
        cm_data_x['val'], cm_data_y['val'] = next(iter(split_data['val']))
    else:
        data = datasets[PARAMS['dataset_name']]
        neptune.set_property('num_classes', data.num_classes)
        neptune.set_property('class_distribution', data.metadata.class_distribution)

        encoder = base_dataset.LabelEncoder(data.data.family)
        split_data = base_dataset.preprocess_data(data, encoder, data_config)
        for subset, subset_data in split_data.items():
            split_data[subset] = [list(i) for i in unzip(subset_data)]

        PARAMS['batch_size'] = 32
        steps_per_epoch = len(split_data['train'][0]) // PARAMS['batch_size']
        validation_steps = len(split_data['val'][0]) // PARAMS['batch_size']

        split_datasets = {
            k: base_dataset.BaseDataset.from_dataframe(
                pd.DataFrame({
                    'path': v[0],
                    'family': v[1]
                }))
            for k, v in split_data.items()
        }
        for k, v in split_datasets.items():
            print(k, v.num_classes)

        classes = split_datasets['train'].classes

        train_data = get_data_loader(data=split_data['train'], data_subset_mode='train',
                                     batch_size=PARAMS['batch_size'],
                                     infinite=True, augment=True, seed=2836)
        validation_data = get_data_loader(data=split_data['val'], data_subset_mode='val',
                                          batch_size=PARAMS['batch_size'],
                                          infinite=True, augment=True, seed=2837)
        if 'test' in split_data.keys():
            test_data = get_data_loader(data=split_data['test'], data_subset_mode='test',
                                        batch_size=PARAMS['batch_size'],
                                        infinite=True, augment=True, seed=2838)

        num_demo_samples = 150
        # neptune_log_augmented_images(split_data, num_demo_samples=num_demo_samples, PARAMS=PARAMS)
        cm_data_x['train'], cm_data_y['train'] = next(iter(
            get_data_loader(data=split_data['train'], data_subset_mode='train',
                            batch_size=num_demo_samples, infinite=True, augment=True, seed=2836)))
        cm_data_x['val'], cm_data_y['val'] = next(iter(
            get_data_loader(data=split_data['val'], data_subset_mode='val',
                            batch_size=num_demo_samples, infinite=True, augment=True, seed=2836)))

    ########################################################################################
    train_image_logger_cb = ImageLoggerCallback(data=train_data, freq=20, max_images=-1,
                                                name='train', encoder=encoder)
    val_image_logger_cb = ImageLoggerCallback(data=validation_data, freq=20, max_images=-1,
                                              name='val', encoder=encoder)
    ########################################################################################
    cm_callback = ConfusionMatrixCallback(log_dir, cm_data_x, cm_data_y, classes=classes,
                                          seed=PARAMS['seed'], include_train=True)
    checkpoint = ModelCheckpoint(weights_best, monitor='val_loss', verbose=0,
                                 save_best_only=True, save_weights_only=False,
                                 mode='min', restore_best_weights=restore_best_weights)
    tfboard = TensorBoard(log_dir=log_dir, histogram_freq=histogram_freq, write_images=True)
    early = EarlyStopping(monitor='val_loss', patience=patience, verbose=1)
    callbacks = [checkpoint, tfboard, early, cm_callback, neptune_logger,
                 train_image_logger_cb, val_image_logger_cb]

    ##########################
    if PARAMS['optimizer'] == 'Adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=PARAMS['lr'])
    elif PARAMS['optimizer'] == 'Nadam':
        optimizer = tf.keras.optimizers.Nadam(learning_rate=PARAMS['lr'])
    elif PARAMS['optimizer'] == 'SGD':
        optimizer = tf.keras.optimizers.SGD(learning_rate=PARAMS['lr'])

    ##########################
    if PARAMS['loss'] == 'focal_loss':
        loss = focal_loss(gamma=2.0, alpha=4.0)
    elif PARAMS['loss'] == 'categorical_crossentropy':
        loss = 'categorical_crossentropy'

    ##########################
    model_params = stuf(name=PARAMS['model_name'],
                        model_dir=os.path.join(experiment_dir, experiment_name, 'models'),
                        num_classes=PARAMS['num_classes'],
                        frozen_layers=PARAMS['frozen_layers'],
                        input_shape=(*PARAMS['image_size'], PARAMS['num_channels']),
                        base_learning_rate=PARAMS['lr'],
                        regularization=PARAMS['regularization'])
    ####
    if PARAMS['model_name'] == 'shallow':
        model = build_shallow(input_shape=model_params.input_shape,
                              num_classes=PARAMS['num_classes'],
                              optimizer=optimizer,
                              loss=loss,
                              METRICS=METRICS)
    else:
        model = build_model(model_params, optimizer, loss, METRICS)

    print(f"TRAINING {PARAMS['model_name']}")

    model.summary(print_fn=lambda x: neptune.log_text('model_summary', x))

    history = model.fit(train_data,
                        epochs=num_epochs,
                        callbacks=callbacks,
                        validation_data=validation_data,
                        shuffle=True,
                        initial_epoch=initial_epoch,
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps)

    if 'test' in split_data:
        results = model.evaluate(test_data, steps=len(split_data['test'][0]))
    else:
        results = model.evaluate(validation_data, steps=validation_steps)
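    # Hedged follow-up (not in the original script): the final evaluation results
    # could also be pushed to Neptune, assuming `results` is the list returned by
    # model.evaluate() and lines up with model.metrics_names.
    for metric_name, metric_value in zip(model.metrics_names, results):
        neptune.log_metric('eval_' + metric_name, metric_value)
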
def test_main(args, neptune):
    # some constants
    error_scaler = 1E8
    ar = [1240, 1460]  # anomaly range @200608-03:10
    in_n = args.dim_input

    # load model and obtain some stats
    model = torch.load(args.out_dir + '/' + args.exp_id).to('cpu')
    fp = open(args.stat_file, 'r')
    lines = fp.readlines()
    x_avg = torch.tensor([float(s) for s in lines[0].split(',')])
    x_std = torch.tensor([float(s) for s in lines[1].split(',')])
    fp.close()

    # 1. load validation and test data
    val_data = np.loadtxt(args.val_path, delimiter=',')
    val_data = torch.tensor(val_data).type(torch.float32)
    val_recon = forward(val_data, model, x_avg, x_std, args)
    val_err = torch.sum((val_recon - val_data) ** 2, dim=1, keepdim=True)  # squared error
    ve = val_err * error_scaler

    test_data = np.loadtxt(args.test_path, delimiter=',')
    test_lbl = test_data[:, -1]
    data_len = test_data.shape[0]  # retrieve labels and length
    test_data = torch.tensor(test_data[:, :-1]).type(torch.float32)
    test_recon = forward(test_data, model, x_avg, x_std, args)
    test_err = torch.sum((test_recon - test_data) ** 2, dim=1, keepdim=True)  # squared error
    te = test_err * error_scaler

    # 2. measure validation error and test error
    neptune.set_property('validation error', torch.sum(ve).item())
    neptune.set_property('test error', torch.sum(te).item())

    # 3. plot reconstruction results
    cols = ['sensor1', 'sensor2']  # features
    ids_col = range(test_data.shape[0])  # for index
    for j, (data, recon, split_name) in enumerate([(val_data, val_recon, 'Validation'),
                                                   (test_data, test_recon, 'Test')]):
        fig, axs = plt.subplots(len(cols), 1, figsize=(12, 3))
        for i, col in enumerate(cols):
            axs[i].plot(ids_col, data.numpy()[:, i], '-c', linewidth=2, label='Raw Data')
            axs[i].plot(ids_col, recon.detach().numpy()[:, i], '-b', linewidth=1,
                        label='Reconstructed Data')
        axs[1].legend()  # only add legend for second row
        fig.suptitle('Time Series of ' + split_name)
        log_chart('Data-Reconstruction', fig)

    # 4. find threshold
    T = (torch.mean(ve) + 2 * torch.std(ve)).item()
    T_ = np.empty((data_len, 1))
    T_[:] = T  # for plotting threshold

    if args.use_smoothing == 1:
        ve = smooth(ve.detach().numpy(), args.window_size)
        # smoothing removes the first window_size samples: (Tx, 1) -> (Tx - window_size + 1, 1)
        te = smooth(te.detach().numpy(), args.window_size)

    pred = (te > T)  # pred is classification result
    pad = np.empty((args.window_size - 1, 1))
    pad[:] = 0
    pred = np.vstack([pad, pred])  # add 0 padding

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    acc = accuracy_score(test_lbl, pred)
    neptune.set_property('acc', acc)
    prec = precision_score(test_lbl, pred)
    neptune.set_property('prec', prec)
    rec = recall_score(test_lbl, pred)
    neptune.set_property('rec', rec)
    f1 = f1_score(test_lbl, pred)
    neptune.set_property('f1', f1)

    # 5. draw plot
    fig = plt.figure(figsize=(24, 4))
    ids = list(range(data_len))
    pad[:] = np.nan
    te = np.vstack([pad, te])  # add nan padding
    plt.plot(ids[:ar[0]], te[:ar[0]], '-c', label='Test Reconstruction Error (Normal)')
    plt.plot(ids[ar[0]:ar[1]], te[ar[0]:ar[1]], '-r', label='Test Reconstruction Error (Anomaly)')
    plt.plot(ids[ar[1]:], te[ar[1]:], '-c')
    plt.plot(ids, T_, '--b', label='Threshold')
    plt.xlabel('Time')
    plt.ylabel('Error')
    plt.legend()
    # plt.ylim((0, 2E5))
    plt.title('Reconstruction Error')
    log_chart('Reconstruction Error', fig)
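# Hedged sketch (not from the source): test_main calls a `smooth` helper that is
# not shown here. A moving-average version consistent with the shape comment
# ((Tx, 1) -> (Tx - window_size + 1, 1)) could look like this.
import numpy as np

def smooth(x, window_size):
    kernel = np.ones(window_size) / window_size
    smoothed = np.convolve(x.ravel(), kernel, mode='valid')
    return smoothed.reshape(-1, 1)
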
def train_imagenette(PARAMS):
    neptune.append_tag(PARAMS['dataset_name'])
    neptune.append_tag(PARAMS['model_name'])

    K.clear_session()
    tf.random.set_seed(34)

    target_size = PARAMS['target_size']
    BATCH_SIZE = PARAMS['BATCH_SIZE']

    train_dataset, validation_dataset, info = create_Imagenette_dataset(
        BATCH_SIZE, target_size=target_size, augment_train=PARAMS['augment_train'])
    num_classes = info.features['label'].num_classes
    encoder = base_dataset.LabelEncoder(info.features['label'].names)

    train_dataset = train_dataset.map(
        lambda x, y: apply_preprocess(x, y, num_classes), num_parallel_calls=-1)
    validation_dataset = validation_dataset.map(
        lambda x, y: apply_preprocess(x, y, num_classes), num_parallel_calls=-1)

    PARAMS['num_classes'] = num_classes
    steps_per_epoch = info.splits['train'].num_examples // BATCH_SIZE
    validation_steps = info.splits['validation'].num_examples // BATCH_SIZE

    neptune.set_property('num_classes', num_classes)
    neptune.set_property('steps_per_epoch', steps_per_epoch)
    neptune.set_property('validation_steps', validation_steps)

    optimizer = tf.keras.optimizers.Adam(learning_rate=PARAMS['learning_rate'])
    loss = 'categorical_crossentropy'
    METRICS = ['accuracy']

    base = tf.keras.applications.vgg16.VGG16(
        weights='imagenet',
        include_top=False,
        input_tensor=Input(shape=(*target_size, 3)))
    # TODO: try freezing weights for input_shape != (224, 224)
    model = build_head(base, num_classes=num_classes)
    model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)

    callbacks = [
        neptune_logger,
        ImageLoggerCallback(data=train_dataset, freq=10, max_images=-1, name='train', encoder=encoder),
        ImageLoggerCallback(data=validation_dataset, freq=10, max_images=-1, name='val', encoder=encoder),
        EarlyStopping(monitor='val_loss', patience=2, verbose=1)
    ]

    model.summary(print_fn=lambda x: neptune.log_text('model_summary', x))
    pprint(PARAMS)

    history = model.fit(train_dataset,
                        epochs=10,
                        callbacks=callbacks,
                        validation_data=validation_dataset,
                        shuffle=True,
                        initial_epoch=0,
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps)
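# Hedged sketch (not the project's actual code): build_head is defined elsewhere.
# A minimal classification head on top of the VGG16 base could look like this;
# the layer sizes are illustrative assumptions.
from tensorflow.keras import layers, Model

def build_head(base, num_classes):
    x = layers.GlobalAveragePooling2D()(base.output)
    x = layers.Dense(256, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return Model(inputs=base.input, outputs=outputs)
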
def train_main(args, neptune):
    device = torch.device("cuda")

    # iterators
    trainiter = RNNIterator(args.tr_path, stat_file=args.stat_file, batch_size=args.batch_size)
    validiter = RNNIterator(args.val_path, stat_file=args.stat_file, batch_size=args.batch_size)
    testiter = RNNIterator(args.test_path, stat_file=args.stat_file, batch_size=args.batch_size)

    if args.n_cmt > 1:
        model = RNN_ESM(n_cmt=args.n_cmt,
                        dim_input=args.dim_input,
                        dim_lstm_hidden=args.dim_lstm_hidden,
                        dim_fc_hidden=args.dim_fc_hidden,
                        dim_output=args.dim_out).to(device)
    elif args.n_cmt == 1:
        model = RNN_MODEL1(dim_input=args.dim_input,
                           dim_lstm_hidden=args.dim_lstm_hidden,
                           dim_fc_hidden=args.dim_fc_hidden,
                           dim_output=args.dim_out).to(device)
    else:
        print('n_cmt must be a natural number')
        import sys
        sys.exit(0)

    start = time.time()

    # train the model
    if args.n_cmt > 1:
        for i in range(args.n_cmt):
            model.model_list[i] = train(net=model.model_list[i],
                                        train_loader=trainiter,
                                        valid_loader=validiter,
                                        patience=args.patience,
                                        args=args,
                                        dtype=torch.float32,
                                        device=device,
                                        savedir=args.out_dir + '/' + args.out_file,
                                        neptune=neptune)
    else:
        model = train(net=model,
                      train_loader=trainiter,
                      valid_loader=validiter,
                      patience=args.patience,
                      args=args,
                      dtype=torch.float32,
                      device=device,
                      savedir=args.out_dir + '/' + args.out_file,
                      neptune=neptune)

    acc, prec, rec, f1 = test(model, testiter, device)
    print('acc: {:.4f} | prec: {:.4f} | rec: {:.4f} | f1: {:.4f}'.format(acc, prec, rec, f1))
    neptune.set_property('acc', acc)
    neptune.set_property('prec', prec)
    neptune.set_property('rec', rec)
    neptune.set_property('f1', f1)
def record_eval_metric(neptune, metrics):
    for k, v in metrics.items():
        neptune.log_metric(k, v)


# %%
model_path = '/workspace/ml-workspace/thesis_git/thesis/models/'
best_eval_f1 = 0

# Measure the total training time for the whole run.
total_t0 = time.time()

with neptune.create_experiment(name="HierarchicalSemanticGraphNetwork",
                               params=PARAMS,
                               upload_source_files=['HSGN_GAT.py']):
    neptune.append_tag(["homogeneous_graph", "GATConv", "bidirectional_token_node_edge"])
    neptune.set_property('server', 'IRGPU2')
    neptune.set_property('training_set_path', training_path)
    neptune.set_property('dev_set_path', dev_path)

    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
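        # Hedged sketch (not from the source; the rest of this loop is truncated):
        # once the epoch's evaluation metrics are available, the record_eval_metric
        # helper defined above is presumably used roughly like this, where
        # evaluate() and its return value are assumptions for illustration:
        #
        #   eval_metrics = evaluate(model, dev_dataloader)  # e.g. {'eval_f1': ..., 'eval_loss': ...}
        #   record_eval_metric(neptune, eval_metrics)
        #   if eval_metrics['eval_f1'] > best_eval_f1:
        #       best_eval_f1 = eval_metrics['eval_f1']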
import neptune
import numpy as np

# select project
neptune.init('USERNAME/example-project')

# define parameters
PARAMS = {'timeseries_factor': 1.7,
          'n_iterations': 200,
          'n_images': 7}

# create experiment
neptune.create_experiment(name='timeseries_example', params=PARAMS)

# log some metrics
for i in range(1, PARAMS['n_iterations']):
    neptune.log_metric('iteration', i)
    neptune.log_metric('timeseries', PARAMS['timeseries_factor'] * np.cos(i / 10))
    neptune.log_text('text_info', 'some value {}'.format(0.95 * i ** 2))

# log property (key:value pair)
neptune.set_property('timeseries_data_hash', '123e4567')

# add tag to the experiment
neptune.append_tag('timeseries_modeling')

# log some images
for j in range(PARAMS['n_images']):
    array = np.random.rand(10, 10, 3) * 255
    array = np.repeat(array, 30, 0)
    array = np.repeat(array, 30, 1)
    neptune.log_image('mosaics', array)

neptune.stop()
def run_roshambo():
    seed = 0x1B
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    neptune.set_property("seed", seed)
    neptune.append_tag("ROSHAMBO")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _logger.info("Using device type %s", str(device))

    reduction_factor = 5  # Reduce dimension axis by this factor
    neptune.set_property("reduction_factor", reduction_factor)
    width = 240 // reduction_factor
    height = 180 // reduction_factor
    n_features = width * height * 2

    batch_size = 5
    neptune.set_property("batch_size", batch_size)

    dt = 1 * ms
    neptune.set_property("dt", dt)
    bin_size = 50 * ms
    neptune.set_property("bin_size", bin_size)
    bin_steps = rescale(bin_size, dt, int)
    duration_per_sample = 500 * ms
    neptune.set_property("duration_per_sample", duration_per_sample)
    number_of_steps = rescale(duration_per_sample, dt, int)

    topology = SmallWorldTopology(
        SmallWorldTopology.Configuration(
            minicolumn_shape=(7, 7, 7),
            macrocolumn_shape=(3, 3, 3),
            minicolumn_spacing=300,
            p_max=0.025,
            sparse_init=True,
        )
    )
    n_neurons = topology.number_of_nodes()

    nb_of_bins = 1 + number_of_steps // bin_steps
    linear_readout = LinearWithBN(n_neurons * nb_of_bins, 3).to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(linear_readout.parameters(), lr=0.001)
    neptune.set_property("adam.lr", 0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
    neptune.set_property("steplr.gamma", 0.1)
    neptune.set_property("steplr.step_size", 2)

    p_critical_configs = {
        "alpha": 0.0025,
        "beta": 0.00025,
        "tau_v": 50 * ms,
        "tau_i": 5 * ms,
        "v_th": 1.0,
    }
    for k, v in p_critical_configs.items():
        neptune.set_property(k, v)

    model = PCritical(
        n_features,
        batch_size,
        topology,
        dt=dt,
        **p_critical_configs,
    ).to(device)

    all_transforms = Compose(
        [
            ScaleDown(240, 180, factor=reduction_factor),
            ToDense(width, height, duration_per_sample, dt=dt),
            Flatten(),
        ]
    )

    label_dict = {
        "scissors": 0,
        "paper": 1,
        "rock": 2,
    }

    data = INIRoshambo(
        os.getenv("ROSHAMBO_DATASET_LOCATION_500ms_subsamples"),
        transforms=all_transforms,
    )
    train_data, val_data = split_per_user(data, train_ratio=0.85)
    _logger.info(
        "Keeping %i samples for training and %i for validation",
        len(train_data),
        len(val_data),
    )

    def labels_to_tensor(labels):
        return torch.tensor([label_dict[l] for l in labels])

    def run_batch(X, y):
        current_batch_size = len(y)
        model.batch_size = current_batch_size
        bins = torch.zeros(current_batch_size, n_neurons, nb_of_bins, device=device)
        for t in range(number_of_steps):
            out_spikes = model.forward(X[:, :, t])
            bins[:, :, t // bin_steps] += out_spikes
        return bins

    for iter_nb in range(10):
        train_generator = torch_data.DataLoader(
            train_data,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            pin_memory=True,
            timeout=120,
        )
        for i, (X, labels) in enumerate(tqdm(train_generator)):
            if i >= 20:
                break
            neptune.log_metric("iteration", i)
            X, y = X.to(device), labels_to_tensor(labels).to(device)

            bins = run_batch(X, y)

            optimizer.zero_grad()
            out = linear_readout(bins.view(len(y), -1))
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()

            loss_val = loss.cpu().detach().item()
            _logger.info("Loss: %.3f", loss_val)
            neptune.log_metric("loss", loss_val)

        total_accurate = 0
        total_elems = 0
        val_generator = torch_data.DataLoader(
            val_data,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2,
            pin_memory=True,
            timeout=120,
        )
        for i, (X, labels) in enumerate(tqdm(val_generator)):
            if i >= 10:
                break
            X, y = X.to(device), labels_to_tensor(labels).to(device)
            bins = run_batch(X, y)
            out = linear_readout(bins.view(len(y), -1))
            preds = torch.argmax(out, dim=1)
            total_accurate += torch.sum(preds == y).cpu().float().item()
            total_elems += len(y)
            _logger.info("Current accuracy: %.4f", total_accurate / total_elems)
            neptune.log_metric("current_accuracy", total_accurate / total_elems)

        scheduler.step()
        _logger.info(
            "Final accuracy at iter %i: %.4f", iter_nb, total_accurate / total_elems
        )
        neptune.log_metric("final_accuracy", total_accurate / total_elems)
def train_pnas(PARAMS):
    ensure_dir_exists(PARAMS['log_dir'])
    ensure_dir_exists(PARAMS['model_dir'])

    neptune.append_tag(PARAMS['dataset_name'])
    neptune.append_tag(PARAMS['model_name'])
    neptune.append_tag(str(PARAMS['target_size']))
    neptune.append_tag(PARAMS['num_channels'])
    neptune.append_tag(PARAMS['color_mode'])

    K.clear_session()
    tf.random.set_seed(34)

    train_dataset, validation_dataset, data_files = create_dataset(
        dataset_name=PARAMS['dataset_name'],
        batch_size=PARAMS['BATCH_SIZE'],
        target_size=PARAMS['target_size'],
        num_channels=PARAMS['num_channels'],
        color_mode=PARAMS['color_mode'],
        splits=PARAMS['splits'],
        augment_train=PARAMS['augment_train'],
        aug_prob=PARAMS['aug_prob'])

    PARAMS['num_classes'] = data_files.num_classes
    PARAMS['splits_size'] = {'train': {}, 'validation': {}}
    PARAMS['splits_size']['train'] = data_files.num_samples * PARAMS['splits']['train']
    PARAMS['splits_size']['validation'] = data_files.num_samples * PARAMS['splits']['validation']

    steps_per_epoch = PARAMS['splits_size']['train'] // PARAMS['BATCH_SIZE']
    validation_steps = PARAMS['splits_size']['validation'] // PARAMS['BATCH_SIZE']

    neptune.set_property('num_classes', PARAMS['num_classes'])
    neptune.set_property('steps_per_epoch', steps_per_epoch)
    neptune.set_property('validation_steps', validation_steps)

    encoder = base_dataset.LabelEncoder(data_files.classes)

    callbacks = [
        neptune_logger,
        ImageLoggerCallback(data=train_dataset, freq=10, max_images=-1, name='train', encoder=encoder),
        ImageLoggerCallback(data=validation_dataset, freq=10, max_images=-1, name='val', encoder=encoder),
        EarlyStopping(monitor='val_loss', patience=25, verbose=1)
    ]

    PARAMS['base_learning_rate'] = PARAMS['lr']
    PARAMS['input_shape'] = (*PARAMS['target_size'], PARAMS['num_channels'])
    model = build_model(PARAMS)

    model.summary(print_fn=lambda x: neptune.log_text('model_summary', x))
    pprint(PARAMS)

    history = model.fit(train_dataset,
                        epochs=PARAMS['num_epochs'],
                        callbacks=callbacks,
                        validation_data=validation_dataset,
                        shuffle=True,
                        initial_epoch=0,
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps)

    for k, v in PARAMS.items():
        neptune.set_property(str(k), str(v))

    return history
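# Hedged follow-up (not in the original): if the neptune_logger callback does not
# already stream per-epoch metrics, the History returned by train_pnas could be
# logged explicitly; the 'history_' channel names are illustrative.
def log_history_to_neptune(history):
    for metric_name, values in history.history.items():
        for epoch, value in enumerate(values):
            neptune.log_metric('history_' + metric_name, epoch, y=value)
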
## Create an experiment and log model hyper-parameters
neptune.create_experiment(name='pytorch-run',
                          tags=['pytorch', 'MNIST'],
                          params=PARAMS)

## Log data version to the experiment
dataset = datasets.MNIST('../data',
                         train=True,
                         download=True,
                         transform=transforms.Compose([transforms.ToTensor()]))
neptune.set_property(
    'data_version',
    hashlib.md5(dataset.data.cpu().detach().numpy()).hexdigest())

## Log losses, accuracy score and image predictions during training
train_loader = torch.utils.data.DataLoader(dataset,
                                           batch_size=PARAMS['batch_size'],
                                           shuffle=True)

model = Net(PARAMS['fc_out_features'])
optimizer = optim.SGD(model.parameters(), PARAMS['lr'], PARAMS['momentum'])

for batch_idx, (data, target) in enumerate(train_loader):
    optimizer.zero_grad()
    outputs = model(data)
    loss = F.nll_loss(outputs, target)
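    # Hedged continuation (not from the source): the loop is cut off after the loss
    # computation; a typical completion consistent with the comment about logging
    # losses and accuracy. The channel names 'batch_loss' and 'batch_acc' are illustrative.
    loss.backward()
    optimizer.step()

    neptune.log_metric('batch_loss', loss.item())
    preds = outputs.argmax(dim=1)
    neptune.log_metric('batch_acc', (preds == target).float().mean().item())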