def run_search(search_param, args, params):
    """Train a model and evaluate it for one parameter setting during parameter search"""
    # Define a unique job name
    job_name = "{}_{}".format(args.hyperparameter, search_param)

    # Set the logger
    set_logger(os.path.join(args.parent_dir, job_name + '.log'))

    # Train the model
    corpus, dictionary, _ = process_data(path=args.train_path, params=params,
                                         dictionary=None)
    lda = train_lda(corpus, params, dictionary)

    # Save results
    log_results(lda, params, corpus, dictionary, True, args)

    if not args.test_mode:
        # Evaluate the model on the dev set
        eval_results(dev_path=args.dev_path, lda=lda, dictionary=dictionary,
                     params=params)
    else:
        # Evaluate the model on the test set
        eval_results(dev_path='data/processed/test/test_200.csv', lda=lda,
                     dictionary=dictionary, params=params)
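# Nearly every snippet below calls a shared set_logger helper before doing
# anything else. A minimal sketch of the assumed single-argument variant that
# takes the full log-file path (a few later snippets call a two-argument
# set_logger(log_dir, filename) variant instead, which would join the two
# before doing the same thing); this is an illustration, not any repo's
# actual implementation:
import logging


def set_logger(log_path):
    """Send INFO-level log records both to `log_path` and to the console."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Guard against adding duplicate handlers on repeated calls
    if not logger.handlers:
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)

        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)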
def main():
    # Load the parameters from the json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    params = Params(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info('Creating the dataset...')
    data_dir = args.data_dir
    valid_data_dir = os.path.join(data_dir, 'valid')

    # Get the filenames and labels from the validation set
    valid_filenames, valid_labels = get_filenames_and_labels(
        valid_data_dir, params)

    params.valid_size = len(valid_filenames)
    params.num_labels = len(set(valid_labels))

    # Create the iterator over the validation dataset
    valid_inputs = input_fn(False, valid_filenames, valid_labels, params)

    # Define the model
    logging.info('Creating the model...')
    model_spec = model_fn('eval', valid_inputs, params, reuse=False)

    logging.info('Starting evaluation')
    evaluate(model_spec, args.model_dir, params, args.restore_from)
def __init__(self):
    # Load the parameters
    args = EvaluatePointConfig()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)
    if params.mlp_sizes is None or len(params.mlp_sizes) == 0:
        logging.error(
            'mlp_sizes are not set correctly, at least one MLP layer is required')
    params.dict['loss_fn'] = args.loss_fn

    # Load the parameters from the dataset (size etc.) into params
    json_path = os.path.join(args.data_dir, 'dataset_params.json')
    assert os.path.isfile(
        json_path), "No json file found at {}, run build.py".format(json_path)
    params.update(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for tfrecords
    path_eval_tfrecords = os.path.join(args.data_dir,
                                       'test_' + args.tfrecords_filename)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)

    # Create the iterator over the test set; the online input pipeline is
    # used in place of the offline tfrecords one:
    # eval_inputs = input_fn('test', eval_dataset, params)
    eval_inputs = online_input_fn()
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    weak_learner_id = load_best_ndcgs(
        os.path.join(args.model_dir, args.restore_from, 'learner.json'))[0]
    self.model_spec = model_fn('test', eval_inputs, params, reuse=False,
                               weak_learner_id=int(weak_learner_id))
    logging.info("- done.")

    logging.info("Starting evaluation")
    logging.info("Optimized using {} learners".format(weak_learner_id))
    self.saver = tf.train.Saver()
    self.sess = tf.Session()
    self.params = params
    self.sess.run(self.model_spec['variable_init_op'])
    save_path = os.path.join(args.model_dir, args.restore_from)
    if os.path.isdir(save_path):
        save_path = tf.train.latest_checkpoint(save_path)
    self.saver.restore(self.sess, save_path)
def main():
    # Load the parameters from the json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), \
        'No json configuration file found at {}'.format(json_path)
    params = Params(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'train.log'))

    if not os.path.exists(args.restore_from):
        os.makedirs(args.restore_from)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    data_dir = args.data_dir
    train_data_dir = os.path.join(data_dir, 'train')
    valid_data_dir = os.path.join(data_dir, 'valid')

    # Get the filenames and labels from the train and valid sets
    train_filenames, train_labels = get_filenames_and_labels(
        train_data_dir, params)
    valid_filenames, valid_labels = get_filenames_and_labels(
        valid_data_dir, params)

    params.train_size = len(train_filenames)
    params.valid_size = len(valid_filenames)
    params.num_labels = len(set(train_labels))

    # Create the two iterators over the two datasets
    train_inputs = input_fn(True, train_filenames, train_labels, params)
    valid_inputs = input_fn(False, valid_filenames, valid_labels, params)

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train', train_inputs, params)
    valid_model_spec = model_fn('eval', valid_inputs, params, reuse=True)

    # Train the model
    logging.info('Starting training for {} epoch(s)'.format(params.num_epochs))
    train_and_evaluate(train_model_spec, valid_model_spec, args.model_dir,
                       params, args.restore_from)
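# These scripts all assume a small Params helper that loads params.json and
# exposes its keys as attributes; later snippets also use params.update(path)
# to merge a second json file and params.dict for dict-style assignment.
# A minimal sketch under those assumptions (not any repo's actual class):
import json


class Params:
    """Hyperparameters loaded from a json file, exposed as attributes."""

    def __init__(self, json_path):
        self.update(json_path)

    def update(self, json_path):
        # Merge the contents of another json file into the parameters
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

    @property
    def dict(self):
        # Dict-style access, e.g. params.dict['loss_fn'] = args.loss_fn
        return self.__dict__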
def train():
    # Set the logger
    set_logger(os.path.join(params['model_dir'], 'train.log'))

    # Log the params
    logging.info(params)

    # Load the vocabulary
    vocab = tf.contrib.lookup.index_table_from_file(vocab_path,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    train_input_words = load_dataset_from_text(data_dir, train_input_filename,
                                               vocab)
    train_context_words = load_dataset_from_text(data_dir,
                                                 train_context_filename, vocab)

    # Create the iterators over the dataset
    train_inputs = input_fn('train', train_input_words, train_context_words,
                            params)
    eval_inputs = input_fn('eval', train_input_words, train_context_words,
                           params)
    logging.info('- done.')

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train', train_inputs, params,
                                reuse=tf.AUTO_REUSE)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info('- done.')

    # Train the model
    logging.info('Starting training for {} epochs'.format(
        params['num_epochs']))
    normalized_embedding_matrix = train_and_evaluate(train_model_spec,
                                                     eval_model_spec, params)

    save_dict_to_json(params, os.path.join(params['model_dir'], 'params.json'))
    pd.DataFrame(normalized_embedding_matrix).to_csv(
        os.path.join(params['model_dir'], 'normalized_embedding_matrix.tsv'),
        index=False, header=False, sep='\t')
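# save_dict_to_json above is assumed to simply serialize the (json-compatible)
# params dict to disk; a minimal sketch of that assumed helper:
import json


def save_dict_to_json(d, json_path):
    """Save a dict of json-serializable values to json_path."""
    with open(json_path, 'w') as f:
        json.dump(d, f, indent=4)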
def funct(x):
    """Predict the label of data_dir/predict.jpg and evaluate it against label x."""
    # Set the random seed for the whole graph
    tf.set_random_seed(230)

    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), \
        "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Set the logger
    set_logger(os.path.join(args.data_dir, 'predict.log'))

    # Create the input data pipeline
    data_dir = args.data_dir
    test_data_dir = data_dir

    # Get the filename and label for the image to predict
    test_filenames = [os.path.join(test_data_dir, 'predict.jpg')]
    test_labels = [x]

    # Specify the size of the evaluation set
    params.eval_size = len(test_filenames)

    # Create the iterator over the dataset
    test_inputs = input_fn(False, test_filenames, test_labels, params)

    # Define the model and evaluate it on the single image
    model_spec = model_fn('eval', test_inputs, params, reuse=tf.AUTO_REUSE)
    evaluate(model_spec, args.model_dir, params, args.restore_from)
def main():
    # Load parameters from the json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Use the GPU if one is available
    params.cuda = torch.cuda.is_available()
    if params.cuda:
        params.device = torch.device('cuda:0')
    else:
        params.device = torch.device('cpu')

    # Set a seed for reproducible experiments
    torch.manual_seed(141)
    if params.cuda:
        torch.cuda.manual_seed(141)

    # Set the training logger for updates
    set_logger(os.path.join(args.model_dir, 'train.log'))

    logging.info("Creating input pipelines...")
    data_pipelines = fetch_pipeline(['train', 'validation'], args.data_dir,
                                    params)
    train_pipeline = data_pipelines['train']
    logging.info("Completed (Training Dataset)!")
    valid_pipeline = data_pipelines['validation']
    logging.info("Completed (Validation Dataset)!")

    logging.info("Building network model...")
    model_spec = model_fn(params)
    logging.info("Building completed!")

    logging.info("Initiating training procedure!")
    train_and_validate(model_spec, train_pipeline, valid_pipeline,
                       args.model_dir, params, args.restore_from)
    logging.info("Training completed!")
def __init__(self, params):
    logging.info("Initializing dataset ...")
    self.dataset_path = params.dataset_path
    self.params = params
    if not os.path.isdir(params.experiment_path):
        os.mkdir(params.experiment_path)
    tf.set_random_seed(100)
    set_logger(os.path.join(params.experiment_path, 'experiment.log'))

    # Get the file paths for the data
    self.get_data_path()

    # Split the train data and verify all data contents
    self.read_and_verify_data()

    # Build tf datasets for each set of inputs
    self.train_dataset = self.build_tf_dataset(self.train_filenames,
                                               self.train_labels,
                                               is_training=True)
    self.eval_dataset = self.build_tf_dataset(self.eval_filenames,
                                              self.eval_labels,
                                              is_training=False)
    self.test_dataset = self.build_tf_dataset(self.test_filenames,
                                              self.test_labels,
                                              is_training=False)

    # Build a single reinitializable dataset iterator shared by all splits
    self.dataset_iterator = tf.data.Iterator.from_structure(
        self.train_dataset.output_types, self.train_dataset.output_shapes)
    (self.inputs, self.labels,
     self.is_training) = self.dataset_iterator.get_next()

    # Build init ops for the train, eval, and test datasets
    self.train_init_op = self.dataset_iterator.make_initializer(
        self.train_dataset)
    self.eval_init_op = self.dataset_iterator.make_initializer(
        self.eval_dataset)
    self.test_init_op = self.dataset_iterator.make_initializer(
        self.test_dataset)
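# Hypothetical usage of the reinitializable iterator built above: because all
# three splits share one iterator, switching splits at run time is just a
# matter of running the matching init op. The Experiment class name and the
# session body here are illustrative assumptions, not from the source:
experiment = Experiment(params)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Point the shared iterator at the training split ...
    sess.run(experiment.train_init_op)
    inputs, labels = sess.run([experiment.inputs, experiment.labels])

    # ... then re-point it at the eval split without rebuilding the graph
    sess.run(experiment.eval_init_op)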
args = parser.parse_args()
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(
    json_path), "No json configuration file found at {}".format(json_path)
params = Params(json_path)
params.dict['loss_fn'] = args.loss_fn

# Load the parameters from the dataset (size etc.) into params
json_path = os.path.join(args.data_dir, 'dataset_params.json')
assert os.path.isfile(
    json_path), "No json file found at {}, run prepare_data.py".format(json_path)
params.update(json_path)

# Set the logger
set_logger(os.path.join(args.model_dir, 'train.log'))

path_train_tfrecords = os.path.join(args.data_dir,
                                    'train_' + args.tfrecords_filename)
path_eval_tfrecords = os.path.join(args.data_dir,
                                   'eval_' + args.tfrecords_filename)

# Create the input data pipeline
logging.info("Creating the datasets...")
train_dataset = load_dataset_from_tfrecords(path_train_tfrecords)
eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)

# Specify other parameters for the dataset and the model
# Create the two iterators over the two datasets
train_inputs = input_fn('train', train_dataset, params)
# Load the parameters
args = parser.parse_args()
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(json_path), \
    "No json configuration file found at {}".format(json_path)
params = Params(json_path)
params.dict['loss_fn'] = args.loss_fn
params.dict['collect'] = False
params.dict['use_kfac'] = args.use_kfac
params.dict['finetune'] = args.finetune
params.dict['training_keep_prob'] = 1.0

# Load the parameters from the dataset (size etc.) into params
json_path = os.path.join(args.data_dir, 'dataset_params.json')
assert os.path.isfile(json_path), \
    "No json file found at {}, run build.py".format(json_path)
params.update(json_path)

# Set the logger
set_logger(os.path.join(args.model_dir, 'test{}.log'.format(args.log)))

# Get the path for the tfrecords
dataset = 'test'
path_eval_tfrecords = os.path.join(args.data_dir,
                                   dataset + '_' + args.tfrecords_filename)

# Create the input data pipeline
logging.info("Creating the dataset...")
eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)

# Create the iterator over the test set
eval_inputs = input_fn('test', eval_dataset, params)
logging.info("- done.")

# Define the model
logging.info("Creating the model...")
# weak_learner_id = load_learner_id(
#     os.path.join(args.model_dir, args.restore_from, 'learner.json'))[0]
eval_model_spec = model_fn('test', eval_inputs, params, reuse=False)
import boto3
import pickle
import cv2
from model.opts import configure_args
from model.utils import set_logger, pre_process
import numpy as np
from model.network_architecture import create_model
import tensorflow as tf

if __name__ == "__main__":
    user_id = 4642
    args = configure_args()
    set_logger('output/train_{}.log'.format(args.name))

    # Fetch one pickled image frame from S3
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket='cureskin-dataset',
                             Key='new_data/image_{}.pkl'.format(user_id))
    body = response['Body'].read()
    img_frame = pickle.loads(body)

    x, mask = pre_process(img_frame, args)
    print(mask)

    # Restore the trained weights and score the example
    checkpoint_path = 'ckpts_{}'.format(args.name) + '/cp-0005.ckpt'
    model = create_model(args)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.SGD(),
                  metrics=['accuracy', tf.keras.metrics.Precision()])
    model.load_weights(checkpoint_path)
    prob = model.predict_on_batch([x, mask])
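    # Because the model was compiled with from_logits=True, predict_on_batch
    # returns raw logits; mapping them through a sigmoid yields a probability.
    # A sketch, assuming the binary setup implied by the compile step above:
    probability = tf.math.sigmoid(prob).numpy()
    print('Predicted probability for user {}: {}'.format(user_id, probability))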
args = parser.parse_args()
tf.set_random_seed(233)
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(
    json_path), "couldn't find the json configuration file at {}".format(
        json_path)
params = Params(json_path)

model_dir_has_best_weights = os.path.isdir(
    os.path.join(args.model_dir, "best_weight"))
overwriting = model_dir_has_best_weights and args.restore_from is None
assert not overwriting, "weights found in model_dir, aborting to avoid overwriting them"

# Set the logger
set_logger(os.path.join("../", 'train.log'))

# Create the input data pipeline
logging.info("Creating the datasets...")
data_dir = args.data_dir
ground_truth_dir = args.ground_truth_dir
train_data_dir = os.path.join(data_dir, "train")
dev_data_dir = os.path.join(data_dir, "dev")
train_masks_dir = os.path.join(ground_truth_dir, "train")
dev_masks_dir = os.path.join(ground_truth_dir, "dev")

# Get the filenames from the train and dev sets
train_filenames = [
    os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir)
    if f.endswith('.jpg')
]
                    help="Whether to download MS or not")
parser.add_argument('--scrap_InChi', default=True,
                    help="Whether to download InChi or not")
args = parser.parse_args()

# Check that the file containing CAS ids exists
assert os.path.isfile(args.cas_list), "No file named {} exists".format(
    args.cas_list)

# Create a data directory to store logs and spectra
data_dir = args.save_dir
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
set_logger(data_dir, 'scrap.log')

# Obtain the CAS ids used for downloading the content from NIST
logging.info('Loading CAS file')
cas_df = pd.read_csv(args.cas_list, sep='\t',
                     names=['name', 'formula', 'cas'], header=0)
cas_df.dropna(subset=['cas'], inplace=True)
cas_df.cas = cas_df.cas.str.replace('-', '')
cas_ids = list(cas_df.cas)

logging.info('Scraping mass spectra')
if args.scrap_MS:
    params = {'JCAMP': '', 'Index': 0, 'Type': 'Mass'}
        .keys())

if __name__ == '__main__':
    # Parse the data from jdx files and store it in csv
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='./data',
                        help="Directory path containing scraped data")
    parser.add_argument('--cas_list', default='species.txt',
                        help="File containing CAS number and smiles of molecules")
    args = parser.parse_args()

    data_dir = args.data_dir
    set_logger(data_dir, 'prepare_data.log')

    # Create bins for IR and mass spectra
    logging.info('Creating bins for standardizing the spectra')
    ir_bins = np.arange(min_ir - eps, max_ir + eps, step_ir)
    mass_bins = np.arange(min_mass - eps, max_mass + eps, step_mass)

    # Compute structures of the different molecular groups
    logging.info('Computing the structures of functional groups')
    func_grp_structs = {func_name: Chem.MolFromSmarts(func_smarts)
                        for func_name, func_smarts in func_grp_smarts.items()}

    # Create and save csv files of spectra
    for root, dirs, files in os.walk(data_dir):
        if root == os.path.join(data_dir, 'ir'):
            logging.info('Starting to parse IR jdx files')
        return 1
    else:
        neg_count += 1
        return 0

if __name__ == "__main__":
    args = configure_args()
    if not os.path.exists('output'):
        os.makedirs('output')
    if not os.path.exists('data'):
        os.makedirs('data')
    set_logger('output/train.log')

    # Read the doctor-message statistics from S3
    fs = s3fs.S3FileSystem()
    bucket_name = 'cureskin-dataset'
    data_key = 'dr_msg_stats.csv'
    data_location = 's3://{}/{}'.format(bucket_name, data_key)
    df_stats = pd.read_csv(data_location)

    s3 = boto3.resource('s3')
    bucket = s3.Bucket('cureskin-dataset')
    df = pd.DataFrame()

    # Extract users with doctor's notes
    for user in df_stats['user_id'].unique()[:args.data_size]:
        data_location = 's3://cureskin-dataset/followup_data/user_{0:012}.json'.format(
assert os.path.isfile(
    json_path), "No json configuration file found at {}".format(json_path)
params = Params(json_path)

# Set the random seed for the whole graph
tf.set_random_seed(params.seed)

# Load the parameters from the dataset (size etc.) into params
json_path = os.path.join(args.data_dir, 'dataset_params.json')
assert os.path.isfile(
    json_path), "No json file found at {}, run build.py".format(json_path)
params.update(json_path)
num_oov_buckets = params.num_oov_buckets  # number of buckets for unknown words

# Set the logger
set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Get paths for the vocabulary and the dataset
path_vocab = os.path.join(args.data_dir, 'vocab{}'.format(params.min_freq))
params.vocab_path = path_vocab
path_test_queries = os.path.join(args.data_dir, 'dev/queries.txt')
path_test_articles = os.path.join(args.data_dir, 'dev/articles.txt')

# Load the vocabulary
vocab = tf.contrib.lookup.index_table_from_file(
    path_vocab, num_oov_buckets=num_oov_buckets, key_column_index=0)

# Create the input data pipeline
logging.info("Creating the dataset...")
test_queries = load_dataset_from_text(path_test_queries, vocab, params)
test_articles = load_dataset_from_text(path_test_articles, vocab, params)
                    help="Directory path containing IR and MS spectra data")
parser.add_argument('--restore_ae_from', default=None,
                    help="Restore AE weights before training the model")
parser.add_argument('--restore_mlp_from', default=None,
                    help="Restore MLP weights before training the model")
args = parser.parse_args()

# The model directory should contain a params.json file listing all hyperparameters
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(json_path), "No params.json found at {}".format(
    args.model_dir)
with open(json_path) as json_data:
    params = json.load(json_data)
set_logger(args.model_dir, 'train.log')

logging.info('Loading the dataset from {}'.format(args.data_dir))
X, y, func_names = load_dataset(args.data_dir, True, **params['preprocess'])

# Train and test generator for every fold
data_generator = train_test_generator(X, y, params['n_splits'])
train_predictions = []
test_predictions = []

for cv, (train_data, test_data) in enumerate(data_generator):
    logging.info('Starting fold {}'.format(cv + 1))
    train_size = train_data[0].shape[0]
    eval_size = test_data[0].shape[0]
                       y_pred=y_)

def grad(model, inputs, targets):
    """Compute the loss and its gradients w.r.t. the model's trainable variables."""
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, training=True)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

if __name__ == '__main__':
    # Set the random seed for the whole graph for reproducible experiments
    tf.random.set_seed(123)

    # Set the logger
    cwd = os.getcwd()
    set_logger(os.path.join(cwd, 'train.log'))

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    # For shorter training time, we'll use caltech101 instead of the imagenet
    # dataset used in the paper
    data_dir = pathlib.Path(r'C:\Users\K\tensorflow_datasets\caltech101')
    batch_size = 32
    img_height = 256
    img_width = 256
    train_ds = tf.keras.preprocessing.image_dataset_from_directory(
        data_dir,
        label_mode='categorical',
        validation_split=0.2,
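    # Downstream, the grad helper above would typically drive a custom
    # training step like this sketch; the SGD optimizer, learning rate, and
    # the assumption that `model` is an already-built tf.keras model are
    # illustrative, not from the source:
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    for inputs, targets in train_ds:
        loss_value, grads = grad(model, inputs, targets)
        # Apply the gradients returned by grad() to the trainable variables
        optimizer.apply_gradients(zip(grads, model.trainable_variables))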