def _init_loaders(self):
    """Build the train and validation dataloaders from the experiment config."""
    datalists = self.config.datalist_config
    process_cfg = self.config.train_process_config
    self.train_loader = DatasetManager.get_dataloader(
        datalists.trainlist_config, process_cfg)
    # Validation keeps a fixed order: shuffling is disabled explicitly.
    self.val_loader = DatasetManager.get_dataloader(
        datalists.testlist_configs, process_cfg, shuffle=False)
def test_should_get_dataset(self):
    """A single dataset is retrievable by its identifier."""
    expected = {
        "source": "./tests/resources/local_data/train.csv",
        "description": "my little dataset local"
    }
    manager = DatasetManager("./tests/resources/local_data")
    self.assertDictEqual(manager.get_dataset("local_test"), expected)
def test_should_read_yaml_from_dir(self):
    """A directory holding one YAML file yields exactly that dataset."""
    manager = DatasetManager("./tests/resources/one_data")
    expected = {
        "one_test": {
            "source": "http://source/teste",
            "description": "my little dataset"
        }
    }
    self.assertDictEqual(manager.get_datasets(), expected)
def test_should_print_ascii(self):
    """Printer.__repr__ renders the datasets as an ASCII table."""
    # Show the full diff on mismatch (the expected table is long).
    self.maxDiff = None
    # NOTE(review): the expected literal appears whitespace-collapsed here;
    # the original presumably contained newlines between table rows -- confirm
    # against the repository before relying on this exact value.
    result = """+---------------------+------------+-----------------------------------------------------------------------------+ | description | identifier | source | +---------------------+------------+-----------------------------------------------------------------------------+ | my little dataset | one_test | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv | | my little dataset 2 | two_test | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv | +---------------------+------------+-----------------------------------------------------------------------------+"""
    data = DatasetManager("./tests/resources/multiple_data")
    printer = Printer(data.get_datasets())
    self.assertEqual(result, printer.__repr__())
def main():
    """Train a Mask R-CNN on one split of 'ade20k_train' and run prediction
    on a random 1000-item sample of the other split."""
    dataset_manager = DatasetManager()
    epochs = 10
    output_dir = os.path.abspath("workspace")
    model = MaskRCNN(output_dir)
    ratio = 0.2
    # Split the dataset in two; `ratio` controls the split proportion
    # (which side receives it is defined by split_dataset -- confirm).
    datasetA, datasetB = dataset_manager.split_dataset('ade20k_train',
                                                       ratio=ratio)
    # Fix: the original defined `epochs = 10` but then hard-coded the
    # literal 10 in the call, so changing the variable had no effect.
    weights = model.train(datasetA, weights=None, epochs=epochs)
    small_datasetB = dataset_manager.random_subset(datasetB, 1000)
    result = model.predict(small_datasetB, weights)
def setUp(self):
    """Load the pickled split lists and JSON metadata used by every test."""
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _load_json(path):
        with open(path) as json_data:
            return json.load(json_data)

    self.training_dict = _unpickle('training_set_list.pickle')
    self.validation_dict = _unpickle('validation_set_list.pickle')
    self.test_dict = _unpickle('test_set_list.pickle')
    self.genres = _load_json('genres.json')
    self.dataset = _load_json('labels.json')
    self.dataset_manager = DatasetManager(self.training_dict,
                                          self.validation_dict,
                                          self.test_dict, self.genres,
                                          self.dataset)
    self.batch_size = 50
def evaluate(experiment_name, step=''):
    """Restore the checkpoint of `experiment_name` and log a classification
    report plus confusion matrix over the test set.

    step: checkpoint step suffix; an empty string selects the latest
        checkpoint in the experiment's checkpoint directory.
    """
    logging.info('*' * 50)
    logging.info('RUNNING EVALUATION FOR MODEL: %s', experiment_name)
    if step == '':
        # No explicit step: take the most recent checkpoint.
        interesting_checkpoint = tf.train.latest_checkpoint(
            os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                              experiment_name,
                                              'step-{}'.format(step))
    dataset_manager = DatasetManager()
    dataset_manager.boot()
    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        # Rebuild the graph structure from the checkpoint's .meta file,
        # then restore the variable values into it below.
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)
            list_predictions = []
            list_labels = []
            # Run prediction batch by batch and accumulate results.
            for docs, labels in dataset_manager.get_test_by_batch(
                    batch_size=FLAGS.BATCH_SIZE):
                tf_input = gr.get_tensor_by_name('input/tf_input:0')
                tf_predictions = gr.get_tensor_by_name('prediction:0')
                prediction = sess.run(tf_predictions,
                                      feed_dict={tf_input: docs})
                list_predictions.extend(prediction)
                list_labels.extend(labels)
                logging.debug('-- Prediction length: %s/%s',
                              len(list_predictions),
                              dataset_manager.test_y.shape[0])
            logging.info('-- Report for model: %s', experiment_name)
            logging.info(
                classification_report(y_true=list_labels,
                                      y_pred=list_predictions,
                                      digits=4))
            logging.info(
                confusion_matrix(y_true=list_labels,
                                 y_pred=list_predictions))
def get_mat_id(self, mat_id_name='mat_id'):
    """Return the material-ID cell-scalar array of the underlying mesh,
    or ``None`` when no VTK source is attached yet."""
    if self.source is None:
        return None
    manager = DatasetManager(dataset=self.source.outputs[0])
    return manager.cell_scalars[mat_id_name]
def test_should_create_dataset_with_custom_data(self):
    """create_dataset writes a YAML file and get_datasets() returns it."""
    data = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name_custom"
    dataset = {
        "identifier": identifier,
        "description": "description",
        "source": "/tmp/test.csv"
    }
    data.create_dataset(**dataset)
    self.assertTrue(
        self.os.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    # Fix: use the injected filesystem abstraction consistently -- the
    # manager writes through `self.os`, but the original asserted via the
    # real `os.listdir` (the sibling test_should_create_dataset already
    # uses `self.os.listdir`).
    self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)
    loaded_dataset = data.get_datasets()
    self.assertEqual(list(loaded_dataset.keys()), [identifier])
    datasource_configs = loaded_dataset.get(identifier)
    self.assertEqual(datasource_configs["description"],
                     dataset["description"])
    self.assertEqual(datasource_configs["source"], dataset["source"])
def test_should_read_multiple_yaml_from_dir(self):
    """Every YAML file in the directory is loaded as a dataset."""
    # Fix: the original built an `expected` dict of full dataset configs
    # and then immediately rebound `expected` to the identifier list, so
    # the dict was dead code -- only the sorted identifiers are asserted.
    data = DatasetManager("./tests/resources/multiple_data", fs=self.os)
    result = sorted(data.get_datasets().keys())
    expected = ["one_test", "two_test"]
    self.assertListEqual(expected, result)
def main():
    """Self-training loop: predict on random subsets of datasetB, filter
    the predictions into annotations, and keep fine-tuning forever."""
    output_dir = os.path.abspath("workspace")
    dataset_manager = DatasetManager()
    model = MaskRCNN(output_dir)
    ratio = 0.2
    datasetA, datasetB = dataset_manager.split_dataset('ade20k_train',
                                                       ratio=ratio)
    # Initial supervised pass on the first split.
    weights = model.train(datasetA, weights=None, epochs=10)
    while True:
        sample = dataset_manager.random_subset(datasetB, 100)
        predictions = model.predict(sample, weights)
        annotations = annotator.filter(predictions)
        new_dataset = dataset_manager.create_dataset_with_new_annotations(
            datasetB, annotations)
        # One epoch of fine-tuning per round, reusing the latest weights.
        weights = model.train(new_dataset, weights, epochs=1)
def predict(list_docs, experiment_name, step='', batch_size=64):
    """Restore the checkpoint of `experiment_name` and return the predicted
    labels (mapped back through LABEL_UNMAPPING) for `list_docs`.

    step: checkpoint step suffix; empty string selects the latest checkpoint.
    batch_size: number of documents fed per session run.
    """
    logging.info('*' * 50)
    logging.info('RUNNING PREDICT FOR MODEL: %s', experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name, 'step-{}'.format(step))
    dataset_manager = DatasetManager()
    dataset_manager.boot()
    list_preprocessed_sentences = preprocessor.preprocess(list_docs)
    list_vecs = dataset_manager.text2vec.doc_to_vec(list_preprocessed_sentences)
    # NOTE(review): looks like a leftover debug print of the round-tripped
    # documents -- consider removing or demoting to logging.debug.
    print(dataset_manager.text2vec.vec_to_doc(list_vecs))
    # Pad/trim every vector to the model's fixed sentence length.
    list_vecs = dataset_manager.equalize_vector_length_to_np(list_vectors=list_vecs, max_length=model_v6.SENTENCE_LENGTH_MAX)
    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s', interesting_checkpoint)
        saver = tf.train.import_meta_graph('{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s', interesting_checkpoint)
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s', interesting_checkpoint)
            list_predictions = []
            # The extra (+1) step picks up the final partial batch; when
            # len(list_vecs) divides batch_size exactly the last slice is
            # empty -- presumably harmless, but worth confirming.
            num_steps = len(list_vecs) // batch_size
            logging.info('There will be %s steps', num_steps + 1)
            for i in range(num_steps + 1):
                tf_input = gr.get_tensor_by_name('input/tf_input:0')
                tf_predictions = gr.get_tensor_by_name('prediction:0')
                prediction = sess.run(tf_predictions, feed_dict={
                    tf_input: list_vecs[i*batch_size: (i+1)*batch_size]
                })
                # Map numeric class ids back to their original label values.
                list_predictions.extend([dataset_manager.LABEL_UNMAPPING[p] for p in prediction])
            return list_predictions
def test_should_create_dataset(self):
    """create_dataset persists a YAML file whose config round-trips."""
    manager = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name"
    payload = {
        "identifier": identifier,
        "description": "description",
        "source": "/tmp/test.csv",
    }
    manager.create_dataset(**payload)
    loaded = manager.get_datasets()
    config = loaded.get(identifier)
    self.assertTrue(
        self.os.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)
    self.assertEqual(list(loaded.keys())[0], identifier)
    self.assertEqual(config.get("description"), payload["description"])
    self.assertEqual(config.get("source"), payload["source"])
def test_should_print_html(self):
    """Printer._repr_html_ renders the datasets as an HTML table."""
    # Show the full diff on mismatch (the expected markup is long).
    self.maxDiff = None
    # NOTE(review): the expected literal appears whitespace-collapsed here;
    # the original presumably contained newlines between tags -- confirm
    # against the repository before relying on this exact value.
    result = """<table> <tr> <th>description</th> <th>identifier</th> <th>source</th> </tr> <tr> <td>my little dataset</td> <td>one_test</td> <td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td> </tr> <tr> <td>my little dataset 2</td> <td>two_test</td> <td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td> </tr> </table>"""
    data = DatasetManager("./tests/resources/multiple_data")
    printer = Printer(data.get_datasets())
    self.assertEqual(result, printer._repr_html_())
def create_source(self):
    """
    Create a VTK source from data in a SfePy-supported file.

    Notes
    -----
    All data need to be set here, otherwise time stepping will
    not work properly - data added by user later will be thrown
    away on time step change.
    """
    if self.io is None:
        # Lazily parse mesh/IO metadata on first use.
        self.read_common(self.filename)
    dataset = self.create_dataset()
    try:
        out = self.io.read_data(self.step)
    except ValueError:
        # No data stored for this time step.
        out = None
    if out is not None:
        self.add_data_to_dataset(dataset, out)
    if self.mat_id_name is not None:
        mat_id = nm.concatenate(self.mesh.mat_ids)
        if self.single_color:
            # Collapse every id above the minimum onto the maximum so the
            # subdomains render in a single colour.
            rm = mat_id.min(), mat_id.max()
            mat_id[mat_id > rm[0]] = rm[1]
        dm = DatasetManager(dataset=dataset)
        dm.add_array(mat_id, self.mat_id_name, 'cell')
    src = VTKDataSource(data=dataset)
    # src.print_traits()
    # debug()
    return src
def create_source(self):
    """
    Create a VTK source from data in a SfePy-supported file.

    Notes
    -----
    All data need to be set here, otherwise time stepping will
    not work properly - data added by user later will be thrown
    away on time step change.
    """
    if self.io is None:
        # Lazily parse mesh/IO metadata on first use.
        self.read_common(self.filename)
    dataset = self.create_dataset()
    try:
        out = self.io.read_data(self.step)
    except ValueError:
        # No data stored for this time step.
        out = None
    if out is not None:
        self.add_data_to_dataset(dataset, out)
    if self.mat_id_name is not None:
        # This variant reads material ids from the cmesh cell groups
        # (cf. the sibling implementation using nm.concatenate(mat_ids)).
        mat_id = self.mesh.cmesh.cell_groups
        if self.single_color:
            # Collapse every id above the minimum onto the maximum so the
            # subdomains render in a single colour.
            rm = mat_id.min(), mat_id.max()
            mat_id[mat_id > rm[0]] = rm[1]
        dm = DatasetManager(dataset=dataset)
        dm.add_array(mat_id, self.mat_id_name, 'cell')
    src = VTKDataSource(data=dataset)
    # src.print_traits()
    # debug()
    return src
def add_subdomains_surface(obj, position, mat_id_name='mat_id',
                           threshold_limits=(None, None), **kwargs):
    """Add two semi-transparent surfaces colouring subdomains by the
    `mat_id_name` cell scalars: a thresholded one and a full one.

    Returns the (thresholded surface, full surface) pair.
    """
    dm = DatasetManager(dataset=obj.outputs[0])
    mat_id = dm.cell_scalars[mat_id_name]
    rm = mat_id.min(), mat_id.max()
    active = mlab.pipeline.set_active_attribute(obj)
    active.cell_scalars_name = mat_id_name
    # Second active-attribute filter feeds the thresholded branch only.
    aa = mlab.pipeline.set_active_attribute(obj)
    aa.cell_scalars_name = mat_id_name
    threshold = mlab.pipeline.threshold(aa)
    threshold.threshold_filter.progress = 1.0
    # Offset by 0.1 so the boundary ids fall outside the kept range --
    # assumes material ids are integral; confirm.
    if threshold_limits[0] is not None:
        threshold.lower_threshold = threshold_limits[0] + 0.1
    if threshold_limits[1] is not None:
        threshold.upper_threshold = threshold_limits[1] - 0.1
    # Keep the manually-set limits when the data range changes.
    threshold.auto_reset_lower = False
    threshold.auto_reset_upper = False
    surface = mlab.pipeline.surface(threshold, opacity=0.3)
    surface.actor.actor.position = position
    module_manager = surface.parent
    lm = module_manager.scalar_lut_manager
    lm.lut_mode = 'Blues'
    # With exactly two material ids, reverse the LUT for contrast.
    if (rm[1] - rm[0]) == 1:
        lm.reverse_lut = True
    surface2 = mlab.pipeline.surface(active, opacity=0.2)
    surface2.actor.actor.position = position
    module_manager = surface2.parent
    module_manager.scalar_lut_manager.lut_mode = 'Blues'
    return surface, surface2
def test_should_remove_dataset(self):
    """remove_dataset deletes the YAML file it previously created."""
    manager = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name"
    manager.create_dataset(identifier=identifier,
                           description="description",
                           source="/tmp/test.csv")
    yaml_path = "{}/{}.yaml".format(self.trash_dir, identifier)
    self.assertTrue(os.path.isfile(yaml_path))
    self.assertEqual(len(os.listdir(self.trash_dir)), 2)
    manager.remove_dataset(identifier)
    self.assertFalse(os.path.isfile(yaml_path))
    self.assertEqual(len(os.listdir(self.trash_dir)), 1)
params = "pd_fixed_trainratio80_outcome_all_data_singletask" #params = "lstmsize%s_dropout%s_shared%s_specialized%s"%(lstmsize, dropout, n_shared_layers, n_specialized_layers) checkpoint_prefix = os.path.join( output_dir, "checkpoints/model_%s_%s" % (dataset_name, params)) model_filename = glob.glob("%s*.hdf5" % checkpoint_prefix)[-1] #model_filename = "code/output_files/models/model_28-1.51.h5" results_file = os.path.join( output_dir, "evaluation_results/results_%s_%s_%s.csv" % (cls_method, dataset_name, params)) ##### MAIN PART ###### print('Preparing data...') start = time.time() dataset_manager = DatasetManager(dataset_name) data = dataset_manager.read_dataset() train, test = dataset_manager.split_data( data, train_ratio, split=data_split_type ) # to reproduce results of Tax et al., use 'ordered' instead of 'temporal' dt_train = dataset_manager.encode_data_with_label_all_data(train) dt_test = dataset_manager.encode_data_with_label_all_data(test) if normalize_over == "train": dataset_manager.calculate_divisors(dt_train) elif normalize_over == "all": dt_all = dataset_manager.extract_timestamp_features(data) dt_all = dataset_manager.extract_duration_features(dt_all) dataset_manager.calculate_divisors(dt_all) else:
output_dir = "results"
n_estimators = 1000
max_features = 0.5
# Hyper-parameters encoded into the result-file names below.
params = "nestimators%s_maxfeatures%s" % (n_estimators, max_features)

##### MAIN PART ######
for dataset_name in datasets:
    results_file = os.path.join(
        output_dir, "evaluation_results/results_%s_%s_%s.csv" %
        (cls_method, dataset_name, params))
    print("Loading data...")
    start = time.time()
    dataset_manager = DatasetManager(dataset_name)
    data = dataset_manager.read_dataset()
    # split="temporal" -- presumably a chronological train/test split; confirm.
    train, test = dataset_manager.split_data(data, train_ratio,
                                             split="temporal")
    train = dataset_manager.get_train_sample(train, sample_size)
    #train, val = dataset_manager.get_train_val_data(train, sample_size, val_sample_size)
    print("Done: %s" % (time.time() - start))
    print('Encoding data...')
    start = time.time()
    dt_train = dataset_manager.encode_data(train)
    #dt_val = dataset_manager.encode_data(val)
    dt_test = dataset_manager.encode_data(test)
    #X, y = dataset_manager.generate_3d_data(dt_train, max_len)
    #X_val, y_val = dataset_manager.generate_3d_data(dt_val, max_len)
from flask import Flask, send_file from flask_restful import Resource, Api, reqparse from flask_cors import CORS import numpy as np from data_backend import Dataset as HDF_Dataset from dataset_manager import DatasetManager from thumbnailer import Thumbnailer from utils import merge_overlapping_filters DATASET_PATH = "./datasets" dataset_manager = DatasetManager(DATASET_PATH) API_BASE_STR = "/api/v1" # Init thumbnails (clean directory) thumbnailer = Thumbnailer("./thumbnails") thumbnailer.clean(); dataset_list = [] for dset_index, name in enumerate(dataset_manager.get_dataset_names()): dset = HDF_Dataset(DATASET_PATH, name) dataset_list.append({ "id": dset_index, "name": name, "device": { "name": dset.device.name, "version": dset.device.version }, "subsets": [ { "id": subset_index, "name": subset,
print('Preparing data...')
start = time.time()
# Command-line arguments: dataset name, embedding type, embedding dimension.
dataset_name = argv[1]
embedding_type = argv[2]
embedding_dim = int(argv[3])
scale_model = "row"
train_ratio = 0.8
val_ratio = 0.2
activation = "sigmoid"
optimizer = "adam"
nb_epoch = 50
dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
# split="temporal" -- presumably a chronological split; validation is a
# random subset of the training portion.
train, _ = dataset_manager.split_data_strict(data, train_ratio, split="temporal")
train, val = dataset_manager.split_val(train, val_ratio, split="random")
if embedding_type == "none":
    dt_train = dataset_manager.encode_data_with_label_all_data(train)
    dt_val = dataset_manager.encode_data_with_label_all_data(val)
else:
    dt_train = dataset_manager.encode_data_with_label_all_data_act_res_embedding(
        train, embedding_type=embedding_type, embedding_dim=embedding_dim,
        scale_model=scale_model)
lstmsize, lstmsize2, int(dropout * 100), int( learning_rate * 100000), nb_epoch, batch_size, sample_size) ##### MAIN PART ###### for dataset_name in datasets: results_file = os.path.join( output_dir, "evaluation_results/results_lstm_%s_%s.csv" % (dataset_name, params)) checkpoint_prefix = os.path.join( output_dir, "checkpoints/weights_%s_%s" % (dataset_name, params)) print("Loading data...") start = time.time() dataset_manager = DatasetManager(dataset_name) data = dataset_manager.read_dataset() train, test = dataset_manager.split_data(data, train_ratio, split="temporal") train, val = dataset_manager.get_train_val_data(train, sample_size, val_sample_size) print("Done: %s" % (time.time() - start)) print('Encoding data...') start = time.time() dt_train = dataset_manager.encode_data(train) dt_val = dataset_manager.encode_data(val) dt_test = dataset_manager.encode_data(test) X, y = dataset_manager.generate_3d_data(dt_train, max_len) X_val, y_val = dataset_manager.generate_3d_data(dt_val, max_len)
def run(experiment_name):
    """Population-based tuning of the l1 regularization scale: train a
    population of models, repeatedly copy the best into the worst and
    perturb, recording accuracy/scale histories."""
    BEST_THRES = 3          # models exempt from perturbation each round
    WORST_THRES = 3         # models overwritten by the current best
    POPULATION_STEPS = 500  # outer exploit/explore rounds
    ITERATIONS = 100        # training iterations per round
    POPULATION_SIZE = 10
    accuracy_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    l1_scale_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    best_accuracy_hist = np.zeros((POPULATION_STEPS, ))
    best_l1_scale_hist = np.zeros((POPULATION_STEPS, ))
    with tf.Graph().as_default() as gr:
        with tf.variable_scope('input'):
            tf_input = tf.placeholder(
                dtype=tf.int32,
                shape=[
                    None, model_population_based_tunning.SENTENCE_LENGTH_MAX
                ],
                name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32, shape=[None],
                                       name='tf_labels')
        models = [
            create_model(
                i,
                is_included_regularization=FLAGS.IS_INCLUDED_REGULARIZATION)
            for i in range(10)
        ]  # It will help us with creation of different scope_name for each model
        for index, model in enumerate(models):
            with tf.variable_scope(str(index)):
                model.boot(tf_input, tf_labels)
        logging.info('Graph size: %s', utils.count_trainable_variables())
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)).as_default(
                ) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            dataset_manager = DatasetManager()
            dataset_manager.boot()
            dataset_generator = dataset_manager.get_batch(
                batch_size=FLAGS.BATCH_SIZE,
                number_epochs=10 * FLAGS.NUMBER_EPOCHS)
            for i in range(POPULATION_STEPS):
                # Copy best
                sess.run([
                    m.get_copy_from_op(models[0]) for m in models[-WORST_THRES:]
                ])
                # Perturb others
                sess.run([m.l1_scale_perturb_op for m in models[BEST_THRES:]])
                # Training
                for _ in range(ITERATIONS):
                    docs, labels = next(dataset_generator)
                    sess.run([m.tf_optimizer for m in models],
                             feed_dict={
                                 tf_input: docs,
                                 tf_labels: labels
                             })
                docs, labels = next(dataset_generator)
                # Evaluate
                l1_scales = sess.run({m: m.l1_scale for m in models})
                accuracies = sess.run({m: m.tf_acc for m in models},
                                      feed_dict={
                                          tf_input: docs,
                                          tf_labels: labels
                                      })
                # Best-first ordering drives the copy/perturb slices above.
                models.sort(key=lambda m: accuracies[m], reverse=True)
                # Logging
                best_accuracy_hist[i] = accuracies[models[0]]
                best_l1_scale_hist[i] = l1_scales[models[0]]
                for m in models:
                    l1_scale_hist[m.model_id, i] = l1_scales[m]
                    accuracy_hist[m.model_id, i] = accuracies[m]
    # NOTE(review): original nesting of this dump is ambiguous in the
    # collapsed source (per-step vs. once at the end); also numpy arrays are
    # not JSON-serializable, so this likely raises TypeError -- confirm and
    # consider .tolist().
    with open('temp', 'w') as output_f:
        json.dump(
            {
                'accuracy_hist': accuracy_hist,
                'l1_scale_hist': l1_scale_hist,
                'best_accuracy_hist': best_accuracy_hist,
                'best_l1_scale_hist': best_l1_scale_hist
            }, output_f)
def add_data_to_dataset(self, dataset, data):
    """Add point and cell data to the dataset.

    Expands 1/2/3-component vertex data and scalar/vector/tensor cell
    data into the fixed-width layouts VTK expects, zero-padding the 2D
    cases. Note: Python 2 code (`iteritems`, true-division-sensitive
    `sym`).
    """
    dim = self.dim
    # Number of independent components of a symmetric dim x dim tensor.
    sym = (dim + 1) * dim / 2

    dm = DatasetManager(dataset=dataset)
    for key, val in data.iteritems():
        vd = val.data
##        print vd.shape
        if val.mode == 'vertex':
            if vd.shape[1] == 1:
                aux = vd.reshape((vd.shape[0],))
            elif vd.shape[1] == 2:
                # Pad 2D vectors with a zero third component.
                zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                aux = nm.c_[vd, zz]
            elif vd.shape[1] == 3:
                aux = vd
            else:
                raise ValueError('unknown vertex data format! (%s)'\
                                 % vd.shape)
            dm.add_array(aux, key, 'point')

        elif val.mode == 'cell':
            ne, aux, nr, nc = vd.shape
            if (nr == 1) and (nc == 1):
                # Scalar cell data.
                aux = vd.reshape((ne,))
            elif (nr == dim) and (nc == 1):
                # Vector cell data; pad to three components in 2D.
                if dim == 3:
                    aux = vd.reshape((ne, dim))
                else:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype);
                    aux = nm.c_[vd.squeeze(), zz]
            elif (((nr == sym) or (nr == (dim * dim))) and (nc == 1)) \
                     or ((nr == dim) and (nc == dim)):
                # Tensor cell data: the index tables expand symmetric or
                # full storage into a flat 9-component (3x3) row, with
                # zero padding in the 2D case.
                vd = vd.squeeze()
                if dim == 3:
                    if nr == sym:
                        aux = vd[:,[0,3,4,3,1,5,4,5,2]]
                    elif nr == (dim * dim):
                        aux = vd[:,[0,3,4,6,1,5,7,8,2]]
                    else:
                        aux = vd.reshape((vd.shape[0], dim*dim))
                else:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype);
                    if nr == sym:
                        aux = nm.c_[vd[:,[0,2]], zz, vd[:,[2,1]], zz, zz, zz, zz]
                    elif nr == (dim * dim):
                        aux = nm.c_[vd[:,[0,2]], zz, vd[:,[3,1]], zz, zz, zz, zz]
                    else:
                        aux = nm.c_[vd[:,0,[0,1]], zz, vd[:,1,[0,1]], zz, zz, zz, zz]

            dm.add_array(aux, key, 'cell')
def run(experiment_name):
    """Train the model, writing train/test summaries every 10 steps and a
    checkpoint every 200 steps under directories named after
    `experiment_name`; also configures embedding projector metadata."""
    with tf.Graph().as_default() as gr:
        with tf.variable_scope('input'):
            tf_input = tf.placeholder(dtype=tf.int32,
                                      shape=[None, model.SENTENCE_LENGTH_MAX],
                                      name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32, shape=[None],
                                       name='tf_labels')
        tf_logits = model.inference(tf_input)
        tf_loss = model.loss(tf_logits, tf_labels)
        tf_optimizer, tf_global_step = model.optimize(tf_loss)
        model.measure_acc(tf_logits, tf_labels)
        tf_all_summary = tf.summary.merge_all()
        tf_train_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'train_' + experiment_name),
            graph=gr)
        tf_test_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'test_' + experiment_name),
            graph=gr)
        tf_embedding_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'checkpoint', experiment_name))
        # Visual word embedding
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = 'embedding/word_embeddings'  # Reference model_v6.py
        embedding.metadata_path = os.path.join(CURRENT_DIR, 'data',
                                               DatasetManager.VOCAB_FILE)
        projector.visualize_embeddings(tf_embedding_writer, config)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=0.03)
        logging.info('Graph size: %s', utils.count_trainable_variables())
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)).as_default(
                ) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            dataset_manager = DatasetManager()
            dataset_manager.boot()
            for docs, labels in dataset_manager.get_batch(
                    batch_size=FLAGS.BATCH_SIZE,
                    number_epochs=FLAGS.NUMBER_EPOCHS):
                _, global_step = sess.run([tf_optimizer, tf_global_step],
                                          feed_dict={
                                              tf_input: docs,
                                              tf_labels: labels
                                          })
                summary_interval_step = 10
                if global_step % summary_interval_step == 0:
                    logging.debug('Global step: %s', global_step)
                    # Train summary on the batch just optimized.
                    train_summary_data = sess.run(tf_all_summary,
                                                  feed_dict={
                                                      tf_input: docs,
                                                      tf_labels: labels
                                                  })
                    tf_train_writer.add_summary(train_summary_data,
                                                global_step=global_step)
                if global_step % summary_interval_step == 0:
                    # Test summary on a freshly shuffled test sample.
                    docs_test, labels_test = dataset_manager.get_test_set(
                        FLAGS.TEST_SIZE, is_shuffled=True)
                    test_summary_data = sess.run(tf_all_summary,
                                                 feed_dict={
                                                     tf_input: docs_test,
                                                     tf_labels: labels_test
                                                 })
                    tf_test_writer.add_summary(test_summary_data,
                                               global_step=global_step)
                if global_step % 200 == 0:
                    path_to_save = os.path.join(CURRENT_DIR, '..',
                                                'checkpoint', experiment_name)
                    if not os.path.exists(path_to_save):
                        os.makedirs(path_to_save)
                    saved_file = saver.save(sess,
                                            save_path=os.path.join(
                                                path_to_save, 'step'),
                                            global_step=global_step,
                                            write_meta_graph=True)
                    logging.debug('Saving model at %s', saved_file)
class DataLoadingTests(unittest.TestCase):
    """Exercises DatasetManager batch loading and label-vector encoding.

    Requires pickled split lists and JSON metadata files in the working
    directory. Batches are (images, labels) with 227x227x3 images and
    26-way label vectors.
    """

    def setUp(self):
        # Load the pickled splits and JSON metadata used by every test.
        with open('training_set_list.pickle', 'rb') as handle:
            self.training_dict = pickle.load(handle)
        with open('validation_set_list.pickle', 'rb') as handle:
            self.validation_dict = pickle.load(handle)
        with open('test_set_list.pickle', 'rb') as handle:
            self.test_dict = pickle.load(handle)
        with open('genres.json') as json_data:
            self.genres = json.load(json_data)
        with open('labels.json') as json_data:
            self.dataset = json.load(json_data)
        self.dataset_manager = DatasetManager(self.training_dict,
                                              self.validation_dict,
                                              self.test_dict, self.genres,
                                              self.dataset)
        self.batch_size = 50

    def test_normal_training_image_load(self):
        images = self.dataset_manager.next_batch(50, "train")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_normal_training_labels_load(self):
        images = self.dataset_manager.next_batch(50, "train")
        self.assertEqual(images[1].shape, (50, 26))

    def test_last_traninig_image_load(self):
        # Move the cursor half a batch from the end to exercise wrap-around.
        # NOTE(review): `/` yields a float in Python 3 -- this looks like
        # Python 2 code; confirm or use `//`.
        self.dataset_manager.cur_train = \
            len(self.dataset_manager.training_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "train")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_last_traninig_labels_load(self):
        self.dataset_manager.cur_train = \
            len(self.dataset_manager.training_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "train")
        self.assertEqual(images[1].shape, (50, 26))

    def test_normal_validation_image_load(self):
        images = self.dataset_manager.next_batch(50, "val")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_normal_validation_labels_load(self):
        images = self.dataset_manager.next_batch(50, "val")
        self.assertEqual(images[1].shape, (50, 26))

    def test_last_validation_image_load(self):
        self.dataset_manager.cur_val = \
            len(self.dataset_manager.validation_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "val")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_last_validation_labels_load(self):
        self.dataset_manager.cur_val = \
            len(self.dataset_manager.validation_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "val")
        self.assertEqual(images[1].shape, (50, 26))

    def test_normal_test_image_load(self):
        images = self.dataset_manager.next_batch(50, "test")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_normal_test_labels_load(self):
        images = self.dataset_manager.next_batch(50, "test")
        self.assertEqual(images[1].shape, (50, 26))

    def test_last_test_image_load(self):
        self.dataset_manager.cur_test = \
            len(self.dataset_manager.test_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "test")
        self.assertEqual(images[0].shape, (50, 227, 227, 3))

    def test_last_test_labels_load(self):
        self.dataset_manager.cur_test = \
            len(self.dataset_manager.test_list) - \
            (self.batch_size / 2)
        images = self.dataset_manager.next_batch(50, "test")
        self.assertEqual(images[1].shape, (50, 26))

    def test_create_label_vector(self):
        # Genre names are encoded as a 26-element multi-hot vector.
        label_vector = self.dataset_manager.create_label_vector(
            [" Action", " Documentary", " Drama", " Horror", " News", " War"])
        self.assertEqual(label_vector, [
            1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
            0, 0, 1, 0
        ])

    def test_create_label_vector_end(self):
        label_vector = self.dataset_manager.create_label_vector([
            " Action", " Documentary", " Drama", " Horror", " News", " War",
            " Western"
        ])
        self.assertEqual(label_vector, [
            1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
            0, 0, 1, 1
        ])

    def test_no_duplicate_between_test_and_train(self):
        # NOTE(review): `.intersection` exists on sets, not dicts -- the
        # pickled "dict" objects are presumably sets; confirm.
        self.assertEqual(self.training_dict.intersection(self.test_dict),
                         set())
def test_should_remove_unknown_dataset(self):
    """Removing a dataset that does not exist raises IOError."""
    manager = DatasetManager("./tests/resources/local_data", fs=self.os)
    with self.assertRaises(IOError):
        manager.remove_dataset("unknown_dataset")
def main():
    """Restore a trained AlexNet genre classifier, report its mean average
    precision over the test set, then score one example image."""
    # Load dataset manager
    with open('training_set_list.pickle', 'rb') as handle:
        training_set = pickle.load(handle)
    with open('validation_set_list.pickle', 'rb') as handle:
        validation_set = pickle.load(handle)
    with open('test_set_list.pickle', 'rb') as handle:
        test_set = pickle.load(handle)
    with open('genres.json') as json_data:
        genres = json.load(json_data)
    with open('labels.json') as json_data:
        labels = json.load(json_data)
    dataset_manager = DatasetManager(training_set, validation_set, test_set,
                                     genres, labels)
    batch_size = 1
    n_classes = 26
    # Graph input
    x = tf.placeholder(tf.float32, [batch_size, 227, 227, 3])
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_var = tf.placeholder(tf.float32)  # dropout keep probability
    # Model
    pred = Model.alexnet(x, keep_var)  # definition of the network architecture
    # Loss and optimize
    # Init
    init = tf.global_variables_initializer()
    # Initialize an saver for store model checkpoints
    saver = tf.train.Saver()
    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        # Load pretrained model
        # Skip weights from fc8 (fine-tuning)
        # load_with_skip('pretrained_alexnet.npy', sess, ['fc8'])
        # saver.restore(sess, "saved_models/MSE_without_data_augmentation_0.75_0.001/film_genre_model.ckpt")
        saver.restore(
            sess,
            "saved_models/models/model_dropout05_mean_square_error.ckpt")
        print('Model Restored')
        test_map_global = 0.
        test_count = 0
        # test accuracy by group of batch_size images
        for _ in range(int(len(dataset_manager.test_list) / batch_size) + 1):
            batch_tx, batch_ty = dataset_manager.next_batch(batch_size, 'test')
            # print(batch_tx[0], batch_ty[0])
            # keep_var=1 disables dropout at inference time.
            test_output = sess.run(pred, feed_dict={x: batch_tx, keep_var: 1})
            # print(test_output[0])
            MAP = mean_average_precision(test_output, batch_ty)
            test_map_global += MAP
            test_count += 1
        test_map_global /= test_count
        print("Global Test Accuracy = {:.4f}".format(test_map_global))
        # Load one image
        img = cv2.imread('saved_models/images_tests/yellow.jpg')
        img = cv2.resize(img, (227, 227))
        img = img.astype(np.float32)
        # Subtract fixed per-channel means -- presumably the means used at
        # training time; confirm against the training pipeline.
        img -= np.array([104., 117., 124.])
        print(img)
        test_output = sess.run(pred, feed_dict={
            x: np.reshape(img, (1, 227, 227, 3)),
            keep_var: 1
        })
        score_dict = {}
        for score, genre in zip(test_output[0], genres):
            score_dict[genre] = score
        # Highest-scoring genres first.
        print(list(reversed(sorted(score_dict.items(), key=lambda x: x[1]))))
def predict(list_sentences, output_file, experiment_name, step='',
            list_labels=None):
    """Predict labels for `list_sentences` with a restored checkpoint and
    write a CSV report to `output_file`.

    step: checkpoint step suffix; empty string selects the latest checkpoint.
    list_labels: optional ground-truth labels; when given, a classification
        report is logged and a 'label' column is added to the CSV.
    """
    # Fix: the original used the mutable default `list_labels=[]`; use the
    # None-sentinel idiom instead (same behavior for all callers).
    if list_labels is None:
        list_labels = []
    dataset_manager = DatasetManager()
    dataset_manager.boot()
    list_preprocessed_sentences = preprocessor.preprocess(list_sentences)
    list_vecs = dataset_manager.text2vec.doc_to_vec(
        list_preprocessed_sentences)
    # Pad/trim every vector to the model's fixed sentence length.
    list_vecs = dataset_manager.equalize_vector_length_to_np(
        list_vectors=list_vecs, max_length=model_v1.SENTENCE_LENGTH_MAX)
    list_labels = dataset_manager.convert_labels_to_np(list_labels)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(
            os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                              experiment_name,
                                              'step-{}'.format(step))
    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)
            tf_input = gr.get_tensor_by_name('input/tf_input:0')
            tf_predictions = gr.get_tensor_by_name('prediction:0')
            prediction = sess.run(tf_predictions,
                                  feed_dict={tf_input: list_vecs})
            if len(list_labels) != 0:
                logging.info('-- Report for model: %s', experiment_name)
                logging.info(
                    classification_report(y_true=list_labels,
                                          y_pred=prediction))
            result_dict = dict()
            result_dict['sentence'] = list_sentences
            result_dict['pre-processed'] = list_preprocessed_sentences
            result_dict[
                'pre-processed_recover'] = dataset_manager.text2vec.vec_to_doc(
                    list_vecs)
            result_dict['predict'] = prediction
            if len(list_labels) != 0:
                result_dict['label'] = list_labels
            pd.DataFrame(result_dict).to_csv(output_file, index=None)
            logging.debug('Saved result at %s', output_file)
def test_should_get_dataset_unknown(self):
    """Requesting an unknown identifier raises IOError."""
    manager = DatasetManager("./tests/resources/local_data")
    with self.assertRaises(IOError):
        manager.get_dataset("unknown_test")