def get_mat_id(self, mat_id_name='mat_id'):
    """
    Get material ID numbers of the underlying mesh elements.
    """
    if self.source is not None:
        dm = DatasetManager(dataset=self.source.outputs[0])

        mat_id = dm.cell_scalars[mat_id_name]

        return mat_id

def test_should_print_ascii(self):
    self.maxDiff = None
    result = """+---------------------+------------+-----------------------------------------------------------------------------+
| description         | identifier | source                                                                      |
+---------------------+------------+-----------------------------------------------------------------------------+
| my little dataset   | one_test   | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv |
| my little dataset 2 | two_test   | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv |
+---------------------+------------+-----------------------------------------------------------------------------+"""
    data = DatasetManager("./tests/resources/multiple_data")
    printer = Printer(data.get_datasets())
    self.assertEqual(result, printer.__repr__())

def test_should_read_yaml_from_dir(self):
    expected = {
        "one_test": {
            "source": "http://source/teste",
            "description": "my little dataset"
        }
    }
    data = DatasetManager("./tests/resources/one_data")
    self.assertDictEqual(data.get_datasets(), expected)

def test_should_get_dataset(self):
    data = DatasetManager("./tests/resources/local_data")
    dataset = {
        "local_test": {
            "source": "./tests/resources/local_data/train.csv",
            "description": "my little dataset local"
        }
    }
    self.assertDictEqual(data.get_dataset("local_test"),
                         dataset.get("local_test"))

def main():
    dataset_manager = DatasetManager()
    epochs = 10
    output_dir = os.path.abspath("workspace")
    model = MaskRCNN(output_dir)
    ratio = 0.2

    datasetA, datasetB = dataset_manager.split_dataset('ade20k_train',
                                                       ratio=ratio)
    weights = model.train(datasetA, weights=None, epochs=epochs)

    small_datasetB = dataset_manager.random_subset(datasetB, 1000)
    result = model.predict(small_datasetB, weights)

def test_should_remove_dataset(self):
    data = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name"
    dataset = {
        "identifier": identifier,
        "description": "description",
        "source": "/tmp/test.csv"
    }
    data.create_dataset(**dataset)

    self.assertTrue(
        os.path.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    self.assertEqual(len(os.listdir(self.trash_dir)), 2)

    data.remove_dataset(identifier)

    self.assertFalse(
        os.path.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    self.assertEqual(len(os.listdir(self.trash_dir)), 1)

def setUp(self):
    with open('training_set_list.pickle', 'rb') as handle:
        self.training_dict = pickle.load(handle)
    with open('validation_set_list.pickle', 'rb') as handle:
        self.validation_dict = pickle.load(handle)
    with open('test_set_list.pickle', 'rb') as handle:
        self.test_dict = pickle.load(handle)
    with open('genres.json') as json_data:
        self.genres = json.load(json_data)
    with open('labels.json') as json_data:
        self.dataset = json.load(json_data)

    self.dataset_manager = DatasetManager(self.training_dict,
                                          self.validation_dict,
                                          self.test_dict, self.genres,
                                          self.dataset)
    self.batch_size = 50

def evaluate(experiment_name, step=''):
    logging.info('*' * 50)
    logging.info('RUNNING EVALUATION FOR MODEL: %s', experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(
            os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                              experiment_name,
                                              'step-{}'.format(step))

    dataset_manager = DatasetManager()
    dataset_manager.boot()

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)

            list_predictions = []
            list_labels = []
            for docs, labels in dataset_manager.get_test_by_batch(
                    batch_size=FLAGS.BATCH_SIZE):
                tf_input = gr.get_tensor_by_name('input/tf_input:0')
                tf_predictions = gr.get_tensor_by_name('prediction:0')
                prediction = sess.run(tf_predictions,
                                      feed_dict={tf_input: docs})
                list_predictions.extend(prediction)
                list_labels.extend(labels)
                logging.debug('-- Prediction length: %s/%s',
                              len(list_predictions),
                              dataset_manager.test_y.shape[0])

            logging.info('-- Report for model: %s', experiment_name)
            logging.info(
                classification_report(y_true=list_labels,
                                      y_pred=list_predictions,
                                      digits=4))
            logging.info(
                confusion_matrix(y_true=list_labels, y_pred=list_predictions))

def add_subdomains_surface(obj, position, mat_id_name='mat_id',
                           threshold_limits=(None, None), **kwargs):
    dm = DatasetManager(dataset=obj.outputs[0])
    mat_id = dm.cell_scalars[mat_id_name]
    rm = mat_id.min(), mat_id.max()

    active = mlab.pipeline.set_active_attribute(obj)
    active.cell_scalars_name = mat_id_name

    aa = mlab.pipeline.set_active_attribute(obj)
    aa.cell_scalars_name = mat_id_name

    threshold = mlab.pipeline.threshold(aa)
    threshold.threshold_filter.progress = 1.0
    if threshold_limits[0] is not None:
        threshold.lower_threshold = threshold_limits[0] + 0.1
    if threshold_limits[1] is not None:
        threshold.upper_threshold = threshold_limits[1] - 0.1
    threshold.auto_reset_lower = False
    threshold.auto_reset_upper = False

    surface = mlab.pipeline.surface(threshold, opacity=0.3)
    surface.actor.actor.position = position

    module_manager = surface.parent
    lm = module_manager.scalar_lut_manager
    lm.lut_mode = 'Blues'
    if (rm[1] - rm[0]) == 1:
        lm.reverse_lut = True

    surface2 = mlab.pipeline.surface(active, opacity=0.2)
    surface2.actor.actor.position = position

    module_manager = surface2.parent
    module_manager.scalar_lut_manager.lut_mode = 'Blues'

    return surface, surface2

def main():
    output_dir = os.path.abspath("workspace")
    dataset_manager = DatasetManager()
    model = MaskRCNN(output_dir)
    ratio = 0.2

    datasetA, datasetB = dataset_manager.split_dataset('ade20k_train',
                                                       ratio=ratio)
    weights = model.train(datasetA, weights=None, epochs=10)

    while True:
        small_datasetB = dataset_manager.random_subset(datasetB, 100)
        predictions = model.predict(small_datasetB, weights)
        annotations = annotator.filter(predictions)
        new_dataset = dataset_manager.create_dataset_with_new_annotations(
            datasetB, annotations)
        weights = model.train(new_dataset, weights, epochs=1)

def test_should_create_dataset_with_custom_data(self):
    data = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name_custom"
    dataset = {
        "identifier": identifier,
        "description": "description",
        "source": "/tmp/test.csv"
    }
    data.create_dataset(**dataset)

    self.assertTrue(
        self.os.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    self.assertEqual(len(os.listdir(self.trash_dir)), 2)

    loaded_dataset = data.get_datasets()
    self.assertEqual(list(loaded_dataset.keys()), [identifier])

    datasource_configs = loaded_dataset.get(identifier)
    self.assertEqual(datasource_configs["description"],
                     dataset["description"])
    self.assertEqual(datasource_configs["source"], dataset["source"])

def test_should_read_multiple_yaml_from_dir(self):
    expected = {
        "one_test": {
            "source": "https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv",
            "description": "my little dataset"
        },
        "two_test": {
            "source": "https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv",
            "description": "my little dataset 2"
        }
    }
    data = DatasetManager("./tests/resources/multiple_data", fs=self.os)
    result = list(data.get_datasets().keys())
    result.sort()
    expected = ["one_test", "two_test"]
    self.assertListEqual(expected, result)

def predict(list_docs, experiment_name, step='', batch_size=64):
    logging.info('*' * 50)
    logging.info('RUNNING PREDICT FOR MODEL: %s', experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(
            os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                              experiment_name,
                                              'step-{}'.format(step))

    dataset_manager = DatasetManager()
    dataset_manager.boot()

    list_preprocessed_sentences = preprocessor.preprocess(list_docs)
    list_vecs = dataset_manager.text2vec.doc_to_vec(list_preprocessed_sentences)
    print(dataset_manager.text2vec.vec_to_doc(list_vecs))
    list_vecs = dataset_manager.equalize_vector_length_to_np(
        list_vectors=list_vecs, max_length=model_v6.SENTENCE_LENGTH_MAX)

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)

            list_predictions = []
            num_steps = len(list_vecs) // batch_size
            logging.info('There will be %s steps', num_steps + 1)
            for i in range(num_steps + 1):
                tf_input = gr.get_tensor_by_name('input/tf_input:0')
                tf_predictions = gr.get_tensor_by_name('prediction:0')
                prediction = sess.run(
                    tf_predictions,
                    feed_dict={
                        tf_input: list_vecs[i * batch_size:(i + 1) * batch_size]
                    })
                list_predictions.extend(
                    [dataset_manager.LABEL_UNMAPPING[p] for p in prediction])

    return list_predictions

def test_should_print_html(self):
    self.maxDiff = None
    result = """<table>
<tr>
<th>description</th>
<th>identifier</th>
<th>source</th>
</tr>
<tr>
<td>my little dataset</td>
<td>one_test</td>
<td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td>
</tr>
<tr>
<td>my little dataset 2</td>
<td>two_test</td>
<td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td>
</tr>
</table>"""
    data = DatasetManager("./tests/resources/multiple_data")
    printer = Printer(data.get_datasets())
    self.assertEqual(result, printer._repr_html_())

def test_should_create_dataset(self):
    data = DatasetManager(self.trash_dir, fs=self.os)
    identifier = "data_name"
    dataset = {
        "identifier": identifier,
        "description": "description",
        "source": "/tmp/test.csv",
    }
    data.create_dataset(**dataset)

    loaded_datasets = data.get_datasets()
    dataset_config = loaded_datasets.get(identifier)

    self.assertTrue(
        self.os.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))
    self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)
    self.assertEqual(list(loaded_datasets.keys())[0], identifier)
    self.assertEqual(dataset_config.get("description"),
                     dataset["description"])
    self.assertEqual(dataset_config.get("source"), dataset["source"])

def create_source(self):
    """
    Create a VTK source from data in a SfePy-supported file.

    Notes
    -----
    All data need to be set here, otherwise time stepping will not work
    properly - data added by user later will be thrown away on time step
    change.
    """
    if self.io is None:
        self.read_common(self.filename)

    dataset = self.create_dataset()

    try:
        out = self.io.read_data(self.step)
    except ValueError:
        out = None

    if out is not None:
        self.add_data_to_dataset(dataset, out)

    if self.mat_id_name is not None:
        mat_id = self.mesh.cmesh.cell_groups
        if self.single_color:
            rm = mat_id.min(), mat_id.max()
            mat_id[mat_id > rm[0]] = rm[1]

        dm = DatasetManager(dataset=dataset)
        dm.add_array(mat_id, self.mat_id_name, 'cell')

    src = VTKDataSource(data=dataset)
    # src.print_traits()
    # debug()
    return src

def test_should_get_dataset_unknown(self):
    data = DatasetManager("./tests/resources/local_data")
    with self.assertRaises(IOError):
        data.get_dataset("unknown_test")

def main():
    # Load dataset manager
    with open('training_set_list.pickle', 'rb') as handle:
        training_set = pickle.load(handle)
    with open('validation_set_list.pickle', 'rb') as handle:
        validation_set = pickle.load(handle)
    with open('test_set_list.pickle', 'rb') as handle:
        test_set = pickle.load(handle)
    with open('genres.json') as json_data:
        genres = json.load(json_data)
    with open('labels.json') as json_data:
        labels = json.load(json_data)

    dataset_manager = DatasetManager(training_set, validation_set, test_set,
                                     genres, labels)

    batch_size = 1
    n_classes = 26

    # Graph input
    x = tf.placeholder(tf.float32, [batch_size, 227, 227, 3])
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_var = tf.placeholder(tf.float32)

    # Model: definition of the network architecture
    pred = Model.alexnet(x, keep_var)

    # Loss and optimizer are not needed here, since this script only runs inference.

    # Init
    init = tf.global_variables_initializer()

    # Initialize a saver to restore model checkpoints
    saver = tf.train.Saver()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        # Load pretrained model
        # Skip weights from fc8 (fine-tuning)
        # load_with_skip('pretrained_alexnet.npy', sess, ['fc8'])
        # saver.restore(sess, "saved_models/MSE_without_data_augmentation_0.75_0.001/film_genre_model.ckpt")
        saver.restore(
            sess, "saved_models/models/model_dropout05_mean_square_error.ckpt")
        print('Model Restored')

        test_map_global = 0.
        test_count = 0
        # Test accuracy by groups of batch_size images
        for _ in range(int(len(dataset_manager.test_list) / batch_size) + 1):
            batch_tx, batch_ty = dataset_manager.next_batch(batch_size, 'test')
            # print(batch_tx[0], batch_ty[0])
            test_output = sess.run(pred, feed_dict={x: batch_tx, keep_var: 1})
            # print(test_output[0])
            MAP = mean_average_precision(test_output, batch_ty)
            test_map_global += MAP
            test_count += 1
        test_map_global /= test_count
        print("Global Test Accuracy = {:.4f}".format(test_map_global))

        # Load one image
        img = cv2.imread('saved_models/images_tests/yellow.jpg')
        img = cv2.resize(img, (227, 227))
        img = img.astype(np.float32)
        img -= np.array([104., 117., 124.])
        print(img)
        test_output = sess.run(pred,
                               feed_dict={
                                   x: np.reshape(img, (1, 227, 227, 3)),
                                   keep_var: 1
                               })
        score_dict = {}
        for score, genre in zip(test_output[0], genres):
            score_dict[genre] = score
        print(list(reversed(sorted(score_dict.items(), key=lambda x: x[1]))))

def add_data_to_dataset(self, dataset, data):
    """Add point and cell data to the dataset."""
    dim = self.dim
    sym = (dim + 1) * dim // 2

    dm = DatasetManager(dataset=dataset)

    for key, val in data.items():
        vd = val.data
        # print(vd.shape)
        if val.mode == 'vertex':
            if vd.shape[1] == 1:
                aux = vd.reshape((vd.shape[0],))

            elif vd.shape[1] == 2:
                zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                aux = nm.c_[vd, zz]

            elif vd.shape[1] == 3:
                aux = vd

            else:
                raise ValueError('unknown vertex data format! (%s)'
                                 % str(vd.shape))

            dm.add_array(aux, key, 'point')

        elif val.mode == 'cell':
            ne, aux, nr, nc = vd.shape
            if (nr == 1) and (nc == 1):
                aux = vd.reshape((ne,))

            elif (nr == dim) and (nc == 1):
                if dim == 3:
                    aux = vd.reshape((ne, dim))
                else:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                    aux = nm.c_[vd.squeeze(), zz]

            elif (((nr == sym) or (nr == (dim * dim))) and (nc == 1)) \
                 or ((nr == dim) and (nc == dim)):
                vd = vd.squeeze()

                if dim == 3:
                    if nr == sym:
                        aux = vd[:, [0, 3, 4, 3, 1, 5, 4, 5, 2]]
                    elif nr == (dim * dim):
                        aux = vd[:, [0, 3, 4, 6, 1, 5, 7, 8, 2]]
                    else:
                        aux = vd.reshape((vd.shape[0], dim * dim))
                else:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                    if nr == sym:
                        aux = nm.c_[vd[:, [0, 2]], zz, vd[:, [2, 1]],
                                    zz, zz, zz, zz]
                    elif nr == (dim * dim):
                        aux = nm.c_[vd[:, [0, 2]], zz, vd[:, [3, 1]],
                                    zz, zz, zz, zz]
                    else:
                        aux = nm.c_[vd[:, 0, [0, 1]], zz, vd[:, 1, [0, 1]],
                                    zz, zz, zz, zz]

            dm.add_array(aux, key, 'cell')

from flask import Flask, send_file
from flask_restful import Resource, Api, reqparse
from flask_cors import CORS
import numpy as np

from data_backend import Dataset as HDF_Dataset
from dataset_manager import DatasetManager
from thumbnailer import Thumbnailer
from utils import merge_overlapping_filters

DATASET_PATH = "./datasets"
dataset_manager = DatasetManager(DATASET_PATH)
API_BASE_STR = "/api/v1"

# Init thumbnails (clean directory)
thumbnailer = Thumbnailer("./thumbnails")
thumbnailer.clean()

dataset_list = []
for dset_index, name in enumerate(dataset_manager.get_dataset_names()):
    dset = HDF_Dataset(DATASET_PATH, name)
    dataset_list.append({
        "id": dset_index,
        "name": name,
        "device": {
            "name": dset.device.name,
            "version": dset.device.version
        },
        "subsets": [
            {
                "id": subset_index,
                "name": subset,

def run(experiment_name):
    with tf.Graph().as_default() as gr:
        with tf.variable_scope('input'):
            tf_input = tf.placeholder(dtype=tf.int32,
                                      shape=[None, model.SENTENCE_LENGTH_MAX],
                                      name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32, shape=[None],
                                       name='tf_labels')

        tf_logits = model.inference(tf_input)
        tf_loss = model.loss(tf_logits, tf_labels)
        tf_optimizer, tf_global_step = model.optimize(tf_loss)
        model.measure_acc(tf_logits, tf_labels)

        tf_all_summary = tf.summary.merge_all()
        tf_train_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'train_' + experiment_name),
            graph=gr)
        tf_test_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'test_' + experiment_name),
            graph=gr)
        tf_embedding_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'checkpoint', experiment_name))

        # Visual word embedding
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = 'embedding/word_embeddings'  # Reference model_v6.py
        embedding.metadata_path = os.path.join(CURRENT_DIR, 'data',
                                               DatasetManager.VOCAB_FILE)
        projector.visualize_embeddings(tf_embedding_writer, config)

        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=0.03)
        logging.info('Graph size: %s', utils.count_trainable_variables())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)).as_default(
        ) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            dataset_manager = DatasetManager()
            dataset_manager.boot()
            for docs, labels in dataset_manager.get_batch(
                    batch_size=FLAGS.BATCH_SIZE,
                    number_epochs=FLAGS.NUMBER_EPOCHS):
                _, global_step = sess.run([tf_optimizer, tf_global_step],
                                          feed_dict={
                                              tf_input: docs,
                                              tf_labels: labels
                                          })

                summary_interval_step = 10
                if global_step % summary_interval_step == 0:
                    logging.debug('Global step: %s', global_step)
                    train_summary_data = sess.run(tf_all_summary,
                                                  feed_dict={
                                                      tf_input: docs,
                                                      tf_labels: labels
                                                  })
                    tf_train_writer.add_summary(train_summary_data,
                                                global_step=global_step)

                if global_step % summary_interval_step == 0:
                    docs_test, labels_test = dataset_manager.get_test_set(
                        FLAGS.TEST_SIZE, is_shuffled=True)
                    test_summary_data = sess.run(tf_all_summary,
                                                 feed_dict={
                                                     tf_input: docs_test,
                                                     tf_labels: labels_test
                                                 })
                    tf_test_writer.add_summary(test_summary_data,
                                               global_step=global_step)

                if global_step % 200 == 0:
                    path_to_save = os.path.join(CURRENT_DIR, '..',
                                                'checkpoint', experiment_name)
                    if not os.path.exists(path_to_save):
                        os.makedirs(path_to_save)
                    saved_file = saver.save(sess,
                                            save_path=os.path.join(
                                                path_to_save, 'step'),
                                            global_step=global_step,
                                            write_meta_graph=True)
                    logging.debug('Saving model at %s', saved_file)

def run(experiment_name):
    BEST_THRES = 3
    WORST_THRES = 3
    POPULATION_STEPS = 500
    ITERATIONS = 100
    POPULATION_SIZE = 10

    accuracy_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    l1_scale_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    best_accuracy_hist = np.zeros((POPULATION_STEPS,))
    best_l1_scale_hist = np.zeros((POPULATION_STEPS,))

    with tf.Graph().as_default() as gr:
        with tf.variable_scope('input'):
            tf_input = tf.placeholder(
                dtype=tf.int32,
                shape=[None,
                       model_population_based_tunning.SENTENCE_LENGTH_MAX],
                name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32, shape=[None],
                                       name='tf_labels')

        models = [
            create_model(
                i, is_included_regularization=FLAGS.IS_INCLUDED_REGULARIZATION)
            for i in range(10)
        ]

        # It will help us with creation of a different scope_name for each model
        for index, model in enumerate(models):
            with tf.variable_scope(str(index)):
                model.boot(tf_input, tf_labels)

        logging.info('Graph size: %s', utils.count_trainable_variables())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)).as_default(
        ) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            dataset_manager = DatasetManager()
            dataset_manager.boot()
            dataset_generator = dataset_manager.get_batch(
                batch_size=FLAGS.BATCH_SIZE,
                number_epochs=10 * FLAGS.NUMBER_EPOCHS)

            for i in range(POPULATION_STEPS):
                # Copy best
                sess.run([
                    m.get_copy_from_op(models[0])
                    for m in models[-WORST_THRES:]
                ])
                # Perturb others
                sess.run([m.l1_scale_perturb_op for m in models[BEST_THRES:]])

                # Training
                for _ in range(ITERATIONS):
                    docs, labels = next(dataset_generator)
                    sess.run([m.tf_optimizer for m in models],
                             feed_dict={
                                 tf_input: docs,
                                 tf_labels: labels
                             })

                docs, labels = next(dataset_generator)
                # Evaluate
                l1_scales = sess.run({m: m.l1_scale for m in models})
                accuracies = sess.run({m: m.tf_acc for m in models},
                                      feed_dict={
                                          tf_input: docs,
                                          tf_labels: labels
                                      })
                models.sort(key=lambda m: accuracies[m], reverse=True)

                # Logging
                best_accuracy_hist[i] = accuracies[models[0]]
                best_l1_scale_hist[i] = l1_scales[models[0]]
                for m in models:
                    l1_scale_hist[m.model_id, i] = l1_scales[m]
                    accuracy_hist[m.model_id, i] = accuracies[m]

            # NumPy arrays are not JSON-serializable, so convert them to lists
            # before dumping.
            with open('temp', 'w') as output_f:
                json.dump(
                    {
                        'accuracy_hist': accuracy_hist.tolist(),
                        'l1_scale_hist': l1_scale_hist.tolist(),
                        'best_accuracy_hist': best_accuracy_hist.tolist(),
                        'best_l1_scale_hist': best_l1_scale_hist.tolist()
                    }, output_f)

import sys

from dataset_manager import DatasetManager
from gloss_lookup import GlossLookup

# Instantiate singleton classes
gl = GlossLookup()
dm = DatasetManager()


def menu():
    print()
    print(15 * '-', 'MAIN MENU', 15 * '-')
    print('[1] Gloss Lookup' + '\n[2] Dataset manager' + '\n[3] Exit program')
    print(41 * '-')


def glossLookup():
    loop = True
    while loop:
        gl.menu()
        answer = int(input('Enter the class (an integer between 0 and 999): '))
        if gl.wordExists(answer):
            print('Word: ' + gl.searchGlossary(answer))
            loop = False
        else:
            print('Invalid class. Please enter a valid class.')


def datasetManager():
    loop = True

params = "pd_fixed_trainratio80_outcome_all_data_singletask" #params = "lstmsize%s_dropout%s_shared%s_specialized%s"%(lstmsize, dropout, n_shared_layers, n_specialized_layers) checkpoint_prefix = os.path.join( output_dir, "checkpoints/model_%s_%s" % (dataset_name, params)) model_filename = glob.glob("%s*.hdf5" % checkpoint_prefix)[-1] #model_filename = "code/output_files/models/model_28-1.51.h5" results_file = os.path.join( output_dir, "evaluation_results/results_%s_%s_%s.csv" % (cls_method, dataset_name, params)) ##### MAIN PART ###### print('Preparing data...') start = time.time() dataset_manager = DatasetManager(dataset_name) data = dataset_manager.read_dataset() train, test = dataset_manager.split_data( data, train_ratio, split=data_split_type ) # to reproduce results of Tax et al., use 'ordered' instead of 'temporal' dt_train = dataset_manager.encode_data_with_label_all_data(train) dt_test = dataset_manager.encode_data_with_label_all_data(test) if normalize_over == "train": dataset_manager.calculate_divisors(dt_train) elif normalize_over == "all": dt_all = dataset_manager.extract_timestamp_features(data) dt_all = dataset_manager.extract_duration_features(dt_all) dataset_manager.calculate_divisors(dt_all) else:
def test_should_remove_unknown_dataset(self):
    data = DatasetManager("./tests/resources/local_data", fs=self.os)
    with self.assertRaises(IOError):
        data.remove_dataset("unknown_dataset")

def predict(list_sentences, output_file, experiment_name, step='',
            list_labels=[]):
    dataset_manager = DatasetManager()
    dataset_manager.boot()

    list_preprocessed_sentences = preprocessor.preprocess(list_sentences)
    list_vecs = dataset_manager.text2vec.doc_to_vec(
        list_preprocessed_sentences)
    list_vecs = dataset_manager.equalize_vector_length_to_np(
        list_vectors=list_vecs, max_length=model_v1.SENTENCE_LENGTH_MAX)
    list_labels = dataset_manager.convert_labels_to_np(list_labels)

    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(
            os.path.join(CURRENT_DIR, '..', 'checkpoint', experiment_name))
    else:
        interesting_checkpoint = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                              experiment_name,
                                              'step-{}'.format(step))

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)).as_default() as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)

            tf_input = gr.get_tensor_by_name('input/tf_input:0')
            tf_predictions = gr.get_tensor_by_name('prediction:0')
            prediction = sess.run(tf_predictions,
                                  feed_dict={tf_input: list_vecs})

            if len(list_labels) != 0:
                logging.info('-- Report for model: %s', experiment_name)
                logging.info(
                    classification_report(y_true=list_labels,
                                          y_pred=prediction))

            result_dict = dict()
            result_dict['sentence'] = list_sentences
            result_dict['pre-processed'] = list_preprocessed_sentences
            result_dict['pre-processed_recover'] = \
                dataset_manager.text2vec.vec_to_doc(list_vecs)
            result_dict['predict'] = prediction
            if len(list_labels) != 0:
                result_dict['label'] = list_labels

            pd.DataFrame(result_dict).to_csv(output_file, index=None)
            logging.debug('Saved result at %s', output_file)

def main():
    # Load dataset manager
    with open('training_set_list.pickle', 'rb') as handle:
        training_set = pickle.load(handle)
    with open('validation_set_list.pickle', 'rb') as handle:
        validation_set = pickle.load(handle)
    with open('test_set_list.pickle', 'rb') as handle:
        test_set = pickle.load(handle)
    with open('genres.json') as json_data:
        genres = json.load(json_data)
    with open('labels.json') as json_data:
        labels = json.load(json_data)

    log_file_name = str(datetime.now()) + '-logs.txt'
    with open("logs/" + log_file_name, 'w') as log_file:
        log_file.write('Training logs \n')

    iteration_file_name = str(datetime.now()) + '-iteration.txt'
    with open("logs/" + iteration_file_name, 'w') as log_file:
        log_file.write('Training iterations \n')

    best_iteration_file_name = str(datetime.now()) + '-best_iteration.txt'
    with open("logs/" + best_iteration_file_name, 'w') as log_file:
        log_file.write('Best Training iterations \n')

    dataset_manager = DatasetManager(training_set, validation_set, test_set,
                                     genres, labels)

    # Learning params
    learning_rate = 0.001
    batch_size = 50
    # Number of iterations
    training_iters = 1000
    # Display training information (loss, training accuracy, ...) every
    # local_train_step iterations
    local_train_step = 20
    # Validate every global_validation_step iterations
    global_validation_step = 25
    global_train_step = 100

    # Network params
    n_classes = 26
    keep_rate = 0.5  # for dropout

    # Graph input
    x = tf.placeholder(tf.float32, [batch_size, 227, 227, 3])
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_var = tf.placeholder(tf.float32)

    # Model: definition of the network architecture
    pred = Model.alexnet(x, keep_var)

    # Loss and optimizer
    # loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y))
    loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(y, pred))))
    # optimizer = tf.train.GradientDescentOptimizer(
    #     learning_rate=learning_rate).minimize(loss)
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate).minimize(loss)

    # Init
    init = tf.global_variables_initializer()

    # Initialize a saver to store model checkpoints
    saver = tf.train.Saver()

    # For early stopping
    max_validation_map = 0

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        # Load pretrained model
        # Skip weights from fc8 (fine-tuning)
        load_with_skip('pretrained_alexnet.npy', sess, ['fc8'])

        print('Start training.')
        step = 1
        while step < training_iters:
            # print("Iter ", step)
            with open("logs/" + iteration_file_name, 'a') as log_file:
                log_file.write("Iter {} \n".format(step))

            batch_xs, batch_ys = dataset_manager.next_batch(batch_size,
                                                            'train')
            sess.run(optimizer, feed_dict={
                x: batch_xs,
                y: batch_ys,
                keep_var: keep_rate
            })

            # Display on-batch training status
            if step % local_train_step == 0:
                local_train_output = sess.run(pred, feed_dict={
                    x: batch_xs,
                    keep_var: 1
                })
                MAP = mean_average_precision(local_train_output, batch_ys)
                batch_loss = sess.run(loss, feed_dict={
                    x: batch_xs,
                    y: batch_ys,
                    keep_var: 1.
                })
                with open("logs/" + log_file_name, 'a') as log_file:
                    log_file.write("Iter {} Training Loss = {:.4f}, "
                                   "Mean average precision = {:.4f} \n".format(
                                       step, batch_loss, MAP))

            # Display global training error
            if step % global_train_step == 0:
                train_map_global = 0.
                test_count = 0
                # Test accuracy by groups of batch_size images
                for _ in range(
                        int(len(dataset_manager.training_list) / batch_size)
                        + 1):
                    batch_tx, batch_ty = dataset_manager.next_batch(
                        batch_size, 'train')
                    test_output = sess.run(pred, feed_dict={
                        x: batch_tx,
                        keep_var: 1
                    })
                    MAP = mean_average_precision(test_output, batch_ty)
                    train_map_global += MAP
                    test_count += 1
                train_map_global /= test_count
                with open("logs/" + log_file_name, 'a') as log_file:
                    log_file.write(
                        "Global Training Accuracy = {:.4f} \n".format(
                            train_map_global))

            # Display global validation error
            if step % global_validation_step == 0:
                validation_map_global = 0.
                validation_count = 0
                # Test accuracy by groups of batch_size images
                for _ in range(
                        int(len(dataset_manager.test_list) / batch_size) + 1):
                    batch_tx, batch_ty = dataset_manager.next_batch(
                        batch_size, 'val')
                    test_output = sess.run(pred, feed_dict={
                        x: batch_tx,
                        keep_var: 1
                    })
                    MAP = mean_average_precision(test_output, batch_ty)
                    validation_map_global += MAP
                    validation_count += 1
                validation_map_global /= validation_count
                with open("logs/" + log_file_name, 'a') as log_file:
                    log_file.write(
                        "Iter {} Global Validation Accuracy = {:.4f} \n".
                        format(step, validation_map_global))

                if validation_map_global >= max_validation_map:
                    max_validation_map = validation_map_global
                    with open("logs/" + best_iteration_file_name,
                              'a') as log_file:
                        log_file.write("Iter {} \n".format(step))
                    # Save model
                    saver.save(
                        sess,
                        "saved_models/model_dropout05_mean_square_error.ckpt")

            step += 1

        # print("Finish!")
        with open("logs/finish", 'w') as finish_file:
            finish_file.write("Finish")