def _dump_dataset(self, dataset, suffix='', **kwargs):
    """
    Create a dump of the given dataset in secondary storage, appending the
    given suffix to the filename to identify the intermediate result.
    The dataset must be a tuple of four elements corresponding, respectively,
    to: train data, test data, train labels, test labels.

    :type dataset: tuple
    :param dataset: the object that represents the dataset.
    :param suffix: the string that identifies the intermediate step.
    :param kwargs: a dict that provides extra descriptive parameters of the given dataset.
    """
    dataset_dict = {
        SPECTRUM_KEY: self.ngrams_length,
        LABELS_KEY: self.considered_labels,
        INPUTS_PER_LABEL_KEY: self.inputs_per_label,
        TIME_KEY: self.time,
        TRAIN_LABELS_KEY: dataset[TRAIN_LABELS_POS],
        TEST_LABELS_KEY: dataset[TEST_LABELS_POS]
    }

    # RNN dumps keep their data splits in separate memory-mapped files,
    # so the raw train/test data is archived only for non-RNN dumps
    if suffix != RNN_SUFFIX:
        data_dict = {
            TRAIN_DATA_KEY: dataset[TRAIN_DATA_POS],
            TEST_DATA_KEY: dataset[TEST_DATA_POS]
        }
        dataset_dict.update(data_dict)

    if kwargs:
        # merge dicts (the second dict's values overwrite the first's on key conflicts)
        dataset_dict.update(kwargs)

    if not self.dump_basename:
        size = len(dataset[TRAIN_DATA_POS]) + len(dataset[TEST_DATA_POS])
        self.dump_basename = FILENAME_SEPARATOR.join(
            [str(self.time), str(self.ngrams_length), str(size)]
            + self.considered_labels)

    if suffix != '':
        dirname = FILENAME_SEPARATOR.join([self.dump_basename, suffix])
    else:
        dirname = self.dump_basename
    dirname = os.path.join(DATA_FOLDER, dirname)

    archive = klepto.archives.dir_archive(dirname, cached=True, serialized=True)
    for key, val in dataset_dict.items():
        archive[key] = val
    try:
        archive.dump()
    except MemoryError:
        print('The dataset dump %s has not been stored due to a memory error.' % dirname,
              file=sys.stderr)
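# A minimal, self-contained sketch of the klepto round-trip that _dump_dataset
# relies on: entries assigned to a dir_archive live in memory while
# cached=True and hit the disk only on dump(). Illustrative only; the path
# '/tmp/demo_archive' and the toy keys are assumptions, not project names.
import klepto

archive = klepto.archives.dir_archive('/tmp/demo_archive', cached=True, serialized=True)
archive['ngrams_length'] = 3        # held in the in-memory cache for now
archive['labels'] = ['GOOD', 'BAD']
archive.dump()                      # serialized to disk only at this point

restored = klepto.archives.dir_archive('/tmp/demo_archive', cached=True, serialized=True)
restored.load()                     # pull the serialized entries back into memory
assert restored['ngrams_length'] == 3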
def _get_glove_embedded_data(self, data, train_size):
    """
    Compute two matrices whose rows are the GloVe embeddings of the sequences
    in the train and test splits, respectively. Each sequence embedding is
    the sequence of the embeddings of its n-grams.

    :param data: a list of input data.
    :param train_size: the size of the training split.
    :return: the GloVe embeddings matrices for the train and test splits,
        respectively, and the maximum sequence length.
    """
    # the length of the longest sequence defines the padded width
    max_cols_num = len(max(data, key=len))

    train_filename = os.path.join(
        DATA_FOLDER,
        FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TRAIN_SUFFIX]))
    test_filename = os.path.join(
        DATA_FOLDER,
        FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TEST_SUFFIX]))

    # memory-mapped matrices keep the (potentially huge) embeddings on disk
    glove_matrix_train = np.memmap(train_filename, dtype='float32', mode='w+',
                                   shape=(train_size, max_cols_num, EMBEDDING_SIZE))
    glove_matrix_test = np.memmap(test_filename, dtype='float32', mode='w+',
                                  shape=(len(data) - train_size, max_cols_num, EMBEDDING_SIZE))

    glove_model = self._train_glove_model(data)

    # build the sequence embeddings and partition the dataset into train and test splits
    for idx, shingle_list in enumerate(data):
        embeddings = [
            glove_model.embedding_for(shingle) for shingle in shingle_list
        ]

        # pad the sequence up to the maximum length
        padding_length = max_cols_num - len(embeddings)
        embeddings += [[PADDING_VALUE] * EMBEDDING_SIZE] * padding_length

        if idx < train_size:
            glove_matrix_train[idx] = np.asarray(embeddings)
        else:
            glove_matrix_test[idx - train_size] = np.asarray(embeddings)

    # flush once, after all rows have been written
    glove_matrix_train.flush()
    glove_matrix_test.flush()

    return glove_matrix_train, glove_matrix_test, max_cols_num
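# A toy illustration of the padding scheme used above (the values and sizes
# are assumptions, not the project's constants): variable-length lists of
# fixed-size embedding vectors are right-padded to a common length so they
# fit one rectangular (sequences, timesteps, features) array.
import numpy as np

EMBEDDING_SIZE = 2
PADDING_VALUE = 0.0
sequences = [[[0.1, 0.2]], [[0.3, 0.4], [0.5, 0.6]]]   # lengths 1 and 2

max_len = len(max(sequences, key=len))
padded = []
for seq in sequences:
    seq = seq + [[PADDING_VALUE] * EMBEDDING_SIZE] * (max_len - len(seq))
    padded.append(seq)

batch = np.asarray(padded, dtype='float32')
assert batch.shape == (2, max_len, EMBEDDING_SIZE)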
@staticmethod
def _load_dataset(cached_dataset, suffix=None):
    """
    Load a dataset in memory from a dump in secondary storage, identified by
    the given filename and an optional suffix (to identify the intermediate
    result).

    :param cached_dataset: the filename of the dataset.
    :param suffix: the string that identifies the intermediate step.
    :return: the object that represents the dataset.
    """
    if suffix:
        dirname = FILENAME_SEPARATOR.join([cached_dataset, suffix])
    else:
        dirname = cached_dataset
    dirname = os.path.join(DATA_FOLDER, dirname)

    if not os.path.isdir(dirname):
        raise FileNotFoundError(dirname)

    archive = klepto.archives.dir_archive(dirname, cached=True, serialized=True)
    archive.load()
    return archive
def main(considered_labels=None, cached_dataset=None, inputs_per_label=1000, ngrams_length=3):
    # retrieve input data from the database
    clf_input = SequenceClassifierInput(considered_labels=considered_labels,
                                        cached_dataset=cached_dataset,
                                        inputs_per_label=inputs_per_label,
                                        ngrams_length=ngrams_length)
    train_data, test_data, train_labels, test_labels = clf_input.get_rnn_train_test_data()

    """
    INITIALIZE COMPUTATION GRAPH
    """
    sequence_max_length = len(train_data[0])
    frame_dimension = len(train_data[0][0])

    # the number of sequences (i.e. the batch size) is defined at runtime
    data = tf.placeholder(tf.float32, [None, sequence_max_length, frame_dimension])
    target = tf.placeholder(tf.float32, [None, clf_input.labels_num])
    dropout_keep_prob = tf.placeholder(tf.float32)
    model = RNNSequenceClassifier(data, target, dropout_keep_prob)

    # to save and restore variables after training
    saver = tf.train.Saver()

    # start session
    start_time = time.time()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    train_size = len(train_data)
    # MINI_BATCH_SIZE is expressed as a fraction of the training set size
    indices_num = int(MINI_BATCH_SIZE * train_size)
    errors = []

    print('Inputs per label: {0}'.format(clf_input.inputs_per_label))
    print('Neurons per layer: {0}'.format(NEURONS_NUM))
    print('Dropout keep prob: {0}'.format(DROPOUT_KEEP_PROB))

    for epoch in range(EPOCHS_NUM):
        print('Epoch {:2d}'.format(epoch + 1))
        for step in range(STEPS_NUM):
            print('\tstep {:3d}'.format(step + 1))

            # sample a random mini-batch from the training split
            rand_index = np.random.choice(train_size, indices_num)
            mini_batch_xs = train_data[rand_index]
            mini_batch_ys = train_labels[rand_index]
            sess.run(model.optimize, {
                data: mini_batch_xs,
                target: mini_batch_ys,
                dropout_keep_prob: DROPOUT_KEEP_PROB
            })

            # dropout_keep_prob is set to 1 (i.e. keep all units) only for testing
            error = sess.run(model.error, {
                data: test_data,
                target: test_labels,
                dropout_keep_prob: 1
            })
            error_percentage = 100 * error
            errors.append(error)
            print('\taccuracy: {:3.1f}% \n\terror: {:3.1f}%'.format(
                100 - error_percentage, error_percentage))

    elapsed_time = time.time() - start_time
    print('RNN running time:', timedelta(seconds=elapsed_time))

    # save model variables
    model_checkpoint_time = str(int(time.time()))
    model_checkpoint_dir = os.path.join(TRAINED_MODELS_FOLDER, model_checkpoint_time)
    if not os.path.exists(model_checkpoint_dir):
        os.makedirs(model_checkpoint_dir)
    saver.save(sess,
               os.path.join(model_checkpoint_dir, model_checkpoint_time) + TF_MODEL_EXT)

    """
    PLOT ERROR FUNCTION
    """
    _, fig_basename = unique_filename(
        os.path.join(model_checkpoint_dir, clf_input.dump_basename))
    fig = fig_basename + IMG_EXT
    fig_zoom = FILENAME_SEPARATOR.join([fig_basename, 'zoom']) + IMG_EXT
    fig_avg = FILENAME_SEPARATOR.join([fig_basename, 'avg']) + IMG_EXT

    measures_num = EPOCHS_NUM * STEPS_NUM

    # full error curve, with the y-axis fixed to [0, 1]
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.axis([1, measures_num, 0, 1])
    plt.savefig(fig, bbox_inches='tight')

    # same curve, with the y-axis auto-scaled (zoomed)
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.savefig(fig_zoom, bbox_inches='tight')

    # group the step errors of each epoch and plot the average error per epoch
    plt.figure()
    plt.plot(range(1, EPOCHS_NUM + 1),
             [sum(group) / STEPS_NUM for group in zip(*[iter(errors)] * STEPS_NUM)])
    plt.savefig(fig_avg, bbox_inches='tight')
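# The epoch-averaging line above uses the "grouper" idiom: zip(*[iter(xs)] * n)
# consumes a single shared iterator n slots at a time, chunking a flat list
# into fixed-size groups. A tiny illustration with assumed numbers:
errors = [0.5, 0.25, 0.25, 0.75]   # 2 epochs x 2 steps
STEPS_NUM = 2

groups = list(zip(*[iter(errors)] * STEPS_NUM))
assert groups == [(0.5, 0.25), (0.25, 0.75)]

epoch_averages = [sum(group) / STEPS_NUM for group in groups]
assert epoch_averages == [0.375, 0.5]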
def get_rnn_train_test_data(self):
    """
    Create training and test splits (data and corresponding labels) for the RNN.

    :return: the dataset splits to be fed to the RNN.
    """
    if self.sequences and self.labels:
        train_data, test_data, train_labels, test_labels = train_test_split(
            self.sequences, self.labels,
            test_size=self.test_size,
            random_state=self.random_state)
        self.time = int(time.time())
        self._dump_dataset((train_data, test_data, train_labels, test_labels))
    elif self.cached_dataset:
        # return the cached intermediate dataset, if it exists
        try:
            dataset_dict = self._load_dataset(self.cached_dataset, suffix=RNN_SUFFIX)
            train_filename = os.path.join(
                DATA_FOLDER,
                FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TRAIN_SUFFIX]))
            test_filename = os.path.join(
                DATA_FOLDER,
                FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TEST_SUFFIX]))

            # retrieve the shapes of the train and test data splits
            train_size = len(dataset_dict[TRAIN_LABELS_KEY])
            test_size = len(dataset_dict[TEST_LABELS_KEY])
            max_cols_num = dataset_dict[MAX_COLS_NUM_KEY]
            glove_embedding_size = dataset_dict[GLOVE_EMBEDDING_SIZE_KEY]

            # restore the memory mappings for the train and test data splits
            glove_matrix_train = np.memmap(train_filename, dtype='float32', mode='r+',
                                           shape=(train_size, max_cols_num,
                                                  glove_embedding_size))
            glove_matrix_test = np.memmap(test_filename, dtype='float32', mode='r+',
                                          shape=(test_size, max_cols_num,
                                                 glove_embedding_size))
            return (glove_matrix_train, glove_matrix_test,
                    dataset_dict[TRAIN_LABELS_KEY], dataset_dict[TEST_LABELS_KEY])
        except FileNotFoundError:
            # no intermediate dump: fall back to the base cached dataset
            dataset_dict = self._load_dataset(self.cached_dataset)
            train_data = dataset_dict[TRAIN_DATA_KEY]
            test_data = dataset_dict[TEST_DATA_KEY]
            train_labels = dataset_dict[TRAIN_LABELS_KEY]
            test_labels = dataset_dict[TEST_LABELS_KEY]
    else:
        train_data, test_data, train_labels, test_labels = \
            self._get_training_inputs_by_labels()

    train_size = len(train_data)
    data = self._preprocess_data(train_data + test_data)
    train_labels = np.asarray(self._labels_to_prob_vectors(train_labels))
    test_labels = np.asarray(self._labels_to_prob_vectors(test_labels))

    # perform data embedding through the GloVe model
    train_data, test_data, max_cols_num = self._get_glove_embedded_data(data, train_size)

    split_dataset = (train_data, test_data, train_labels, test_labels)
    self._dump_dataset(split_dataset,
                       suffix=RNN_SUFFIX,
                       glove_embedding_size=EMBEDDING_SIZE,
                       max_cols_num=max_cols_num)
    return split_dataset
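# Why the dump above stores max_cols_num and glove_embedding_size: a memmap
# file is just raw bytes, so restoring it requires the exact dtype and shape
# it was written with. A minimal sketch with assumed values and path:
import numpy as np

m = np.memmap('/tmp/demo.dat', dtype='float32', mode='w+', shape=(2, 3, 4))
m[:] = 1.0
m.flush()
del m                                    # close the write mapping

restored = np.memmap('/tmp/demo.dat', dtype='float32', mode='r+', shape=(2, 3, 4))
assert restored[1, 2, 3] == 1.0          # a wrong shape would fail or misread the bytes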
print('Loading validation data...')
clf_input = SequenceClassifierInput(cached_dataset='1561471958_3_26387_GOOD')
train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # labels are ignored

# SequenceClassifierInput splits the dataset into train and test by default;
# we join the two splits to perform validation on the whole dataset.
validation_data = train_data + test_data

# filter out short sequences from the dataset
print('Filtering dataset...')
filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
print('\tFiltered dataset size:', str(len(validation_data)))

# compute predictions and show stats
print('Computing predictions...')
start_time = time.time()
predictions = clf.predict(validation_data)
elapsed_time = time.time() - start_time

total_sequences_num = len(validation_data)
good_sequences_num = sum(1 for p in predictions if p == 1)  # count positive predictions
print('\tTime:', timedelta(seconds=elapsed_time))
print('\tFraction of good sequences: {:3.1f}%'.format(
    good_sequences_num / total_sequences_num * 100))

# dump results
print('Dumping predictions...')
predictions_info = [str(int(time.time())), 'l_min', str(NOISE_THRESHOLD), 'predictions']
predictions_filename = os.path.join(DATA_FOLDER,
                                    FILENAME_SEPARATOR.join(predictions_info) + PICKLE_EXT)
with open(predictions_filename, 'wb') as dump:
    pickle.dump(predictions, dump)
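# The dumped predictions can later be restored with the matching pickle call;
# a minimal sketch (predictions_filename is whatever path was built at dump time):
import pickle

with open(predictions_filename, 'rb') as dump:
    predictions = pickle.load(dump)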