def build_sequence_clf_training_set(segmented_log, sensor_id_pos, ngrams_length):
    print('Building training set...')
    sequences = []
    labels = []
    for segment_ in segmented_log.segments:
        sequence = ''
        for c in segment_:
            sequence += c[sensor_id_pos]
        sequences.append(sequence)
        labels.append(GOOD_LABEL)

    clf_input = SequenceClassifierInput(sequences=sequences, labels=labels, ngrams_length=ngrams_length)
    train_data, *_ = clf_input.get_spectrum_train_test_data()

    # Return the max sequence (vector) length, needed to pad the validation set.
    return len(train_data[0])
def build_sequence_clf_validation_set(segmented_log, sensor_id_pos, ngrams_length, max_vector_length):
    print('Building validation set...')
    sequences = []
    labels = []
    for b_step in segmented_log.b_steps:
        # Compute the cartesian product of the two collections of segments in the current b step.
        for segment_ in b_step.segments:
            for compat_segment_ in b_step.compat_segments:
                sequence = ''
                for c in segment_ + compat_segment_:
                    sequence += c[sensor_id_pos]
                sequences.append(sequence)
                labels.append(GOOD_LABEL)

    clf_input = SequenceClassifierInput(sequences=sequences, labels=labels, ngrams_length=ngrams_length)
    # Pad the validation vectors to the max length of the training vectors.
    clf_input.get_spectrum_train_test_data(max_vector_length)
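# A minimal driver sketch (an assumption, not a script from this repo) showing
# how the two builders above chain: the max vector length returned by the
# training-set builder is what pads the validation vectors.
# `load_segmented_log`, LOG_PATH, SENSOR_ID_POS and NGRAMS_LENGTH are
# hypothetical placeholders.
if __name__ == '__main__':
    LOG_PATH = 'path/to/segmented_log'  # hypothetical
    SENSOR_ID_POS = 0
    NGRAMS_LENGTH = 3

    segmented_log = load_segmented_log(LOG_PATH)  # hypothetical loader
    max_vector_length = build_sequence_clf_training_set(segmented_log, SENSOR_ID_POS, NGRAMS_LENGTH)
    build_sequence_clf_validation_set(segmented_log, SENSOR_ID_POS, NGRAMS_LENGTH, max_vector_length)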
# Standard-library and third-party imports needed by main(); the project-level
# names used below (SequenceClassifierInput, RNNSequenceClassifier,
# unique_filename and the hyperparameter/path constants) are assumed to be
# imported from the repo's own modules, as in the other scripts.
import os
import time
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


def main(considered_labels=None, cached_dataset=None, inputs_per_label=1000, ngrams_length=3):
    # retrieve input data from database
    clf_input = SequenceClassifierInput(considered_labels=considered_labels,
                                        cached_dataset=cached_dataset,
                                        inputs_per_label=inputs_per_label,
                                        ngrams_length=ngrams_length)
    train_data, test_data, train_labels, test_labels = clf_input.get_rnn_train_test_data()

    """ INITIALIZE COMPUTATION GRAPH """
    sequence_max_length = len(train_data[0])
    frame_dimension = len(train_data[0][0])

    # number of sequences (i.e. batch_size) is defined at runtime
    data = tf.placeholder(tf.float32, [None, sequence_max_length, frame_dimension])
    target = tf.placeholder(tf.float32, [None, clf_input.labels_num])
    dropout_keep_prob = tf.placeholder(tf.float32)
    model = RNNSequenceClassifier(data, target, dropout_keep_prob)

    # to save and restore variables after training
    saver = tf.train.Saver()

    # start session
    start_time = time.time()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    train_size = len(train_data)
    indices_num = int(MINI_BATCH_SIZE * train_size)  # MINI_BATCH_SIZE is a fraction of the training set
    errors = []

    print('Inputs per label: {0}'.format(clf_input.inputs_per_label))
    print('Neurons per layer: {0}'.format(NEURONS_NUM))
    print('Dropout keep prob: {0}'.format(DROPOUT_KEEP_PROB))

    for epoch in range(EPOCHS_NUM):
        print('Epoch {:2d}'.format(epoch + 1))

        for step in range(STEPS_NUM):
            print('\tstep {:3d}'.format(step + 1))

            # sample a random mini-batch from the training set
            rand_index = np.random.choice(train_size, indices_num)
            mini_batch_xs = train_data[rand_index]
            mini_batch_ys = train_labels[rand_index]
            sess.run(model.optimize, {data: mini_batch_xs,
                                      target: mini_batch_ys,
                                      dropout_keep_prob: DROPOUT_KEEP_PROB})

            # dropout_keep_prob is set to 1 (i.e. keep all) only for testing
            error = sess.run(model.error, {data: test_data,
                                           target: test_labels,
                                           dropout_keep_prob: 1})
            error_percentage = 100 * error
            errors.append(error)
            print('\taccuracy: {:3.1f}% \n\terror: {:3.1f}%'.format(100 - error_percentage, error_percentage))

    elapsed_time = (time.time() - start_time)
    print('RNN running time:', timedelta(seconds=elapsed_time))

    # save model variables
    model_checkpoint_time = str(int(time.time()))
    model_checkpoint_dir = os.path.join(TRAINED_MODELS_FOLDER, model_checkpoint_time)
    if not os.path.exists(model_checkpoint_dir):
        os.makedirs(model_checkpoint_dir)
    saver.save(sess, os.path.join(model_checkpoint_dir, model_checkpoint_time) + TF_MODEL_EXT)

    """ PLOT ERROR FUNCTION """
    _, fig_basename = unique_filename(os.path.join(model_checkpoint_dir, clf_input.dump_basename))
    fig = fig_basename + IMG_EXT
    fig_zoom = FILENAME_SEPARATOR.join([fig_basename, 'zoom']) + IMG_EXT
    fig_avg = FILENAME_SEPARATOR.join([fig_basename, 'avg']) + IMG_EXT

    measures_num = EPOCHS_NUM * STEPS_NUM

    # full error curve, with the y-axis fixed to [0, 1]
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.axis([1, measures_num, 0, 1])
    plt.savefig(fig, bbox_inches='tight')

    # zoomed error curve, with the y-axis autoscaled to the data
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.savefig(fig_zoom, bbox_inches='tight')

    # group steps errors of the same epoch and compute the average error in epoch
    plt.figure()
    plt.plot(range(1, EPOCHS_NUM + 1),
             [sum(group) / STEPS_NUM for group in zip(*[iter(errors)] * STEPS_NUM)])
    plt.savefig(fig_avg, bbox_inches='tight')
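# A minimal sketch (an assumption, not code from this repo) of how a checkpoint
# written by saver.save() above could be restored for inference: rebuild the
# same graph first, then repopulate its variables from the saved prefix
# ('<timestamp>' + TF_MODEL_EXT under TRAINED_MODELS_FOLDER/<timestamp>/).
def restore_model(sess, checkpoint_prefix):
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_prefix)  # loads all trained variable values
    return saver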
import time

from sklearn import svm
from sklearn.externals import joblib

from sequence_classification.spectrum_kernel import occurrence_dict_spectrum_kernel
from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT
from utils.dataset_management import filter_dataset

if __name__ == '__main__':
    NOISE_THRESHOLD = 10

    print('Loading dataset...')
    clf_input = SequenceClassifierInput(cached_dataset='1568968144_3_7306_GOOD')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # Since the validation is performed separately, we join the splits.
    train_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(train_data, NOISE_THRESHOLD, clf_input.ngrams_length)

    print('Training One-class SVM...')
    clf = svm.OneClassSVM(kernel=occurrence_dict_spectrum_kernel)
    start_time = time.time()
    clf.fit(train_data)
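    # A plausible continuation (an assumption, not the repo's actual code):
    # the otherwise-unused joblib / TRAINED_MODELS_FOLDER / PICKLE_EXT imports
    # suggest the truncated script went on to time the fit and persist the
    # trained model. Assumes `import os` alongside the other imports.
    print('Training time: {:.1f}s'.format(time.time() - start_time))
    model_path = os.path.join(TRAINED_MODELS_FOLDER, str(int(start_time)) + PICKLE_EXT)
    joblib.dump(clf, model_path)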
import os
import time

from sklearn.externals import joblib

from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.dataset_management import filter_dataset
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT, DATA_FOLDER, FILENAME_SEPARATOR

if __name__ == '__main__':
    NOISE_THRESHOLD = 15

    print('Loading model dump...')
    predictions_filename = os.path.join(TRAINED_MODELS_FOLDER, 'l_min_15.pkl')
    clf = joblib.load(predictions_filename)

    print('Loading validation data...')
    clf_input = SequenceClassifierInput(cached_dataset='1498490206_3_28519_GOOD_validation')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # We join them to perform validation.
    validation_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
    print('\tFiltered dataset size:', str(len(validation_data)))

    # compute predictions and show stats
    print('Computing predictions...')
    start_time = time.time()
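    # The script is truncated here; the sibling validation script below picks
    # up at exactly this point, so the missing line is presumably:
    predictions = clf.predict(validation_data)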
import os
import time

from sklearn.externals import joblib

from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.dataset_management import filter_dataset
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT, DATA_FOLDER, FILENAME_SEPARATOR

if __name__ == '__main__':
    NOISE_THRESHOLD = 15

    print('Loading model dump...')
    predictions_filename = os.path.join(TRAINED_MODELS_FOLDER, '1561476910.pkl')
    clf = joblib.load(predictions_filename)

    print('Loading validation data...')
    clf_input = SequenceClassifierInput(cached_dataset='1561471958_3_26387_GOOD')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # We join them to perform validation.
    validation_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
    print('\tFiltered dataset size:', str(len(validation_data)))

    # compute predictions and show stats
    print('Computing predictions...')
    start_time = time.time()
    predictions = clf.predict(validation_data)
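    # A sketch (an assumption, not the repo's actual reporting code) of the
    # "show stats" step announced above: OneClassSVM.predict() returns +1 for
    # inliers and -1 for outliers, so counting each summarizes the validation.
    print('Prediction time: {:.1f}s'.format(time.time() - start_time))
    inliers = int((predictions == 1).sum())
    outliers = int((predictions == -1).sum())
    print('\tInliers:  {0}/{1}'.format(inliers, len(validation_data)))
    print('\tOutliers: {0}/{1}'.format(outliers, len(validation_data)))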
import time

from sklearn import svm
from sklearn.externals import joblib

from sequence_classification.spectrum_kernel import occurrence_dict_spectrum_kernel
from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT
from utils.dataset_management import filter_dataset

if __name__ == '__main__':
    NOISE_THRESHOLD = 10

    print('Loading dataset...')
    clf_input = SequenceClassifierInput(cached_dataset='1498483802_3_17732_GOOD_training')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # Since the validation is performed separately, we join the splits.
    train_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(train_data, NOISE_THRESHOLD, clf_input.ngrams_length)

    print('Training One-class SVM...')
    clf = svm.OneClassSVM(kernel=occurrence_dict_spectrum_kernel)
    start_time = time.time()
    clf.fit(train_data)