Example #1
def build_sequence_clf_training_set(segmented_log, sensor_id_pos,
                                    ngrams_length):
    print('Building training set...')
    sequences = []
    labels = []
    for segment_ in segmented_log.segments:
        sequence = ''
        for c in segment_:
            sequence += c[sensor_id_pos]
        sequences.append(sequence)
        labels.append(GOOD_LABEL)
    clf_input = SequenceClassifierInput(sequences=sequences,
                                        labels=labels,
                                        ngrams_length=ngrams_length)
    train_data, *_ = clf_input.get_spectrum_train_test_data()
    return len(train_data[0])  # the vector length, reused as max_vector_length when building the validation set
Example #2
def build_sequence_clf_validation_set(segmented_log, sensor_id_pos,
                                      ngrams_length, max_vector_length):
    print('Building validation set...')
    sequences = []
    labels = []
    for b_step in segmented_log.b_steps:
        # compute the cartesian product of the two collections of segments in the current b step.
        for segment_ in b_step.segments:
            for compat_segment_ in b_step.compat_segments:
                sequence = ''
                for c in segment_ + compat_segment_:
                    sequence += c[sensor_id_pos]
                sequences.append(sequence)
                labels.append(GOOD_LABEL)
    clf_input = SequenceClassifierInput(sequences=sequences,
                                        labels=labels,
                                        ngrams_length=ngrams_length)
    clf_input.get_spectrum_train_test_data(max_vector_length)
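Example #1 and Example #2 are complementary: the training-set builder returns the length of the spectrum feature vectors, and the validation-set builder reuses that value so both datasets share the same dimensionality. A minimal sketch of how they might be wired together; the segmented-log objects (training_log, validation_log) and the SENSOR_ID_POS constant are placeholders, not part of the examples above.

SENSOR_ID_POS = 1
NGRAMS_LENGTH = 3

# Build the training set and keep the feature-vector length it produces.
max_vector_length = build_sequence_clf_training_set(
    training_log, SENSOR_ID_POS, NGRAMS_LENGTH)

# Build the validation set with vectors of the same length, so the
# classifier sees inputs of identical dimensionality.
build_sequence_clf_validation_set(
    validation_log, SENSOR_ID_POS, NGRAMS_LENGTH, max_vector_length)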
Example #3
import os
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sequence_classification.sequence_classifier_input import SequenceClassifierInput
# Other project-local names used below (RNNSequenceClassifier, unique_filename,
# and training constants such as EPOCHS_NUM, STEPS_NUM, MINI_BATCH_SIZE) come
# from the original module and are omitted from this excerpt.


def main(considered_labels=None,
         cached_dataset=None,
         inputs_per_label=1000,
         ngrams_length=3):
    # retrieve input data from database
    clf_input = SequenceClassifierInput(considered_labels=considered_labels,
                                        cached_dataset=cached_dataset,
                                        inputs_per_label=inputs_per_label,
                                        ngrams_length=ngrams_length)

    train_data, test_data, train_labels, test_labels = \
        clf_input.get_rnn_train_test_data()
    """
    INITIALIZE COMPUTATION GRAPH
    """
    sequence_max_length = len(train_data[0])
    frame_dimension = len(train_data[0][0])

    # the number of sequences (i.e. the batch size) is defined at runtime
    data = tf.placeholder(tf.float32,
                          [None, sequence_max_length, frame_dimension])
    target = tf.placeholder(tf.float32, [None, clf_input.labels_num])
    dropout_keep_prob = tf.placeholder(tf.float32)
    model = RNNSequenceClassifier(data, target, dropout_keep_prob)

    # to save and restore variables after training
    saver = tf.train.Saver()

    # start session
    start_time = time.time()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    train_size = len(train_data)
    indices_num = int(MINI_BATCH_SIZE * train_size)
    errors = []

    print('Inputs per label:  {0}'.format(clf_input.inputs_per_label))
    print('Neurons per layer: {0}'.format(NEURONS_NUM))
    print('Dropout keep prob: {0}'.format(DROPOUT_KEEP_PROB))

    for epoch in range(EPOCHS_NUM):
        print('Epoch {:2d}'.format(epoch + 1))

        for step in range(STEPS_NUM):
            print('\tstep {:3d}'.format(step + 1))
            rand_index = np.random.choice(train_size, indices_num)
            mini_batch_xs = train_data[rand_index]
            mini_batch_ys = train_labels[rand_index]
            sess.run(
                model.optimize, {
                    data: mini_batch_xs,
                    target: mini_batch_ys,
                    dropout_keep_prob: DROPOUT_KEEP_PROB
                })

            # dropout_keep_prob is set to 1 (i.e. keep all) only for testing
            error = sess.run(model.error, {
                data: test_data,
                target: test_labels,
                dropout_keep_prob: 1
            })
            error_percentage = 100 * error
            errors.append(error)
            print('\taccuracy: {:3.1f}% \n\terror: {:3.1f}%'.format(
                100 - error_percentage, error_percentage))

    elapsed_time = (time.time() - start_time)
    print('RNN running time:', timedelta(seconds=elapsed_time))

    # save model variables
    model_checkpoint_time = str(int(time.time()))
    model_checkpoint_dir = os.path.join(TRAINED_MODELS_FOLDER,
                                        model_checkpoint_time)
    if not os.path.exists(model_checkpoint_dir):
        os.makedirs(model_checkpoint_dir)
    saver.save(
        sess,
        os.path.join(model_checkpoint_dir, model_checkpoint_time) +
        TF_MODEL_EXT)
    """
    PLOT ERROR FUNCTION
    """
    _, fig_basename = unique_filename(
        os.path.join(model_checkpoint_dir, clf_input.dump_basename))
    fig = fig_basename + IMG_EXT
    fig_zoom = FILENAME_SEPARATOR.join([fig_basename, 'zoom']) + IMG_EXT
    fig_avg = FILENAME_SEPARATOR.join([fig_basename, 'avg']) + IMG_EXT

    measures_num = EPOCHS_NUM * STEPS_NUM
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.axis([1, measures_num, 0, 1])
    plt.savefig(fig, bbox_inches='tight')

    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.savefig(fig_zoom, bbox_inches='tight')

    plt.figure()
    # group the step errors of each epoch and compute the average error per epoch
    plt.plot(
        range(1, EPOCHS_NUM + 1),
        [sum(group) / STEPS_NUM for group in zip(*[iter(errors)] * STEPS_NUM)])
    plt.savefig(fig_avg, bbox_inches='tight')
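The checkpoint written by saver.save can later be restored to evaluate the trained RNN without retraining. A minimal sketch, assuming the computation graph is rebuilt exactly as in main() and that checkpoint_path points to the file saved above (the variable names here are illustrative, not part of the original script):

# Rebuild the same placeholders and model, then restore the trained variables.
data = tf.placeholder(tf.float32, [None, sequence_max_length, frame_dimension])
target = tf.placeholder(tf.float32, [None, labels_num])
dropout_keep_prob = tf.placeholder(tf.float32)
model = RNNSequenceClassifier(data, target, dropout_keep_prob)

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)  # path produced by saver.save above
    error = sess.run(model.error, {
        data: test_data,
        target: test_labels,
        dropout_keep_prob: 1
    })
    print('restored model error: {:3.1f}%'.format(100 * error))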
Example #4
from sklearn import svm
from sklearn.externals import joblib
import time

from sequence_classification.spectrum_kernel import occurrence_dict_spectrum_kernel
from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT
from utils.dataset_management import filter_dataset

if __name__ == '__main__':

    NOISE_THRESHOLD = 10

    print('Loading dataset...')
    clf_input = SequenceClassifierInput(
        cached_dataset='1568968144_3_7306_GOOD')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # Since the validation is performed separately, we join the splits.
    train_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(train_data, NOISE_THRESHOLD, clf_input.ngrams_length)

    print('Training One-class SVM...')
    clf = svm.OneClassSVM(kernel=occurrence_dict_spectrum_kernel)
    start_time = time.time()
    clf.fit(train_data)
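Passing kernel=occurrence_dict_spectrum_kernel makes scikit-learn call that function on the input collections and use the returned Gram matrix in place of a built-in kernel, which is how the spectrum kernel is plugged into the One-class SVM. The excerpt stops at clf.fit, but joblib, TRAINED_MODELS_FOLDER and PICKLE_EXT are imported, so the fitted model is presumably persisted afterwards, roughly along these lines (the filename and the os import are assumptions):

    elapsed_time = time.time() - start_time
    print('Training time:', elapsed_time)

    # Persist the fitted model so the validation scripts can reload it later
    # (they load dumps such as 'l_min_15.pkl' and '1561476910.pkl').
    model_filename = os.path.join(TRAINED_MODELS_FOLDER,
                                  str(int(start_time)) + PICKLE_EXT)
    joblib.dump(clf, model_filename)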
Example #5
import os
import time

from sklearn.externals import joblib

from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.dataset_management import filter_dataset
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT, DATA_FOLDER, FILENAME_SEPARATOR

if __name__ == '__main__':

    NOISE_THRESHOLD = 15

    print('Loading model dump...')
    predictions_filename = os.path.join(TRAINED_MODELS_FOLDER, 'l_min_15.pkl')
    clf = joblib.load(predictions_filename)

    print('Loading validation data...')
    clf_input = SequenceClassifierInput(
        cached_dataset='1498490206_3_28519_GOOD_validation')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # We join them to perform validation.
    validation_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
    print('\tFiltered dataset size:', str(len(validation_data)))

    # compute predictions and show stats
    print('Computing predictions...')
    start_time = time.time()
Example #6
import os
import time

from sklearn.externals import joblib

from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.dataset_management import filter_dataset
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT, DATA_FOLDER, FILENAME_SEPARATOR

if __name__ == '__main__':

    NOISE_THRESHOLD = 15

    print('Loading model dump...')
    predictions_filename = os.path.join(TRAINED_MODELS_FOLDER, '1561476910.pkl')
    clf = joblib.load(predictions_filename)

    print('Loading validation data...')
    clf_input = SequenceClassifierInput(cached_dataset='1561471958_3_26387_GOOD')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # We join them to perform validation.
    validation_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
    print('\tFiltered dataset size:', str(len(validation_data)))

    # compute predictions and show stats
    print('Computing predictions...')
    start_time = time.time()
    predictions = clf.predict(validation_data)
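OneClassSVM.predict labels each sequence with +1 if it is considered an inlier (i.e. consistent with the GOOD training data) and -1 if it is considered an outlier, so the stats announced by the script can be derived directly from the prediction vector. A minimal sketch of the reporting step; the exact output format is an assumption:

    elapsed_time = time.time() - start_time
    print('Prediction time:', elapsed_time)

    # +1 = predicted inlier (GOOD-like), -1 = predicted outlier.
    inliers = sum(1 for p in predictions if p == 1)
    outliers = len(predictions) - inliers
    print('\tInliers:  {0}/{1}'.format(inliers, len(predictions)))
    print('\tOutliers: {0}/{1}'.format(outliers, len(predictions)))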
Example #7
from sklearn import svm
from sklearn.externals import joblib
import time

from sequence_classification.spectrum_kernel import occurrence_dict_spectrum_kernel
from sequence_classification.sequence_classifier_input import SequenceClassifierInput
from utils.constants import TRAINED_MODELS_FOLDER, PICKLE_EXT
from utils.dataset_management import filter_dataset

if __name__ == '__main__':

    NOISE_THRESHOLD = 10

    print('Loading dataset...')
    clf_input = SequenceClassifierInput(
        cached_dataset='1498483802_3_17732_GOOD_training')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # Since the validation is performed separately, we join the splits.
    train_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(train_data, NOISE_THRESHOLD, clf_input.ngrams_length)

    print('Training One-class SVM...')
    clf = svm.OneClassSVM(kernel=occurrence_dict_spectrum_kernel)
    start_time = time.time()
    clf.fit(train_data)