Example #1
    def _dump_dataset(self, dataset, suffix='', **kwargs):
        """
        Create a dump of the given dataset in secondary storage, appending the given suffix to the filename (to identify
        the intermediate result). The dataset must be a tuple of four elements corresponding respectively to:
        train data, test data, train labels, test labels.

        :type dataset: tuple
        :param dataset: the object that represents the dataset.
        :param suffix: the string that identifies the intermediate step.
        :param kwargs: a dict that provides extra descriptive parameters of the given dataset.
        """
        dataset_dict = {
            SPECTRUM_KEY: self.ngrams_length,
            LABELS_KEY: self.considered_labels,
            INPUTS_PER_LABEL_KEY: self.inputs_per_label,
            TIME_KEY: self.time,
            TRAIN_LABELS_KEY: dataset[TRAIN_LABELS_POS],
            TEST_LABELS_KEY: dataset[TEST_LABELS_POS]
        }

        if suffix != RNN_SUFFIX:
            data_dict = {
                TRAIN_DATA_KEY: dataset[TRAIN_DATA_POS],
                TEST_DATA_KEY: dataset[TEST_DATA_POS]
            }
            dataset_dict.update(data_dict)

        if kwargs:
            # merge dicts (with second dict's values overwriting those from the first, if key conflicts exist).
            dataset_dict.update(kwargs)

        if not self.dump_basename:
            size = len(dataset[TRAIN_DATA_POS]) + len(dataset[TEST_DATA_POS])
            self.dump_basename = FILENAME_SEPARATOR.join(
                [str(self.time),
                 str(self.ngrams_length),
                 str(size)] + self.considered_labels)

        if suffix != '':
            dirname = FILENAME_SEPARATOR.join([self.dump_basename, suffix])
        else:
            dirname = self.dump_basename

        dirname = os.path.join(DATA_FOLDER, dirname)
        archive = klepto.archives.dir_archive(dirname,
                                              cached=True,
                                              serialized=True)
        for key, val in dataset_dict.items():
            archive[key] = val
        try:
            archive.dump()
        except MemoryError:
            print(
                'The dataset dump %s has not been stored due to memory error.'
                % dirname,
                file=sys.stderr)
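
The dump relies on klepto's dir_archive, a dict-like archive backed by a directory of serialized files: entries are assigned into an in-memory cache and then persisted with dump() and read back with load(). A minimal round-trip sketch under that assumption (the 'demo_dump' directory name and the keys below are placeholders, not the constants used by the class):

import klepto

# dict-like archive backed by the 'demo_dump' directory (placeholder name)
archive = klepto.archives.dir_archive('demo_dump', cached=True, serialized=True)
archive['spectrum'] = 3              # placeholder keys, stored in the local cache
archive['train_labels'] = [0, 1, 1]
archive.dump()                       # persist the cached entries to disk

restored = klepto.archives.dir_archive('demo_dump', cached=True, serialized=True)
restored.load()                      # read every entry back into the cache
assert restored['spectrum'] == 3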
Example #2
    def _get_glove_embedded_data(self, data, train_size):
        """
        Compute two matrices which rows correspond to sequences GloVe embeddings of train and test splits respectively.
        Each sequence embedding is computed as the sequence of the embedding of its n-grams.

        :param data: a list of input data.
        :param train_size: the size of the training split.
        :return: the GloVe embeddings matrices for train and test splits respectively and the max sequence length.
        """
        max_cols_num = len(max(data, key=len))
        train_filename = os.path.join(
            DATA_FOLDER,
            FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TRAIN_SUFFIX]))
        test_filename = os.path.join(
            DATA_FOLDER,
            FILENAME_SEPARATOR.join([self.dump_basename, GLOVE_TEST_SUFFIX]))

        glove_matrix_train = np.memmap(train_filename,
                                       dtype='float32',
                                       mode='w+',
                                       shape=(train_size, max_cols_num,
                                              EMBEDDING_SIZE))
        glove_matrix_test = np.memmap(test_filename,
                                      dtype='float32',
                                      mode='w+',
                                      shape=(len(data) - train_size,
                                             max_cols_num, EMBEDDING_SIZE))

        glove_model = self._train_glove_model(data)

        # build sequences embeddings and partition the dataset into train and test splits
        for idx, shingle_list in enumerate(data):
            embeddings = [
                glove_model.embedding_for(shingle) for shingle in shingle_list
            ]

            # pad the sequence with respect to max length
            padding_length = max_cols_num - len(embeddings)
            embeddings += [[PADDING_VALUE] * EMBEDDING_SIZE] * padding_length

            if idx < train_size:
                glove_matrix_train[idx] = np.asarray(embeddings)
                glove_matrix_train.flush()
            else:
                glove_matrix_test[idx - train_size] = np.asarray(embeddings)
                glove_matrix_test.flush()
        return glove_matrix_train, glove_matrix_test, max_cols_num
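
Every row written into the memmaps must hold exactly max_cols_num embedding vectors, so shorter sequences are right-padded with PADDING_VALUE vectors before being stored. A toy sketch of that padding step only (the sizes and constants below are illustrative, not the real configuration):

import numpy as np

EMBEDDING_SIZE = 2       # illustrative; the real value comes from the GloVe setup
PADDING_VALUE = 0.0
max_cols_num = 4         # longest sequence length in the toy dataset

embeddings = [[0.1, 0.2], [0.3, 0.4]]       # embeddings of a 2-shingle sequence
padding_length = max_cols_num - len(embeddings)
embeddings += [[PADDING_VALUE] * EMBEDDING_SIZE] * padding_length

padded = np.asarray(embeddings)
print(padded.shape)      # (4, 2): every padded sequence fits one memmap row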
Example #3
    @staticmethod
    def _load_dataset(cached_dataset, suffix=None):
        """
        Load a dataset into memory from a dump in secondary storage, identified by the given filename and
        an optional suffix (to identify the intermediate result).

        :param cached_dataset: the filename of the dataset.
        :param suffix: the string that identifies the intermediate step.
        :return: the object that represents the dataset.
        """
        if suffix:
            dirname = FILENAME_SEPARATOR.join([cached_dataset, suffix])
        else:
            dirname = cached_dataset
        dirname = os.path.join(DATA_FOLDER, dirname)

        if not os.path.isdir(dirname):
            raise FileNotFoundError(dirname)

        archive = klepto.archives.dir_archive(dirname,
                                              cached=True,
                                              serialized=True)
        archive.load()
        return archive
def main(considered_labels=None,
         cached_dataset=None,
         inputs_per_label=1000,
         ngrams_length=3):
    # retrieve input data from database
    clf_input = SequenceClassifierInput(considered_labels=considered_labels,
                                        cached_dataset=cached_dataset,
                                        inputs_per_label=inputs_per_label,
                                        ngrams_length=ngrams_length)

    train_data, test_data, train_labels, test_labels = \
        clf_input.get_rnn_train_test_data()
    """
    INITIALIZE COMPUTATION GRAPH
    """
    sequence_max_length = len(train_data[0])
    frame_dimension = len(train_data[0][0])

    # sequences number (i.e. batch_size) defined at runtime
    data = tf.placeholder(tf.float32,
                          [None, sequence_max_length, frame_dimension])
    target = tf.placeholder(tf.float32, [None, clf_input.labels_num])
    dropout_keep_prob = tf.placeholder(tf.float32)
    model = RNNSequenceClassifier(data, target, dropout_keep_prob)

    # to save and restore variables after training
    saver = tf.train.Saver()

    # start session
    start_time = time.time()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    train_size = len(train_data)
    indices_num = int(MINI_BATCH_SIZE * train_size)
    errors = []

    print('Inputs per label:  {0}'.format(clf_input.inputs_per_label))
    print('Neurons per layer: {0}'.format(NEURONS_NUM))
    print('Dropout keep prob: {0}'.format(DROPOUT_KEEP_PROB))

    for epoch in range(EPOCHS_NUM):
        print('Epoch {:2d}'.format(epoch + 1))

        for step in range(STEPS_NUM):
            print('\tstep {:3d}'.format(step + 1))
            rand_index = np.random.choice(train_size, indices_num)
            mini_batch_xs = train_data[rand_index]
            mini_batch_ys = train_labels[rand_index]
            sess.run(
                model.optimize, {
                    data: mini_batch_xs,
                    target: mini_batch_ys,
                    dropout_keep_prob: DROPOUT_KEEP_PROB
                })

            # dropout_keep_prob is set to 1 (i.e. keep all) only for testing
            error = sess.run(model.error, {
                data: test_data,
                target: test_labels,
                dropout_keep_prob: 1
            })
            error_percentage = 100 * error
            errors.append(error)
            print('\taccuracy: {:3.1f}% \n\terror: {:3.1f}%'.format(
                100 - error_percentage, error_percentage))

    elapsed_time = (time.time() - start_time)
    print('RNN running time:', timedelta(seconds=elapsed_time))

    # save model variables
    model_checkpoint_time = str(int(time.time()))
    model_checkpoint_dir = os.path.join(TRAINED_MODELS_FOLDER,
                                        model_checkpoint_time)
    if not os.path.exists(model_checkpoint_dir):
        os.makedirs(model_checkpoint_dir)
    saver.save(
        sess,
        os.path.join(model_checkpoint_dir, model_checkpoint_time) +
        TF_MODEL_EXT)
    """
    PLOT ERROR FUNCTION
    """
    _, fig_basename = unique_filename(
        os.path.join(model_checkpoint_dir, clf_input.dump_basename))
    fig = fig_basename + IMG_EXT
    fig_zoom = FILENAME_SEPARATOR.join([fig_basename, 'zoom']) + IMG_EXT
    fig_avg = FILENAME_SEPARATOR.join([fig_basename, 'avg']) + IMG_EXT

    measures_num = EPOCHS_NUM * STEPS_NUM
    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.axis([1, measures_num, 0, 1])
    plt.savefig(fig, bbox_inches='tight')

    plt.figure()
    plt.plot(range(1, measures_num + 1), errors)
    plt.savefig(fig_zoom, bbox_inches='tight')

    plt.figure()
    # group steps errors of the same epoch and compute the average error in epoch
    plt.plot(
        range(1, EPOCHS_NUM + 1),
        [sum(group) / STEPS_NUM for group in zip(*[iter(errors)] * STEPS_NUM)])
    plt.savefig(fig_avg, bbox_inches='tight')
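
The per-epoch curve above is built with the zip(*[iter(errors)] * STEPS_NUM) idiom: repeating the same iterator STEPS_NUM times makes zip pull consecutive step errors into one tuple per epoch. A toy check of that grouping (numbers are made up):

STEPS_NUM = 3
errors = [0.9, 0.8, 0.7,     # epoch 1
          0.5, 0.4, 0.3]     # epoch 2

epoch_averages = [
    sum(group) / STEPS_NUM
    for group in zip(*[iter(errors)] * STEPS_NUM)
]
print(epoch_averages)        # -> [0.8, 0.4] (up to float rounding)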
Example #5
    def get_rnn_train_test_data(self):
        """
        Create training and test splits (data and corresponding labels) for RNN.
        
        :return: the dataset splits to be fed to RNN.
        """
        if self.sequences and self.labels:
            train_data, test_data, train_labels, test_labels = train_test_split(
                self.sequences,
                self.labels,
                test_size=self.test_size,
                random_state=self.random_state)
            self.time = int(time.time())
            self._dump_dataset(
                (train_data, test_data, train_labels, test_labels))
        elif self.cached_dataset:
            # return cached intermediate dataset if exists
            try:
                dataset_dict = self._load_dataset(self.cached_dataset,
                                                  suffix=RNN_SUFFIX)
                train_filename = os.path.join(
                    DATA_FOLDER,
                    FILENAME_SEPARATOR.join(
                        [self.dump_basename, GLOVE_TRAIN_SUFFIX]))
                test_filename = os.path.join(
                    DATA_FOLDER,
                    FILENAME_SEPARATOR.join(
                        [self.dump_basename, GLOVE_TEST_SUFFIX]))

                # retrieve train and test data splits shapes
                train_size = len(dataset_dict[TRAIN_LABELS_KEY])
                test_size = len(dataset_dict[TEST_LABELS_KEY])
                max_cols_num = dataset_dict[MAX_COLS_NUM_KEY]
                glove_embedding_size = dataset_dict[GLOVE_EMBEDDING_SIZE_KEY]

                # restore the memory mappings for train and test data splits
                glove_matrix_train = np.memmap(train_filename,
                                               dtype='float32',
                                               mode='r+',
                                               shape=(train_size, max_cols_num,
                                                      glove_embedding_size))
                glove_matrix_test = np.memmap(test_filename,
                                              dtype='float32',
                                              mode='r+',
                                              shape=(test_size, max_cols_num,
                                                     glove_embedding_size))

                return (glove_matrix_train, glove_matrix_test,
                        dataset_dict[TRAIN_LABELS_KEY],
                        dataset_dict[TEST_LABELS_KEY])
            except FileNotFoundError:
                dataset_dict = self._load_dataset(self.cached_dataset)
                train_data = dataset_dict[TRAIN_DATA_KEY]
                test_data = dataset_dict[TEST_DATA_KEY]
                train_labels = dataset_dict[TRAIN_LABELS_KEY]
                test_labels = dataset_dict[TEST_LABELS_KEY]
        else:
            train_data, test_data, train_labels, test_labels = \
                self._get_training_inputs_by_labels()

        train_size = len(train_data)
        data = self._preprocess_data(train_data + test_data)
        train_labels = np.asarray(self._labels_to_prob_vectors(train_labels))
        test_labels = np.asarray(self._labels_to_prob_vectors(test_labels))

        # perform data embedding through GloVe model
        train_data, test_data, max_cols_num = self._get_glove_embedded_data(
            data, train_size)

        split_dataset = (train_data, test_data, train_labels, test_labels)
        self._dump_dataset(split_dataset,
                           suffix=RNN_SUFFIX,
                           glove_embedding_size=EMBEDDING_SIZE,
                           max_cols_num=max_cols_num)
        return split_dataset
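
The cached RNN branch re-opens the GloVe matrices with np.memmap in 'r+' mode; since a memmap file does not store its own shape, the dump also records max_cols_num and the embedding size so the mapping can be rebuilt later. A minimal sketch of that create/re-open cycle (file name and sizes are placeholders):

import numpy as np

filename = 'demo_glove_train.dat'   # placeholder path
train_size, max_cols_num, embedding_size = 10, 4, 50

# create the mapping once, as _get_glove_embedded_data does with mode='w+'
created = np.memmap(filename, dtype='float32', mode='w+',
                    shape=(train_size, max_cols_num, embedding_size))
created[0, 0, 0] = 1.0
created.flush()

# later, re-open the same file from the recorded shape, as the cached branch does
restored = np.memmap(filename, dtype='float32', mode='r+',
                     shape=(train_size, max_cols_num, embedding_size))
print(restored.shape, restored[0, 0, 0])    # (10, 4, 50) 1.0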
Example #6
    print('Loading validation data...')
    clf_input = SequenceClassifierInput(cached_dataset='1561471958_3_26387_GOOD')
    train_data, test_data, *_ = clf_input.get_spectrum_train_test_data()  # ignoring labels

    # SequenceClassifierInput splits the dataset in train and test by default.
    # We join them to perform validation.
    validation_data = train_data + test_data

    # Filter out short sequences from dataset.
    print('Filtering dataset...')
    filter_dataset(validation_data, NOISE_THRESHOLD, clf_input.ngrams_length)
    print('\tFiltered dataset size:', str(len(validation_data)))

    # compute predictions and show stats
    print('Computing predictions...')
    start_time = time.time()
    predictions = clf.predict(validation_data)
    elapsed_time = (time.time() - start_time)

    total_sequences_num = len(validation_data)
    good_sequences_num = sum(1 for _ in filter(
        lambda x: x == 1, predictions))  # count positive predictions
    print('\tTime:', timedelta(seconds=elapsed_time))
    print('\tFraction of good sequences: {:3.1f}%'.format(
        good_sequences_num / total_sequences_num * 100))

    # dump results
    print('Dumping predictions...')
    predictions_info = [
        str(int(time.time())), 'l_min',
        str(NOISE_THRESHOLD), 'predictions'
    ]
    predictions_filename = os.path.join(
        DATA_FOLDER,
        FILENAME_SEPARATOR.join(predictions_info) + PICKLE_EXT)
    with open(predictions_filename, 'wb') as dump:
        pickle.dump(predictions, dump)
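
The dump above is a plain pickle, so reading the predictions back is symmetric. A minimal sketch, assuming the dumped file is at hand (the file name below is a placeholder for the timestamped one built above):

import pickle

with open('some_predictions.pickle', 'rb') as dump:     # placeholder path
    loaded_predictions = pickle.load(dump)
print('Reloaded', len(loaded_predictions), 'predictions')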