Example #1
def test_ngram_delta_small_threshold_tf():
    """Test to verify that a small threshold work in ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
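Why the example expects OutOfRangeError: the ids are written with a stride of 5 while delta_threshold is 1, so no two rows are close enough in the timestamp field to form an ngram and the reader is exhausted on the first read. A minimal plain-Python sketch of that arithmetic (the helper below is hypothetical, not part of the test suite above):

def _count_length_2_ngrams(timestamps, delta_threshold):
    """Hypothetical helper: count adjacent timestamp pairs whose gap is within the threshold."""
    return sum(1 for a, b in zip(timestamps, timestamps[1:]) if b - a <= delta_threshold)

# The ids created above are range(0, 99, 5): every gap is 5, which exceeds delta_threshold=1,
# so no ngram can be assembled and the very first TF read raises OutOfRangeError.
assert _count_length_2_ngrams(list(range(0, 99, 5)), delta_threshold=1) == 0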
Example #2
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset):
    """Test non continuous ngram in tf of a certain length. Non continuous here refers
    to that the reader will not necessarily return consecutive ngrams because partition is more
    than one and false is true."""

    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    reader = Reader(
        schema_fields=ngram,
        dataset_url=synthetic_dataset.url,
        reader_pool=ThreadPool(1),
    )

    readout_examples = tf_tensors(reader)

    # Make sure we have static shape info for all fields
    for timestep in readout_examples:
        for field in readout_examples[timestep]:
            assert field.get_shape().dims is not None

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        for _ in range(5):
            actual = sess.run(readout_examples)
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
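A hypothetical pytest entry point for the helper above could look like the following; the concrete fields dict is an assumption, chosen only to mirror the two-timestep NGram shape used in these examples:

def test_noncontinuous_ngram_tf(synthetic_dataset):
    # Illustrative caller (an assumption): a 2-timestep ngram over two scalar fields.
    fields = {
        0: [TestSchema.id, TestSchema.id2],
        1: [TestSchema.id, TestSchema.id2],
    }
    _test_noncontinuous_ngram_tf(fields, synthetic_dataset)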
Example #3
    def _read_from_tf_tensors(self, count, shuffling_queue_capacity,
                              min_after_dequeue, ngram):
        """Used by several test cases. Reads a 'count' rows using reader.

        The reader is configured without row-group shuffling and guarantees deterministic order of rows up to the
        results queue TF shuffling which is controlled by 'shuffling_queue_capacity', 'min_after_dequeue' arguments.

        The function returns a tuple with: (actual data read from the dataset, a TF tensor returned by the reader)
        """

        # Nullable fields cannot be read by TensorFlow (what would be the dimension of a tensor for null data?)
        fields = set(TestSchema.fields.values()) - {
            TestSchema.matrix_nullable, TestSchema.string_array_nullable
        }
        schema_fields = (fields if ngram is None else ngram)

        reader = Reader(schema_fields=schema_fields,
                        dataset_url=self._dataset_url,
                        reader_pool=DummyPool(),
                        shuffle=False)

        row_tensors = tf_tensors(
            reader,
            shuffling_queue_capacity=shuffling_queue_capacity,
            min_after_dequeue=min_after_dequeue)

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, start=True)

            # Collect all the data we need from 'count' number of reads
            rows_data = [sess.run(row_tensors) for _ in range(count)]

            coord.request_stop()
            coord.join(threads)

        reader.stop()
        reader.join()

        return rows_data, row_tensors
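One way a test case might use this helper (a sketch under the assumption that disabling the TF shuffling queue keeps the row order deterministic across two identical reads; the test name and the count of 30 are illustrative):

    def test_read_without_tf_shuffling(self):
        # Illustrative sketch: with no shuffling queue, two identical reads should agree row by row.
        first_run, _ = self._read_from_tf_tensors(count=30, shuffling_queue_capacity=0,
                                                  min_after_dequeue=0, ngram=None)
        second_run, _ = self._read_from_tf_tensors(count=30, shuffling_queue_capacity=0,
                                                   min_after_dequeue=0, ngram=None)
        for first, second in zip(first_run, second_run):
            np.testing.assert_equal(first.id, second.id)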
Example #4
def test_partition_multi_node(synthetic_dataset):
    """Tests that the reader only returns half of the expected data consistently"""
    reader = Reader(synthetic_dataset.url,
                    reader_pool=DummyPool(),
                    training_partition=0,
                    num_training_partitions=5)
    reader_2 = Reader(synthetic_dataset.url,
                      reader_pool=DummyPool(),
                      training_partition=0,
                      num_training_partitions=5)

    results_1 = []
    expected = []
    for row in reader:
        actual = dict(row._asdict())
        results_1.append(actual)
        expected.append(
            next(d for d in synthetic_dataset.data if d['id'] == actual['id']))

    results_2 = [dict(row._asdict()) for row in reader_2]

    # Since the order is non-deterministic, we need to sort the results by id
    results_1.sort(key=lambda x: x['id'])
    results_2.sort(key=lambda x: x['id'])
    expected.sort(key=lambda x: x['id'])

    np.testing.assert_equal(expected, results_1)
    np.testing.assert_equal(results_1, results_2)

    assert len(results_1) < len(synthetic_dataset.data)

    # Test that separate partitions also have no overlap by checking ids
    id_set = {item['id'] for item in results_1}
    for partition in range(1, 5):
        with Reader(synthetic_dataset.url,
                    reader_pool=DummyPool(),
                    training_partition=partition,
                    num_training_partitions=5) as reader_other:

            for row in reader_other:
                assert dict(row._asdict())['id'] not in id_set

    reader.stop()
    reader.join()
    reader_2.stop()
    reader_2.join()
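A natural hypothetical follow-up to the non-overlap check is to confirm that the five partitions together cover the whole dataset; a sketch along the lines of the code above (the helper name is an assumption):

def _assert_partitions_cover_dataset(synthetic_dataset, num_partitions=5):
    # Hypothetical sketch: the union of ids over all partitions should equal the full set of ids.
    all_ids = set()
    for partition in range(num_partitions):
        with Reader(synthetic_dataset.url,
                    reader_pool=DummyPool(),
                    training_partition=partition,
                    num_training_partitions=num_partitions) as reader_part:
            all_ids.update(dict(row._asdict())['id'] for row in reader_part)
    assert all_ids == {d['id'] for d in synthetic_dataset.data}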
Example #5
def test_ngram_length_1_tf(synthetic_dataset):
    """Test to verify that ngram generalize to support length 1"""
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields,
                  delta_threshold=0.012,
                  timestamp_field=TestSchema.id)
    reader = Reader(synthetic_dataset.url,
                    schema_fields=ngram,
                    shuffle_options=ShuffleOptions(True, 5),
                    reader_pool=DummyPool())
    with tf.Session() as sess:
        for _ in range(10):
            actual = sess.run(tf_tensors(reader))
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)
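            # Hypothetical extra checks (an assumption about the returned structure, inferred from how
            # 'actual' is used above): a length-1 ngram exposes only timestep key 0, carrying exactly
            # the two requested fields.
            assert set(actual.keys()) == {0}
            assert set(actual[0]._fields) == {'id', 'id2'}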

    reader.stop()
    reader.join()