Example #1
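Drives the `train_id_split` command end to end: it builds an `argparse.Namespace` with a small CNN configuration (CPU only, one epoch) pointing at a bundled `identifiers.csv.tar.gz`, and fails the test if training raises. Stdlib modules (`os`, `unittest`, `tempfile`, `argparse`) and the project helpers `has_tensorflow` and `train_id_split` are assumed to be imported at the top of the test module.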
class TrainIdSplitTest(unittest.TestCase):
    def setUp(self):
        self.input = os.path.join(os.path.dirname(__file__),
                                  "identifiers.csv.tar.gz")

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_id_split_train(self):
        try:
            with tempfile.TemporaryDirectory() as tmpdir:
                args = argparse.Namespace(input=self.input,
                                          output=tmpdir,
                                          model="CNN",
                                          devices="-1",
                                          test_ratio=0.2,
                                          padding="post",
                                          optimizer="Adam",
                                          batch_size=2,
                                          val_batch_size=2,
                                          length=10,
                                          dim_reduction=2,
                                          epochs=1,
                                          samples_before_report=10,
                                          lr=0.001,
                                          final_lr=0.00001,
                                          seed=1989,
                                          csv_identifier=3,
                                          csv_identifier_split=4,
                                          stack=2,
                                          include_csv_header=True,
                                          filters="64,32,16,8",
                                          kernel_sizes="2,4,8,16")
                train_id_split(args)
        except Exception as e:
            self.fail("CNN training raised %s with log: %s" %
                      (type(e), str(e)))
Example #2
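Checks `register_metric` from the id-splitter model module: a string metric is accepted and appears in the global `METRICS` collection, while a non-string value is rejected with an `AssertionError` and leaves `METRICS` untouched.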
class MetricsTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_register_metric(self):
        from sourced.ml.algorithms.id_splitter.nn_model import register_metric, METRICS
        fake_metric = "fake metric"
        register_metric(fake_metric)
        self.assertIn(fake_metric, METRICS)
        METRICS.pop()  # assumes METRICS is a list, so pop() removes the metric just registered
        self.assertNotIn(fake_metric, METRICS)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_raise_register_metric(self):
        from sourced.ml.algorithms.id_splitter.nn_model import register_metric, METRICS
        bad_metric = 1
        with self.assertRaises(AssertionError):
            register_metric(bad_metric)
        self.assertNotIn(bad_metric, METRICS)
Example #3
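Builds one RNN and one CNN id-splitter model on the CPU in `setUp` and asserts that each Keras model is built, trainable, and starts with an embedding weight matrix of shape `(n_uniq + 1, n_uniq + 1)`, where `n_uniq` is the size of the lowercase alphabet.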
class ModelsTests(unittest.TestCase):
    def setUp(self):
        from sourced.ml.algorithms.id_splitter.nn_model import build_rnn, build_cnn
        self.n_uniq = len(string.ascii_lowercase)
        self.model_rnn = build_rnn(maxlen=5,
                                   units=24,
                                   stack=2,
                                   rnn_layer="LSTM",
                                   optimizer="Adam",
                                   dev0="/cpu:0",
                                   dev1="/cpu:0")
        self.model_cnn = build_cnn(maxlen=5,
                                   filters=[64, 32, 16, 8],
                                   output_n_filters=32,
                                   stack=2,
                                   kernel_sizes=[2, 4, 8, 16],
                                   optimizer="Adam",
                                   device="/cpu:0")

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_rnn(self):
        self.assertTrue(self.model_rnn.built)
        self.assertTrue(self.model_rnn.trainable)
        self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_rnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_rnn.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        self.assertTrue(self.model_cnn.built)
        self.assertTrue(self.model_cnn.trainable)
        self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_cnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_cnn.uses_learning_phase)
Example #4
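Serializes a `TensorFlowModel` to an in-memory buffer and loads it back, once constructed from a `GraphDef` and once from a live session plus output operation names, asserting the graph nodes survive the round trip (TensorFlow 1.x graph API).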
class TensorFlowModelTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_serialize(self):
        import tensorflow as tf
        a = tf.constant([[1, 0], [0, 1]])
        b = tf.constant([[0, 1], [1, 0]])
        c = tf.matmul(a, b)
        gd = tf.get_default_graph().as_graph_def()
        buffer = io.BytesIO()
        TensorFlowModel().construct(graphdef=gd).save(buffer)
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)

        buffer = io.BytesIO()
        with tf.Session() as session:
            # strip the ":0" tensor suffix so only the operation name is passed
            TensorFlowModel().construct(session=session,
                                        outputs=[c.name[:-2]]).save(buffer)
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)
Example #5
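Covers the id2vec embedding pipeline: parameter validation and output files of `id2vec_preprocess` (vocabularies, row/column sums, and a Swivel shard whose sparse co-occurrence counts are cross-checked against the input matrix), `run_swivel` training, and `id2vec_postprocess`. Helpers such as `default_preprocess_params`, `default_swivel_args`, `VOCAB`, and `captured_output` are assumed to come from the surrounding test module.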
class IdEmbeddingTests(unittest.TestCase):
    def test_preprocess_bad_params(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_preprocess_params(tmpdir, VOCAB)
            args.shard_size = VOCAB + 1
            self.assertRaises(ValueError, lambda: id2vec_preprocess(args))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_preprocess(self):
        import tensorflow as tf
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_preprocess_params(tmpdir, VOCAB)
            with captured_output() as (out, err, log):
                id2vec_preprocess(args)
            self.assertFalse(out.getvalue())
            self.assertFalse(err.getvalue())
            self.assertEqual(sorted(os.listdir(tmpdir)), [
                "col_sums.txt", "col_vocab.txt", "row_sums.txt",
                "row_vocab.txt", "shard-000-000.pb"
            ])
            df = OrderedDocumentFrequencies().load(source=args.docfreq_in)
            self.assertEqual(len(df), VOCAB)
            with open(os.path.join(tmpdir, "col_sums.txt")) as fin:
                col_sums = fin.read()
            with open(os.path.join(tmpdir, "row_sums.txt")) as fin:
                row_sums = fin.read()
            self.assertEqual(col_sums, row_sums)
            with open(os.path.join(tmpdir, "col_vocab.txt")) as fin:
                col_vocab = fin.read()
            with open(os.path.join(tmpdir, "row_vocab.txt")) as fin:
                row_vocab = fin.read()
            self.assertEqual(col_vocab, row_vocab)
            self.assertEqual(row_vocab.split("\n"), df.tokens())
            for word in row_vocab.split("\n"):
                self.assertGreater(df[word], 0)
            with open(os.path.join(tmpdir, "shard-000-000.pb"), "rb") as fin:
                features = tf.parse_single_example(
                    fin.read(),
                    features={
                        "global_row": tf.FixedLenFeature([VOCAB],
                                                         dtype=tf.int64),
                        "global_col": tf.FixedLenFeature([VOCAB],
                                                         dtype=tf.int64),
                        "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
                        "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
                        "sparse_value": tf.VarLenFeature(dtype=tf.float32)
                    })
            with tf.Session() as session:
                global_row, global_col, local_row, local_col, value = session.run(
                    [
                        features[n]
                        for n in ("global_row", "global_col",
                                  "sparse_local_row", "sparse_local_col",
                                  "sparse_value")
                    ])
            self.assertEqual(set(range(VOCAB)), set(global_row))
            self.assertEqual(set(range(VOCAB)), set(global_col))
            nnz = 16001
            self.assertEqual(value.values.shape, (nnz, ))
            self.assertEqual(local_row.values.shape, (nnz, ))
            self.assertEqual(local_col.values.shape, (nnz, ))
            numpy.random.seed(0)
            all_tokens = row_vocab.split("\n")
            chosen_indices = numpy.random.choice(list(range(VOCAB)),
                                                 128,
                                                 replace=False)
            chosen = [all_tokens[i] for i in chosen_indices]
            freqs = numpy.zeros((len(chosen), ) * 2, dtype=int)
            index = {w: i for i, w in enumerate(chosen)}
            chosen = set(chosen)
            with asdf.open(args.input) as model:
                matrix = assemble_sparse_matrix(model.tree["matrix"]).tocsr()
                tokens = split_strings(model.tree["tokens"])
                interesting = {i for i, t in enumerate(tokens) if t in chosen}
                for y in interesting:
                    row = matrix[y]
                    yi = index[tokens[y]]
                    for x, v in zip(row.indices, row.data):
                        if x in interesting:
                            freqs[yi, index[tokens[x]]] += v
            matrix = coo_matrix(
                (value.values,
                 ([global_row[row] for row in local_row.values],
                  [global_col[col] for col in local_col.values])),
                shape=(VOCAB, VOCAB))
            matrix = (matrix.tocsr()[chosen_indices][:, chosen_indices]
                      .todense().astype(int))
            self.assertTrue((matrix == freqs).all())

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_swivel_bad_params_submatrix_cols(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_swivel_args(tmpdir)
            args.submatrix_cols += 1
            self.assertRaises(ValueError, lambda: run_swivel(args))

            args.submatrix_cols -= 1
            args.submatrix_rows += 1
            self.assertRaises(ValueError, lambda: run_swivel(args))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_swivel(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            args = default_swivel_args(tmpdir)
            run_swivel(args)
            check_swivel_results(self, tmpdir)

    def test_postprocess(self):
        buffer = BytesIO()
        args = argparse.Namespace(
            swivel_data=os.path.join(os.path.dirname(__file__), "postproc"),
            output=buffer)
        prepare_postproc_files(args.swivel_data)

        id2vec_postprocess(args)

        buffer.seek(0)
        check_postproc_results(self, buffer)
Example #6
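Tests id-splitter feature extraction on tar-packed fake CSVs: `prepare_features` must return train/test arrays with the expected shapes, label sums, and padding, and `read_identifiers` must honor the header flag, filter by identifier length, and raise `IndexError` for out-of-range columns. `write_fake_identifiers` and the `IDENTIFIERS` sample path are assumed helpers from the test module.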
class IdSplitterTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_features(self):
        from sourced.ml.algorithms.id_splitter.features import prepare_features
        # check feature extraction
        text = "a a"
        n_lines = 10
        max_identifier_len = 20
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=n_lines, char_sizes=1, n_cols=2, text=text)
            feat = prepare_features(csv_path=tmp.name, use_header=True, identifier_col=0,
                                    max_identifier_len=max_identifier_len, split_identifier_col=1,
                                    shuffle=True, test_ratio=0.5, padding="post")
            x_train, x_test, y_train, y_test = feat
            # with test_ratio=0.5 the train and test splits should have equal shapes
            self.assertEqual(x_test.shape, x_train.shape)
            self.assertEqual(y_test.shape, y_train.shape)
            # each line contains exactly one split point, so labels should sum to 5 for train and for test
            self.assertEqual(numpy.sum(y_test), 5)
            self.assertEqual(numpy.sum(y_train), 5)
            # each identifier contains only two characters, so train and test should each have 10 nonzero entries
            self.assertEqual(numpy.count_nonzero(x_test), 10)
            self.assertEqual(numpy.count_nonzero(x_train), 10)
            # y should be a 3-dimensional array
            self.assertEqual(y_test.ndim, 3)
            self.assertEqual(y_train.ndim, 3)
            # x should be a 2-dimensional array
            self.assertEqual(x_test.ndim, 2)
            self.assertEqual(x_train.ndim, 2)
            # check number of samples
            self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines)
            self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines)
            # check max_identifier_len
            self.assertEqual(x_test.shape[1], max_identifier_len)
            self.assertEqual(x_train.shape[1], max_identifier_len)
            self.assertEqual(y_test.shape[1], max_identifier_len)
            self.assertEqual(y_train.shape[1], max_identifier_len)

        # a real identifiers file should be processed without raising
        try:
            prepare_features(csv_path=IDENTIFIERS, use_header=True, identifier_col=0,
                             max_identifier_len=max_identifier_len, split_identifier_col=1,
                             shuffle=True, test_ratio=0.5, padding="post")
        except Exception as e:
            self.fail("prepare_features raised %s with log %s" % (type(e), str(e)))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_read_identifiers(self):
        from sourced.ml.algorithms.id_splitter.features import read_identifiers
        # read with header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)

            res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 10)

        # read without header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)

            res = read_identifiers(csv_path=tmp.name, use_header=False, max_identifier_len=10,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 9)

        # read with max_identifier_len equal to 0 -> expect empty list
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5)

            res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=0,
                                   identifier_col=3, split_identifier_col=4)
            self.assertEqual(len(res), 0)

        # generate temporary file with identifiers of specific lengths and filter by length
        char_sizes = list(range(1, 11))

        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=5)

            # check filtering
            # read last two columns as identifiers
            for i in range(11):
                res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=i,
                                       identifier_col=3, split_identifier_col=4)
                self.assertEqual(len(res), i)

        # requesting columns that do not exist should raise IndexError
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=2)

            with self.assertRaises(IndexError):
                read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10,
                                 identifier_col=3, split_identifier_col=4)

        # a real identifiers file should be processed without raising
        try:
            read_identifiers(csv_path=IDENTIFIERS, use_header=True, max_identifier_len=10,
                             identifier_col=3, split_identifier_col=4)
        except Exception as e:
            self.fail("read_identifiers raised %s with log %s" % (type(e), str(e)))
Example #7
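Unit-tests the id-splitter pipeline helpers one by one: `binarize` thresholding (in place and on a copy), `prepare_devices` argument parsing, the linear learning-rate decay returned by `build_schedule`, the batch generator from `build_train_generator`, `create_generator_params`, the Keras session configured by `config_keras`, and the `TensorBoard`/`CSVLogger`/`ModelCheckpoint` callbacks from `prepare_callbacks`.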
class IdSplitterPipelineTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_binarize(self):
        from sourced.ml.algorithms.id_splitter.pipeline import binarize
        thresholds = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99]
        n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

        for th, n_p in zip(thresholds, n_pos):
            vals = numpy.arange(10) / 10
            res = binarize(vals, th)
            self.assertEqual(sum(res), n_p)
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)

        vals = numpy.arange(10) / 10
        old_vals = vals.copy()
        for th, n_p in zip(thresholds, n_pos):
            res = binarize(vals, th, inplace=False)
            self.assertEqual(sum(res), n_p)
            self.assertTrue(numpy.array_equal(old_vals, vals))
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_devices(self):
        from sourced.ml.algorithms.id_splitter.nn_model import prepare_devices
        correct_args = ["1", "0,1", "-1"]
        resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")]
        for res, arg in zip(resulted_dev, correct_args):
            self.assertEqual(res, prepare_devices(arg))

        bad_args = ["", "1,2,3"]
        for arg in bad_args:
            with self.assertRaises(ValueError):
                prepare_devices(arg)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_schedule(self):
        from sourced.ml.algorithms.id_splitter.pipeline import build_schedule
        start_lr = 10
        end_lr = 1
        n_epochs = 9

        lr_schedule = build_schedule(lr=start_lr, final_lr=end_lr, n_epochs=n_epochs)

        for i in range(n_epochs):
            self.assertEqual(start_lr - i, lr_schedule(epoch=i))

        with self.assertRaises(AssertionError):
            lr_schedule(-1)
        with self.assertRaises(AssertionError):
            lr_schedule(n_epochs + 1)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_train_generator(self):
        from sourced.ml.algorithms.id_splitter.pipeline import build_train_generator
        batch_size = 3
        # mismatched numbers of samples in x and y
        bad_x = numpy.zeros(3)
        bad_y = numpy.zeros(4)
        with self.assertRaises(AssertionError):
            build_train_generator(bad_x, bad_y, batch_size=batch_size)

        # check generator with correct inputs
        x = numpy.zeros(5)
        gen = build_train_generator(x, x, batch_size=batch_size)
        expected_n_samples = [3, 2]
        for n_samples in expected_n_samples:
            x_gen, y_gen = next(gen)
            self.assertEqual(x_gen.shape, y_gen.shape)
            self.assertEqual(n_samples, x_gen.shape[0])

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_train_parameters(self):
        from sourced.ml.algorithms.id_splitter.pipeline import create_generator_params
        batch_size = 500
        samples_per_epoch = 10 ** 6
        n_samples = 40 * 10 ** 6
        epochs = 10

        steps_per_epoch_ = samples_per_epoch // batch_size
        n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch)

        steps_per_epoch, n_epochs = create_generator_params(batch_size, samples_per_epoch,
                                                            n_samples, epochs)
        self.assertEqual(steps_per_epoch, steps_per_epoch_)
        self.assertEqual(n_epochs, n_epochs_)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_config_keras(self):
        from keras.backend.tensorflow_backend import get_session
        from sourced.ml.algorithms.id_splitter.pipeline import config_keras
        config_keras()
        sess = get_session()
        self.assertTrue(sess._config.gpu_options.allow_growth)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_callbacks(self):
        from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint
        from sourced.ml.algorithms.id_splitter.pipeline import prepare_callbacks
        with tempfile.TemporaryDirectory() as tmpdir:
            callbacks = prepare_callbacks(tmpdir)

            # TensorBoard
            self.assertIsInstance(callbacks[0], TensorBoard)
            self.assertTrue(callbacks[0].log_dir.startswith(tmpdir))

            # CSVLogger
            self.assertIsInstance(callbacks[1], CSVLogger)
            self.assertTrue(callbacks[1].filename.startswith(tmpdir))

            # ModelCheckpoint
            self.assertIsInstance(callbacks[2], ModelCheckpoint)
            self.assertTrue(callbacks[2].filepath.startswith(tmpdir))