示例#1
0
class ModelsTests(unittest.TestCase):
    def setUp(self):
        from sourced.ml.core.algorithms.id_splitter.nn_model import build_rnn, build_cnn
        self.n_uniq = len(string.ascii_lowercase)
        self.model_rnn = build_rnn(maxlen=5, units=24, stack=2, rnn_layer="LSTM",
                                   optimizer="Adam", dev0="/cpu:0", dev1="/cpu:0")
        self.model_cnn = build_cnn(maxlen=5, filters=[64, 32, 16, 8], output_n_filters=32,
                                   stack=2, kernel_sizes=[2, 4, 8, 16], optimizer="Adam",
                                   device="/cpu:0")

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_rnn(self):
        self.assertTrue(self.model_rnn.built)
        self.assertTrue(self.model_rnn.trainable)
        self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_rnn.get_weights()[0].shape, (self.n_uniq+1, self.n_uniq+1))
        self.assertTrue(self.model_rnn.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        self.assertTrue(self.model_cnn.built)
        self.assertTrue(self.model_cnn.trainable)
        self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_cnn.get_weights()[0].shape, (self.n_uniq+1, self.n_uniq+1))
        self.assertTrue(self.model_cnn.uses_learning_phase)
示例#2
0
class NNTokenParserTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4,
                              max_token_length=20,
                              attach_upper=False,
                              use_nn=True)
        self.tp._single_shot = False

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_process_token(self):
        self.tp.max_token_length = 100

        tokens = [
            ("ONLYCAPS", ["only", "caps"]),
            ("nocaps", ["no", "caps"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sql", "thing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foobar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalmsbug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourc", "d", "ml", "algorithm", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Another bad example. Parser failed to parse it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fooo"]),
            ("SmallIdFooo", ["small", "id", "foo", "o"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "badid", "exampl"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)
示例#3
0
class MetricsTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_register_metric(self):
        from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS
        fake_metric = "fake metric"
        register_metric(fake_metric)
        self.assertIn(fake_metric, METRICS)
        METRICS.pop()
        self.assertNotIn(fake_metric, METRICS)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_raise_register_metric(self):
        from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS
        bad_metric = 1
        with self.assertRaises(AssertionError):
            register_metric(bad_metric)
        self.assertNotIn(bad_metric, METRICS)
示例#4
0
class TensorFlowModelTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_serialize(self):
        import tensorflow as tf
        a = tf.constant([[1, 0], [0, 1]])
        b = tf.constant([[0, 1], [1, 0]])
        c = tf.matmul(a, b)
        gd = tf.get_default_graph().as_graph_def()
        buffer = io.BytesIO()
        TensorFlowModel().construct(graphdef=gd).save(
            buffer, series="tensorflow-model")
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)

        buffer = io.BytesIO()
        with tf.Session() as session:
            TensorFlowModel().construct(session=session,
                                        outputs=[c.name[:-2]]).save(
                                            buffer, series="tensorflow-model")
        buffer.seek(0)
        model = TensorFlowModel().load(buffer)
        self.assertEqual(gd.node, model.graphdef.node)
示例#5
0
class IdSplitterTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_features(self):
        from sourced.ml.core.algorithms.id_splitter.features import prepare_features
        # check feature extraction
        text = "a a"
        n_lines = 10
        max_identifier_len = 20
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=n_lines,
                                       char_sizes=1,
                                       n_cols=2,
                                       text=text)
            feat = prepare_features(csv_path=tmp.name,
                                    use_header=True,
                                    identifier_col=0,
                                    max_identifier_len=max_identifier_len,
                                    split_identifier_col=1,
                                    shuffle=True,
                                    test_ratio=0.5,
                                    padding="post")
            x_train, x_test, y_train, y_test = feat
            # because of test_ratio=0.5 - shapes should be equal
            self.assertEqual(x_test.shape, x_train.shape)
            self.assertEqual(y_test.shape, y_train.shape)
            # each line contains only one split -> so it should be only 5 nonzero for train/test
            self.assertEqual(numpy.sum(y_test), 5)
            self.assertEqual(numpy.sum(y_train), 5)
            # each line contains only two chars -> so it should be only 10 nonzero for train/test
            self.assertEqual(numpy.count_nonzero(x_test), 10)
            self.assertEqual(numpy.count_nonzero(x_train), 10)
            # y should be 3 dimensional matrix
            self.assertEqual(y_test.ndim, 3)
            self.assertEqual(y_train.ndim, 3)
            # x should be 2 dimensional matrix
            self.assertEqual(x_test.ndim, 2)
            self.assertEqual(x_train.ndim, 2)
            # check number of samples
            self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines)
            self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines)
            # check max_identifier_len
            self.assertEqual(x_test.shape[1], max_identifier_len)
            self.assertEqual(x_train.shape[1], max_identifier_len)
            self.assertEqual(y_test.shape[1], max_identifier_len)
            self.assertEqual(y_train.shape[1], max_identifier_len)

        # normal file
        try:
            prepare_features(csv_path=IDENTIFIERS,
                             use_header=True,
                             identifier_col=0,
                             max_identifier_len=max_identifier_len,
                             split_identifier_col=1,
                             shuffle=True,
                             test_ratio=0.5,
                             padding="post")
        except Exception as e:
            self.fail("prepare_features raised %s with log %s" %
                      (type(e), str(e)))

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_read_identifiers(self):
        from sourced.ml.core.algorithms.id_splitter.features import read_identifiers
        # read with header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=10,
                                       char_sizes=1,
                                       n_cols=5)

            res = read_identifiers(csv_path=tmp.name,
                                   use_header=True,
                                   max_identifier_len=10,
                                   identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 10)

        # read without header
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=10,
                                       char_sizes=1,
                                       n_cols=5)

            res = read_identifiers(csv_path=tmp.name,
                                   use_header=False,
                                   max_identifier_len=10,
                                   identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 9)

        # read with max_identifier_len equal to 0 -> expect empty list
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=10,
                                       char_sizes=1,
                                       n_cols=5)

            res = read_identifiers(csv_path=tmp.name,
                                   use_header=True,
                                   max_identifier_len=0,
                                   identifier_col=3,
                                   split_identifier_col=4)
            self.assertEqual(len(res), 0)

        # generate temporary file with identifiers of specific lengths and filter by length
        char_sizes = list(range(1, 11))

        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=10,
                                       char_sizes=char_sizes,
                                       n_cols=5)

            # check filtering
            # read last two columns as identifiers
            for i in range(11):
                res = read_identifiers(csv_path=tmp.name,
                                       use_header=True,
                                       max_identifier_len=i,
                                       identifier_col=3,
                                       split_identifier_col=4)
                self.assertEqual(len(res), i)

        # read wrong columns
        with tempfile.NamedTemporaryFile() as tmp:
            with tarfile.open(None, "w", fileobj=tmp,
                              encoding="utf-8") as tmp_tar:
                write_fake_identifiers(tmp_tar,
                                       n_lines=10,
                                       char_sizes=char_sizes,
                                       n_cols=2)

            with self.assertRaises(IndexError):
                read_identifiers(csv_path=tmp.name,
                                 use_header=True,
                                 max_identifier_len=10,
                                 identifier_col=3,
                                 split_identifier_col=4)

        # normal file
        try:
            read_identifiers(csv_path=IDENTIFIERS,
                             use_header=True,
                             max_identifier_len=10,
                             identifier_col=3,
                             split_identifier_col=4)
        except Exception as e:
            self.fail("read_identifiers raised %s with log %s" %
                      (type(e), str(e)))
示例#6
0
        self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_rnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_rnn.uses_learning_phase)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_cnn(self):
        self.assertTrue(self.model_cnn.built)
        self.assertTrue(self.model_cnn.trainable)
        self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray)
        self.assertEqual(self.model_cnn.get_weights()[0].shape,
                         (self.n_uniq + 1, self.n_uniq + 1))
        self.assertTrue(self.model_cnn.uses_learning_phase)


@unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
class NNModelTest(unittest.TestCase):
    def setUp(self):
        from sourced.ml.core.models.id_splitter import IdentifierSplitterBiLSTM
        self.test_X = [
            "networkSocket", "variablename", "loadfile", "blahblah", "foobar"
        ]
        self.test_y = [
            "network", "socket", "variable", "name", "load", "file", "blah",
            "blah", "foobar"
        ]
        self.id_splitter = IdentifierSplitterBiLSTM()
        self.id_splitter.load(ID_SPLITTER_BILSTM)

    def test_load_and_run_model(self):
        self.assertEqual(self.id_splitter.split(self.test_X), self.test_y)
示例#7
0
class IdSplitterPipelineTest(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_binarize(self):
        from sourced.ml.core.algorithms.id_splitter.pipeline import binarize
        thresholds = [
            0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99
        ]
        n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

        for th, n_p in zip(thresholds, n_pos):
            vals = numpy.arange(10) / 10
            res = binarize(vals, th)
            self.assertEqual(sum(binarize(vals, th)), n_p)
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)

        vals = numpy.arange(10) / 10
        old_vals = vals.copy()
        for th, n_p in zip(thresholds, n_pos):
            res = binarize(vals, th, inplace=False)
            self.assertEqual(sum(res), n_p)
            self.assertTrue(numpy.array_equal(old_vals, vals))
            if th in (0, 0.99):
                self.assertEqual(numpy.unique(res).shape[0], 1)
            else:
                self.assertEqual(numpy.unique(res).shape[0], 2)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_devices(self):
        from sourced.ml.core.algorithms.id_splitter.nn_model import prepare_devices
        correct_args = ["1", "0,1", "-1"]
        resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"),
                        ("/cpu:0", "/cpu:0")]
        for res, arg in zip(resulted_dev, correct_args):
            self.assertEqual(res, prepare_devices(arg))

        bad_args = ["", "1,2,3"]
        for arg in bad_args:
            with self.assertRaises(ValueError):
                prepare_devices(arg)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_schedule(self):
        from sourced.ml.core.algorithms.id_splitter.pipeline import build_schedule
        start_lr = 10
        end_lr = 1
        n_epochs = 9

        lr_schedule = build_schedule(lr=start_lr,
                                     final_lr=end_lr,
                                     n_epochs=n_epochs)

        for i in range(n_epochs):
            self.assertEqual(start_lr - i, lr_schedule(epoch=i))

        with self.assertRaises(AssertionError):
            lr_schedule(-1)
        with self.assertRaises(AssertionError):
            lr_schedule(n_epochs + 1)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_build_train_generator(self):
        from sourced.ml.core.algorithms.id_splitter.pipeline import build_train_generator
        batch_size = 3
        # mismatch number of samples
        bad_x = numpy.zeros(3)
        bad_y = numpy.zeros(4)
        with self.assertRaises(AssertionError):
            build_train_generator(bad_x, bad_y, batch_size=batch_size)

        # check generator with correct inputs
        x = numpy.zeros(5)
        gen = build_train_generator(x, x, batch_size=batch_size)
        expected_n_samples = [3, 2]
        for n_samples in expected_n_samples:
            x_gen, y_gen = next(gen)
            self.assertEqual(x_gen.shape, y_gen.shape)
            self.assertEqual(n_samples, x_gen.shape[0])

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_train_parameters(self):
        from sourced.ml.core.algorithms.id_splitter.pipeline import create_generator_params
        batch_size = 500
        samples_per_epoch = 10**6
        n_samples = 40 * 10**6
        epochs = 10

        steps_per_epoch_ = samples_per_epoch // batch_size
        n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch)

        steps_per_epoch, n_epochs = create_generator_params(
            batch_size, samples_per_epoch, n_samples, epochs)
        self.assertEqual(steps_per_epoch, steps_per_epoch_)
        self.assertEqual(n_epochs, n_epochs_)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_config_keras(self):
        from keras.backend.tensorflow_backend import get_session
        from sourced.ml.core.algorithms.id_splitter.pipeline import config_keras
        config_keras()
        sess = get_session()
        self.assertTrue(sess._config.gpu_options.allow_growth)

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_prepare_callbacks(self):
        from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint
        from sourced.ml.core.algorithms.id_splitter.pipeline import prepare_callbacks
        with tempfile.TemporaryDirectory() as tmpdir:
            callbacks = prepare_callbacks(tmpdir)

            # TensorBoard
            self.assertIsInstance(callbacks[0], TensorBoard)
            self.assertTrue(callbacks[0].log_dir.startswith(tmpdir))

            # CSVLogger
            self.assertIsInstance(callbacks[1], CSVLogger)
            self.assertTrue(callbacks[1].filename.startswith(tmpdir))

            # ModelCheckpoint
            self.assertIsInstance(callbacks[2], ModelCheckpoint)
            self.assertTrue(callbacks[2].filepath.startswith(tmpdir))