Example #1
    def test_save_decomposed_filter_layers(self, get_model_mock, extraction_mock):
        "Saving activations in multiple hdf5 files, for specific layers"
        get_model_mock.return_value = (None, None)
        extraction_mock.side_effect = self.mocked_model

        base_output_file = os.path.join(self.tmpdir.name, "output.hdf5")
        filter_layers = [1, 3, 5, 7, 12]

        output_files = [
            os.path.join(self.tmpdir.name, f"output-layer{layer_idx}.hdf5")
            for layer_idx in filter_layers
        ]

        transformers_extractor.extract_representations(
            "non-existent model",
            self.input_file,
            base_output_file,
            decompose_layers=True,
            filter_layers=",".join(map(str, filter_layers)),
        )

        for layer_idx, output_file in zip(filter_layers, output_files):
            saved_activations = h5py.File(output_file, "r")

            # Check hdf5 structure
            self.assertEqual(len(saved_activations.keys()), len(self.test_sentences) + 1)
            self.assertTrue("sentence_to_index" in saved_activations)
            for idx in range(len(self.test_sentences)):
                self.assertTrue(str(idx) in saved_activations)

            # Check saved sentences
            self.assertEqual(len(saved_activations["sentence_to_index"]), 1)
            sentence_to_index = json.loads(saved_activations["sentence_to_index"][0])
            self.assertEqual(len(sentence_to_index), len(self.test_sentences))
            for sentence in sentence_to_index:
                self.assertEqual(sentence, self.test_sentences[int(sentence_to_index[sentence])])

            # Check saved activations
            for sentence in sentence_to_index:
                idx = sentence_to_index[sentence]
                self.assertTrue(
                    torch.equal(
                        torch.FloatTensor(saved_activations[idx]),
                        self.expected_activations[int(idx)][layer_idx, :, :],
                    ))
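
# A minimal sketch (an assumption, not the repository's actual fixture code) of the
# data layout the assertions above rely on: whitespace-tokenized sentences and, per
# sentence, an activation tensor of shape (num_layers=13, num_tokens, hidden_dim)
# stored in self.expected_activations. Sentence text and hidden_dim are hypothetical.
import torch

test_sentences = ["Hello , this is test 1 .", "Hello , this is test 2 ."]
hidden_dim = 768  # any value works here, since the model itself is mocked
expected_activations = [
    torch.rand(13, len(sentence.split(" ")), hidden_dim)
    for sentence in test_sentences
]
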
Example #2
    def test_save_json(self, get_model_mock, extraction_mock):
        "Saving activations in single json file"
        get_model_mock.return_value = (None, None)
        extraction_mock.side_effect = self.mocked_model

        output_file = os.path.join(self.tmpdir.name, "output.json")

        transformers_extractor.extract_representations(
            "non-existent model",
            self.input_file,
            output_file,
            output_type="json",
        )

        with open(output_file) as fp:
            saved_activations = []
            for line in fp:
                saved_activations.append(json.loads(line))

        # Check json structure
        self.assertEqual(len(saved_activations), len(self.test_sentences))

        for representation in saved_activations:
            self.assertIn("linex_index", representation)
            self.assertIn("features", representation)

        # Check sentences and activations
        for idx, representation in enumerate(saved_activations):
            tokens = self.test_sentences[idx].split(" ")
            self.assertEqual(len(representation["features"]), len(tokens))
            for token_idx, token_repr in enumerate(representation["features"]):
                self.assertEqual(token_repr["token"], tokens[token_idx])
                self.assertEqual(len(token_repr["layers"]), 13)
                for layer_idx in range(13):
                    # Using allclose instead of equals since json is a lossy format
                    self.assertTrue(torch.allclose(
                        torch.Tensor(token_repr["layers"][layer_idx]["values"]),
                        self.expected_activations[idx][layer_idx, token_idx, :]
                    ))
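
# A sketch (assumption; values are illustrative only) of one JSON line whose structure
# the test above checks: one object per input sentence, one "features" entry per token,
# and one "layers" entry per layer carrying that token's activation "values".
example_json_line = {
    "linex_index": 0,
    "features": [
        {
            "token": "Hello",
            "layers": [
                {"values": [0.1, 0.2, 0.3]}  # hypothetical, truncated vector
                for _ in range(13)
            ],
        },
        # ... one entry per remaining token in the sentence
    ],
}
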
Example #3
    def test_save_filter_layers(self, get_model_mock, extraction_mock):
        "Saving activations from specific layers"
        get_model_mock.return_value = (None, None)
        extraction_mock.side_effect = self.mocked_model

        output_file = os.path.join(self.tmpdir.name, "output.hdf5")
        filter_layers = [1, 3, 5, 7, 12]

        transformers_extractor.extract_representations(
            "non-existant model",
            self.input_file,
            output_file,
            filter_layers=",".join(map(str, filter_layers)),
        )

        saved_activations = h5py.File(output_file, "r")
        sentence_to_index = json.loads(
            saved_activations["sentence_to_index"][0])

        # Check saved activations
        for sentence in self.test_sentences:
            idx = sentence_to_index[sentence]
            self.assertTrue(
                torch.equal(
                    torch.FloatTensor(saved_activations[idx]),
                    self.expected_activations[int(idx)][filter_layers, :, :],
                ))
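
# A small sketch of the layer selection used in the assertion above: indexing a
# (num_layers, num_tokens, hidden_dim) tensor with a Python list of layer indices
# keeps the layer dimension, yielding (len(filter_layers), num_tokens, hidden_dim).
# The tensor sizes below are hypothetical.
import torch

full = torch.rand(13, 6, 768)            # (layers, tokens, hidden)
subset = full[[1, 3, 5, 7, 12], :, :]    # pick five layers, keep the layer axis
assert subset.shape == (5, 6, 768)
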
Example #4
    def test_save_hdf5(self, get_model_mock, extraction_mock):
        "Saving activations in single hdf5 file"
        get_model_mock.return_value = (None, None)
        extraction_mock.side_effect = self.mocked_model

        output_file = os.path.join(self.tmpdir.name, "output.hdf5")

        transformers_extractor.extract_representations(
            "non-existent model",
            self.input_file,
            output_file,
            output_type="hdf5",
        )

        saved_activations = h5py.File(output_file, "r")

        # Check hdf5 structure
        self.assertEqual(len(saved_activations.keys()), len(self.test_sentences) + 1)
        self.assertTrue("sentence_to_index" in saved_activations)
        for idx in range(len(self.test_sentences)):
            self.assertTrue(str(idx) in saved_activations)

        # Check saved sentences
        self.assertEqual(len(saved_activations["sentence_to_index"]), 1)
        sentence_to_index = json.loads(saved_activations["sentence_to_index"][0])
        self.assertEqual(len(sentence_to_index), len(self.test_sentences))
        for sentence in sentence_to_index:
            self.assertEqual(sentence, self.test_sentences[int(sentence_to_index[sentence])])

        # Check saved activations
        for sentence in sentence_to_index:
            idx = sentence_to_index[sentence]
            self.assertTrue(
                torch.equal(
                    torch.FloatTensor(saved_activations[idx]),
                    self.expected_activations[int(idx)],
                ))
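
# A minimal reader sketch for the hdf5 layout verified above (the file path is
# hypothetical): a "sentence_to_index" dataset holding a JSON mapping from sentence
# text to dataset name, plus one activation dataset per sentence.
import json
import h5py

with h5py.File("output.hdf5", "r") as activations:
    sentence_to_index = json.loads(activations["sentence_to_index"][0])
    for sentence, idx in sentence_to_index.items():
        sentence_activations = activations[idx][()]  # (layers, tokens, hidden) array
        print(sentence, sentence_activations.shape)
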
Example #5
    def test_save_decomposed(self, get_model_mock, extraction_mock):
        "Saving activations in multiple files, one per layer"
        get_model_mock.return_value = (None, None)
        extraction_mock.side_effect = self.mocked_model

        base_output_file = os.path.join(self.tmpdir.name, "output.hdf5")
        output_files = [
            os.path.join(self.tmpdir.name, f"output-layer{layer_idx}.hdf5")
            for layer_idx in range(13)
        ]

        transformers_extractor.extract_representations(
            "non-existant model",
            self.input_file,
            base_output_file,
            decompose_layers=True,
        )

        for layer_idx, output_file in enumerate(output_files):
            saved_activations = h5py.File(output_file, "r")
            sentence_to_index = json.loads(
                saved_activations["sentence_to_index"][0])

            # Check saved activations
            for sentence in self.test_sentences:
                idx = sentence_to_index[sentence]
                self.assertTrue(
                    torch.equal(
                        torch.FloatTensor(saved_activations[idx]),
                        self.expected_activations[int(idx)][[layer_idx], :, :],
                    ))
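
# A small sketch of the indexing used in the expected tensor above: wrapping the layer
# index in a list ([[layer_idx], :, :]) keeps the layer dimension, so each per-layer
# comparison tensor has shape (1, num_tokens, hidden_dim), whereas a plain integer
# index would drop it. The tensor sizes below are hypothetical.
import torch

full = torch.rand(13, 6, 768)                # (layers, tokens, hidden)
assert full[[2], :, :].shape == (1, 6, 768)  # list index keeps the layer axis
assert full[2, :, :].shape == (6, 768)       # integer index drops it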