def test_decomposition_json(self):
    """Test decomposition of all layers into separate files for json"""
    output_file = f"{self.tmpdir.name}/somename.json"
    actual_output_files = [
        f"{self.tmpdir.name}/somename-layer{layer_idx}.json"
        for layer_idx in range(self.num_layers)
    ]

    writer = ActivationsWriter.get_writer(output_file, decompose_layers=True)
    for s_idx in range(len(self.sentences)):
        writer.write_activations(
            s_idx,
            self.sentences[s_idx].split(" "),
            self.expected_activations[s_idx],
        )
    writer.close()

    for layer_idx, output_file in enumerate(actual_output_files):
        saved_activations, num_layers = loader.load_activations(output_file)

        # Decomposed files should only have 1 layer each
        self.assertEqual(1, num_layers)

        # Check saved activations
        for sentence_idx, sentence_activations in enumerate(saved_activations):
            curr_saved_activations = torch.FloatTensor(
                saved_activations[sentence_idx]
            )
            curr_expected_activations = self.expected_activations[sentence_idx][
                layer_idx, :, :
            ]
            self.assertTrue(
                torch.allclose(curr_saved_activations, curr_expected_activations)
            )
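# A minimal usage sketch of the decomposition behaviour exercised by the test
# above, outside the test harness. The import paths are assumed from this
# repository's layout (neurox.data.writer and neurox.data.loader); the tensor
# shapes and output path are illustrative only.

import torch
from neurox.data.writer import ActivationsWriter
import neurox.data.loader as loader

sentences = ["hello world", "a short example"]
# One tensor per sentence: (num_layers=3, num_tokens, hidden_dim=8)
activations = [torch.rand(3, len(s.split(" ")), 8) for s in sentences]

writer = ActivationsWriter.get_writer("/tmp/example.json", decompose_layers=True)
for s_idx, sentence in enumerate(sentences):
    writer.write_activations(s_idx, sentence.split(" "), activations[s_idx])
writer.close()

# decompose_layers=True produces /tmp/example-layer0.json through
# /tmp/example-layer2.json, each reporting a single layer when loaded back
layer0_activations, num_layers = loader.load_activations("/tmp/example-layer0.json")
assert num_layers == 1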
def test_filter_layers_json(self):
    """Test layer filtering for json"""
    output_file = f"{self.tmpdir.name}/somename.json"

    writer = ActivationsWriter.get_writer(
        output_file, filter_layers=",".join(map(str, self.filter_layers))
    )
    for s_idx in range(len(self.sentences)):
        writer.write_activations(
            s_idx,
            self.sentences[s_idx].split(" "),
            self.expected_activations[s_idx],
        )
    writer.close()

    saved_activations, num_layers = loader.load_activations(output_file)
    self.assertEqual(len(self.filter_layers), num_layers)

    # Check saved activations
    for sentence_idx, sentence_activations in enumerate(saved_activations):
        curr_saved_activations = torch.FloatTensor(
            saved_activations[sentence_idx]
            .reshape(
                (
                    self.expected_activations[sentence_idx].shape[1],
                    len(self.filter_layers),
                    -1,
                )
            )
            .swapaxes(0, 1)
        )
        curr_expected_activations = self.expected_activations[sentence_idx][
            self.filter_layers, :, :
        ]
        self.assertTrue(
            torch.allclose(curr_saved_activations, curr_expected_activations)
        )
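# A companion sketch for layer filtering, reusing the setup (sentences,
# activations, imports) from the previous snippet: filter_layers takes a
# comma-separated string of layer indices, and only those layers are written.

writer = ActivationsWriter.get_writer("/tmp/filtered.json", filter_layers="0,2")
for s_idx, sentence in enumerate(sentences):
    writer.write_activations(s_idx, sentence.split(" "), activations[s_idx])
writer.close()

saved, num_layers = loader.load_activations("/tmp/filtered.json")
assert num_layers == 2  # only layers 0 and 2 were kept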
# The decorator supplying mock_create_binary_data was missing; the patch
# target below assumes _create_binary_data lives in neurox.data.annotate and
# requires `from unittest.mock import patch` at the top of the file.
@patch("neurox.data.annotate._create_binary_data")
def test_binary_data_wrapper(self, mock_create_binary_data):
    """Test the annotate_data wrapper around _create_binary_data"""
    mock_create_binary_data.return_value = (
        self.test_sentences,
        self.test_sentences,
        self.activations,
    )

    annotate.annotate_data(
        f"{self.tmpdir.name}/gold.word",
        f"{self.tmpdir.name}/gold.hdf5",
        {"test"},
        f"{self.tmpdir.name}/test",
    )

    with open(f"{self.tmpdir.name}/test.word") as fp:
        for line_idx, line in enumerate(fp):
            self.assertEqual(self.test_sentences[line_idx], line.strip())

    # Load and check activations as well
    test_activations, test_num_layers = data_loader.load_activations(
        f"{self.tmpdir.name}/test.hdf5"
    )
    self.assertEqual(self.num_layers, test_num_layers)

    gold_activations = [a.reshape((a.shape[1], -1)) for a in self.activations]
    for act_idx, act in enumerate(test_activations):
        self.assertTrue(
            torch.allclose(gold_activations[act_idx], torch.FloatTensor(act))
        )
def annotate_data(
    source_path,
    activations_path,
    binary_filter,
    output_prefix,
    output_type="hdf5",
    decompose_layers=False,
    filter_layers=None,
):
    r"""
    Given a set of sentences with per-word activations, a binary_filter and an
    output_prefix, creates binary-labeled data and saves it to disk.

    A binary filter can be a set of words, a regex object or a function.

    Parameters
    ----------
    source_path : str
        Path to a text file with one sentence per line
    activations_path : str
        Path to the file containing the sentence-wise activations
    binary_filter : set or re.Pattern or callable
        A set of words, a compiled regex object or a function that selects
        the positive-class words
    output_prefix : str
        Prefix of the output files that will be saved as the output of this
        script
    output_type : str, optional
        Format of the saved activations, "hdf5" (default) or "json"
    decompose_layers : bool, optional
        Whether to save each layer's activations into a separate file
    filter_layers : str, optional
        Comma-separated list of layer indices to save

    Returns
    -------
    None
        Saves a word file, a binary label file and their activations to disk

    Example
    -------
    annotate_data(source_path, activations_path, re.compile(r'^\w\w$'), output_prefix)
        selects words of exactly two characters as the positive class
    annotate_data(source_path, activations_path, {'is', 'can'}, output_prefix)
        selects occurrences of 'is' and 'can' as the positive class
    """
    activations, num_layers = data_loader.load_activations(activations_path)

    # Giving source_path instead of labels since labels will be generated later
    tokens = data_loader.load_data(source_path, source_path, activations, max_sent_l=512)

    words, labels, activations = _create_binary_data(tokens, activations, binary_filter)

    # Rearrange each sentence's activations to (num_layers, num_tokens, hidden_dim)
    activations = [
        np.swapaxes(a.reshape((a.shape[1], num_layers, -1)), 0, 1)
        for a in activations
    ]

    data_utils.save_files(
        words,
        labels,
        activations,
        output_prefix,
        output_type,
        decompose_layers,
        filter_layers,
    )
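# A usage sketch for annotate_data, mirroring the docstring example. The
# paths are placeholders and the module path is assumed from this
# repository's layout.

import re
from neurox.data.annotate import annotate_data

# Mark every two-character token as the positive class; all other tokens
# become the negative class.
annotate_data(
    "data/sentences.word",
    "data/sentences.hdf5",
    re.compile(r"^\w\w$"),
    "output/twochar",
    output_type="hdf5",
)
# Saves output/twochar.word, a matching binary label file and the selected
# activations under the same prefix (exact names follow data_utils.save_files).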
def test_decomposition_and_filter_layers_hdf5(self):
    """Test decomposition of specific layers into separate files for hdf5"""
    output_file = f"{self.tmpdir.name}/somename.hdf5"
    actual_output_files = [
        f"{self.tmpdir.name}/somename-layer{layer_idx}.hdf5"
        for layer_idx in self.filter_layers
    ]

    writer = ActivationsWriter.get_writer(
        output_file,
        decompose_layers=True,
        filter_layers=",".join(map(str, self.filter_layers)),
    )
    for s_idx in range(len(self.sentences)):
        writer.write_activations(
            s_idx,
            self.sentences[s_idx].split(" "),
            self.expected_activations[s_idx],
        )
    writer.close()

    for layer_idx, output_file in enumerate(actual_output_files):
        saved_activations, num_layers = loader.load_activations(output_file)

        # Decomposed files should only have 1 layer each
        self.assertEqual(1, num_layers)

        # Check saved activations
        for sentence_idx, sentence_activations in enumerate(saved_activations):
            curr_saved_activations = torch.FloatTensor(
                saved_activations[sentence_idx]
            )
            curr_expected_activations = self.expected_activations[sentence_idx][
                self.filter_layers[layer_idx], :, :
            ]
            self.assertTrue(
                torch.equal(curr_saved_activations, curr_expected_activations)
            )
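# When both options are combined, as in the test above, one file per *kept*
# layer is produced and each file is named after the original layer index.
# A sketch reusing the setup (sentences, activations, imports) from the
# first snippet; paths are illustrative.

writer = ActivationsWriter.get_writer(
    "/tmp/combo.hdf5",
    decompose_layers=True,
    filter_layers="1,2",
)
for s_idx, sentence in enumerate(sentences):
    writer.write_activations(s_idx, sentence.split(" "), activations[s_idx])
writer.close()

# Only /tmp/combo-layer1.hdf5 and /tmp/combo-layer2.hdf5 are written; each
# holds a single layer's activations.
for layer_idx in (1, 2):
    _, num_layers = loader.load_activations(f"/tmp/combo-layer{layer_idx}.hdf5")
    assert num_layers == 1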