def test_creating_buffered_context_check_labels_shape(self):
    """Three labels combined with two-entry context lists must raise ``ValueError``."""
    context_keys = (FROM_TOKEN, PATH_TYPES, TO_TOKEN)
    # Everything is built inside the context manager on purpose: any
    # ValueError raised while assembling the arguments counts as well.
    with self.assertRaises(ValueError):
        label_input = ([[] for _ in range(3)], ConvertParameters(0, False, {}))
        context_input = {
            key: ([[] for _ in range(2)], ConvertParameters(0, False, {}))
            for key in context_keys
        }
        BufferedPathContext.create_from_lists(label_input, context_input)
def _convert_raw_buffer(convert_args: Tuple[List[str], PreprocessingConfig, Vocabulary, str, int]) -> None:
    """Convert one buffer of raw text lines to ids and store the result on disk.

    Every non-blank line is split on whitespace: the first field is the label,
    each remaining field is one path context. Label parts that are missing
    from ``vocab.label_to_id`` are replaced by the id of ``UNK``.

    Args:
        convert_args: a single tuple ``(lines, config, vocab, output_folder, buffer_id)``.
            NOTE(review): the arguments are packed into one tuple, presumably so
            the function can be mapped over a worker pool -- confirm with the caller.

    Side effects:
        Dumps the buffer to ``BUFFERED_PATH_TEMPLATE.format(buffer_id)`` inside
        ``output_folder`` and afterwards appends a
        ``buffer_id,file_name,n_samples,n_paths`` row to ``DESCRIPTION_FILE``
        in the same folder.
    """
    lines, config, vocab, output_folder, buffer_id = convert_args
    labels, from_tokens, path_types, to_tokens = [], [], [], []
    for line in lines:
        fields = line.split()
        if not fields:
            # Blank / whitespace-only line: there is no label to unpack, and
            # `label, *rest = []` would raise ValueError for the whole buffer.
            continue
        label, *path_contexts = fields
        label = _parse_token(label, config.split_target)
        labels.append([vocab.label_to_id.get(_l, vocab.label_to_id[UNK]) for _l in label])
        converted_context = [_convert_path_context_to_ids(config.split_names, pc, vocab) for pc in path_contexts]
        # Each converted context is indexed as (from token, path type, to token).
        from_tokens.append([cc[0] for cc in converted_context])
        path_types.append([cc[1] for cc in converted_context])
        to_tokens.append([cc[2] for cc in converted_context])

    bpc = BufferedPathContext.create_from_lists(
        (labels, ConvertParameters(config.max_target_parts, config.wrap_target, vocab.label_to_id)),
        {
            FROM_TOKEN: (from_tokens, ConvertParameters(config.max_name_parts, config.wrap_name, vocab.token_to_id)),
            PATH_TYPES: (path_types, ConvertParameters(config.max_path_length, config.wrap_path, vocab.type_to_id)),
            TO_TOKEN: (to_tokens, ConvertParameters(config.max_name_parts, config.wrap_name, vocab.token_to_id)),
        },
    )

    buffer_file_name = BUFFERED_PATH_TEMPLATE.format(buffer_id)
    # Write the buffer before registering it, so that a failed dump cannot
    # leave a description row that points at a file which does not exist.
    bpc.dump(path.join(output_folder, buffer_file_name))

    n_samples = len(bpc.contexts_per_label)
    n_paths = sum(bpc.contexts_per_label)
    with open(path.join(output_folder, DESCRIPTION_FILE), "a") as desc_file:
        desc_file.write(f"{buffer_id},{buffer_file_name},{n_samples},{n_paths}\n")
def test_creating_standard_path_context(self):
    """Build a buffer from three samples and compare every array it exposes.

    The samples carry 2, 3 and 1 path contexts respectively; the second
    sample has an empty label and empty contexts.
    """
    # Each vocabulary maps SOS / EOS / PAD to different ids.
    token_vocab = {SOS: 0, EOS: 1, PAD: 2}
    type_vocab = {SOS: 1, EOS: 2, PAD: 0}
    label_vocab = {SOS: 2, EOS: 0, PAD: 1}

    raw_labels = [[4], [], [4, 5, 6]]
    raw_from_tokens = [[[4], [5, 6]], [[], [], []], [[6, 5, 4]]]
    raw_path_types = [[[4, 5], [6]], [[], [], []], [[6, 5, 4]]]
    raw_to_tokens = [[[6], [4, 5]], [[], [], []], [[4, 6, 4]]]

    result = BufferedPathContext.create_from_lists(
        (raw_labels, ConvertParameters(3, True, label_vocab)),
        {
            FROM_TOKEN: (raw_from_tokens, ConvertParameters(3, False, token_vocab)),
            PATH_TYPES: (raw_path_types, ConvertParameters(3, True, type_vocab)),
            TO_TOKEN: (raw_to_tokens, ConvertParameters(3, False, token_vocab)),
        },
    )

    # Expected arrays hold one column per label / per path context
    # (2 + 3 + 1 = 6 context columns). The inputs converted with the
    # second ConvertParameters argument set to True (labels, path types)
    # have an extra leading row filled with the SOS id.
    expected_labels = numpy.array([[2, 2, 2], [4, 0, 4], [0, 1, 5], [1, 1, 6]])
    expected_contexts = (
        (
            FROM_TOKEN,
            numpy.array([[4, 5, 2, 2, 2, 6], [2, 6, 2, 2, 2, 5], [2, 2, 2, 2, 2, 4]]),
        ),
        (
            PATH_TYPES,
            numpy.array([[1, 1, 1, 1, 1, 1], [4, 6, 2, 2, 2, 6], [5, 2, 0, 0, 0, 5], [2, 0, 0, 0, 0, 4]]),
        ),
        (
            TO_TOKEN,
            numpy.array([[6, 4, 2, 2, 2, 4], [2, 5, 2, 2, 2, 6], [2, 2, 2, 2, 2, 4]]),
        ),
    )

    self.assertListEqual([2, 3, 1], result.contexts_per_label)
    numpy.testing.assert_array_equal(expected_labels, result.labels)
    for context_key, expected_array in expected_contexts:
        numpy.testing.assert_array_equal(expected_array, result.contexts[context_key])