def setup_datasets(dataset_name, root='.data', vocab_size=20000, include_unk=False):
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    # Generate a SentencePiece pretrained tokenizer if one does not exist yet
    if not path.exists('m_user.model'):
        logging.info('Generating SentencePiece pretrained tokenizer...')
        generate_sp_model(train_csv_path, vocab_size)

    sp_model = load_sp_model("m_user.model")
    sp_generator = sentencepiece_numericalizer(sp_model)
    train_data, train_labels = _create_data_with_sp_transform(
        sp_generator, train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(
        sp_generator, test_csv_path)

    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
                None, train_data, train_labels),
            text_classification.TextClassificationDataset(
                None, test_data, test_labels))
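# Usage sketch (assumption, not from the original code): 'AG_NEWS' is only an
# illustrative key into URLS, and the helpers above are assumed to be in scope
# (torchtext.utils.download_from_url / extract_archive and
# torchtext.data.functional.generate_sp_model / load_sp_model / sentencepiece_numericalizer).
train_dataset, test_dataset = setup_datasets('AG_NEWS', root='.data', vocab_size=20000)
print(len(train_dataset), len(test_dataset))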
def __init__(self, prefix, pretrained_text=False):
    self.prefix = prefix
    self.pretrained_text = pretrained_text
    self.context_length = 128
    self.num_classes = 700
    self.vocab_size = sp_vocab_size
    self.sp_model_path = sp_model_path
    self.sp_model = load_sp_model(sp_model_path)
    self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
    self.start_token, self.end_token = self.sp_id_generator(
        ["<|startoftext|>", "<|endoftext|>"])
    self.text_dict = pickle.load(
        open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb"))

    # Fetch paths to audio, video, text features for samples
    self.a_paths, self.v_paths, self.t_paths = get_npy_paths(
        prefix, pretrained_text=pretrained_text)
    print(len(self.t_paths))
    # length = len(self.a_paths)
    # limit = length // 3
    # self.a_paths, self.v_paths, self.t_paths = self.a_paths[:limit], self.v_paths[:limit], self.t_paths[:limit]
    self.tags = []
def __init__(self, prefix, num_classes=700, zero_shot=False, pretrained_text=False):
    self.prefix = prefix
    self.num_classes = num_classes
    self.zero_shot = zero_shot
    self.pretrained_text = pretrained_text
    self.context_length = 128
    self.vocab_size = 20000
    self.sp_model_path = sp_model_path
    self.sp_model = load_sp_model(sp_model_path)
    self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
    self.start_token, self.end_token = self.sp_id_generator(
        ["<|startoftext|>", "<|endoftext|>"])
    self.text_dict = pickle.load(
        open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb"))

    # Fetch paths to audio, video, text features for samples
    self.a_paths, self.v_paths, self.t_paths = get_npy_paths(
        prefix, pretrained_text=pretrained_text)
    self.labels = pickle.load(
        open("{}/{}.pickle".format(pickle_root_dir, prefix), "rb"))
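# Hypothetical helper (not part of the original class): sketches how a caption from
# text_dict could be numericalized with the start/end token IDs above and padded or
# truncated to context_length. The function name and the pad ID of 0 are assumptions.
import torch

def encode_caption(caption, sp_id_generator, start_ids, end_ids, context_length=128):
    ids = list(sp_id_generator([caption]))[0]      # SentencePiece IDs for one caption
    ids = start_ids + ids + end_ids                # start/end tokens are lists of IDs
    ids = ids[:context_length]                     # truncate to the fixed context length
    ids = ids + [0] * (context_length - len(ids))  # pad with assumed pad ID 0
    return torch.tensor(ids, dtype=torch.long)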
def test_sentencepiece_numericalizer(self):
    test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    model_path = 'test/asset/spm_example.model'
    sp_model = load_sp_model(model_path)
    self.assertEqual(len(sp_model), 20000)
    spm_generator = sentencepiece_numericalizer(sp_model)
    ref_results = [15340, 4286, 981, 1207, 1681, 17, 84, 684, 8896, 5366,
                   144, 3689, 9, 5602, 12114, 6, 560, 649, 5602, 12114]
    self.assertEqual(list(spm_generator([test_sample]))[0], ref_results)
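# Companion sketch (assumption, not from the original test suite): the same model
# exercised through sentencepiece_tokenizer, which yields subword strings rather
# than IDs; the helper name is illustrative.
def check_sentencepiece_tokenizer(sp_model, sample):
    spm_tokenizer = sentencepiece_tokenizer(sp_model)
    tokens = list(spm_tokenizer([sample]))[0]
    assert all(isinstance(tok, str) for tok in tokens)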
def __init__(self, prefix, pretrained_text=False):
    self.prefix = prefix
    self.pretrained_text = pretrained_text
    self.context_length = 128
    self.num_classes = 700
    self.vocab_size = sp_vocab_size
    self.sp_model_path = sp_model_path
    self.sp_model = load_sp_model(sp_model_path)
    self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
    self.start_token, self.end_token = self.sp_id_generator(
        ["<|startoftext|>", "<|endoftext|>"])
    self.text_dict = pickle.load(
        open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb"))

    # Fetch paths to the downsampled video clips for samples
    self.paths = glob.glob(f'{downsample_root_dir}/{prefix}/*.mp4')
    print(len(self.paths))
def __init__(self,
             lava_model_path=lava_weights_path,
             guse_model_path="https://tfhub.dev/google/universal-sentence-encoder/4",
             sp_model_path=sp_model_path,
             prefix="train"):
    self.guse_model_path = guse_model_path
    self.lava_model_path = lava_model_path
    self.sp_model_path = sp_model_path
    self.sp_model = load_sp_model(sp_model_path)
    self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
    self.start_token, self.end_token = self.sp_id_generator(
        ["<|startoftext|>", "<|endoftext|>"])
    self.context_length = 128
    # self.guse_model = hub.load(self.guse_model_path)

    self.model = LAVALightning(pretrained_text=False,
                               num_heads=4,
                               num_layers=4,
                               model_dimension=1024)
    self.model.load_state_dict(
        torch.load(lava_model_path, map_location='cpu')['state_dict'], strict=True)
    self.model.eval()

    self.lava_model = self.model.encoder
    # self.a_feature_model = self.lava_model._audio_feature_model
    # self.a_projection = self.lava_model._audio_input_projection
    # self.a_encoder = self.lava_model._audio_encoder
    # self.v_feature_model = self.lava_model._video_feature_model
    # self.v_projection = self.lava_model._video_input_projection
    # self.v_encoder = self.lava_model._video_encoder
    # self.t_feature_model = self.lava_model._text_feature_model
    # self.t_projection = self.lava_model._text_input_projection
    # self.feature_dimension = self.lava_model._feature_dimension
    # self.model_dimension = self.lava_model._model_dimension

    self.a_encoder = self.lava_model.a_encoder
    self.v_encoder = self.lava_model.v_encoder
    self.t_encoder = self.lava_model.t_encoder
    self.feature_dimension = self.lava_model.feature_dimension
    self.model_dimension = self.lava_model.model_dimension
# %% [markdown]
# We use the same tokenizers as in the data-preprocessing pipeline.

# %%
sp_deu = load_sp_model("preprocessed_data/sp_model/de.wiki.bpe.vs10000.model")
sp_nds = load_sp_model("preprocessed_data/sp_model/nds.wiki.bpe.vs10000.model")

# %%
sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)

list_a = [
    "Komplizierte Wörter sind Baustelle.",
    "Morgen soll es regnen und übermorgen scheint die Sonne"
]
print(list(sp_deu_tokens_generator(list_a)))

sp_numericalize_generator = sentencepiece_numericalizer(sp_deu)
print(list(sp_numericalize_generator(list_a)))

# %%
sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)
sp_nds_tokens_generator = sentencepiece_tokenizer(sp_nds)


def tokenize_de(text):
    return list(sp_deu_tokens_generator([text]))[0]


def tokenize_nds(text):
    return list(sp_nds_tokens_generator([text]))[0]
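# %% [markdown]
# A possible next step (sketch, not from the original notebook): these tokenizers can be
# wired into legacy torchtext `Field`s for a de→nds translation pipeline. The import path
# (`torchtext.data` up to 0.8, `torchtext.legacy.data` on 0.9–0.11) and the special tokens
# below are assumptions.

# %%
from torchtext.data import Field  # use torchtext.legacy.data on newer releases

SRC = Field(tokenize=tokenize_de, init_token="<sos>", eos_token="<eos>", batch_first=True)
TRG = Field(tokenize=tokenize_nds, init_token="<sos>", eos_token="<eos>", batch_first=True)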