Exemplo n.º 1
0
def setup_datasets(dataset_name,
                   root='.data',
                   vocab_size=20000,
                   include_unk=False):
    """Download a text-classification dataset and build train/test datasets.

    Downloads and extracts the archive for ``dataset_name``, trains (or
    reuses) a SentencePiece model, numericalizes both CSV splits with it,
    and wraps the results in ``TextClassificationDataset`` objects.

    Args:
        dataset_name: key into the module-level ``URLS`` mapping.
        vocab_size: SentencePiece vocabulary size used when training a
            new tokenizer model.
        include_unk: kept for interface compatibility; not used here.

    Returns:
        Tuple of (train_dataset, test_dataset).

    Raises:
        FileNotFoundError: if the archive lacks train.csv or test.csv.
        ValueError: if train and test label sets differ.
    """
    dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    # Locate both CSV splits; previously a missing file caused a
    # confusing NameError further down instead of a clear error here.
    train_csv_path = None
    test_csv_path = None
    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        elif fname.endswith('test.csv'):
            test_csv_path = fname
    if train_csv_path is None or test_csv_path is None:
        raise FileNotFoundError(
            'archive must contain both train.csv and test.csv')

    # Train the SentencePiece tokenizer once; later runs reuse the model.
    if not path.exists('m_user.model'):
        logging.info('Generate SentencePiece pretrained tokenizer...')
        generate_sp_model(train_csv_path, vocab_size)

    sp_model = load_sp_model("m_user.model")
    sp_generator = sentencepiece_numericalizer(sp_model)
    train_data, train_labels = _create_data_with_sp_transform(
        sp_generator, train_csv_path)
    test_data, test_labels = _create_data_with_sp_transform(
        sp_generator, test_csv_path)

    # Symmetric difference: the label sets must match exactly.
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (text_classification.TextClassificationDataset(
        None, train_data, train_labels),
            text_classification.TextClassificationDataset(
                None, test_data, test_labels))
Exemplo n.º 2
0
    def __init__(self, prefix, pretrained_text=False):
        """Set up the Kinetics split: SentencePiece tokenizer, raw-text
        dict, and paths to precomputed audio/video/text features.

        Args:
            prefix: dataset split name (e.g. "train"), used to locate the
                pickle and feature files.
            pretrained_text: forwarded to ``get_npy_paths``.
        """
        self.prefix = prefix
        self.pretrained_text = pretrained_text
        self.context_length = 128
        self.num_classes = 700  # Kinetics-700 class count
        self.vocab_size = sp_vocab_size
        self.sp_model_path = sp_model_path
        self.sp_model = load_sp_model(sp_model_path)
        self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
        self.start_token, self.end_token = self.sp_id_generator(
            ["<|startoftext|>", "<|endoftext|>"])
        # Use a context manager so the pickle file handle is closed
        # (the previous `pickle.load(open(...))` leaked it).
        with open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb") as f:
            self.text_dict = pickle.load(f)

        # Fetch paths to audio, video, text features for samples
        self.a_paths, self.v_paths, self.t_paths = get_npy_paths(
            prefix, pretrained_text=pretrained_text)
        print(len(self.t_paths))

        self.tags = []
Exemplo n.º 3
0
    def __init__(self,
                 prefix,
                 num_classes=700,
                 zero_shot=False,
                 pretrained_text=False):
        """Set up the Kinetics split: SentencePiece tokenizer, raw-text
        dict, feature paths, and per-sample labels.

        Args:
            prefix: dataset split name (e.g. "train"), used to locate
                pickle and feature files.
            num_classes: number of target classes.
            zero_shot: stored flag; presumably toggles zero-shot
                evaluation elsewhere — not used in this constructor.
            pretrained_text: forwarded to ``get_npy_paths``.
        """
        self.prefix = prefix
        self.num_classes = num_classes
        self.zero_shot = zero_shot
        self.pretrained_text = pretrained_text

        self.context_length = 128
        self.vocab_size = 20000
        self.sp_model_path = sp_model_path
        self.sp_model = load_sp_model(sp_model_path)
        self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
        self.start_token, self.end_token = self.sp_id_generator(
            ["<|startoftext|>", "<|endoftext|>"])
        # Context managers close the pickle file handles; the previous
        # `pickle.load(open(...))` pattern leaked them.
        with open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb") as f:
            self.text_dict = pickle.load(f)

        # Fetch paths to audio, video, text features for samples
        self.a_paths, self.v_paths, self.t_paths = get_npy_paths(
            prefix, pretrained_text=pretrained_text)
        with open("{}/{}.pickle".format(pickle_root_dir, prefix), "rb") as f:
            self.labels = pickle.load(f)
Exemplo n.º 4
0
    def test_sentencepiece_numericalizer(self):
        """Numericalizing a known sentence must yield the reference ids."""
        sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        sp_model = load_sp_model('test/asset/spm_example.model')
        self.assertEqual(len(sp_model), 20000)
        numericalize = sentencepiece_numericalizer(sp_model)

        expected = [15340, 4286, 981, 1207, 1681, 17, 84, 684, 8896, 5366,
                    144, 3689, 9, 5602, 12114, 6, 560, 649, 5602, 12114]

        # The generator yields one id-list per input sentence.
        actual = next(iter(numericalize([sample])))
        self.assertEqual(actual, expected)
Exemplo n.º 5
0
    def __init__(self, prefix, pretrained_text=False):
        """Set up the Kinetics split: SentencePiece tokenizer, raw-text
        dict, and paths to the downsampled .mp4 clips.

        Args:
            prefix: dataset split name (e.g. "train"), used to locate the
                pickle and video files.
            pretrained_text: stored flag; not used in this constructor.
        """
        self.prefix = prefix
        self.pretrained_text = pretrained_text
        self.context_length = 128
        self.num_classes = 700  # Kinetics-700 class count
        self.vocab_size = sp_vocab_size
        self.sp_model_path = sp_model_path
        self.sp_model = load_sp_model(sp_model_path)
        self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
        self.start_token, self.end_token = self.sp_id_generator(
            ["<|startoftext|>", "<|endoftext|>"])
        # Use a context manager so the pickle file handle is closed
        # (the previous `pickle.load(open(...))` leaked it).
        with open(f"{raw_text_dir}/kinetics_{prefix}.pickle", "rb") as f:
            self.text_dict = pickle.load(f)

        # Fetch paths to audio, video, text features for samples
        self.paths = glob.glob(f'{downsample_root_dir}/{prefix}/*.mp4')
        print(len(self.paths))
Exemplo n.º 6
0
    def __init__(self,
                 lava_model_path=lava_weights_path,
                 guse_model_path="https://tfhub.dev/google/universal-sentence-encoder/4",
                 sp_model_path=sp_model_path,
                 prefix="train"):
        """Load a pretrained LAVA checkpoint and expose its per-modality
        encoders (audio/video/text) for feature extraction.

        Args:
            lava_model_path: path to the LAVA Lightning checkpoint.
            guse_model_path: TF-Hub URL of the Universal Sentence
                Encoder; stored but not loaded here.
            sp_model_path: path to the SentencePiece model file.
            prefix: dataset split name; stored by callers' convention.
        """
        self.guse_model_path = guse_model_path
        self.lava_model_path = lava_model_path

        self.sp_model_path = sp_model_path
        self.sp_model = load_sp_model(sp_model_path)
        self.sp_id_generator = sentencepiece_numericalizer(self.sp_model)
        self.start_token, self.end_token = self.sp_id_generator(
            ["<|startoftext|>", "<|endoftext|>"])
        self.context_length = 128

        # Restore the full Lightning module on CPU, then keep only its
        # encoder; eval() freezes dropout/batch-norm behavior.
        self.model = LAVALightning(pretrained_text=False, num_heads=4,
                                   num_layers=4, model_dimension=1024)
        state_dict = torch.load(lava_model_path,
                                map_location='cpu')['state_dict']
        self.model.load_state_dict(state_dict, strict=True)
        self.model.eval()

        self.lava_model = self.model.encoder

        # Per-modality encoders pulled off the shared LAVA encoder.
        self.a_encoder = self.lava_model.a_encoder
        self.v_encoder = self.lava_model.v_encoder
        self.t_encoder = self.lava_model.t_encoder

        self.feature_dimension = self.lava_model.feature_dimension
        self.model_dimension = self.lava_model.model_dimension
Exemplo n.º 7
0
# %% [markdown]
# We use the same tokenizer as in the data-preprocessing pipeline.

# %%
# Load the pretrained SentencePiece BPE models (vocab size 10000) for
# German (de) and Low German (nds).
sp_deu = load_sp_model("preprocessed_data/sp_model/de.wiki.bpe.vs10000.model")
sp_nds = load_sp_model("preprocessed_data/sp_model/nds.wiki.bpe.vs10000.model")

# %%

# Demo: tokenize two German sentences into subword pieces, then into
# their numeric ids, and print both.
sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)
list_a = [
    "Komplizierte Wörter sind Baustelle.",
    "Morgen soll es regnen und übermorgen scheint die Sonne"
]
print(list(sp_deu_tokens_generator(list_a)))
sp_numericalize_generator = sentencepiece_numericalizer(sp_deu)
print(list(sp_numericalize_generator(list_a)))

# %%

# Tokenizer generators for both languages, reused by the helpers below.
sp_deu_tokens_generator = sentencepiece_tokenizer(sp_deu)
sp_nds_tokens_generator = sentencepiece_tokenizer(sp_nds)

def tokenize_de(text):
    """Tokenize a German sentence into SentencePiece subword tokens."""
    # The generator yields exactly one token list for a one-item input.
    (tokens,) = sp_deu_tokens_generator([text])
    return tokens


def tokenize_nds(text):
    """Tokenize a Low German (nds) sentence into SentencePiece tokens."""
    # Take the single token list the generator yields for one input.
    return next(iter(sp_nds_tokens_generator([text])))