def load_data(data_cfg: dict) -> (Dataset, Dataset, Dataset, Vocabulary, Vocabulary):
    """
    Load train, dev and optionally test data as specified in the configuration.
    Vocabularies are created from the training set with a limit of `voc_limit`
    tokens and a minimum token frequency of `voc_min_freq`
    (specified in the configuration dictionary).

    The training data is filtered to include sentences up to `max_sent_length`
    on the source and target side.

    If you set ``random_train_subset``, a random selection of this size is used
    instead of the full training set.
    If you set ``random_dev_subset``, a random selection of this size is used
    instead of the full development set.

    :param data_cfg: configuration dictionary for data
        ("data" part of configuration file)
    :return:
        - train_data: training dataset
        - dev_data: development dataset
        - test_data: test dataset if given, otherwise None
        - gls_vocab: gloss vocabulary extracted from training data
        - txt_vocab: spoken text vocabulary extracted from training data
    """
    data_path = data_cfg.get("data_path", "./data")
    train_paths = [os.path.join(data_path, x) for x in data_cfg["train"]]
    dev_paths = os.path.join(data_path, data_cfg["dev"])
    test_paths = os.path.join(data_path, data_cfg["test"])
    pad_feature_size = data_cfg["feature_size"]
    pad_feature_size_keypoints = data_cfg["feature_size_keypoints"]
    level = data_cfg["level"]
    txt_lowercase = data_cfg["txt_lowercase"]
    max_sent_length = data_cfg["max_sent_length"]
    lst_names = data_cfg["names"]

    def tokenize_text(text):
        if level == "char":
            return list(text)
        else:
            return text.split()

    def tokenize_features(features):
        ft_list = torch.split(features, 1, dim=0)
        return [ft.squeeze() for ft in ft_list]

    # NOTE (Cihan): The something was necessary to match the function signature.
    def stack_features(features, something):
        return torch.stack([torch.stack(ft, dim=0) for ft in features], dim=0)

    sequence_field = data.RawField()
    signer_field = data.RawField()

    # sgn: continuous sign video features, padded with zero vectors (no vocabulary)
    sgn_field = data.Field(
        use_vocab=False,
        init_token=None,
        dtype=torch.float32,
        preprocessing=tokenize_features,
        tokenize=lambda features: features,  # TODO (Cihan): is this necessary?
        batch_first=True,
        include_lengths=True,
        postprocessing=stack_features,
        pad_token=torch.zeros((pad_feature_size,)),
    )

    # keypoints: continuous keypoint features, handled analogously to sgn
    keypoints_field = data.Field(
        use_vocab=False,
        init_token=None,
        dtype=torch.float32,
        preprocessing=tokenize_features,
        tokenize=lambda features: features,  # TODO (Cihan): is this necessary?
        batch_first=True,
        include_lengths=True,
        postprocessing=stack_features,
        pad_token=torch.zeros((pad_feature_size_keypoints,)),
    )

    # gls: gloss annotations (no BOS/EOS tokens)
    gls_field = data.Field(
        pad_token=PAD_TOKEN,
        tokenize=tokenize_text,
        batch_first=True,
        lower=False,
        include_lengths=True,
    )

    # txt: spoken language translations
    txt_field = data.Field(
        init_token=BOS_TOKEN,
        eos_token=EOS_TOKEN,
        pad_token=PAD_TOKEN,
        tokenize=tokenize_text,
        unk_token=UNK_TOKEN,
        batch_first=True,
        lower=txt_lowercase,
        include_lengths=True,
    )

    train_data = SignTranslationDataset(
        path=train_paths,
        names=lst_names,
        fields=(sequence_field, signer_field, sgn_field, keypoints_field, gls_field, txt_field),
        filter_pred=lambda x: len(vars(x)["sgn"]) <= max_sent_length
        and len(vars(x)["txt"]) <= max_sent_length,
    )

    gls_max_size = data_cfg.get("gls_voc_limit", sys.maxsize)
    gls_min_freq = data_cfg.get("gls_voc_min_freq", 1)
    txt_max_size = data_cfg.get("txt_voc_limit", sys.maxsize)
    txt_min_freq = data_cfg.get("txt_voc_min_freq", 1)

    gls_vocab_file = data_cfg.get("gls_vocab", None)
    txt_vocab_file = data_cfg.get("txt_vocab", None)

    gls_vocab = build_vocab(
        field="gls",
        min_freq=gls_min_freq,
        max_size=gls_max_size,
        dataset=train_data,
        vocab_file=gls_vocab_file,
    )
    txt_vocab = build_vocab(
        field="txt",
        min_freq=txt_min_freq,
        max_size=txt_max_size,
        dataset=train_data,
        vocab_file=txt_vocab_file,
    )

    random_train_subset = data_cfg.get("random_train_subset", -1)
    if random_train_subset > -1:
        # select this many training examples randomly and discard the rest
        keep_ratio = random_train_subset / len(train_data)
        keep, _ = train_data.split(
            split_ratio=[keep_ratio, 1 - keep_ratio],
            random_state=random.getstate(),
        )
        train_data = keep

    dev_data = SignTranslationDataset(
        path=dev_paths,
        names=lst_names,
        fields=(sequence_field, signer_field, sgn_field, keypoints_field, gls_field, txt_field),
    )
    random_dev_subset = data_cfg.get("random_dev_subset", -1)
    if random_dev_subset > -1:
        # select this many development examples randomly and discard the rest
        keep_ratio = random_dev_subset / len(dev_data)
        keep, _ = dev_data.split(
            split_ratio=[keep_ratio, 1 - keep_ratio],
            random_state=random.getstate(),
        )
        dev_data = keep

    # check if target exists
    test_data = SignTranslationDataset(
        path=test_paths,
        names=lst_names,
        fields=(sequence_field, signer_field, sgn_field, keypoints_field, gls_field, txt_field),
    )

    gls_field.vocab = gls_vocab
    txt_field.vocab = txt_vocab

    return train_data, dev_data, test_data, gls_vocab, txt_vocab
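
# Illustrative only (not part of the original module): a hedged sketch of the
# `data_cfg` shape that `load_data` consumes. Every concrete value below - file
# names, stream names, feature sizes - is an assumption; real values come from
# the "data" section of the YAML configuration.
def _example_load_data():
    example_data_cfg = {
        "data_path": "./data",            # root directory of the dataset files
        "train": ["phoenix14t.train"],    # hypothetical training annotation file(s)
        "dev": "phoenix14t.dev",          # hypothetical dev annotation file
        "test": "phoenix14t.test",        # hypothetical test annotation file
        "names": ["sign"],                # hypothetical feature-stream names
        "feature_size": 1024,             # assumed sgn feature dimension
        "feature_size_keypoints": 274,    # assumed keypoint feature dimension
        "level": "word",                  # "word" or "char" tokenization
        "txt_lowercase": True,
        "max_sent_length": 400,
    }
    return load_data(example_data_cfg)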
def save_vocab():
    data_cfg = load_config("configs/sign.yaml")
    data_path = data_cfg.get("data_path", "./data")
    if isinstance(data_cfg["train"], list):
        train_paths = [os.path.join(data_path, x) for x in data_cfg["train"]]
        dev_paths = [os.path.join(data_path, x) for x in data_cfg["dev"]]
        test_paths = [os.path.join(data_path, x) for x in data_cfg["test"]]
        pad_feature_size = sum(data_cfg["feature_size"])
    else:
        train_paths = os.path.join(data_path, data_cfg["train"])
        dev_paths = os.path.join(data_path, data_cfg["dev"])
        test_paths = os.path.join(data_path, data_cfg["test"])
        pad_feature_size = data_cfg["feature_size"]

    level = data_cfg["level"]
    txt_lowercase = data_cfg["txt_lowercase"]
    max_sent_length = data_cfg["max_sent_length"]

    def tokenize_text(text):
        if level == "char":
            return list(text)
        else:
            return text.split()

    def tokenize_features(features):
        ft_list = torch.split(features, 1, dim=0)
        return [ft.squeeze() for ft in ft_list]

    # NOTE (Cihan): The something was necessary to match the function signature.
    def stack_features(features, something):
        return torch.stack([torch.stack(ft, dim=0) for ft in features], dim=0)

    sequence_field = data.RawField()
    signer_field = data.RawField()

    sgn_field = data.Field(
        use_vocab=False,
        init_token=None,
        dtype=torch.float32,
        preprocessing=tokenize_features,
        tokenize=lambda features: features,  # TODO (Cihan): is this necessary?
        batch_first=True,
        include_lengths=True,
        postprocessing=stack_features,
        pad_token=torch.zeros((pad_feature_size,)),
    )

    gls_field = data.Field(
        pad_token=PAD_TOKEN,
        tokenize=tokenize_text,
        batch_first=True,
        lower=False,
        include_lengths=True,
    )

    txt_field = data.Field(
        init_token=BOS_TOKEN,
        eos_token=EOS_TOKEN,
        pad_token=PAD_TOKEN,
        tokenize=tokenize_text,
        unk_token=UNK_TOKEN,
        batch_first=True,
        lower=txt_lowercase,
        include_lengths=True,
    )

    train_data = SignTranslationDataset(
        path=train_paths,
        fields=(sequence_field, signer_field, sgn_field, gls_field, txt_field),
        filter_pred=lambda x: len(vars(x)["sgn"]) <= max_sent_length
        and len(vars(x)["txt"]) <= max_sent_length,
    )

    gls_max_size = data_cfg.get("gls_voc_limit", sys.maxsize)
    gls_min_freq = data_cfg.get("gls_voc_min_freq", 1)
    txt_max_size = data_cfg.get("txt_voc_limit", sys.maxsize)
    txt_min_freq = data_cfg.get("txt_voc_min_freq", 1)

    gls_vocab_file = data_cfg.get("gls_vocab", None)
    txt_vocab_file = data_cfg.get("txt_vocab", None)

    gls_vocab = build_vocab(
        field="gls",
        min_freq=gls_min_freq,
        max_size=gls_max_size,
        dataset=train_data,
        vocab_file=gls_vocab_file,
    )
    txt_vocab = build_vocab(
        field="txt",
        min_freq=txt_min_freq,
        max_size=txt_max_size,
        dataset=train_data,
        vocab_file=txt_vocab_file,
    )

    if not os.path.exists("data/gls_vocab"):
        os.makedirs("data/gls_vocab")
    if not os.path.exists("data/txt_vocab"):
        os.makedirs("data/txt_vocab")
    gls_vocab.to_file("data/gls_vocab/gls_vocab.txt")
    txt_vocab.to_file("data/txt_vocab/txt_vocab.txt")
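
# Illustrative entry point (not in the original source): running this module as a
# script would dump the gloss and text vocabularies to disk, assuming
# `configs/sign.yaml` and the data files it references exist.
if __name__ == "__main__":
    save_vocab()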