def load_dataset(batch_size):
    """Build German->English iterators from local Multi30k files.

    Reads ``train``, ``val`` and ``test2016`` (extensions ``.de`` / ``.en``)
    from ``.data/multi30k``, builds one vocabulary per language from the
    training split, and wraps the three splits in bucketed iterators.

    :param batch_size: batch size used for all three iterators.
    :return: ``(train_iter, val_iter, test_iter, DE, EN)`` where ``DE`` and
        ``EN`` are the source and target ``Field`` objects.
    """
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    # Non-greedy on purpose: with the greedy ``.*`` a line holding two
    # ``<url>...</url>`` tags collapsed everything between the first opening
    # and the last closing tag (including ordinary words) into one @URL@.
    url = re.compile('(<url>.*?</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')

    # The files are read from disk with TranslationDataset rather than
    # through Multi30k.splits.
    train, val, test = TranslationDataset.splits(
        path='.data/multi30k',
        exts=['.de', '.en'],
        fields=[('src', DE), ('trg', EN)],
        train='train',
        validation='val',
        test='test2016')

    # Vocabularies come from the training split only.
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
def get_data(args):
    """Load the Korean->English corpus and build fields, datasets, iterators.

    Reads these attributes from ``args``: ``data_type``, ``batch``,
    ``use_cuda``, ``max_length``, ``root_dir`` and ``min_freq``.

    Side effects: prints a progress line and writes the source and target
    ``stoi`` mappings to ``./src_vocabs.pkl`` and ``./trg_vocabs.pkl``.

    :param args: namespace-like object carrying the attributes listed above.
    :return: ``((src, trg), (train, valid, test),
        (train_iter, valid_iter, test_iter))``.
    :raises AssertionError: if ``args.data_type`` is not ``"koen"``.
    """
    # Validate before doing any work.  An explicit raise is used instead of
    # ``assert False`` because assert statements are removed under
    # ``python -O``, which previously let execution fall through to an
    # UnboundLocalError on ``train``.  The exception type is unchanged.
    if args.data_type != "koen":
        raise AssertionError("Please Insert data_type")

    batch_size = args.batch
    device = "cuda" if (torch.cuda.is_available() and args.use_cuda) else "cpu"

    # Both fields are padded/truncated to a fixed length, batch dimension first.
    src = Field(
        sequential=True,
        tokenize=str.split,
        use_vocab=True,
        lower=True,
        include_lengths=False,
        fix_length=args.max_length,
        batch_first=True)
    trg = Field(
        sequential=True,
        tokenize=str.split,
        use_vocab=True,
        init_token='<s>',
        eos_token='</s>',
        lower=True,
        fix_length=args.max_length,
        batch_first=True)
    print('set up fields ... done')

    train, valid, test = TranslationDataset.splits(
        ('.ko', '.en'), (src, trg),
        train='train', validation='valid', test='test',
        path=args.root_dir)

    # Build the vocabularies from the training split.
    src.build_vocab(train.src, min_freq=args.min_freq)
    trg.build_vocab(train.trg, min_freq=args.min_freq)

    # Persist the token->index mappings next to the working directory.
    src_vocabs = src.vocab.stoi
    trg_vocabs = trg.vocab.stoi
    with open('./src_vocabs.pkl', 'wb') as f:
        pickle.dump(src_vocabs, f, pickle.HIGHEST_PROTOCOL)
    with open('./trg_vocabs.pkl', 'wb') as f:
        pickle.dump(trg_vocabs, f, pickle.HIGHEST_PROTOCOL)

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test),
        batch_sizes=([batch_size] * 3),
        device=device)
    return (src, trg), (train, valid, test), (train_iter, valid_iter, test_iter)
def __init__(self, module_name, train_bs, eval_bs, device, log):
    """Load one module's train/eval files as character-level datasets.

    :param module_name: sub-folder of ``DATASET_TARGET_DIR`` holding the files.
    :param train_bs: batch size of the training iterator.
    :param eval_bs: batch size of the evaluation iterator.
    :param device: device handed to both iterators.
    :param log: callable used to report progress messages.
    """
    def split_chars(text):
        # One token per character; whitespace characters are kept.
        # (Alternative that drops whitespace: list("".join(text.split())).)
        return list(text)

    self.module_name = module_name

    field_options = dict(tokenize=split_chars, init_token='<sos>',
                         eos_token='<eos>', batch_first=True)
    source = Field(**field_options)
    target = Field(**field_options)

    log("Loading FULL datasets ...")
    folder = os.path.join(DATASET_TARGET_DIR, module_name)
    # The eval file is named for both validation and test; the third split
    # is discarded.
    train_dataset, eval_dataset, _ = TranslationDataset.splits(
        path=folder,
        root=folder,
        exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
        fields=(source, target),
        train=TRAIN_FILE_NAME,
        validation=EVAL_FILE_NAME,
        test=EVAL_FILE_NAME)

    log("Building vocab ...")
    source.build_vocab(train_dataset)
    # Source and target share a single vocabulary object.
    target.vocab = source.vocab

    log("Creating iterators ...")
    train_iterator = Iterator(dataset=train_dataset, batch_size=train_bs,
                              train=True, repeat=True, shuffle=True,
                              device=device)
    eval_iterator = Iterator(dataset=eval_dataset, batch_size=eval_bs,
                             train=False, repeat=False, shuffle=False,
                             device=device)

    self.source, self.target = source, target
    self.train_dataset, self.eval_dataset = train_dataset, eval_dataset
    self.train_iterator, self.eval_iterator = train_iterator, eval_iterator
def load_dataset(batch_size, device):
    """Read the paired text files and return the shared field plus iterators.

    The same ``Field`` object is attached to both columns of the dataset,
    and its vocabulary is built from the training split with ``min_freq=5``.

    :param batch_size: batch size for every iterator.
    :param device: device the iterators place their batches on.
    :return: ``(source, iterators)`` where ``iterators`` is whatever
        ``BucketIterator.splits`` returns for (train, valid, test).
    """
    source = Field(tokenize=tokenize_en,
                   init_token='<sos>',
                   eos_token='<eos>',
                   lower=True)

    file_extensions = (POSITIVE_FILE_EXTENSION, NEGATIVE_FILE_EXTENSION)
    train_data, valid_data, test_data = TranslationDataset.splits(
        path=DATA_FOLDER, exts=file_extensions, fields=(source, source))

    source.build_vocab(train_data, min_freq=5)

    iterators = BucketIterator.splits(
        (train_data, valid_data, test_data),
        shuffle=True,
        batch_size=batch_size,
        device=device)
    return source, iterators
def get_data(path='data/'):
    """Load the ``.cn`` / ``.en`` parallel corpus and wrap it in iterators.

    Prints the size of each split and of both vocabularies along the way.

    :param path: folder containing ``train``, ``val`` and ``test`` files
        with ``.cn`` and ``.en`` extensions.
    :return: ``(train_iterator, valid_iterator, test_iterator, SRC, TRG)``.
    """
    # Everything except the tokenizer is identical for the two fields.
    shared_options = dict(init_token='<sos>', eos_token='<eos>',
                          pad_token='<pad>', unk_token='<unk>', lower=True)
    SRC = Field(tokenize=tokenize_cn, **shared_options)
    TRG = Field(tokenize=tokenize_en, **shared_options)

    splits = TranslationDataset.splits(
        path=path,
        train='train',
        validation='val',
        test='test',
        exts=('.cn', '.en'),
        fields=(SRC, TRG))
    train_data, valid_data, test_data = splits

    print("train: {}".format(len(train_data.examples)))
    print("valid: {}".format(len(valid_data.examples)))
    print("test: {}".format(len(test_data.examples)))

    # Both vocabularies are built from the training split.
    for field in (SRC, TRG):
        field.build_vocab(train_data, min_freq=params.MIN_FREQ)

    # Printed labels: source-language / target-language vocabulary size.
    print("源语言词表大小: {}".format(len(SRC.vocab)))
    print("目标语言词表大小: {}".format(len(TRG.vocab)))

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=params.BATCH_SIZE,
        device=device)
    return train_iterator, valid_iterator, test_iterator, SRC, TRG
# Training mode -> one ``(field key, build_vocab kwargs)`` pair per dataset
# column, listed in column order: src, trg and (three-column mode only) ctc.
_MODE_COLUMNS = {
    'ctc': (('ti_cha', {}), ('ti_word', {})),
    'nmt': (('ti_word', {'max_size': 50000}),
            ('zh_word', {'max_size': 50000})),
    'nmt_char': (('ti_cha', {}), ('zh_cha', {})),
    'combine': (('ti_cha', {}), ('zh_word', {'max_size': 50000})),
    'refine_ctc': (('ti_cha', {}), ('ti_word', {'max_size': 50000})),
    'update_twoLoss': (('ti_cha', {}),
                       ('zh_word', {'max_size': 50000}),
                       ('ti_word', {'max_size': 50000})),
}

# Attribute under which each dataset column is exposed, in column order.
_COLUMN_ATTRS = ('src', 'trg', 'ctc')


def load_dataset(args):
    """Load the corpus selected by ``args.mode`` and build its iterators.

    Reads these attributes from ``args``: ``mode``, ``extension`` (a
    whitespace-separated list of file extensions, one per column), ``path``,
    ``train``, ``valid``, ``test`` and ``batch_size``.

    Supported modes and their column fields:

    * ``ctc`` / ``refine_ctc``: Ti_CHA -> Ti_WORD
    * ``nmt``: Ti_WORD -> ZH_WORD
    * ``nmt_char``: Ti_CHA -> ZH_CHA
    * ``combine``: Ti_CHA -> ZH_WORD
    * ``update_twoLoss``: Ti_CHA, ZH_WORD, Ti_WORD (three columns, loaded
      through ``mydataset`` instead of ``Trans``)

    :param args: namespace-like object carrying the attributes listed above.
    :return: ``(train_iter, val_iter, test_iter, *fields)`` with the fields
        in column order; two fields for the two-column modes, three for
        ``update_twoLoss``.
    :raises ValueError: if ``args.mode`` is not one of the supported modes
        (previously the function fell off the end and returned ``None``).
    :raises IndexError: if ``args.extension`` lists fewer extensions than
        the mode has columns.
    """
    columns = _MODE_COLUMNS.get(args.mode)
    if columns is None:
        raise ValueError("Unknown mode {!r}; expected one of: {}".format(
            args.mode, ', '.join(sorted(_MODE_COLUMNS))))

    def split_chars(text):
        # One token per character of the stripped line.
        return list(text.strip())

    def split_words(text):
        # Whitespace-separated tokens.
        return text.strip().split()

    def make_field(tokenizer):
        return Field(tokenize=tokenizer, include_lengths=True,
                     init_token='<sos>', eos_token='<eos>')

    # Only the Chinese character field splits per character; the Tibetan
    # "character" field is whitespace-tokenised exactly as before.
    available = {
        'zh_cha': make_field(split_chars),
        'zh_word': make_field(split_words),
        'ti_cha': make_field(split_words),
        'ti_word': make_field(split_words),
    }
    fields = tuple(available[key] for key, _ in columns)

    extension_parts = args.extension.split()
    exts = tuple(extension_parts[i] for i in range(len(columns)))

    # Resolved lazily so the two-column modes never touch ``mydataset``.
    dataset_cls = mydataset if args.mode == 'update_twoLoss' else Trans
    train, val, test = dataset_cls.splits(
        path=args.path, exts=exts, fields=fields,
        train=args.train, validation=args.valid, test=args.test)

    # Each field's vocabulary comes from its own column of the training split.
    for field, attr, (_, vocab_kwargs) in zip(fields, _COLUMN_ATTRS, columns):
        field.build_vocab(getattr(train, attr), **vocab_kwargs)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=args.batch_size, repeat=False)
    return (train_iter, val_iter, test_iter) + fields
# Seed every random number generator in use so runs are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# English source / Vietnamese target.  No tokenizer is passed (tokenize=None),
# so the Field's default tokenisation applies.
SRC = Field(tokenize=None, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=None, init_token='<sos>', eos_token='<eos>', lower=True)

# NOTE(review): ``myData`` loads the training files once here and ``splits``
# below reads them again; kept because later code may reference ``myData``.
myData = TranslationDataset('./E_V/train', ('.en', '.vi'), (SRC, TRG))
# validation=None: only the train and test splits are unpacked.
train_data, test_data = myData.splits(exts=('.en', '.vi'),
                                      fields=(SRC, TRG),
                                      path="./E_V/",
                                      train='train',
                                      validation=None,
                                      test='tst2012')
vocabData = TranslationDataset('./E_V/vocab', ('.en', '.vi'), (SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

SRC.build_vocab(train_data, min_freq=3)
TRG.build_vocab(train_data, min_freq=3)

# The labels previously said "(de)" / "(en)", which did not match the
# '.en' -> '.vi' data loaded above.
print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(TRG.vocab)}")

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
def __init__(self,
             data_dir: str,
             packed: bool,
             vocab_max_sizes: Tuple[int, int],
             vocab_min_freqs: Tuple[int, int],
             batch_sizes: Tuple[int, int, int],
             test: bool = False):
    """Load the commit diff/message corpus and set up iterators.

    :param data_dir: root folder containing ``TrainingSet/`` and ``TestSet/``.
    :param packed: when true, the source field also returns sequence
        lengths and batches are sorted by source length.
    :param vocab_max_sizes: ``(source, target)`` vocabulary size caps.
    :param vocab_min_freqs: ``(source, target)`` minimum token frequencies.
    :param batch_sizes: batch sizes for the (train, valid, test) iterators.
    :param test: when true, vocabularies are always rebuilt and never saved.
    """
    print(f"Creating DataLoader for {'testing' if test else 'training'}")

    # Rebuild the vocabs during testing, as the saved ones may have been
    # built from a different config.
    if test:
        vocab_exists = False
    else:
        vocab_exists = has_vocabs(data_dir, vocab_max_sizes, vocab_min_freqs)

    # Define the torchtext fields used to process the text.
    if vocab_exists:
        print("Loading fields and vocabs...")
        SRC, TRG = load_vocabs(data_dir, vocab_max_sizes, vocab_min_freqs)
    else:
        print("Building fields...")
        # Include the sentence length for the source when packing.
        SRC = Field(tokenize=tokenize_diff,
                    init_token='<sos>',
                    eos_token='<eos>',
                    include_lengths=packed,
                    lower=True)
        TRG = Field(tokenize=tokenize_msg,
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

    print("Loading commit data...")
    train_data, valid_data, test_data = TranslationDataset.splits(
        exts=('.diff', '.msg'),
        train='TrainingSet/train.26208',
        validation='TrainingSet/valid.3000',
        test='TestSet/test.3000',
        fields=(SRC, TRG),
        path=data_dir)

    if not vocab_exists:
        print("Building vocabulary...")
        specials = ['<unk>', '<pad>', '<sos>', '<eos>']
        SRC.build_vocab(train_data,
                        min_freq=vocab_min_freqs[0],
                        max_size=vocab_max_sizes[0],
                        specials=specials)
        TRG.build_vocab(train_data,
                        min_freq=vocab_min_freqs[1],
                        max_size=vocab_max_sizes[1],
                        specials=specials)
        if not test:
            save_vocabs(data_dir, SRC, TRG, vocab_max_sizes, vocab_min_freqs)

    print(f"Number of training examples: {len(train_data.examples)}")
    print(f"Number of validation examples: {len(valid_data.examples)}")
    print(f"Number of testing examples: {len(test_data.examples)}")
    print(
        f"Unique tokens in source (diff) training vocabulary: {len(SRC.vocab)}"
    )
    print(
        f"Unique tokens in target (msg) training vocabulary: {len(TRG.vocab)}"
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The conditional must wrap the lambda, not live inside it.  The old
    # ``lambda x: len(x.src) if packed else None`` was always a callable,
    # and with packed=False it returned None for every example, so sorting
    # compared None with None.  Now sort_key itself is None when not
    # packing, leaving the choice of key to the iterator.
    sort_key = (lambda x: len(x.src)) if packed else None

    # Bucketing (minimizes the amount of padding by grouping similar length
    # sentences); when packing, sort by non-padded source length.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_sizes=batch_sizes,
        sort_within_batch=packed,
        sort_key=sort_key,
        device=device)

    super().__init__(train_iterator, valid_iterator, test_iterator, SRC, TRG,
                     tokenize_diff, tokenize_msg)
def __init__(self, module_name, train_bs, eval_bs, device, vocab=None,
             base_folder=None, train_name=None, eval_name=None,
             x_ext=None, y_ext=None, tokens=None, specials=None,
             tokenizer=None, sort_within_batch=None, shuffle=None):
    """Load one module's train/eval files and build a shared vocabulary.

    :param module_name: sub-folder of ``base_folder`` holding the files.
    :param train_bs: batch size of the training iterator.
    :param eval_bs: batch size of the evaluation iterator.
    :param device: device handed to both iterators.
    :param vocab: prebuilt vocabulary; used for both fields when truthy.
    :param base_folder: parent folder (``~`` is expanded).
    :param train_name: file prefix of the training split.
    :param eval_name: file prefix of the evaluation split.
    :param x_ext: extension of the input files.
    :param y_ext: extension of the target files.
    :param tokens: tokens to build the vocabulary from when ``vocab`` is
        not given.
    :param specials: passed as ``specials`` when building from ``tokens``.
    :param tokenizer: tokenizer callable; falls back to per-character
        splitting when falsy.
    :param sort_within_batch: forwarded to both iterators.
    :param shuffle: shuffle flag of the training iterator; ``None`` means
        shuffle.
    """
    def split_chars(text):
        # One token per character; whitespace characters are kept.
        # (Alternative that drops whitespace: list("".join(text.split())).)
        return list(text)

    self.module_name = module_name
    if not tokenizer:
        tokenizer = split_chars

    # NOTE: on Jul-20-2020, fix_length=200 was removed since it forces all
    # batches to be of size (batch_size, 200), which wastes GPU memory.
    field_options = dict(tokenize=tokenizer, init_token='<sos>',
                         eos_token='<eos>', batch_first=True)
    source = Field(**field_options)
    target = Field(**field_options)

    # expanduser + abspath also normalises the slashes in the final path.
    folder = os.path.abspath(
        os.path.join(os.path.expanduser(base_folder), module_name))

    print("loading FULL datasets from folder={}".format(folder))
    # The eval file is named for both validation and test; the third split
    # is discarded.
    train_dataset, eval_dataset, _ = TranslationDataset.splits(
        path=folder,
        root=folder,
        exts=(x_ext, y_ext),
        fields=(source, target),
        train=train_name,
        validation=eval_name,
        test=eval_name)

    # In every branch the two fields end up sharing one vocabulary object.
    if vocab:
        print("Setting vocab to prebuilt file...")
        source.vocab = vocab
    elif tokens:
        print("Building vocab from tokens...")
        source.vocab = source.vocab_cls(Counter(tokens), specials=specials)
    else:
        print("Building vocab from TRAIN and EVAL datasets...")
        source.build_vocab(train_dataset, eval_dataset)
    target.vocab = source.vocab

    print("Creating iterators ...")
    if shuffle is None:
        shuffle_train = True
    else:
        shuffle_train = shuffle
    common = dict(sort_within_batch=sort_within_batch, device=device)
    train_iterator = Iterator(dataset=train_dataset, batch_size=train_bs,
                              train=True, repeat=True, shuffle=shuffle_train,
                              **common)
    eval_iterator = Iterator(dataset=eval_dataset, batch_size=eval_bs,
                             train=False, repeat=False, shuffle=False,
                             **common)

    self.source, self.target = source, target
    self.train_dataset, self.eval_dataset = train_dataset, eval_dataset
    self.train_iterator, self.eval_iterator = train_iterator, eval_iterator
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.datasets import TranslationDataset, Multi30k

MAX_LEN = 100
ROOT = './'

# --- Multi30k -------------------------------------------------------------
Multi30k.download(ROOT)

# NOTE(review): the source column is read from the '.en' files but tokenised
# with tokenize_de (and the '.de' target with tokenize_en) -- confirm this
# pairing is intended.
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en,
                 init_token=BOS_WORD,
                 eos_token=EOS_WORD,
                 pad_token=BLANK_WORD)

(trnset, valset, testset) = TranslationDataset.splits(
    path='./Multi30k/multi30k',
    exts=['.en', '.de'],
    fields=[('src', SRC), ('trg', TGT)],
    test='test2016')

# --- SQuAD csv ------------------------------------------------------------
# Read without a header, then drop the first row and keep columns 1 and 2.
df = pd.read_csv("./SQuAD_csv/train_SQuAD.csv", sep=';', header=None)
df = df.iloc[1:, [1, 2]]

# 90/10 split, written out as two csv files without the index column.
train, val = train_test_split(df, test_size=0.1)
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
# Source and target fields are configured identically.
SRC = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

fields = (SRC, TRG)
exts = ('.ig', '.en')

# The corpus lives in ../data as train/val/test files with the extensions above.
train_data, validate_data, test_data = TranslationDataset.splits(
    fields=fields,
    exts=exts,
    path=os.path.join('./..', 'data'),
    train='train',
    validation='val',
    test='test')

# Split sizes, right-aligned in a 20-character label column.
print(f"{'Training examples':>20s}: {len(train_data.examples)}")
print(f"{'Validation examples':>20s}: {len(validate_data.examples)}")
print(f"{'Testing examples':>20s}: {len(test_data.examples)}")
# Show the attributes of the first training example as a sanity check.
print(vars(train_data.examples[0]))

# Seed the random number generators.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Paths of the English/Hindi corpus files.
# NOTE(review): none of these six names is used in the visible code; the
# splits call below uses path="" with its own file prefixes -- confirm.
data_path_inp = 'enghin/train.en'
data_path_inp_val = 'enghin/dev.en'
data_path_tar = 'enghin/train.hi'
data_path_tar_val = 'enghin/dev.hi'
data_path_inp_test = 'enghin/test.en'
data_path_tar_test = 'enghin/test.hi'

torch.backends.cudnn.deterministic = True


def tokenize(text):
    """Split ``text`` on whitespace and return the list of tokens."""
    return text.split()


# Both languages use the same whitespace tokenizer and <SOL>/<EOL> markers.
src_field = Field(tokenize=tokenize, lower=True, init_token='<SOL>', eos_token='<EOL>')
trg_field = Field(tokenize=tokenize, lower=True, init_token='<SOL>', eos_token='<EOL>')

# Training data is read from "train_med", validation from "dev", test from "test".
train_data, valid_data, test_data = TranslationDataset.splits(exts=(".en",".hi"), fields=(src_field, trg_field), path="", train="train_med", validation="dev", test="test")

# Vocabularies are built from the training split, capped at 10000 entries.
src_field.build_vocab(train_data, min_freq=2, max_size=10000)
trg_field.build_vocab(train_data, min_freq=2, max_size=10000)

# NOTE(review): `device` is read here but only assigned on the next
# statement; unless it is also defined earlier in the file this raises
# NameError -- confirm the intended order.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
device = 'cuda'


# NOTE(review): only the first two constructor arguments are stored in the
# visible code; the class presumably continues beyond this excerpt.
class Encoder(nn.Module):
    def __init__(self, inp_dim, embed_dim, encoder_hidden_dim, decoder_hidden_dim, dropout):
        super().__init__()
        self.inp_dim = inp_dim
        self.embed_dim = embed_dim
def load_dataset(
    dataset_name="SQUAD",
    tokenizer=word_tokenizer,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    use_glove=True,
    source_vocab=45000,
    target_vocab=28000,
    batch_size=VANILLA_SEQ2SEQ["BATCHSIZE"],
):
    """Load a paragraph->question dataset and return iterators and fields.

    Reads ``train``/``valid``/``test`` files with ``.paragraphs`` and
    ``.questions`` extensions from ``FILE_PATH/<dataset_name>``.  The
    iterators are placed on the module-level ``device``.

    :param dataset_name: sub-folder of ``FILE_PATH`` that holds the files.
    :param tokenizer: tokenizer used by both fields.
    :param init_token: token prepended to every sequence.
    :param eos_token: token appended to every sequence.
    :param lower: whether both fields lowercase their text.  This argument
        used to be ignored (``lower=True`` was hard-coded); the default is
        unchanged.
    :param use_glove: attach ``glove.6B.300d`` vectors to both vocabularies.
    :param source_vocab: size cap of the source vocabulary.
    :param target_vocab: size cap of the target vocabulary.
    :param batch_size: batch size for all three iterators.
    :return: ``(iterators, SRC, TRG)`` where ``iterators`` is the result of
        ``BucketIterator.splits`` for (train, valid, test).
    """
    logger.debug("Loading {} dataset".format(dataset_name))
    # Only the source field also returns sequence lengths.
    SRC = data.Field(
        tokenize=tokenizer,
        init_token=init_token,
        eos_token=eos_token,
        lower=lower,
        include_lengths=True,
    )
    TRG = data.Field(
        tokenize=tokenizer,
        init_token=init_token,
        eos_token=eos_token,
        lower=lower,
    )

    location = os.path.join(FILE_PATH, dataset_name)
    logger.debug("Loading from location: {}".format(location))

    start_time = time.time()
    train_dataset, valid_dataset, test_dataset = TranslationDataset.splits(
        exts=(".paragraphs", ".questions"),
        fields=(SRC, TRG),
        path=location,
        train="train",
        validation="valid",
        test="test",
    )
    logger.debug(
        "Number of Samples: Training = {} | Validation = {} | Testing = {}".format(
            len(train_dataset.examples),
            len(valid_dataset.examples),
            len(test_dataset.examples),
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))

    logger.debug("Building Vocab")
    start_time = time.time()
    # The two branches differed only in the ``vectors`` argument.
    vector_kwargs = {}
    if use_glove:
        logger.debug("Using Glove vectors")
        vector_kwargs["vectors"] = "glove.6B.300d"
    SRC.build_vocab(train_dataset, max_size=source_vocab, **vector_kwargs)
    TRG.build_vocab(train_dataset, max_size=target_vocab, **vector_kwargs)

    logger.info(
        "Vocabulary Built! Source Tokens = {} | Target Tokens = {} \nCreating Iterators".format(
            len(SRC.vocab), len(TRG.vocab)
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))

    return (
        BucketIterator.splits(
            (train_dataset, valid_dataset, test_dataset),
            batch_size=batch_size,
            # Batches are sorted by source length.
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            device=device,
        ),
        SRC,
        TRG,
    )
return text.split()  # presumably the last line of a tokenizer whose `def` lies above this excerpt


# English source and Hindi target fields; both lowercase their text.
SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_hi, init_token='<sos>', eos_token='<eos>', lower=True)

# Only the validation prefix ('dev') is given explicitly; train and test
# use whatever defaults TranslationDataset.splits applies.
train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small', validation='dev', exts=('.en', '.hi'), fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# Bare expression: its value is discarded here (it would only be displayed
# in an interactive session).
vars(train_data.examples[0])

# Vocabularies come from the training split; the target vocabulary is also
# given the extra specials '<pad>', '<sop>' and '<eop>'.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2, specials=['<pad>', '<sop>', '<eop>'])

print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (hi) vocabulary: {len(TRG.vocab)}")
def dataset_construction_from_raw_dataset(
    src_language: str,
    trg_language: str,
    path: str,
    filenames_exts: Tuple[str, str],
    min_freq: int = 1,
    train_filename: str = 'train',
    valid_filename: str = 'val',
    test_filename: str = 'test',
    init_token: Optional[str] = '<sos>',
    eos_token: Optional[str] = '<eos>',
) -> Tuple[TranslationDataset, TranslationDataset, TranslationDataset, Field,
           Field]:
    """Build train/validation/test datasets and vocabularies from raw files.

    Each raw file is a text file with one sentence per line, and its
    extension identifies the language.  For an English/German corpus the
    training files would be ``train.en`` and ``train.de``.

    :param src_language: language of the source sequences, passed to the
        Field as ``tokenizer_language``.  Uses spaCy's abbreviations
        (``'en'``, ``'de'``, ...); see https://spacy.io/usage/models#languages
        for the supported languages.
    :param trg_language: language of the target sequences; same
        conventions as ``src_language``.
    :param path: folder where the raw files are stored.
    :param filenames_exts: extensions of the source and target files, in
        that order, e.g. ``('.de', '.en')`` for German -> English.
    :param min_freq: minimum number of occurrences in the training corpus
        for a word to enter the vocabulary.  Default: 1.
    :param train_filename: prefix (no extension) of the training files.
        Default: ``'train'``.
    :param valid_filename: prefix (no extension) of the validation files.
        Default: ``'val'``.
    :param test_filename: prefix (no extension) of the test files.
        Default: ``'test'``.
    :param init_token: token prepended to every sentence, or ``None`` for
        none.  Default: ``'<sos>'``.
    :param eos_token: token appended to every sentence, or ``None`` for
        none.  Default: ``'<eos>'``.
    :return: ``(train, valid, test, src_field, trg_field)``.  The first
        three are ``torchtext.datasets.TranslationDataset`` objects.  The
        two ``Field`` objects carry the vocabularies built from the
        training split, which is what is needed later to convert new text
        to indices and back.
    """
    # Settings common to both fields; only the language and the
    # is_target flag differ.
    shared = dict(
        sequential=True,
        use_vocab=True,
        init_token=init_token,
        eos_token=eos_token,
        tokenize='spacy',
        batch_first=True,
    )
    src_field = Field(tokenizer_language=src_language, is_target=False,
                      **shared)
    trg_field = Field(tokenizer_language=trg_language, is_target=True,
                      **shared)

    # The file prefixes are combined with the extensions in filenames_exts.
    train, valid, test = TranslationDataset.splits(
        exts=filenames_exts,
        fields=(src_field, trg_field),
        path=path,
        train=train_filename,
        validation=valid_filename,
        test=test_filename)

    for field in (src_field, trg_field):
        field.build_vocab(train, min_freq=min_freq)

    return train, valid, test, src_field, trg_field
# spaCy pipelines used by the tokenizers.
# NOTE(review): `spacy_de` is loaded from `nl_core_news_sm`, whose name
# suggests a Dutch model, while the data below is '.de' -- confirm.
spacy_de = nl_core_news_sm.load()
spacy_en = en_core_web_sm.load()

SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

# German -> English Multi30k files read from a local folder.
train, valid, test = TranslationDataset.splits(
    path='./data/multi30k/',
    exts=['.de', '.en'],
    fields=[('src', SRC), ('trg', TRG)],
    train='train',
    validation='val',
    test='test2016')

# Show the attributes of the first training example as a sanity check.
print(vars(train.examples[0]))

# Vocabularies are built from the training split.
SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    repeat=False)

# Model dimensions derived from the vocabularies.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
def load_dataset(batch_size):
    """Build German->English iterators from the files under ``./data/``.

    The file names are ``path + prefix + extension``; the prefixes are the
    ``TranslationDataset.splits`` defaults and the extensions are ``.de``
    and ``.en``.

    :param batch_size: batch size used for all three iterators.
    :return: ``(train_iter, val_iter, test_iter, DE, EN)`` where ``DE`` and
        ``EN`` are the source and target ``Field`` objects.
    """
    # The spaCy models must be installed in the active (virtual)
    # environment first:
    #   python -m spacy download de
    #   python -m spacy download en
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    # Non-greedy on purpose: with the greedy ``.*`` a line holding two
    # ``<url>...</url>`` tags collapsed everything between the first opening
    # and the last closing tag (including ordinary words) into one @URL@.
    url = re.compile('(<url>.*?</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    # A Field defines the tokenizer and the special tokens, and later owns
    # the vocabulary.  init_token / eos_token must be given here for the
    # batches to contain init_token + sentence + eos_token.
    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')

    # The data is read from downloaded files instead of Multi30k.splits
    # (per the original author, that yields 29000 train / 1014 val /
    # 1000 test examples).  If the files are not named with the default
    # prefixes, pass train=..., validation=..., test=... explicitly.
    train, val, test = TranslationDataset.splits(path='./data/',
                                                 exts=('.de', '.en'),
                                                 fields=(DE, EN))

    # Build the vocabularies.  DE.vocab.itos[i] maps an index to a word
    # (ordered by frequency) and DE.vocab.stoi[word] maps a word to its
    # index.  Per the original author, '<pad>' is added to the vocabulary
    # even if it never appears in the data, and passing the whole dataset
    # (DE.build_vocab(train, ...)) works as well as passing train.src --
    # but passing the other column (train.trg) to DE would not be right.
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)

    # Batches pad every sentence to a common length.  repeat=False keeps
    # ``for b, batch in enumerate(train_iter)`` from looping forever.
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN