#uE = " ".join(uE) DS.append([o, e, uE]) train = DS[:trainingSz] test = DS[trainingSz:] #Train x = open("train300k.tsv", "w") for i in train: x.write(i[0] + "\t" + i[1] + "\t" + i[2] + "\n") x.close() #Test x = open("test10k.tsv", "w") for i in test: x.write(i[0] + "\t" + i[1] + "\t" + i[2] + "\n") x.close() #Other general stuff: sp_gec = load_sp_model("GCEBPE30k.model") SRC = Field(use_vocab=False, tokenize=sp_gec.encode, init_token=sp_gec.bos_id(), eos_token=sp_gec.eos_id(), pad_token=sp_gec.pad_id(), batch_first=True) noSW = Field(use_vocab=True, tokenize=tokenize, init_token='<sos>', eos_token='<eos>', lower=True) tv_datafields = [("orig", SRC), ("correction1", SRC), ("correction2", noSW)] device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') trn, tst = TabularDataset.splits(path=".",
def test_sentencepiece_unsupported_input_type(self):
    with self.assertRaisesRegex(
            TypeError,
            'Unsupported type for spm argument: dict. '
            'Supported types are: str, io.BufferedReader'):
        load_sp_model(dict())
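The error message asserted in this test doubles as documentation of the accepted inputs: load_sp_model takes either a path string or an open binary file. A minimal sketch of both forms (the model file name is a placeholder):

import io
from torchtext.data.functional import load_sp_model

# From a path string.
sp_model = load_sp_model("spm_user.model")

# From an io.BufferedReader, i.e. a file opened in binary mode.
with open("spm_user.model", "rb") as f:
    sp_model = load_sp_model(f)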
def __init__(self, model_path):
    super().__init__()
    self.spm = load_sp_model(model_path)
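Only the constructor is shown above; a minimal sketch of how such a wrapper module is usually completed (the class name and forward body are assumptions, using the SentencePieceProcessor encode API seen in the other examples here):

import torch.nn as nn
from torchtext.data.functional import load_sp_model

class SPTokenizer(nn.Module):  # hypothetical name
    def __init__(self, model_path):
        super().__init__()
        self.spm = load_sp_model(model_path)

    def forward(self, line):
        # Turn one raw string into a list of subword ids.
        return self.spm.encode(line)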
from torchtext.data import Field
from torchtext.datasets import Multi30k
from torchtext.data.functional import load_sp_model


def getData():
    # sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    sp_gec = load_sp_model("BPE/zaid_sp_model.model")
    print("len(sp_gec):", len(sp_gec))

    # One shared BPE field for both sides; the special-token ids come from
    # the SentencePiece model itself, so no torchtext vocab is built.
    bpe_field = Field(use_vocab=False, tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(), eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(), batch_first=True)
    tv_datafields = [("src", bpe_field), ("trg", bpe_field)]

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".ennsw", ".en"), fields=tv_datafields,
        train='train', validation='val', test='test2016',
        path='.data/multi30k')
    print(train_data)

    # Sanity check: decode an example back to text.
    # ex = train_data.examples[0].src
    # print(sp_gec.decode(ex))

    return sp_gec, train_data, valid_data, test_data
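Downstream of getData, the splits are typically batched; a minimal sketch, assuming the legacy torchtext BucketIterator and a BPE model trained with pad/bos/eos pieces defined (as the Field setup above requires):

import torch
from torchtext.data import BucketIterator

sp_gec, train_data, valid_data, test_data = getData()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32, device=device,
    sort_key=lambda ex: len(ex.src), sort_within_batch=True)

batch = next(iter(train_iter))
# batch.src is (batch_size, seq_len) because batch_first=True above.
print(batch.src.shape)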
def __init__(self, spm_file):
    super(PretrainedSPTokenizer, self).__init__()
    self.sp_model = load_sp_model(spm_file)
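A natural counterpart to this tokenizer is a detokenizer module of the same shape; a hedged sketch (the class name and forward signature are hypothetical; DecodePieces is the SentencePieceProcessor API for merging subword pieces):

import torch.nn as nn
from torchtext.data.functional import load_sp_model

class PretrainedSPDetokenizer(nn.Module):  # hypothetical counterpart
    def __init__(self, spm_file):
        super(PretrainedSPDetokenizer, self).__init__()
        self.sp_model = load_sp_model(spm_file)

    def forward(self, tokens):
        # Merge a list of subword pieces back into a plain string.
        return self.sp_model.DecodePieces(tokens)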
from torchtext.data import Field, TabularDataset
from torchtext.data.functional import load_sp_model


def getData():
    # The BPE model can be trained once beforehand with:
    # generate_sp_model('.data/multi30k/train.en', vocab_size=8000,
    #                   model_type='bpe', model_prefix='zaid_sp_model')
    sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    # sp_gec = load_sp_model("zaid_sp_model.model")

    src_pad_idx = sp_gec.pad_id()
    print("pad_index =", src_pad_idx)

    bpe_field = Field(use_vocab=False, tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(), eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(), unk_token=sp_gec.unk_id(),
                      batch_first=True)

    # Column order in the TSV files is: target sentence, then source.
    tv_datafields = [("trg", bpe_field), ("src", bpe_field)]
    train_data, valid_data, test_data = TabularDataset.splits(
        path=".data/multi30k", train="train.tsv",
        validation="val.tsv", test="test2016.tsv",
        format='tsv', skip_header=False, fields=tv_datafields)
    print(train_data)

    return sp_gec, train_data, valid_data, test_data
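The commented-out lines at the top of this example reference training the BPE model; a minimal sketch of that step using torchtext's generate_sp_model, with a printout of the special ids mirroring the commented checks above:

from torchtext.data.functional import generate_sp_model, load_sp_model

# Trains an 8k-vocab BPE model on the Multi30k English side and writes
# zaid_sp_model.model / zaid_sp_model.vocab next to the script.
generate_sp_model('.data/multi30k/train.en', vocab_size=8000,
                  model_type='bpe', model_prefix='zaid_sp_model')

sp = load_sp_model('zaid_sp_model.model')
# pad_id() is -1 unless the model was trained with a pad piece.
print(sp.vocab_size(), sp.bos_id(), sp.eos_id(), sp.unk_id(), sp.pad_id())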