Example #1
         #uE = " ".join(uE)
         DS.append([o, e, uE])
     train = DS[:trainingSz]
     test = DS[trainingSz:]
     # Write the training split as a tab-separated file
     with open("train300k.tsv", "w") as f:
         for row in train:
             f.write("\t".join(row) + "\n")
     # Write the test split as a tab-separated file
     with open("test10k.tsv", "w") as f:
         for row in test:
             f.write("\t".join(row) + "\n")
 # Shared setup: load the SentencePiece model and define the torchtext fields
 sp_gec = load_sp_model("GCEBPE30k.model")
 SRC = Field(use_vocab=False,
             tokenize=sp_gec.encode,
             init_token=sp_gec.bos_id(),
             eos_token=sp_gec.eos_id(),
             pad_token=sp_gec.pad_id(),
             batch_first=True)
 noSW = Field(use_vocab=True,
              tokenize=tokenize,
              init_token='<sos>',
              eos_token='<eos>',
              lower=True)
 tv_datafields = [("orig", SRC), ("correction1", SRC),
                  ("correction2", noSW)]
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 trn, tst = TabularDataset.splits(path=".",
Example #2
 def test_sentencepiece_unsupported_input_type(self):
     with self.assertRaisesRegex(
             TypeError, 'Unsupported type for spm argument: dict. '
             'Supported types are: str, io.BufferedReader'):
         load_sp_model(dict())
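The test above documents the accepted argument types. For reference, a minimal sketch of both supported call forms (the model file name m_user.model is a placeholder, not taken from the test):

    from torchtext.data.functional import load_sp_model

    # Load from a filesystem path (str) ...
    sp_model = load_sp_model("m_user.model")        # hypothetical model file
    # ... or from an open binary file handle (io.BufferedReader)
    with open("m_user.model", "rb") as f:
        sp_model = load_sp_model(f)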
Example #3
 def __init__(self, model_path):
     super().__init__()
     self.spm = load_sp_model(model_path)
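This snippet only shows the constructor. A minimal completion of such an nn.Module wrapper might look like the sketch below; the class name, the forward method, and the EncodeAsIds call are assumptions based on the standard SentencePieceProcessor API, not part of the original code:

    import torch.nn as nn
    from torchtext.data.functional import load_sp_model

    class SPTokenizer(nn.Module):          # hypothetical class name
        def __init__(self, model_path):
            super().__init__()
            self.spm = load_sp_model(model_path)

        def forward(self, line):
            # Encode a raw text line into a list of subword ids;
            # EncodeAsPieces would return subword strings instead.
            return self.spm.EncodeAsIds(line)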
Example #4
def getData():

    #sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    sp_gec = load_sp_model("BPE/zaid_sp_model.model")
    print("print(len(sp_gec)) 1", len(sp_gec))

    bpe_field = Field(use_vocab=False,
                      tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(),
                      eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(),
                      batch_first=True)

    tv_datafields = [("src", bpe_field), ("trg", bpe_field)]
    # german = Field(tokenize=tokenize_ger, lower=True,
    #                init_token="<sos>", eos_token="<eos>",  pad_token="<pad>", unk_token="<unk>")

    # english = Field(
    #     tokenize=tokenize_eng, lower=True,
    #     init_token="<sos>", eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")

    print("===============================before ")
    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".ennsw", ".en"), fields=(german, english),
    #     # root='.data',
    #     train='train',
    #     validation='val',
    #     test='test2016',
    #     path = '.data/multi30k'
    # )

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".ennsw", ".en"),
        fields=tv_datafields,
        # root='.data',
        train='train',
        validation='val',
        test='test2016',
        path='.data/multi30k')
    print(train_data)

    # example_id = 0
    # ex = train_data.examples[example_id].src
    # dec_ex = sp_gec.decode(ex)
    # print(ex)
    # print(dec_ex)

    # ex = train_data.examples[example_id].trg
    # dec_ex = sp_gec.decode(ex)
    # print(ex)
    # print(dec_ex)

    #====================================================================================
    # exit()

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".con", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='shortouttest300k',
    #     validation='shortout10k',
    #     test='shortout10k',
    #     path='/data/chaudhryz/ank_data'
    # )

    # Example sentence pair for reference:
    #     The study’s questions are carefully worded and chosen.
    #     The study questions were carefully worded and chosen.

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".src", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='train',
    #     validation='valid',
    #     test='test',
    #     path = '/data/chaudhryz/uwstudent1/GDATA'
    # )

    # build vocabulary

    # why is the vocabulary size the same for both datasets
    # german.build_vocab(train_data, max_size=10000, min_freq=2)
    # english.build_vocab(train_data, max_size=10000, min_freq=2)

    # german.vocab.init_token = "<sos>"
    # german.vocab.eos_token = "<eos>"

    # english.vocab.init_token = "<sos>"
    # english.vocab.eos_token = "<eos>"

    # init_token = sp_gec.bos_id(), eos_token = sp_gec.eos_id()

    # print("Train")
    # for i in range(10):
    #     #print(train_data[i].src, train_data[i].trg)
    #     printSent(train_data[i].src)
    #     printSent(train_data[i].trg)

    # print("Test")
    # for i in range(10):
    #     #print(train_data[i].src, train_data[i].trg)
    #     printSent(test_data[i].src)
    #     printSent(test_data[i].trg)
    # exit()

    # store multi30k vocabulary

    # a = {'GermanVocab': german.vocab, 'EnglishVocab': english.vocab}

    # with open('filename.pickle', 'wb') as handle:
    #     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # use multi30k's vocabulary

    # with open('filename.pickle', 'rb') as handle:
    #     b = pickle.load(handle)
    #
    # german.vocab = b['GermanVocab']
    # english.vocab = b['EnglishVocab']

    # print(a == b)

    return sp_gec, train_data, valid_data, test_data
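For context, a hedged sketch of how the values returned by getData() could be batched with the legacy torchtext BucketIterator that matches the Field/Multi30k API used above; the batch size and the sort key are assumptions:

    import torch
    from torchtext.data import BucketIterator

    sp_gec, train_data, valid_data, test_data = getData()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=64,                      # assumed batch size
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),      # group similar-length sources to reduce padding
        device=device)

    for batch in train_iter:
        src_ids, trg_ids = batch.src, batch.trg   # tensors of SentencePiece ids
        break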
Example #5
 def __init__(self, spm_file):
     super(PretrainedSPTokenizer, self).__init__()
     self.sp_model = load_sp_model(spm_file)
def getData():
    # filename = '.data/multi30k/train.en'
    # generate_sp_model(filename, vocab_size=8000, model_type='bpe', model_prefix='zaid_sp_model')
    # s = spm.SentencePieceProcessor(model_file='zaid_sp_model.model')
    # print(vars(s))
    # print(dir(s))
    # print(s.vocab_size())
    # print(s.bos_id())
    # print(s.eos_id())
    # print(s.unk_id())
    # print(s.pad_id())

    #exit()
    sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    #sp_gec = load_sp_model("zaid_sp_model.model")
    # sp_gec =s
    # print(dir(sp_gec))
    # print(vars(sp_gec))
    #exit()
    src_pad_idx = sp_gec.pad_id()  #english_vocab.stoi["<pad>"]
    print("pad_index = ", src_pad_idx)
    #    print("pad = ", sp_gec.decode(src_pad_idx))
    #exit()

    # print("print(len(sp_gec)) 1", len(sp_gec))
    # print(vars(sp_gec))
    # print(dir(sp_gec))
    #exit()

    bpe_field = Field(use_vocab=False,
                      tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(),
                      eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(),
                      unk_token=sp_gec.unk_id(),
                      batch_first=True)

    print("-----------------------------------------------")
    #print(TabularDataset.splits.__doc__)
    #tv_datafields = [("ignore", bpe_field), ("trg", bpe_field), ("src", bpe_field)]
    # train_data, valid_data, test_data = TabularDataset.splits(path = "/data/chaudhryz/ankit", train = "test10k.tsv",
    #                                         validation="test10k.tsv", test = "test10k.tsv", format='tsv', skip_header=False, fields = tv_datafields)

    tv_datafields = [("trg", bpe_field), ("src", bpe_field)]
    train_data, valid_data, test_data = TabularDataset.splits(
        path=".data/multi30k",
        train="train.tsv",
        validation="val.tsv",
        test="test2016.tsv",
        format='tsv',
        skip_header=False,
        fields=tv_datafields)

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".ennsw", ".en"), fields=tv_datafields,
    #     train='train',
    #     validation='val',
    #     test='test2016',
    #     path = '.data/multi30k'
    # )
    print(train_data)

    return sp_gec, train_data, valid_data, test_data
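A small usage sketch of the return values, reusing the sp_gec.decode call that appears in the commented-out debugging code above to turn the first training example back into text; the attribute names follow the trg/src fields defined in this function:

    sp_gec, train_data, valid_data, test_data = getData()

    first = train_data.examples[0]
    # Each field was numericalized with sp_gec.encode, so decode() reverses it.
    print(sp_gec.decode(first.src))   # reconstructed source sentence
    print(sp_gec.decode(first.trg))   # reconstructed target sentence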