Example #1
    def vocab_builder(self):
        #self.eid_field = Field(sequential=False,tokenize)

        print('Build Vocabulary')
        tokenize = BiGraphTextDataset.tokenize_text
        TEXT = Field(sequential=True,
                     tokenize=tokenize,
                     lower=True,
                     include_lengths=True,
                     batch_first=True,
                     fix_length=35,
                     use_vocab=True)

        datafields = [('eid', None), ('idxP', None), ('idxC', None),
                      ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
        path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
        train_data = TabularDataset(path=path,
                                    format='tsv',
                                    skip_header=False,
                                    fields=datafields)
        TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))

        #train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
        self.stoi_dict = TEXT.vocab.stoi
        self.vocab_vectors = TEXT.vocab.vectors
Example #2
def prepare_data(args):
    TEXT = Field(lower=True,
                 include_lengths=True,
                 batch_first=True,
                 tokenize='spacy',
                 tokenizer_language="en_core_web_sm")
    LABEL = Field(sequential=False)
    # make splits for data

    print("Creating splits")
    if args.subset:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./subdata')
    else:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./data')
    print("Loading GloVe")
    glove = torchtext.vocab.GloVe(name='840B', dim=300)
    print("Aligning GloVe vocab")
    TEXT.build_vocab(train, vectors=glove)
    LABEL.build_vocab(train, specials_first=False)
    n_vocab = len(TEXT.vocab.itos)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print("Creating BucketIterator")
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(args.batch, 256, 256),
        device=device,
        shuffle=False)
    return TEXT, train_iter, dev_iter, test_iter
Example #3
    def build(self):
        print('Build Vocabulary from ', self.path)

        tokenize = BuildVocab.tokenize_text
        TEXT = Field(sequential=True,
                     tokenize=tokenize,
                     lower=True,
                     include_lengths=True,
                     batch_first=True,
                     fix_length=35,
                     use_vocab=True)
        datafields = [('eid', None), ('idxP', None), ('idxC', None),
                      ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]

        data = TabularDataset(path=self.path,
                              format='tsv',
                              skip_header=False,
                              fields=datafields)
        TEXT.build_vocab(data,
                         vectors=GloVe(name='6B', dim=300),
                         max_size=1000)

        #train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
        self.stoi = TEXT.vocab.stoi
        self.vectors = TEXT.vocab.vectors
Example #4
class DataLoader:
    source: Field = None
    target: Field = None

    def __init__(self, ext, tokenize_en, tokenize_de, sos_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.sos_token = sos_token
        self.eos_token = eos_token
        print('data initializing start')

    # generate field
    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
        elif self.ext == ('.en', '.de'):
            self.source = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)

        train_data, valid_data, test_data = Multi30k.splits(
            exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    # build the vocabulary & integer mapping
    def build_vocab(self, train_data, min_freq):
        # min_freq : lower bound frequency of the word's appearance
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train, validate, test), batch_size=batch_size, device=device)

        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator
Example #5
def prepare(params, samples):
    # print(type(params))
    # print(type(samples))
    TEXT = Field(lower=True,
                 include_lengths=True,
                 batch_first=True,
                 tokenize='spacy',
                 tokenizer_language="en_core_web_sm")

    # data = [' '.join(s) for s in samples],
    data = samples
    # print("data",len(data[0]))
    # print(data)
    TEXT.build_vocab(data, vectors=params.glove)

    params.model.emb_vec = torch.nn.Embedding.from_pretrained(
        TEXT.vocab.vectors, freeze=True).to(device=params.device)
    params["TEXT"] = TEXT
Example #6
    def __init__(self, device=None, jit=False):
        super().__init__()
        self.device = device
        self.jit = jit

        # Create the fields.
        WORD = Field(include_lengths=True)
        UD_TAG = Field(init_token="<bos>",
                       eos_token="<eos>",
                       include_lengths=True)

        # Download and load the default data.
        train, val, test = UDPOS.splits(
            fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
            filter_pred=lambda ex: 5 < len(ex.word) < 30,
        )

        WORD.build_vocab(train.word, min_freq=3)
        UD_TAG.build_vocab(train.udtag)
        self.train_iter = torch_struct.data.TokenBucket(train,
                                                        batch_size=100,
                                                        device=device)

        H = 256
        T = 30
        NT = 30
        self.model = NeuralCFG(len(WORD.vocab), T, NT, H)
        if jit:
            self.model = torch.jit.script(self.model)
        self.model.to(device=device)
        self.opt = torch.optim.Adam(self.model.parameters(),
                                    lr=0.001,
                                    betas=[0.75, 0.999])
        for i, ex in enumerate(self.train_iter):
            words, lengths = ex.word
            self.words = words.long().to(device).transpose(0, 1)
            self.lengths = lengths.to(device)
            break
Example #7
def produce_iterators(train_filename,
                      valid_filename,
                      test_filename,
                      asr_tokenizer,
                      ttx_tokenizer=None):
    """
    Produce datasets for each of training, validation and test data. Also build vocabs for true text, tags, and ASR.
    :param train_filename: location of train data csv
    :param valid_filename: location of valid data csv
    :param test_filename: location of test data csv
    :return:
    """
    TTX = Field(tokenize=lambda x: tokenize_TTX(x, ttx_tokenizer),
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    TRG = Field(tokenize=tokenize_TRG,
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    ASR = Field(tokenize=lambda x: tokenize_ASR(x, asr_tokenizer),
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    fields = {
        'true_text': ('true_text', TTX),
        'tags': ('tags', TRG),
        'asr': ('asr', ASR)
    }

    train_data, valid_data, test_data = TabularDataset.splits(
        path='.\\',
        train=train_filename,
        validation=valid_filename,
        test=test_filename,
        format='csv',
        fields=fields)

    # Put min_freq at 2 or higher for real data
    TTX.build_vocab(train_data, min_freq=1)
    TRG.build_vocab(train_data, min_freq=1)
    ASR.build_vocab(train_data, min_freq=1)

    return train_data, valid_data, test_data, TTX, TRG, ASR
Example #8
import random
import torch.optim as optim
import time

tokenizer = word_tokenize
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
TEXT = Field(tokenize=tokenizer, include_lengths=True)
LABEL = LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.300d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device)


class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
Example #9
    def test_xnli(self):
        batch_size = 4

        # create fields
        TEXT = Field()
        GENRE = LabelField()
        LABEL = LabelField()
        LANGUAGE = LabelField()

        # create val/test splits, XNLI does not have a training set
        val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

        # check both are XNLI datasets
        assert type(val) == type(test) == XNLI

        # check all have the correct number of fields
        assert len(val.fields) == len(test.fields) == 5

        # check fields are the correct type
        assert type(val.fields['premise']) == Field
        assert type(val.fields['hypothesis']) == Field
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField
        assert type(val.fields['language']) == LabelField

        assert type(test.fields['premise']) == Field
        assert type(test.fields['hypothesis']) == Field
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField
        assert type(test.fields['language']) == LabelField

        # check each is the correct length
        assert len(val) == 37350
        assert len(test) == 75150

        # build vocabulary
        TEXT.build_vocab(val)
        LABEL.build_vocab(val)
        GENRE.build_vocab(val)
        LANGUAGE.build_vocab(val)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        val_iter, test_iter = Iterator.splits((val, test),
                                              batch_size=batch_size)

        # get a batch to test
        batch = next(iter(val_iter))

        # split premise and hypothesis from tuples to tensors
        premise = batch.premise
        hypothesis = batch.hypothesis
        label = batch.label
        genre = batch.genre
        language = batch.language

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor
        assert type(language) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size
        assert language.shape[-1] == batch_size

        # xnli cannot use the iters method, ensure raises error
        with self.assertRaises(NotImplementedError):
            val_iter, test_iter = XNLI.iters(batch_size=batch_size)

        # remove downloaded xnli directory
        shutil.rmtree('.data/xnli')
Example #10
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    parser = argparse.ArgumentParser()
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    opt = parser.parse_args()

    english = Field(sequential=True,
                    use_vocab=True,
                    tokenize=tokenize_eng,
                    lower=True,
                    pad_token='<blank>',
                    init_token='<s>',
                    eos_token='</s>')

    german = Field(sequential=True,
                   use_vocab=True,
                   tokenize=tokenize_ger,
                   lower=True,
                   pad_token='<blank>',
                   init_token='<s>',
                   eos_token='</s>')

    fields = {'English': ('eng', english), 'German': ('ger', german)}
    train_data, test_data = TabularDataset.splits(path='',
                                                  train='train.json',
                                                  test='test.json',
                                                  format='json',
                                                  fields=fields)

    english.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get source language vocabulary size:', len(english.vocab))

    german.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get target language vocabulary size:', len(german.vocab))

    batch_size = opt.batch_size
    # data = pickle.load(open(opt.data_file, 'rb'))

    opt.src_pad_idx = english.vocab.stoi['<blank>']
    opt.trg_pad_idx = german.vocab.stoi['<blank>']

    opt.src_vocab_size = len(english.vocab)
    opt.trg_vocab_size = len(german.vocab)

    devices = [0, 1, 2, 3]
    pad_idx = opt.trg_pad_idx  # use the padding index, not the vocabulary size
    model = make_model(len(english.vocab), len(german.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(german.vocab),
                               padding_idx=pad_idx,
                               smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 12000
    train_iter = MyIterator(train_data,
                            batch_size=BATCH_SIZE,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    valid_iter = MyIterator(test_data,
                            batch_size=BATCH_SIZE,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn,
                            train=False)
    model_par = nn.DataParallel(model, device_ids=devices)

    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))
    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator,
                                      criterion,
                                      devices=devices,
                                      opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator,
                                             criterion,
                                             devices=devices,
                                             opt=None))
        print(loss)

    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != english.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model,
                            src,
                            src_mask,
                            max_len=60,
                            start_symbol=german.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = german.vocab.itos[out[0, i]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = german.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        break
Example #11
train_iter = BucketIterator(
    train,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
test_iter = BucketIterator(
    test,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)

text_field.build_vocab(train, min_freq=flor.log("min_freq", 5))


# LSTM model
class LSTM(nn.Module):
    def __init__(self, dimension=128):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(len(text_field.vocab), dimension)
        flor.log("embedding", self.embedding)
        self.lstm = nn.LSTM(
            input_size=dimension,
            hidden_size=dimension,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
Example #12

german = Field(tokenize=tokenizer_ger,
               lower=True,
               init_token='<sos>',
               eos_token='<eos>')
english = Field(tokenize=tokenizer_eng,
                lower=True,
                init_token='<sos>',
                eos_token='<eos>')

train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                         fields=(german,
                                                                 english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


# model
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers,
                 dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
Example #13
def data_preprocessing():
    SEED = 1234

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # import de_core_news_sm, en_core_web_sm
    # spacy_de = de_core_news_sm.load()
    # spacy_en = en_core_web_sm.load()
    # spacy_de = spacy.load('de_core_news_sm')
    # spacy_en = spacy.load('en_core_web_sm')

    # A Field object specifies how a column is processed: the tokenizer, lowercasing, start/end tokens, padding token, vocabulary, etc.
    # We create two Field objects, SRC and TRG; tokenize is the tokenizer function defined earlier.
    # Each sentence gets <sos> prepended and <eos> appended, and all words are lowercased.
    SRC = Field(tokenize=tokenize_de,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)
    TRG = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    # The splits method loads the training, validation and test sets in one call;
    # exts selects which language is the source and which the target, and fields takes the Field objects defined above.
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                        fields=(SRC, TRG))

    # print(f"Number of training examples: {len(train_data.examples)}")
    # print(f"Number of validation examples: {len(valid_data.examples)}")
    # print(f"Number of testing examples: {len(test_data.examples)}")

    # vars() returns an object's attributes and their values as a dictionary.
    # print(vars(train_data.examples[0]))

    # Build the vocabulary, i.e. assign each word an integer index so the data can be fed to the model.
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    # print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    # print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # print(device)

    BATCH_SIZE = 128

    # BucketIterator: unlike a standard iterator, it batches samples of similar length together.
    # In text processing, each batch usually has to be padded to the length of the longest sequence in that batch,
    # so when sample lengths vary widely, BucketIterator improves padding efficiency.
    # In addition, the fix_length parameter of Field can be used to truncate or pad samples to a fixed length.

    # When the iterator produces a batch, all source sentences must be padded to the same length, and likewise the target sentences.
    # torchtext does this automatically via dynamic padding: every sentence in a batch is padded to the length of the longest sentence in that batch.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device)

    return SRC, TRG, device, train_iterator, valid_iterator, test_iterator
Example #14
    print(batch_data.text)


# **Embedding**

# The following shows how to use pretrained word vectors in torchtext and pass them on to a neural network model for training

# In[25]:


# Load the pretrained word vectors
vectors = Vectors(name="data/tnews_jieba_tencent_embeddings.txt")
# Specify how missing (out-of-vocabulary) vectors are initialized
vectors.unk_init = nn.init.uniform_
# Build the vocabulary from the training set only
TEXT.build_vocab(train_dataset_torchtext, vectors=vectors)
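
# A minimal sketch (an assumption, not part of the original notebook) of feeding the aligned
# vectors into a model: copy TEXT.vocab.vectors into an nn.Embedding layer.
# freeze=False lets the pretrained embeddings be fine-tuned during training.
embedding_layer = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)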


# In[26]:


# Count word frequencies
TEXT.vocab.freqs.most_common(10)


# **Iterators**
# 
# * **Iterator**: builds batches while keeping the original sample order; suitable for the test set
# 
# * **BucketIterator**: automatically batches examples of similar length together to minimize padding; suitable for the training and validation sets
# 
# (a construction sketch of both follows below)
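
# A minimal sketch (not part of the original notebook) of how both iterator types could be
# constructed here. The names valid_dataset_torchtext and test_dataset_torchtext, the batch
# size, and the sort key on x.text are assumptions for illustration only.
from torchtext.data import BucketIterator, Iterator  # torchtext.legacy.data on newer releases
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator for train/validation: groups samples of similar length to reduce padding.
train_iter, valid_iter = BucketIterator.splits(
    (train_dataset_torchtext, valid_dataset_torchtext),
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

# Iterator for the test set: keeps the original sample order.
test_iter = Iterator(test_dataset_torchtext,
                     batch_size=64,
                     train=False,
                     sort=False,
                     shuffle=False,
                     device=device)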
Example #15
class Preprocessing(Dataset):

    __tokPattern = r"""[0-9A-Za-z_]*[A-Za-z_-]+[0-9A-Za-z_]*|\.|\!|\?|\d+|\-|%|[.,!?;'"]"""
    __supportedExtensions = ['txt', 'csv', 'json']
    __seedAttrs = ['nFirst', 'minFreq']

    def __init__(self,
                 fileParams={},
                 tokenizationOption='regex',
                 seedParams={
                     'nFirst': 1,
                     'minFreq': 5
                 },
                 fieldParams={
                     'lower': True,
                     'eos_token': '<!EOS!>'
                 },
                 spacyObj=None):

        self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
            fileParams)
        self.__seedParams = checkSeedParams(seedParams)
        self.__DataVocab = Field(**fieldParams)
        self.__spacyObj = spacyObj
        self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
        self.__readFile()

    @property
    def getFileName(self):
        return self.__fileName

    @property
    def getVocab(self):
        return self.__DataVocab

    def __readFile(self):
        text = readFiles(self.__fileName, self.__fileExtension,
                         self.__parsingColumn)
        self.examples = self.__getObjects(text)
        self.__seeds = getStartWords(self.__seedParams, text)
        self.__build_vocab()

    def __getObjects(self, text):
        self.fields = {"src": self.__DataVocab}
        return [Document(**self.__tokenize(instance)) for instance in text]

    def __build_vocab(self):
        self.__DataVocab.build_vocab(self)
        for instance in self.examples:
            instance.create_tokens(self.__DataVocab)

    def __regexTokenization(self, document):
        return re.findall(self.__tokPattern, document)

    def __nltkTokenization(self, document):
        return self.tokenizer(document)

    def __spacyTokenization(self, instance):
        return [
            entity.text.strip() for entity in self.__spacyObj(instance)
            if entity.text.strip()
        ]

    def __tokenize(self, instance):
        instance = self.__customTokenize(instance)
        return {'src': instance, 'trg': instance[1:]}

    @checkParams(str)
    def __tokenizationMethod(self, param):
        param = param.lower()

        if param == 'nltk':
            self.tokenizer = importNltk()
            return self.__nltkTokenization

        elif param == 'regex':
            return self.__regexTokenization

        elif param == 'spacy':
            if not self.__spacyObj:
                raise Exception(
                    "Please provide the spacy object to tokenize with.")

            return self.__spacyTokenization

        raise Exception(
            "The parameter 'tokenizationOption' can only be nltk, regex and spacy"
        )

    def getSeed(self):
        """
            Return a weighted random seed.
            If a static seed is enabled, the most frequent token is used as the seed.
        """
        seeds = list(self.__seeds.keys())
        probs = list(self.__seeds.values())
        return choice(seeds, 1, p=probs).tolist()
Example #16
texts = np.concatenate((train_, eval_))
labels = np.concatenate((train_labels, eval_labels))

df = pd.DataFrame({'text': texts, 'label': labels})

text_field = Field(sequential=True,
                   tokenize='basic_english',
                   fix_length=5,
                   lower=True)

label_field = Field(sequential=False, use_vocab=False, is_target=True)

preprocessed_text = df['text'].apply(lambda x: text_field.preprocess(x))
# text_field.build_vocab(preprocessed_text, vectors='fasttext.simple.300d')
text_field.build_vocab(preprocessed_text, vectors='glove.6B.50d')
vocab = text_field.vocab

ltoi = {l: i for i, l in enumerate(df['label'].unique())}
df['label'] = df['label'].apply(lambda y: ltoi[y])


class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [Example.fromlist(list(r), fields) for i, r in df.iterrows()],
            fields)


train_dataset, test_dataset = DataFrameDataset(df=df,
                                               fields=(('text', text_field),
Example #17
Valoracion = Field(sequential=False, use_vocab=False)

fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}

train_data, test_data = TabularDataset.splits(
                                        path='/content/Dataset',
                                        train='train.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=fields)

len(train_data) , len(test_data)

print(vars(train_data.examples[0]))

Texto.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")

Texto.vocab.freqs.most_common(25)

Texto.vocab.itos[:10]

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device
)

class RNN_LSTM(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
Example #18
datafields = [("input", INPUT), ("target", TARGET)]

trn, vld, tst = TabularDataset.splits(path="data/" + data_size,
                                      train=train_csv,
                                      validation=validation_csv,
                                      test=test_csv,
                                      format='csv',
                                      skip_header=True,
                                      fields=datafields)

print(f"Number of {data_size} training examples: {len(trn.examples)}")
print(f"Number of {data_size} validation examples: {len(vld.examples)}")
print(f"Number of {data_size} test examples: {len(tst.examples)}")

INPUT.build_vocab(trn)
TARGET.build_vocab(trn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, val_iter, test_iter = BucketIterator.splits(
    (trn, vld, tst),
    sort_key=lambda x: len(x.input),
    sort_within_batch=False,
    batch_size=BATCH_SIZE,
    device=device)
"""
Build Transformer
"""

Example #19
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

TGT = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TGT))

SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 8
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)

if __name__ == "__main__":
    syn_data = synthetic_data(8, 2, 1)
    for i, batch in enumerate(syn_data):
        logging.info("batch-src shape {}, batch-src: {}".format(
            batch.src.shape, batch.src))
        logging.info("batch-tgt shape {}, batch-tgt: {}".format(
            batch.tgt.shape, batch.tgt))
        logging.info("batch-src-mask shape {}, batch-src-mask: {}".format(
            batch.src_mask.shape, batch.src_mask))
Example #20
device

eng=spacy.load('en')
ger=spacy.load('de_core_news_sm')

def Tokenize_eng(text):
  return [a.text for a in eng.tokenizer(text)]
def Tokenize_german(text):
  return [b.text for b in ger.tokenizer(text)]

german=Field(tokenize=Tokenize_german,lower=True,init_token='<sos>',eos_token='<eos>')
english=Field(tokenize=Tokenize_eng,lower=True,init_token='<sos>',eos_token='<eos>')

Train,Val,Test=Multi30k.splits(exts=('.de','.en'),fields=(german,english))

german.build_vocab(Train,max_size=10000,min_freq=2)
english.build_vocab(Train,max_size=10000,min_freq=2)

##building encoder
class Encoder(Module):
  def __init__(self,inp_size,emd_size,hidden_size):
    super(Encoder,self).__init__()
    self.inp_size=inp_size
    self.emd_size=emd_size
    self.hidden_size=hidden_size
    self.drop=Dropout(0.5)
    self.embed=Embedding(self.inp_size,self.emd_size)
    self.lstm=LSTM(self.emd_size,self.hidden_size,bidirectional=True)
    self.fc_hidden=Linear(self.hidden_size*2,self.hidden_size)
    self.fc_cell=Linear(self.hidden_size*2,self.hidden_size)
  def forward(self,x):
Example #21
random.seed(1337)
torch.manual_seed(1337)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Create the fields.
WORD = Field(include_lengths=True)
UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

# Download and load the default data.
train, val, test = UDPOS.splits(
    fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
    filter_pred=lambda ex: 5 < len(ex.word) < 30,
)

WORD.build_vocab(train.word, min_freq=3)
UD_TAG.build_vocab(train.udtag)
train_iter = torch_struct.data.TokenBucket(train,
                                           batch_size=100,
                                           device="cuda:0")

H = 256
T = 30
NT = 30
model = NeuralCFG(len(WORD.vocab), T, NT, H)
if args.script:
    print("scripting...")
    model = torch.jit.script(model)
model.cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=[0.75, 0.999])
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols',
                        '-s',
                        type=int,
                        default=32000,
                        help="Vocabulary size")
    parser.add_argument(
        '--min-frequency',
        type=int,
        default=6,
        metavar='FREQ',
        help=
        'Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
    parser.add_argument(
        '--dict-input',
        action="store_true",
        help=
        "If set, input file is interpreted as a dictionary where each line contains a word-count pair"
    )
    parser.add_argument(
        '--separator',
        type=str,
        default='@@',
        metavar='STR',
        help=
        "Separator between non-final subword units (default: '%(default)s'))")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folder if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge files into one.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train,
                                         opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test,
                                       opt.prefix + '-test')

    # Build up the code from training files if not exist
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(
            f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols,
                  opt.min_frequency, True)
    sys.stderr.write(f"BPE codes prepared.\n")

    sys.stderr.write(f"Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write(f"Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir,
                 opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write(f"Done.\n")

    field = Field(tokenize=str.split,
                  lower=True,
                  pad_token=Constants.PAD_WORD,
                  init_token=Constants.BOS_WORD,
                  eos_token=Constants.EOS_WORD)

    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(fields=fields,
                               path=os.path.join(opt.data_dir,
                                                 enc_train_files_prefix),
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {
        'settings': opt,
        'vocab': field,
    }
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
Example #23
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                        fields=(german,
                                                                english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNN's
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)

    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab),
                    device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model,
                        optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()

        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german,
                                                 english,
                                                 device,
                                                 max_length=50)

        print(f"Translated example sentence: \n {translated_sentence}")

        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and get to cuda
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)

            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
            # doesn't take input in that form. For example if we have MNIST we want to have
            # output to be: (N, 10) and targets just (N). Here we can view it in a similar
            # way that we have output_words * batch_size that we want to send into
            # our cost function, so we need to do some reshaping. While we're at it,
            # let's also remove the start token.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score*100:.2f}")
Example #24
def main_wo_bpe():
    '''
    Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab
    '''

    spacy_support_langs = [
        'de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('-lang_src',
                        required=True,
                        choices=spacy_support_langs)
    parser.add_argument('-lang_trg',
                        required=True,
                        choices=spacy_support_langs)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-data_src', type=str, default=None)
    parser.add_argument('-data_trg', type=str, default=None)

    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('-min_word_count', type=int, default=3)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    #parser.add_argument('-ratio', '--train_valid_test_ratio', type=int, nargs=3, metavar=(8,1,1))
    #parser.add_argument('-vocab', default=None)

    opt = parser.parse_args()
    assert not any([opt.data_src, opt.data_trg
                    ]), 'Custom data input is not supported yet.'
    assert not any([opt.data_src, opt.data_trg]) or all(
        [opt.data_src, opt.data_trg])
    print(opt)

    src_lang_model = spacy.load(opt.lang_src)
    trg_lang_model = spacy.load(opt.lang_trg)

    def tokenize_src(text):
        return [tok.text for tok in src_lang_model.tokenizer(text)]

    def tokenize_trg(text):
        return [tok.text for tok in trg_lang_model.tokenizer(text)]

    SRC = Field(tokenize=tokenize_src,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    TRG = Field(tokenize=tokenize_trg,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    MAX_LEN = opt.max_len
    MIN_FREQ = opt.min_word_count

    if not all([opt.data_src, opt.data_trg]):
        assert {opt.lang_src, opt.lang_trg} == {'de', 'en'}
    else:
        # Pack custom txt file into example datasets
        raise NotImplementedError

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN

    train, val, test = Multi30k.splits(exts=('.' + opt.lang_src,
                                             '.' + opt.lang_trg),
                                       fields=(SRC, TRG),
                                       filter_pred=filter_examples_with_length)

    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    print('[Info] Get source language vocabulary size:', len(SRC.vocab))
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
    print('[Info] Get target language vocabulary size:', len(TRG.vocab))

    if opt.share_vocab:
        print('[Info] Merging two vocabulary ...')
        for w, _ in SRC.vocab.stoi.items():
            # TODO: Also update the `freq`, although it is not likely to be used.
            if w not in TRG.vocab.stoi:
                TRG.vocab.stoi[w] = len(TRG.vocab.stoi)
        TRG.vocab.itos = [None] * len(TRG.vocab.stoi)
        for w, i in TRG.vocab.stoi.items():
            TRG.vocab.itos[i] = w
        SRC.vocab.stoi = TRG.vocab.stoi
        SRC.vocab.itos = TRG.vocab.itos
        print('[Info] Get merged vocabulary size:', len(TRG.vocab))

    data = {
        'settings': opt,
        'vocab': {
            'src': SRC,
            'trg': TRG
        },
        'train': train.examples,
        'valid': val.examples,
        'test': test.examples
    }

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
Example #25
train_iter = BucketIterator(
    train,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
test_iter = BucketIterator(
    test,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)

text_field.build_vocab(train, min_freq=5)


# LSTM model
class LSTM(nn.Module):
    def __init__(self, dimension=128):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(len(text_field.vocab), dimension)
        flor.log("embedding", self.embedding)
        self.lstm = nn.LSTM(
            input_size=dimension,
            hidden_size=dimension,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
Example #26
train_data = pd.DataFrame(train_lines)
valid_data = pd.DataFrame(train_lines[dataset_length - valid_size:])

train_data = [
    Example.fromlist([train_data.questions[i], train_data.answers[i]], fields)
    for i in range(train_data.shape[0])
]
valid_data = [
    Example.fromlist([valid_data.questions[i], valid_data.answers[i]], fields)
    for i in range(valid_data.shape[0])
]

train_data = Dataset(train_data, fields)
valid_data = Dataset(valid_data, fields)

Question.build_vocab(train_data, min_freq=2)
Answer.build_vocab(
    train_data,
    vectors=torchtext.vocab.Vectors("./python_code_glove_embedding_300.txt"),
    min_freq=2)

print(f"Unique tokens in Question vocabulary: {len(Question.vocab)}")
print(f"Unique tokens in Answer vocabulary: {len(Answer.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

print('BATCH_SIZE:', BATCH_SIZE)

train_iterator, valid_iterator = BucketIterator.splits(