"""##Tokenizer"""

# imports assumed to come from the DeepPavlov library (0.x module paths)
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer
from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.core.data.simple_vocab import SimpleVocabulary

tokenizer = NLTKMosesTokenizer()
# sanity check: tokenize a one-sentence batch
tokenizer(['Kaggle is the best place to study machine learning.'])
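# expected to return a batch with one token list, roughly:
# [['Kaggle', 'is', 'the', 'best', 'place', 'to', 'study', 'machine', 'learning', '.']]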

# tokenize and lowercase the training texts (element [0] of the train instances)
train_x_lower_tokenized = str_lower(tokenizer(train_iterator.get_instances(data_type='train')[0]))

"""##Vocabulary"""

# initialize a simple vocabulary to collect all classes that appear in the dataset
classes_vocab = SimpleVocabulary(
    save_path='./tmp/classes.dict',
    load_path='./tmp/classes.dict')

classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

# show classes
list(classes_vocab.items())

# one can also collect a vocabulary of textual tokens that appear two or more times in the dataset
token_vocab = SimpleVocabulary(
    save_path='./tmp/tokens.dict',
    load_path='./tmp/tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>')

token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
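
# A minimal usage sketch (an assumption, not part of the original): DeepPavlov
# vocabularies are callable on batches of token lists, mapping tokens to integer
# indices; words outside the vocabulary fall back to the '<UNK>' index.
sample_batch = str_lower(tokenizer(['Kaggle is the best place to study machine learning.']))
print(token_vocab(sample_batch))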
Example No. 2
import os

import torch
import torch.cuda as cuda
from torch.nn.functional import softmax
from torch.utils.data import DataLoader
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from Task_2_work_ver.Task_1_character_lm.plot_loss import plot_loss
from Task_2_work_ver.Task_1_character_lm.get_func import read_infile, Dataset, Padder, Config

base_path = r'C:\Users\Andrey'
experiments_path = r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r\data\Task_1'

train_words = read_infile(os.path.join(base_path, "russian-train-high"))
test_words = read_infile(os.path.join(base_path, "russian-test"))
vocab = SimpleVocabulary(special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
                         unk_token='UNK',
                         save_path=experiments_path)
vocab.fit([list(x) for x in train_words])
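
# A hedged check (illustrative only): the fitted vocabulary maps characters to
# indices, so a word can be encoded as a list of character ids.
print(len(vocab))                      # distinct characters plus the special tokens
print(vocab([list(train_words[0])]))   # first training word as character indices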
config = Config(lr=0.0001, batch_size=512, num_epochs=1000)

net = torch.load(os.path.join(experiments_path, "net.pb"))

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ============================================================================

# Write a function predict_on_batch that outputs letter probabilities of all words in the batch.
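
# A hedged sketch of predict_on_batch (assumptions: `net` maps a padded batch of
# character indices of shape (batch, seq_len) to logits of shape
# (batch, seq_len, vocab_size); the exact interface of the saved model may differ).
def predict_on_batch(model, batch, device=device):
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        inputs = batch.to(device)         # padded character-index tensor, assumed
        logits = model(inputs)            # per-position logits over the vocabulary
        probs = softmax(logits, dim=-1)   # letter probabilities for every position
    return probs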


def generate(model, max_length=20, start_index=2, end_index=3, device=device):
    # start_index / end_index presumably correspond to the 'BEGIN' and 'END'
    # special tokens (positions 2 and 3 in the vocabulary's special_tokens)
    cur_index, length = start_index, 0
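    # A hedged completion sketch (not from the original): assumes the model maps an
    # index sequence of shape (1, seq_len) to logits of shape (1, seq_len, vocab_size);
    # greedily pick the most probable next character until END or max_length is hit.
    model = model.to(device)
    indices = [cur_index]
    with torch.no_grad():
        while cur_index != end_index and length < max_length:
            inputs = torch.tensor([indices], device=device)
            probs = softmax(model(inputs)[0, -1], dim=-1)
            cur_index = int(probs.argmax())
            indices.append(cur_index)
            length += 1
    return indices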
Example No. 3
# imports assumed to come from the DeepPavlov library (0.x module paths)
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor
from deeppavlov.models.preprocessors.one_hotter import OneHotter
from deeppavlov.models.classifiers.proba2labels import Proba2Labels
from deeppavlov.models.bert.bert_classifier import BertClassifierModel
from deeppavlov.core.data.simple_vocab import SimpleVocabulary

reader = BasicClassificationDatasetReader()
data = reader.read(data_path="./stanfordSentimentTreebank",
                   train="/content/train.csv",
                   valid="/content/valid.csv",
                   test="/content/test1.csv",
                   x="original",
                   y="meanGrade")
iterator = BasicClassificationDatasetIterator(data, seed=42, shuffle=True)
bert_preprocessor = BertPreprocessor(
    vocab_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/vocab.txt",
    do_lower_case=False,
    max_seq_length=64)
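
# A hedged usage sketch (an assumption, not part of the original): the preprocessor
# converts a batch of raw texts into BERT input features (token ids, masks, segment
# ids) that the classifier below consumes.
sample_texts = iterator.get_instances(data_type="train")[0][:2]
sample_features = bert_preprocessor(sample_texts)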
vocab = SimpleVocabulary(save_path="./binary_classes.dict")
vocab.fit(iterator.get_instances(data_type="train")[1])
one_hotter = OneHotter(depth=vocab.len, single_vector=True)
prob2labels = Proba2Labels(max_proba=True)
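
# Pipeline note (a description, not code from the original): the vocabulary maps
# class labels to indices, OneHotter turns label indices into one-hot target
# vectors of depth vocab.len, and Proba2Labels converts the classifier's predicted
# probability vectors back into label indices (via argmax when max_proba=True).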
bert_classifier = BertClassifierModel(
    n_classes=vocab.len,
    return_probas=True,
    one_hot_labels=True,
    bert_config_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_config.json",
    pretrained_bert="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
    save_path="sst_bert_model/model",
    load_path="sst_bert_model/model",
    keep_prob=0.5,
    learning_rate=0.5,
    learning_rate_drop_patience=5,