"""##Tokenizer""" tokenizer = NLTKMosesTokenizer() # check tokenizer(['Kaggle is the best place to study machine learning.']) train_x_lower_tokenized = str_lower(tokenizer(train_iterator.get_instances(data_type='train')[0])) """##Vocabulary""" # initialize simple vocabulary to collect all appeared in the dataset classes classes_vocab = SimpleVocabulary( save_path='./tmp/classes.dict', load_path='./tmp/classes.dict') classes_vocab.fit((train_iterator.get_instances(data_type='train')[1])) classes_vocab.save() # show classes list(classes_vocab.items()) # also one can collect vocabulary of textual tokens appeared 2 and more times in the dataset token_vocab = SimpleVocabulary( save_path='./tmp/tokens.dict', load_path='./tmp/tokens.dict', min_freq=2, special_tokens=('<PAD>', '<UNK>',), unk_token='<UNK>') token_vocab.fit(train_x_lower_tokenized) token_vocab.save()
"""##Character-level language model"""

import os

import torch
import torch.cuda as cuda
from torch.nn.functional import softmax
from torch.utils.data import DataLoader

from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from Task_2_work_ver.Task_1_character_lm.plot_loss import plot_loss
from Task_2_work_ver.Task_1_character_lm.get_func import read_infile, Dataset, Padder, Config

base_path = r'C:\Users\Andrey'
experiments_path = r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r\data\Task_1'

train_words = read_infile(os.path.join(base_path, "russian-train-high"))
test_words = read_infile(os.path.join(base_path, "russian-test"))

vocab = SimpleVocabulary(special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
                         unk_token='UNK',
                         save_path=experiments_path)
vocab.fit([list(x) for x in train_words])

config = Config(lr=0.0001, batch_size=512, num_epochs=1000)
net = torch.load(os.path.join(experiments_path, "net.pb"))

device = 'cuda' if cuda.is_available() else 'cpu'

# ============================================================================
# Task: write a function predict_on_batch that outputs letter probabilities of
# all words in the batch; generate() below samples new words from the model.
def generate(model, max_length=20, start_index=2, end_index=3, device=device):
    cur_index, length = start_index, 0
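    # The rest of generate() was cut off in this export; what follows is a
    # hedged completion sketch, assuming the model maps a (1, t) tensor of
    # character ids to logits of shape (1, t, vocab_size). With the special
    # tokens above, index 2 is 'BEGIN' and index 3 is 'END'.
    model.eval()
    indices = [cur_index]
    with torch.no_grad():
        while length < max_length:
            inputs = torch.tensor([indices], dtype=torch.long, device=device)
            logits = model(inputs)                  # (1, t, vocab_size), assumed interface
            probs = softmax(logits[0, -1], dim=-1)  # distribution over the next character
            cur_index = int(torch.multinomial(probs, 1))  # sample; argmax would be greedy
            if cur_index == end_index:
                break
            indices.append(cur_index)
            length += 1
    # drop the BEGIN token and map ids back to characters via the vocabulary
    return ''.join(vocab[i] for i in indices[1:])

# A hedged sketch of predict_on_batch, the function the task above asks for:
# for every word in a batch it returns the model's probability distribution
# over letters at each position. Assumes the batch is already padded to a
# LongTensor of character ids (e.g. by Padder) and the same model interface
# as in generate().
def predict_on_batch(model, batch, device=device):
    model.eval()
    with torch.no_grad():
        logits = model(batch.to(device))  # (batch_size, max_len, vocab_size)
        return softmax(logits, dim=-1)    # letter probabilities per position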
"""##BERT classifier"""

from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.preprocessors.one_hotter import OneHotter
from deeppavlov.models.classifiers.proba2labels import Proba2Labels
from deeppavlov.models.bert.bert_classifier import BertClassifierModel

reader = BasicClassificationDatasetReader()
data = reader.read(data_path="./stanfordSentimentTreebank",
                   train="/content/train.csv",
                   valid="/content/valid.csv",
                   test="/content/test1.csv",
                   x="original",
                   y="meanGrade")

iterator = BasicClassificationDatasetIterator(data, seed=42, shuffle=True)

bert_preprocessor = BertPreprocessor(
    vocab_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/vocab.txt",
    do_lower_case=False,
    max_seq_length=64)

# collect the set of target labels from the training split
vocab = SimpleVocabulary(save_path="./binary_classes.dict")
vocab.fit(iterator.get_instances(data_type="train")[1])

one_hotter = OneHotter(depth=vocab.len, single_vector=True)
prob2labels = Proba2Labels(max_proba=True)

bert_classifier = BertClassifierModel(
    n_classes=vocab.len,
    return_probas=True,
    one_hot_labels=True,
    bert_config_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_config.json",
    pretrained_bert="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
    save_path="sst_bert_model/model",
    load_path="sst_bert_model/model",
    keep_prob=0.5,
    learning_rate=0.5,  # note: far above typical BERT fine-tuning rates (~2e-5)
    learning_rate_drop_patience=5)
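# A minimal training-and-inference sketch (an assumption, not the original
# notebook code): DeepPavlov's BertClassifierModel exposes train_on_batch()
# and __call__(); the single-epoch loop and batch_size=32 are illustrative.
for texts, labels in iterator.gen_batches(batch_size=32, data_type="train"):
    features = bert_preprocessor(texts)
    onehot_labels = one_hotter(vocab(labels))
    bert_classifier.train_on_batch(features, onehot_labels)

# inference: probabilities -> class labels (run in batches for large splits)
valid_texts, valid_labels = iterator.get_instances(data_type="valid")
probas = bert_classifier(bert_preprocessor(valid_texts))
predictions = vocab(prob2labels(probas))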