def __init__(self,
             model: SiameseModel,
             batch_size: int,
             num_context_turns: int = 1,
             ranking: bool = True,
             attention: bool = False,
             responses: SimpleVocabulary = None,
             preproc_func: Callable = None,
             interact_pred_num: int = 3,
             *args,
             **kwargs) -> None:
    super().__init__()
    self.batch_size = batch_size
    self.num_context_turns = num_context_turns
    self.ranking = ranking
    self.attention = attention
    self.preproc_responses = []
    self.response_embeddings = None
    self.preproc_func = preproc_func
    self.interact_pred_num = interact_pred_num
    self.model = model
    if self.ranking:
        self.responses = {el[1]: el[0] for el in responses.items()}
        self._build_preproc_responses()
        if not self.attention:
            self._build_response_embeddings()
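# A hypothetical usage sketch: this __init__ matches DeepPavlov's
# SiamesePredictor, so construction would look roughly as below;
# `ranking_model` and `responses_vocab` are placeholders, not objects
# defined in this snippet.
predictor = SiamesePredictor(model=ranking_model,
                             batch_size=32,
                             num_context_turns=1,
                             ranking=True,
                             responses=responses_vocab)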
def __init__(self,
             data: Dict[str, List[Union[str, Path]]],
             load_path: Union[str, Path],
             seed: Optional[int] = None,
             shuffle: bool = True,
             unroll_steps: Optional[int] = None,
             n_gpus: Optional[int] = None,
             max_word_length: Optional[int] = None,
             bos: str = "<S>",
             eos: str = "</S>",
             *args,
             **kwargs) -> None:
    self.unroll_steps = unroll_steps
    self.n_gpus = n_gpus
    self.bos = bos
    self.eos = eos
    self.str_utf8_encoder = StrUTF8Encoder(
        max_word_length=max_word_length,
        pad_special_char_use=True,
        word_boundary_special_char_use=True,
        sentence_boundary_special_char_use=False,
        reversed_sentense_tokens=False,
        bos=self.bos,
        eos=self.eos,
        save_path=load_path,
        load_path=load_path,
    )
    self.simple_vocab = SimpleVocabulary(
        min_freq=2,
        special_tokens=[self.eos, self.bos, "<UNK>"],
        unk_token="<UNK>",
        freq_drop_load=True,
        save_path=load_path,
        load_path=load_path,
    )
    super().__init__(data, seed, shuffle, *args, **kwargs)
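# A hypothetical usage sketch: this __init__ matches DeepPavlov's
# ELMoFilePathsIterator, which takes a dict mapping split names to lists of
# text-file paths; the paths below are placeholders.
iterator = ELMoFilePathsIterator(data={'train': ['./train.txt'],
                                       'valid': ['./valid.txt']},
                                 load_path='./elmo_vocab',
                                 unroll_steps=20,
                                 max_word_length=50)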
import os

import numpy as np
import torch
import torch.cuda as cuda
from torch.nn.functional import softmax
from torch.utils.data import DataLoader

from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from Task_2_work_ver.Task_1_character_lm.plot_loss import plot_loss
from Task_2_work_ver.Task_1_character_lm.get_func import read_infile, Dataset, Padder, Config

base_path = r'C:\Users\Andrey'
experiments_path = r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r\data\Task_1'

train_words = read_infile(os.path.join(base_path, "russian-train-high"))
test_words = read_infile(os.path.join(base_path, "russian-test"))

vocab = SimpleVocabulary(special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
                         unk_token='UNK',
                         save_path=experiments_path)
vocab.fit([list(x) for x in train_words])

config = Config(lr=0.0001, batch_size=512, num_epochs=1000)
net = torch.load(os.path.join(experiments_path, "net.pb"))

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# ============================================================================
# Write a function predict_on_batch that outputs letter probabilities of all words in the batch.
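# A minimal sketch of predict_on_batch, assuming `net` maps a padded batch of
# symbol indices to per-position logits over the vocabulary (that interface is
# an assumption; the snippet does not define the network).
def predict_on_batch(net, batch, device='cpu'):
    """Return letter probabilities for every word in the batch."""
    net.eval()  # disable dropout and other training-only behaviour
    with torch.no_grad():
        logits = net(batch.to(device))   # assumed shape: (batch, seq_len, vocab_size)
        probs = softmax(logits, dim=-1)  # normalize logits into letter probabilities
    return probs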
# check str_lower(['Kaggle is the best place to study machine learning.'])

"""##Tokenizer"""

tokenizer = NLTKMosesTokenizer()

# check tokenizer(['Kaggle is the best place to study machine learning.'])

train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

"""##Vocabulary"""

# initialize a simple vocabulary to collect all classes that appear in the dataset
classes_vocab = SimpleVocabulary(
    save_path='./tmp/classes.dict',
    load_path='./tmp/classes.dict')

classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

# show the classes
list(classes_vocab.items())

# one can also collect a vocabulary of textual tokens that appear 2 or more times in the dataset
token_vocab = SimpleVocabulary(
    save_path='./tmp/tokens.dict',
    load_path='./tmp/tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>')
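# Fit and persist the token vocabulary, mirroring the classes_vocab steps above;
# the most_common call is a quick sanity check of what was collected.
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
token_vocab.freqs.most_common()[:10]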
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('/rapids/notebooks/my_data/BMSTU_hack/')

import numpy as np
import torch
import telebot

import biGRU_model
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from tbot import config

device = torch.device('cpu')

vocab = SimpleVocabulary(
    save_path="/rapids/notebooks/my_data/BMSTU_hack/models/vocab.dict")

# rebuild the model and restore its weights on CPU
gru = biGRU_model.BiGRU(vocab.count, embedding_dim=10, hidden_size=50, device='cpu')
gru.load_state_dict(
    torch.load('/rapids/notebooks/my_data/BMSTU_hack/models/biGRU',
               map_location=device))

bot = telebot.TeleBot(config.token)


@bot.message_handler(content_types=["text"])
def repeat_all_messages(message):
    # minimal echo body, assumed from the handler's name
    bot.send_message(message.chat.id, message.text)
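# Start long polling so the handler above receives updates; none_stop=True is
# standard telebot usage that survives transient network errors.
if __name__ == '__main__':
    bot.polling(none_stop=True)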
from deeppavlov.models.bert.bert_classifier import BertClassifierModel
from deeppavlov.metrics.accuracy import sets_accuracy
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor
from deeppavlov.models.preprocessors.one_hotter import OneHotter
from deeppavlov.models.classifiers.proba2labels import Proba2Labels

reader = BasicClassificationDatasetReader()
data = reader.read(data_path="./stanfordSentimentTreebank",
                   train="/content/train.csv",
                   valid="/content/valid.csv",
                   test="/content/test1.csv",
                   x="original",
                   y="meanGrade")

iterator = BasicClassificationDatasetIterator(data, seed=42, shuffle=True)

bert_preprocessor = BertPreprocessor(
    vocab_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/vocab.txt",
    do_lower_case=False,
    max_seq_length=64)

vocab = SimpleVocabulary(save_path="./binary_classes.dict")
vocab.fit(iterator.get_instances(data_type="train")[1])

one_hotter = OneHotter(depth=vocab.len, single_vector=True)
prob2labels = Proba2Labels(max_proba=True)

bert_classifier = BertClassifierModel(
    n_classes=vocab.len,
    return_probas=True,
    one_hot_labels=True,
    bert_config_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_config.json",
    pretrained_bert="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
    save_path="sst_bert_model/model",
    load_path="sst_bert_model/model",
    keep_prob=0.5)
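# A minimal sketch of one training epoch under the usual DeepPavlov
# conventions: texts go through the preprocessor, labels through the
# vocabulary and one-hotter, then into train_on_batch; the batch size here
# is an arbitrary choice.
for x_batch, y_batch in iterator.gen_batches(batch_size=16, data_type="train"):
    bert_classifier.train_on_batch(bert_preprocessor(x_batch),
                                   one_hotter(vocab(y_batch)))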
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer
from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.sklearn.sklearn_component import SklearnComponent

dr = BasicClassificationDatasetReader().read(data_path='./',
                                             train='train.csv',
                                             valid='valid.csv',
                                             test='test.csv',
                                             x='original',
                                             y='meanGrade')
train_iterator = BasicClassificationDatasetIterator(data=dr, seed=42)

x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:5]:
    print('x:', x)
    print('y:', y)
    print('=================')

tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

classes_vocab = SimpleVocabulary(save_path='./snips/classes.dict',
                                 load_path='./snips/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

token_vocab = SimpleVocabulary(save_path='./snips/tokens.dict',
                               load_path='./snips/tokens.dict',
                               min_freq=2,
                               special_tokens=('<PAD>', '<UNK>',),
                               unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

token_vocab.freqs.most_common()[:10]

# TF-IDF features via SklearnComponent; the arguments below are an assumed
# typical TfidfVectorizer configuration (the original call is truncated)
tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./snips/tfidf.pkl',
    load_path='./snips/tfidf.pkl')
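# Fit the TF-IDF component on the lowercased raw training texts and persist it
# (a sketch assuming the configuration above; TfidfVectorizer expects whole
# strings rather than token lists).
tfidf.fit(str_lower(train_iterator.get_instances(data_type='train')[0]))
tfidf.save()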
# print([(k, len(dr[k])) for k in dr.keys()])

# print a few x, y pairs
x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:3]:
    print('x:', x)
    print('y:', y)
    print('=================')

# tokenize all input data
tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

# get the intent categories
classes_vocab = SimpleVocabulary(save_path='./tmp/classes.dict',
                                 load_path='./tmp/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()
print(list(classes_vocab.items()))  # display classes

# get the vocabulary of all tokens
token_vocab = SimpleVocabulary(save_path='./tmp/tokens.dict',
                               load_path='./tmp/tokens.dict')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# we will use GloVe embeddings
if not os.path.isfile("./glove.6B.100d.txt"):
    simple_download(
        url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
        destination="./glove.6B.100d.txt")
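# Load the downloaded vectors with DeepPavlov's GloVeEmbedder; dim must match
# the 100-dimensional file fetched above (a sketch; pad_zero makes the
# embedder pad batches with zero vectors).
from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder

embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt',
                         dim=100,
                         pad_zero=True)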
        words.append(temp[1])
    return words

#==================================================
train_words = read_infile(os.path.join(base_path, "russian-train-high"))
dev_words = read_infile(os.path.join(base_path, "russian-dev"))
test_words = read_infile(os.path.join(base_path, "russian-test"))
print(len(train_words), len(dev_words), len(test_words))
print(*train_words[:10])

#==================================================
vocab = SimpleVocabulary(
    special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
    unk_token='UNK',
    save_path=r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r'
)
vocab.fit([list(x) for x in train_words])

#==================================================
train_dataset = Dataset(train_words, vocab)
dev_dataset = Dataset(dev_words, vocab)
test_dataset = Dataset(test_words, vocab)

#==================================================
train_batcher = DataLoader(train_dataset, batch_size=1)
dev_batcher = DataLoader(dev_dataset, batch_size=1)
test_batcher = DataLoader(test_dataset, batch_size=1)

# for i, (items, classes) in enumerate(train_batcher):
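# A sketch of consuming the batcher, completing the commented loop above; the
# (items, classes) structure is taken from that comment, and the tensor shapes
# are an assumption about the custom Dataset.
for i, (items, classes) in enumerate(train_batcher):
    print(i, items.shape, classes.shape)
    break  # inspect only the first batch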