Example #1

# Constructor of a siamese ranking predictor component; the imports and the
# enclosing class line below are assumed from DeepPavlov's siamese_predictor
# module, since the original snippet starts mid-class.
from typing import Callable

from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.core.models.component import Component
from deeppavlov.models.ranking.siamese_model import SiameseModel


class SiamesePredictor(Component):
    def __init__(self,
                 model: SiameseModel,
                 batch_size: int,
                 num_context_turns: int = 1,
                 ranking: bool = True,
                 attention: bool = False,
                 responses: SimpleVocabulary = None,
                 preproc_func: Callable = None,
                 interact_pred_num: int = 3,
                 *args,
                 **kwargs) -> None:

        super().__init__()

        self.batch_size = batch_size
        self.num_context_turns = num_context_turns
        self.ranking = ranking
        self.attention = attention
        self.preproc_responses = []
        self.response_embeddings = None
        self.preproc_func = preproc_func
        self.interact_pred_num = interact_pred_num
        self.model = model
        if self.ranking:
            # map response ids to response strings using the responses vocabulary
            self.responses = {el[1]: el[0] for el in responses.items()}
            self._build_preproc_responses()
            # without attention, response embeddings can be precomputed once
            if not self.attention:
                self._build_response_embeddings()
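
# A hedged instantiation sketch: `siamese_net` and `responses_vocab` are
# hypothetical placeholders for a trained SiameseModel and a SimpleVocabulary
# over the response base, built elsewhere (e.g., by a DeepPavlov config).
predictor = SiamesePredictor(model=siamese_net,
                             batch_size=32,
                             num_context_turns=1,
                             ranking=True,
                             responses=responses_vocab,
                             interact_pred_num=3)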
# imports assumed from the surrounding DeepPavlov tutorial
import os

from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.core.data.utils import simple_download
from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder
from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer

# tokenize a sample sentence
tokenizer = NLTKMosesTokenizer()
tokenizer(['Kaggle is the best place to study machine learning.'])

# lowercase and tokenize all training texts
# (train_iterator is a dataset iterator assumed to be built earlier)
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

"""##Vocabulary"""

# initialize a simple vocabulary to collect all classes that appear in the dataset
classes_vocab = SimpleVocabulary(
    save_path='./tmp/classes.dict',
    load_path='./tmp/classes.dict')

classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

# show classes
list(classes_vocab.items())
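
# A minimal usage sketch: the fitted vocabulary can be called on a batch of
# labels to map them to indices (the actual label values depend on the
# dataset; `train_iterator` is assumed from earlier in the tutorial).
y_sample = train_iterator.get_instances(data_type='train')[1][:3]
print(classes_vocab(y_sample))  # indices, nested the same way as the input labels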

# one can also build a vocabulary of tokens that appear in the dataset at least twice
token_vocab = SimpleVocabulary(
    save_path='./tmp/tokens.dict',
    load_path='./tmp/tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>')

token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# number of tokens in the vocabulary
len(token_vocab)
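
# A minimal usage sketch: the token vocabulary encodes tokenized batches to
# index sequences; tokens below min_freq (or never seen) map to the <UNK> index.
sample = str_lower(tokenizer(['Kaggle is the best place to study machine learning.']))
print(token_vocab(sample))       # per-sentence lists of token indices
print(token_vocab([['<UNK>']]))  # the unknown-token index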
# peek at the first few training samples
# (x_train, y_train are assumed to come from the dataset iterator)
x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:3]:
    print('x:', x)
    print('y:', y)
    print('=================')

# tokenize all input data
tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

# get the intent categories
classes_vocab = SimpleVocabulary(save_path='./tmp/classes.dict',
                                 load_path='./tmp/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()
print(list(classes_vocab.items()))  # display classes

# build a vocabulary of all tokens
token_vocab = SimpleVocabulary(save_path='./tmp/tokens.dict',
                               load_path='./tmp/tokens.dict')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# we will use GloVe embeddings
if not os.path.isfile("./glove.6B.100d.txt"):
    simple_download(
        url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
        destination="./glove.6B.100d.txt")
embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt',
                         dim=100,
                         pad_zero=True)
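
# A minimal usage sketch: the embedder maps a tokenized, lowercased batch to
# per-token 100-dimensional GloVe vectors; with pad_zero=True, sentences in a
# batch are zero-padded to a common length.
vectors = embedder(str_lower(tokenizer(['I love machine learning.'])))
print(len(vectors), len(vectors[0]))  # one sentence, one vector per token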