def __init__(self,
             model: SiameseModel,
             batch_size: int,
             num_context_turns: int = 1,
             ranking: bool = True,
             attention: bool = False,
             responses: SimpleVocabulary = None,
             preproc_func: Callable = None,
             interact_pred_num: int = 3,
             *args,
             **kwargs) -> None:
    super().__init__()
    self.batch_size = batch_size
    self.num_context_turns = num_context_turns
    self.ranking = ranking
    self.attention = attention
    self.preproc_responses = []
    self.response_embeddings = None
    self.preproc_func = preproc_func
    self.interact_pred_num = interact_pred_num
    self.model = model
    if self.ranking:
        self.responses = {el[1]: el[0] for el in responses.items()}
        self._build_preproc_responses()
        if not self.attention:
            self._build_response_embeddings()
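# A hypothetical usage sketch: this __init__ matches DeepPavlov's
# SiamesePredictor, so construction would look roughly as below;
# `ranking_model` and `responses_vocab` are placeholders, not objects
# defined in this snippet.
predictor = SiamesePredictor(model=ranking_model,
                             batch_size=32,
                             num_context_turns=1,
                             ranking=True,
                             responses=responses_vocab)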
def __init__(self,
             data: Dict[str, List[Union[str, Path]]],
             load_path: Union[str, Path],
             seed: Optional[int] = None,
             shuffle: bool = True,
             unroll_steps: Optional[int] = None,
             n_gpus: Optional[int] = None,
             max_word_length: Optional[int] = None,
             bos: str = "<S>",
             eos: str = "</S>",
             *args,
             **kwargs) -> None:
    self.unroll_steps = unroll_steps
    self.n_gpus = n_gpus
    self.bos = bos
    self.eos = eos
    self.str_utf8_encoder = StrUTF8Encoder(
        max_word_length=max_word_length,
        pad_special_char_use=True,
        word_boundary_special_char_use=True,
        sentence_boundary_special_char_use=False,
        reversed_sentense_tokens=False,
        bos=self.bos,
        eos=self.eos,
        save_path=load_path,
        load_path=load_path,
    )
    self.simple_vocab = SimpleVocabulary(
        min_freq=2,
        special_tokens=[self.eos, self.bos, "<UNK>"],
        unk_token="<UNK>",
        freq_drop_load=True,
        save_path=load_path,
        load_path=load_path,
    )
    super().__init__(data, seed, shuffle, *args, **kwargs)
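# A hypothetical usage sketch: this __init__ matches DeepPavlov's
# ELMoFilePathsIterator, which takes a dict mapping split names to lists of
# text-file paths; the paths below are placeholders.
iterator = ELMoFilePathsIterator(data={'train': ['./train.txt'],
                                       'valid': ['./valid.txt']},
                                 load_path='./elmo_vocab',
                                 unroll_steps=20,
                                 max_word_length=50)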
import os

import numpy as np
import torch
import torch.cuda as cuda
from torch.nn.functional import softmax
from torch.utils.data import DataLoader

from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from Task_2_work_ver.Task_1_character_lm.plot_loss import plot_loss
from Task_2_work_ver.Task_1_character_lm.get_func import read_infile, Dataset, Padder, Config

base_path = r'C:\Users\Andrey'
experiments_path = r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r\data\Task_1'

train_words = read_infile(os.path.join(base_path, "russian-train-high"))
test_words = read_infile(os.path.join(base_path, "russian-test"))

vocab = SimpleVocabulary(special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
                         unk_token='UNK',
                         save_path=experiments_path)
vocab.fit([list(x) for x in train_words])

config = Config(lr=0.0001, batch_size=512, num_epochs=1000)
net = torch.load(os.path.join(experiments_path, "net.pb"))

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# ============================================================================
# Write a function predict_on_batch that outputs letter probabilities of all words in the batch.
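# A minimal sketch of predict_on_batch, assuming `net` maps a padded batch of
# symbol indices to per-position logits over the vocabulary (that interface is
# an assumption; the snippet does not define the network).
def predict_on_batch(net, batch, device='cpu'):
    """Return letter probabilities for every word in the batch."""
    net.eval()  # disable dropout and other training-only behaviour
    with torch.no_grad():
        logits = net(batch.to(device))   # assumed shape: (batch, seq_len, vocab_size)
        probs = softmax(logits, dim=-1)  # normalize logits into letter probabilities
    return probs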
# check str_lower(['Kaggle is the best place to study machine learning.'])

"""##Tokenizer"""

tokenizer = NLTKMosesTokenizer()

# check tokenizer(['Kaggle is the best place to study machine learning.'])

train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

"""##Vocabulary"""

# initialize a simple vocabulary to collect all classes that appear in the dataset
classes_vocab = SimpleVocabulary(
    save_path='./tmp/classes.dict',
    load_path='./tmp/classes.dict')

classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

# show the classes
list(classes_vocab.items())

# one can also collect a vocabulary of textual tokens that appear 2 or more times in the dataset
token_vocab = SimpleVocabulary(
    save_path='./tmp/tokens.dict',
    load_path='./tmp/tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>')
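# Fit and persist the token vocabulary, mirroring the classes_vocab steps above;
# the most_common call is a quick sanity check of what was collected.
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
token_vocab.freqs.most_common()[:10]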
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('/rapids/notebooks/my_data/BMSTU_hack/')

import numpy as np
import torch
import telebot

import biGRU_model
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from tbot import config

device = torch.device('cpu')

vocab = SimpleVocabulary(
    save_path="/rapids/notebooks/my_data/BMSTU_hack/models/vocab.dict")

# rebuild the model and restore its weights on CPU
gru = biGRU_model.BiGRU(vocab.count, embedding_dim=10, hidden_size=50, device='cpu')
gru.load_state_dict(
    torch.load('/rapids/notebooks/my_data/BMSTU_hack/models/biGRU',
               map_location=device))

bot = telebot.TeleBot(config.token)


@bot.message_handler(content_types=["text"])
def repeat_all_messages(message):
    # minimal echo body, assumed from the handler's name
    bot.send_message(message.chat.id, message.text)
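# Start long polling so the handler above receives updates; none_stop=True is
# standard telebot usage that survives transient network errors.
if __name__ == '__main__':
    bot.polling(none_stop=True)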
from deeppavlov.models.bert.bert_classifier import BertClassifierModel
from deeppavlov.metrics.accuracy import sets_accuracy
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor
from deeppavlov.models.preprocessors.one_hotter import OneHotter
from deeppavlov.models.classifiers.proba2labels import Proba2Labels

reader = BasicClassificationDatasetReader()
data = reader.read(data_path="./stanfordSentimentTreebank",
                   train="/content/train.csv",
                   valid="/content/valid.csv",
                   test="/content/test1.csv",
                   x="original",
                   y="meanGrade")

iterator = BasicClassificationDatasetIterator(data, seed=42, shuffle=True)

bert_preprocessor = BertPreprocessor(
    vocab_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/vocab.txt",
    do_lower_case=False,
    max_seq_length=64)

vocab = SimpleVocabulary(save_path="./binary_classes.dict")
vocab.fit(iterator.get_instances(data_type="train")[1])

one_hotter = OneHotter(depth=vocab.len, single_vector=True)
prob2labels = Proba2Labels(max_proba=True)

bert_classifier = BertClassifierModel(
    n_classes=vocab.len,
    return_probas=True,
    one_hot_labels=True,
    bert_config_file="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_config.json",
    pretrained_bert="~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/bert_model.ckpt",
    save_path="sst_bert_model/model",
    load_path="sst_bert_model/model",
    keep_prob=0.5)
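# A minimal sketch of one training epoch under the usual DeepPavlov
# conventions: texts go through the preprocessor, labels through the
# vocabulary and one-hotter, then into train_on_batch; the batch size here
# is an arbitrary choice.
for x_batch, y_batch in iterator.gen_batches(batch_size=16, data_type="train"):
    bert_classifier.train_on_batch(bert_preprocessor(x_batch),
                                   one_hotter(vocab(y_batch)))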
from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer
from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.sklearn.sklearn_component import SklearnComponent

dr = BasicClassificationDatasetReader().read(data_path='./',
                                             train='train.csv',
                                             valid='valid.csv',
                                             test='test.csv',
                                             x='original',
                                             y='meanGrade')
train_iterator = BasicClassificationDatasetIterator(data=dr, seed=42)

x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:5]:
    print('x:', x)
    print('y:', y)
    print('=================')

tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

classes_vocab = SimpleVocabulary(save_path='./snips/classes.dict',
                                 load_path='./snips/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

token_vocab = SimpleVocabulary(save_path='./snips/tokens.dict',
                               load_path='./snips/tokens.dict',
                               min_freq=2,
                               special_tokens=('<PAD>', '<UNK>',),
                               unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

token_vocab.freqs.most_common()[:10]

# TF-IDF features via SklearnComponent; the arguments below are an assumed
# typical TfidfVectorizer configuration (the original call is truncated)
tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./snips/tfidf.pkl',
    load_path='./snips/tfidf.pkl')
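# Fit the TF-IDF component on the lowercased raw training texts and persist it
# (a sketch assuming the configuration above; TfidfVectorizer expects whole
# strings rather than token lists).
tfidf.fit(str_lower(train_iterator.get_instances(data_type='train')[0]))
tfidf.save()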
# print([(k, len(dr[k])) for k in dr.keys()])

# print a few x, y pairs
x_train, y_train = train_iterator.get_instances(data_type='train')
for x, y in list(zip(x_train, y_train))[:3]:
    print('x:', x)
    print('y:', y)
    print('=================')

# tokenize all input data
tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = str_lower(
    tokenizer(train_iterator.get_instances(data_type='train')[0]))

# get the intent categories
classes_vocab = SimpleVocabulary(save_path='./tmp/classes.dict',
                                 load_path='./tmp/classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()
print(list(classes_vocab.items()))  # display classes

# get the vocabulary of all tokens
token_vocab = SimpleVocabulary(save_path='./tmp/tokens.dict',
                               load_path='./tmp/tokens.dict')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()

# we will use GloVe embeddings
if not os.path.isfile("./glove.6B.100d.txt"):
    simple_download(
        url="http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt",
        destination="./glove.6B.100d.txt")
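# Load the downloaded vectors with DeepPavlov's GloVeEmbedder; dim must match
# the 100-dimensional file fetched above (a sketch; pad_zero makes the
# embedder pad batches with zero vectors).
from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder

embedder = GloVeEmbedder(load_path='./glove.6B.100d.txt',
                         dim=100,
                         pad_zero=True)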
        words.append(temp[1])
    return words

#==================================================
train_words = read_infile(os.path.join(base_path, "russian-train-high"))
dev_words = read_infile(os.path.join(base_path, "russian-dev"))
test_words = read_infile(os.path.join(base_path, "russian-test"))
print(len(train_words), len(dev_words), len(test_words))
print(*train_words[:10])

#==================================================
vocab = SimpleVocabulary(
    special_tokens=('PAD', 'UNK', 'BEGIN', 'END'),
    unk_token='UNK',
    save_path=r'C:\Users\Andrey\Google Диск\courses\DeepPavlov\Task-2-preduct0r'
)
vocab.fit([list(x) for x in train_words])

#==================================================
train_dataset = Dataset(train_words, vocab)
dev_dataset = Dataset(dev_words, vocab)
test_dataset = Dataset(test_words, vocab)

#==================================================
train_batcher = DataLoader(train_dataset, batch_size=1)
dev_batcher = DataLoader(dev_dataset, batch_size=1)
test_batcher = DataLoader(test_dataset, batch_size=1)

# for i, (items, classes) in enumerate(train_batcher):
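# A sketch of consuming the batcher, completing the commented loop above; the
# (items, classes) structure is taken from that comment, and the tensor shapes
# are an assumption about the custom Dataset.
for i, (items, classes) in enumerate(train_batcher):
    print(i, items.shape, classes.shape)
    break  # inspect only the first batch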