    - test_texts
    - test_labels
    :return:
    """
    pass


def tokenizer(texts):
    # Create a tokenizer function
    tok = TextTokenizer('en')
    return tok.process_all(texts)


if __name__ == "__main__":
    # 1. Download data
    # untar_data(URI)

    # 2. Read data and save it in a 'normal' format: text, label
    # texts, labels, label_index = parse_text_data()
    # df = pd.DataFrame.from_dict({'text': texts, 'label': labels})
    # df.to_csv('./data/20_newsgroup.csv', index=None)

    # 3. Tokenize the text to create a vocabulary
    df = pd.read_csv('./data/20_newsgroup.csv')
    tokens = tokenizer(df[:10]['text'].tolist())
    vocab = Vocab.create(tokens, max_vocab=1000, min_freq=2)
    print(vocab.itos)
    print(vocab.stoi)

    # 4. Create an embedding matrix from pretrained word vectors
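    # A minimal sketch of step 4 (not in the original script): it assumes
    # `import numpy as np` at the top of this file and GloVe-style vectors
    # in a hypothetical local file './data/glove.6B.100d.txt' (one token
    # followed by its floats per line). Row i of the matrix holds the
    # pretrained vector for vocab.itos[i]; tokens absent from the file
    # keep zero rows, which an embedding layer can later fine-tune.
    embedding_dim = 100
    embedding_matrix = np.zeros((len(vocab.itos), embedding_dim))
    with open('./data/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in vocab.stoi:
                embedding_matrix[vocab.stoi[word]] = np.asarray(vector, dtype='float32')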
import numpy
import torch
from torch.nn import functional
from torch import nn
from fastai.text.transform import Vocab
import unidecode
import string

# Taken from https://gist.github.com/jvns/b6dda36b2fdcc02b833ed5b0c7a09112
# Download Hans Christian Andersen's fairy tales:
# !wget -O fairy-tales.txt https://www.gutenberg.org/cache/epub/27200/pg27200.txt > /dev/null 2>&1
file = unidecode.unidecode(open('fairy-tales.txt').read())

# Remove the table of contents & Gutenberg preamble
text = file[5000:]

# Build a character-level vocabulary over the raw text
v = Vocab.create((x for x in text), max_vocab=400, min_freq=1)
num_letters = len(v.itos)

# training_set = torch.Tensor(v.numericalize([x for x in text])).type(torch.LongTensor).cuda()
training_set = torch.Tensor(v.numericalize([x for x in text])).type(torch.LongTensor)
training_set = training_set[:100000]


class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.h2o = nn.Linear(hidden_size, input_size)
        self.input_size = input_size
        self.hidden = None
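    # A hedged sketch of the forward pass this class still needs (the
    # source gist defines one; this is a minimal reconstruction, not a
    # verbatim copy): one-hot encode the character indices, run them
    # through the LSTM while carrying hidden state between calls, and
    # project the outputs back to per-letter logits.
    def forward(self, input):
        # (batch, seq) indices -> (batch, seq, input_size) one-hot floats
        x = functional.one_hot(input, num_classes=self.input_size).float()
        if self.hidden is None:
            output, self.hidden = self.lstm(x)
        else:
            output, self.hidden = self.lstm(x, self.hidden)
        # Detach so truncated BPTT doesn't backprop through earlier batches
        self.hidden = tuple(h.detach() for h in self.hidden)
        return self.h2o(output)

# Example usage (hypothetical sizes): model = MyLSTM(num_letters, 128)
# logits = model(training_set[:50].unsqueeze(0))  # shape (1, 50, num_letters)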