def __init__(self, cf, mode='train', transform=None):
    # mode: 'train' or 'test'
    self.cf = cf
    self.mode = mode
    self.transform = transform
    if mode == 'train':
        # For some reason, this has to be done this way: train=True here, then test=True below!
        self.data = imdb_dataset(directory=cf.folder_of_data + '/imdb/', train=True)
        # The cleaned text replaces the original text in self.data. Cleaning must be
        # done here, as the text might be used to build the w2v models.
        self.clean_all_text()
        self.load_w2v_models()
    else:
        self.data = imdb_dataset(directory=cf.folder_of_data + '/imdb/', test=True)
        self.clean_all_text()
def preprocess_imdb(train_size: int = 1000, test_size: int = 100) -> dict:
    train_data, test_data = imdb_dataset(train=True, test=True)
    random.shuffle(train_data)
    random.shuffle(test_data)
    train_data = train_data[:train_size]
    test_data = test_data[:test_size]

    train_texts, test_texts = (
        [d["text"] for d in data] for data in (train_data, test_data)
    )
    train_labels, test_labels = (
        [d["sentiment"] for d in data] for data in (train_data, test_data)
    )

    train_tokens, train_tokens_ids = tokenize(train_texts)
    test_tokens, test_tokens_ids = tokenize(test_texts)

    train_y, test_y = (
        np.array(labels) == "pos" for labels in (train_labels, test_labels)
    )

    return {
        "test_labels": test_labels,
        "test_texts": test_texts,
        "test_tokens": test_tokens,
        "test_tokens_ids": test_tokens_ids,
        "test_y": test_y,
        "train_labels": train_labels,
        "train_texts": train_texts,
        "train_tokens": train_tokens,
        "train_tokens_ids": train_tokens_ids,
        "train_y": train_y,
    }
def test_imdb_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that a row is parsed correctly
    train, test = imdb_dataset(directory=directory, test=True, train=True)
    assert len(train) > 0
    assert len(test) > 0
    assert test[0] == {
        'text': "My boyfriend and I went to watch The Guardian.At first I didn't want to watch it, " +
                "but I loved the movie- It was definitely the best movie I have seen in sometime." +
                "They portrayed the USCG very well, it really showed me what they do and I think " +
                "they should really be appreciated more.Not only did it teach but it was a really " +
                "good movie. The movie shows what the really do and how hard the job is.I think " +
                "being a USCG would be challenging and very scary. It was a great movie all around. " +
                "I would suggest this movie for anyone to see.The ending broke my heart but I know " +
                "why he did it. The storyline was great I give it 2 thumbs up. I cried it was very " +
                "emotional, I would give it a 20 if I could!",
        'sentiment': 'pos'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'aclImdb'))
def load_save_docs(cls, out_dir):
    train = imdb_dataset(train=True)
    test = imdb_dataset(test=True)
    train_ = []
    test_ = []
    for td in train:
        sent = normalize_str(td['text'])
        tup = (sent, LABELS[td['sentiment']])
        train_.append(tup)
    for td in test:
        sent = normalize_str(td['text'])
        tup = (sent, LABELS[td['sentiment']])
        test_.append(tup)
    ds = IMDBData()
    ds.build(train_, test_)
    ds.save(out_dir)
def readLang(dataset_title):
    """
    Args:
        dataset_title: either 'imdb' or 'ptb'
    """
    print("Reading lines...")
    if dataset_title == 'imdb':
        train = imdb_dataset(train=True, directory='../data/')
        # Read the dataset and split into lines
        lines = [train[ind]['text'].strip() for ind, doc in enumerate(train)]
        # Normalize lines
        lines = [
            ' '.join(["SOSTOKEN", normalizeString(s), "EOSTOKEN"]) for s in lines
        ]
        lang = Lang(dataset_title)
    elif dataset_title == 'ptb':
        raise NotImplementedError
    return lang, lines
def imdb_to_df(is_train, label_to_idx):
    dset = imdb_dataset(train=is_train, test=not is_train)

    # create one hot encoding of labels
    num_labels = len(label_to_idx)
    all_labels = np.zeros((len(dset.rows), num_labels))
    all_label_indices = [[label_to_idx[row["sentiment"]]] for row in dset.rows]
    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1
    all_labels = all_labels.astype(int)

    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in label_to_idx.keys()]
    cols.extend(label_cols)

    df = pd.DataFrame(columns=cols)
    df["text"] = [row["text"] for row in dset.rows]
    df[label_cols] = all_labels
    return df
def test_imdb_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that a row is parsed correctly
    train, test = imdb_dataset(directory=directory, test=True, train=True)
    assert len(train) > 0
    assert len(test) > 0
    test = sorted(test, key=lambda r: len(r['text']))
    assert test[0] == {
        'text': "This movie was sadly under-promoted but proved to be truly exceptional. Entering " +
                "the theatre I knew nothing about the film except that a friend wanted to see it." +
                "<br /><br />I was caught off guard with the high quality of the film. I couldn't " +
                "image Ashton Kutcher in a serious role, but his performance truly exemplified his " +
                "character. This movie is exceptional and deserves our monetary support, unlike so " +
                "many other movies. It does not come lightly for me to recommend any movie, but in " +
                "this case I highly recommend that everyone see it.<br /><br />This films is Truly " +
                "Exceptional!",
        'sentiment': 'pos'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'aclImdb'))
def __init__(self, is_train: bool, tokenizer):
    # Note: the original called super(ImdbDataset).__init__(), which does not
    # invoke the parent class constructor; super().__init__() does.
    super().__init__()
    self.tokenizer = tokenizer
    self.data = imdb_dataset(train=is_train, test=not is_train)
# random (as rn), numpy (as np) and torch are used below but were not imported
# in the original snippet; they are added here so the example runs as written.
import random as rn

import numpy as np
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from torchnlp.datasets import imdb_dataset
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

train_data, test_data = imdb_dataset(train=True, test=True)
rn.shuffle(train_data)
rn.shuffle(test_data)

train_data = train_data[:1000]
test_data = test_data[:100]

train_texts, train_labels = list(
    zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(
    zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def main():
    train_test = ('imdb', 'tomato') if TRAIN_IMDB else ('tomato', 'imdb')
    print("====training on {} and testing on {}======".format(*train_test))

    """read imdb dataset"""
    train_data, test_data = imdb_dataset(train=True, test=True)
    if not FULL:
        train_valid_data = random.sample(train_data, 10000)
        train_data, valid_data = train_valid_data[:8000], train_valid_data[8000:]
        test_data = random.sample(test_data, 2000)
    else:
        test_valid_data = random.sample(test_data, 4000)
        valid_data, test_data = test_valid_data[:2000], test_valid_data[2000:]
    train_dataset, valid_dataset, test_dataset = SentDataset(train_data), SentDataset(valid_data), SentDataset(test_data)
    trainIteration = data.DataLoader(dataset=train_dataset, collate_fn=sort_batch, batch_size=50, shuffle=True)
    validIteration = data.DataLoader(dataset=valid_dataset, collate_fn=sort_batch, batch_size=50)
    testIteration = data.DataLoader(dataset=test_dataset, collate_fn=sort_batch, batch_size=50)

    """read tomato dataset"""
    with open("../data/rotten_tomato_train.json", "r") as read_file:
        tomato_train_data = json.load(read_file)
    with open("../data/rotten_tomato_dev.json", "r") as read_file:
        tomato_valid_data = json.load(read_file)
    with open("../data/rotten_tomato_test.json", "r") as read_file:
        tomato_test_data = json.load(read_file)
    tomato_train_dataset, tomato_valid_dataset, tomato_test_dataset = SentDataset(tomato_train_data), SentDataset(tomato_valid_data), SentDataset(tomato_test_data)
    tomato_trainIteration = data.DataLoader(dataset=tomato_train_dataset, collate_fn=sort_batch, batch_size=50, shuffle=True)
    tomato_validIteration = data.DataLoader(dataset=tomato_valid_dataset, collate_fn=sort_batch, batch_size=50)
    tomato_testIteration = data.DataLoader(dataset=tomato_test_dataset, collate_fn=sort_batch, batch_size=50)

    """create model"""
    model = BERT_biLSTM(HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
    model = model.to(device)
    if device == 'cuda':
        model = nn.DataParallel(model)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(device)
    best_valid_loss = float('inf')

    """start training"""
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc = train(model, trainIteration if TRAIN_IMDB else tomato_trainIteration, optimizer, criterion, epoch)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        valid_loss, valid_acc = evaluate(model, validIteration if TRAIN_IMDB else tomato_validIteration, criterion)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            if SAVE:
                torch.save(model.state_dict(), '../save/robust_BERT_model_{}.pt'.format('imdb' if TRAIN_IMDB else 'tomato'))
        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')

    """start testing on other dataset"""
    print("=====test result on own dataset=====")
    test_loss, test_acc = evaluate(model, testIteration if TRAIN_IMDB else tomato_testIteration, criterion)
    print(f'\t Test. Loss: {test_loss:.3f} | Test. Acc: {test_acc * 100:.2f}%')
    print()
    print("=====test result on other dataset=====")
    test_loss, test_acc = evaluate(model, tomato_testIteration if TRAIN_IMDB else testIteration, criterion)
    print(f'\t Test. Loss: {test_loss:.3f} | Test. Acc: {test_acc * 100:.2f}%')
def training(batch_size, epoch_size, filename):
    '''very unethical way of loading and training the data in the same function'''
    pd.set_option('display.max_columns', None)
    train_data, test_data = imdb_dataset(train=True, test=True)
    df = pd.read_csv("./data/fake.csv")
    df = df[['text', 'type']]
    # print(len(df))
    # print(Counter(df['type'].values))
    df = df[df['type'].isin(['fake', 'satire'])]
    df.dropna(inplace=True)
    df_fake = df[df['type'] == 'fake']
    df_statire = df[df['type'] == 'satire']
    df_statire = df_statire.sample(n=len(df_fake))
    df = df_statire.append(df_fake)
    df = df.sample(frac=1, random_state=24).reset_index(drop=True)
    # print(Counter(df['type'].values))
    train_data = df.head(19)
    test_data = df.tail(19)
    # print(train_data)
    # Pair each text with its own label. The original nested comprehension
    # iterated over texts and types independently, which produced the full
    # cross product and mismatched labels.
    train_data = [{
        'text': text,
        'type': type_data
    } for text, type_data in zip(list(train_data['text']), list(train_data['type']))]
    test_data = [{
        'text': text,
        'type': type_data
    } for text, type_data in zip(list(test_data['text']), list(test_data['type']))]
    train_texts, train_labels = list(
        zip(*map(lambda d: (d['text'], d['type']), train_data)))
    test_texts, test_labels = list(
        zip(*map(lambda d: (d['text'], d['type']), test_data)))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    train_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
    test_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))
    train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
    test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
    train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
    test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
    train_y = np.array(train_labels) == 'fake'
    test_y = np.array(test_labels) == 'fake'

    BATCH_SIZE = batch_size
    EPOCHS = epoch_size
    train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
    test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
    train_masks_tensor = torch.tensor(train_masks)
    test_masks_tensor = torch.tensor(test_masks)
    train_tokens_tensor = torch.tensor(train_tokens_ids)
    train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
    test_tokens_tensor = torch.tensor(test_tokens_ids)
    test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

    train_dataset = torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
    test_dataset = torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
    test_sampler = torch.utils.data.SequentialSampler(test_dataset)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

    bert_clf = BertBinaryClassifier()
    optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
    for epoch_num in range(EPOCHS):
        bert_clf.train()
        train_loss = 0
        for step_num, batch_data in enumerate(train_dataloader):
            token_ids, masks, labels = tuple(t for t in batch_data)
            probas = bert_clf(token_ids, masks)
            loss_func = nn.BCELoss()
            batch_loss = loss_func(probas, labels)
            train_loss += batch_loss.item()
            bert_clf.zero_grad()
            batch_loss.backward()
            optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
    torch.save(bert_clf, filename)
    return
from slp.plbind.module import RnnPLModule
from slp.plbind.trainer import make_trainer, watch_model
from slp.util.log import configure_logging

MAX_LENGTH = 1024
collate_fn = SequenceClassificationCollator(device="cpu", max_length=MAX_LENGTH)
# collate_fn = SequenceClassificationCollator(device="cpu")

if __name__ == "__main__":
    pl.utilities.seed.seed_everything(seed=42)

    EXPERIMENT_NAME = "imdb-words-sentiment-classification"
    configure_logging(f"logs/{EXPERIMENT_NAME}")

    train, test = imdb_dataset(directory="./data/", train=True, test=True)
    raw_train = [d["text"] for d in train]
    labels_train = [d["sentiment"] for d in train]
    raw_test = [d["text"] for d in test]
    labels_test = [d["sentiment"] for d in test]

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        test=raw_test,
        test_labels=labels_test,
        batch_size=64,
        batch_size_eval=32,
        collate_fn=collate_fn,
tokenizer = SpacyTokenizer()
to_token_ids = ToTokenIds(word2idx)
to_tensor = ToTensor(device='cpu')


def create_dataloader(d):
    d = (DatasetWrapper(d).map(tokenizer).map(to_token_ids).map(to_tensor))
    return DataLoader(d,
                      batch_size=32,
                      num_workers=1,
                      pin_memory=True,
                      shuffle=True,
                      collate_fn=collate_fn)


train_loader, dev_loader = map(
    create_dataloader,
    imdb_dataset(directory='../data/', train=True, test=True))

model = Classifier(
    WordRNN(256,
            embeddings,
            bidirectional=True,
            merge_bi='cat',
            packed_sequence=True,
            attention=True,
            device=DEVICE), 512, 3)

optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)
criterion = nn.CrossEntropyLoss()
metrics = {'accuracy': Accuracy(), 'loss': Loss(criterion)}
trainer = SequentialTrainer(
#!/usr/bin/python
import numpy as np
import xgboost as xgb
import pandas as pd

from torchnlp.datasets import imdb_dataset

# Load the imdb training dataset
train = imdb_dataset(train=True)
train[0]  # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'}
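The snippet above only loads the training split and imports xgboost and pandas without using them. A minimal sketch of how it could continue follows, assuming scikit-learn is available for TF-IDF features; the vectorizer and classifier settings are illustrative, not taken from the original source.

# Hypothetical continuation: TF-IDF features + the xgboost classifier imported
# above. scikit-learn is an added dependency, not part of the original snippet.
from sklearn.feature_extraction.text import TfidfVectorizer

test = imdb_dataset(test=True)

vectorizer = TfidfVectorizer(max_features=20000)
X_train = vectorizer.fit_transform([row['text'] for row in train])
X_test = vectorizer.transform([row['text'] for row in test])
y_train = np.array([row['sentiment'] == 'pos' for row in train], dtype=int)
y_test = np.array([row['sentiment'] == 'pos' for row in test], dtype=int)

clf = xgb.XGBClassifier(n_estimators=200, max_depth=6)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))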
def prepare_data_bert(batch_size):
    """:returns train and test loader for the IMDB dataset formatted correctly for BERT,
    each item in the dataset is in the form (token_ids, masks, labels)"""
    print('Loading IMDB data...')
    train_data, test_data = imdb_dataset(train=True, test=True)
    rn.shuffle(train_data)
    rn.shuffle(test_data)
    train_data = train_data[:1000]
    test_data = test_data[:100]
    train_texts, train_labels = list(
        zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
    test_texts, test_labels = list(
        zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

    print('Tokenizing for BERT')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    train_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
    test_tokens = list(
        map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))
    print(list(map(tokenizer.convert_tokens_to_ids, train_tokens))[0])
    train_tokens_ids = pad_sequences(list(
        map(tokenizer.convert_tokens_to_ids, train_tokens)),
                                     maxlen=512,
                                     truncating='post',
                                     padding='post',
                                     dtype='int')
    # print(train_tokens_ids[0])
    test_tokens_ids = pad_sequences(list(
        map(tokenizer.convert_tokens_to_ids, test_tokens)),
                                    maxlen=512,
                                    truncating='post',
                                    padding='post',
                                    dtype='int')

    train_y = np.array(np.array(train_labels) == 'pos', dtype=np.uint8)
    test_y = np.array(np.array(test_labels) == 'pos', dtype=np.uint8)
    train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

    train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
    test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

    train_tokens_tensor = torch.tensor(train_tokens_ids)
    train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
    test_tokens_tensor = torch.tensor(test_tokens_ids)
    test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()
    train_masks_tensor = torch.tensor(train_masks)
    test_masks_tensor = torch.tensor(test_masks)

    train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)
    return train_dataloader, test_dataloader
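Per the docstring above, each batch from the returned loaders is a (token_ids, masks, labels) tuple. A brief, hypothetical usage sketch (the batch size and the loop body are illustrative, not part of the original snippet):

# Hypothetical usage of the loaders returned by prepare_data_bert.
train_dataloader, test_dataloader = prepare_data_bert(batch_size=16)
for token_ids, masks, labels in train_dataloader:
    # token_ids and masks have shape (batch_size, 512); labels has shape (batch_size, 1)
    pass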
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False, append_sos=False,
             append_eos=False, clean_txt=False, max_seq_len_prior=None):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = ['pos', 'neg']

    if normal_class == -1:
        self.normal_classes = classes
        self.outlier_classes = []
    else:
        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

    if root not in nltk.data.path:
        nltk.data.path.append(root)

    # Load the imdb dataset
    self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.remove('sentiment')
    self.test_set.columns.remove('sentiment')
    self.train_set.columns.add('label')
    self.test_set.columns.add('label')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    test_n_idx = []  # subsetting test_set to selected normal classes
    test_a_idx = []  # subsetting test_set to selected anomalous classes
    for i, row in enumerate(self.test_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            test_n_idx.append(i)
        else:
            test_a_idx.append(i)
        row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)
    # Subset test_set to selected normal classes
    self.test_n_set = Subset(self.test_set, test_n_idx)
    # Subset test_set to selected anomalous classes
    self.test_a_set = Subset(self.test_set, test_a_idx)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    self.max_seq_len = 0
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])
        if len(row['text']) > self.max_seq_len:
            self.max_seq_len = len(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i

    # length prior
    sent_lengths = [len(row['text']) for row in self.train_set]
    sent_lengths_freq = np.bincount(np.array(sent_lengths))
    sent_lengths_freq = np.concatenate(
        (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
    sent_lengths_freq = sent_lengths_freq + 1
    self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
train = smt_dataset(train=True, fine_grained=True)
valid = smt_dataset(dev=True, fine_grained=True)
test = smt_dataset(test=True, fine_grained=True)

train_labels = create_SMT_labels(train, len(train))
train_text = np.array(train.__getitem__('text'))
valid_labels = create_SMT_labels(valid, len(valid))
valid_text = np.array(valid.__getitem__('text'))
test_labels = create_SMT_labels(test, len(test))
test_text = np.array(test.__getitem__('text'))

np.save('sst_train_text', train_text)
np.save('sst_train_labels', train_labels)
np.save('sst_valid_text', valid_text)
np.save('sst_valid_labels', valid_labels)
np.save('sst_test_text', test_text)
np.save('sst_test_labels', test_labels)

train = imdb_dataset(train=True)
test = imdb_dataset(test=True)

train_labels = create_IMDB_labels(train, len(train))
test_labels = create_IMDB_labels(test, len(test))
train_text = np.array(train.__getitem__('text'))
test_text = np.array(test.__getitem__('text'))

np.save('imdb_train_text', train_text)
np.save('imdb_train_labels', train_labels)
np.save('imdb_test_text', test_text)
np.save('imdb_test_labels', test_labels)
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False, append_sos=False,
             append_eos=False, clean_txt=False):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = ['pos', 'neg']

    if normal_class == -1:
        self.normal_classes = classes
        self.outlier_classes = []
    else:
        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

    # Load the imdb dataset
    self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.remove('sentiment')
    self.test_set.columns.remove('sentiment')
    self.train_set.columns.add('label')
    self.test_set.columns.add('label')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    for i, row in enumerate(self.test_set):
        row['label'] = row.pop('sentiment')
        row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i
NUM_CLASSES = 2
BATCH_SIZE = 100
LEARNING_RATE = 0.003

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if init == True:
    sentiment = {
        'pos': 1,
        'neg': 0,
    }
    train_texts = [
        text_to_word_sequence(data['text'])
        for data in tqdm(imdb_dataset(train=True))
    ]
    train_labels = [
        sentiment[data['sentiment']] for data in imdb_dataset(train=True)
    ]
    test_texts = [
        text_to_word_sequence(data['text'])
        for data in tqdm(imdb_dataset(test=True))
    ]
    test_labels = [
        sentiment[data['sentiment']] for data in imdb_dataset(test=True)
    ]
    # test = imdb_dataset(test=True)