def score(self, X, y=None):
    """Evaluates the module on X; returns accuracy and logs loss plus a classification report."""
    test_loss = 0
    criterion = nn.CrossEntropyLoss()
    iter_data = DataLoader(X, batch_size=self.module__batch_size, shuffle=False)  # no shuffling needed for evaluation
    log_exp_run = make_logger(name="experiment_" + self.mode)
    predictions = []
    labels = []
    self.module_.to(self.device)
    self.module_.eval()
    with torch.no_grad():
        for batch in iter_data:
            x_test = batch['features'].type(torch.LongTensor).to(self.device)
            y_test = batch['labels'].type(torch.LongTensor).to(self.device)
            prob = self.module_(x_test)
            loss = criterion(prob, y_test)
            test_loss += loss.item()
            _, predicted = torch.max(prob.data, 1)
            predictions.extend(predicted.cpu().numpy())
            labels.extend(y_test.cpu().numpy())
    # accuracy_score expects (y_true, y_pred)
    accuracy = accuracy_score(labels, predictions)
    log_exp_run.experiments("Cross-entropy loss for each fold: " + str(test_loss))
    log_exp_run.experiments("Accuracy for each fold: " + str(accuracy))
    log_exp_run.experiments("\n" + classification_report(labels, predictions))
    return accuracy
def __init__(self, *args, mode="Adam", **kwargs):
    super().__init__(*args, **kwargs)
    # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.mode = mode
    log_exp_run = make_logger(name="experiment_" + self.mode)
    log_exp_run.experiments("Running on device: " + str(self.device))
    log_exp_run.experiments("Training model by Back-propagation with optimizer: " + mode)
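# A minimal construction sketch, assuming these methods live on a skorch-style
# NeuralNet subclass. The class name `TextClassifierNet` and the module
# `CNNTextClassifier` are hypothetical; the parameters shown are the ones the
# fit/score methods in this file actually read (module__batch_size, lr,
# optimizer__momentum, max_epochs, mode).
#
# net = TextClassifierNet(
#     module=CNNTextClassifier,
#     module__batch_size=32,
#     max_epochs=100,
#     lr=1e-3,
#     optimizer__momentum=0.9,
#     mode="SGD",
# )
# accuracy = net.score(test_data)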
def build_spanish_glove_from_pretrained(url_pretrained_model, url_dictionary):
    """Builds an embedding matrix for the dataset dictionary from Spanish GloVe (SBWC, 300d) vectors."""
    from gensim.models.keyedvectors import KeyedVectors
    wordvectors_file_vec = url_pretrained_model + '/glove-sbwc.i25.vec'
    limit = 100000  # load only the 100,000 most frequent vectors
    wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=limit)
    embedding_dict = {}
    log_exp_run = make_logger()
    start_time = time.time()
    for word in wordvectors.vocab:
        embedding_dict[word] = np.asarray(wordvectors.get_vector(word), dtype='float32')
    log_exp_run.experiments("Loaded spanish word embedding model with GloVe:")
    log_exp_run.experiments("EMBEDDING_SIZE: " + str(wordvectors.vector_size))
    log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(embedding_dict)))
    log_exp_run.experiments("Time elapsed for loading embedding vectors from file: " + str(time.time() - start_time))
    word_index = torch.load(url_dictionary)
    # Out-of-vocabulary words keep a random vector; known words get their GloVe vector
    embedding_matrix = np.random.random((len(word_index) + 1, 300))
    log_exp_run.experiments("Length of dictionary of dataset: " + str(len(word_index)))
    for word, i in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.FloatTensor(embedding_matrix)
def build_glove_from_pretrained(url_pretrained_model, url_dictionary):
    """Builds an embedding matrix for the dataset dictionary from English GloVe (6B, 100d) vectors."""
    embedding_dict = {}
    log_exp_run = make_logger()
    start_time = time.time()
    # glove.6B files are distributed as UTF-8 text
    with open(url_pretrained_model + "/glove.6B.100d.txt", "r", encoding='utf-8') as file_pretrained:
        for line in file_pretrained:
            values = line.split(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_dict[word] = coefs
    log_exp_run.experiments("Loaded word embedding model with GloVe:")
    log_exp_run.experiments("EMBEDDING_SIZE: " + str(len(embedding_dict["the"])))
    log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(embedding_dict)))
    log_exp_run.experiments("Time elapsed for loading embedding vectors from file: " + str(time.time() - start_time))
    word_index = torch.load(url_dictionary)
    # Out-of-vocabulary words keep a random vector; known words get their GloVe vector
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    log_exp_run.experiments("Length of dictionary of dataset: " + str(len(word_index)))
    for word, i in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.FloatTensor(embedding_matrix)
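# A minimal sketch of how the returned matrix is typically consumed: it can
# initialize an nn.Embedding layer whose row indices match the Tokenizer's
# word_index (row 0 serves as the padding index). The surrounding module is an
# assumption, not taken from this repository.
#
# import torch.nn as nn
# weights = build_glove_from_pretrained(wdir + 'utils/pretrained_models',
#                                       wdir + 'datasets/dictionary_20ng_nosw_six_labels')
# embedding = nn.Embedding.from_pretrained(weights, freeze=False, padding_idx=0)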
def on_epoch_end(self, net, **kwargs):
    loss = net.history[-1, 'train_loss']
    self.loss_all.append(loss)
    self.cont_epoch += 1
    # early stopping: when the change in loss between consecutive epochs stays
    # below self.min_diference, decrement the patience counter; otherwise reset it
    if len(self.loss_all) > 1:
        if abs(self.loss_all[self.cont_epoch - 1] - self.loss_all[self.cont_epoch - 2]) < self.min_diference:
            self.cont -= 1
        else:
            self.cont = 10  # reset patience
    if self.cont == 0:
        log_exp_run = make_logger()
        log_exp_run.experiments(self.loss_all)
        raise Overfit_Exception()
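# This hook reads several attributes (loss_all, cont, cont_epoch, min_diference)
# that are initialized elsewhere. A minimal sketch of the enclosing skorch
# callback, assuming those attributes; the class name and the __init__ defaults
# are assumptions, not taken from this repository.
#
# from skorch.callbacks import Callback
#
# class EarlyStoppingByLossDelta(Callback):
#     def __init__(self, min_diference=1e-3, patience=10):
#         self.min_diference = min_diference
#         self.cont = patience   # remaining patience
#         self.cont_epoch = 0    # epochs seen so far
#         self.loss_all = []     # train loss per epoch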
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for six 20-Newsgroups categories."""
    categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware', 'comp.windows.x', 'sci.electronics']
    newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, categories=categories)
    texts = newsgroups_train['data']
    labels = newsgroups_train['target']
    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(list(newsgroups_train.target_names))
    log_exp_run.experiments("Dictionary scheme: ")
    log_exp_run.experiments(list(newsgroups_train.keys()))
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(newsgroups_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(newsgroups_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_20ng_nosw_six_labels'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels
        torch.save(dataset_train, wdir + "/datasets/dataset_train_20ng_nosw_six_labels")
    dataset_test = {'features': [], 'labels': []}
    texts = newsgroups_test['data']
    labels = newsgroups_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_20ng_nosw_six_labels'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels
        torch.save(dataset_test, wdir + "/datasets/dataset_test_20ng_nosw_six_labels")
    if not os.path.exists(wdir + '/datasets/dictionary_20ng_nosw_six_labels'):
        torch.save(word_index, wdir + "/datasets/dictionary_20ng_nosw_six_labels")
def build_word_embedding(url_pretrained_model):
    """Trains (or loads, if already saved) a Word2Vec model on the Brown corpus."""
    model = None
    log_exp_run = make_logger()
    model_path = url_pretrained_model + "/word2vec_" + str(EMBEDDING_SIZE) + ".model"
    if not os.path.exists(model_path):
        # gensim 3.x API: `size` is the embedding dimension, `iter` the epoch count
        model = gensim.models.Word2Vec(brown.sents(), size=EMBEDDING_SIZE, window=WINDOW,
                                       min_count=MIN_COUNT, negative=NEGATIVE_SAMPLING,
                                       iter=EPOCHS, workers=multiprocessing.cpu_count())
        model.save(model_path)
        log_exp_run.experiments("Created and saved word embedding model with:")
    else:
        model = gensim.models.Word2Vec.load(model_path)
        log_exp_run.experiments("Loaded word embedding model with:")
    log_exp_run.experiments("EMBEDDING_SIZE: " + str(EMBEDDING_SIZE))
    log_exp_run.experiments("DICTIONARY LENGTH: " + str(len(model.wv.vocab)))
    return model
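# A short usage sketch of the returned model, assuming the gensim 3.x API the
# function above already uses; the query word is an arbitrary example.
#
# model = build_word_embedding(wdir + 'utils/pretrained_models')
# vector = model.wv["house"]                       # EMBEDDING_SIZE-dim vector for a word
# similar = model.wv.most_similar("house", topn=5) # nearest neighbors by cosine similarity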
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for the YouTube spam-comments corpus."""
    os.chdir('../')
    path_dataset = "C:\\Users\\Laptop\\Desktop\\youtube"
    file_names = ["Youtube01-Psy.csv", "Youtube02-KatyPerry.csv", "Youtube03-LMFAO.csv",
                  "Youtube04-Eminem.csv", "Youtube05-Shakira.csv"]
    X = []
    y = []
    for file_name in file_names:
        with open(path_dataset + "/" + file_name, "r", encoding="utf8") as f:
            lines = f.readlines()[1:]  # skip the CSV header
        for line in lines:
            text, label = extract_text_label(line)
            X.append(text)
            y.append(label)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    log_exp_run = make_logger()
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_youtube_nosw'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels_target
        torch.save(dataset_train, wdir + "/datasets/dataset_train_youtube_nosw")
    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_youtube_nosw'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels_target
        torch.save(dataset_test, wdir + "/datasets/dataset_test_youtube_nosw")
    if not os.path.exists(wdir + '/datasets/dictionary_youtube_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_youtube_nosw")
def fit(self, X, y=None, **fit_params):
    log_exp_run = make_logger(name="experiment_" + self.mode)
    if not self.warm_start or not self.initialized_:
        self.initialize()
    self.X_ = X
    train_loss_acc = []
    self.module_.to(self.device)
    optimizer = self.optimizer_
    criterion = self.criterion_
    iter_data = DataLoader(X, batch_size=self.module__batch_size, shuffle=True)
    # Early-stopping parameters come either directly in fit_params or nested under
    # fit_params['fit_param']; the key spellings are part of the caller contract
    if fit_params.get('fit_param') is None:
        patience = fit_params["patientia"]
        min_difference = fit_params["min_diference"]
    else:
        patience = fit_params["fit_param"]["patientia"]
        min_difference = fit_params["fit_param"]["min_diference"]
    cont_early_stopping = patience
    self.notify('on_train_begin', X=X, y=y)
    log_exp_run.experiments("Run using {} as optimizer".format("Adam" if isinstance(optimizer, Adam) else "SGD"))
    if isinstance(optimizer, Adam):
        log_exp_run.experiments("lr: {}".format(self.lr))
    else:
        log_exp_run.experiments("lr: {} and momentum: {}".format(self.lr, self.optimizer__momentum))
    on_epoch_kwargs = {
        'dataset_train': X,
        'dataset_valid': None,
    }
    for epoch in range(self.max_epochs):
        train_loss = 0
        self.notify('on_epoch_begin', **on_epoch_kwargs)
        for batch in iter_data:
            self.module_.zero_grad()
            x_train = batch['features'].type(torch.LongTensor).to(self.device)
            y_train = batch['labels'].type(torch.LongTensor).to(self.device)
            self.notify("on_batch_begin", X=x_train, y=y_train, training=True)
            prob = self.module_(x_train)
            loss = criterion(prob, y_train)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            self.notify("on_batch_end", X=x_train, y=y_train, training=True)
        log_exp_run.experiments("Epoch ran: " + str(epoch) + " loss: " + str(train_loss))
        train_loss_acc.append(train_loss)
        self.notify('on_epoch_end', **on_epoch_kwargs)
        # early stopping: compare the last two epoch losses
        if len(train_loss_acc) > 1:
            if abs(train_loss_acc[-1] - train_loss_acc[-2]) < min_difference:
                cont_early_stopping -= 1
            else:
                cont_early_stopping = patience
            if cont_early_stopping == 0:
                break
    log_exp_run.experiments("Train loss series:")
    log_exp_run.experiments(train_loss_acc)
    self.notify('on_train_end', X=X, y=y)
    return self
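# A minimal sketch of the two calling conventions this fit accepts, assuming an
# already-constructed estimator `net` (see the construction sketch after
# __init__); the numeric values are arbitrary examples. Note that the key
# spellings "patientia" and "min_diference" are exactly what the method looks up.
#
# net.fit(train_data, patientia=10, min_diference=1e-3)
# net.fit(train_data, fit_param={'patientia': 10, 'min_diference': 1e-3})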
def initialize_module(self, *args, **kwargs):
    super().initialize_module(*args, **kwargs)
    param_length = sum(p.numel() for p in self.module_.parameters() if p.requires_grad)
    log_exp_run = make_logger(name="experiment_" + self.mode)
    log_exp_run.experiments("Amount of trainable parameters: " + str(param_length))
    return self
import torch
import os
from utils.custom_dataloader import CustomDataLoader
from utils.logging_custom import make_logger
from utils.file_arguments_reader import load_param_from_file
from scripts.main_gradient_based import train_model_sgd

if __name__ == "__main__":
    # Load train arguments from file
    os.chdir("../")
    cluster_path = "/home/CLUSTER/uclv_ogtoledano/doctorado/Text_Cat_Based_EDA/"  # only slurm cluster
    wdir = cluster_path if os.path.exists(cluster_path) else os.getcwd() + "/"
    dic_param = load_param_from_file(wdir + "scripts/arguments.txt")
    log_exp_run = make_logger(name=dic_param['name_log_experiments_result'])
    device = "cuda:" + str(dic_param['cuda_device_id']) if torch.cuda.is_available() else "cpu"

    # Load pre-trained word embedding model with specific language: Spanish or English
    if dic_param['word_embedding_pretrained_glove_language'] == 'Spanish':
        tensor_embedding = build_spanish_glove_from_pretrained(wdir + 'utils/pretrained_models',
                                                               wdir + 'datasets/' + dic_param['dataset_dictionary'])
    else:
        tensor_embedding = build_glove_from_pretrained(wdir + 'utils/pretrained_models',
                                                       wdir + 'datasets/' + dic_param['dataset_dictionary'])

    # Create lazy Dataloader from Tensor dataset
    train_data = CustomDataLoader(wdir + 'datasets/' + dic_param['dataset_train'])
    test_data = CustomDataLoader(wdir + 'datasets/' + dic_param['dataset_test'])
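# The DataLoader loops in fit/score iterate over batches shaped like
# {'features': ..., 'labels': ...}, so CustomDataLoader presumably wraps the
# dict serialized by build_dataset_and_dict. A minimal sketch of such a
# Dataset, assuming that file layout; this is not necessarily the repository's
# actual implementation.
#
# import torch
# from torch.utils.data import Dataset
#
# class CustomDataLoader(Dataset):
#     def __init__(self, path):
#         data = torch.load(path)  # {'features': padded sequences, 'labels': targets}
#         self.features = data['features']
#         self.labels = data['labels']
#
#     def __len__(self):
#         return len(self.labels)
#
#     def __getitem__(self, idx):
#         return {'features': self.features[idx], 'labels': self.labels[idx]}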
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for the five-label EcuRed corpus."""
    path_dataset = "C:\\Users\\Laptop\\Desktop\\ecured_five_tags"
    labels = ['ciencia', 'cultura', 'deporte', 'historia', 'salud']
    y = []
    X = []
    for i, label in enumerate(labels):
        folder = path_dataset + "/" + label
        for file in os.listdir(folder):
            total_text = []
            with open(folder + "/" + file, "r", encoding='UTF-8') as f:
                text = f.read()
            parse = parse_documents_from_html_format(text)
            for pattern in parse.data:
                X.append(pattern)
                total_text.append(pattern)
                y.append(i)
            total_length, count_sw = stop_words_count_and_length(total_text)
            print("Found tokens: {} for label: {}, and count stop-words {}".format(total_length, label, count_sw))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    os.chdir("../")
    log_exp_run = make_logger()
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_ecured_nosw'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels_target
        torch.save(dataset_train, wdir + "/datasets/dataset_train_ecured_nosw")
    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_ecured_nosw'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels_target
        torch.save(dataset_test, wdir + "/datasets/dataset_test_ecured_nosw")
    if not os.path.exists(wdir + '/datasets/dictionary_ecured_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_ecured_nosw")
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for the enriched Cora corpus."""
    path_dataset = "C:\\Users\\StarWar\\Desktop\\Cora_enrich"
    file_train = path_dataset + "/texts.txt"
    file_labels = path_dataset + "/labels.txt"
    with open(file_train, "r") as f:
        X = f.readlines()
    with open(file_labels, "r") as f:
        y = f.readlines()
    # Map each distinct label name to an integer index, then replace the
    # label strings with those indices
    vocab = {}
    index = 0
    for i in range(len(y)):
        word = y[i].split("\n")[0].lower()
        if word not in vocab:
            vocab[word] = index
            index += 1
    for i in range(len(y)):
        word = y[i].split("\n")[0].lower()
        y[i] = vocab[word]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=142)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    log_exp_run = make_logger()
    log_exp_run.experiments("Labels: ")
    log_exp_run.experiments(vocab)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_cora_enrich_nosw'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels_target
        torch.save(dataset_train, wdir + "/datasets/dataset_train_cora_enrich_nosw")
    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_cora_enrich_nosw'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels_target
        torch.save(dataset_test, wdir + "/datasets/dataset_test_cora_enrich_nosw")
    if not os.path.exists(wdir + '/datasets/dictionary_cora_enrich_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_cora_enrich_nosw")
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for the AG News corpus."""
    path_dataset = "C:\\Users\\StarWar\\Desktop\\AGnews"
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    labels = ['World', 'Sports', 'Business', 'Sci/Tech']
    file_train = path_dataset + "/train.txt"
    file_test = path_dataset + "/test.txt"
    with open(file_train, "r") as f:
        for line in f:
            text = line.split(',')
            X_train.append(text[1] + " " + text[2])  # title + description
            y_train.append(text[0][1])  # class digit inside the quoted first field, e.g. "3"
    with open(file_test, "r") as f:
        for line in f:
            text = line.split(',')
            X_test.append(text[1] + " " + text[2])
            y_test.append(text[0][1])
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(labels)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_ag_news_nosw'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels_target
        torch.save(dataset_train, wdir + "/datasets/dataset_train_ag_news_nosw")
    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_ag_news_nosw'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels_target
        torch.save(dataset_test, wdir + "/datasets/dataset_test_ag_news_nosw")
    if not os.path.exists(wdir + '/datasets/dictionary_ag_news_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_ag_news_nosw")
def build_dataset_and_dict():
    """Builds and serializes train/test datasets and the dictionary for the BBC News corpus."""
    os.chdir('../')
    path_dataset = "C:\\Users\\Laptop\\Desktop\\bbc"
    X = []
    y = []
    labels = ['business', 'entertainment', 'politics', 'sport', 'tech']
    for i, label in enumerate(labels):
        folder = path_dataset + "/" + label
        for file in os.listdir(folder):
            with open(folder + "/" + file, "r") as f:
                X.append(f.read())
            y.append(i)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=12345)
    bbc_train = {'data': X_train, 'target': y_train}
    bbc_test = {'data': X_test, 'target': y_test}
    texts = bbc_train['data']
    labels_target = bbc_train['target']
    log_exp_run = make_logger()
    log_exp_run.experiments("Categories-labels: ")
    log_exp_run.experiments(labels)
    log_exp_run.experiments("Number of instances for training: ")
    log_exp_run.experiments(len(bbc_train['data']))
    log_exp_run.experiments("Number of instances for testing: ")
    log_exp_run.experiments(len(bbc_test['data']))
    removing_stop_words(texts)
    dataset_train = {'features': [], 'labels': []}
    max_sequence_length = 1000
    max_nb_words = 2000
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    log_exp_run.experiments("Found unique tokens: " + str(len(word_index)))
    wdir = os.getcwd()
    if not os.path.exists(wdir + '/datasets/dataset_train_bbc_news_nosw'):
        dataset_train['features'] = pad_sequences(sequences_train, maxlen=max_sequence_length)
        dataset_train['labels'] = labels_target
        torch.save(dataset_train, wdir + "/datasets/dataset_train_bbc_news_nosw")
    dataset_test = {'features': [], 'labels': []}
    texts = bbc_test['data']
    labels_target = bbc_test['target']
    removing_stop_words(texts)
    sequences_test = tokenizer.texts_to_sequences(texts)
    if not os.path.exists(wdir + '/datasets/dataset_test_bbc_news_nosw'):
        dataset_test['features'] = pad_sequences(sequences_test, maxlen=max_sequence_length)
        dataset_test['labels'] = labels_target
        torch.save(dataset_test, wdir + "/datasets/dataset_test_bbc_news_nosw")
    if not os.path.exists(wdir + '/datasets/dictionary_bbc_news_nosw'):
        torch.save(word_index, wdir + "/datasets/dictionary_bbc_news_nosw")