def run(df, fold):
    """
    Run training and validation for a given fold and dataset.

    :param df: pandas dataframe with a kfold column
    :param fold: current fold, int
    """
    # fetch training and validation dataframes for this fold
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    # fit a keras tokenizer on the full corpus so that train and
    # validation share one vocabulary
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert reviews to integer index sequences
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero-pad / truncate sequences to a fixed length
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=config.MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=config.MAX_LEN)

    # torch datasets and loaders for training and validation
    train_dataset = dataset.IMDBDataset(reviews=xtrain,
                                        targets=train_df.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2)
    valid_dataset = dataset.IMDBDataset(reviews=xtest,
                                        targets=valid_df.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    print('Loading Embeddings')
    embedding_dict = load_vectors('./crawl-300d-2M.vec')
    print('Embeddings Loaded')
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,
                                               embedding_dict)

    # NOTE(review): hard-coded cuda — crashes on CPU-only machines;
    # confirm a GPU is always available where this runs
    device = torch.device('cuda')
    model = lstm.LSTM(embedding_matrix)
    model.to(device)

    # BUGFIX: lr was 0.1, far too high for Adam on an LSTM (training
    # would diverge); use 1e-3, matching the other run() variants
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print('Training model')
    best_accuracy = 0
    early_stopping_counter = 0
    for epoch in range(config.EPOCHS):
        # train one epoch, then evaluate on the validation loader
        engine.train(train_data_loader, model, optimizer, device)
        outputs, targets = engine.evaluate(valid_data_loader, model, device)
        # threshold model scores at 0.5 to get binary predictions
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print('Fold: ', fold, ' EPOCH: ', epoch, ' Accuracy Score: ', accuracy)
        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
            # BUGFIX: the counter was incremented but never checked, so
            # early stopping never triggered; stop after 3 epochs without
            # improvement (matches the other run() variants in this file)
            if early_stopping_counter > 2:
                break
def run(df, fold): """ Run training and validation for a given fold and dataset :param df: pandas dataframe with kfold column :param fold: current fold, int """ # fetch training dataframe train_df = df[df.kfold != fold].reset_index(drop=True) # fetch validation dataframe valid_df = df[df.kfold == fold].reset_index(drop=True) print("Fitting tokenizer") # we use tf.keras for tokenization # you can use your own tokenizer and then you can # get rid of tensorflow tokenizer = tf.keras.preprocessing.text.Tokenizer() tokenizer.fit_on_texts(df.review.values.tolist()) # convert training data to sequences # for example : "bad movie" gets converted to # [24, 27] where 24 is the index for bad and 27 is the # index for movie xtrain = tokenizer.texts_to_sequences(train_df.review.values) xtest = tokenizer.texts_to_sequences(valid_df.review.values) # zero pad the training/validation sequences given the maximum length # this padding is done on left hand side # if sequence is > MAX_LEN, it is truncated on left hand side too xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=config.MAX_LEN) xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=config.MAX_LEN) # initialize dataset class for training train_dataset = dataset.IMDBDataset(reviews=xtrain, targets=train_df.sentiment.values) # create torch dataloader for training # torch dataloader loads the data using dataset # class in batches specified by batch size train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2) # initialize dataset class for validation valid_dataset = dataset.IMDBDataset(reviews=xtest, targets=valid_df.sentiment.values) # create torch dataloader for validation valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) print("Loading embeddings") # load embeddings as shown previously embedding_dict = load_vectors("../input/crawl-300d-2M.vec") embedding_matrix = 
create_embedding_matrix(tokenizer.word_index, embedding_dict) # create torch device, since we use gpu, we are using cuda device = torch.device("cuda") # fetch our LSTM model model = lstm.LSTM(embedding_matrix) # send model to device model.to(device) # initialize Adam optimizer optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) print("Training Model") # set best accuracy to zero best_accuracy = 0 # set early stopping counter to zero early_stopping_counter = 0 # train and validate for all epochs for epoch in range(config.EPOCHS): # train one epoch engine.train(train_data_loader, model, optimizer, device) # validate outputs, targets = engine.evaluate(valid_data_loader, model, device) # use threshold of 0.5 # please note we are using linear layer and no sigmoid # you should do this 0.5 threshold after sigmoid outputs = np.array(outputs) >= 0.5 # calculate accuracy accuracy = metrics.accuracy_score(targets, outputs) print(f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}") # simple early stopping if accuracy > best_accuracy: best_accuracy = accuracy else: early_stopping_counter += 1 if early_stopping_counter > 2: break
def run(df, fold): """ Run training and validation for a given fold and dataset Args: df: pandas dataset with kfold column fold: current forl, int """ # fetch training df train_df = df[df["kfold"] != fold].reset_index(drop=True) # fetch validation df valid_df = df[df["kfold"] == fold].reset_index(drop=True) print("Fitting tokenizer") tokenizer = tf.keras.preprocessing.text.Tokenizer() tokenizer.fit_on_texts(df.review.values.tolist()) # convert training data to sequences xtrain = tokenizer.texts_to_sequences(train_df.review.values) # convert validation data to sequences xtest = tokenizer.texts_to_sequences(valid_df.review.values) # zero pad the training sequences given the maximum length xtrain = tf.keras.preprocessing.sequence.pad_sequences( xtrain, maxlen=config.MAX_LEN) # zero pad the validation sequences given the maximum length xtest = tf.keras.preprocessing.sequence.pad_sequences( xtest, maxlen=config.MAX_LEN) # initialize dataset class for training train_dataset = dataset.IMDBDataset(reviews=xtrain, targets=train_df.sentiment.values) # create torch dataloader for training # torch dataloader load the data using dataset class # in batches specified by batch size train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2) # initialize dataset class for validation valid_dataset = dataset.IMDBDataset(reviews=xtest, targets=valid_df.sentiment.values) # create torch dataloader for validation valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2) print("Loading embeddings") embedding_dict = load_vectors("../input/crawl-300d-2M.vec") embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict) # create torch device device = torch.device("cpu") # send model to device model.to(device) # initialize optimizer optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) print("Training model") best_acuraccy = 0 early_stopping_counter = 0 # 
train and validate for all epochs for epoch in range(config.EPOCHS): # train one epoch engine.train(train_data_loader, model, optimizer, device) # validate outpust, targets = engine.evaluate(valid_data_loader, model, device) # use threshold of 0.5 outputs = np.array(outputs) >= 0.5 # calculate accuracy accuracy = metrics.accuracy_score(targets, outputs) print(f"Fold: {fold}, Epoch: {epoch}, Accuracy Score:{accuracy}") # simple early stoping if accuracy > best_acuraccy: best_acuraccy = accuracy else: early_stopping_counter += 1 if early_stopping_counter > 2: break
def run(df, fold): """ Run training and validation for a given fold & dataset :param df: pandas dataframe with kfold column :param fold: current fold, int """ # fetch training dataframe df_train = df[df.kfold != fold].reset_index(drop=True) # fetch validation dataframe df_valid = df[df.kfold == fold].reset_index(drop=True) tokenizer = tf.keras.preprocessing.text.Tokenizer() tokenizer.fit_on_texts(df_train.review.values) x_train = tokenizer.texts_to_sequences(df_train.review.values) x_valid = tokenizer.texts_to_sequences(df_valid.review.values) x_train = tf.keras.preprocessing.sequence.pad_sequences( x_train, maxlen=config.MAXLEN) x_valid = tf.keras.preprocessing.sequence.pad_sequences( x_valid, maxlen=config.MAXLEN) #* embedding_dict: dictionary with word:embedding_vectors embedding_dict = load_vectors( "../input/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec") #* word_index: dictionary with word:idx -- {'the': 1, 'cat': 2, 'sat': 3, 'on': 4} word_index = tokenizer.word_index #* embedding matrix: a dictionary with idx:embedding_vector embedding_matrix = create_embedding_matrix(word_index, embedding_dict) model = lstm.LSTM(embedding_matrix) optimizer = torch.optim.Adam(model.parameters, lr=1e-3) # check if GPU is available else run on CPU. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device = torch.device("cpu") train_dataset = dataset.IMDBDataset(reviews=x_train, targets=df_train.sentiment.values) valid_dataset = dataset.IMDBDataset(reviews=x_valid, targets=df_valid.sentiment.values) train_data_loader = torch.utils.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4) valid_data_loader = torch.utils.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2) best_accuracy = 0 early_stopping_counter = 0 for epoch in range(config.EPOCHS): engine.train(train_data_loader, model, optimizer, device) preds, targets = engine.evaluate(valid_data_loader, model, optimizer, device) print(f"preds----{preds}") preds = np.array(preds) >= 0.5 accuracy = metrics.accuracy_score(preds, targets) print(f"Fold:{fold}, Epoch: {epoch}, Accuracy: {accuracy}") # simple early stopping if accuracy > best_accuracy: best_accuracy = accuracy else: early_stopping_counter += 1 if early_stopping_counter > 2: break
def run(df, fold): """ Run training and validation for a given fold and dataset :param df: pandas dataframe with kfold column :param fold: current fold, int """ train_df = df[df.kfold != fold].reset_index(drop=True) valid_df = df[df.kfold == fold].reset_index(drop=True) print("Fitting tokenizer") # we use tf.keras for tokenization tokenizer = tf.keras.preprocessing.text.Tokenizer() tokenizer.fit_on_texts(df.review.values.tolist()) X_train = tokenizer.texts_to_sequences(train_df.review.values) X_valid = tokenizer.texts_to_sequences(valid_df.review.values) # zero pad the training/validation sequences X_train = tf.keras.preprocessing.sequence.pad_sequences( X_train, maxlen=config.MAX_LEN) X_valid = tf.keras.preprocessing.sequence.pad_sequences( X_valid, maxlen=config.MAX_LEN) # initialize dataset class for training/validation train_dataset = dataset.IMDBDataset(reviews=X_train, targets=train_df.sentiment.values) valid_dataset = dataset.IMDBDataset(reviews=X_valid, targets=valid_df.sentiment.values) # create torch dataloader for training/validation train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=2) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1) print("Loading embeddings") embedding_dict = load_vectors("../input/wiki-news-300d-1M.vec") embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict) # create torch device, since we use gpu, we are using cuda device = torch.device("cpu") # fetch our LSTM model model = lstm.LSTM(embedding_matrix) # send model to device model.to(device) # initialize Adam optimizer optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) print("Training Model") # set best accuracy to zero best_accuracy = 0 # set early stopping counter to zero early_stopping_counter = 0 # train and validate for all epochs best_accuracy = 0 early_stopping_counter = 0 for epoch in range(config.EPOCHS): # train one 
epoch engine.train(train_data_loader, model, optimizer, device) # validate outputs, targets = engine.evaluate(valid_data_loader, model, device) # use threshold of 0.5 outputs = np.array(outputs) >= 0.5 # calculate accuracy accuracy = metrics.accuracy_score(targets, outputs) print(f"FOLD:{fold}, Epoch: {epoch}, " f"Accuracy Score = {accuracy}") # simple early stopping if accuracy > best_accuracy: best_accuracy = accuracy else: early_stopping_counter += 1 if early_stopping_counter > 2: break