def run():
    """End-to-end training loop for the CrowdFlower relevance model."""
    # Load the raw training data and carve out a 10% validation split.
    data_df = pd.read_csv('../input/train.csv')
    train_df, valid_df = train_test_split(data_df, random_state=42, test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # Targets are the median relevance labels.
    train_y = train_df['median_relevance'].values
    valid_y = valid_df['median_relevance'].values

    def _make_dataset(frame, targets):
        # Build a dataset from the query / title / description columns.
        return CrowdFlowerDataset(
            query=frame['query'].values,
            prod_title=frame['product_title'].values,
            prod_description=frame['product_description'].values,
            targets=targets)

    train_dataset = _make_dataset(train_df, train_y)
    valid_dataset = _make_dataset(valid_df, valid_y)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configs.TRAIN_BATCH_SIZE, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=configs.VALID_BATCH_SIZE, shuffle=False)

    # Total optimizer steps drive the linear LR schedule.
    num_train_steps = int(
        len(train_dataset) / configs.TRAIN_BATCH_SIZE * configs.EPOCHS)

    device = configs.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = configs.OPTIMIZER(model.parameters(), lr=configs.LR)
    scheduler = configs.SCHEDULER(optimizer,
                                  num_warmup_steps=0,
                                  num_training_steps=num_train_steps)

    for epoch in range(configs.EPOCHS):
        epoch_start = time.time()
        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer,
                                         scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(valid_dataloader,
                                                          model)
        epoch_time_elapsed = (time.time() - epoch_start) / 60.0
        print(f'time take to run a epoch - {epoch_time_elapsed}')
        print(
            f'Epoch - Training loss - {epoch_train_loss} Valid loss - {epoch_valid_loss}'
        )
        qw_kappa = quadratic_weighted_kappa(targets.flatten(),
                                            outputs.flatten())
        print(f'Quadratic Weighted Kappa: {qw_kappa}')
def run():
    """Fine-tune BERT for binary sentiment classification.

    Fixes vs. the original:
    * the optimizer parameter groups used the key ``'weight'`` (silently
      ignored by AdamW) — they now use ``'weight_decay'``, with 0.0 for the
      bias/LayerNorm group, matching the sibling training scripts;
    * the model is moved to ``device`` before training (it was previously
      left on the CPU even though ``device`` was passed to the engine).
    """
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # Binarize the label: "positive" -> 1, anything else -> 0.
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
    model = BERTBaseUncased()
    model.to(device)  # BUG FIX: model was never moved to the device

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,  # BUG FIX: key was 'weight' (ignored)
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)  # enable when training on multiple GPUs
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5  # threshold sigmoid outputs
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy score: {accuracy}")
        if accuracy > best_accuracy:
            # Persist only the best-performing checkpoint.
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Train a BERT sentiment classifier on a 2000-row subset via BERTTrainer."""
    print('1.Loading data...')
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx = dfx[:2000]  # only train 2000 entries
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    # Stratified split keeps the class balance in both partitions.
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    print('Creating dataset...')
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    print('Creating dataloader...')
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    print('Building Bert Model...')
    model = BERTBaseUncased()

    print("Creating BERT Trainer...")
    trainer = BERTTrainer(model=model,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          lr=config.LR,
                          with_cuda=config.USE_CUDA)
    # model = nn.DataParallel(model)

    print('Training Start...')
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_acc, train_loss = trainer.train_fn(epoch, len(df_train))
        print(f'Train loss: {train_loss} Train accuracy: {train_acc:.4%}')
        outputs, targets = trainer.eval_fn()
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy:.2%}")
        if accuracy > best_accuracy:
            # Keep only the best checkpoint seen so far.
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Train BERTBaseUncased with the Lightning Trainer on the sentiment data."""
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    # Stratified split: same ratio of +ve and -ve samples on both sides.
    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
    )

    model = BERTBaseUncased()
    # Lightning owns device placement and the optimization loop.
    trainer = Trainer(gpus=1)
    trainer.fit(model,
                train_dataloader=train_data_loader,
                val_dataloaders=[valid_data_loader])
def sentence_prediction(sentence):
    """Score a single sentence with the saved BERT model.

    Returns the first element of the engine's prediction output.
    BUG FIX: the original unpacked with ``outputs, [] = ...`` — that only
    succeeds when the second returned value is an empty sequence, and raises
    otherwise; a throwaway name is used instead.
    """
    sentence = preprocess(sentence)
    model_path = config.MODEL_PATH

    # Target is a dummy value — only the review text matters at inference.
    test_dataset = dataset.BERTDataset(
        review=[sentence],
        target=[0]
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=3
    )

    device = config.device
    model = BERTBaseUncased()
    model.load_state_dict(torch.load(
        model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, _ = engine.predict_fn(test_data_loader, model, device)
    print(outputs)
    return outputs[0]
def run():
    """Fine-tune BERT on the sentiment file with AdamW + linear warmup."""
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("NONE")
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    model = BERTBaseUncased()
    model.to(config.DEVICE)

    # Exclude biases and LayerNorm weights from weight decay.
    decay_skip = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named_params = list(model.named_parameters())
    decayed = [p for n, p in named_params
               if not any(s in n for s in decay_skip)]
    undecayed = [p for n, p in named_params
                 if any(s in n for s in decay_skip)]
    optimizer_parameters = [
        {'params': decayed, "weight_decay": 0.001},
        {'params': undecayed, "weight_decay": 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, scheduler,
                        config.DEVICE)
        outputs, targets = engine.eval_fn(valid_data_loader, model,
                                          config.DEVICE)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score={accuracy}")
        if accuracy > best_accuracy:
            # Save only on improvement.
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Fine-tune BERT on the Text/Sentiment CSV and keep the best checkpoint."""
    dfx = pd.read_csv(config.TRAINING_FILE).fillna('none')
    # 80/20 stratified split so both sides share the sentiment ratio.
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.2, random_state=config.RANDOM_SEED,
        stratify=dfx.Sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.Text.values,
                                        target=df_train.Sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(review=df_valid.Text.values,
                                        target=df_valid.Sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Skip weight decay for biases and LayerNorm parameters.
    named = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {'params': [p for n, p in named
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in named
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        preds = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, preds)
        print(f'Accuracy Score = {accuracy}')
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Train a toxicity classifier on both jigsaw train sets; track AUC."""
    # Combine the two jigsaw training sources into a single frame.
    df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])
    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    # Standard transformer fine-tuning split: no decay for bias/LayerNorm.
    named_params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {"params": [p for n, p in named_params
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.001},
        {"params": [p for n, p in named_params
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        # Binarize the (possibly soft) toxicity targets before scoring AUC.
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Train on jigsaw toxic comments; validate on the held-out file."""
    df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text", "toxic"])
    train_dataset = dataset.BERTDataset(
        review=df1.comment_text.values,
        target=df1.toxic.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    df2 = pd.read_csv("../input/validation.csv",
                      usecols=["comment_text", "toxic"])
    valid_dataset = dataset.BERTDataset(
        review=df2.comment_text.values,
        target=df2.toxic.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: decayed weights vs. bias/LayerNorm (no decay).
    named = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in named
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in named
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        preds = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, preds)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Train a BERT sentiment classifier on the CPU.

    Fixes vs. the original:
    * the train DataLoader wrapped ``train_data_loader`` (itself, before
      definition) instead of ``train_dataset``;
    * the validation loader was created as ``valid_dataloader`` but later
      used as ``valid_data_loader`` (NameError);
    * the ``'weigth_decay'`` typo meant the key was silently ignored;
    * ``num_train_steps`` multiplied by itself before being defined — it
      now uses ``config.EPOCHS`` like the sibling scripts;
    * the scheduler was given ``num_train_steps=0`` instead of
      ``num_warmup_steps=0``;
    * accuracy now comes from ``metrics.accuracy_score`` —
      ``model_selection`` has no ``accuracy_score``.
    """
    df = pd.read_csv(config.TRAINING_FILE)
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == 'positive' else 0)
    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,  # BUG FIX: previously wrapped the undefined loader
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=2)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001  # BUG FIX: was misspelled 'weigth_decay'
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # BUG FIX: the original multiplied by the not-yet-defined num_train_steps.
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=2e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) > 0.5
        # NOTE(review): requires `from sklearn import metrics` at file level;
        # the original called model_selection.accuracy_score, which does not
        # exist in sklearn.model_selection.
        accuracy = metrics.accuracy_score(targets, outputs)
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def main(_):
    """Predict labels for a fixed-width test file and write them to CSV.

    Fixes vs. the original:
    * ``--output`` was ignored: the output path was assigned ``FLAGS.input``;
    * the local ``input`` shadowed the builtin — renamed to ``input_path``.
    """
    input_path = config.EVAL_PROC
    output_path = 'predictions.csv'
    model_path = config.MODEL_PATH
    if FLAGS.input:
        input_path = FLAGS.input
    if FLAGS.output:
        output_path = FLAGS.output  # BUG FIX: previously FLAGS.input
    if FLAGS.model_path:
        model_path = FLAGS.model_path

    df_test = pd.read_fwf(input_path)
    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )
    logger.info(f"Test file: {input_path}")
    logger.info(f"Test size : {len(df_test):.4f}")

    # Dummy zero targets: the dataset API requires them, but they are
    # unused at prediction time.
    trg = [0] * len(df_test.values)
    test_dataset = dataset.BERTDataset(text=df_test.values, target=trg)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased(config.DROPOUT)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, extracted_features = engine.predict_fn(
        test_data_loader, model, device, extract_features=FLAGS.features)
    df_test["predicted"] = outputs
    # save file
    df_test.to_csv(output_path, header=None, index=False)
def run():
    """Train the tweet span-extraction model; save the best-jaccard weights."""
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=30).dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=1)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    named_params = list(model.named_parameters())
    skip_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in named_params
                       if not any(nd in n for nd in skip_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in named_params
                       if any(nd in n for nd in skip_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        print("here")
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        mean_jac = engine.eval_fn(valid_data_loader, model, device)
        print("jaccard_score = {mean_jac}".format(mean_jac=mean_jac))
        if mean_jac > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = mean_jac
def run(opt_level="O2", keep_batchnorm_fp32=True, batch_size=5, nb_epochs=10,
        data_path="../inputs/IMDB_Dataset.csv", model_path="./"):
    """Mixed-precision (apex.amp) BERT sentiment training on 100 rows."""
    df = pd.read_csv(data_path).fillna("none")[0:100]
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # Creating the datasets
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    # Creating the dataloaders (drop_last keeps batch shapes fixed)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size, num_workers=10, drop_last=True)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size, num_workers=10, drop_last=True)

    # Defining the model and sending it to the device
    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # We don't want any weight decay for biases / LayerNorm parameters.
    parameters = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in parameters
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in parameters
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) * nb_epochs / batch_size)

    # Defining the optimizer and the scheduler
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # Initialize model + optimizer for automatic mixed-precision training.
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=opt_level,
                                      keep_batchnorm_fp32=keep_batchnorm_fp32,
                                      loss_scale="dynamic")
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # No warmup
        num_training_steps=num_train_steps)

    # Hand everything to the shared training driver.
    engine.global_trainer(train_dataloader, valid_dataloader, model,
                          optimizer, scheduler, device, nb_epochs, model_path)
def run():
    """Train a BERT sentiment classifier and keep the best checkpoint.

    BUG FIX: the model is now moved to the CUDA device before training —
    previously it was constructed but never moved, even though ``device``
    was passed to ``engine.train_fn`` / ``engine.eval_fn``.
    """
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # Label encoding: "positive" -> 1, everything else -> 0
    # (could also use a label encoder or a map fn).
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    # Stratified split: train and validation keep the same
    # positive-to-negative sample ratio.
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)  # 0 to length of df_train
    df_valid = df_valid.reset_index(drop=True)  # 0 to length of df_valid

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")  # using cuda
    model = BERTBaseUncased()  # calling from model.py
    model.to(device)  # BUG FIX: model was never moved to the device

    # Specify parameters to train; biases and LayerNorm params get no decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.001
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # These parameters are adjustable: decay per layer group, learning
    # rate, etc.

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    # model = nn.DataParallel(model)  # converting to multi gpu model

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, target = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(target, outputs)
        print(f"Accuracy score = {accuracy}")
        if accuracy > best_accuracy:
            # saving the model only if it improves
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
@app.route("/")
def home():
    """Serve the landing page."""
    return render_template("index.html")


@app.route("/predict")
def predict():
    """Score the ?sentence= query parameter and return both class scores."""
    sentence = request.args.get("sentence")
    start_time = time.time()
    positive_prediction = sentence_prediction(sentence)
    negative_prediction = 1 - positive_prediction
    # All values are stringified so the JSON payload is uniform.
    response = {
        "response": {
            "positive": str(positive_prediction),
            "negative": str(negative_prediction),
            "sentence": str(sentence),
            "time_taken": str(time.time() - start_time),
        }
    }
    return flask.jsonify(response)


if __name__ == "__main__":
    # Load the trained weights once at startup, then serve.
    MODEL = BERTBaseUncased()
    # MODEL = nn.DataParallel(MODEL)
    MODEL.load_state_dict(torch.load(config.MODEL_PATH))
    MODEL.to(DEVICE)
    MODEL.eval()
    app.run()
from torch import nn import config import dataset from model import BERTBaseUncased if __name__ == '__main__': device='cuda' review = ['this is an amzaing place'] dataset = BERTDataset( review = review,target=[0] ) model = BERTBaseUncased() #model = nn.DataParallel(model) #if you have used model as DataParallel-model then inside torch.onnx.export use 'model.module' instead of model #====>>> question is from where 'module' arises ??=>>>> print model(uncomment line 25) here .You will output as Dataparallel({module:BERTBaseUncased()) # ====> that means 'module' is key of 'BERTBaseUncased' model's value model.load_state_dict(torch.load(config.MODEL_PATH)) model.eval() #print(model) ids = dataset[0]['ids'].unsqueeze(0) mask = dataset[0]['mask'].unsqueeze(0) token_type_ids = dataset[0][''token_type_ids].unsqueeze(0) torch.onnx.export( model, #model.module [===>> if dataparallel-model ==>> see above commented line 20]
) outputs = torch.sigmoid(outputs).cpu().detach().numpy() return outputs[0][0] @app.route("/predict") def predict(): sentence = request.args.get("sentence") start_time = time.time() positive_prediction = sentence_prediction(sentence, model=MODEL) negative_prediction = 1 - positive_prediction response = {} response["response"] = { 'positive': str(positive_prediction), 'negative': str(negative_prediction), 'sentence': str(sentence), 'time_taken': str(time.time() - start_time) } return flask.jsonify(response) if __name__ == "__main__": MODEL = BERTBaseUncased() MODEL = nn.DataParallel(MODEL) MODEL.load_state_dict(torch.load(config.MODEL_PATH, map_location=torch.device(config.DEVICE))) MODEL.to(config.DEVICE) MODEL.eval() app.run()
def main(_):
    """Train/evaluate BERT with TensorBoard logging and early stopping.

    Fixes vs. the original:
    * the ``val_mcc=... test_mcc=...`` log line reported ``val_acc`` and
      ``test_acc`` instead of the MCC values;
    * the early-stopping counter ``es`` is initialized before the loop —
      previously a non-improving first epoch raised ``NameError``;
    * the "Something is very wrong" f-string was split across a raw line
      break (a syntax error); it is now a single line.
    """
    LEARNING_RATE = config.LEARNING_RATE
    DROPOUT = config.DROPOUT
    if FLAGS.lr:
        LEARNING_RATE = FLAGS.lr
    if FLAGS.dropout:
        DROPOUT = FLAGS.dropout

    train_file = config.TRAIN_PROC
    df_train = pd.read_csv(train_file).fillna("none")
    valid_file = config.DEVEL_PROC
    df_valid = pd.read_csv(valid_file).fillna("none")
    test_file = config.EVAL_PROC
    df_test = pd.read_csv(test_file).fillna("none")

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ")
    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")
    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(
        review=df_train.text.values, target=df_train.label.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4,
        shuffle=True)
    valid_dataset = dataset.BERTDataset(
        review=df_valid.text.values, target=df_valid.label.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)
    test_dataset = dataset.BERTDataset(
        review=df_test.text.values, target=df_test.label.values)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased(DROPOUT)
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    # model = nn.DataParallel(model)

    best_accuracy = 0
    es = 0  # BUG FIX: counter must exist before the loop
    for epoch in range(config.EPOCHS):
        logger.info(f"Epoch = {epoch}")
        train_loss, train_acc = engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler)
        # Gradient histograms for TensorBoard.
        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.4f}")
        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.4f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}")
        writer.add_scalar('loss/train', train_loss, epoch)  # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss, epoch)  # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss, epoch)  # data grouping by `slash`
        logger.info(
            f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, test_acc={test_acc:.4f}")
        writer.add_scalar('acc/train', train_acc, epoch)  # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc, epoch)  # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc, epoch)  # data grouping by `slash`
        # BUG FIX: previously logged val_acc/test_acc under the mcc labels.
        logger.info(f"val_mcc={val_mcc:.4f}, test_mcc={test_mcc:.4f}")
        writer.add_scalar('mcc/val', val_mcc, epoch)  # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc, epoch)  # data grouping by `slash`

        # NOTE(review): accuracy (and hence checkpointing) is computed from
        # the *test* outputs — the last eval_fn call — confirm this is the
        # intended model-selection criterion.
        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.4f}")
        if accuracy < 0.4:
            logger.info(
                f"Something is very wrong! Accuracy is only {accuracy:.4f} Stopping...")
            break
        if accuracy > best_accuracy:
            logger.info(f"Saving model with Accuracy Score = {accuracy:.4f}")
            torch.save(model.state_dict(),
                       config.MODEL_PATH[:-4] + "." +
                       str(round(accuracy * 100, 2)) + ".bin")
            best_accuracy = accuracy
            es = 0
        else:
            es += 1
            logger.info(
                f"Not improved for {es} times of 5. Best so far - {best_accuracy:.4f}")
            if es > 4:
                logger.info(
                    f"Early stopping with best accuracy: {best_accuracy:.4f} and accuracy for this epoch: {accuracy:.4f} ...")
                break
def run():
    """Train a BERT classifier for one label column, early-stop on dev
    accuracy, then dump train/dev/test predictions to CSV.

    Command line: <train_filename> <label>. Dev/test filenames are derived
    by substituting 'train' in the train filename.
    """
    train_filename, label = sys.argv[1:3]
    model_path = "models2/" + label + "_best.pt"
    assert 'train' in train_filename
    filenames = {'train': train_filename,
                 'dev': train_filename.replace('train', 'dev'),
                 'test': train_filename.replace('train', 'test')}

    dataframes = {}
    num_classes = 0
    for subset, filename in filenames.items():
        dataframes[subset] = preprocess(filename, label)
        # ENCODE_CAT holds 0-based class ids; track the largest seen anywhere.
        num_classes = max(num_classes, max(dataframes[subset].ENCODE_CAT) + 1)

    dataloaders = {}
    for subset, filename in filenames.items():
        if subset == 'train':
            batch_size = config.TRAIN_BATCH_SIZE
            num_workers = 4
        else:
            batch_size = config.VALID_BATCH_SIZE
            # BUG FIX: was `num_worker = 1` (typo), which silently left
            # num_workers at 4 for the dev/test loaders.
            num_workers = 1
        dataloaders[subset] = process_dataset(
            dataframes[subset], batch_size, num_workers)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased(num_classes)
    model.to(device)

    # Two parameter groups: weight decay everywhere except bias/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=get_num_train_steps(filenames["train"], label))

    best_val_accuracy = float('-inf')
    best_val_epoch = None
    for epoch in range(config.EPOCHS):
        engine.train_fn(
            dataloaders["train"], model, optimizer, device, scheduler, epoch)
        outputs, targets = engine.eval_fn(
            dataloaders['dev'], model, device, epoch)
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Validation Accuracy = {accuracy}")
        if accuracy > best_val_accuracy:
            torch.save(model.state_dict(), model_path)
            best_val_accuracy = accuracy
            best_val_epoch = epoch
        print("Best val accuracy till now {}".format(best_val_accuracy))
        # Early stopping: no improvement within config.PATIENCE epochs.
        if best_val_epoch < (epoch - config.PATIENCE):
            break

    # Reload the best checkpoint and export predictions for every subset.
    model.load_state_dict(torch.load(model_path))
    for subset in ['train', 'dev', 'test']:
        outputs, targets = engine.eval_fn(
            dataloaders[subset], model, device, epoch)
        result_df_dicts = [{"output": o, "target": t}
                           for o, t in zip(outputs, targets)]
        result_df = pd.DataFrame.from_dict(result_df_dicts)
        final_df = pd.concat([dataframes[subset], result_df], axis=1)
        # Sanity check: eval order must match the dataframe row order.
        for i in final_df.itertuples():
            assert i.ENCODE_CAT == i.target
        result_file = "results2/" + subset + "_" + label + ".csv"
        final_df.to_csv(result_file)
def train():
    """Fine-tune BERT for binary sentiment classification and checkpoint
    the model whenever validation accuracy improves."""
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # Binary target: positive -> 1, negative -> 0.
    df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})

    df_train, df_valid = train_test_split(df,
                                          test_size=0.1,
                                          random_state=42,
                                          stratify=df.sentiment.values)
    # reset index of both splits
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        # BUG FIX: training data must be reshuffled each epoch
        # (was shuffle=False).
        shuffle=True,
        num_workers=4,
    )
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
    )

    # Fall back to CPU when CUDA is unavailable (was hard-coded "cuda").
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # No weight decay for biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=int(len(df_train) / config.TRAIN_BATCH_SIZE) *
        config.EPOCHS)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_dataloader, model, device)
        outputs = np.array(outputs) >= 0.5  # sigmoid prob -> hard label
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Accuracy: {accuracy:.3f}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), config.MODEL_PATH)
def main(_):
    """Run inference on a test CSV, write predictions, and optionally plot
    a t-SNE projection of the extracted features.

    Flags (absl): --test_file, --model_path, --features.
    """
    test_file = config.EVAL_PROC
    model_path = config.MODEL_PATH
    if FLAGS.test_file:
        test_file = FLAGS.test_file
    if FLAGS.model_path:
        model_path = FLAGS.model_path

    df_test = pd.read_csv(test_file).fillna("none")

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )
    logger.info(f"Test file: {test_file}")
    logger.info(f"Test size : {len(df_test):.4f}")

    test_dataset = dataset.BERTDataset(review=df_test.text.values,
                                       target=df_test.label.values)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3)

    device = config.device
    model = BERTBaseUncased()
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, extracted_features = engine.predict_fn(
        test_data_loader, model, device, extract_features=FLAGS.features)
    df_test["predicted"] = outputs
    # Save predictions, named after the model's parent directory.
    df_test.to_csv(model_path.split("/")[-2] + '.csv',
                   header=None,
                   index=False)

    if FLAGS.features:
        # Reduce to 50 dims with PCA before t-SNE (standard speed/noise trick).
        pca = PCA(n_components=50, random_state=7)
        X1 = pca.fit_transform(extracted_features)
        tsne = TSNE(n_components=2,
                    perplexity=10,
                    random_state=6,
                    learning_rate=1000,
                    n_iter=1500)
        X1 = tsne.fit_transform(X1)
        print("Shape after t-SNE: ", X1.shape)

        X = pd.DataFrame(np.concatenate([X1], axis=1), columns=["x1", "y1"])
        X = X.astype({"x1": float, "y1": float})

        plt.figure(figsize=(20, 15))
        p1 = sns.scatterplot(x=X["x1"], y=X["y1"], palette="coolwarm")

        # Label text: '@<first-letter><label>' when prediction is correct,
        # '<gold>-<pred>' when it is wrong.
        x_texts = []
        for output, value in zip(outputs, df_test.label.values):
            if output == value:
                x_texts.append("@" + label_decoder(output)[0] +
                               label_decoder(output))
            else:
                x_texts.append(
                    label_decoder(value) + "-" + label_decoder(output))
        X["texts"] = x_texts

        # Color each annotation by its '@'-prefixed class marker.
        for line in X.index:
            text = X.loc[line, "texts"] + "-" + str(line)
            if "@U" in text:
                p1.text(X.loc[line, "x1"] + 0.2,
                        X.loc[line, "y1"],
                        text[2:],
                        horizontalalignment='left',
                        size='medium',
                        color='blue',
                        weight='semibold')
            elif "@P" in text:
                p1.text(X.loc[line, "x1"] + 0.2,
                        X.loc[line, "y1"],
                        text[2:],
                        horizontalalignment='left',
                        size='medium',
                        color='green',
                        weight='semibold')
            elif "@N" in text:
                p1.text(X.loc[line, "x1"] + 0.2,
                        X.loc[line, "y1"],
                        text[2:],
                        horizontalalignment='left',
                        size='medium',
                        color='red',
                        weight='semibold')
            else:
                p1.text(X.loc[line, "x1"] + 0.2,
                        X.loc[line, "y1"],
                        text,
                        horizontalalignment='left',
                        size='medium',
                        color='black',
                        weight='semibold')

        # BUG FIX: savefig must come before show() — show() finishes and
        # clears the figure, so the saved SVG used to be blank.
        plt.savefig(model_path.split("/")[-2] + '-figure.svg', format="svg")
        plt.show()
def run():
    """Full training entry point for the multi-target Q&A regression task.

    Splits train.csv 90/10, trains for config.EPOCHS, and prints the mean
    column-wise Spearman correlation on the validation fold each epoch.
    """
    data_df = pd.read_csv('../input/train.csv')
    train_df, valid_df = train_test_split(data_df,
                                          random_state=42,
                                          test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # Target columns come from the sample submission (everything but qa_id).
    sample_sub_df = pd.read_csv("../input/sample_submission.csv")
    target_cols = list(sample_sub_df.drop("qa_id", axis=1).columns)
    train_y = train_df[target_cols].values
    valid_y = valid_df[target_cols].values

    # Build one dataset per split from the three text fields + targets.
    split_datasets = {}
    for split_name, frame, labels in (("train", train_df, train_y),
                                      ("valid", valid_df, valid_y)):
        split_datasets[split_name] = BertDataset(
            qtitle=frame.question_title.values,
            qbody=frame.question_body.values,
            answer=frame.answer.values,
            targets=labels)

    train_dataloader = torch.utils.data.DataLoader(
        split_datasets["train"],
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(
        split_datasets["valid"],
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False)

    num_train_steps = int(
        len(split_datasets["train"]) / config.TRAIN_BATCH_SIZE *
        config.EPOCHS)
    device = config.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = config.OPTIMIZER(model.parameters(), lr=config.LR)
    scheduler = config.SCHEDULER(optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=num_train_steps)

    for epoch in range(config.EPOCHS):
        epoch_start = time.time()
        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer,
                                         scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(
            valid_dataloader, model)
        epoch_time_elapsed = (time.time() - epoch_start) / 60.0
        print(f'time take to run a epoch - {epoch_time_elapsed}')
        print(
            f'Epoch - Training loss - {epoch_train_loss} Valid loss - {epoch_valid_loss}'
        )

        # Mean Spearman correlation across all target columns.
        per_column_corrs = []
        for col in range(targets.shape[1]):
            coef, _ = np.nan_to_num(
                stats.spearmanr(list(targets[:, col]),
                                list(outputs[:, col])))
            per_column_corrs.append(coef)
        spear = np.mean(per_column_corrs)
        print(f"Spearman coeff : {spear}")
def run():
    """Fine-tune BERT for tweet selected-text extraction; checkpoint when
    the validation Jaccard score improves.

    NOTE(review): nrows=100 restricts training to the first 100 rows —
    looks like a debugging leftover; confirm before a real run.
    """
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)
    # Binary target: positive -> 1, everything else -> 0.
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            # BUG FIX: was `best_accuracy = jaccard`, so best_jaccard never
            # advanced and the checkpoint was overwritten every epoch.
            best_jaccard = jaccard
def run():
    """Train a toxic-comment BERT classifier on TPU cores via torch_xla."""
    df1 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-train.csv",
                      usecols=['comment_text', 'toxic'])
    # BUG FIX: this second read was assigned to df1, clobbering the first
    # dataset and leaving df2 undefined (NameError at the concat below).
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=['comment_text', 'toxic'])
    # combined df1 and df2 and made big dataframe
    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)

    # validation dataframe has been given by kaggle
    # BUG FIX: was `df_valid - pd.read_csv(...)` (subtraction, not assignment).
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)
    # Distributed sampler shards the data across TPU cores.
    # BUG FIX: DistributedSampler lives under torch.utils.data.distributed,
    # not torch.data.distributed.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        sampler=train_sampler,
        # torch_xla crashes on a ragged final batch, hence drop_last
        drop_last=True)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        sampler=valid_sampler,
        # no need of drop_last here
    )

    device = xm.xla_device()  # xla_device means TPU
    model = BERTBaseUncased()
    # model.to(device)
    # NOTE(review): the original deliberately skips moving the model here;
    # presumably the engine handles device placement — confirm.

    # No weight decay for bias / LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # Steps per core: total steps divided by the world size.
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() *
        config.EPOCHS)
    lr = 3e-5 * xm.xrt_world_size()  # scale lr with the number of cores
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        # ParallelLoader feeds each TPU core its shard.
        train_loader = pl.ParallelLoader(train_data_loader, [device])
        engine.train_fn(train_loader.per_device_loader(device), model,
                        optimizer, device, scheduler)

        valid_loader = pl.ParallelLoader(valid_data_loader, [device])
        # BUG FIX: evaluation previously iterated the *train* ParallelLoader
        # (para_loader) instead of the validation one.
        outputs, targets = engine.eval_fn(
            valid_loader.per_device_loader(device), model, device)

        # threshold the target instead of output (original intent kept)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            # use xm.save instead of torch.save on TPU
            xm.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
def run(): ''' Entire training loop - Create DataLoaders - Define Training Configuration - Launch Training Loop ''' # Num of available TPU cores if config.TPUs: n_TPUs = xm.xrt_world_size() DEVICE = xm.xla_device() else: DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' device = torch.device(DEVICE) # Read Data # df1 = pd.read_csv('data/jigsaw-toxic-comment-train.csv', usecols=['comment_text', 'toxic']) # df2 = pd.read_csv('data/jigsaw-unintended-bias-train.csv', usecols=['comment_text', 'toxic'], engine='python') # don't know why it was breaking with default C parser # df_train = df1 # pd.concat([df1,df2], axis=0).reset_index(drop=True) # df_valid = pd.read_csv('data/validation.csv') # Subsample df_train = pd.read_csv('data/jigsaw-toxic-comment-train-small.csv', usecols=['comment_text', 'toxic']) df_valid = pd.read_csv('data/validation-small.csv', usecols=['comment_text', 'toxic']) # Preprocess train_dataset = dataset.BERTDataset( comment=df_train.comment_text.values, target=df_train.toxic.values ) valid_dataset = dataset.BERTDataset( comment=df_valid.comment_text.values, target=df_valid.toxic.values ) drop_last=False train_sampler, valid_sampler = None, None if config.TPUs: drop_last=True train_sampler = DistributedSampler( train_dataset, num_replicas=n_TPUs, rank=xm.get_ordinal(), shuffle=True ) valid_sampler = DistributedSampler( valid_dataset, num_replicas=n_TPUs, rank=xm.get_ordinal(), shuffle=True ) # Create Data Loaders train_data_loader = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4, drop_last=drop_last, sampler=train_sampler ) valid_data_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1, drop_last=drop_last, sampler=valid_sampler ) # Machine Configuration if config.MODEL == 'bert': model = BERTBaseUncased() elif config.MODEL == 'distil-bert': model = DistilBERTBaseUncased() else: print('Model chosen in config not valid') exit() 
model.to(device) # Optimizer Configuration param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, ] lr = config.LR num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS) # TODO: why do the LR increases because of a distributed training ? if config.TPUs: num_train_steps /= n_TPUs lr *= n_TPUs optimizer = AdamW(optimizer_parameters, lr=lr) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=num_train_steps ) if not config.TPUs: if N_GPU > 1: model = nn.DataParallel(model) # Training loop best_score = 0 for epoch in range(config.EPOCHS): if config.TPUs: train_loader = pl.ParallelLoader(train_data_loader, [device]) valid_loader = pl.ParallelLoader(valid_data_loader, [device]) train_fn(train_loader.per_device_loader(device), model, optimizer, device, scheduler) outputs, targets = eval_fn(valid_loader.per_device_loader(device), model, device) else: train_fn(train_data_loader, model, optimizer, device, scheduler) outputs, targets = eval_fn(valid_data_loader, model, device) targets = np.array(targets) >= 0.5 # TODO: why ? auc_score = metrics.roc_auc_score(targets, outputs) # Save if best print(f"AUC Score = {auc_score}") if auc_score > best_score: if not config.TPUs: torch.save(model.state_dict(), config.MODEL_PATH) else: xm.save(model.state_dict(), config.MODEL_PATH) best_score = auc_score
# NOTE(review): tail of a prediction helper (presumably `sentence_predict`)
# whose `def` line lies before this chunk — kept verbatim.
    outputs = MODEL(ids=ids, mask=mask, token_type_ids=token_type_ids)
    # Single logit -> probability; detach and move to CPU before numpy.
    outputs = torch.sigmoid(outputs).cpu().detach().numpy()
    return outputs[0][0]


@app.route("/predict")
def predict():
    """Flask endpoint: score the `sentence` query parameter and return
    positive/negative probabilities as JSON."""
    sentence = request.args.get("sentence")
    pos_prediction = sentence_predict(sentence)
    # Binary task: the two probabilities sum to 1.
    neg_prediction = 1 - pos_prediction
    response = {}
    response['response'] = {
        'positive': str(pos_prediction),
        'negative': str(neg_prediction),
        'sentence': str(sentence)
    }
    return flask.jsonify(response)


if __name__ == "__main__":
    # Load the trained weights once at startup, then serve.
    MODEL = BERTBaseUncased()
    # MODEL.load_state_dict(torch.load(config.MODEL_PATH))
    my_model = torch.load(config.MODEL_PATH,
                          map_location=torch.device(DEVICE))
    MODEL.load_state_dict(my_model)
    torch.cuda.empty_cache()
    MODEL.to(DEVICE)
    MODEL.eval()  # inference mode: disables dropout
    app.run(debug=True)  # debug=True: development server only
def train():
    """Train a BERT sentiment classifier (config_2 experiment).

    Reads the training CSV, builds train/valid loaders, and checkpoints
    the model whenever validation accuracy improves.
    """
    # Read the training file and fill NaN values with "none"
    # (dropping them is an alternative for this dataset).
    dfx = pd.read_csv(config_2.TRAINING_FILE).fillna("none")

    # sentiment = 1 if positive, else 0
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    # Single stratified train/validation fold.
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # Training dataset + loader.
    train_dataset = dataset_2.BERTDataset(review=df_train.review.values,
                                          target=df_train.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config_2.TRAIN_BATCH_SIZE, num_workers=4)

    # Validation dataset + loader.
    valid_dataset = dataset_2.BERTDataset(review=df_valid.review.values,
                                          target=df_valid.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config_2.VALID_BATCH_SIZE, num_workers=1)

    # Use GPU when available, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased()
    model.to(device)

    # Parameter groups: no weight decay for bias and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # Number of training steps used by the scheduler.
    # BUG FIX: was config_1.EPOCHS while the loop below runs for
    # config_2.EPOCHS, so the LR schedule length disagreed with the actual
    # training length.
    num_train_steps = int(
        len(df_train) / config_2.TRAIN_BATCH_SIZE * config_2.EPOCHS)

    # AdamW is the standard optimizer for transformer fine-tuning.
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Linear warmup/decay schedule; reduce-on-plateau is an alternative.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Wrap for multiple GPUs; harmless with a single device.
    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config_2.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5  # sigmoid prob -> hard label
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config_2.MODEL_PATH)
            best_accuracy = accuracy
def run():
    """Fine-tune BERT for tweet selected-text extraction and checkpoint
    on the best validation Jaccard score."""
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    # Stratified split keeps the sentiment class balance identical in the
    # training and validation folds.
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_data_loader = torch.utils.data.DataLoader(
        dataset.TweetDataset(
            tweet=df_train.text.values,
            target=df_train.sentiment.values,
            selected_text=df_train.selected_text.values),
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        dataset.TweetDataset(
            tweet=df_valid.text.values,
            target=df_valid.sentiment.values,
            selected_text=df_valid.selected_text.values),
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # Skip weight decay for biases and LayerNorm parameters.
    decay_blocklist = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named_params = list(model.named_parameters())
    optimizer_parameters = [
        {
            "params": [p for name, p in named_params
                       if not any(term in name for term in decay_blocklist)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for name, p in named_params
                       if any(term in name for term in decay_blocklist)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Wrap for multi-GPU training; harmless on a single GPU.
    model = nn.DataParallel(model)

    # Jaccard similarity is the evaluation metric.
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
def run():
    """Train the sentence classifier and save a checkpoint whenever the
    dev-set accuracy improves."""
    df_train = preprocess('./review-sentence_train_clean.csv')
    df_valid = preprocess('./review-sentence_dev_clean.csv')
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_data_loader = torch.utils.data.DataLoader(
        dataset.BERTDataset(review=df_train.sentence.values,
                            target=df_train.ENCODE_CAT.values),
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4)
    valid_data_loader = torch.utils.data.DataLoader(
        dataset.BERTDataset(review=df_valid.sentence.values,
                            target=df_valid.ENCODE_CAT.values),
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: weight decay everywhere except bias/LayerNorm.
    skip_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    named_params = list(model.named_parameters())
    optimizer_parameters = [
        {
            "params": [p for n, p in named_params
                       if not any(term in n for term in skip_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in named_params
                       if any(term in n for term in skip_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device,
                        scheduler, epoch)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device,
                                          epoch)
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Validation Accuracy = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
        print("Best val accuracy till now {}".format(best_accuracy))
def run(dataset_index):
    """Train/evaluate BERT on one of the preprocessed "gold" datasets,
    logging losses/accuracies/MCC to the logger and TensorBoard writer.

    Args:
        dataset_index: index (0-5) into the candidate dataset filename list.
    """
    datasets = [
        "gold.prep-auto.full.prep.{0}.csv",
        "gold.prep-auto.no-emoticons.prep.{0}.csv",
        "gold.prep-auto.prep.{0}.csv",
        "gold.prep-english.prep.{0}.csv",
        "gold.prep-peisenieks.prep.{0}.csv",
        "gold.prep.{0}.csv"
    ]

    train_file = config.DATASET_LOCATION + datasets[dataset_index].format(
        "train")
    df_train = pd.read_csv(train_file).fillna("none")
    df_train.label = df_train.label.apply(label_encoder)

    valid_file = config.DATASET_LOCATION + datasets[dataset_index].format(
        "dev")
    df_valid = pd.read_csv(valid_file).fillna("none")
    df_valid.label = df_valid.label.apply(label_encoder)

    # The test set is shared across all dataset variants.
    test_file = config.DATASET_LOCATION + "eval.prep.test.csv"
    df_test = pd.read_csv(test_file).fillna("none")
    df_test.label = df_test.label.apply(label_encoder)

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )
    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")
    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(review=df_train.text.values,
                                        target=df_train.label.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.BERTDataset(review=df_valid.text.values,
                                        target=df_valid.label.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    test_dataset = dataset.BERTDataset(review=df_test.text.values,
                                       target=df_test.label.values)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: weight decay everywhere except bias/LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        logger.info(f"epoch={epoch}")
        train_loss, train_acc = engine.train_fn(train_data_loader, model,
                                                optimizer, device, scheduler)
        # Log gradient histograms to TensorBoard for debugging.
        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.3f}")

        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.3f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        writer.add_scalar('loss/train', train_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss,
                          epoch)  # data grouping by `slash`
        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        writer.add_scalar('acc/train', train_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc,
                          epoch)  # data grouping by `slash`
        # BUG FIX: this line previously interpolated val_acc/test_acc while
        # labelling them as MCC values.
        logger.info(f"val_mcc={val_mcc:.3f}, test_mcc={test_mcc:.3f}")
        writer.add_scalar('mcc/val', val_mcc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc,
                          epoch)  # data grouping by `slash`

        # NOTE(review): outputs/targets here come from the *test* eval above,
        # so checkpoint selection uses test-set accuracy — confirm intended.
        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.3f}")
        if accuracy > best_accuracy:
            # Consistency fix: use the logger (as everywhere else here)
            # instead of a bare print.
            logger.info(f"Saving model with Accuracy Score = {accuracy:.3f}")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy