def train(run_test=False):
    """Train one LSTM classifier per label, checkpointing the best-F1 epoch.

    For each label in LABELS, runs NUM_EPOCHS of training over the global
    train_dataset, evaluates on val_dataset each epoch, and whenever the
    validation F1 improves saves the model weights and the accumulated
    classification reports under models/lstm/<run_id>/<label_ix>/.
    Finishes by writing a summary.txt of per-label best F1 scores and the
    hyperparameters used.

    Args:
        run_test: accepted for interface compatibility; not used in this body.
    """
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)

    embed_weights = None
    if USE_GLOVE:
        embed_weights = get_embed_weights(vocab, tok2id)

    model = LSTMClassifier(
        VOCAB_SIZE,
        EMBED_DIM,
        HIDDEN_SIZE,
        bidirectional=False,
        embed_weights=embed_weights,
    )
    # pos_weight up-weights the positive class to counter label imbalance.
    loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))

    # Pick the next free run id. Parse the full numeric suffix of each run
    # directory (as the MLP trainer does) instead of the path's last character,
    # which broke for ids >= 10 (e.g. "models/lstm/12" used to yield 2).
    existing_runs = glob.glob("models/lstm/*")
    if existing_runs:
        model_id = max(int(re.search(r"\d+", p).group()) for p in existing_runs) + 1
    else:
        model_id = 1

    optimizer = optim.Adam(model.parameters(), lr=ALPHA)
    all_best_f1 = []
    for label in LABELS:
        # NOTE(review): the same model/optimizer is reused across labels, so
        # each label continues training from the previous label's weights.
        # The MLP trainer builds a fresh model per label — confirm which is
        # intended before changing.
        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = []
            y_pred = []
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                pred = model(batch.T, lengths)
                loss = loss_func(
                    pred.type(torch.FloatTensor),
                    targets.unsqueeze(0).type(torch.FloatTensor),
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                # Binary decision from the raw logit against THRESHOLD.
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss

            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss.data.float() / len(train_dataset), acc, val_loss, val_acc
                )
            )

            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                out_dir = f"models/lstm/{model_id}/{LABEL_TO_IX[label]}"
                # exist_ok=True makes a prior existence check unnecessary.
                Path(out_dir).mkdir(parents=True, exist_ok=True)
                torch.save(
                    model.state_dict(),
                    f"{out_dir}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                # "w" mode truncates any existing file, so no os.remove needed.
                with open(
                    f"{out_dir}/results_{LABEL_TO_IX[label]}.json", "w"
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)

    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/lstm/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {statistics.mean(all_best_f1)}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
# Baseline: one-vs-rest logistic regression over precomputed sentence embeddings.
lr_embed_clf = MultiOutputClassifier(
    LogisticRegression(
        max_iter=300, multi_class="multinomial", penalty="none", solver="lbfgs"
    )
).fit(X_train_embeds, y_train)

# Predict once and reuse — the original called .predict(X_val_embeds) three times.
val_preds = lr_embed_clf.predict(X_val_embeds)
print(hamming_loss(y_val, val_preds))
print(classification_report(y_val, val_preds))

# Count validation rows where the classifier assigned no label at all.
null_predictions = len([p for p in val_preds if not np.any(np.nonzero(p))])
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

# Drill-down on the per-label estimator at index 4 ("dubious reference").
dub_ref_model = lr_embed_clf.estimators_[4]
vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"
# NOTE(review): this rebinds the module-level BATCH_SIZE constant for the rest
# of the script — confirm that is intended rather than a local batch size.
BATCH_SIZE = 1
pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
    val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
):
    actual.append(targets.item())
    pred.append(int(predict(dub_ref_model, raw_data[0])))
    vectors.append(WE.get_sentence_vector(raw_data[0].lower().split(), vector_dict))
print(classification_report(actual, pred))
plot_confusion_matrix(dub_ref_model, vectors, actual)
def train():
    """Train one MLP classifier per label on sentence embeddings.

    For each label in LABELS, builds a fresh MLPClassifier (widened when
    extra features are configured for that label in USE_FEATS), trains for
    NUM_EPOCHS over the global train_dataset, evaluates on val_dataset each
    epoch, and checkpoints weights/reports under models/mlp/<run_id>/<label_ix>/
    whenever validation F1 improves. Ends by writing a summary.txt of scores
    and hyperparameters, then calling mark_best_results().
    """
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)

    # Pick the next free run id by parsing the numeric suffix of existing runs.
    existing_runs = glob.glob("models/mlp/*")
    if existing_runs:
        model_id = max(int(re.search(r"\d+", p).group()) for p in existing_runs) + 1
    else:
        model_id = 1

    vector_dict = WE.get_vector_dict()
    all_best_f1 = []
    for label in LABELS:
        # Fresh model/optimizer per label; widen the input layer when this
        # label is configured to receive additional hand-crafted features.
        if LABEL_TO_IX[label] in USE_FEATS:
            logger.yellow(f"Using additional features for label {label}...")
            model = MLPClassifier(EMBED_DIM + len(FEATS_TO_ADD), HIDDEN_SIZE, DROPOUT)
        else:
            model = MLPClassifier(EMBED_DIM, HIDDEN_SIZE, DROPOUT)
        # pos_weight up-weights the positive class to counter label imbalance.
        loss_func = nn.BCEWithLogitsLoss(
            pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT])
        )
        optimizer = optim.Adam(model.parameters(), lr=ALPHA)

        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = []
            y_pred = []
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                tokenized = tokenized_sentence(raw_data[0])
                sentence_weights = None
                if TFIDF_WEIGHTS:
                    tfidf_vals = get_tfidf_vals(train_doc_to_tfidf_ix[raw_data[0]])
                    # 0 weight for tokens with no tf-idf entry.
                    sentence_weights = [tfidf_vals.get(tok, 0) for tok in tokenized]
                # Reuse `tokenized` — the original re-tokenized the same
                # sentence a second time here.
                input_vector = torch.FloatTensor(
                    WE.get_sentence_vector(
                        tokenized,
                        vector_dict=vector_dict,
                        weights=sentence_weights,
                    )
                )
                if LABEL_TO_IX[label] in USE_FEATS:
                    input_vector = add_features(input_vector, raw_data[0])

                pred = model(input_vector)
                loss = loss_func(
                    pred.type(torch.FloatTensor), targets.type(torch.FloatTensor)
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                # Binary decision from the raw logit against THRESHOLD.
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss

            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func, vector_dict
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss.data.float() / len(train_dataset), acc, val_loss, val_acc
                )
            )

            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                out_dir = f"models/mlp/{model_id}/{LABEL_TO_IX[label]}"
                # exist_ok=True makes a prior existence check unnecessary.
                Path(out_dir).mkdir(parents=True, exist_ok=True)
                torch.save(
                    model.state_dict(),
                    f"{out_dir}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                # "w" mode truncates any existing file, so no os.remove needed.
                with open(
                    f"{out_dir}/results_{LABEL_TO_IX[label]}.json", "w"
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)

    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/mlp/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {statistics.mean(all_best_f1)}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
        f.write(f"TFIDF_WEIGHTS: {TFIDF_WEIGHTS}\n")
        if FEATS_TO_ADD:
            f.write(f"FEATS_TO_ADD: {FEATS_TO_ADD}\n")
            f.write(f"FEAT_ADD_SOFTENER: {FEAT_ADD_SOFTENER}\n")
        if USE_FEATS:
            f.write(f"USE_FEATS: {USE_FEATS}\n")
    # NOTE(review): SOURCE indentation was lost; mark_best_results() is placed
    # as the final step of train() — confirm it wasn't a top-level call.
    mark_best_results()