Example #1

def train(run_test=False):
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)
    embed_weights = None
    if USE_GLOVE:
        embed_weights = get_embed_weights(vocab, tok2id)
    # Sanity check that related tokens receive similar embeddings, e.g.:
    # from scipy.spatial import distance
    # print(distance.cosine(embed_weights[tok2id['obama']], embed_weights[tok2id['clinton']]))
    model = LSTMClassifier(
        VOCAB_SIZE,
        EMBED_DIM,
        HIDDEN_SIZE,
        bidirectional=False,
        embed_weights=embed_weights,
    )
    loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))

    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))
    if glob.glob("models/lstm/*"):
        # Parse the full directory name, not just its last character,
        # so model ids >= 10 increment correctly.
        model_id = max(int(os.path.basename(i)) for i in glob.glob("models/lstm/*")) + 1
    else:
        model_id = 1

    optimizer = optim.Adam(model.parameters(), lr=ALPHA)
    all_best_f1 = []
    for label in LABELS:
        logger.green(f"Building classifier for {label}...")
        # NB: the model and optimizer are created once above and shared across
        # labels, so each label resumes from the previous label's weights
        # (Example #3 re-creates the model per label instead).
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                train_dataset, id2tok, tok2id, label, batch_size=BATCH_SIZE
            ):
                pred = model(batch.T, lengths)
                loss = loss_func(
                    pred.type(torch.FloatTensor),
                    targets.unsqueeze(0).type(torch.FloatTensor),
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                # pred_idx = torch.max(pred, 1)[1]
                # y_pred += list(pred_idx.data.int())
                # One thresholded prediction per batch, so this assumes batch_size == 1.
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss.item()  # .item() detaches the loss so the graph is freed each batch
            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func
            )
            print(
                "Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".format(
                    total_loss / len(train_dataset), acc, val_loss, val_acc
                )
            )
            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                # mkdir with exist_ok=True already tolerates an existing directory
                Path(f"models/lstm/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                    parents=True, exist_ok=True
                )
                torch.save(
                    model.state_dict(),
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                # open(..., "w") truncates an existing file, so no remove is needed
                with open(
                    f"models/lstm/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                    "w",
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)
    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/lstm/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")

Example #2

lr_embed_clf = MultiOutputClassifier(
    LogisticRegression(
        max_iter=300, multi_class="multinomial", penalty="none", solver="lbfgs"
    )
).fit(X_train_embeds, y_train)
val_preds = lr_embed_clf.predict(X_val_embeds)  # predict once and reuse below
print(hamming_loss(y_val, val_preds))
print(classification_report(y_val, val_preds))
# Seeing where no prediction was made: a row is null only when every label is 0.
# row.any() tests the values themselves (np.any(np.nonzero(row)) would test the
# indices of nonzero entries, and index 0 is falsy).
null_predictions = len([row for row in val_preds if not row.any()])
print(f"{null_predictions} out of {len(y_val)} predictions were null.")

dub_ref_model = lr_embed_clf.estimators_[4]
vocab, id2tok, tok2id = get_vocab(train_dataset)
target_label = "dubious reference"
BATCH_SIZE = 1
pred = []
actual = []
vectors = []
for batch, targets, lengths, raw_data in create_dataset(
    val_dataset, id2tok, tok2id, target_label, batch_size=BATCH_SIZE
):
    actual.append(targets.item())
    pred.append(int(predict(dub_ref_model, raw_data[0])))
    vectors.append(WE.get_sentence_vector(raw_data[0].lower().split(), vector_dict))
print(classification_report(actual, pred))
plot_confusion_matrix(dub_ref_model, vectors, actual)
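
Both this example and the next lean on WE.get_sentence_vector, a project-specific helper. As a rough sketch of what such a helper typically does (the body below is an assumption, not the project's actual implementation), it averages per-token vectors, optionally scaling each by a per-token weight such as a tf-idf value:

import numpy as np

def get_sentence_vector_sketch(tokens, vector_dict, weights=None, dim=300):
    # dim=300 is an assumed embedding size; vector_dict maps token -> np.ndarray.
    if weights is None:
        weights = [1.0] * len(tokens)
    vecs = [
        w * vector_dict[tok]
        for tok, w in zip(tokens, weights)
        if tok in vector_dict
    ]
    if not vecs:
        return np.zeros(dim)  # fall back to a zero vector for all-OOV sentences
    return np.mean(vecs, axis=0)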

Example #3

def train():
    results = {}
    vocab, id2tok, tok2id = get_vocab(train_dataset)
    if glob.glob("models/mlp/*"):
        model_id = (max([
            int(re.search(r"\d+", i).group())
            for i in glob.glob("models/mlp/*")
        ]) + 1)
    else:
        model_id = 1
    """
    Demo of weights in loss function.
    """
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([1]))
    # loss_func(torch.FloatTensor([0.6]), torch.FloatTensor([0]))
    vector_dict = WE.get_vector_dict()
    all_best_f1 = []
    for label in LABELS:
        if LABEL_TO_IX[label] in USE_FEATS:
            logger.yellow(f"Using additional features for label {label}...")
            model = MLPClassifier(EMBED_DIM + len(FEATS_TO_ADD), HIDDEN_SIZE,
                                  DROPOUT)
        else:
            model = MLPClassifier(EMBED_DIM, HIDDEN_SIZE, DROPOUT)
        loss_func = nn.BCEWithLogitsLoss(
            pos_weight=torch.FloatTensor([POS_LOSS_WEIGHT]))
        optimizer = optim.Adam(model.parameters(), lr=ALPHA)
        logger.green(f"Building classifier for {label}...")
        model.train()
        best_f1 = 0.0
        for epoch in range(NUM_EPOCHS):
            print()
            print(f"Epoch: {epoch}")
            y_true = list()
            y_pred = list()
            total_loss = 0
            for batch, targets, lengths, raw_data in create_dataset(
                    train_dataset, id2tok, tok2id, label,
                    batch_size=BATCH_SIZE):
                tokenized = tokenized_sentence(raw_data[0])
                sentence_weights = None
                if TFIDF_WEIGHTS:
                    sentence_weights = get_tfidf_vals(
                        train_doc_to_tfidf_ix[raw_data[0]])
                    # Default to 0 for tokens without a tf-idf score.
                    sentence_weights = [
                        sentence_weights.get(tok, 0) for tok in tokenized
                    ]
                input_vector = torch.FloatTensor(
                    WE.get_sentence_vector(
                        tokenized,  # reuse the tokenization from above
                        vector_dict=vector_dict,
                        weights=sentence_weights,
                    ))
                if LABEL_TO_IX[label] in USE_FEATS:
                    input_vector = add_features(input_vector, raw_data[0])
                pred = model(input_vector)
                loss = loss_func(pred.type(torch.FloatTensor),
                                 targets.type(torch.FloatTensor))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                y_true += list(targets.int())
                # One thresholded prediction per batch, so this assumes batch_size == 1.
                y_pred += [int(pred.float() >= THRESHOLD)]
                total_loss += loss.item()  # .item() detaches the loss so the graph is freed each batch
            acc = accuracy_score(y_true, y_pred)
            val_loss, val_acc, report = evaluate_validation_set(
                model, val_dataset, id2tok, tok2id, label, loss_func,
                vector_dict)
            print("Train loss: {} - acc: {} \nValidation loss: {} - acc: {}".
                  format(total_loss.data.float() / len(train_dataset), acc,
                         val_loss, val_acc))
            val_f1 = report["1"]["f1-score"]
            if best_f1 < val_f1:
                logger.green(f"New best F1 score at {val_f1}")
                best_f1 = val_f1
                # mkdir with exist_ok=True already tolerates an existing directory
                Path(f"models/mlp/{model_id}/{LABEL_TO_IX[label]}").mkdir(
                    parents=True, exist_ok=True)
                torch.save(
                    model.state_dict(),
                    f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/{LABEL_TO_IX[label]}.pt",
                )
                results[label] = report
                # open(..., "w") truncates an existing file, so no remove is needed
                with open(
                        f"models/mlp/{model_id}/{LABEL_TO_IX[label]}/results_{LABEL_TO_IX[label]}.json",
                        "w",
                ) as f:
                    json.dump(results, f)
        all_best_f1.append(best_f1)
    logger.green(f"Final mean F1: {statistics.mean(all_best_f1)}")
    with open(f"models/mlp/{model_id}/summary.txt", "w") as f:
        f.write(f"Mean F1: {str(statistics.mean(all_best_f1))}\n")
        for ix, score in enumerate(all_best_f1):
            f.write(f"{ix}: {score} \n")
        f.write("\n")
        f.write(f"HIDDEN_SIZE: {HIDDEN_SIZE}\n")
        f.write(f"ALPHA: {ALPHA}\n")
        f.write(f"NUM_EPOCHS: {NUM_EPOCHS}\n")
        f.write(f"POS_LOSS_WEIGHT: {POS_LOSS_WEIGHT}\n")
        f.write(f"DROPOUT: {DROPOUT}\n")
        f.write(f"TFIDF_WEIGHTS: {TFIDF_WEIGHTS}\n")
        if FEATS_TO_ADD:
            f.write(f"FEATS_TO_ADD: {FEATS_TO_ADD}\n")
            f.write(f"FEAT_ADD_SOFTENER: {FEAT_ADD_SOFTENER}\n")
        if USE_FEATS:
            f.write(f"USE_FEATS: {USE_FEATS}\n")
    mark_best_results()
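
For context, MLPClassifier(EMBED_DIM, HIDDEN_SIZE, DROPOUT) above is the project's own torch module, not sklearn's. A minimal sketch of a module matching that constructor signature and the single-logit output the training loop expects (the architecture below is an assumption, not the project's actual class):

import torch.nn as nn

class MLPClassifierSketch(nn.Module):
    def __init__(self, input_dim, hidden_size, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),  # one logit; BCEWithLogitsLoss applies the sigmoid
        )

    def forward(self, x):
        return self.net(x)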