def token_frequency_distribution(name="train"):
    x_tr, _, _, _, _, _, _ = util.create_or_load_data(freq_threshold=0)

    # Count how often each token occurs across the training sentences.
    token_counter = collections.defaultdict(int)
    for x in x_tr:
        for token in x:
            token_counter[token] += 1

    # freq_counter[c]: total number of token occurrences contributed by tokens
    # that appear exactly c times.
    freq_counter = collections.defaultdict(int)
    total_count = 0
    for c in token_counter.values():
        freq_counter[c] += c
        total_count += c

    # Cumulative coverage: fraction of all occurrences accounted for by tokens
    # whose frequency is at most the cutoff (plotted up to a cutoff of 200).
    acc = 0
    acc_freq = []
    freq_list = []
    for freq, c in sorted(freq_counter.items()):
        if freq > 200:
            break
        freq_list.append(freq)
        acc += c
        acc_freq.append(acc / float(total_count))

    plt.plot(freq_list, acc_freq, label=name)
    plt.title("Token frequency distribution of train data")
    plt.ylabel("cutoff proportion")
    plt.xlabel("cutoff token frequency")
    plt.legend()
    plt.savefig("token_frequency_cutoff_f200_{}.png".format(name))
    plt.close()
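A minimal, self-contained sketch (toy data, hypothetical values) of the same cumulative-coverage computation, independent of util and matplotlib: for each frequency cutoff, the fraction of all token occurrences contributed by tokens that appear at most that many times.

import collections

# Toy corpus: "a" x5, "b" x2, "c" x2, "d" x1 -> 10 occurrences in total.
tokens = ["a"] * 5 + ["b"] * 2 + ["c"] * 2 + ["d"]

token_counter = collections.Counter(tokens)   # token -> occurrence count
freq_counter = collections.Counter()          # frequency -> occurrences at that frequency
for c in token_counter.values():
    freq_counter[c] += c
total = sum(freq_counter.values())

acc = 0
for freq in sorted(freq_counter):
    acc += freq_counter[freq]
    print(freq, acc / total)                  # prints: 1 0.1, then 2 0.5, then 5 1.0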
def label_correlation(name="train"):
    _, y_tr, _, y_va, _, _, _ = util.create_or_load_data(freq_threshold=0)

    df = pd.DataFrame(data=y_tr, columns=data_process.TAGS)

    corr = df.corr()
    size = 10
    fig, ax = plt.subplots(figsize=(size, size))
    cax = ax.matshow(corr, cmap="Spectral_r")
    plt.colorbar(cax)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.savefig("label_correlation_{}.png".format(name))
    plt.close()
def nb_onehot():
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(freq_threshold=50)

    Y_te_pred_list = []
    sum_auc_va = 0.0
    # Train one independent Bernoulli Naive Bayes classifier per tag (one-vs-rest).
    for i in range(Y_tr.shape[1]):
        nb = BernoulliNB()

        # Incremental training in mini-batches, one-hot encoding each batch on the fly.
        j = 0
        batch_size = 10000
        while j < len(X_tr):
            end = min(j + batch_size, len(X_tr))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_tr[j:end]]
            nb.partial_fit(batch, Y_tr[j:end, i], classes=[0, 1])
            j += batch_size

        logging.info("Finish training")

        Y_va_pred = []
        j = 0
        while j < len(X_va):
            end = min(j + batch_size, len(X_va))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_va[j:end]]
            # Keep only the probability of the positive class for AUC computation.
            Y_va_pred.extend(nb.predict_proba(batch)[:, 1])
            j += batch_size

        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: ".format(i) + str(auc_va))
        sum_auc_va += auc_va

        Y_te_pred = []
        j = 0
        while j < len(X_te):
            end = min(j + batch_size, len(X_te))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_te[j:end]]
            # Keep only the positive-class probability for the submission.
            Y_te_pred.extend(nb.predict_proba(batch)[:, 1])
            j += batch_size
        Y_te_pred_list.append(Y_te_pred)

    logging.info("Avg auc: {}".format(sum_auc_va / Y_tr.shape[1]))

    util.submission(Y_te_pred_list, id_list)
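A hypothetical stand-in for data_process.seq2onehot (assumed here to produce a binary bag-of-words vector over the vocabulary) demonstrating the partial_fit / predict_proba pattern used above on tiny data; the helper name and toy vocabulary are illustrative, not the project's actual implementation.

import numpy as np
from sklearn.naive_bayes import BernoulliNB

def seq2onehot_sketch(seq, vocab_size):
    # Binary bag-of-words: 1 if the token index occurs in the sequence, else 0.
    vec = np.zeros(vocab_size, dtype=np.int8)
    for idx in seq:
        vec[idx] = 1
    return vec

X = [[0, 1], [2], [0, 2]]   # token-index sequences over a 3-word vocabulary
y = [1, 0, 1]

nb = BernoulliNB()
nb.partial_fit([seq2onehot_sketch(s, 3) for s in X], y, classes=[0, 1])
print(nb.predict_proba([seq2onehot_sketch([0], 3)])[:, 1])   # P(class 1) for one new sample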
def sentence_length_distribution():
    freq = 10
    x_tr, y_tr, x_va, y_va, dic, x_te, id_list = util.create_or_load_data(
        freq_threshold=freq)
    # Build the train and validation counters as separate objects; using
    # `[collections.defaultdict(...)] * 2` would alias the same dict for both.
    counter_tr = collections.defaultdict(lambda: collections.defaultdict(int))
    counter_va = collections.defaultdict(lambda: collections.defaultdict(int))
    for x, y in zip(x_tr, y_tr):
        for i in range(len(y)):
            counter_tr["{}_{}".format(i, y[i])][len(x)] += 1

    for x, y in zip(x_va, y_va):
        for i in range(len(y)):
            counter_va["{}_{}".format(i, y[i])][len(x)] += 1

    lengths_tr = collections.defaultdict(list)
    counts_tr = collections.defaultdict(list)
    for k in counter_tr:
        for length, c in sorted(counter_tr[k].items()):
            lengths_tr[k].append(length)
            counts_tr[k].append(c)

    bins = range(0, 2000, 20)
    plt.hist(lengths_tr["0_0"],
             bins=bins,
             weights=counts_tr["0_0"],
             label="non-toxic train")
    plt.hist(lengths_tr["0_1"],
             bins=bins,
             weights=counts_tr["0_1"],
             label="toxic train")
    plt.title("sentence length distribution of train data")
    plt.ylabel("count")
    plt.xlabel("length")
    plt.yscale("log")
    plt.legend()
    plt.savefig("sentence_length_distribution_{}_train.png".format(freq))
    plt.close()
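A quick illustration of why counter_tr and counter_va are built as separate objects above rather than via list repetition: `[collections.defaultdict(...)] * 2` repeats a reference to one dict, so train and validation counts would silently share state.

import collections

a, b = [collections.defaultdict(int)] * 2   # both names alias the same dict
a["x"] += 1
print(b["x"])                               # 1 -- the update is visible through both names

c = collections.defaultdict(int)            # two independent dicts
d = collections.defaultdict(int)
c["x"] += 1
print(d["x"])                               # 0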
Example #5
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam

import logging
import util
from model_lstm import MAX_SENTENCE_LEN

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(
        freq_threshold=10)
    idx2token = {v: k for k, v in dictionary.items()}

    X_tr = pad_sequences(X_tr, MAX_SENTENCE_LEN, truncating='post')
    X_va = pad_sequences(X_va, MAX_SENTENCE_LEN, truncating='post')
    X_te = pad_sequences(X_te, MAX_SENTENCE_LEN, truncating='post')

    model_path = "./save/lstm_100.model"

    lstm = keras.models.load_model(model_path)
    lstm.get_layer("word_embedding").trainable = True
    lstm.compile(optimizer=Adam(lr=0.00001),
                 loss="binary_crossentropy",
                 metrics=["accuracy"])
    lstm.summary()

    lstm.fit(X_tr, Y_tr, epochs=2, batch_size=64, validation_data=[X_va, Y_va])
    lstm.save(model_path + ".ft")
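A quick check of the pad_sequences behavior relied on above: by default it left-pads short sequences with zeros, and truncating='post' drops tokens from the end of sequences longer than maxlen.

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2], [3, 4, 5, 6, 7]]
print(pad_sequences(seqs, maxlen=4, truncating='post'))
# [[0 0 1 2]
#  [3 4 5 6]]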