import collections
import logging

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.naive_bayes import BernoulliNB

import data_process
import util


def token_frequency_distribution(name="train"):
    x_tr, _, _, _, _, _, _ = util.create_or_load_data(freq_threshold=0)
    # Count how often each token appears in the training split.
    token_counter = collections.defaultdict(int)
    for x in x_tr:
        for token in x:
            token_counter[token] += 1
    # freq_counter[f] = total number of token occurrences contributed by
    # tokens that appear exactly f times.
    freq_counter = collections.defaultdict(int)
    total_count = 0
    for c in token_counter.values():
        freq_counter[c] += c
        total_count += c
    # Cumulative share of token occurrences that a frequency cutoff would discard.
    acc = 0
    acc_freq = []
    freq_list = []
    for freq, c in sorted(freq_counter.items()):
        if freq > 200:
            break
        freq_list.append(freq)
        acc += c
        acc_freq.append(acc / float(total_count))
    plt.plot(freq_list, acc_freq, label=name)
    plt.title("Token frequency distribution of train data")
    plt.ylabel("cutoff proportion")
    plt.xlabel("cutoff token frequency")
    plt.legend()
    plt.savefig("token_frequency_cutoff_f200_{}.png".format(name))
    plt.close()
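def _token_coverage_demo():
    # Hypothetical toy illustration (not used elsewhere) of the cumulative
    # coverage plotted above: with counts {"a": 3, "b": 1, "c": 1}, dropping
    # tokens that appear at most once discards 2 of 5 occurrences, i.e. a
    # cutoff proportion of 0.4.
    token_counter = {"a": 3, "b": 1, "c": 1}
    total = sum(token_counter.values())
    dropped = sum(c for c in token_counter.values() if c <= 1)
    return dropped / float(total)  # 0.4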
def label_correlation(name="train"):
    _, y_tr, _, y_va, _, _, _ = util.create_or_load_data(freq_threshold=0)
    # Pairwise correlation between the label columns of the training split.
    df = pd.DataFrame(data=y_tr, columns=data_process.TAGS)
    corr = df.corr()
    size = 10
    fig, ax = plt.subplots(figsize=(size, size))
    cax = ax.matshow(corr, cmap="Spectral_r")
    plt.colorbar(cax)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    # No legend here: the heatmap has no labelled artists, so plt.legend()
    # would only emit a warning.
    plt.savefig("label_correlation_{}.png".format(name))
    plt.close()
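def _label_correlation_demo():
    # Hypothetical sketch of the same computation on a toy label matrix; the
    # real column names come from data_process.TAGS, which is not shown here.
    import numpy as np
    toy = pd.DataFrame(np.random.randint(0, 2, size=(100, 3)),
                       columns=["tag_a", "tag_b", "tag_c"])
    return toy.corr()  # pairwise Pearson correlation, as plotted above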
def nb_onehot():
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(
        freq_threshold=50)
    Y_te_pred_list = []
    sum_auc_va = 0.0
    batch_size = 10000
    # One Bernoulli Naive Bayes model per tag, trained incrementally so the
    # one-hot batches never have to fit in memory all at once.
    for i in range(Y_tr.shape[1]):
        nb = BernoulliNB()
        j = 0
        while j < len(X_tr):
            end = min(j + batch_size, len(X_tr))  # was len(X_tr) - 1, which dropped the last sample
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_tr[j:end]]
            nb.partial_fit(batch, Y_tr[j:end, i], classes=[0, 1])
            j += batch_size
        logging.info("Finish training")
        # Validation predictions: keep only the positive-class probability.
        Y_va_pred = []
        j = 0
        while j < len(X_va):
            end = min(j + batch_size, len(X_va))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_va[j:end]]
            Y_va_pred.extend(nb.predict_proba(batch)[:, 1])
            j += batch_size
        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: {}".format(i, auc_va))
        sum_auc_va += auc_va
        # Test predictions for the submission file.
        Y_te_pred = []
        j = 0
        while j < len(X_te):
            end = min(j + batch_size, len(X_te))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_te[j:end]]
            Y_te_pred.extend(nb.predict_proba(batch)[:, 1])
            j += batch_size
        Y_te_pred_list.append(Y_te_pred)
    logging.info("Avg auc: {}".format(sum_auc_va / Y_tr.shape[1]))
    util.submission(Y_te_pred_list, id_list)
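def _seq2onehot_sketch(seq, dictionary):
    # Assumed behaviour of data_process.seq2onehot, whose source is not shown
    # here: a binary presence/absence vector over the vocabulary, which is the
    # input BernoulliNB expects. Illustrative sketch only, not the actual
    # implementation.
    import numpy as np
    vec = np.zeros(len(dictionary), dtype=np.int8)
    for token_id in seq:
        if 0 <= token_id < len(dictionary):
            vec[token_id] = 1  # presence only; counts are discarded
    return vec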
def sentence_length_distribution():
    freq = 10
    x_tr, y_tr, x_va, y_va, dic, x_te, id_list = util.create_or_load_data(
        freq_threshold=freq)
    # Build the counters separately: "[defaultdict(...)] * 2" would bind both
    # names to the same dict.
    counter_tr = collections.defaultdict(lambda: collections.defaultdict(int))
    counter_va = collections.defaultdict(lambda: collections.defaultdict(int))
    # counter["{tag}_{label}"][sentence_length] = number of examples
    for x, y in zip(x_tr, y_tr):
        for i in range(len(y)):
            counter_tr["{}_{}".format(i, y[i])][len(x)] += 1
    for x, y in zip(x_va, y_va):
        for i in range(len(y)):
            counter_va["{}_{}".format(i, y[i])][len(x)] += 1
    lengths_tr = collections.defaultdict(list)
    counts_tr = collections.defaultdict(list)
    for k in counter_tr:
        for length, c in sorted(counter_tr[k].items()):
            lengths_tr[k].append(length)
            counts_tr[k].append(c)
    bins = range(0, 2000, 20)
    plt.hist(lengths_tr["0_0"], bins=bins, weights=counts_tr["0_0"],
             label="non-toxic train")
    plt.hist(lengths_tr["0_1"], bins=bins, weights=counts_tr["0_1"],
             label="toxic train")
    plt.title("sentence length distribution of train data")
    plt.ylabel("count")
    plt.xlabel("length")
    plt.yscale("log")
    plt.legend()
    plt.savefig("sentence_length_distribution_{}_train.png".format(freq))
    plt.close()
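def _defaultdict_aliasing_note():
    # Why counter_tr and counter_va above are built separately: multiplying a
    # one-element list binds both names to the SAME defaultdict.
    a, b = [collections.defaultdict(int)] * 2
    a["x"] += 1
    assert b["x"] == 1  # an update through `a` is visible through `b`
    c, d = (collections.defaultdict(int) for _ in range(2))
    c["x"] += 1
    assert d["x"] == 0  # independent dicts, as used in sentence_length_distribution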
import logging

import keras
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

import util
from model_lstm import MAX_SENTENCE_LEN

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(
        freq_threshold=10)
    idx2token = {v: k for k, v in dictionary.items()}  # reverse lookup for inspecting examples
    # Pad or truncate every sequence to the fixed LSTM input length.
    X_tr = pad_sequences(X_tr, MAX_SENTENCE_LEN, truncating='post')
    X_va = pad_sequences(X_va, MAX_SENTENCE_LEN, truncating='post')
    X_te = pad_sequences(X_te, MAX_SENTENCE_LEN, truncating='post')

    # Load the pre-trained LSTM, unfreeze the embedding layer, and fine-tune it
    # with a much smaller learning rate.
    model_path = "./save/lstm_100.model"
    lstm = keras.models.load_model(model_path)
    lstm.get_layer("word_embedding").trainable = True
    lstm.compile(optimizer=Adam(lr=0.00001),
                 loss="binary_crossentropy",
                 metrics=["accuracy"])
    lstm.summary()
    lstm.fit(X_tr, Y_tr, epochs=2, batch_size=64,
             validation_data=(X_va, Y_va))
    lstm.save(model_path + ".ft")
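    # Hypothetical follow-up (not in the original script): generate test
    # predictions from the fine-tuned model. It assumes the LSTM has one
    # sigmoid output per tag and that util.submission takes one prediction
    # column per tag plus the id list, mirroring its use in the NB baseline.
    lstm_ft = keras.models.load_model(model_path + ".ft")
    Y_te_pred = lstm_ft.predict(X_te, batch_size=256)
    util.submission([Y_te_pred[:, i] for i in range(Y_te_pred.shape[1])], id_list)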