Example #1
import numpy as np

from Utils import pickle

Y_train = np.load("Objects/Y_train.npy")
Y_test = np.load("Objects/Y_test.npy")

Y_test[Y_test >= 0.5] = 1
Y_test[Y_test < 0.5] = 0

Y_train = (Y_train - np.mean(Y_train, axis=0, keepdims=True)) / np.std(
    Y_train, axis=0, keepdims=True)
Y_test = (Y_test - np.mean(Y_test, axis=0, keepdims=True)) / np.std(
    Y_test, axis=0, keepdims=True)

R_train = Y_train.T.dot(Y_train) / Y_train.shape[0]
R_test = Y_test.T.dot(Y_test) / Y_test.shape[0]

labels = pickle.load("Objects/labels.list")

file = open("Out/corrs.txt", "w")
file.write(
    "Correlations in training set and in test predictions respectively:\n\n")

D = {}
for i in range(R_train.shape[0]):
    for j in range(i + 1, R_train.shape[1]):
        D.update({labels[i] + ", " + labels[j]: [R_train[i, j], R_test[i, j]]})
        file.write(labels[i] + ", " + labels[j] + ": " +
                   str(np.round(R_train[i, j], 5)) + ", " +
                   str(np.round(R_test[i, j], 5)) + "\n")

file.close()

pickle.save("Objects/corrs.dict", D)
Example #2
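# The top of this script is truncated. A minimal sketch of the setup and of the
# opening of text_to_vector that the remaining lines appear to assume; the
# fastText model path and window_length value are illustrative placeholders,
# and the whitespace tokenization is an assumption rather than the author's
# exact preprocessing.
import numpy as np
import pandas as pd
import fasttext
from Utils import pickle

train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")

ft_model = fasttext.load_model("Objects/fasttext_model.bin")  # hypothetical model path
n_features = ft_model.get_dimension()  # dimensionality of one fastText word vector
window_length = 200  # assumed cap on the number of words kept per comment


def text_to_vector(text):
    """
    Convert a single comment to a (window_length, n_features) array of fastText
    word vectors, keeping at most the last window_length words.
    """
    words = text.split()[-window_length:]  # assumed: simple whitespace split
    x = np.zeros((window_length, n_features), dtype='float32')
    for i, word in enumerate(words):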
        x[i, :] = ft_model.get_word_vector(word).astype('float32')
    return x


def df_to_data(df):
    """
	Convert a given dataframe to a dataset of inputs for the NN.
	"""
    x = np.zeros((len(df), window_length, n_features), dtype='float32')
    for i, comment in enumerate(df['comment_text'].values):
        x[i, :] = text_to_vector(comment)
    return x


print("Building embedded data . . . ")
X_train = df_to_data(train)
X_test = df_to_data(test)

labels = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]
Y_train = train[labels].values

print("Saving embedded data, target values, and labels . . . ")
np.save("Objects/X_train.npy", X_train)
np.save("Objects/X_test.npy", X_test)
np.save("Objects/Y_train.npy", Y_train)
pickle.save("Objects/labels.list", labels)

print("Done!")
Example #3
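# The top of this script is truncated. A minimal sketch of the setup the
# remaining lines appear to assume: the TF-IDF features and targets saved by
# the preprocessing script are loaded, and a held-out validation split (the
# X_test / Y_test used below) is carved out of the training data. The split is
# an illustrative assumption; the author may have produced these arrays
# differently.
import time

import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from Utils import pickle

time_0 = time.time()

X_train = sp.sparse.load_npz("Objects/X_train.npz")
Y_train = np.load("Objects/Y_train.npy")
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=0)  # assumed validation split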
num_tasks = Y_train.shape[1]
time_1 = time.time()
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(X_train, Y_train)
time_2 = time.time()

results = {}
results.update({"train_time": time_2-time_1})
pred = classifier.predict(X_train)
results.update({"acc": np.array([accuracy_score(Y_train[:,i], pred[:,i]) for i in range(num_tasks)])})
results.update({"acc_mean": np.mean(results["acc"])})
results.update({"roc": np.array([roc_auc_score(Y_train[:,i], pred[:,i]) for i in range(num_tasks)])})
results.update({"roc_mean": np.mean(results["roc"])})
pred = classifier.predict(X_test)
results.update({"val_acc": np.array([accuracy_score(Y_test[:,i], pred[:,i]) for i in range(num_tasks)])})
results.update({"val_acc_mean": np.mean(results["val_acc"])})
results.update({"val_roc": np.array([roc_auc_score(Y_test[:,i], pred[:,i]) for i in range(num_tasks)])})
results.update({"val_roc_mean": np.mean(results["val_roc"])})


#nrows = 10
X_test = sp.sparse.load_npz("Objects/X_test.npz")  # [:nrows,:X_train.shape[1]]
Y_test = classifier.predict(X_test)
submission = pd.read_csv("../Data/sample_submission.csv")  # , nrows=nrows)
submission.iloc[:,1:] = Y_test
submission.to_csv("Objects/submission.csv", index=False)

results.update({"script_time": time.time()-time_0})

pickle.save("Objects/results.dict", results)
Example #4
import numpy as np
from Utils import pickle

Y_train = np.load("Objects/Y_train.npy")
Y_test = np.load("Objects/Y_test.npy")

Y_test[Y_test>=0.5] = 1
Y_test[Y_test<0.5] = 0

p_train = np.mean(Y_train, axis=0)
p_test = np.mean(Y_test, axis=0)

labels = pickle.load("Objects/labels.list")

file = open("Out/imbalance.txt", "w")
file.write("Percentage of comments toxic in training set and in test predictions respectively:\n\n")

D = {}
for i in range(p_train.shape[0]):
    D.update({labels[i]: [p_train[i], p_test[i]]})
    file.write(labels[i] + ": " + str(np.round(p_train[i], 5)) + ", " +
               str(np.round(p_test[i], 5)) + "\n")

file.close()

pickle.save("Objects/imbalance.dict", D)
Example #5
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from Utils import pickle

data = pd.read_csv("../Data/train.csv")
comments = data["comment_text"].tolist()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200000, min_df=5)
X_train = vectorizer.fit_transform(comments)

sp.sparse.save_npz("Objects/X_train.npz", X_train)

Y_train = data.iloc[:, 2:].values  # the six label columns

np.save("Objects/Y_train.npy", Y_train)

data = pd.read_csv("../Data/test.csv")
comments = data["comment_text"].tolist()

X_test = vectorizer.transform(comments)

sp.sparse.save_npz("Objects/X_test.npz", X_test)

pickle.save("Objects/tdidf.tok", vectorizer)

labels = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

pickle.save("Objects/labels.list", labels)