import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FastICA
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture as GMM
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import SparseRandomProjection

# Project-local helpers (ToxicComments, ToxicModel, prepare,
# make_callback_dir, remove_callback_dir, get_abspath, save_array) are
# assumed to be importable from elsewhere in the repo.


def run():
    # Load data and build model
    batch_size = 10000
    n_features = 100
    tc = ToxicComments('tcc/data/train.csv', batch_size=batch_size)
    model = ToxicModel(n_features, 6)

    # Train model: recreate callback directories for each training iteration
    remove_callback_dir('events')
    remove_callback_dir('ckpts')
    iteration = 0
    for _, comments, labels in tc:
        comments = np.array(prepare(comments, n_features))
        labels = np.array(labels)
        iteration += 1
        events_dir = make_callback_dir('events', iteration)
        ckpts_dir = make_callback_dir('ckpts', iteration)
        model.train(comments, labels, callback_dirs=[events_dir, ckpts_dir])

    # Test model and save predictions for submission
    tc_test = ToxicComments('tcc/data/test.csv')
    ids, comments, _ = next(tc_test)
    comments = prepare(comments, n_features=n_features)
    predictions = model.predict(comments)
    save_array(ids, predictions, 'tcc/data/test_submission5.csv')
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a CSV
    file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label to determine if wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = pd.concat((wine_red, wine_white))
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').values
    y = df['class'].values
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='winequality.csv', subdir=tdir)
def pca_experiment(X, name, dims, evp):
    """Runs PCA on the specified dataset and saves the dataset projected onto
    the components whose cumulative explained variance exceeds the evp
    threshold (85% in these experiments).

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X)  # get principal components

    # cumulative explained variance greater than threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()

    # keep components up to the first one whose cumulative explained
    # variance ratio exceeds the threshold
    res = comps[:, :evrc.where(evrc > evp).idxmin()]
    evars = pd.concat((ev, evr, evrc), axis=1)

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
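# A minimal sketch (not part of the original experiments) of the component
# cutoff used above: `evrc.where(evrc > evp)` masks every cumulative ratio at
# or below the threshold to NaN, so `idxmin()` returns the 1-based index of
# the first component whose cumulative explained variance exceeds evp. The
# toy numbers below are made up for illustration.
def _demo_pca_cutoff():
    evrc = pd.Series([0.55, 0.75, 0.88, 0.95, 1.0], index=range(1, 6),
                     name='evr_cum')
    evp = 0.85
    k = evrc.where(evrc > evp).idxmin()  # first index past the threshold
    assert k == 3  # slicing comps[:, :3] keeps the first 3 components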
def rf_experiment(X, y, name, theta):
    """Run RF on specified dataset and saves feature importance metrics and
    best results CSV.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        theta (float): Min cumulative information gain threshold.

    """
    rfc = RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=0)
    fi = rfc.fit(X, y).feature_importances_

    # get feature importance and sort by value in descending order
    ranks = [n + 1 for n in range(len(fi))]
    fi = pd.DataFrame({'importance': fi, 'feature': ranks})
    fi.sort_values('importance', ascending=False, inplace=True)
    fi['i'] = ranks
    cumfi = fi['importance'].cumsum()
    fi['cumulative'] = cumfi

    # generate dataset that meets cumulative feature importance threshold
    idxs = fi.loc[:cumfi.where(cumfi > theta).idxmin(), :]
    idxs = list(idxs.index)
    reduced = X[:, idxs]

    # save results as CSV
    resdir = 'results/RF'
    fifile = get_abspath('{}_fi.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=reduced, filename=resfile, subdir=resdir)
    fi.to_csv(fifile, index_label=None)
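# An illustrative sketch (toy values, not from the original repo) of the
# slice used above: because `.loc` slicing is inclusive and `fi` is sorted by
# importance while keeping its original row labels,
# `fi.loc[:cumfi.where(cumfi > theta).idxmin(), :]` keeps every feature up to
# and including the first one whose cumulative importance crosses theta.
def _demo_fi_cutoff():
    fi = pd.DataFrame({'importance': [0.5, 0.3, 0.15, 0.05]})  # sorted desc
    cumfi = fi['importance'].cumsum()  # 0.50, 0.80, 0.95, 1.00
    theta = 0.9
    cut = cumfi.where(cumfi > theta).idxmin()  # label of first row past theta
    assert list(fi.loc[:cut, :].index) == [0, 1, 2]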
def nn_cluster_datasets(X, name, km_k, gmm_k):
    """Generates datasets for ANN classification by appending cluster label
    to original dataset.

    Args:
        X (Numpy.Array): Original attributes.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.

    """
    km = KMeans(n_clusters=km_k, random_state=0)
    gmm = GMM(n_components=gmm_k, random_state=0)
    km.fit(X)
    gmm.fit(X)

    # add cluster labels to original attributes
    km_x = np.concatenate((X, km.labels_[:, None]), axis=1)
    gmm_x = np.concatenate((X, gmm.predict(X)[:, None]), axis=1)

    # save results
    resdir = 'results/NN'
    kmfile = get_abspath('{}_km_labels.csv'.format(name), resdir)
    gmmfile = get_abspath('{}_gmm_labels.csv'.format(name), resdir)
    save_array(array=km_x, filename=kmfile, subdir=resdir)
    save_array(array=gmm_x, filename=gmmfile, subdir=resdir)
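# Hypothetical usage of nn_cluster_datasets; the dataset name and cluster
# counts are made up for illustration. This writes demo_km_labels.csv and
# demo_gmm_labels.csv under results/NN.
def _demo_nn_cluster_datasets():
    X_demo = np.random.RandomState(0).rand(100, 5)
    nn_cluster_datasets(X_demo, name='demo', km_k=3, gmm_k=3)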
def save_ica_results(X, name, dims):
    """Run ICA and save projected dataset as CSV.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.

    """
    # transform data using ICA
    ica = FastICA(random_state=0, max_iter=5000, n_components=dims)
    res = ica.fit_transform(X)

    # save results file
    resdir = 'results/ICA'
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
def save_rp_results(X, name, dims):
    """Run RP and save projected dataset as CSV.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.

    """
    # transform data using sparse random projection
    rp = SparseRandomProjection(random_state=0, n_components=dims)
    res = rp.fit_transform(X)

    # save results file
    resdir = 'results/RP'
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
def preprocess_seismic():
    """Cleans and generates seismic bumps dataset for experiments as a CSV
    file. Uses one-hot encoding for categorical features.

    """
    # get file path
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read arff file and convert to data frame
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using Pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols)

    # replace 0s with -1s to improve NN performance
    onehot_cols.replace(to_replace=[0], value=[-1], inplace=True)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((onehot_cols, df), axis=1)

    # drop columns that have only 1 unique value (features add no
    # information); iterate over a copy since columns are dropped in place
    for col in list(df.columns):
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # cast class column as integer
    df['class'] = df['class'].astype(int)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').values
    y = df['class'].values
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='seismic-bumps.csv', subdir=tdir)
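# A small illustration (toy frame, not the seismic data) of the one-hot plus
# {0 -> -1} recoding used above: get_dummies expands each category into its
# own indicator column, and replacing 0 with -1 gives the network symmetric
# inputs.
def _demo_onehot_recoding():
    cats = pd.DataFrame({'shift': ['W', 'N', 'W']})
    onehot = pd.get_dummies(cats, prefix=['shift']).astype(int)
    onehot.replace(to_replace=[0], value=[-1], inplace=True)
    # shift_N is [-1, 1, -1] and shift_W is [1, -1, 1]
    assert list(onehot['shift_W']) == [1, -1, 1]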
def pca_experiment(X, name, dims, evp):
    """Variant of pca_experiment that also records reconstruction loss. Runs
    PCA on the specified dataset and saves the dataset projected onto the
    components that explain at least the evp share of total variance, or 2
    components, whichever is larger.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    X_std = StandardScaler().fit_transform(X)
    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X_std)  # get principal components

    # cumulative explained variance greater than threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()
    res = comps[:, :max(evrc.where(evrc > evp).idxmin(), 2)]

    # mean squared reconstruction error for each number of components; the
    # projection uses the standardized data the PCA was fitted on
    error = []
    for k in range(1, dims + 1):
        print(k)
        pca_k = PCA(random_state=0, svd_solver='full', n_components=k)
        pca_k.fit(X_std)
        data_reduced = np.dot(X_std, pca_k.components_.T)
        error.append(
            ((X_std - np.dot(data_reduced, pca_k.components_)) ** 2).mean())
    evars = pd.concat((ev, evr, evrc), axis=1)
    evars['loss'] = error

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
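# A minimal cross-check (illustrative only, random data) that the manual
# reconstruction in the loop above matches sklearn's built-in
# PCA.inverse_transform: for standardized data the fitted mean is numerically
# zero, so projecting onto the components and back is the whole
# reconstruction.
def _demo_reconstruction_check():
    X = np.random.RandomState(0).rand(50, 6)
    X_std = StandardScaler().fit_transform(X)
    pca = PCA(random_state=0, svd_solver='full', n_components=3).fit(X_std)
    manual = np.dot(np.dot(X_std, pca.components_.T), pca.components_)
    builtin = pca.inverse_transform(pca.transform(X_std))
    assert np.allclose(manual, builtin)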