Example #1
def run():
    # Load data and build model
    batch_size = 10000
    n_features = 100
    tc = ToxicComments('tcc/data/train.csv', batch_size=batch_size)
    model = ToxicModel(n_features, 6)

    # Train model
    remove_callback_dir('events')
    remove_callback_dir('ckpts')
    iteration = 0
    for _, comments, labels in tc:
        comments = np.array(prepare(comments, n_features))
        labels = np.array(labels)

        iteration += 1
        events_dir = make_callback_dir('events', iteration)
        ckpts_dir = make_callback_dir('ckpts', iteration)

        model.train(comments, labels, callback_dirs=[events_dir, ckpts_dir])

    # Test model
    tc_test = ToxicComments('tcc/data/test.csv')
    ids, comments, _ = next(tc_test)
    comments = prepare(comments, n_features=n_features)
    predictions = model.predict(comments)
    save_array(ids, predictions, 'tcc/data/test_submission5.csv')
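
The example above relies on project-specific helpers that are not shown (ToxicComments, ToxicModel, prepare, remove_callback_dir, make_callback_dir, save_array). A minimal sketch of the two callback-directory helpers, assuming they only manage per-iteration output folders for training callbacks such as event logs and checkpoints; the base path and layout here are hypothetical:

import os
import shutil


def remove_callback_dir(name, base='tcc/callbacks'):
    # delete a previous run's callback directory tree, if any (hypothetical helper)
    path = os.path.join(base, name)
    if os.path.isdir(path):
        shutil.rmtree(path)


def make_callback_dir(name, iteration, base='tcc/callbacks'):
    # create and return a per-iteration directory for event logs or checkpoints
    path = os.path.join(base, name, 'iter_{}'.format(iteration))
    os.makedirs(path, exist_ok=True)
    return path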
Example #2
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode an artificial label indicating whether the wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = pd.concat([wine_red, wine_white])
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').to_numpy()
    y = df['class'].to_numpy()
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='winequality.csv', subdir=tdir)
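
Most of the remaining examples call two I/O helpers, get_abspath and save_array, whose implementations are not shown. A minimal sketch inferred from the call sites (Example #1 calls save_array with a different argument list, which this sketch does not cover):

import os

import numpy as np


def get_abspath(filename, subdir):
    # resolve filename relative to subdir (assumed helper)
    return os.path.abspath(os.path.join(subdir, filename))


def save_array(array, filename, subdir):
    # write a NumPy array to CSV under subdir; os.path.join leaves filename
    # untouched when it is already an absolute path, as in the later examples
    os.makedirs(subdir, exist_ok=True)
    np.savetxt(os.path.join(subdir, filename), np.asarray(array), delimiter=',')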
Example #3
def pca_experiment(X, name, dims, evp):
    """Run PCA on specified dataset and saves dataset with components that
    explain at least 85% of total variance.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X)  # get principal components

    # cumulative explained variance greater than threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()
    res = comps[:, :evrc.where(evrc > evp).idxmin()]
    evars = pd.concat((ev, evr, evrc), axis=1)

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
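
The number of retained components comes from evrc.where(evrc > evp).idxmin(): masking the values at or below the threshold and taking idxmin returns the 1-based index of the first component at which the cumulative explained-variance ratio crosses evp. A small worked example with made-up ratios:

import pandas as pd

evrc = pd.Series([0.52, 0.74, 0.88, 0.95, 1.00], index=range(1, 6), name='evr_cum')
k = evrc.where(evrc > 0.85).idxmin()  # values <= 0.85 become NaN, so idxmin() returns 3
print(k)  # 3, i.e. comps[:, :3] keeps the first three principal components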
Example #4
def rf_experiment(X, y, name, theta):
    """Run RF on specified dataset and saves feature importance metrics and best
    results CSV.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        theta (float): Min cumulative information gain threshold.

    """
    rfc = RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=0)
    fi = rfc.fit(X, y).feature_importances_

    # get feature importance and sort by value in descending order
    i = [i + 1 for i in range(len(fi))]
    fi = pd.DataFrame({'importance': fi, 'feature': i})
    fi.sort_values('importance', ascending=False, inplace=True)
    fi['i'] = i
    cumfi = fi['importance'].cumsum()
    fi['cumulative'] = cumfi

    # generate dataset that meets cumulative feature importance threshold
    idxs = fi.loc[:cumfi.where(cumfi > theta).idxmin(), :]
    idxs = list(idxs.index)
    reduced = X[:, idxs]

    # save results as CSV
    resdir = 'results/RF'
    fifile = get_abspath('{}_fi.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=reduced, filename=resfile, subdir=resdir)
    fi.to_csv(fifile, index_label=None)
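
A hypothetical usage, assuming the winequality CSV written by preprocess_winequality() above (class label in the last column) and an illustrative importance threshold:

import numpy as np

data = np.loadtxt('data/experiments/winequality.csv', delimiter=',')
X, y = data[:, :-1], data[:, -1]
rf_experiment(X, y, name='winequality', theta=0.9)  # keep features covering 90% of importance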
Example #5
def nn_cluster_datasets(X, name, km_k, gmm_k):
    """Generates datasets for ANN classification by appending cluster label to
    original dataset.

    Args:
        X (Numpy.Array): Original attributes.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.

    """
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km.fit(X)
    gmm.fit(X)

    # add cluster labels to original attributes
    km_x = np.concatenate((X, km.labels_[:, None]), axis=1)
    gmm_x = np.concatenate((X, gmm.predict(X)[:, None]), axis=1)

    # save results
    resdir = 'results/NN'
    kmfile = get_abspath('{}_km_labels.csv'.format(name), resdir)
    gmmfile = get_abspath('{}_gmm_labels.csv'.format(name), resdir)
    save_array(array=km_x, filename=kmfile, subdir=resdir)
    save_array(array=gmm_x, filename=gmmfile, subdir=resdir)
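
KMeans and GMM above are assumed to be the scikit-learn estimators, with GMM aliasing GaussianMixture; a sketch of the assumed module-level imports plus an illustrative call (cluster counts are arbitrary):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

data = np.loadtxt('data/experiments/winequality.csv', delimiter=',')
nn_cluster_datasets(data[:, :-1], name='winequality', km_k=5, gmm_k=5)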
Example #6
def save_ica_results(X, name, dims):
    """Run ICA and save projected dataset as CSV.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.

    """
    # transform data using ICA
    ica = FastICA(random_state=0, max_iter=5000, n_components=dims)
    res = ica.fit_transform(X)

    # save results file
    resdir = 'results/ICA'
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
Example #7
def save_rp_results(X, name, dims):
    """Run RP and save projected dataset as CSV.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.

    """
    # transform data using sparse random projection
    rp = SparseRandomProjection(random_state=0, n_components=dims)
    res = rp.fit_transform(X)

    # save results file
    resdir = 'results/RP'
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
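
As an aside on choosing dims for the random projection (not part of the original experiment), scikit-learn's johnson_lindenstrauss_min_dim gives the dimensionality needed to keep pairwise distances within a distortion eps:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# e.g. roughly 6,500 samples and 10% allowed distortion
print(johnson_lindenstrauss_min_dim(n_samples=6500, eps=0.1))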
Example #8
def preprocess_seismic():
    """Cleans and generates seismic bumps dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.

    """
    # get file path
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read arff file and convert to record array
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using Pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols).astype(int)

    # replace 0s with -1s to improve NN performance
    onehot_cols.replace(to_replace=[0], value=[-1], inplace=True)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((onehot_cols, df), axis=1)

    # drop columns that have only 1 unique value (features add no information)
    for col in df.columns:
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # cast class column as integer
    df['class'] = df['class'].astype(int)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').to_numpy()
    y = df['class'].to_numpy()
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='seismic-bumps.csv', subdir=tdir)
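
A toy illustration of the one-hot-plus-minus-one idiom used above; the cast to int keeps the 0 to -1 replacement numeric across pandas versions:

import pandas as pd

cats = pd.DataFrame({'shift': ['W', 'N', 'W']})
onehot = pd.get_dummies(cats, prefix=['shift']).astype(int)
onehot.replace(to_replace=[0], value=[-1], inplace=True)
print(onehot)
#    shift_N  shift_W
# 0       -1        1
# 1        1       -1
# 2       -1        1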
Example #9
def pca_experiment(X, name, dims, evp):
    """Run PCA on specified dataset and saves dataset with components that
    explain at least 85% of total variance or 2 components which ever is larger

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    # scale once (zero mean, unit variance) and reuse for every PCA fit
    X_scaled = StandardScaler().fit_transform(X)

    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X_scaled)  # get principal components

    # cumulative explained variance greater than threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()
    res = comps[:, :max(evrc.where(evrc > evp).idxmin(), 2)]

    # reconstruction loss for each number of retained components
    error = []
    for n in range(1, dims + 1):
        print(n)
        pca_n = PCA(random_state=0, svd_solver='full', n_components=n)
        pca_n.fit(X_scaled)
        reduced = np.dot(X_scaled, pca_n.components_.T)
        error.append(((X_scaled - np.dot(reduced, pca_n.components_)) ** 2).mean())

    evars = pd.concat((ev, evr, evrc), axis=1)
    evars['loss'] = error

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
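
The reconstruction loss in the loop can be cross-checked with scikit-learn's inverse_transform, which should agree when the input is standardized (zero mean). A sketch with an illustrative component count, assuming the winequality CSV from the earlier preprocessing step:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data = np.loadtxt('data/experiments/winequality.csv', delimiter=',')
X_scaled = StandardScaler().fit_transform(data[:, :-1])
pca = PCA(random_state=0, svd_solver='full', n_components=3)  # illustrative n_components
proj = pca.fit_transform(X_scaled)
loss = ((X_scaled - pca.inverse_transform(proj)) ** 2).mean()
print(loss)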