Exemplo n.º 1
0
def fish_market():
    """Cluster the fish-market data with k-means and report on the fit.

    Reads ``data/fish-market/full.csv``, encodes every row's species as its
    position in the ``value_counts()`` index of the ``Species`` column, holds
    out 25% of the rows, and fits a 3-cluster k-means model on the scaled
    training rows.  Prints the CPU time spent fitting (in milliseconds) and
    the accuracy score of the predicted cluster ids against the encoded
    species, then hands the model to ``plot_clusters``.
    """
    frame = pd.read_csv('data/fish-market/full.csv')

    # A species' integer label is its position in the value_counts() index.
    ranked_species = frame['Species'].value_counts().index.tolist()
    features = frame.drop('Species', axis=1).values
    labels = [ranked_species.index(name) for name in frame['Species']]

    # Hold out a quarter of the rows, then scale both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only the model fit is timed (process CPU time, not wall-clock time).
    fit_started = time.process_time()
    k = 3
    kmeans = KMeans(n_clusters=k).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('Fish market [k-means] time (ms):', elapsed * 1000)

    # Score the cluster ids predicted for the held-out rows.
    assigned = kmeans.predict(x_test)
    km_as = accuracy_score(assigned, y_test)
    print("k-means clustering accuracy score: ", km_as)

    plot_clusters(k, x_test, kmeans, 'clustering/k-means/fish-market')
Exemplo n.º 2
0
def red_wine_quality():
    """Cluster the red-wine-quality data with k-means and report on the fit.

    Reads ``data/red-wine-quality/full.csv``, buckets the ``quality`` column
    into 'bad' / 'good' and integer-encodes the buckets, holds out 20% of the
    rows, and fits a 2-cluster k-means model on the scaled training rows.
    Prints the CPU time spent fitting (in milliseconds) and the accuracy score
    of the predicted cluster ids against the encoded quality, then hands the
    model to ``plot_clusters``.
    """
    wine = pd.read_csv('data/red-wine-quality/full.csv')

    # Two buckets split at 5.5, replaced in place by their integer codes.
    quality_bucket = pd.cut(wine['quality'],
                            bins=[2, 5.5, 8],
                            labels=['bad', 'good'])
    wine['quality'] = LabelEncoder().fit_transform(quality_bucket)

    features = wine.drop('quality', axis=1).values
    target = wine['quality']

    # Hold out a fifth of the rows, then scale both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only the model fit is timed (process CPU time, not wall-clock time).
    fit_started = time.process_time()
    k = 2
    kmeans = KMeans(n_clusters=k).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('RWQ [k-means] time (ms):', elapsed * 1000)

    # Score the cluster ids predicted for the held-out rows.
    assigned = kmeans.predict(x_test)
    km_as = accuracy_score(assigned, y_test)
    print("k-means clustering accuracy score: ", km_as)

    plot_clusters(k, x_test, kmeans, 'clustering/k-means/red-wine')
Exemplo n.º 3
0
def fish_market():
    """Cluster the fish-market data with a Gaussian mixture (EM) and report.

    Reads ``data/fish-market/full.csv``, encodes every row's species as its
    position in the ``value_counts()`` index of the ``Species`` column, holds
    out 25% of the rows, and fits a 3-component Gaussian mixture on the scaled
    training rows.  Prints the CPU time spent fitting (in milliseconds) and
    the accuracy score of the predicted component ids against the encoded
    species, then hands the model to ``plot_clusters``.
    """
    frame = pd.read_csv('data/fish-market/full.csv')

    # A species' integer label is its position in the value_counts() index.
    ranked_species = frame['Species'].value_counts().index.tolist()
    features = frame.drop('Species', axis=1).values
    labels = [ranked_species.index(name) for name in frame['Species']]

    # Hold out a quarter of the rows, then scale both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Only the model fit is timed (process CPU time, not wall-clock time).
    fit_started = time.process_time()
    k = 3
    em = GaussianMixture(n_components=k).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('Fish market [EM] time (ms):', elapsed * 1000)

    # Score the component ids predicted for the held-out rows.
    assigned = em.predict(x_test)
    em_as = accuracy_score(assigned, y_test)
    print("EM clustering accuracy score: ", em_as)

    plot_clusters(k, x_test, em, 'clustering/em/fish-market')
Exemplo n.º 4
0
def plot_nn(x, y, suffix=''):
    """Plot training and testing accuracy of a series of neural networks.

    Holds out 20% of ``(x, y)``, scales both partitions, asks ``create_nns``
    for models built from the training data and a range of iteration counts,
    and records each model's accuracy on the training and testing partitions.
    The two accuracy curves are drawn with ``Plotter`` and saved.

    Args:
        x: Feature matrix passed straight to ``train_test_split``.
        y: Targets passed straight to ``train_test_split``.
        suffix: Text appended to the plot name ``'red-wine'``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    # Iteration counts 1, 51, 101, ... (every 50th value below 1100).
    iter_counts = np.arange(1, 1100, 50, dtype=int)

    # presumably one model per iteration count - verify against create_nns
    nns = create_nns(x_train, y_train, iter_counts)

    # One (train accuracy, test accuracy) pair per model, in model order.
    score_pairs = [
        (accuracy_score(nn.predict(x_train), y_train),
         accuracy_score(nn.predict(x_test), y_test))
        for nn in nns
    ]
    accs_train = [on_train for on_train, _ in score_pairs]
    accs_test = [on_test for _, on_test in score_pairs]

    axis_titles = {
        'x': 'Number of weight updates',
        'y': 'Accuracy score'
    }
    plot = Plotter(name='red-wine{}'.format(suffix),
                   learner='nn',
                   axes=axis_titles)
    curves = ((accs_train, 'training data'), (accs_test, 'testing data'))
    for scores, legend in curves:
        plot.add_plot(iter_counts, scores, legend, 'None')
    plot.find_max(iter_counts, accs_test, 'testing')
    plot.save()
Exemplo n.º 5
0
def fish_market():
    """Fit two 5-nearest-neighbour classifiers on the fish-market data.

    The first classifier predicts ``Species`` from every other column.  The
    second predicts a three-way weight class ('light' / 'avg' / 'heavy',
    integer-encoded) from every column except ``Species`` and ``Weight``.
    Each problem gets its own 75/25 split and scaling; the accuracy score of
    each classifier on its held-out rows is printed.
    """
    np.random.seed(93)

    # The CSV is loaded twice so each problem works on its own frame.
    csv_path = 'data/fish-market/full.csv'
    species_frame = pd.read_csv(csv_path)
    weight_frame = pd.read_csv(csv_path)

    # Problem 1: species from all remaining columns.
    x1 = species_frame.drop('Species', axis=1).values
    y1 = species_frame['Species']

    # Problem 2: weight class from the columns that are neither species nor
    # weight.  The features are extracted before 'Weight' is overwritten.
    x2 = weight_frame.drop(['Species', 'Weight'], axis=1).values
    weight_class = pd.cut(weight_frame['Weight'],
                          bins=[0, 120, 650, 1650],
                          labels=['light', 'avg', 'heavy'])
    # presumably astype(str) is there so weights outside the bins (NaN after
    # the cut) are encoded as a class of their own - TODO confirm
    weight_frame['Weight'] = LabelEncoder().fit_transform(
        weight_class.astype(str))
    y2 = weight_frame['Weight']

    x_train1, x_test1, y_train1, y_test1 = train_test_split(
        x1, y1, test_size=0.25, random_state=1)
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        x2, y2, test_size=0.25, random_state=1)
    x_train1, x_test1 = scale_data(x_train1, x_test1)
    x_train2, x_test2 = scale_data(x_train2, x_test2)

    k1 = KNeighborsClassifier(n_neighbors=5).fit(x_train1, y_train1)
    predicted1 = k1.predict(x_test1)
    k1_as = accuracy_score(predicted1, y_test1)
    print("k1 accuracy score: ", k1_as)

    k2 = KNeighborsClassifier(n_neighbors=5).fit(x_train2, y_train2)
    predicted2 = k2.predict(x_test2)
    k2_as = accuracy_score(predicted2, y_test2)
    print("k2 accuracy score: ", k2_as)
Exemplo n.º 6
0
def red_wine_quality(method='km'):
    """Reduce the red-wine-quality data with PCA, cluster it, and report.

    Reads ``data/red-wine-quality/full.csv``, buckets ``quality`` into
    'bad' / 'good' and integer-encodes it, projects the features with PCA,
    holds out 20% of the rows, scales both partitions, and fits a 2-group
    clustering model on the training rows.  Prints the CPU time of PCA plus
    the model fit (the split/scale step is subtracted out), prints the
    accuracy score of the predicted group ids against the encoded quality,
    and hands the model to ``plot_clusters``.

    Args:
        method: ``'km'`` for k-means on 4 PCA components, or ``'em'`` for a
            Gaussian mixture on 3 PCA components.  Any other value only
            prints an "Invalid method" message.
    """
    # Guard clause: reject unknown methods before touching any data.
    if method not in ('km', 'em'):
        print('Invalid method: {}'.format(method))
        return

    use_kmeans = method == 'km'
    if use_kmeans:
        c = 4
        time_label = 'RWQ [PCA + k-means] time (ms):'
        score_label = "k-means clustering accuracy score: "
        plot_target = 'dimen-reduction/pca/red-wine-k-means'
    else:
        c = 3
        time_label = 'RWQ [PCA + EM] time (ms):'
        score_label = "EM clustering accuracy score: "
        plot_target = 'dimen-reduction/pca/red-wine-em'

    wine = pd.read_csv('data/red-wine-quality/full.csv')

    # Two buckets split at 5.5, replaced in place by their integer codes.
    quality_bucket = pd.cut(wine['quality'],
                            bins=[2, 5.5, 8],
                            labels=['bad', 'good'])
    wine['quality'] = LabelEncoder().fit_transform(quality_bucket)

    x = wine.drop('quality', axis=1).values
    y = wine['quality']

    timer_start = time.process_time()

    x = PCA(n_components=c).fit_transform(x)

    # The split/scale step is timed separately so it can be subtracted below.
    split_start = time.process_time()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    split_cost = time.process_time() - split_start

    k = 2
    if use_kmeans:
        model = KMeans(n_clusters=k).fit(x_train)
    else:
        model = GaussianMixture(n_components=k).fit(x_train)

    elapsed = time.process_time() - timer_start - split_cost
    print(time_label, elapsed * 1000)

    score = accuracy_score(model.predict(x_test), y_test)
    print(score_label, score)

    plot_clusters(k, x_test, model, plot_target)
Exemplo n.º 7
0
def fish_market(method='km'):
    """Reduce the fish-market data with PCA, cluster it, and report.

    Reads ``data/fish-market/full.csv``, encodes every row's species as its
    position in the ``value_counts()`` index of the ``Species`` column,
    projects the features with PCA, holds out 25% of the rows, scales both
    partitions, and fits a 3-group clustering model on the training rows.
    Prints the CPU time of PCA plus the model fit (the split/scale step is
    subtracted out), prints the accuracy score of the predicted group ids
    against the encoded species, and hands the model to ``plot_clusters``.

    Args:
        method: ``'km'`` for k-means on 6 PCA components, or ``'em'`` for a
            Gaussian mixture on 3 PCA components.  Any other value only
            prints an "Invalid method" message.
    """
    # Guard clause: reject unknown methods before touching any data.
    if method not in ('km', 'em'):
        print('Invalid method: {}'.format(method))
        return

    use_kmeans = method == 'km'
    if use_kmeans:
        c = 6
        algorithm = 'k-means'
        plot_target = 'dimen-reduction/pca/fish-market-k-means'
    else:
        c = 3
        # The EM timing line used to be printed under the k-means label.
        algorithm = 'EM'
        plot_target = 'dimen-reduction/pca/fish-market-em'

    fish_data = pd.read_csv('data/fish-market/full.csv')

    # A species' integer label is its position in the value_counts() index.
    species = fish_data['Species'].value_counts().index.tolist()
    x = fish_data.drop('Species', axis=1).values
    y = [species.index(fish) for fish in fish_data['Species']]

    create_start = time.process_time()

    x = PCA(n_components=c).fit_transform(x)

    # The split/scale step is timed separately so it can be subtracted below.
    test_train_start = time.process_time()
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    test_train_time = time.process_time() - test_train_start

    k = 3
    if use_kmeans:
        model = KMeans(n_clusters=k).fit(x_train)
    else:
        model = GaussianMixture(n_components=k).fit(x_train)

    run_time = time.process_time() - create_start - test_train_time
    print('Fish market [PCA + {}] time (ms):'.format(algorithm),
          run_time * 1000)

    # accuracy_score is documented as (y_true, y_pred); the value is the same
    # either way round.
    # NOTE(review): the fitted group ids are not matched to the encoded
    # species ids anywhere in this function, so confirm this score measures
    # what is intended.
    score = accuracy_score(y_test, model.predict(x_test))
    print('{} clustering accuracy score: '.format(algorithm), score)

    plot_clusters(k, x_test, model, plot_target)