def fish_market():
    """Cluster the Fish market data with k-means and report the outcome.

    Reads ``data/fish-market/full.csv``, encodes every species as its
    position in the ``value_counts()`` ordering, performs a 75/25 split with
    a fixed seed, passes both splits through ``scale_data``, fits a
    3-cluster ``KMeans`` on the training split, prints the CPU time of the
    fit and the accuracy score of the predicted cluster ids against the
    encoded species, and finally forwards the model to ``plot_clusters``.
    """
    frame = pd.read_csv('data/fish-market/full.csv')

    # The position of a species in the value_counts() ordering is its label.
    species_order = frame['Species'].value_counts().index.tolist()
    features = frame.drop('Species', axis=1).values
    labels = [species_order.index(name) for name in frame['Species']]

    # Only the test labels are needed afterwards; clustering is unsupervised.
    x_train, x_test, _, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    n_clusters = 3
    fit_started = time.process_time()
    model = KMeans(n_clusters=n_clusters).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('Fish market [k-means] time (ms):', elapsed * 1000)

    # NOTE(review): cluster ids are compared directly with the species
    # encoding; k-means numbering is arbitrary, so confirm this score is
    # meaningful without first mapping clusters to labels.
    score = accuracy_score(model.predict(x_test), y_test)
    print("k-means clustering accuracy score: ", score)

    plot_clusters(n_clusters, x_test, model, 'clustering/k-means/fish-market')
def red_wine_quality():
    """Cluster the Red Wine Quality data with k-means and report the outcome.

    Reads ``data/red-wine-quality/full.csv``, bins ``quality`` into
    'bad' / 'good' and label-encodes it, performs an 80/20 split with a
    fixed seed, passes both splits through ``scale_data``, fits a 2-cluster
    ``KMeans`` on the training split, prints the CPU time of the fit and the
    accuracy score of the predicted cluster ids against the encoded quality,
    and finally forwards the model to ``plot_clusters``.
    """
    wine = pd.read_csv('data/red-wine-quality/full.csv')

    # (2, 5.5] -> 'bad', (5.5, 8] -> 'good', then encode the two classes.
    binned_quality = pd.cut(wine['quality'], bins=[2, 5.5, 8],
                            labels=['bad', 'good'])
    wine['quality'] = LabelEncoder().fit_transform(binned_quality)

    features = wine.drop('quality', axis=1).values
    target = wine['quality']

    # Only the test labels are needed afterwards; clustering is unsupervised.
    x_train, x_test, _, y_test = train_test_split(
        features, target, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    n_clusters = 2
    fit_started = time.process_time()
    model = KMeans(n_clusters=n_clusters).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('RWQ [k-means] time (ms):', elapsed * 1000)

    # NOTE(review): cluster ids are compared directly with the encoded
    # quality; k-means numbering is arbitrary, so confirm this score is
    # meaningful without first mapping clusters to labels.
    score = accuracy_score(model.predict(x_test), y_test)
    print("k-means clustering accuracy score: ", score)

    plot_clusters(n_clusters, x_test, model, 'clustering/k-means/red-wine')
def fish_market():
    """Cluster the Fish market data with a Gaussian mixture (EM) and report.

    Reads ``data/fish-market/full.csv``, encodes every species as its
    position in the ``value_counts()`` ordering, performs a 75/25 split with
    a fixed seed, passes both splits through ``scale_data``, fits a
    3-component ``GaussianMixture`` on the training split, prints the CPU
    time of the fit and the accuracy score of the predicted component ids
    against the encoded species, and finally forwards the model to
    ``plot_clusters``.
    """
    frame = pd.read_csv('data/fish-market/full.csv')

    # The position of a species in the value_counts() ordering is its label.
    species_order = frame['Species'].value_counts().index.tolist()
    features = frame.drop('Species', axis=1).values
    labels = [species_order.index(name) for name in frame['Species']]

    # Only the test labels are needed afterwards; clustering is unsupervised.
    x_train, x_test, _, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    n_components = 3
    fit_started = time.process_time()
    mixture = GaussianMixture(n_components=n_components).fit(x_train)
    elapsed = time.process_time() - fit_started
    print('Fish market [EM] time (ms):', elapsed * 1000)

    # NOTE(review): component ids are compared directly with the species
    # encoding; mixture component numbering is arbitrary, so confirm this
    # score is meaningful without first mapping components to labels.
    score = accuracy_score(mixture.predict(x_test), y_test)
    print("EM clustering accuracy score: ", score)

    plot_clusters(n_components, x_test, mixture, 'clustering/em/fish-market')
def plot_nn(x, y, suffix=''):
    """Plot train/test accuracy of neural nets against the iteration budget.

    Splits ``x`` / ``y`` 80/20 with a fixed seed, passes both splits through
    ``scale_data``, obtains models from ``create_nns`` for iteration counts
    1, 51, ..., 1051, scores every model on both splits and saves a
    ``Plotter`` graph.

    Args:
        x: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with ``x``.
        suffix: Text appended to the hard-coded plot name ``'red-wine'``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)

    iter_counts = np.arange(1, 1100, 50, dtype=int)

    # Presumably one model per entry of iter_counts -- confirm against
    # create_nns; the plots below pair the scores with iter_counts.
    models = create_nns(x_train, y_train, iter_counts)

    # Score each model on the training split first, then on the test split.
    scores = [
        (accuracy_score(model.predict(x_train), y_train),
         accuracy_score(model.predict(x_test), y_test))
        for model in models
    ]
    accs_train = [train_acc for train_acc, _ in scores]
    accs_test = [test_acc for _, test_acc in scores]

    axis_labels = {
        'x': 'Number of weight updates',
        'y': 'Accuracy score',
    }
    graph = Plotter(name='red-wine{}'.format(suffix), learner='nn',
                    axes=axis_labels)
    graph.add_plot(iter_counts, accs_train, 'training data', 'None')
    graph.add_plot(iter_counts, accs_test, 'testing data', 'None')
    graph.find_max(iter_counts, accs_test, 'testing')
    graph.save()
def fish_market():
    """Compare two 5-nearest-neighbour classifiers on the Fish market data.

    Task 1 predicts ``Species`` from every other column. Task 2 predicts a
    binned, label-encoded ``Weight`` from the columns that remain after
    dropping ``Species`` and ``Weight``. Both tasks use a 75/25 split with a
    fixed seed and pass their splits through ``scale_data``; the test
    accuracy of each classifier is printed.
    """
    # Seeds NumPy's global RNG; kept because it alters global state.
    np.random.seed(93)

    csv_path = 'data/fish-market/full.csv'
    species_frame = pd.read_csv(csv_path)
    weight_frame = pd.read_csv(csv_path)

    # Feature matrices are extracted before Weight is re-encoded below.
    species_x = species_frame.drop('Species', axis=1).values
    weight_x = weight_frame.drop(['Species', 'Weight'], axis=1).values
    species_y = species_frame['Species']

    # NOTE(review): weights outside (0, 1650] match no bin and become NaN,
    # which astype(str) turns into an extra 'nan' class -- confirm intended.
    weight_bins = pd.cut(weight_frame['Weight'], bins=[0, 120, 650, 1650],
                         labels=['light', 'avg', 'heavy'])
    weight_frame['Weight'] = LabelEncoder().fit_transform(
        weight_bins.astype(str))
    weight_y = weight_frame['Weight']

    x_train1, x_test1, y_train1, y_test1 = train_test_split(
        species_x, species_y, test_size=0.25, random_state=1)
    x_train2, x_test2, y_train2, y_test2 = train_test_split(
        weight_x, weight_y, test_size=0.25, random_state=1)

    x_train1, x_test1 = scale_data(x_train1, x_test1)
    x_train2, x_test2 = scale_data(x_train2, x_test2)

    tasks = (
        ("k1 accuracy score: ", x_train1, y_train1, x_test1, y_test1),
        ("k2 accuracy score: ", x_train2, y_train2, x_test2, y_test2),
    )
    for label, train_x, train_y, test_x, test_y in tasks:
        knn = KNeighborsClassifier(n_neighbors=5).fit(train_x, train_y)
        print(label, accuracy_score(knn.predict(test_x), test_y))
def red_wine_quality(method='km'):
    """Run PCA followed by clustering on the Red Wine Quality data.

    Reads ``data/red-wine-quality/full.csv``, bins ``quality`` into
    'bad' / 'good' and label-encodes it, projects the features with PCA,
    performs an 80/20 split with a fixed seed, passes both splits through
    ``scale_data``, fits a 2-cluster model, prints the CPU time and the
    accuracy score of the predicted ids against the encoded quality, and
    forwards the model to ``plot_clusters``.

    Args:
        method: ``'km'`` for 4 PCA components followed by ``KMeans``, or
            ``'em'`` for 3 PCA components followed by ``GaussianMixture``.
            Any other value only prints an "Invalid method" message.
    """
    if method not in ('km', 'em'):
        print('Invalid method: {}'.format(method))
        return

    use_kmeans = method == 'km'

    wine = pd.read_csv('data/red-wine-quality/full.csv')

    # (2, 5.5] -> 'bad', (5.5, 8] -> 'good', then encode the two classes.
    binned_quality = pd.cut(wine['quality'], bins=[2, 5.5, 8],
                            labels=['bad', 'good'])
    wine['quality'] = LabelEncoder().fit_transform(binned_quality)

    features = wine.drop('quality', axis=1).values
    target = wine['quality']

    if use_kmeans:
        n_pca = 4
        time_label = 'RWQ [PCA + k-means] time (ms):'
        score_label = "k-means clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/red-wine-k-means'
    else:
        n_pca = 3
        time_label = 'RWQ [PCA + EM] time (ms):'
        score_label = "EM clustering accuracy score: "
        plot_path = 'dimen-reduction/pca/red-wine-em'

    pipeline_started = time.process_time()

    # NOTE(review): PCA is fitted on the full data set before the split, so
    # the test rows influence the projection -- confirm this is intended.
    projected = PCA(n_components=n_pca).fit_transform(features)

    # The split/scale step is timed separately so it can be subtracted from
    # the reported figure, leaving PCA + model fitting only.
    split_started = time.process_time()
    x_train, x_test, _, y_test = train_test_split(
        projected, target, test_size=0.2, random_state=1)
    x_train, x_test = scale_data(x_train, x_test)
    split_elapsed = time.process_time() - split_started

    k = 2
    if use_kmeans:
        model = KMeans(n_clusters=k).fit(x_train)
    else:
        model = GaussianMixture(n_components=k).fit(x_train)
    run_time = time.process_time() - pipeline_started - split_elapsed
    print(time_label, run_time * 1000)

    # NOTE(review): predicted ids are compared directly with the encoded
    # quality; cluster numbering is arbitrary, so confirm the score is
    # meaningful without first mapping clusters to labels.
    score = accuracy_score(model.predict(x_test), y_test)
    print(score_label, score)

    plot_clusters(k, x_test, model, plot_path)
def fish_market(method='km'): if method == 'km': # Read in Fish market fish_data = pd.read_csv('data/fish-market/full.csv') species = fish_data['Species'].value_counts().index.tolist() # Group x, y x, y = fish_data.drop('Species', axis=1).values, [] for fish in fish_data['Species']: y.append(species.index(fish)) create_start = time.process_time() # PCA c = 6 x = PCA(n_components=c).fit_transform(x) test_train_start = time.process_time() # Test-train split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1) x_train, x_test = scale_data(x_train, x_test) test_train_time = time.process_time() - test_train_start # K-Means k = 3 kmeans = KMeans(n_clusters=k).fit(x_train) run_time = time.process_time() - create_start - test_train_time print('Fish market [PCA + k-means] time (ms):', run_time * 1000) km_as = accuracy_score(kmeans.predict(x_test), y_test) print("k-means clustering accuracy score: ", km_as) # Plot plot_clusters(k, x_test, kmeans, 'dimen-reduction/pca/fish-market-k-means') elif method == 'em': # Read in Fish market fish_data = pd.read_csv('data/fish-market/full.csv') species = fish_data['Species'].value_counts().index.tolist() # Group x, y x, y = fish_data.drop('Species', axis=1).values, [] for fish in fish_data['Species']: y.append(species.index(fish)) create_start = time.process_time() # PCA c = 3 x = PCA(n_components=c).fit_transform(x) test_train_start = time.process_time() # Test-train split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1) x_train, x_test = scale_data(x_train, x_test) test_train_time = time.process_time() - test_train_start # EM k = 3 em = GaussianMixture(n_components=k).fit(x_train) run_time = time.process_time() - create_start - test_train_time print('Fish market [PCA + k-means] time (ms):', run_time * 1000) em_as = accuracy_score(em.predict(x_test), y_test) print("EM clustering accuracy score: ", em_as) # Plot plot_clusters(k, x_test, em, 
'dimen-reduction/pca/fish-market-em') else: print('Invalid method: {}'.format(method))