def loadArffFile(filePath):
    """Load an ARFF file and split it into features and labels.

    The last column of the loaded table is treated as the label column.

    Args:
        filePath: Location of the ARFF file, passed straight to ``ArffFile``.

    Returns:
        A ``(features, labels)`` tuple: the table without its last column,
        and that last column on its own.
    """
    dataset = ArffFile(filePath)

    # Work on a copy so the table held by the ArffFile object is untouched.
    table = dataset.getData().copy()
    targetName = table.columns[-1]
    features = table.drop(targetName, axis=1)

    # Labels are read from the table returned by a second getData() call.
    labels = dataset.getData()[targetName]
    return features, labels
def loadArffFile(arffFilePath):
    """Read an ARFF file and return its features separately from its labels.

    Args:
        arffFilePath: Path of the ARFF file handed to ``ArffFile``.

    Returns:
        Tuple ``(unsupervised features, y)`` where the features are every
        column except the last one and ``y`` is the last column.
    """
    # Load the ARFF file.
    loaded = ArffFile(arffFilePath)
    fullTable = loaded.getData().copy()

    # The label is assumed to live in the last column; strip it from the
    # training data.
    lastColumn = fullTable.columns[-1]
    withoutLabel = fullTable.drop(lastColumn, axis=1)

    return withoutLabel, loaded.getData()[lastColumn]
def main():
    """Save a scatter plot and a box plot for three hard-coded ARFF datasets.

    For each dataset a folder named after the file stem is created under
    ``RESULTS_PATH`` and receives ``scatterplot.png`` and ``boxplot.png``.
    """
    datasetPaths = [
        Path("../datasets/vote.arff"),
        Path("../datasets/adult.arff"),
        Path("../datasets/pen-based.arff"),
    ]

    for datasetPath in datasetPaths:
        outputFolder = RESULTS_PATH / datasetPath.stem
        outputFolder.mkdir(parents=True, exist_ok=True)

        dataset = ArffFile(datasetPath)
        table = dataset.getData()
        # The last column is dropped before box-plotting.
        features = table.drop(table.columns[-1], axis=1)

        # Scatter plot: drawn by the dataset object, saved and closed here.
        dataset.scatterPlot(ignoreLabel=True, show=False, figsize=(15, 9))
        plt.savefig(outputFolder / "scatterplot.png")
        plt.close()

        # Box plot of the remaining columns on a fresh figure.
        plt.figure(figsize=(15, 9))
        features.boxplot()
        plt.xticks(rotation='vertical')
        plt.tight_layout()
        plt.savefig(outputFolder / "boxplot.png")
        plt.close()
import sys

# Put the current working directory on the import path. This has to run
# before the ``src`` import below.
sys.path.append(".")

import json
from pathlib import Path

from src.dataset import ArffFile

# Show a scatter plot for the dataset referenced by every JSON config.
configFolderPath = Path("./configs")
for configFile in configFolderPath.glob('*.json'):
    with configFile.open() as handle:
        settings = json.load(handle)

    # Each config stores the dataset location under the "path" key.
    ArffFile(settings["path"]).scatterPlot(ignoreLabel=True, show=True)
def findEps(data, Ks=(2,), title="K-dist plot for 'adult' dataset",
            outputPath="results/adult/dbscan/dbscanDistanceToKthNeighbor.png"):
    """Plot the sorted distance to the K-th nearest neighbour for each K.

    One curve is drawn per value in ``Ks``. The figure is saved to
    ``outputPath`` and then shown.

    Args:
        data: Table exposing ``to_numpy()`` (presumably a pandas DataFrame of
            numeric features -- confirm against ``ArffFile.getData``).
        Ks: Sized iterable of neighbour counts to plot. Defaults to ``(2,)``;
            a tuple replaces the former mutable list default.
        title: Title drawn on the figure.
        outputPath: File the figure is written to. Missing parent folders
            are created.
    """
    X = data.to_numpy()
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(Ks))]

    for K, col in zip(Ks, colors):
        # K + 1 neighbours are requested because the points are queried
        # against the same set they were fitted on, so each point's own
        # zero-distance entry is expected among the results.
        neigh = NearestNeighbors(n_neighbors=K + 1)
        distances, _ = neigh.fit(X).kneighbors(X)
        # Keep only the farthest (last) neighbour distance, in ascending order.
        distances = np.sort(distances[:, -1], axis=0)
        plt.plot(distances, color=tuple(col), label=f'{K} Nearest Neighbor')

    plt.xlabel('Points sorted according to distance of Kth Nearest Neighbor')
    plt.ylabel('Kth Nearest Neighbor Distance')
    plt.title(title)
    plt.legend()
    # Lay out only after labels, title and legend exist so they are all
    # taken into account.
    plt.tight_layout()

    # savefig does not create folders; make sure the target one exists.
    Path(outputPath).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(outputPath)
    plt.show()


if __name__ == "__main__":
    arffFile = ArffFile(Path("./datasets/adult.arff"))
    data = arffFile.getData().copy()

    # The last column is treated as the label and split off from the features.
    labelColumn = data.columns[-1]
    y = data[labelColumn]
    data = data.drop(labelColumn, axis=1)

    minPts = [10, 20, 30, 40, 50]
    eps = [.55, .6, .65, .70, .75]

    findEps(data, Ks=minPts)
    # `y` and `eps` are only consumed by the evaluation call below, which is
    # currently disabled.
    #evaluate(data, y, eps, minPts)