def run(data, labels, numFeatures, trueLabelsFile=None, verbose=False):
    """Select features, then train and evaluate classifiers on the reduced data.

    Runs a Scikit-learn SVM over a sweep of C values and a Gaussian naive
    Bayes model, printing balanced accuracy for each when ground-truth
    labels are available.

    Args:
        data: feature matrix as produced by readData().
        labels: training labels as produced by readLabels().
        numFeatures: number of top-ranked features to keep.
        trueLabelsFile: optional path to a ground-truth labels file; when
            given, balanced accuracy is computed for every model.
        verbose: print progress and accuracy information.
    """
    trueLabels = None
    if trueLabelsFile is not None:
        trueLabels = readLabels(trueLabelsFile)

    if verbose:
        print("Selecting features...")
    # feature selection, then drop every column except the selected features
    features = featureSelect(data, labels, numFeatures)
    newData = extractFeatures(data, features)

    def _report(predictions):
        # Balanced accuracy is computed whenever true labels exist
        # (matching the original flow) but only printed in verbose mode.
        if trueLabels is not None:
            bAccuracy = calculateBalancedAccuracy(predictions, trueLabels)
            if verbose:
                print("Balanced accuracy:", bAccuracy)

    # run Scikit-learn SVM across a sweep of regularization strengths
    C_vals = [0.001, 1.0, 1000]
    for C in C_vals:
        predictions = runScikitSVM(newData, labels, C=C, verbose=verbose)
        _report(predictions)

    # run Scikit-learn Gaussian naive Bayes
    predictions = runSciKitGaussianNaiveBayes(newData, labels, verbose=verbose)
    _report(predictions)
def test():
    """Run the full pipeline over every (numFeatures, split) combination
    of the bundled SNP training data, printing results, then exit.

    Expects the train_data/ directory produced by the split script to be
    present in the working directory.
    """
    dataFile = "train_data/snps.data"
    trueLabelsFile = "train_data/snps.labels"
    labelsFilePrefix = "train_data/snps.trainlabels."
    # row/column counts are not needed here, only the matrix itself
    data, _, _ = readData(dataFile, verbose=True)

    listFeatures = [15, 30, 50]
    numSplits = 10
    for numFeatures in listFeatures:
        for split in range(numSplits):
            labelsFile = labelsFilePrefix + str(split)
            labels = readLabels(labelsFile)
            print("----------Num features:", numFeatures,
                  "----- Labels:", labelsFile, "----------")
            run(data, labels, numFeatures,
                trueLabelsFile=trueLabelsFile, verbose=True)
    # Terminate the process so nothing after a test() call runs.
    # NOTE(review): the flattened source makes the original placement of
    # exit() ambiguous; end-of-function is the reading consistent with the
    # loops actually iterating -- confirm against version history.
    exit()
def printPredictions(predictions):
    """Print each prediction as "pred[0] pred[1]", one pair per line.

    Args:
        predictions: iterable of indexable pairs (e.g. (label, row) tuples).
    """
    for pred in predictions:
        print(pred[0], pred[1])


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage:", sys.argv[0], "<dataFile> <trainLabelsFile> <k>")
        # sys.exit is the explicit stdlib call; bare exit() is a
        # site-module convenience not guaranteed in all run modes
        sys.exit(1)
    dataFile = sys.argv[1]
    labelsFile = sys.argv[2]
    k_val = int(sys.argv[3])

    # row/column counts are unused here
    data, _, _ = readData(dataFile)
    labels = readLabels(labelsFile)
    predictions = randomHyperplanes(data, labels, k_val,
                                    verbose=True, printTrainAccuracy=False)
    printPredictions(predictions)
random.shuffle(allRows) allRows = allRows[:int(numLabels * percent)] return allRows if __name__ == '__main__': if (len(sys.argv) != 3): print("Usage:", sys.argv[0], "<labelsFile> <numSplits>") exit() createTrainDir() labelFile = sys.argv[1] numSplits = int(sys.argv[2]) DATA_NAME = labelFile.split('.')[0] OUTPUT_TRAIN_PREFIX = DATA_NAME + ".trainlabels." splitPercent = 0.80 # 80 train 20 validation allLabels = readLabels(labelFile) numLabels = len(allLabels) for i in range(numSplits): newSplit = split(splitPercent, numLabels) outputFileName = OUTPUT_TRAIN_PREFIX + str(i) f = open(outputFileName, 'w') for key in newSplit: f.write(str(allLabels[key]) + " " + str(key) + "\n") f.close()
# NOTE(review): this chunk starts mid-script -- `argument`, `k_vals`, and
# `numSplits` are defined earlier in the file -- and the innermost loop body
# continues past the end of this chunk, so only comments are added here.
if (argument == 'all'):
    # datasets = ['ionosphere', 'breast_cancer', 'qsar', 'climate', 'micromass', 'hill_valley']
    datasets = ['micromass', 'hill_valley']
else:
    # a single dataset name was supplied; run only that one
    datasets = [argument]
for k in k_vals:
    for datasetName in datasets:
        # paths for this dataset's data / labels / train-split files
        base = '../datasets/' + datasetName + '/'
        train_base = base + datasetName + '.trainlabels.'
        dataFile = base + datasetName + '.data'
        trueLabelsFile = base + datasetName + '.labels'
        # read true labels
        trueLabels = readLabels(trueLabelsFile)
        # SVM regularization strengths to sweep
        C_vals = [0.001, 0.01, 0.1, 1, 10, 100]
        # read data (re-read on every k iteration -- could be hoisted
        # out of the k loop, left as-is here)
        data, _, _ = readData(dataFile)
        for split in range(numSplits):
            print("----------------------------------------------")
            print(datasetName, "split", split)
            labelsFile = train_base + str(split)
            # read labels
            labels = readLabels(labelsFile)
            initial = True
            # run regular SVM