def _reportBalancedAccuracy(predictions, trueLabels, verbose):
    """Compute the balanced accuracy of `predictions` and print it if verbose.

    Args:
        predictions: Classifier output, passed to calculateBalancedAccuracy.
        trueLabels: Reference labels, passed to calculateBalancedAccuracy.
        verbose: When true, print the score.

    Returns:
        The value returned by calculateBalancedAccuracy.
    """
    bAccuracy = calculateBalancedAccuracy(predictions, trueLabels)
    if verbose:
        print("Balanced accuracy:", bAccuracy)
    return bAccuracy


def run(data, labels, numFeatures, trueLabelsFile=None, verbose=False):
    """Select features, then run the SVM and Gaussian naive Bayes classifiers.

    The data is reduced to the columns chosen by featureSelect. runScikitSVM
    is then called once per C value (0.001, 1.0, 1000), followed by one call
    to runSciKitGaussianNaiveBayes. When a true-labels file is supplied, each
    set of predictions is scored with calculateBalancedAccuracy, and the score
    is printed when `verbose` is set.

    Args:
        data: Data set handed to featureSelect and extractFeatures.
        labels: Training labels handed to the feature selector and classifiers.
        numFeatures: Number of features requested from featureSelect.
        trueLabelsFile: Optional path read with readLabels; scoring is skipped
            when it is None.
        verbose: Print progress and scores, and forward the flag to the
            classifiers.

    Returns:
        None.
    """
    # None is a singleton, so an identity test is the reliable check.
    haveTruth = trueLabelsFile is not None
    trueLabels = readLabels(trueLabelsFile) if haveTruth else None

    if verbose:
        print("Selecting features...")
    features = featureSelect(data, labels, numFeatures)
    # Keep only the columns of the selected features.
    newData = extractFeatures(data, features)

    for C in (0.001, 1.0, 1000):
        predictions1 = runScikitSVM(newData, labels, C=C, verbose=verbose)
        if haveTruth:
            _reportBalancedAccuracy(predictions1, trueLabels, verbose)

    predictions4 = runSciKitGaussianNaiveBayes(newData,
                                               labels,
                                               verbose=verbose)
    if haveTruth:
        _reportBalancedAccuracy(predictions4, trueLabels, verbose)
def test():
    """Run `run` over the SNP training data for every feature count and split.

    Reads train_data/snps.data once. For each feature count in (15, 30, 50)
    and each of the 10 training-label files
    train_data/snps.trainlabels.<split>, prints a banner and calls `run` with
    verbose output and the true labels file. Ends the interpreter afterwards.

    Raises:
        SystemExit: Always, once every combination has been run.
    """
    # Function-scope import so this block does not rely on a module-level one.
    import sys

    dataFile = "train_data/snps.data"
    trueLabelsFile = "train_data/snps.labels"
    labelsFilePrefix = "train_data/snps.trainlabels."

    # Only the data itself is used; the other two returned values are ignored.
    data, _, _ = readData(dataFile, verbose=True)
    listFeatures = [15, 30, 50]

    numSplits = 10
    for numFeatures in listFeatures:
        for splitIndex in range(numSplits):
            labelsFile = labelsFilePrefix + str(splitIndex)
            labels = readLabels(labelsFile)
            print("----------Num features:", numFeatures, "----- Labels:",
                  labelsFile, "----------")
            run(data,
                labels,
                numFeatures,
                trueLabelsFile=trueLabelsFile,
                verbose=True)

    # The site-provided exit() helper is intended for interactive sessions and
    # is missing when the site module is not loaded; sys.exit() raises the
    # same SystemExit without that dependency.
    sys.exit()
# Example #3
# 0
def printPredictions(predictions):
	"""Print one line per prediction: its first two fields, space-separated."""
	for entry in predictions:
		first, second = entry[0], entry[1]
		print(first, second)

if __name__ == '__main__':
	# Command-line entry point. Reads the data file and the training labels,
	# calls randomHyperplanes with the given k, and prints what it returns
	# via printPredictions.
	#
	# Arguments: <dataFile> <trainLabelsFile> <k>

	if len(sys.argv) != 4:
		print("Usage:", sys.argv[0], "<dataFile> <trainLabelsFile> <k>")
		# sys.exit instead of the site-provided exit(): that helper is meant
		# for interactive use and is missing when site is not loaded.
		sys.exit(1)

	dataFile = sys.argv[1]
	labelsFile = sys.argv[2]
	k_val = int(sys.argv[3])

	# Only the data itself is needed; the other two returned values are unused.
	data, _, _ = readData(dataFile)
	labels = readLabels(labelsFile)

	predictions = randomHyperplanes(data, labels, k_val, verbose=True, printTrainAccuracy=False)

	printPredictions(predictions)










# Example #4
# 0
    random.shuffle(allRows)
    allRows = allRows[:int(numLabels * percent)]
    return allRows


if __name__ == '__main__':
    # Command-line entry point. Writes <numSplits> label files named
    # <labelsFile without extension>.trainlabels.<i>. Each file holds one
    # "<label> <key>" line for every key returned by split().
    #
    # Arguments: <labelsFile> <numSplits>
    import os

    if len(sys.argv) != 3:
        print("Usage:", sys.argv[0], "<labelsFile> <numSplits>")
        # NOTE(review): a usage error still exits with status 0, as before;
        # consider a non-zero status so callers can detect the failure.
        sys.exit()

    createTrainDir()

    labelFile = sys.argv[1]
    numSplits = int(sys.argv[2])

    # Strip the extension from the file name only. Splitting the whole path on
    # '.' yields an empty stem for paths such as "./dir/x.labels" or
    # "../x.labels". The directory prefix is kept exactly as given.
    labelBase = os.path.basename(labelFile)
    labelDirPrefix = labelFile[:len(labelFile) - len(labelBase)]
    DATA_NAME = labelDirPrefix + labelBase.split('.')[0]
    OUTPUT_TRAIN_PREFIX = DATA_NAME + ".trainlabels."

    splitPercent = 0.80  # 80 train 20 validation

    allLabels = readLabels(labelFile)
    numLabels = len(allLabels)

    for i in range(numSplits):
        newSplit = split(splitPercent, numLabels)
        outputFileName = OUTPUT_TRAIN_PREFIX + str(i)
        # The context manager closes the file even if a lookup or write raises.
        with open(outputFileName, 'w') as f:
            f.writelines(str(allLabels[key]) + " " + str(key) + "\n"
                         for key in newSplit)
# Example #5
# 0
# 'all' currently expands to two datasets only. The full list, disabled here:
# ['ionosphere', 'breast_cancer', 'qsar', 'climate', 'micromass', 'hill_valley']
# Any other argument is treated as the name of a single dataset.
datasets = ['micromass', 'hill_valley'] if argument == 'all' else [argument]

for k in k_vals:
    for datasetName in datasets:
        base = '../datasets/' + datasetName + '/'
        train_base = base + datasetName + '.trainlabels.'

        dataFile = base + datasetName + '.data'
        trueLabelsFile = base + datasetName + '.labels'

        # read true labels
        trueLabels = readLabels(trueLabelsFile)

        C_vals = [0.001, 0.01, 0.1, 1, 10, 100]

        #   read data
        data, _, _ = readData(dataFile)

        for split in range(numSplits):
            print("----------------------------------------------")
            print(datasetName, "split", split)
            labelsFile = train_base + str(split)
            #   read labels
            labels = readLabels(labelsFile)

            initial = True
            #   run regular SVM