Exemplo n.º 1
0
 def runTest(self):
     """Cross-validate the 'knearest' classifier on the cleaned file.

     Prints the mean score and the time taken, then asserts that the
     reported time is positive.
     """
     dataset = cleanData(self.filename)
     classifier = getDiscreetClassifier('knearest')
     outcome = CV.doShuffleCrossValidation(
         classifier, dataset.data, dataset.target)
     report = ''.join([
         'knearest result: ', str(outcome.meanScore),
         ' Time taken: ', str(outcome.timeTaken),
     ])
     print(report)
     self.assertTrue(outcome.timeTaken > 0)
Exemplo n.º 2
0
def testAlgorithm():
    """Validate the requested algorithms and submit each valid one for testing.

    Reads the ``name`` and ``algorithm`` request parameters. Every
    ``algorithm`` value must be a JSON document; when it carries a ``params``
    mapping the values are coerced to numbers and the classifier is
    instantiated once as a sanity check. Algorithms that pass are handed to
    ``runAlgoTest.delay``; algorithms that fail are collected and reported.

    Returns:
        An ``HTTPResponse`` with status 200 listing the submitted algorithms
        when nothing failed.

    Raises:
        HTTPResponse: status 500 with a ``failedAlgorithms`` list when at
            least one algorithm could not be parsed or validated. Algorithms
            that validated successfully have still been submitted.
    """
    userName = getUsername()
    check_request_for_params(["name", "algorithm"])
    csynapseName = re_space(request.params.get('name'))
    algos = request.params.getall('algorithm')
    failedAlgos = []
    successAlgos = []
    for j in algos:
        # Try and load the algorithm and return errors with information
        # about the algorithms that failed
        try:
            algoData = json.loads(j)
        except Exception as e:
            failedAlgos.append({
                'error': 'json data {0} is invalid'.format(j),
                "error_type": "invalid_json",
                "exception_message": str(e),
            })
            continue
        try:
            if 'params' in algoData:
                paramsData = algoData['params']
                # Snapshot the items so the dict can be updated while looping.
                for key, val in list(paramsData.items()):
                    try:
                        if key == 'n_estimators':
                            paramsData[key] = int(val)
                        else:
                            paramsData[key] = float(val)
                    except Exception:
                        # Best effort: values that are not numeric are left
                        # untouched for the classifier factory to judge.
                        continue
                getDiscreetClassifier(algoData['algorithm'],
                                      algoData['params'])
        except Exception as e:
            failedAlgos.append({
                'error': 'algo params {0} is invalid'.format(j),
                "error_type": "invalid_params",
                "exception_message": str(e),
            })
            # Do not submit an algorithm whose parameters were rejected.
            continue
        successAlgos.append(algoData)
        runAlgoTest.delay(algoData, userName, csynapseName)

    if failedAlgos:
        raise HTTPResponse(status=500,
                           body=json.dumps({
                               "status": "error",
                               'failedAlgorithms': failedAlgos,
                               "error_type": "failedAlgorithms"
                           }))
    return HTTPResponse(status=200,
                        body=json.dumps({
                            "status": "ok",
                            "message": "submitted for testing",
                            "csynapse": csynapseName,
                            "algorithms": successAlgos
                        }))
Exemplo n.º 3
0
 def runTest(self):
     """Check that predicting on the training set agrees with cross validation.

     Cross-validates one 'svm' classifier, trains a second one on the full
     data, and asserts that the fraction of correct predictions is within
     .05 of the cross-validation mean score.
     """
     algoName = 'svm'
     cvClassifier = getDiscreetClassifier(algoName)
     cvScore = CV.doShuffleCrossValidation(
         cvClassifier, self.data.data, self.data.target).meanScore
     fitted = TrainClassifier.trainWithLabels(
         getDiscreetClassifier(algoName), self.data.data, self.data.target)
     predicted = predict(fitted, self.data.data)
     # Count predictions whose first element equals the known label
     hits = sum(1 for idx in range(len(predicted))
                if predicted[idx][0] == self.data.target[idx])
     accuracy = float(hits) / len(self.data.target)
     # Ensure the score from predicting is close to the cross validation score
     self.assertTrue(abs(cvScore - accuracy) < .05)
Exemplo n.º 4
0
	def runTest(self):
		"""Check that a classifier survives a save/load round trip through a string.

		Trains an 'svm' classifier, cross-validates it, serialises the trained
		classifier to a string and reloads it, then asserts that the reloaded
		classifier's score is within .05 of the cross-validation mean score.
		"""
		classifier = getDiscreetClassifier('svm')
		features = self.data.data
		labels = self.data.target
		trained = TrainClassifier.trainWithLabels(classifier, features, labels)
		cvScore = CV.doShuffleCrossValidation(classifier, features, labels).meanScore
		restored = loadClassifierFromString(saveClassifierAsString(trained))
		reloadedScore = restored.score(self.data.data, self.data.target)
		# Ensure the score after reloading the classifier is close to the cross validation score
		self.assertTrue(abs(cvScore - reloadedScore) < .05)
Exemplo n.º 5
0
def classifyForDemo(trainFile, newImageFile, algorithmName='passiveAggressive'):
	"""Classify one new image against the labelled rows in a training file.

	Each line of ``trainFile`` is parsed with ``makeInts``; element 0 of the
	result is used as the label and element 1 as the pixel vector. The new
	image's pixels are appended as the final row, every row is reduced with
	PCA (at most 200 components), the named classifier is fitted on all rows
	but the last, and the prediction for the last row is returned.
	"""
	demoShape = (Constants.DEMO_SIZE, Constants.DEMO_SIZE)
	imagePixels = getPixels(newImageFile, demoShape)

	# Parse the training file, one sample per line
	rows = [makeInts(line) for line in trainFile.read().split('\n')]
	labels = [row[0] for row in rows]
	pixels = [row[1] for row in rows]

	# The image to classify goes last so it shares the PCA projection
	pixels.append(imagePixels)

	components = min(len(pixels), 200)
	reducer = PCA(n_components=components, copy=False)
	transformed = reducer.fit_transform(pixels)

	alg = getDiscreetClassifier(algorithmName)
	alg.fit(transformed[:-1], labels)
	prediction = predict(alg, transformed[-1])
	return prediction[0][0]
Exemplo n.º 6
0
	def runTest(self):
		"""Cross-validate the 'perceptron' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('perceptron')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Perceptron result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 7
0
	def runTest(self):
		"""Cross-validate the 'orthogonalMatchingPursuit' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('orthogonalMatchingPursuit')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'OrthogonalMatchingPursuit result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 8
0
def classify(newDataId, oldDataId, algorithm, params, userName, csynapseName,
             dataName):
    """Classify a new data file and store the labelled result for the user.

    For algorithms listed in ``externalAlgorithms`` the matching jar is run on
    the training and new data files and its results file is used as output.
    Otherwise a classifier is built with ``getDiscreetClassifier``, fitted on
    the cleaned training data and used to predict the new data.

    The resulting text (headers first, when there are any) is stored with
    ``db.files.put`` and its id is recorded on the user's document under
    ``csynapses.<csynapseName>.classified.<dataName>``.

    Args:
        newDataId: id of the data file to classify.
        oldDataId: id of the labelled training data file.
        algorithm: name of the algorithm to use.
        params: parameters passed on to ``getDiscreetClassifier``.
        userName: ``_id`` of the user document to update.
        csynapseName: name of the csynapse the result belongs to.
        dataName: key under which the classified file id is stored.
    """
    # special case for homegrown algorithms
    if algorithm in externalAlgorithms:
        # training data
        trainingPath = getDataFile(oldDataId)
        # get headers
        possibleHeaders = getHeaders(trainingPath)
        # new data
        newDataPath = getDataFile(newDataId)
        resultsPath = trainingPath + 'results'

        runEx.execute(
            'java -jar externalExecutables/{0}.jar {1} {2} {3}'.format(
                algorithm, trainingPath, newDataPath, resultsPath))

        # Always read the results. Previously they were read only when
        # headers existed, so header-less data was saved as an empty file.
        with open(resultsPath, 'r') as f:
            content = f.read()
        finalString = possibleHeaders + content
    else:
        # Get old data
        oldData = cleanData(getDataFile(oldDataId))

        # Instantiate the classifier with its params and train on old data
        alg = getDiscreetClassifier(algorithm, params)
        alg.fit(oldData.data, oldData.target)

        # Get new data
        newData = cleanUntagged(getDataFile(newDataId))

        # Get headers from old file to save with results
        possibleHeaders = oldData.headers
        # Predict the new data
        result = predict(alg, newData.data)

        finalStringList = []
        if possibleHeaders != '':
            finalStringList.append(possibleHeaders)
        # One comma-separated line per prediction: the label followed by all
        # but the last of the row's values. Joining the cells keeps the label
        # even when no values follow it.
        for x in result:
            cells = [str(x[0])]
            cells.extend(str(v) for v in x[1][:-1])
            finalStringList.append(','.join(cells) + '\n')

        finalString = ''.join(finalStringList)
    # Put classified data file into the database
    classifiedDataId = db.files.put(finalString)
    # Save data id to the csynapse
    db.users.update_one(
        {'_id': userName},
        {'$set': {'csynapses.{0}.classified.{1}'.format(
            csynapseName, dataName): classifiedDataId}})
    return
Exemplo n.º 9
0
def runAlgoTest(algoData, userName, csynapseName):
    """Score one algorithm against a csynapse's data and record the outcome.

    A new entry is created under ``csynapses.<csynapseName>.algorithms`` on
    the user's document with status ``processing``. Once the data is
    available the algorithm is scored, either by running the matching
    external jar or by shuffle cross validation, and the entry is rewritten
    with status ``complete`` plus the score and time. Any failure rewrites
    the entry with status ``error`` and the formatted traceback; the
    exception is not re-raised.

    Args:
        algoData: mapping with an ``algorithm`` name and optional ``params``.
        userName: ``_id`` of the user document to update.
        csynapseName: name of the csynapse whose data is tested.
    """
    newObjectId = str(ObjectId())
    userCollection = db.users
    algoKey = 'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId)
    # Bound before the try block so the error handler can always report
    # them, even when the failure happens while reading algoData.
    algorithm = None
    params = {}

    def _saveState(fields):
        # Replace this run's entry with `fields`, stamped with the current GMT time.
        fields["last_updated"] = strftime("%a, %d %b %Y %H:%M:%S +0000",
                                          gmtime())
        userCollection.update_one({'_id': userName},
                                  {'$set': {algoKey: fields}})

    try:
        algorithm = algoData['algorithm']
        if 'params' in algoData:
            params = algoData['params']

        _saveState({'algoId': algorithm, 'params': params,
                    'status': "processing"})

        doc = userCollection.find_one({'_id': userName})
        csynapse = doc['csynapses'][csynapseName]
        if "data_id" not in csynapse and "multipart_data" not in csynapse:
            _saveState({'status': "error"})
            raise DataException("Data Not Available")

        if "data_id" not in csynapse:
            # Multipart data: poll once a second until it has been reduced,
            # giving up after cycles_to_wait polls.
            cycles_to_wait = 20
            cycles = 0
            found = False
            while not found:
                doc = userCollection.find_one({'_id': userName})
                csynapse = doc['csynapses'][csynapseName]
                if ("multipart_data" in csynapse
                        and "multipart_reduced" in csynapse):
                    found = True
                else:
                    cycles = cycles + 1
                if cycles == cycles_to_wait:
                    _saveState({'status': "error"})
                    raise WaitException("Took Too Long to Generate Data")
                sleep(1)
            sleep(3)

        # NOTE(review): after the multipart wait, `doc` is the snapshot taken
        # before the final sleeps; this assumes it already holds "data_id".
        # Confirm against the code that writes "multipart_reduced".
        dataId = doc['csynapses'][csynapseName]['data_id']

        ret = {}
        # special case for homegrown algos
        if algorithm in externalAlgorithms:
            # get file
            path = getDataFile(dataId)
            resultsPath = path + 'results'
            script_dir = os.path.dirname(os.path.realpath(__file__))
            runEx.execute(
                'java -jar {3}/externalExecutables/{0}.jar {1} {2}'.format(
                    algorithm, path, resultsPath, script_dir))
            # The results file holds "<score>,<time>"; the values are
            # divided by 100 and 1000 respectively before being stored.
            with open(resultsPath, 'r') as f:
                score, elapsed = f.read().split(',')
            ret['score'] = float(score) / 100
            ret['time'] = float(elapsed) / 1000
        else:
            # Instantiate Classifier
            alg = getDiscreetClassifier(algorithm, params)
            # Get data from file
            data = cleanData(getDataFile(dataId))
            # Run Cross Validation
            meanScoreTime = doShuffleCrossValidation(alg, data.data,
                                                     data.target)
            ret['score'] = meanScoreTime.meanScore
            ret['time'] = meanScoreTime.timeTaken

        # save result in db
        _saveState({'score': ret['score'], 'time': ret['time'],
                    'algoId': algorithm, 'params': params,
                    'status': "complete"})
        return
    except Exception:
        # Record the failure on the user's document rather than propagating.
        tb = traceback.format_exc()
        _saveState({'status': "error", "error_text": tb,
                    "algoId": algorithm, "params": params})
0
def classifyImages(dataIds, oldDataId, algorithm, params, userName,
                   csynapseName, dataName):
    """Classify unlabeled image files and store the result for the user.

    Training files come from the csynapse's ``multipart_data_tagmap`` (label
    mapped to file ids); ``dataIds`` is an iterable of ``(filename, dataId)``
    pairs to classify. The output is one ``<label>,<image name>`` line per
    image, saved with ``db.files.put`` and referenced from
    ``csynapses.<csynapseName>.classified.<dataName>`` on the user document.
    """
    # Return value is not used below; the call is kept as in the original.
    getMultiPartDataFiles(userName, csynapseName)
    userCollection = db.users
    doc = userCollection.find_one({'_id': userName})
    labelMap = doc['csynapses'][csynapseName]['multipart_data_tagmap']

    # label -> list of local files for that label
    mappedFiles = {
        label: [getDataFile(mongoId) for mongoId in mongoIds]
        for label, mongoIds in labelMap.iteritems()
    }
    unlabeledFiles = [(filename, getDataFile(dataId))
                      for filename, dataId in dataIds]

    if algorithm in externalAlgorithms:
        # Homegrown algorithms exchange data through uniquely named files
        unique = str(uuid.uuid4())
        trainingPath = 'train' + unique
        newDataPath = 'classify' + unique
        resultsPath = 'results' + unique

        script_dir = os.path.dirname(os.path.realpath(__file__))
        imageNames = vectorizeForClassify(mappedFiles, unlabeledFiles,
                                          trainingPath, newDataPath)

        command = 'java -jar {4}/externalExecutables/{0}.jar {1} {2} {3}'.format(
            algorithm, trainingPath, newDataPath, resultsPath, script_dir)
        runEx.execute(command)

        # First comma-separated field of each result line is paired with
        # the image name at the same position
        with open(resultsPath, 'r') as f:
            rows = [line.split(',')[0] + ',' + imageNames[i]
                    for i, line in enumerate(f)]
        finalString = '\n'.join(rows)
    else:
        trainingData, toClassify = vectorizeForClassify(
            mappedFiles, unlabeledFiles)
        # Instantiate the classifier and train it on the labelled data
        alg = getDiscreetClassifier(algorithm, params)
        alg.fit(trainingData.data, trainingData.target)

        predictResult = predict(alg, toClassify.data)

        # Pair each prediction with its file name again
        classifiedData = [(predictResult[index][0], name)
                          for index, name in enumerate(toClassify.names)]
        finalString = '\n'.join(','.join(pair) for pair in classifiedData)

    classifiedDataId = db.files.put(finalString.encode("UTF-8"))
    # Save data id to the csynapse
    classifiedKey = 'csynapses.{0}.classified.{1}'.format(csynapseName,
                                                          dataName)
    db.users.update_one({'_id': userName},
                        {'$set': {classifiedKey: classifiedDataId}})
0
	def runTest(self):
		"""Cross-validate the 'leastSquares' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('leastSquares')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'LeastSquares result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 12
0
	def runTest(self):
		"""Cross-validate the 'decisionTree' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('decisionTree')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Decision Tree result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 13
0
	def runTest(self):
		"""Cross-validate the 'passiveAggressive' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('passiveAggressive')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Passive Aggressive result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 14
0
	def runTest(self):
		"""Cross-validate the 'nearestCentroid' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('nearestCentroid')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Nearest Centroid result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 15
0
	def runTest(self):
		"""Cross-validate the 'randomForest' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('randomForest')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Random Forest result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 16
0
	def runTest(self):
		"""Cross-validate the 'sgd' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('sgd')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'SGD result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 17
0
	def runTest(self):
		"""Cross-validate the classifier registered as 'guassNB' and print its mean score and time taken."""
		classifier = getDiscreetClassifier('guassNB')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'Guassian Naive Bayes result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 18
0
	def runTest(self):
		"""Asking for the unknown classifier name 'blksci' must raise ValueError."""
		self.assertRaises(ValueError, getDiscreetClassifier, 'blksci')
Exemplo n.º 19
0
	def runTest(self):
		"""Cross-validate the 'logisticRegression' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('logisticRegression')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'logisticRegression result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)
Exemplo n.º 20
0
	def runTest(self):
		"""Cross-validate the 'bayesianRidge' classifier and print its mean score and time taken."""
		classifier = getDiscreetClassifier('bayesianRidge')
		outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
		report = ''.join([
			'BayesianRidge result: ', str(outcome.meanScore),
			' Time taken:', str(outcome.timeTaken), ' seconds',
		])
		print(report)