def runTest(self):
    """Check that getDataPoints yields 3-D points and that the result
    survives a JSON round-trip unchanged.
    """
    data = cleanData(self.filename)
    result = getDataPoints(data.data, data.target, 3)
    # assertEqual (not assertTrue(x == 3)) so failures report both values.
    self.assertEqual(len(result['Iris-virginica'][0]), 3)
    # Serialize and reload: dimensionality must be preserved by JSON.
    jsonResult = json.dumps(result)
    reloaded = json.loads(jsonResult)
    self.assertEqual(len(reloaded['Iris-virginica'][0]), 3)
def runTest(self):
    """Run shuffle cross-validation with the k-nearest classifier and
    assert that a (positive) wall-clock time was measured.
    """
    data = cleanData(self.filename)
    knearest = getDiscreetClassifier('knearest')
    resultTime = CV.doShuffleCrossValidation(knearest, data.data, data.target)
    # f-string produces byte-identical output to the original concatenation.
    print(f'knearest result: {resultTime.meanScore} Time taken: {resultTime.timeTaken}')
    # assertGreater gives a useful failure message, unlike assertTrue(t > 0).
    self.assertGreater(resultTime.timeTaken, 0)
def taskGetPoints(userName, csynapseName, mongoId):
    """Compute reduced-dimension point sets for a user's data and store them.

    Cleans the data file behind *mongoId*, projects it down to every
    dimensionality from min(d, 3) to 1, stores each projection as JSON in
    ``db.files`` and records the file id under
    ``csynapses.<csynapseName>.points.<dim>`` on the user document.
    """
    data = cleanData(getDataFile(mongoId))
    # Dimensionality of the cleaned feature vectors.
    d = len(data.data[0])
    # Equivalent to the old if/elif chain: d>=3 -> [3,2,1], d==2 -> [2,1],
    # otherwise [1]. min/max clamp d into the supported 1..3 range.
    dimensions = range(min(max(d, 1), 3), 0, -1)
    userCollection = db.users  # hoisted: the collection handle is loop-invariant
    for x in dimensions:
        points = getDataPoints(data.data, data.target, x)
        # Persist the serialized points and link the file id to the csynapse.
        pointsId = db.files.put(json.dumps(points))
        userCollection.update_one(
            {'_id': userName},
            {'$set': {'csynapses.{0}.points.{1}'.format(csynapseName, x): pointsId}})
def classify(newDataId, oldDataId, algorithm, params, userName, csynapseName, dataName):
    """Classify *newDataId* using a model trained on *oldDataId* and store
    the CSV result in ``db.files``, linking it to the user's csynapse.

    ``algorithm`` is either an external (homegrown) jar name or a
    scikit-style classifier id understood by getDiscreetClassifier.
    """
    if algorithm in externalAlgorithms:
        # Homegrown algorithms: train + classify in one subprocess call
        # that writes its output to resultsPath.
        trainingPath = getDataFile(oldDataId)
        possibleHeaders = getHeaders(trainingPath)
        newDataPath = getDataFile(newDataId)
        resultsPath = trainingPath + 'results'
        # NOTE(review): command built by string formatting; assumes paths
        # from getDataFile contain no shell metacharacters — confirm.
        runEx.execute(
            'java -jar externalExecutables/{0}.jar {1} {2} {3}'.format(
                algorithm, trainingPath, newDataPath, resultsPath))
        # BUGFIX: the results file was previously only read when headers
        # existed, so header-less data sets stored an empty result. Always
        # read the results; prepending '' is a no-op when there are no headers.
        with open(resultsPath, 'r') as f:
            content = f.read()
        finalString = possibleHeaders + content
    else:
        # Train a classifier on the old (tagged) data.
        oldData = cleanData(getDataFile(oldDataId))
        alg = getDiscreetClassifier(algorithm, params)
        alg.fit(oldData.data, oldData.target)
        # Predict on the new (untagged) data.
        newData = cleanUntagged(getDataFile(newDataId))
        possibleHeaders = oldData.headers
        result = predict(alg, newData.data)
        finalStringList = []
        if possibleHeaders != '':
            finalStringList.append(possibleHeaders)
        for x in result:
            # Row format: predicted label, then all but the last feature value.
            # BUGFIX: joining the fields first means the label is no longer
            # clobbered by the trailing-newline fixup when x[1][:-1] is empty.
            fields = [str(x[0])] + [str(v) for v in x[1][:-1]]
            finalStringList.append(','.join(fields) + '\n')
        finalString = ''.join(finalStringList)
    # Put classified data file into the database.
    classifiedDataId = db.files.put(finalString)
    # Save the file id onto the user's csynapse document.
    db.users.update_one(
        {'_id': userName},
        {'$set': {'csynapses.{0}.classified.{1}'.format(csynapseName, dataName): classifiedDataId}})
    return
def runAlgoTest(algoData, userName, csynapseName):
    """Benchmark one algorithm for a user's csynapse and record the outcome.

    Writes a status subdocument under
    ``csynapses.<csynapseName>.algorithms.<newObjectId>`` that moves through
    processing -> complete (with score/time) or -> error (with a traceback).
    Waits (polls) for multipart uploads to finish before running.
    """
    newObjectId = str(ObjectId())
    userCollection = db.users
    # BUGFIX: resolve these BEFORE the try block — the except handler below
    # references them, and previously a missing 'algorithm' key crashed the
    # handler itself with a NameError.
    algorithm = algoData.get('algorithm')
    params = algoData.get('params', {})

    def _set_status(fields):
        # Stamp and write the algorithm's status subdocument for this run.
        fields['last_updated'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
        userCollection.update_one(
            {'_id': userName},
            {'$set': {'csynapses.{0}.algorithms.{1}'.format(
                csynapseName, newObjectId): fields}})

    try:
        _set_status({'algoId': algorithm, 'params': params, 'status': "processing"})
        doc = userCollection.find_one({'_id': userName})
        # NOTE: polling the document like this is crude, but works until
        # better progress-reporting tools exist.
        csynapse = doc['csynapses'][csynapseName]
        if "data_id" not in csynapse and "multipart_data" not in csynapse:
            _set_status({'status': "error"})
            raise DataException("Data Not Available")
        if "data_id" not in csynapse:
            # Multipart upload in progress: poll up to 20 times (1s apart)
            # for both multipart markers to appear.
            cycles_to_wait = 20
            cycles = 0
            found = False
            while not found:
                doc = userCollection.find_one({'_id': userName})
                csynapse = doc['csynapses'][csynapseName]
                if "multipart_data" in csynapse and "multipart_reduced" in csynapse:
                    found = True
                else:
                    cycles += 1
                    if cycles == cycles_to_wait:
                        _set_status({'status': "error"})
                        raise WaitException("Took Too Long to Generate Data")
                    sleep(1)
            # Grace period for data_id to be written after the markers appear.
            sleep(3)
        dataId = doc['csynapses'][csynapseName]['data_id']
        ret = {}
        if algorithm in externalAlgorithms:
            # Homegrown algorithm: run the jar and parse "score,millis"
            # from its results file.
            path = getDataFile(dataId)
            resultsPath = path + 'results'
            script_dir = os.path.dirname(os.path.realpath(__file__))
            runEx.execute(
                'java -jar {3}/externalExecutables/{0}.jar {1} {2}'.format(
                    algorithm, path, resultsPath, script_dir))
            with open(resultsPath, 'r') as f:
                raw = f.read()
            score, elapsed = raw.split(',')  # 'elapsed' avoids shadowing the time module
            ret['score'] = float(score) / 100   # jar reports a percentage
            ret['time'] = float(elapsed) / 1000  # jar reports milliseconds
        else:
            # Library classifier: score via shuffle cross-validation.
            alg = getDiscreetClassifier(algorithm, params)
            data = cleanData(getDataFile(dataId))
            meanScoreTime = doShuffleCrossValidation(alg, data.data, data.target)
            ret['score'] = meanScoreTime.meanScore
            ret['time'] = meanScoreTime.timeTaken
        _set_status({'score': ret['score'], 'time': ret['time'],
                     'algoId': algorithm, 'params': params, 'status': "complete"})
        return
    except Exception:
        # BUGFIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit. Record the traceback for the UI.
        tb = traceback.format_exc()
        _set_status({'status': "error", 'error_text': "{}".format(tb),
                     'algoId': algorithm, 'params': params})