Exemplo n.º 1
0
 def runTest(self):
     """Check the length of the first 'Iris-virginica' point, before and after JSON.

     Loads the data set named by ``self.filename``, asks ``getDataPoints`` for
     points using ``3`` as its third argument, and verifies that the first
     'Iris-virginica' entry has three components. The same check is repeated
     after the result has been serialized to JSON and parsed back.
     """
     data = cleanData(self.filename)
     result = getDataPoints(data.data, data.target, 3)
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(len(result['Iris-virginica'][0]), 3)

     # Round-trip through JSON to make sure the structure is serializable.
     reloaded = json.loads(json.dumps(result))
     self.assertEqual(len(reloaded['Iris-virginica'][0]), 3)
Exemplo n.º 2
0
 def runTest(self):
     """Cross-validate the 'knearest' classifier and check a time was recorded.

     Loads the data set named by ``self.filename``, runs
     ``CV.doShuffleCrossValidation`` with the 'knearest' classifier, prints the
     mean score and elapsed time, and asserts that the elapsed time is
     strictly positive.
     """
     data = cleanData(self.filename)
     knearest = getDiscreetClassifier('knearest')
     resultTime = CV.doShuffleCrossValidation(knearest, data.data,
                                              data.target)
     print('knearest result: ' + str(resultTime.meanScore) +
           ' Time taken: ' + str(resultTime.timeTaken))
     # assertGreater shows the offending value when the check fails.
     self.assertGreater(resultTime.timeTaken, 0)
Exemplo n.º 3
0
def taskGetPoints(userName, csynapseName, mongoId):
    """Compute reduced data points for a csynapse and store them per dimension.

    The data file identified by ``mongoId`` is loaded and cleaned. Points are
    then generated for every dimensionality from ``min(d, 3)`` down to 1,
    where ``d`` is the length of the first data row (at least dimension 1 is
    always generated). Each point set is JSON-encoded, stored through
    ``db.files.put``, and the resulting id is written to the user's document
    under ``csynapses.<csynapseName>.points.<dimension>``.

    Args:
        userName: ``_id`` of the user document to update.
        csynapseName: Name of the csynapse the points belong to.
        mongoId: Id of the stored data file to read.
    """
    data = cleanData(getDataFile(mongoId))

    # Dimensionality of the data, taken from the first row.
    d = len(data.data[0])
    # Clamp to the range 1..3: d >= 3 -> 3, 2, 1; d == 2 -> 2, 1; else -> 1.
    highestDimension = max(1, min(d, 3))

    userCollection = db.users
    for x in range(highestDimension, 0, -1):
        points = getDataPoints(data.data, data.target, x)
        # Save the result in the database and link it from the user document.
        pointsId = db.files.put(json.dumps(points))
        pointsKey = 'csynapses.{0}.points.{1}'.format(csynapseName, x)
        userCollection.update_one({'_id': userName},
                                  {'$set': {pointsKey: pointsId}})
Exemplo n.º 4
0
def classify(newDataId, oldDataId, algorithm, params, userName, csynapseName,
             dataName):
    """Classify new data with a model trained on old data and store the result.

    For algorithms listed in ``externalAlgorithms`` an external jar is run on
    the training and new data files and its results file is read back.
    Otherwise a classifier is built with ``getDiscreetClassifier``, fitted on
    the cleaned old data, and used to predict the new data. In both cases the
    headers of the training data (if any) are placed in front of the results.
    The resulting text is stored through ``db.files.put`` and its id is saved
    on the user document under
    ``csynapses.<csynapseName>.classified.<dataName>``.

    Args:
        newDataId: Id of the stored file holding the data to classify.
        oldDataId: Id of the stored file holding the training data.
        algorithm: Identifier of the algorithm to use.
        params: Parameters passed to ``getDiscreetClassifier`` (unused for
            external algorithms).
        userName: ``_id`` of the user document to update.
        csynapseName: Name of the csynapse the result belongs to.
        dataName: Key under which the classified data id is stored.
    """
    # Special case for homegrown algorithms shipped as external executables.
    if algorithm in externalAlgorithms:
        trainingPath = getDataFile(oldDataId)
        possibleHeaders = getHeaders(trainingPath)
        newDataPath = getDataFile(newDataId)
        resultsPath = trainingPath + 'results'

        runEx.execute(
            'java -jar externalExecutables/{0}.jar {1} {2} {3}'.format(
                algorithm, trainingPath, newDataPath, resultsPath))

        # Always load the results. Previously the file was only read when
        # headers were present, so header-less data stored an empty result.
        with open(resultsPath, 'r') as f:
            finalString = possibleHeaders + f.read()
    else:
        # Train the classifier on the old (tagged) data.
        oldData = cleanData(getDataFile(oldDataId))
        alg = getDiscreetClassifier(algorithm, params)
        alg.fit(oldData.data, oldData.target)

        # Predict the new (untagged) data.
        newData = cleanUntagged(getDataFile(newDataId))
        # Headers from the old file are saved together with the results.
        possibleHeaders = oldData.headers
        result = predict(alg, newData.data)

        outputParts = []
        if possibleHeaders != '':
            outputParts.append(possibleHeaders)
        # One CSV line per prediction: x[0] first, then every value of x[1]
        # except the last. Joining keeps x[0] even when no values follow.
        for x in result:
            fields = [str(x[0])]
            fields.extend(str(v) for v in x[1][:-1])
            outputParts.append(','.join(fields) + '\n')

        finalString = ''.join(outputParts)

    # Put the classified data file into the database.
    classifiedDataId = db.files.put(finalString)
    # Save the data id on the csynapse.
    classifiedKey = 'csynapses.{0}.classified.{1}'.format(csynapseName,
                                                          dataName)
    db.users.update_one({'_id': userName},
                        {'$set': {classifiedKey: classifiedDataId}})
Exemplo n.º 5
0
def runAlgoTest(algoData, userName, csynapseName):
    """Run one algorithm against a csynapse's data and record the outcome.

    A new entry is created on the user document under
    ``csynapses.<csynapseName>.algorithms.<newObjectId>`` with status
    "processing". If the csynapse has no ``data_id`` yet but has
    ``multipart_data``, the document is polled once per second (up to 20
    polls) until ``multipart_reduced`` also appears. The algorithm is then
    run, either through an external jar for names in ``externalAlgorithms``
    or through ``doShuffleCrossValidation``, and the entry is replaced with
    the score, the time taken and status "complete".

    Any ``Exception`` raised along the way is not propagated; instead the
    entry is replaced with status "error" and the formatted traceback.

    Args:
        algoData: Mapping with an 'algorithm' key and an optional 'params' key.
        userName: ``_id`` of the user document to update.
        csynapseName: Name of the csynapse whose data is used.
    """
    newObjectId = str(ObjectId())
    algoKey = 'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId)
    userQuery = {'_id': userName}

    # Bound before the try block so the error handler can always use them,
    # even when the failure happens while reading algoData.
    userCollection = db.users
    algorithm = None
    params = {}

    def saveState(fields):
        # Replace the whole algorithm entry, stamping it with the current time.
        fields['last_updated'] = strftime("%a, %d %b %Y %H:%M:%S +0000",
                                          gmtime())
        userCollection.update_one(userQuery, {'$set': {algoKey: fields}})

    try:
        algorithm = algoData['algorithm']
        if 'params' in algoData:
            params = algoData['params']

        saveState({'algoId': algorithm, 'params': params,
                   'status': "processing"})

        doc = userCollection.find_one(userQuery)
        csynapse = doc['csynapses'][csynapseName]
        # The handler below records the error state for both raises here, so
        # no separate status update is needed before raising.
        if "data_id" not in csynapse and "multipart_data" not in csynapse:
            raise DataException("Data Not Available")

        if "data_id" not in csynapse:
            # Multipart upload: wait until the reduced data has been produced.
            cycles_to_wait = 20
            cycles = 0
            while True:
                doc = userCollection.find_one(userQuery)
                csynapse = doc['csynapses'][csynapseName]
                if ("multipart_data" in csynapse
                        and "multipart_reduced" in csynapse):
                    break
                cycles = cycles + 1
                if cycles == cycles_to_wait:
                    raise WaitException("Took Too Long to Generate Data")
                sleep(1)
            # Same grace period as before (1s + 3s), then re-read the document
            # so that 'data_id' is looked up in fresh data, not a stale copy.
            sleep(4)
            doc = userCollection.find_one(userQuery)

        dataId = doc['csynapses'][csynapseName]['data_id']

        # Special case for homegrown algorithms shipped as external jars.
        if algorithm in externalAlgorithms:
            path = getDataFile(dataId)
            resultsPath = path + 'results'
            script_dir = os.path.dirname(os.path.realpath(__file__))
            runEx.execute(
                'java -jar {3}/externalExecutables/{0}.jar {1} {2}'.format(
                    algorithm, path, resultsPath, script_dir))
            # The results file holds "<score>,<time>".
            with open(resultsPath, 'r') as f:
                rawScore, rawTime = f.read().split(',')
            score = float(rawScore) / 100
            timeTaken = float(rawTime) / 1000
        else:
            alg = getDiscreetClassifier(algorithm, params)
            data = cleanData(getDataFile(dataId))
            meanScoreTime = doShuffleCrossValidation(alg, data.data,
                                                     data.target)
            score = meanScoreTime.meanScore
            timeTaken = meanScoreTime.timeTaken

        # Save the result in the database.
        saveState({'score': score, 'time': timeTaken, 'algoId': algorithm,
                   'params': params, 'status': "complete"})
    except Exception:
        # Task boundary: record the failure instead of propagating it.
        # KeyboardInterrupt/SystemExit are deliberately not intercepted.
        saveState({'status': "error", 'error_text': traceback.format_exc(),
                   'algoId': algorithm, 'params': params})