Example #1
from multiprocessing import Process, Queue
from queue import Empty

def evaluate(run_identifier, control_params, params, function_to_evaluate, out_folder, force=False):
  experiment_db = ExperimentDB(out_folder, function_to_evaluate.__name__, run_identifier, dump_also_as_json=True)
  previous_result = experiment_db.get_experiment_result(params)

  if previous_result is not None and not force:
    print("already_exists:", function_to_evaluate.__name__, "for", run_identifier, "with", params)
    return previous_result

  print(function_to_evaluate.__name__, "for", run_identifier, "with", params)
  result_q = Queue()
  p = Process(target=function_to_evaluate, args=(result_q, control_params, params))
  p.start()
  p.join()  # blocks until the child process terminates

  try:
    res = result_q.get_nowait()
  except Empty:
    print("no result available for this call; the process most likely failed with exit code %s" % p.exitcode)
    res = None

  experiment_db.add_experiment(control_params, params, res)
  return (control_params, params, res)
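The function handed to evaluate() receives the result queue as its first argument and is expected to put exactly one result on it before exiting. Below is a minimal sketch of a compatible worker and call; dummy_task, the parameter dicts, and the folder path are illustrative, not from the original project:

def dummy_task(result_q, control_params, params):
  # A worker compatible with evaluate(): push one result object onto the queue.
  result_q.put({"duration_kmeans": 1.23, "iteration_changes": [10, 4, 0]})

# control_params, params, res = evaluate("run_001", {"seed": 42},
#                                        {"task": {"run": 0}},
#                                        dummy_task, "/tmp/experiments")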
Example #2
    def setName(self, name):
        oldname = self.name
        if not self._isFinished.is_set():
            return {'oldname': oldname, 'name': self.name,
                    'error': 'Experiment is currently running.'}

        result = ExperimentDB.rename(oldname, name)
        if 'error' not in result:
            self.name = name
        return result
Example #3
  def setName(self, name):
    oldname = self.name
    if not self._isFinished.is_set():
      return {'oldname': oldname, 'name': self.name,
              'error': 'Experiment is currently running.'}

    result = ExperimentDB.rename(oldname, name)
    if 'error' not in result:
      self.name = name
    return result
Example #4
  def runCurrentExperiment(self, expType="Standard", isLoad=False):
    """
    Creates an experiment runner for the current model and starts running the
    model in a separate thread.
    """
    if self.experimentRunner:
      self.stopCurrentExperiment()
      self.datasets[self.currentDataset].rewind()

    if isLoad:
      modelInfo = json.loads(ExperimentDB.get(self.name)['metadata'])
      modelDescriptionText = modelInfo['modelDescriptionText']
      subDescriptionText = modelInfo['subDescriptionText']
      self.loadDescriptionFile(modelDescriptionText, subDescriptionText)
    else:
      data = dict(
        modelDescriptionText=self.descriptionText,
        subDescriptionText=self.subDescriptionText
      )
      ExperimentDB.add(self.name, data)

    self.__currentModelData = []
    if expType == "Standard":
      self.experimentRunner = ExperimentRunner(
                                  name=self.name,
                                  modelDescription=self.models[self.currentModel],
                                  control=self.control,
                                  dataset=self.datasets[self.currentDataset])
    elif expType == "Anomaly":
      self.experimentRunner = AnomalyRunner(
                                  name=self.name,
                                  modelDescription=self.models[self.currentModel],
                                  control=self.control,
                                  dataset=self.datasets[self.currentDataset])

    if isLoad:
      self.experimentRunner.load()
    else:
      self.experimentRunner.run()

    return self.getExperimentInfo(self.models[self.currentModel])
Example #5
  def getProtosAtTime(self, timestep):

    collection = ExperimentDB.getExperimentDB(self.name)

    experimentData = collection.find_one({"_id":timestep})
    
    experimentData['protos'] = []

    predictedField = self._modelDescription["predictedField"]
    predictedFieldIndex = self.getFieldNames().index(predictedField)

    dists = json.loads(experimentData['classificationDist'])

    for distId in json.loads(experimentData["classificationIdx"]):
      distSurroundingValues = collection.find({"_id": {
          "$gt": distId-10, "$lt": distId+10 
      }}).sort('_id', pymongo.ASCENDING)

      experimentData['protos'].append(dict(
        ids=[],
        actual=[],
        prediction=[],
        anomaly=[],
        anomalyLabel=[],
        dist=dists.pop(0),
        index=distId
      ))
      protosId = len(experimentData['protos']) - 1
      for distSurroundingValue in distSurroundingValues:
        inferences = json.loads(distSurroundingValue["inferences"])

        actual = distSurroundingValue["actual"]

        
        inference = inferences[InferenceElement.multiStepBestPredictions]
        step = min(inference)  # earliest prediction step
        prediction = inference[step]
        
        if prediction is None:
          prediction = 0.0

        anomaly = inferences[InferenceElement.anomalyScore]
        anomalyLabel = inferences[InferenceElement.anomalyLabel]
        experimentData['protos'][protosId]["ids"].append(distSurroundingValue["_id"])
        experimentData['protos'][protosId]["actual"].append(actual)
        experimentData['protos'][protosId]["prediction"].append(prediction)
        experimentData['protos'][protosId]["anomaly"].append(anomaly)
        experimentData['protos'][protosId]["anomalyLabel"].append(anomalyLabel)

    return experimentData
Example #6
def result_evaluation_dataset_speed_comparison(out_folder, out_folder_csv):

    for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
        print(plotname)
        run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
        plot_data = {}

        result_data = {}
        for run_identifier in run_identifiers:
            db = ExperimentDB(out_folder, fcnt, run_identifier)

            for resid in db.get_algorithm_run_ids():
                (control_params, params,
                 res) = db.get_experiment_result_from_run_id(resid)
                if res is None:
                    continue
                ds = params['info']['dataset_name']
                alg = params['info']['algorithm']
                no_clusters = params['task']['no_clusters']
                run = params['task']['run']
                duration_kmeans = res['duration_kmeans']
                no_iterations = len(res['iteration_changes'])

                if ds not in result_data:
                    result_data[ds] = {}
                    result_data[ds]['results'] = {}
                    result_data[ds]['infos'] = {}

                if no_clusters not in result_data[ds]['results']:
                    result_data[ds]['results'][no_clusters] = {}

                if alg not in result_data[ds]['results'][no_clusters]:
                    result_data[ds]['results'][no_clusters][alg] = {}

                if 'duration' not in result_data[ds]['results'][no_clusters][
                        alg]:
                    result_data[ds]['results'][no_clusters][alg][
                        'duration'] = {}

                if 'no_iterations' not in result_data[ds]['results'][
                        no_clusters][alg]:
                    result_data[ds]['results'][no_clusters][alg][
                        'no_iterations'] = {}

                result_data[ds]['results'][no_clusters][alg]['duration'][
                    run] = duration_kmeans

                if 'truncated_svd' in res:
                    result_data[ds]['results'][no_clusters][alg]['duration'][
                        run] += res['truncated_svd']['duration']

                result_data[ds]['infos']['input_dimension'] = res[
                    'input_dimension']
                result_data[ds]['infos']['input_samples'] = res[
                    'input_samples']
                result_data[ds]['infos']['input_annz'] = res['input_annz']
                result_data[ds]['results'][no_clusters][alg]['no_iterations'][
                    run] = no_iterations

            remove_incomplete_data(result_data)

            print("Result data:")
            pprint(result_data)

        create_plot(output_folder=out_folder_csv,
                    plot_name=plotname,
                    pdata=result_data)
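For reference, the nested result_data these loops build has, schematically, the shape below; the dataset name and all values are illustrative:

result_data = {
  "example_dataset": {
    "results": {
      10: {                          # no_clusters
        "kmeans": {                  # algorithm
          "duration": {0: 1.23},     # run -> seconds
          "no_iterations": {0: 17},  # run -> iteration count
        },
      },
    },
    "infos": {"input_dimension": 1000,
              "input_samples": 5000,
              "input_annz": 42.0},
  },
}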
Example #7
  def getDetailsAtTime(self, timestep):
    collection = ExperimentDB.getExperimentDB(self.name)
    return collection.find_one({"_id": timestep})
Example #8
  def _runExperimentLoop(self, queue):
    collection = ExperimentDB.getExperimentDB(self.name)

    while self._maxiterations == -1 or self._iteration <= self._maxiterations:
      try:
        # Get next record
        record = self._dataset.getNextRecord()
        
      except StopIteration:
        self._isFinished.set()
        return None
      
      if self._stop.isSet():
        break

      # Feed record to model and get prediction
      modelResult = self._model.run(record)
      
      if modelResult is None:
        continue

      if modelResult.inferences[InferenceElement.anomalyVector] is not None:
        modelResult.inferences[InferenceElement.anomalyVector] = \
          modelResult.inferences[InferenceElement.anomalyVector].nonzero()[0].tolist()        

      distances = self._model._classifier_distances #classifier.getSelf().getLatestDistances()
      sortedDistIdx = []
      sortedDists = []
      if distances is not None and len(distances) > 0:
        sortedDistIdx = distances.argsort()
        sortedDists = distances[sortedDistIdx[:5]].tolist()

        idList = self._model._classifier_indexes #classifier.getSelf().getParameter('categoryRecencyList')

        if len(idList) > 0:
          sortedDistIdx = [
              idList[i] + self._model._classificationDelay - 1
              for i in sortedDistIdx[:min(5, len(sortedDistIdx))]]
        else:
          sortedDistIdx = []
        #matrix = classifier.getSelf()._knn._Memory
        #print matrix.shape
        #print "Index: %s" % (sorted)
        #if len(sorted) > 0:
        #  print matrix.getRow(int(sorted[0]))
        #  print matrix.getRow(int(sorted[0])).nonzero()[0]

      predictedField = self._modelDescription["predictedField"]
      predictedFieldIndex = self.getFieldNames().index(predictedField)

      modelResult.inferences['encodings'] = None
      modelResult.sensorInput.dataEncodings = None
      
      actual = modelResult.sensorInput.dataRow[predictedFieldIndex]


      dbelem = {"_id":self._iteration,
                "actual": actual,
                "inferences": json.dumps(modelResult.inferences),
                "classificationIdx":json.dumps(sortedDistIdx),
                "classificationDist":json.dumps(sortedDists)
              }
      
      collection.insert(dbelem)
      
      self._dataQ.put(dbelem)
        
      self._iteration += 1
      gevent.sleep(0)

    self._isFinished.set()
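The gevent.sleep(0) after each insert yields the greenlet so that a consumer of self._dataQ can run between records. Below is a minimal sketch of such a consumer; the queue setup is illustrative, only the record fields follow the loop above:

import gevent
from gevent.queue import Queue

dataQ = Queue()

def consume(q):
  # Drain records as the experiment loop produces them; the producer's
  # gevent.sleep(0) hands control to this greenlet after every insert.
  while True:
    record = q.get()
    print(record["_id"], record["actual"])

# gevent.spawn(consume, dataQ)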
Example #9
  def _runExperimentLoadLoop(self, queue):
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find()
    for record in experimentData:
      self._dataQ.put(record)
      gevent.sleep(0)
Example #10
    def getDataAtTime(self, dataInput):
        timestep = int(dataInput['timestep'])
        collection = ExperimentDB.getExperimentDB(self.name)
        experimentData = collection.find_one({"_id": timestep})
        return experimentData
Example #11
    def _runExperimentLoadLoop(self, queue):
        collection = ExperimentDB.getExperimentDB(self.name)
        experimentData = collection.find()
        for record in experimentData:
            self._dataQ.put(record)
            gevent.sleep(0)
Example #12
  def _runExperimentLoop(self, queue):

    self.prevFieldPred = {}

    self._model.resetSequenceStates()

    cOut = os.fdopen(os.open("/tmp/cerebro.cout", os.O_RDWR | os.O_CREAT), 'w+')
    oldC = os.dup(1)

    collection = ExperimentDB.getExperimentDB(self.name)

    while self._maxiterations == -1 or self._iteration <= self._maxiterations:
      try:
        # Get next record
        record = self._dataset.getNextRecord()

      except StopIteration:
        self._isFinished.set()
        return None

      if self._stop.isSet():
        break

      # Feed record to model and get prediction. Capture all the stdout as well
      os.dup2(cOut.fileno(), 1)

      modelResult = self._model.run(record)

      os.dup2(oldC, 1)

      cOut.seek(0)
      verboseOutput = cOut.read()
      cOut.truncate(0)

      modelResult.inferences['encodings'] = None
      modelResult.sensorInput.dataEncodings = None

      model = self._model
      sensor = model._getSensorRegion()
      sp = model._getSPRegion()
      tp = model._getTPRegion()
      cl = model._getClassifierRegion()

      spImp = None
      tpImp = None

      if sp is not None:
        spImp = sp.getSelf()._sfdr
      if tp is not None:
        tpImp = tp.getSelf()._tfdr
      clImp = cl.getSelf()._claClassifier

      #Copy all the pertinent data
      sourceScalars = copy.deepcopy(sensor.getOutputData('sourceOut'))
      sensorBits = sensor.getOutputData('dataOut')
      sensorBUOut = sensorBits.nonzero()[0].tolist()

      SPBUOut = []
      nConnectedInputs = []
      overlaps = []

      if spImp is not None:
        SPBUOut = sp.getOutputData('bottomUpOut').nonzero()[0].tolist()
        nConnectedInputs = spImp._allConnectedM.nNonZerosPerRow()[SPBUOut].astype('int32').tolist()
        overlaps = zip(SPBUOut,
                       spImp._overlapsNoBoost[SPBUOut].astype('int32').tolist())


      TPTDOut = tp.getOutputData('topDownOut') if tp else None
      sensorTDIn = sensor.getInputData('temporalTopDownIn')

      permanences = {}
      predictedCols = ()
      predictedConfidences = ()
      tpInfActiveCells = ()
      tpLrnActiveCells = ()
      tpInfPredT_1 = ()  # defaults so the record below never hits a NameError
      tpInfPredT = ()
      tpPredCells = []

      if TPTDOut is not None:
        predictedCols = TPTDOut.nonzero()[0].tolist()
        predictedConfidences = TPTDOut[predictedCols].tolist()
        tpInfActiveCells = self._formatActiveCells(tpImp.infActiveState['t'])
        tpLrnActiveCells = self._formatActiveCells(tpImp.lrnActiveState['t'])
        tpInfPredT_1 = self._formatActiveCells(tpImp.infPredictedState['t-1'])
        tpInfPredT = self._formatActiveCells(tpImp.infPredictedState['t'])
        tpPredCells = tpImp.infPredictedState['t'].nonzero()[0].tolist()

      sensorPredBits = []
      if sensorTDIn is not None:
        sensorPredBits = sensorTDIn

      if self.prevPredictedCols is None:
        self.prevPredictedCols = []
        self.prevTPPredictedCells = []
        self.prevPredictedConfs = []
        self.prevTPPredicted = []

      clPattern = clImp._patternNZHistory[-1]
      step = clImp.steps[0]
      bitHistories = {}

      fieldActivations = {}
      fieldPredictions = {}
      for fieldName, (start, stop) in self.fieldRanges.iteritems():
        nzBits = sensorBits[start:stop].nonzero()[0]
        fieldActivations[fieldName] = nzBits.tolist()
        nzBits = sensorPredBits[start:stop].nonzero()[0]
        fieldPredictions[fieldName] = nzBits.tolist()

      predictedField = self._modelDescription["predictedField"]
      predictedFieldIndex = self.getFieldNames().index(predictedField)
      actual = modelResult.sensorInput.dataRow[predictedFieldIndex]

      dthandler = (lambda obj: obj.isoformat()
                   if isinstance(obj, datetime.datetime) else None)
      record = {"_id":self._iteration,
                "actual": actual,
                "SPBUOut":SPBUOut,
                "overlaps":overlaps,
                "predictedCols": self.prevPredictedCols,
                "tpInfActive": tpInfActiveCells,
                "tpLrnActive": tpLrnActiveCells,
                "tpPredicted": self.prevTPPredictedCells,
                "tpInfPredT_1":tpInfPredT_1,
                "tpInfPredT":tpInfPredT,
                "permanences": permanences,
                "overlaps": overlaps,
                "inferences": json.dumps(modelResult.inferences),
                "record":json.dumps(modelResult.rawInput,
                                    default=dthandler),
                "fieldActivations":fieldActivations,
                #TODO: for some reason, field predictions don't need to be shifted??
                "fieldPredictions": fieldPredictions,
                "verboseOutput": verboseOutput,
                }

      collection.insert(record)

      self._dataQ.put(record)

      self.prevPredictedCols = predictedCols
      self.prevTPPredictedCells = tpPredCells
      self.prevPredictedConfs = predictedConfidences
      #self.prevTPPredicted = tpPredCells
      self.prevTPPredicted = None
      self.prevFieldPred = fieldPredictions

      self._iteration += 1
      gevent.sleep(0)

    os.close(oldC)
    cOut.close()
    self._isFinished.set()
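The os.dup2() shuffle above temporarily points file descriptor 1 at /tmp/cerebro.cout, so even output written by C extensions during model.run() is captured as verboseOutput. A self-contained sketch of the same technique; the path and function names here are illustrative:

import os
import sys

def capture_fd1(fn, path="/tmp/capture.out"):
  # Redirect fd 1 into a scratch file, run fn, restore fd 1, return the text.
  scratch = os.fdopen(os.open(path, os.O_RDWR | os.O_CREAT | os.O_TRUNC), "w+")
  saved = os.dup(1)             # remember the real stdout
  os.dup2(scratch.fileno(), 1)  # fd 1 now points at the scratch file
  try:
    fn()
    sys.stdout.flush()          # push buffered Python-level output to fd 1
  finally:
    os.dup2(saved, 1)           # restore the real stdout
    os.close(saved)
  scratch.seek(0)
  out = scratch.read()
  scratch.close()
  return out

# captured = capture_fd1(lambda: print("hello"))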
Example #13
  def getDataAtTime(self, dataInput):
    timestep = int(dataInput['timestep'])
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find_one({"_id": timestep})
    return experimentData
Example #14
    def POST(self):
        name = web.input()["name"]
        return json.dumps(ExperimentDB.delete(name))
Example #15
    def POST(self):
        name = web.input()["name"]
        return json.dumps(ExperimentDB.delete(name))
Example #16
    def GET(self):
        return json.dumps(ExperimentDB.list())
Example #17
    def GET(self):
        return json.dumps(ExperimentDB.list())
Example #18
def result_evaluation_minibatch_best_params(out_folder,
                                            out_folder_csv,
                                            remove_incomplete=False,
                                            ignore_datasets=()):

    fcnt, plotname = ('do_minibatch_best_params', 'kmeans_params')
    print(plotname)
    run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
    plot_data = OrderedDict()

    result_data = OrderedDict()
    for run_identifier in run_identifiers:
        db = ExperimentDB(out_folder, fcnt, run_identifier)
        print(run_identifier)
        for resid in db.get_algorithm_run_ids():
            (control_params, params,
             res) = db.get_experiment_result_from_run_id(resid)
            print(resid, control_params, params)
            if res is None:
                continue

            ds = params['info']['dataset_name']
            alg = params['info']['algorithm']
            no_clusters = params['task']['no_clusters']
            run = params['task']['run']
            duration_kmeans = res['duration_kmeans']
            no_iterations = len(res['iteration_changes'])
            iteration_durations = res['iteration_durations']
            iteration_changes = res['iteration_changes']
            iteration_wcssd = res['iteration_wcssd']

            if 'pca' in alg:
                param_percent = params['info']['truncated_svd_annz_percentage']
            elif 'bv' in alg:
                param_percent = params['task']['bv_annz']
            else:
                param_percent = 0

            if ds in ignore_datasets:
                continue

            if ds not in result_data:
                result_data[ds] = OrderedDict()
                result_data[ds]['results'] = OrderedDict()
                result_data[ds]['infos'] = OrderedDict()

            if no_clusters not in result_data[ds]['results']:
                result_data[ds]['results'][no_clusters] = OrderedDict()

            if alg not in result_data[ds]['results'][no_clusters]:
                result_data[ds]['results'][no_clusters][alg] = OrderedDict()

            for descr in [
                    'iteration_durations', 'iteration_changes',
                    'iteration_wcssd', 'duration', 'no_iterations'
            ]:
                if descr not in result_data[ds]['results'][no_clusters][alg]:
                    result_data[ds]['results'][no_clusters][alg][
                        descr] = OrderedDict()

            for descr in [
                    'iteration_durations', 'iteration_changes',
                    'iteration_wcssd', 'duration'
            ]:
                if run not in result_data[ds]['results'][no_clusters][alg][
                        descr]:
                    result_data[ds]['results'][no_clusters][alg][descr][
                        run] = OrderedDict()

            kmeans_duration_this_run = duration_kmeans

            if 'truncated_svd' in res:
                kmeans_duration_this_run += res['truncated_svd']['duration']

            if param_percent in result_data[ds]['results'][no_clusters][alg][
                    'duration'][run]:
                raise Exception(
                    "dataset=%s no_clusters=%s alg=%s duration run=%s already added !!! %s %s"
                    % (ds, str(no_clusters), alg, str(run), control_params,
                       params))

            result_data[ds]['results'][no_clusters][alg]['duration'][run][
                param_percent] = kmeans_duration_this_run

            result_data[ds]['infos']['input_dimension'] = res[
                'input_dimension']
            result_data[ds]['infos']['input_samples'] = res['input_samples']
            result_data[ds]['infos']['input_annz'] = res['input_annz']

            if run in result_data[ds]['results'][no_clusters][alg][
                    'no_iterations']:
                if result_data[ds]['results'][no_clusters][alg][
                        'no_iterations'][run] != no_iterations:
                    print(
                        alg, run, no_iterations, result_data[ds]['results']
                        [no_clusters][alg]['no_iterations'][run], ds,
                        no_clusters, param_percent, resid)
                    raise Exception(
                        "Number of iterations is not identical! "
                        "len(res['iteration_changes'])=%d, stored no_iterations=%d, resid=%d"
                        % (no_iterations, result_data[ds]['results']
                           [no_clusters][alg]['no_iterations'][run], resid))
            else:
                result_data[ds]['results'][no_clusters][alg]['no_iterations'][
                    run] = no_iterations

            result_data[ds]['results'][no_clusters][alg][
                'iteration_durations'][run][
                    param_percent] = iteration_durations
            result_data[ds]['results'][no_clusters][alg]['iteration_changes'][
                run][param_percent] = iteration_changes
            result_data[ds]['results'][no_clusters][alg]['iteration_wcssd'][
                run][param_percent] = iteration_wcssd

    if remove_incomplete:
        remove_incomplete_data(result_data)

    return result_data
Example #19
def result_evaluation_memory_consumption(out_folder, out_folder_csv):
  
  for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
    print(plotname)
    run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
    plot_data = {}
    
    result_data = {}
    for run_identifier in run_identifiers:
      db = ExperimentDB(out_folder, fcnt, run_identifier)
      
      for resid in db.get_algorithm_run_ids():
        (control_params, params, res) = db.get_experiment_result_from_run_id(resid)
        if res is None:
          continue
        ds = params['info']['dataset_name']
        alg = params['info']['algorithm']
        no_clusters = params['task']['no_clusters']
        run = params['task']['run']
        duration_kmeans = res['duration_kmeans']
        no_iterations = len(res['iteration_changes'])
         
        if ds not in result_data:
          result_data[ds] = {}
          result_data[ds]['results'] = {}
          result_data[ds]['infos'] = {}
          
        if no_clusters not in result_data[ds]['results']:
          result_data[ds]['results'][no_clusters] = {}
          
        if alg not in result_data[ds]['results'][no_clusters]:
          result_data[ds]['results'][no_clusters][alg] = {}
          
        if 'duration' not in result_data[ds]['results'][no_clusters][alg]:
          result_data[ds]['results'][no_clusters][alg]['duration'] = {}
          
        if 'no_iterations' not in result_data[ds]['results'][no_clusters][alg]:
          result_data[ds]['results'][no_clusters][alg]['no_iterations'] = {}
        
        no_samples = res['input_samples']
        size_of_data_storage_element = 8
        size_of_key_storage_element = 4
        size_of_pointer_storage_element = 8
        
        if alg != 'kmeans':
          no_clusters_remaining = res['no_clusters_remaining']
        else:
          # plain kmeans keeps all clusters; avoids a NameError further down
          no_clusters_remaining = no_clusters

        if alg == 'kmeans':
          mem_consumption = 0
        elif alg == 'elkan':
          # elkan stores two dense matrices
          # 1. lower_bound_matrix = no_samples * no_clusters_remaining
          # 2. distance_between_clusters_matrix = no_clusters_remaining * no_clusters_remaining
          
          lower_bound_matrix_mem_consumption = no_samples * no_clusters_remaining * size_of_data_storage_element
          distance_between_clusters_matrix_mem_consumption = no_clusters_remaining * no_clusters_remaining * size_of_data_storage_element
          mem_consumption = lower_bound_matrix_mem_consumption + distance_between_clusters_matrix_mem_consumption
        elif alg == 'pca_elkan':  
          # pca_elkan stores two dense matrices + orthonormal_basis_matrix + projected_matrix_samples + projected_matrix_clusters
          # 1. lower_bound_matrix = no_samples * no_clusters_remaining
          # 2. distance_between_clusters_matrix = no_clusters_remaining * no_clusters_remaining
          # 3. orthonormal_basis_matrix = no_orthonormal_vectors * orthonormal_basis_matrix_dim
          # 4. projected_matrix_samples = no_samples * dim ( = no_orthonormal_vectors)
          # 5  projected_matrix_clusters = no_clusters_remaining * dim ( = no_orthonormal_vectors)
          lower_bound_matrix_mem_consumption = no_samples * no_clusters_remaining * size_of_data_storage_element
          distance_between_clusters_matrix_mem_consumption = no_clusters_remaining * no_clusters_remaining * size_of_data_storage_element
          
          # These matrices are stored as sparse matrices. Can be changed in the future since these matrices are almost completely dense 
          orthonormal_basis_matrix_mem_consumption = (res['truncated_svd']['no_components']
                                                      * res['truncated_svd']['no_features']
                                                      * (size_of_data_storage_element + size_of_key_storage_element)) \
                                                      + ((res['truncated_svd']['no_components'] + 1) * size_of_pointer_storage_element)
          projected_matrix_samples_mem_consumption = (no_samples * res['truncated_svd']['no_components']
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_samples + 1) * size_of_pointer_storage_element)
                              
          projected_matrix_clusters_mem_consumption = (no_clusters_remaining * res['truncated_svd']['no_components']
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_clusters_remaining + 1) * size_of_pointer_storage_element)
          
          mem_consumption = lower_bound_matrix_mem_consumption \
                                                     + distance_between_clusters_matrix_mem_consumption \
                                                     + orthonormal_basis_matrix_mem_consumption \
                                                     + projected_matrix_samples_mem_consumption \
                                                     + projected_matrix_clusters_mem_consumption
        elif alg == 'pca_kmeans':
          # pca_kmeans stores an orthonormal_basis_matrix + projected matrices
          # 1. orthonormal_basis_matrix = no_orthonormal_vectors * orthonormal_basis_matrix_dim
          # 2. projected_matrix_samples = no_samples * dim ( = no_orthonormal_vectors)
          # 3  projected_matrix_clusters = no_clusters_remaining * dim ( = no_orthonormal_vectors)
          
          # These matrices are stored as sparse matrices. Can be changed in the future since these matrices are almost completely dense 
          orthonormal_basis_matrix_mem_consumption = (res['truncated_svd']['no_components']
                                                      * res['truncated_svd']['no_features']
                                                      * (size_of_data_storage_element + size_of_key_storage_element)) \
                                                      + ((res['truncated_svd']['no_components'] + 1) * size_of_pointer_storage_element)
          projected_matrix_samples_mem_consumption = (no_samples * res['truncated_svd']['no_components']
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_samples + 1) * size_of_pointer_storage_element)
                              
          projected_matrix_clusters_mem_consumption = (no_clusters_remaining * res['truncated_svd']['no_components']
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_clusters_remaining + 1) * size_of_pointer_storage_element)
          
          mem_consumption = orthonormal_basis_matrix_mem_consumption \
                            + projected_matrix_samples_mem_consumption \
                            + projected_matrix_clusters_mem_consumption
          
        elif alg == 'kmeans_optimized':
          # kmeans_optimized stores a projected_matrix_samples + projected_matrix_clusters
          # 1. projected_matrix_samples = no_samples * dim ( = no_orthonormal_vectors)
          # 2  projected_matrix_clusters = no_clusters_remaining * dim ( = no_orthonormal_vectors)
          
          annz_projected_matrix_samples = res['block_vector_data']['annz']
          # annz_projected_matrix_clusters was not measured (we use the annz_projected_matrix_samples as an approximation)
          annz_projected_matrix_clusters = annz_projected_matrix_samples
          
          projected_matrix_samples_mem_consumption = (annz_projected_matrix_samples * no_samples
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_samples + 1) * size_of_pointer_storage_element)
                              
          projected_matrix_clusters_mem_consumption = (annz_projected_matrix_clusters * no_clusters_remaining
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_clusters_remaining + 1) * size_of_pointer_storage_element)
          
          mem_consumption = projected_matrix_samples_mem_consumption \
                            + projected_matrix_clusters_mem_consumption
        elif alg == 'yinyang':
          # yinyang stores a dense matrix to keep a lower bound to every of the t groups
          t = no_clusters_remaining // 10  # number of yinyang groups
          mem_consumption = no_samples * t * size_of_data_storage_element
          
        elif alg == 'fast_yinyang':
          # yinyang stores a dense matrix to keep a lower bound to every of the t groups + block vector projected matrices samples/clusters
          # 1. lower_bound_group_matrix = no_samples * t
          # 2. projected_matrix_samples = no_samples * dim ( = no_orthonormal_vectors)
          # 3. projected_matrix_clusters = no_clusters_remaining * dim ( = no_orthonormal_vectors)
          t = no_clusters_remaining // 10  # number of yinyang groups
          lower_bound_group_matrix_mem_consumption = no_samples * t * size_of_data_storage_element
          
          annz_projected_matrix_samples = res['block_vector_data']['annz']
          # annz_projected_matrix_clusters was not measured (we use the annz_projected_matrix_samples as an approximation)
          annz_projected_matrix_clusters = annz_projected_matrix_samples
          
          projected_matrix_samples_mem_consumption = (annz_projected_matrix_samples * no_samples
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_samples + 1) * size_of_pointer_storage_element)
                              
          projected_matrix_clusters_mem_consumption = (annz_projected_matrix_clusters * no_clusters_remaining
                              * (size_of_data_storage_element + size_of_key_storage_element)) \
                              + ((no_clusters_remaining + 1) * size_of_pointer_storage_element)
          
          mem_consumption = lower_bound_group_matrix_mem_consumption \
                            + projected_matrix_samples_mem_consumption \
                            + projected_matrix_clusters_mem_consumption
        else:
          raise Exception("please provide details for the memory consumption of %s" % alg)
          
        kmeans_duration_this_run = duration_kmeans

        if 'truncated_svd' in res:
          kmeans_duration_this_run += res['truncated_svd']['duration']

        # convert bytes to MiB
        mem_consumption = (mem_consumption / 1024.0) / 1024.0
        result_data[ds]['results'][no_clusters][alg]['duration'][run] = (float(mem_consumption), kmeans_duration_this_run)
        
        result_data[ds]['infos']['input_dimension'] = res['input_dimension']
        result_data[ds]['infos']['input_samples'] = res['input_samples']
        result_data[ds]['infos']['input_annz'] = res['input_annz']
        result_data[ds]['results'][no_clusters][alg]['no_iterations'][run] = no_iterations
      
      remove_incomplete_data(result_data)
      
      print("Result data:")
      pprint(result_data)
    
    create_plot(output_folder=out_folder_csv,
                plot_name=plotname,
                pdata=result_data)
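Every sparse matrix above is costed with the same CSR formula: one data element and one column index per stored value, plus a row-pointer array of length rows + 1. A small helper, assuming the element sizes used above, makes that arithmetic explicit:

def csr_memory_bytes(no_rows, no_stored_values,
                     data_bytes=8, index_bytes=4, pointer_bytes=8):
  # data + column index per stored value, one row pointer per row plus one
  return (no_stored_values * (data_bytes + index_bytes)
          + (no_rows + 1) * pointer_bytes)

# e.g. the orthonormal basis matrix:
# csr_memory_bytes(no_components, no_components * no_features)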