def checkLocalReferences(iterableObject, sourceObject, workflow_stats):
    """Recursively walk *iterableObject* (dicts/lists/tuples) and replace every
    string leaf with ``evaluateExpression(leaf, sourceObject, workflow_stats)``.

    Mutates *iterableObject* in place; tuples are traversed but their elements
    are never reassigned (the original assigned into tuples, which raises
    TypeError).  Returns None.
    """
    if type(iterableObject) is dict:
        # .items() works on both Python 2 and 3 (iteritems was Py2-only);
        # only values are reassigned, so iterating the live view is safe
        for key, value in iterableObject.items():
            # isinstance on concrete container types replaces
            # hasattr(value, '__iter__'), which wrongly matches str on Python 3
            # and would make string leaves skip evaluation entirely
            if isinstance(value, (dict, list, tuple)):
                checkLocalReferences(value, sourceObject, workflow_stats)
            elif type(value) is str:
                iterableObject[key] = evaluateExpression(
                    value, sourceObject, workflow_stats)
    elif type(iterableObject) is list or type(iterableObject) is tuple:
        canAssign = type(iterableObject) is list  # tuples are immutable
        for index, value in enumerate(iterableObject):
            if isinstance(value, (dict, list, tuple)):
                checkLocalReferences(value, sourceObject, workflow_stats)
            elif type(value) is str and canAssign:
                iterableObject[index] = evaluateExpression(
                    value, sourceObject, workflow_stats)
def checkReferencesInIterableAttributes(iterableObject, current_workflow_stats):
    """Recursively walk *iterableObject* (dicts/lists/tuples) and replace every
    string leaf with ``evaluateExpression(leaf, {}, current_workflow_stats)``.

    Mutates *iterableObject* in place; tuples are traversed but their elements
    are never reassigned (the original assigned into tuples, which raises
    TypeError).  Returns None.
    """
    if type(iterableObject) is dict:
        # .items() works on both Python 2 and 3 (iteritems was Py2-only);
        # only values are reassigned, so iterating the live view is safe
        for key, value in iterableObject.items():
            # isinstance on concrete container types replaces
            # hasattr(value, '__iter__'), which wrongly matches str on Python 3
            # and would make string leaves skip evaluation entirely
            if isinstance(value, (dict, list, tuple)):
                checkReferencesInIterableAttributes(value, current_workflow_stats)
            elif type(value) is str:
                iterableObject[key] = evaluateExpression(
                    value, {}, current_workflow_stats)
    elif type(iterableObject) is list or type(iterableObject) is tuple:
        canAssign = type(iterableObject) is list  # tuples are immutable
        for index, value in enumerate(iterableObject):
            if isinstance(value, (dict, list, tuple)):
                checkReferencesInIterableAttributes(value, current_workflow_stats)
            elif type(value) is str and canAssign:
                iterableObject[index] = evaluateExpression(
                    value, {}, current_workflow_stats)
def project_pipelineOLD(dataset, projection, workflow_stats=None):
    """Legacy projection: build one output row per input row.

    *projection* is either a dict mapping output keys to expressions (an empty
    dict copies the row unchanged) or a single expression string whose result
    is stored under the key ``'value'``.  Superseded by project_pipeline.

    Returns the list of projected rows.
    """
    #this must be modified to be more efficient
    if workflow_stats is None:
        # avoid the shared-mutable-default pitfall of `workflow_stats={}`
        workflow_stats = {}
    projectedDataset = []
    for d in dataset:
        objToInsert = {}
        if type(projection) is dict:
            if projection:  # non-empty mapping: evaluate every expression
                for key, value in projection.items():
                    objToInsert[key] = evaluateExpression(
                        value, d, workflow_stats)
            else:  # empty projection means "keep the row" (shallow copy)
                objToInsert = dict(d)
        elif type(projection) is str:
            #it's a value
            objToInsert['value'] = evaluateExpression(projection, d,
                                                      workflow_stats)
        projectedDataset.append(objToInsert)
    return projectedDataset
def project_pipeline(dataset, projection, workflow_stats=None, parameters=None):
    """Project *dataset* rows through *projection*.

    *projection* is either a dict mapping output keys to expressions, or a
    single expression string whose result is stored under ``'value'``.
    Projection values that are literally column names of the first row are
    copied directly instead of being evaluated (fast path).

    Returns the list of projected rows; an empty dataset yields [].
    """
    #this must be modified to be more efficient
    if workflow_stats is None:
        # avoid the shared-mutable-default pitfall of `workflow_stats={}`
        workflow_stats = {}
    projectedDataset = []
    if len(dataset) > 0:
        if type(projection) is str:
            #it's a value
            for d in dataset:
                objToInsert = {}
                objToInsert['value'] = evaluateExpression(
                    projection, d, workflow_stats, parameters)
                projectedDataset.append(objToInsert)
        elif type(projection) is dict:
            # keys whose projection value is a column of the first row can be
            # copied straight from each row instead of evaluated
            firstRowColumns = set(dataset[0].keys())
            simpleProjectionKeys = {
                key
                for key, value in projection.items()
                if value in firstRowColumns
            }
            for d in dataset:
                objToInsert = {}
                for key, value in projection.items():
                    if key in simpleProjectionKeys:
                        objToInsert[key] = d[value]
                    else:
                        objToInsert[key] = evaluateExpression(
                            value, d, workflow_stats, parameters)
                projectedDataset.append(objToInsert)
    return projectedDataset
def accumulator_min(objNew, tupleKey, dataset, mapsGrouperStats, measureKey,
                    measureFormula, workflow_stats, simple=True):
    """Fold a new sample into the running minimum of a group's measure.

    The group's result row is dataset[entry['index']] where *entry* is looked
    up by *tupleKey* in *mapsGrouperStats*.  When *simple* is true the sample
    is read directly from objNew[measureFormula]; otherwise the formula is
    evaluated against objNew.
    """
    entry = mapsGrouperStats.get(tupleKey)
    row = dataset[entry['index']]
    # a group with no value yet compares as +inf, so the first sample wins
    previous = row.get(measureKey, float('+inf'))
    if simple:
        sample = objNew[measureFormula]
    else:
        sample = evaluateExpression(measureFormula, objNew, workflow_stats)
    row[measureKey] = sample if sample < previous else previous
def accumulator_avg(objNew, tupleKey, dataset, mapsGrouperStats, measureKey,
                    measureFormula, workflow_stats, simple=True):
    """Fold a new sample into the running mean of a group's measure.

    Uses the one-pass update m_k = m_{k-1} + (x_k - m_{k-1}) / k, where k is
    the group's current 'count' in *mapsGrouperStats*.  When *simple* is true
    the sample is read directly from objNew[measureFormula]; otherwise the
    formula is evaluated against objNew.
    """
    entry = mapsGrouperStats.get(tupleKey)
    row = dataset[entry['index']]
    previous = row.get(measureKey, float(0))
    if simple:
        sample = objNew[measureFormula]
    else:
        sample = evaluateExpression(measureFormula, objNew, workflow_stats)
    #ONE PASS MEAN : m_{k-1} + (x_k - m_{k-1}) / k
    row[measureKey] = previous + float(sample - previous) / entry['count']
def accumulator_setappend(objNew, tupleKey, dataset, mapsGrouperStats,
                          measureKey, measureFormula, workflow_stats,
                          simple=True):
    """Accumulate distinct measure values into a set on the group's row.

    The group's result row is dataset[entry['index']] where *entry* is looked
    up by *tupleKey* in *mapsGrouperStats*.  When *simple* is true the sample
    is read directly from objNew[measureFormula]; otherwise the formula is
    evaluated against objNew.
    """
    grouperEntry = mapsGrouperStats.get(tupleKey)
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)
    try:
        dataset[grouperEntry['index']][measureKey] |= {objNewMeasure}
    except (KeyError, TypeError):
        # KeyError: first sample for this group; TypeError: existing value is
        # not a set — start a fresh set either way.  (Was a bare `except:`,
        # which also swallowed unrelated errors.)
        dataset[grouperEntry['index']][measureKey] = {objNewMeasure}
def accumulator_append(objNew, tupleKey, dataset, mapsGrouperStats, measureKey,
                       measureFormula, workflow_stats, simple=True):
    """Append the measure value to a per-group list on the group's row.

    The group's result row is dataset[entry['index']] where *entry* is looked
    up by *tupleKey* in *mapsGrouperStats*.  When *simple* is true the sample
    is read directly from objNew[measureFormula]; otherwise the formula is
    evaluated against objNew.
    """
    grouperEntry = mapsGrouperStats.get(tupleKey)
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)
    # setdefault replaces the original get-default / reassign / append dance:
    # it creates the list on first use and returns the existing one afterwards
    dataset[grouperEntry['index']].setdefault(measureKey,
                                              []).append(objNewMeasure)
def accumulator_countDistinct(objNew, tupleKey, dataset, mapsGrouperStats,
                              measureKey, measureFormula, workflow_stats,
                              simple=True):
    """Count distinct measure values per group.

    The distinct values are tracked in the grouper entry's 'set'; only the
    running count is written to the group's result row.  When *simple* is
    true the sample is read directly from objNew[measureFormula]; otherwise
    the formula is evaluated against objNew.
    """
    grouperEntry = mapsGrouperStats.get(tupleKey)
    if simple:
        objNewMeasure = objNew[measureFormula]
    else:
        objNewMeasure = evaluateExpression(measureFormula, objNew,
                                           workflow_stats)
    # dict.has_key() was removed in Python 3; setdefault covers both the
    # first-sample and subsequent-sample branches in one call
    grouperEntry.setdefault('set', set()).add(objNewMeasure)
    dataset[grouperEntry['index']][measureKey] = len(grouperEntry['set'])
def process_workflow_innerRecursive(workflow,current_workflow_stats={},currentIndex=0,fromIter=False,verbose=False):
    """Recursively execute the stages of *workflow* starting at *currentIndex*.

    Dispatches on stage['type'] (iterator / matcher / syncer / plain stage),
    records per-stage wall-clock time in stage['timespent'], and returns the
    workflow index at which processing stopped.

    NOTE(review): the source was flattened onto single lines; the indentation
    below is a reconstruction — verify branch nesting (especially the
    position of the post-iteration state restore and the final timing lines
    of the plain-stage branch) against repository history.
    NOTE(review): mutable default `current_workflow_stats={}` is shared
    across calls that omit the argument — confirm this is intended.
    """
    if currentIndex < len(workflow) :
        startProccessingStage = time()
        stage=workflow[currentIndex]
        # a stage with no explicit 'execute' flag defaults to executable
        stage['execute']=stage.get('execute',True)
        if (stage['type'] in ITERATORS_STAGES_ARRAY):
            # Iterator stage: snapshot its mutable fields, then re-run the
            # downstream stages once per item produced by the iterator.
            stageCopy={}
            stageCopy['inputs']=deepcopy(stage['inputs'])
            stageCopy['configuration']=deepcopy(stage['configuration'])
            stageCopy['execute']=deepcopy(stage['execute'])
            if verbose :
                utilPrint('start processing stage ' + stage['id'] + ' of type' + stage['type'])
            iterableStage=process_workflowStage(stage,current_workflow_stats)
            lastReachedIndex=currentIndex+1
            enditerations=0
            beforeiterations=0
            sumTimeIterations=0
            # each truthy item from the iterator triggers one recursive pass
            # over the following stages (fromIter=True so syncers fire)
            while(next(iterableStage,None)):
                beforeiterations = time()
                newCurrentIndex=currentIndex+1
                newFromIter=True
                lastReachedIndex=process_workflow_innerRecursive(workflow,current_workflow_stats,newCurrentIndex,newFromIter,verbose)
                enditerations = time()
                sumTimeIterations+=(enditerations-beforeiterations)
            # restore this stage's pre-iteration state from the snapshot
            stage['inputs']=deepcopy(stageCopy['inputs'])
            stage['configuration']=deepcopy(stageCopy['configuration'])
            stage['execute']=deepcopy(stageCopy['execute'])
            endProccessingStage = time()
            # subtract iteration time so timespent charges only the iterator
            # stage itself, not the stages it drove
            stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)-sumTimeIterations
            # resume after the last index reached by the iterated sub-runs
            return process_workflow_innerRecursive(workflow,current_workflow_stats,lastReachedIndex+1,fromIter,verbose)
        elif (stage['type'] in MATCHERS_STAGES_ARRAY):
            if evaluateExpression(stage['execute'] , {}, current_workflow_stats):
                stageCopy={}
                stageCopy['inputs']=deepcopy(stage['inputs'])
                stageCopy['configuration']=deepcopy(stage['configuration'])
                stageCopy['execute']=deepcopy(stage['execute'])
                if verbose :
                    utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
                process_workflowStage(stage,current_workflow_stats)
                stage['inputs']=(stageCopy['inputs'])
                stage['configuration']=(stageCopy['configuration'])
                stage['execute']=(stageCopy['execute'])
                if (stage['outputs']['continue']) :
                    # matcher passed: continue with the next stage
                    currentIndex+=1
                    endProccessingStage = time()
                    stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
                    return process_workflow_innerRecursive(workflow,current_workflow_stats,currentIndex,fromIter,verbose)
                else :
                    # matcher said "stop": skip forward to the close of the
                    # current group (')' marker in the workflow chain)
                    currentChain=getWorkflowChain(workflow)
                    for indexToReturn in range(currentIndex,len(currentChain)):
                        if currentChain[indexToReturn]==')' :
                            break
                    endProccessingStage = time()
                    stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
                    return indexToReturn
            else :
                # matcher not executable: skip it and continue
                currentIndex+=1
                endProccessingStage = time()
                stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
                return process_workflow_innerRecursive(workflow,current_workflow_stats,currentIndex,fromIter,verbose)
        elif (stage['type'] in SYNCERS_STAGES_ARRAY and fromIter):
            #this is processed each time this method reach a syncer stage after starting an iterator
            stageCopy={}
            stageCopy['inputs']=deepcopy(stage['inputs'])
            stageCopy['configuration']=deepcopy(stage['configuration'])
            stageCopy['execute']=deepcopy(stage['execute'])
            if verbose :
                utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
            process_workflowStage(stage,current_workflow_stats)
            stage['inputs']=(stageCopy['inputs'])
            stage['configuration']=(stageCopy['configuration'])
            stage['execute']=(stageCopy['execute'])
            # nested-iteration syncers must be reinitialized after each termination
            syncersStageToReInit=get_sincerStages_to_reinitialize(workflow[:currentIndex+1])
            for i in syncersStageToReInit:
                stageToReinit=workflow[i]
                stageToReinit['outputs']={'syncedData':None}
            endProccessingStage = time()
            stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
            # hand control back to the driving iterator (no recursion here)
            return currentIndex
            #process_workflow_innerRecursive(workflow,current_workflow_stats,currentIndex+1,verbose)
        elif (stage['type'] in SYNCERS_STAGES_ARRAY and not fromIter):
            #this is processed each time this method reach a syncer stage but not after an iteration of stages, it means do nothing and process the next stages
            #new
            stageCopy={}
            stageCopy['inputs']=deepcopy(stage['inputs'])
            stageCopy['configuration']=deepcopy(stage['configuration'])
            stageCopy['execute']=deepcopy(stage['execute'])
            if verbose :
                utilPrint('start processing stage ' + stage['id'] + ' of type : ' + stage['type'])
            process_workflowStage(stage,current_workflow_stats)
            stage['inputs']=(stageCopy['inputs'])
            stage['configuration']=(stageCopy['configuration'])
            stage['execute']=(stageCopy['execute'])
            #new
            currentIndex+=1
            endProccessingStage = time()
            stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
            return process_workflow_innerRecursive(workflow,current_workflow_stats,currentIndex,fromIter,verbose)
        else :
            # plain stage: run it (if executable), then continue with the next
            currentIndex+=1
            startStage = time()
            if evaluateExpression(stage['execute'] , {}, current_workflow_stats):
                stageCopy={}
                stageCopy['inputs']=deepcopy(stage['inputs'])
                stageCopy['configuration']=deepcopy(stage['configuration'])
                stageCopy['execute']=deepcopy(stage['execute'])
                if verbose :
                    utilPrint('start processing stage ' + stage['id'] + ' of type' + stage['type'])
                process_workflowStage(stage,current_workflow_stats)
                stopStage = time()
                if verbose :
                    utilPrint('time elapsed while processing stage '+ stage['id']+ ' : '+ str(stopStage-startStage))
                stage['inputs']=(stageCopy['inputs'])
                stage['configuration']=(stageCopy['configuration'])
                stage['execute']=(stageCopy['execute'])
            endProccessingStage = time()
            stage['timespent']=stage.get('timespent',0)+ (endProccessingStage-startProccessingStage)
            return process_workflow_innerRecursive(workflow,current_workflow_stats,currentIndex,fromIter,verbose)
    #ADD RETURN
    return currentIndex
def aggregate_dataset(dataset, dimensions, measures, workflow_stats=None):
    """Group *dataset* rows by *dimensions* and fold *measures* per group.

    *dimensions* maps result keys to a column name or expression; *measures*
    maps result keys to a one-entry dict {accumulator_name: column-or-
    expression}, dispatched through ACCUMULATOR_MAP_FUNCTIONS.  Values that
    are literally columns of the first row are copied directly (fast path)
    instead of being evaluated.

    Returns (flatResultsDataset, header).  With an empty dataset and no
    dimensions, a single row of accumulator initial values is returned.
    """
    if workflow_stats is None:
        # avoid the shared-mutable-default pitfall of `workflow_stats={}`
        workflow_stats = {}
    # list(d) iterates keys on both Py2 and Py3 (keys()+keys() breaks on Py3)
    header = list(dimensions) + list(measures)
    rawAggregateMapping = {}  #tuple and their index in array
    flatResultsDataset = []
    if len(dataset) > 0:
        simpleGroupAid = set(dimensions.values()) & set(dataset[0].keys())
        # each measure spec is {accumulator: formula}; next(iter(v)) fetches
        # the single accumulator name on both Py2 and Py3
        key_formula_measures = {
            k: v[next(iter(v))]
            for k, v in measures.items()
        }
        key_acc_measures = {k: next(iter(v)) for k, v in measures.items()}
        simpleMeasuresAid = set(key_formula_measures.values()) & set(
            dataset[0].keys())
        simpleGroupKeys = {
            key: value
            for key, value in dimensions.items() if value in simpleGroupAid
        }
        simpleMeasuresKeys = {
            key: value
            for key, value in key_formula_measures.items()
            if value in simpleMeasuresAid
        }
        # precompute (key, formula, accumulator callable, is_simple) so the
        # per-row loop does no dict lookups on the measure specs
        accumulators_ready_to_use = []
        for key, value in key_formula_measures.items():
            accumulator_callable = ACCUMULATOR_MAP_FUNCTIONS[
                key_acc_measures[key]]
            accumulators_ready_to_use.append(
                [key, value, accumulator_callable, key in simpleMeasuresKeys])
        resultsAppender = flatResultsDataset.append  # hoisted bound method
        dimensions_items = [[key, value, key in simpleGroupKeys]
                            for key, value in dimensions.items()]
        sizeOfResults = 0
        for d in dataset:
            objToInsert = {
                key: d[value] if is_simple else evaluateExpression(
                    value, d, workflow_stats)
                for key, value, is_simple in dimensions_items
            }
            # the dimension values identify the group
            tuple_key = tuple(objToInsert.items())
            try:
                rawAggregateMapping[tuple_key]['count'] += 1
            except KeyError:  # was a bare `except:` hiding unrelated errors
                # first row of a new group: register its result row
                resultsAppender(objToInsert)
                rawAggregateMapping[tuple_key] = {
                    'index': sizeOfResults,
                    'count': 1
                }
                sizeOfResults += 1
            for key, value, acc_callable, is_simple_measure in accumulators_ready_to_use:
                acc_callable(dict(d), tuple_key, flatResultsDataset,
                             rawAggregateMapping, key, value, workflow_stats,
                             is_simple_measure)
    elif len(dimensions) == 0:
        # no data and no grouping: emit one row of accumulator initial values
        key_acc_measures = {k: next(iter(v)) for k, v in measures.items()}
        rowToReturn = {}
        for key, value in key_acc_measures.items():
            rowToReturn[key] = ACCUMULATOR_MAP_FUNCTIONS_INIT[value]
        flatResultsDataset.append(rowToReturn)
    return flatResultsDataset, header
def aggregate_datasetOLD(dataset, dimensions, measures, workflow_stats={}):
    """Legacy group-by/aggregate, superseded by aggregate_dataset.

    Groups *dataset* rows by *dimensions* (result key -> column name or
    expression) and folds each entry of *measures* (result key ->
    {accumulator: column-or-expression}) with the named accumulator
    (sum / max / min / avg / append / set_append).  Returns
    (flatResultsDataset, header).

    NOTE(review): Python 2 only (iteritems, list-returning keys()); crashes
    on an empty dataset (reads dataset[0]); mutable default workflow_stats={}.
    The source was flattened onto single lines, so the indentation below is
    a reconstruction — verify against repository history.
    """
    #mapping
    header = dimensions.keys() + measures.keys()
    rawAggregateMapping = {}  #tuple and their index in array
    flatResultsDataset = []
    simpleGroupKeys = {}
    complexGroupKeys = {}
    # dimension values that are literal columns of the first row are copied
    # directly instead of being evaluated as expressions
    simpleGroupAid = set(dimensions.values()) & set(dataset[0].keys())
    simpleMeasuresKeys = {}
    complexMeasuresKeys = {}
    # each measure spec is {accumulator: formula}; extract the formula
    key_formula_measures = {k: v[v.keys()[0]] for k, v in measures.iteritems()}
    simpleMeasuresAid = set(key_formula_measures.values()) & set(
        dataset[0].keys())
    for key, value in dimensions.iteritems():
        if value in simpleGroupAid:
            simpleGroupKeys[key] = value
        else:
            complexGroupKeys[key] = value
    for key, value in key_formula_measures.iteritems():
        if value in simpleMeasuresAid:
            simpleMeasuresKeys[key] = value
        else:
            complexMeasuresKeys[key] = value
    for d in dataset:
        objToInsert = {}
        for key, value in dimensions.iteritems():
            if key in simpleGroupKeys:
                objToInsert[key] = d[value]
            else:
                objToInsert[key] = evaluateExpression(value, d, workflow_stats)
        # the dimension values alone identify the group (measures are added
        # to objToInsert only after the key is taken)
        tuple_key = tuple(objToInsert.iteritems())
        for key, value in key_formula_measures.iteritems():
            if key in simpleMeasuresKeys:
                objToInsert[key] = d[value]
            else:
                objToInsert[key] = evaluateExpression(value, d, workflow_stats)
        indexOfObjInFlatResultsDataset = rawAggregateMapping.get(
            tuple_key, None)
        if indexOfObjInFlatResultsDataset is None:
            # first row of a new group: seed the result row; append/set_append
            # measures get wrapped into their container type immediately
            flatResultsDataset.append(objToInsert)
            rawAggregateMapping[tuple_key] = {
                'index': len(flatResultsDataset) - 1
            }
            for key, value in measures.iteritems():
                accumulator = value.keys()[0]
                if accumulator == 'append':
                    flatResultsDataset[
                        rawAggregateMapping[tuple_key]['index']][key] = [
                            flatResultsDataset[rawAggregateMapping[tuple_key]
                                               ['index']][key]
                        ]
                elif accumulator == 'set_append':
                    flatResultsDataset[
                        rawAggregateMapping[tuple_key]['index']][key] = set([
                            flatResultsDataset[rawAggregateMapping[tuple_key]
                                               ['index']][key]
                        ])
        else:
            # existing group: fold the new row's values into the aggregates
            for key, value in measures.iteritems():
                accumulator = value.keys()[0]
                if accumulator == 'sum':
                    flatResultsDataset[indexOfObjInFlatResultsDataset[
                        'index']][key] += objToInsert[key]
                elif accumulator == 'max':
                    flatResultsDataset[indexOfObjInFlatResultsDataset[
                        'index']][key] = objToInsert[
                            key] if objToInsert[key] > flatResultsDataset[
                                indexOfObjInFlatResultsDataset['index']][
                                    key] else flatResultsDataset[
                                        indexOfObjInFlatResultsDataset[
                                            'index']][key]
                elif accumulator == 'min':
                    flatResultsDataset[indexOfObjInFlatResultsDataset[
                        'index']][key] = objToInsert[
                            key] if objToInsert[key] < flatResultsDataset[
                                indexOfObjInFlatResultsDataset['index']][
                                    key] else flatResultsDataset[
                                        indexOfObjInFlatResultsDataset[
                                            'index']][key]
                elif accumulator == 'avg':
                    #ONE PASS MEAN : m_{k-1} + (x_k - m_{k-1}) / k
                    rawAggregateMapping[tuple_key][
                        'count'] = rawAggregateMapping[tuple_key].get(
                            'count', 1) + 1
                    oldAvg = flatResultsDataset[
                        indexOfObjInFlatResultsDataset['index']][key]
                    flatResultsDataset[indexOfObjInFlatResultsDataset[
                        'index']][key] = oldAvg + (
                            float(objToInsert[key] - oldAvg) /
                            float(rawAggregateMapping[tuple_key]['count']))
                elif accumulator == 'append':
                    if type(flatResultsDataset[indexOfObjInFlatResultsDataset[
                            'index']][key]) is list:
                        flatResultsDataset[indexOfObjInFlatResultsDataset[
                            'index']][key].append(objToInsert[key])
                    else:
                        # stored value is not a list yet: rewrap, then append
                        arr = []
                        arr.append(flatResultsDataset[
                            indexOfObjInFlatResultsDataset['index']][key])
                        flatResultsDataset[
                            indexOfObjInFlatResultsDataset['index']][key] = arr
                        flatResultsDataset[indexOfObjInFlatResultsDataset[
                            'index']][key].append(objToInsert[key])
                elif accumulator == 'set_append':
                    flatResultsDataset[
                        indexOfObjInFlatResultsDataset['index']][key].add(
                            objToInsert[key])
    return flatResultsDataset, header