def map(self, records, task):
    """
    Hadoop map entry point.

    records -- iterable of (key, value) pairs from the input split.
    task -- framework task object; output is emitted via task.collect().

    Fast path: with no pre-mappers and no secondary sort, records are
    forwarded to the collector unchanged.  Otherwise a FunctionChain of
    [input decoder] + preMappers + [collector] is built lazily on the
    first call and cached on self.mapFunctionChain for reuse.
    """
    if len(self.preMappers) == 0 and not self.secondsort:
        # identity map: pass every record straight through
        for key, record in records:
            task.collect(key, record)
    else:
        if self.mapFunctionChain is None:
            # set up mapper input fn: decode JSON-encoded values when the
            # input is JSON, otherwise pass the record stream through as-is.
            if self.inputJson:
                def mapperfn(_, records):
                    for key, value in records:
                        yield key, happy.json.decode(value)
            else:
                def mapperfn(_, records):
                    return records
            # emitting raw text: map-only job (no reducers) with non-JSON output
            if self.reducetasks == 0 and not self.outputJson:
                def collector(k, v):
                    task.collect(k, v)
            # secondary sort: values must be (<int>, <obj>) tuples; the int is
            # folded into a composite TextInt key so the framework sorts on it.
            elif self.secondsort:
                # NOTE(review): a single TextInt instance is shared across all
                # emits — presumably task.collect() serializes it immediately;
                # confirm against the collector implementation.
                textint = TextInt()
                def collector(k, v):
                    if len(v) != 2 or not isinstance(v[0], int):
                        raise Exception("Invalid value " + str(v) + " for a secondary sort, (<int>, <obj>) tuple required")
                    textint.setString(k)
                    textint.setInt(v[0])
                    # the whole (int, obj) tuple is JSON-encoded as the value
                    task.collect(textint, happy.json.encode(v))
            # json output:
            else:
                def collector(k, v):
                    task.collect(k, happy.json.encode(v))
            # chain: input decoder -> user pre-mappers -> collector;
            # per-record failures are routed to self._recordError.
            self.mapFunctionChain = FunctionChain([mapperfn] + self.preMappers + [collector], self._recordError)
        # do the work:
        self.mapFunctionChain.callChain(None, records)
def reduce(self, key, values, task):
    """
    Hadoop reduce entry point.

    key -- the grouping key (a TextInt composite when secondary sort is on).
    values -- iterable of JSON-encoded values for this key.
    task -- framework task object; output is emitted via task.collect().

    With no reducer configured, values are forwarded unchanged.  Otherwise
    a FunctionChain of [reducer] + postMappers + [emitter] is built lazily
    on the first call and cached on self.reduceFunctionChain.
    """
    # Identity reduce: no user reducer, just pass everything through.
    if self.reducer is None:
        for value in values:
            task.collect(key, value)
        return

    if self.reduceFunctionChain is None:
        # Choose the terminal emitter: JSON-encode output values when the
        # job's output is JSON, otherwise emit raw text.
        if self.outputJson:
            def emit(k, v):
                task.collect(k, happy.json.encode(v))
        else:
            def emit(k, v):
                task.collect(k, v)
        # chain: user reducer -> post-mappers -> emitter; per-record
        # failures are routed to self._recordError.
        self.reduceFunctionChain = FunctionChain(
            [self.reducer] + self.postMappers + [emit], self._recordError)

    # Secondary sort delivers a composite key; unwrap the text part.
    if self.secondsort:
        key = key.getString()

    # Values arrive JSON-encoded from the map side; decode lazily.
    self.reduceFunctionChain.callChain(key, self._jsonReduceIterator(values))
class PipeJob(happy.HappyJob):
    """
    The job that executes a series of pipes.

    Wraps a pipe spec as a single Hadoop map/reduce job: pre-mappers run on
    the map side, the (optional) reducer plus post-mappers run on the reduce
    side, and per-record errors are collected under <workpath>/errors.
    """
    def __init__(self, spec):
        happy.HappyJob.__init__(self)
        # Copy job configuration from the pipe spec.
        self.id = spec.id
        self.inputpaths = spec.inputpaths
        self.inputformat = spec.inputformat
        self.inputJson = spec.inputJson
        self.outputpath = spec.outputpath
        self.outputformat = spec.outputformat
        self.compressoutput = spec.compressoutput
        if spec.compressiontype is not None:
            self.compressiontype = spec.compressiontype
        self.jobargs = spec.jobargs
        self.outputJson = spec.outputJson
        # Shallow-copy the function lists so later spec mutation is isolated.
        self.preMappers = spec.preMappers[:]
        self.reducer = spec.reducer
        if self.reducer is None:
            # No reducer means a map-only job.
            self.reducetasks = 0
        self.postMappers = spec.postMappers[:]
        self.errorpath = spec.workpath + "/errors"
        # Lazily-created error collectors, keyed by operation name.
        self.errorcollectors = {}
        # build a job name:
        prenames = [f.__name__ for f in self.preMappers]
        if self.reducer is not None:
            reducername = [self.reducer.__name__]
        else:
            reducername = []
        postnames = [f.__name__ for f in self.postMappers]
        self.jobname = _scriptname + " " + str(spec.id) + " " + "-".join(prenames + reducername + postnames)
        # config second sort: install the composite-key comparator and
        # partitioner so Hadoop groups on the text part and sorts on the int.
        self.secondsort = spec.secondsort
        if self.secondsort:
            self.jobargs["mapred.output.value.groupfn.class"] = "com.freebase.happy.util.TextInt$TextComparator"
            self.jobargs["mapred.partitioner.class"] = "com.freebase.happy.util.TextInt$TextPartitioner"
            self.mapoutputkey = "com.freebase.happy.util.TextInt"
        # init function chains: built lazily on first map()/reduce() call.
        self.mapFunctionChain = None
        self.reduceFunctionChain = None

    def mapconfig(self):
        # Called by the framework before mapping; tags error files as "map".
        self.jobstage = "map"

    def map(self, records, task):
        """
        Hadoop map entry point.

        Fast path: with no pre-mappers and no secondary sort, records pass
        through unchanged.  Otherwise a FunctionChain of
        [input decoder] + preMappers + [collector] is built once and cached.
        """
        if len(self.preMappers) == 0 and not self.secondsort:
            # identity map: forward every record untouched
            for key, record in records:
                task.collect(key, record)
        else:
            if self.mapFunctionChain is None:
                # set up mapper input fn: decode JSON values when the input
                # is JSON, otherwise pass the stream through unchanged.
                if self.inputJson:
                    def mapperfn(_, records):
                        for key, value in records:
                            yield key, happy.json.decode(value)
                else:
                    def mapperfn(_, records):
                        return records
                # emitting raw text: map-only job with non-JSON output
                if self.reducetasks == 0 and not self.outputJson:
                    def collector(k, v):
                        task.collect(k, v)
                # secondary sort: values must be (<int>, <obj>) tuples; the
                # int becomes part of a composite TextInt key for sorting.
                elif self.secondsort:
                    # NOTE(review): one shared TextInt is reused per emit —
                    # presumably task.collect() serializes immediately.
                    textint = TextInt()
                    def collector(k, v):
                        if len(v) != 2 or not isinstance(v[0], int):
                            raise Exception("Invalid value " + str(v) + " for a secondary sort, (<int>, <obj>) tuple required")
                        textint.setString(k)
                        textint.setInt(v[0])
                        # the full (int, obj) tuple is JSON-encoded as the value
                        task.collect(textint, happy.json.encode(v))
                # json output:
                else:
                    def collector(k, v):
                        task.collect(k, happy.json.encode(v))
                # chain: decoder -> pre-mappers -> collector, with
                # per-record failures routed to _recordError.
                self.mapFunctionChain = FunctionChain([mapperfn] + self.preMappers + [collector], self._recordError)
            # do the work:
            self.mapFunctionChain.callChain(None, records)

    def reduceconfig(self):
        # Called by the framework before reducing; tags error files as "reduce".
        self.jobstage = "reduce"

    def reduce(self, key, values, task):
        """
        Hadoop reduce entry point.

        With no reducer, values pass through unchanged.  Otherwise a
        FunctionChain of [reducer] + postMappers + [collector] is built
        once and cached on self.reduceFunctionChain.
        """
        if self.reducer is None:
            # identity reduce
            for value in values:
                task.collect(key, value)
        else:
            if self.reduceFunctionChain is None:
                # emitting raw text:
                if not self.outputJson:
                    def collector(k, v):
                        task.collect(k, v)
                # json output:
                else:
                    def collector(k, v):
                        task.collect(k, happy.json.encode(v))
                self.reduceFunctionChain = FunctionChain([self.reducer] + self.postMappers + [collector], self._recordError)
            # second sort key: unwrap the text part of the composite key.
            if self.secondsort:
                key = key.getString()
            # do the work: values were JSON-encoded on the map side.
            self.reduceFunctionChain.callChain(key, self._jsonReduceIterator(values))

    def _recordError(self, key, value, message, operation):
        """
        Records an error to the log and self.errorcollector.

        Writes the failing (key, value) plus the error message to a
        per-stage/per-operation error file and bumps the global
        "happy.cloud.dataerrors" counter in happy.results.
        """
        errorcollector = self.errorcollectors.get(operation)
        if errorcollector is None:
            # Lazily create one collector per operation, named by job id
            # and stage so map/reduce errors land in separate files.
            self.errorcollectors[operation] = errorcollector = \
                happy.dfs.createPartitionedCollector(self.errorpath + "/job-" + str(self.id) + "-" + self.jobstage + "-" + operation, type="text")
        errorcollector.collect(key, happy.json.encode({"key":key, "value":value, "operation": operation, "error": message}))
        currentErrors = happy.results.get("happy.cloud.dataerrors")
        if currentErrors is None:
            currentErrors = 1
        else:
            currentErrors += 1
        happy.results["happy.cloud.dataerrors"] = currentErrors

    def _jsonReduceIterator(self, records):
        # Lazily decode the JSON-encoded values arriving at the reducer.
        for encodedRecord in records:
            yield happy.json.decode(encodedRecord)

    def run(self):
        # Clear any previous output before launching the job; Hadoop fails
        # jobs whose output path already exists.
        happy.dfs.delete(self.outputpath)
        return happy.HappyJob.run(self)