def __run_test(self, mode, mapper_class, context_class):
    cmd_file = self.__write_cmd_file(mode)
    pp.run_task(
        pp.Factory(mapper_class=mapper_class), private_encoding=False,
        context_class=context_class, cmd_file=cmd_file
    )
    out_fn = cmd_file + '.out'
    out_records = []
    with open(out_fn) as ostream:
        for cmd, args in BinaryDownStreamFilter(ostream):
            if cmd == 'output':
                name, color = args
                out_records.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(out_records), len(self.records))
    for out_r, r in zip(out_records, self.records):
        for k, v in out_r.iteritems():
            self.assertEqual(v, r[k])
def run_local_avro(logger, avro_in='v', avro_out=None):
    mapper, reducer = AVRO_MAPPERS[avro_in], AVRO_REDUCERS[avro_out]
    schema_k_out = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
    schema_v_out = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
    file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    factory = pp.Factory(mapper_class=mapper, reducer_class=reducer)
    simulator = HadoopSimulatorLocal(factory, logger, logging.INFO, AvroContext,
                                     avro_in, avro_out, schema_k_out, schema_v_out)
    with open(file_in, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
        simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if avro_out:
        data_out_des = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
        avro_check_results.main(USERS_CSV_FN, data_out_des)
    else:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)
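# A minimal sketch of a plain-text (non-Avro) run through HadoopSimulatorLocal,
# mirroring the constructor and run() usage shown in run_local_avro above.
# The input/output file names and the WordCountMapper/WordCountReducer classes
# are hypothetical placeholders, not part of the original code.
import logging

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp
from pydoop.mapreduce.simulator import HadoopSimulatorLocal


class WordCountMapper(api.Mapper):

    def map(self, context):
        for word in context.value.split():
            context.emit(word, 1)


class WordCountReducer(api.Reducer):

    def reduce(self, context):
        context.emit(context.key, sum(context.values))


def run_local_text(logger, file_in="input.txt", file_out="output.txt"):
    factory = pp.Factory(mapper_class=WordCountMapper,
                         reducer_class=WordCountReducer)
    simulator = HadoopSimulatorLocal(factory, logger, logging.INFO)
    # file modes follow the Avro example above; no Hadoop installation is
    # needed, the whole job runs in the local process
    with open(file_in, "rb") as fin, open(file_out, "wb") as fout:
        simulator.run(fin, fout, {}, num_reducers=1)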
def __run_test(self, mode, mapper_class, context_class):
    cmd_file = self.__write_cmd_file(mode)
    pp.run_task(pp.Factory(mapper_class=mapper_class), private_encoding=False,
                context_class=context_class, cmd_file=cmd_file)
    out_fn = cmd_file + '.out'
    out_records = []
    with open(out_fn, 'rb') as f:
        bf = BinaryDownStreamAdapter(f)
        for cmd, args in bf:
            if cmd == bf.OUTPUT:
                name, color = args
                out_records.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(out_records), len(self.records))
    for out_r, r in zip(out_records, self.records):
        for k, v in iteritems(out_r):
            self.assertEqual(v.decode('UTF-8'), r[k])
        bneck_map = self.bneck_store.get_bnecks(top_dir)
        self.bnecks, self.gtruths = BottleneckStore.bnecks_map_to_vectors(
            bneck_map, BottleneckStore.assign_labels(top_dir))

    def map(self, context):
        LOGGER.info("testing %s" % (context.value))
        with tf.Session(graph=tf.Graph()) as session:
            models.load_checkpoint(context.value)
            graph = session.graph
            eval_step, prediction, bneck_input, gtruth_input = (
                self.model.get_eval_step(graph),
                self.model.get_prediction(graph),
                self.model.get_bneck_input(graph),
                self.model.get_gtruth_input(graph),
            )
            test_accuracy, predictions = session.run(
                [eval_step, prediction],
                feed_dict={bneck_input: self.bnecks,
                           gtruth_input: self.gtruths})
        context.emit(context.value, str(test_accuracy))


factory = pp.Factory(mapper_class=Mapper, record_reader_class=PathNameReader)


def __main__():
    pp.run_task(factory)
# END_COPYRIGHT

"""\
Includes only the bare minimum required to run wordcount.  See
wordcount-full.py for an example that uses counters, RecordReader, etc.
"""

# DOCS_INCLUDE_START
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pipes


class Mapper(api.Mapper):

    def map(self, context):
        for w in context.value.split():
            context.emit(w, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        context.emit(context.key, sum(context.values))


FACTORY = pipes.Factory(mapper_class=Mapper, reducer_class=Reducer)


def main():
    pipes.run_task(FACTORY)


if __name__ == "__main__":
    main()
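# One way to launch the above script on a cluster is through the ``pydoop
# submit`` command line tool; the module/file names and HDFS paths below are
# hypothetical placeholders, and flag defaults may vary across Pydoop versions:
#
#   pydoop submit --upload-file-to-cache wordcount_minimal.py \
#       --entry-point main wordcount_minimal input output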
def __main__():
    pp.run_task(pp.Factory(mapper_class=Mapper), context_class=AvroContext)
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import re

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.value).split()
        for w in words:
            context.emit(w, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        s = sum(context.values)
        context.emit(context.key, s)


factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)


def __main__():
    pp.run_task(factory)
def run_task(mapper_class, reducer_class=NoAvroColorCount):
    pp.run_task(pp.Factory(mapper_class=mapper_class, reducer_class=reducer_class),
                private_encoding=True, context_class=AvroContext)
class StupidReducer(api.Reducer):

    def __init__(self, context):
        super(StupidReducer, self).__init__(context)
        self.logger = LOGGER.getChild("Reducer")

    def reduce(self, context):
        fname = context.key
        recs = sorted(context.values, key=lambda _: _[0].offset)
        offset, length = recs[0][0].offset, recs[0][0].length
        lbndry, rbndry = recs[0][1]
        for r in recs[1:]:
            assert r[0].offset == offset + length
            assert rbndry <= r[1][0]
            offset, length = r[0].offset, r[0].length
            rbndry = r[1][1]
        context.emit(fname, [lbndry, rbndry])


factory = pp.Factory(
    mapper_class=StupidMapper,
    reducer_class=StupidReducer,
    record_reader_class=Reader,
)


def __main__():
    pp.run_task(factory)
def __main__():
    pp.run_task(pp.Factory(
        mapper_class=Mapper, reducer_class=Reducer, combiner_class=Reducer
    ))
class Partitioner(api.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = LOGGER.getChild("Partitioner")

    def partition(self, key, num_reduces):
        reducer_id = (hash(key) & sys.maxsize) % num_reduces
        self.logger.debug("reducer_id: %r", reducer_id)
        return reducer_id


FACTORY = pp.Factory(mapper_class=Mapper, reducer_class=Reducer,
                     record_reader_class=Reader, record_writer_class=Writer,
                     partitioner_class=Partitioner, combiner_class=Reducer)


def main():
    pp.run_task(FACTORY)


if __name__ == "__main__":
    main()

# Local Variables:
# mode: python
# End:
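# A quick, Hadoop-free sanity check of the partitioning logic above: every key
# must map to a reducer id in [0, num_reduces). Purely illustrative; note that
# hash() is only stable across processes if PYTHONHASHSEED is fixed.
import sys


def partition(key, num_reduces):
    return (hash(key) & sys.maxsize) % num_reduces


for k in ("the", "quick", "brown", "fox"):
    assert 0 <= partition(k, 4) < 4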
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, context_class=Context)
def __main__():
    pipes.run_task(pipes.Factory(mapper_class=Mapper))
        super(AvroContext, self).set_job_conf(vals)
        schema = avro.schema.parse(self._job_conf[AVRO_SCHEMA_KEY])
        self.datum_reader = DatumReader(schema)

    def get_input_value(self):
        # FIXME reuse, reuse, reuse
        sys.stderr.write('value: %r\n' % self._value)
        f = StringIO(self._value)
        dec = BinaryDecoder(f)
        return self.datum_reader.read(dec)


class ColorPick(api.Mapper):

    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        sys.stderr.write('user: %r' % user)
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):

    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, "%r" % s)


pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount),
            private_encoding=True, context_class=AvroContext)
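# A self-contained sketch of the binary (de)serialization that get_input_value
# performs, assuming the ``avro`` package is available; the schema and record
# below are hypothetical examples, not part of the original job.
import io

import avro.schema
from avro.io import BinaryDecoder, BinaryEncoder, DatumReader, DatumWriter

USER_SCHEMA = avro.schema.parse("""
{"type": "record", "name": "User", "fields": [
    {"name": "office", "type": "string"},
    {"name": "favorite_color", "type": ["null", "string"]}
]}
""")

# encode a datum to raw bytes, as it would arrive through the pipes protocol
buf = io.BytesIO()
DatumWriter(USER_SCHEMA).write({"office": "B", "favorite_color": "red"},
                               BinaryEncoder(buf))
raw = buf.getvalue()

# decode it back, mirroring what get_input_value does for each input value
user = DatumReader(USER_SCHEMA).read(BinaryDecoder(io.BytesIO(raw)))
assert user["favorite_color"] == "red"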
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp
from pydoop.avrolib import AvroContext


class Mapper(api.Mapper):

    def map(self, context):
        context.emit('', context.value['population'])


class Reducer(api.Reducer):

    def reduce(self, context):
        context.emit('', sum(context.values))


FACTORY = pp.Factory(Mapper, Reducer)
CONTEXT = AvroContext


def __main__():
    pp.run_task(FACTORY, private_encoding=True, context_class=CONTEXT)
    pass


class ColorWriter(AvroWriter):

    schema = parse(open("stats.avsc").read())

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):

    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):

    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount,
                       record_reader_class=UserReader,
                       record_writer_class=ColorWriter),
            private_encoding=True)
def __main__():
    pipes.run_task(pipes.Factory(
        mapper_class=Mapper,
        record_writer_class=Writer,
    ))
def __main__():
    factory = pp.Factory(mapper_class=Mapper)
    pp.run_task(factory, context_class=AvroContext)
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, private_encoding=True)
def __main__():
    """Main function, executed by the pydoop framework."""
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer,
                         record_reader_class=Reader)
    pp.run_task(factory, private_encoding=True)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:
            raise StopIteration
        key = serialize_to_string(self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            raise StopIteration
        self.bytes_read += len(record)
        return (key, record)

    def get_progress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer,
                     record_reader_class=Reader)


def __main__():
    pp.run_task(factory)
        # TODO: look for a way to avoid the local write
        path, signal = context.key, context.value
        rr = utils.estimate_rainfall(signal)
        dt_string = os.path.splitext(hdfs.path.basename(path))[0]
        out_name = "%s.tif" % dt_string
        dt = datetime.strptime(dt_string, IN_FMT)
        metadata = {tiffio.DT_TAG: dt.strftime(tiffio.DT_FMT)}
        self.ga.save_as_gtiff(out_name, rr, metadata=metadata)
        with io.open(out_name, "rb") as f:
            value = f.read()
        context.emit(out_name, value)


class Writer(api.RecordWriter):

    def __init__(self, context):
        super().__init__(context)
        self.d = context.get_work_path()

    def emit(self, key, value):
        with hdfs.open(hdfs.path.join(self.d, key), "wb") as f:
            f.write(value)


factory = pp.Factory(mapper_class=Mapper, record_reader_class=Reader,
                     record_writer_class=Writer)


def __main__():
    pp.run_task(factory)
def __main__():
    pipes.run_task(pipes.Factory(
        Mapper, record_writer_class=Writer, record_reader_class=Reader))
def __main__():
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)
    pp.run_task(factory, private_encoding=True, context_class=AvroContext)
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory)
def __main__():
    pp.run_task(pp.Factory(Mapper, Reducer))
    def map(self, context):
        i = context.key
        train_batch, val_batch = context.value
        train_bnecks, train_gtruths = self.__map_to_vectors(train_batch)
        val_bnecks, val_gtruths = self.__map_to_vectors(val_batch)
        self.retrainer.run_train_step(train_bnecks, train_gtruths)
        if (i % self.eval_step_interval == 0) or (i + 1 >= self.n_steps):
            train_accuracy, cross_entropy = self.retrainer.run_eval_step(
                train_bnecks, train_gtruths)
            LOGGER.info('step %d: train accuracy = %f%%, cross entropy = %f',
                        i, 100 * train_accuracy, cross_entropy)
            val_accuracy = self.retrainer.run_validation_step(
                val_bnecks, val_gtruths)
            LOGGER.info('step %d: validation accuracy = %f%%',
                        i, 100 * val_accuracy)
            context.emit(
                i, "%s\t%s\t%s" % (cross_entropy, train_accuracy, val_accuracy))

    def __map_to_vectors(self, batch):
        return BottleneckStore.bnecks_map_to_vectors(batch, self.labels)


factory = pp.Factory(mapper_class=Mapper,
                     record_reader_class=BottleneckProjectionsReader)


def __main__():
    pp.run_task(factory)
class StupidMapper(api.Mapper):

    def __init__(self, context):
        super(StupidMapper, self).__init__(context)
        self.logger = LOGGER.getChild("Mapper")

    def map(self, context):
        self.logger.debug('key: %s, val: %s', context.key, context.value)
        context.emit(context.key, context.value)


class StupidReducer(api.Reducer):

    def reduce(self, context):
        key = context.key
        for v in context.values:
            context.emit(key, v)


factory = pp.Factory(
    mapper_class=StupidMapper,
    reducer_class=StupidReducer,
    partitioner_class=Partitioner,
    record_reader_class=Reader,
    record_writer_class=Writer,
)


def __main__():
    pp.run_task(factory, private_encoding=False, auto_serialize=False)
def __main__():
    pp.run_task(pp.Factory(Mapper, None))