def test_record_reader_from_cplusplus(self):
    d = {
        'input_key': 'inputkey',
        'input_value': 'inputvalue',
        'input_split': 'inputsplit',
        'input_key_class': 'keyclass',
        'input_value_class': 'valueclass',
        'job_conf': {},
    }
    ctx = pp.get_MapContext_object(d)
    self.assertEqual(ctx.getInputKey(), d['input_key'])
    self.assertEqual(ctx.getInputValue(), d['input_value'])
    self.assertEqual(ctx.getInputSplit(), d['input_split'])
    self.assertEqual(ctx.getInputKeyClass(), d['input_key_class'])
    self.assertEqual(ctx.getInputValueClass(), d['input_value_class'])
    factory = Factory(None, None, record_reader_class=test_record_reader)
    rr = factory.createRecordReader(ctx)
    for i in range(test_record_reader.NUMBER_RECORDS):
        flag, k, v = pp.get_record_from_record_reader(rr)
        self.assertTrue(flag)
        self.assertEqual(k, test_record_reader.KEY_FORMAT % (i + 1))
        self.assertEqual(v, test_record_reader.DEFAULT_VALUE)
        self.assertAlmostEqual(
            pp.get_progress_from_record_reader(rr),
            float(i + 1) / test_record_reader.NUMBER_RECORDS)
    flag, k, v = pp.get_record_from_record_reader(rr)
    self.assertFalse(flag)
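# A minimal sketch of what the test_record_reader fixture used above could
# look like; the class constants and the counting logic are assumptions, not
# the suite's actual fixture. It follows the pydoop.pipes RecordReader
# protocol implied by the test: next() returns a (have_record, key, value)
# tuple and getProgress() a float in [0, 1].
from pydoop.pipes import RecordReader

class test_record_reader(RecordReader):

    NUMBER_RECORDS = 5          # assumed values, for illustration only
    KEY_FORMAT = 'key_%d'
    DEFAULT_VALUE = 'value'

    def __init__(self, context):
        super(test_record_reader, self).__init__(context)
        self.counter = 0

    def next(self):
        if self.counter >= self.NUMBER_RECORDS:
            return (False, '', '')
        self.counter += 1
        return (True, self.KEY_FORMAT % self.counter, self.DEFAULT_VALUE)

    def getProgress(self):
        return float(self.counter) / self.NUMBER_RECORDS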
def test_factory_constructor(self):
    f = Factory(mapper, reducer)
    self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
    self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
    #--
    f = Factory(mapper, reducer, record_reader)
    self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
    self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
    self.assertTrue(
        isinstance(f.createRecordReader(self.m_ctx), record_reader))
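# The mapper, reducer and record_reader names referenced by the constructor
# test are fixtures defined elsewhere in the suite; a plausible minimal
# version (an assumption, not the original code) is shown below, with
# record_reader playing the same role as the test_record_reader sketch above.
from pydoop.pipes import Mapper, Reducer

class mapper(Mapper):

    def map(self, context):
        # identity map: pass each record through unchanged
        context.emit(context.getInputKey(), context.getInputValue())

class reducer(Reducer):

    def reduce(self, context):
        # identity reduce: emit every value under its key
        while context.nextValue():
            context.emit(context.getInputKey(), context.getInputValue())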
def run_job():
    """
    Runs the Hadoop pipes task through Pydoop.
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    return runTask(Factory(mapper, reducer))
def test_partitioner_from_cplusplus(self):
    d = {
        'input_key': 'inputkey',
        'input_value': 'inputvalue',
        'input_split': 'inputsplit',
        'input_key_class': 'keyclass',
        'input_value_class': 'valueclass',
        'job_conf': {},
    }
    ctx = pp.get_MapContext_object(d)
    self.assertEqual(ctx.getInputKey(), d['input_key'])
    self.assertEqual(ctx.getInputValue(), d['input_value'])
    self.assertEqual(ctx.getInputSplit(), d['input_split'])
    self.assertEqual(ctx.getInputKeyClass(), d['input_key_class'])
    self.assertEqual(ctx.getInputValueClass(), d['input_value_class'])
    f = Factory(None, None, partitioner_class=test_partitioner)
    p = f.createPartitioner(ctx)
    n_partitions = 4
    for i in range(10):
        k = 'key' + ('a' * i)
        self.assertEqual(
            partition_function(k, n_partitions),
            pp.get_partition_from_partitioner(p, k, n_partitions))
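# partition_function and test_partitioner are fixtures from the suite; a
# hypothetical version consistent with the test above (the bucketing rule
# is an assumption chosen for illustration) could be:
from pydoop.pipes import Partitioner

def partition_function(key, n_partitions):
    # toy rule: bucket keys by length
    return len(key) % n_partitions

class test_partitioner(Partitioner):

    def __init__(self, context):
        super(test_partitioner, self).__init__(context)

    def partition(self, key, n_partitions):
        return partition_function(key, n_partitions)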
def test_map_reduce_factory(self):
    import gc
    self.__check_ctx()
    mapper.call_history = []
    reducer.call_history = []
    mf = Factory(mapper, reducer)
    gc.collect()  # clean up existing references
    pp.try_factory_internal(mf)
    # a nonzero collect() here would mean the factory round-trip through
    # the C++ layer left uncollectable reference cycles behind
    self.assertEqual(0, gc.collect())
    self.assertEqual(len(mapper.call_history), 2)
    self.assertEqual(len(reducer.call_history), 2)
    f = pp.TestFactory(mf)
    self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
    self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
    self.assertEqual(len(mapper.call_history), 3)
    self.assertEqual(len(reducer.call_history), 3)
    self.assertEqual(0, gc.collect())
import struct

from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.getJobConf()
        jc_configure_int(self, jc, "filter.occurrence.threshold", "threshold")

    def map(self, context):
        word, occurrence = context.getInputKey(), context.getInputValue()
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):

    def reduce(self, context):
        pass


if __name__ == "__main__":
    runTask(Factory(FilterMapper, FilterReducer))
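# jc_configure_int binds an integer job conf value to an attribute on the
# component, so self.threshold is ready by the time map() runs. A rough
# sketch of the behavior (an assumption for illustration, not pydoop's
# actual implementation):

def jc_configure_int_sketch(obj, jc, key, attr_name):
    # read key from the job conf, convert to int, store as obj.<attr_name>
    if jc.hasKey(key):
        setattr(obj, attr_name, jc.getInt(key))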
def run_task():
    # the reducer class doubles as a map-side combiner
    return runTask(Factory(Mapper, Reducer, combiner_class=Reducer))
def run_task():
    return runTask(Factory(mapper, reducer))
def main(argv):
    runTask(Factory(FastaMapper, FastaReducer,
                    record_reader_class=FastaReader))
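# FastaReader is the application's custom record reader; a hypothetical
# skeleton (the parsing logic and the data source are assumptions, not the
# original code) that emits one (header, sequence) record per FASTA entry:
from pydoop.pipes import RecordReader

class FastaReader(RecordReader):

    def __init__(self, context):
        super(FastaReader, self).__init__(context)
        # simplification: treat the input split as raw FASTA text; a real
        # reader would open the file backing the split and honor its offsets
        self.entries = self._parse(context.getInputSplit())
        self.index = 0

    @staticmethod
    def _parse(text):
        entries, header, seq = [], None, []
        for line in text.splitlines():
            if line.startswith('>'):
                if header is not None:
                    entries.append((header, ''.join(seq)))
                header, seq = line[1:].strip(), []
            else:
                seq.append(line.strip())
        if header is not None:
            entries.append((header, ''.join(seq)))
        return entries

    def next(self):
        if self.index >= len(self.entries):
            return (False, '', '')
        key, value = self.entries[self.index]
        self.index += 1
        return (True, key, value)

    def getProgress(self):
        return float(self.index) / max(1, len(self.entries))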
def run_task():
    return runTask(Factory(Mapper, Reducer))
import struct

from pydoop.pipes import Mapper, Reducer, Factory, runTask


class WordCountMapper(Mapper):

    def map(self, context):
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")


class WordCountReducer(Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), struct.pack(">i", s))


if __name__ == "__main__":
    runTask(Factory(WordCountMapper, WordCountReducer))
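# The reducer emits each count as a 4-byte big-endian integer; the
# FilterMapper shown earlier unpacks it with the matching ">i" format
# string, so the two stages round-trip cleanly:
import struct

packed = struct.pack(">i", 42)
assert struct.unpack(">i", packed)[0] == 42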