示例#1
0
 def test_record_reader_from_cpluplus(self):
     """A Python record reader driven from the C++ side must yield the
     expected key/value sequence, report monotonically increasing
     progress, and finally signal end-of-input.
     """
     d = {
         'input_key': 'inputkey',
         'input_value': 'inputvalue',
         'input_split': 'inputsplit',
         'input_key_class': 'keyclass',
         'input_value_class': 'valueclass',
         'job_conf': {}
     }
     ctx = pp.get_MapContext_object(d)
     self.assertEqual(ctx.getInputKey(), d['input_key'])
     self.assertEqual(ctx.getInputValue(), d['input_value'])
     self.assertEqual(ctx.getInputSplit(), d['input_split'])
     self.assertEqual(ctx.getInputKeyClass(), d['input_key_class'])
     self.assertEqual(ctx.getInputValueClass(), d['input_value_class'])
     # Use distinct names for the factory and the "more records" flag;
     # the original reused 'f' for both, shadowing the factory.
     factory = Factory(None, None, test_record_reader)
     rr = factory.createRecordReader(ctx)
     for i in range(test_record_reader.NUMBER_RECORDS):
         (flag, k, v) = pp.get_record_from_record_reader(rr)
         self.assertTrue(flag)
         self.assertEqual(k, test_record_reader.KEY_FORMAT % (i + 1))
         self.assertEqual(v, test_record_reader.DEFAULT_VALUE)
         self.assertAlmostEqual(
             pp.get_progress_from_record_reader(rr),
             float(i + 1) / test_record_reader.NUMBER_RECORDS)
     # One read past the end: the flag must report no more records.
     (flag, k, v) = pp.get_record_from_record_reader(rr)
     self.assertFalse(flag)
示例#2
0
 def test_factory_costructor(self):
     """Factory must build the configured mapper/reducer (and, when
     given, record reader) classes from the matching contexts.

     ``failUnless`` is a deprecated alias removed in Python 3.12;
     use ``assertTrue`` instead.
     """
     f = Factory(mapper, reducer)
     self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
     self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
     #--
     f = Factory(mapper, reducer, record_reader)
     self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
     self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
     self.assertTrue(
         isinstance(f.createRecordReader(self.m_ctx), record_reader))
示例#3
0
def run_job():
    """
    Execute the seqal application as a Hadoop pipes task via Pydoop.
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    factory = Factory(mapper, reducer)
    return runTask(factory)
示例#4
0
 def test_factory_costructor(self):
   """Factory must build the configured mapper/reducer (and, when
   given, record reader) classes from the matching contexts.

   ``failUnless`` is a deprecated alias removed in Python 3.12; use
   ``assertTrue`` instead.
   """
   f = Factory(mapper, reducer)
   self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
   self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
   #--
   f = Factory(mapper, reducer, record_reader)
   self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
   self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
   self.assertTrue(isinstance(f.createRecordReader(self.m_ctx), record_reader))
示例#5
0
 def test_partitioner_from_cpluplus(self):
     """A Python partitioner created through the C++ bridge must agree
     with partition_function on a sample of keys.
     """
     expected = {
         'input_key': 'inputkey',
         'input_value': 'inputvalue',
         'input_split': 'inputsplit',
         'input_key_class': 'keyclass',
         'input_value_class': 'valueclass',
         'job_conf': {}
     }
     ctx = pp.get_MapContext_object(expected)
     self.assertEqual(ctx.getInputKey(), expected['input_key'])
     self.assertEqual(ctx.getInputValue(), expected['input_value'])
     self.assertEqual(ctx.getInputSplit(), expected['input_split'])
     self.assertEqual(ctx.getInputKeyClass(), expected['input_key_class'])
     self.assertEqual(ctx.getInputValueClass(), expected['input_value_class'])
     factory = Factory(None, None, partitioner_class=test_partitioner)
     p = factory.createPartitioner(ctx)
     n_partitions = 4
     for i in range(10):
         key = 'key' + i * 'a'
         want = partition_function(key, n_partitions)
         got = pp.get_partition_from_partitioner(p, key, n_partitions)
         self.assertEqual(want, got)
示例#6
0
 def test_partitioner_from_cpluplus(self):
   """A Python partitioner created through the C++ bridge must agree
   with partition_function on a sample of keys.
   """
   expected = {
     'input_key': 'inputkey',
     'input_value': 'inputvalue',
     'input_split': 'inputsplit',
     'input_key_class': 'keyclass',
     'input_value_class': 'valueclass',
     'job_conf': {},
   }
   ctx = pp.get_MapContext_object(expected)
   self.assertEqual(ctx.getInputKey(), expected['input_key'])
   self.assertEqual(ctx.getInputValue(), expected['input_value'])
   self.assertEqual(ctx.getInputSplit(), expected['input_split'])
   self.assertEqual(ctx.getInputKeyClass(), expected['input_key_class'])
   self.assertEqual(ctx.getInputValueClass(), expected['input_value_class'])
   factory = Factory(None, None, partitioner_class=test_partitioner)
   p = factory.createPartitioner(ctx)
   n_partitions = 4
   for i in range(10):
     key = 'key' + i * 'a'
     want = partition_function(key, n_partitions)
     got = pp.get_partition_from_partitioner(p, key, n_partitions)
     self.assertEqual(want, got)
示例#7
0
 def test_record_reader_from_cpluplus(self):
   """A Python record reader driven from the C++ side must yield the
   expected key/value sequence, report monotonically increasing
   progress, and finally signal end-of-input.
   """
   d = {'input_key' : 'inputkey',
        'input_value' : 'inputvalue',
        'input_split' : 'inputsplit',
        'input_key_class' : 'keyclass',
        'input_value_class' : 'valueclass',
        'job_conf' : {}}
   ctx = pp.get_MapContext_object(d)
   self.assertEqual(ctx.getInputKey(), d['input_key'])
   self.assertEqual(ctx.getInputValue(), d['input_value'])
   self.assertEqual(ctx.getInputSplit(), d['input_split'])
   self.assertEqual(ctx.getInputKeyClass(), d['input_key_class'])
   self.assertEqual(ctx.getInputValueClass(), d['input_value_class'])
   # Use distinct names for the factory and the "more records" flag;
   # the original reused 'f' for both, shadowing the factory.
   factory = Factory(None, None, test_record_reader)
   rr = factory.createRecordReader(ctx)
   for i in range(test_record_reader.NUMBER_RECORDS):
     (flag, k, v) = pp.get_record_from_record_reader(rr)
     self.assertTrue(flag)
     self.assertEqual(k, test_record_reader.KEY_FORMAT % (i+1))
     self.assertEqual(v, test_record_reader.DEFAULT_VALUE)
     self.assertAlmostEqual(pp.get_progress_from_record_reader(rr),
                            float(i+1)/test_record_reader.NUMBER_RECORDS)
   # One read past the end: the flag must report no more records.
   (flag, k, v) = pp.get_record_from_record_reader(rr)
   self.assertFalse(flag)
示例#8
0
 def test_map_reduce_factory(self):
     """Exercise the factory through the C++ bridge and verify that no
     reference cycles are leaked (gc.collect() finds nothing) and that
     mapper/reducer construction is recorded in call_history.

     ``failUnless`` is a deprecated alias removed in Python 3.12; use
     ``assertTrue`` instead.
     """
     import gc
     self.__check_ctx()
     mapper.call_history = []
     reducer.call_history = []
     mf = Factory(mapper, reducer)
     gc.collect()  # clean up existing references
     pp.try_factory_internal(mf)
     # zero collected objects -> the bridge left no garbage cycles
     self.assertEqual(0, gc.collect())
     self.assertEqual(len(mapper.call_history), 2)
     self.assertEqual(len(reducer.call_history), 2)
     f = pp.TestFactory(mf)
     self.assertTrue(isinstance(f.createMapper(self.m_ctx), mapper))
     self.assertTrue(isinstance(f.createReducer(self.r_ctx), reducer))
     self.assertEqual(len(mapper.call_history), 3)
     self.assertEqual(len(reducer.call_history), 3)
     self.assertEqual(0, gc.collect())
示例#9
0
import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records for words
    whose count is at least the configured threshold.
    """

    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        job_conf = context.getJobConf()
        jc_configure_int(
            self, job_conf, "filter.occurrence.threshold", "threshold")

    def map(self, context):
        word = context.getInputKey()
        # The count arrives as a big-endian 32-bit integer.
        (count,) = struct.unpack(">i", context.getInputValue())
        if count >= self.threshold:
            context.emit(word, str(count))


class FilterReducer(Reducer):
    """Reducer that performs no work -- the mapper does the filtering."""

    def reduce(self, context):
        # Intentionally a no-op.
        pass


if __name__ == "__main__":
    # Hand the filter mapper/reducer pair to the pipes runtime.
    factory = Factory(FilterMapper, FilterReducer)
    runTask(factory)
示例#10
0
def run_task():
    """Run the pipes task, reusing Reducer as the combiner."""
    factory = Factory(Mapper, Reducer, combiner_class=Reducer)
    return runTask(factory)
示例#11
0
def run_task():
    """Run the pipes task with the module's mapper and reducer."""
    factory = Factory(mapper, reducer)
    return runTask(factory)
示例#12
0
def main(argv):
    """Entry point: run the FASTA pipes task (argv is accepted but unused)."""
    factory = Factory(
        FastaMapper, FastaReducer, record_reader_class=FastaReader)
    runTask(factory)
示例#13
0
def run_task():
  """Build the default factory and hand control to the pipes runtime."""
  factory = Factory(Mapper, Reducer)
  return runTask(factory)
示例#14
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask


class WordCountMapper(Mapper):
    """Emit (token, "1") for every whitespace-separated token in the value."""

    def map(self, context):
        for token in context.getInputValue().split():
            context.emit(token, "1")


class WordCountReducer(Reducer):
    """Sum the counts for a word and emit the total as a big-endian int32."""

    def reduce(self, context):
        total = 0
        while context.nextValue():
            total += int(context.getInputValue())
        context.emit(context.getInputKey(), struct.pack(">i", total))


if __name__ == "__main__":
    # Hand the wordcount mapper/reducer pair to the pipes runtime.
    factory = Factory(WordCountMapper, WordCountReducer)
    runTask(factory)