Example #1
"""
This example includes only the bare minimum required to run
wordcount. See wordcount-full.py for an example that uses counters,
RecordReader, etc.
"""

import pydoop.pipes as pp


class Mapper(pp.Mapper):
    def map(self, context):
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):
    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer))
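
To get a feel for what this minimal pair computes without a Hadoop cluster, here is a small standalone sketch that mirrors the same logic on plain Python data. The names fake_map and fake_reduce are hypothetical helpers used only for illustration; they are not part of pydoop and bypass the pipes runtime and context objects entirely.

# Illustrative sketch only: reproduces the Mapper/Reducer logic above on
# plain Python data, without the pipes runtime or a real context object.
from collections import defaultdict


def fake_map(line):
    # like Mapper.map: emit one ("word", "1") pair per whitespace token
    return [(w, "1") for w in line.split()]


def fake_reduce(pairs):
    # like Reducer.reduce: sum the string-encoded counts for each key
    counts = defaultdict(int)
    for k, v in pairs:
        counts[k] += int(v)
    return [(k, str(s)) for k, s in sorted(counts.items())]


pairs = fake_map("the quick brown fox jumps over the lazy dog the")
print fake_reduce(pairs)  # [('brown', '1'), ('dog', '1'), ..., ('the', '3')]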
Example #2
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write("%s%s%s\n" % (key, self.sep, value))


class Partitioner(pp.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = logging.getLogger("Partitioner")

    def partition(self, key, numOfReduces):
        reducer_id = (hash(key) & sys.maxint) % numOfReduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id


if __name__ == "__main__":
    pp.runTask(pp.Factory(
        Mapper, Reducer,
        record_reader_class=Reader,
        record_writer_class=Writer,
        partitioner_class=Partitioner,
        combiner_class=Reducer,
    ))

# Local Variables:
# mode: python
# End:
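
For reference, the hash-and-mask rule used by Partitioner.partition above can be tried out on its own. The snippet below is only an illustration of how keys map to reducer ids under that rule (Python 2, since the masking relies on sys.maxint); reducer_for is a hypothetical helper name, not part of the example.

import sys


def reducer_for(key, num_reduces):
    # same rule as Partitioner.partition: mask with sys.maxint to keep the
    # hash non-negative, then take it modulo the number of reducers
    return (hash(key) & sys.maxint) % num_reduces


for k in ("the", "quick", "brown", "fox"):
    print k, "->", reducer_for(k, 4)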
Example #3
RecordReader, etc.
"""

import pydoop.pipes as pp
import re


class Mapper(pp.Mapper):
    def __init__(self, context):
        super(Mapper, self).__init__(context)
        print context  # debug: dump the task context when the mapper is created

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):
    def __init__(self, context):
        super(Reducer, self).__init__(context)
        print "Reduce"  # debug marker printed when the reducer is created

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(mapper_class=Mapper, reducer_class=Reducer))
Example #4
def run_task():
    return pp.runTask(pp.Factory(Mapper, Reducer))
Example #5
        super(Mapper, self).__init__(context)
        context.setStatus("Initialization started")
        self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
        jc = context.getJobConf()
        pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
        if self.excludes_fn:
            with open(self.excludes_fn) as f:
                self.excludes = set(l.strip() for l in f if not l.isspace())
        else:
            self.excludes = set()
        context.setStatus("Initialization done")

    def map(self, context):
        ip = context.getInputValue().split(None, 1)[0]
        if ip not in self.excludes:
            context.emit(ip, "1")
        else:
            context.incrementCounter(self.excluded_counter, 1)


class Reducer(pp.Reducer):
    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, combiner_class=Reducer))
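
The pu.jc_configure call in the Mapper constructor above copies a job configuration property onto the instance, falling back to a default when the property is unset. Roughly, it behaves like the hypothetical sketch below; the real pydoop.utils implementation may differ in details such as type conversion and error handling.

def jc_configure_sketch(obj, jc, key, attr, default=None):
    # Hypothetical approximation of pu.jc_configure: read `key` from the
    # job conf if present, otherwise use `default`, and store the result
    # as the `attr` attribute on `obj`.
    value = jc.get(key) if jc.hasKey(key) else default
    setattr(obj, attr, value)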
Example #6
        super(Reader, self).__init__()
        self.isplit = pp.InputSplit(context.getInputSplit())
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()  # read by reader of previous split
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:  # end of input split
            return (False, "", "")
        key = struct.pack(">q", self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            return (False, "", "")
        self.bytes_read += len(record)
        return (True, key, record)

    def getProgress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, record_reader_class=Reader))