def run_job():
    """
    Runs the Hadoop pipes task through Pydoop.
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    return runTask(Factory(mapper, reducer))
# END_COPYRIGHT """ This example includes only the bare minimum required to run wordcount. See wordcount-full.py for an example that uses counters, RecordReader, etc. """ import pydoop.pipes as pp class Mapper(pp.Mapper): def map(self, context): words = context.getInputValue().split() for w in words: context.emit(w, "1") class Reducer(pp.Reducer): def reduce(self, context): s = 0 while context.nextValue(): s += int(context.getInputValue()) context.emit(context.getInputKey(), str(s)) if __name__ == "__main__": pp.runTask(pp.Factory(Mapper, Reducer))
context.setStatus("Initialization started") self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES") jc = context.getJobConf() pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "") if self.excludes_fn: with open(self.excludes_fn) as f: self.excludes = set(l.strip() for l in f if not l.isspace()) else: self.excludes = set() context.setStatus("Initialization done") def map(self, context): ip = context.getInputValue().split(None, 1)[0] if ip not in self.excludes: context.emit(ip, "1") else: context.incrementCounter(self.excluded_counter, 1) class Reducer(pp.Reducer): def reduce(self, context): s = 0 while context.nextValue(): s += int(context.getInputValue()) context.emit(context.getInputKey(), str(s)) if __name__ == "__main__": pp.runTask(pp.Factory(Mapper, Reducer, combiner_class=Reducer))
# END_COPYRIGHT

"""
This example includes only the bare minimum required to run wordcount.
See wordcount-full.py for an example that uses counters, RecordReader, etc.
"""

import pydoop.pipes as pp


class Mapper(pp.Mapper):

    def map(self, context):
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer))
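# A standalone sketch (an illustration added here, not part of Pydoop or of the
# example above) of the data flow the framework drives for wordcount: map each
# input line to (word, "1") pairs, group the pairs by key, then sum the values
# per key, mirroring Mapper.map and Reducer.reduce. It runs without Hadoop;
# the name local_wordcount is hypothetical.

from collections import defaultdict


def local_wordcount(lines):
    # "map" phase: emit one (word, "1") pair per word, as Mapper.map does
    pairs = []
    for line in lines:
        for w in line.split():
            pairs.append((w, "1"))
    # shuffle phase: group values by key (done by Hadoop between map and reduce)
    grouped = defaultdict(list)
    for k, v in pairs:
        grouped[k].append(v)
    # "reduce" phase: sum the string counts per word, as Reducer.reduce does
    return dict((k, str(sum(int(v) for v in vs))) for k, vs in grouped.items())


if __name__ == "__main__":
    print(local_wordcount(["the quick brown fox", "the lazy dog"]))
    # e.g. {'the': '2', 'quick': '1', 'brown': '1', ...}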
# (this chunk picks up inside the Writer record writer class: the end of its
#  close() method, followed by emit())
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write("%s%s%s\n" % (key, self.sep, value))


class Partitioner(pp.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = logging.getLogger("Partitioner")

    def partition(self, key, numOfReduces):
        reducer_id = (hash(key) & sys.maxint) % numOfReduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id


if __name__ == "__main__":
    pp.runTask(pp.Factory(
        Mapper, Reducer,
        record_reader_class=Reader,
        record_writer_class=Writer,
        partitioner_class=Partitioner,
        combiner_class=Reducer,
    ))

# Local Variables:
# mode: python
# End:
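# A standalone illustration (assumes Python 2, since sys.maxint was removed in
# Python 3) of the bucketing rule used by Partitioner.partition above: masking
# the hash with sys.maxint clears the sign bit, so the modulo always yields a
# valid, non-negative reducer index. The helper name bucket is hypothetical.

import sys


def bucket(key, num_reduces):
    return (hash(key) & sys.maxint) % num_reduces


if __name__ == "__main__":
    for k in ("alpha", "bravo", "charlie"):
        print("%s -> reducer %d" % (k, bucket(k, 4)))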
def main(argv):
    runTask(Factory(FastaMapper, FastaReducer, record_reader_class=FastaReader))
def run_task():
    return pp.runTask(pp.Factory(Mapper, Reducer))
class Mapper(pp.Mapper):

    def __init__(self, context):
        super(Mapper, self).__init__(context)
        context.setStatus("Initialization started")
        self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
        jc = context.getJobConf()
        pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
        if self.excludes_fn:
            with open(self.excludes_fn) as f:
                self.excludes = set(l.strip() for l in f if not l.isspace())
        else:
            self.excludes = set()
        context.setStatus("Initialization done")

    def map(self, context):
        ip = context.getInputValue().split(None, 1)[0]
        if ip not in self.excludes:
            context.emit(ip, "1")
        else:
            context.incrementCounter(self.excluded_counter, 1)


class Reducer(pp.Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, combiner_class=Reducer))
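# A standalone sketch of the per-record logic in the Mapper above: take the
# first whitespace-delimited field of a log line as the IP address and emit it
# only if it is not in the exclude set. Counters, setStatus and JobConf access
# are framework services and are left out here; the sample lines are made up.

excludes = set(["10.0.0.1"])
log_lines = [
    "10.0.0.1 - - [01/Jan/2024] GET /index.html",
    "192.168.1.7 - - [01/Jan/2024] GET /about.html",
]
for line in log_lines:
    ip = line.split(None, 1)[0]
    if ip not in excludes:
        print("%s\t1" % ip)  # what the mapper would emit as (ip, "1")
# only 192.168.1.7 is emitted; 10.0.0.1 would increment EXCLUDED_LINES instead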
import pydoop.pipes as pp
import re


class Mapper(pp.Mapper):

    def __init__(self, context):
        print context  # debug output: the map context object

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):

    def __init__(self, context):
        print "Reducer created"  # debug output

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(mapper_class=Mapper, reducer_class=Reducer))
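# A quick standalone illustration of the tokenization used in Mapper.map above:
# runs of non-alphanumeric characters are collapsed to a single space before
# splitting, so punctuation does not produce separate "words". The sample line
# is made up.

import re

line = "Hello, world!! Hello... world?"
print(re.sub('[^0-9a-zA-Z]+', ' ', line).split())
# ['Hello', 'world', 'Hello', 'world']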
def run_task():
    return runTask(Factory(Mapper, Reducer))
def run_task():
    return runTask(Factory(mapper, reducer))
class Reader(pp.RecordReader):

    def __init__(self, context):
        super(Reader, self).__init__()
        self.isplit = pp.InputSplit(context.getInputSplit())
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()  # read by reader of previous split
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:  # end of input split
            return (False, "", "")
        key = struct.pack(">q", self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            return (False, "", "")
        self.bytes_read += len(record)
        return (True, key, record)

    def getProgress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


if __name__ == "__main__":
    pp.runTask(pp.Factory(Mapper, Reducer, record_reader_class=Reader))
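# A standalone sketch (no HDFS or Pydoop needed; read_split is a hypothetical
# helper) of the split-boundary convention the Reader above relies on: a reader
# whose split does not start at offset 0 discards its first, possibly partial,
# line, and every reader keeps going until it has consumed a line that starts
# past the end of its split, so each line is read exactly once across splits.

import io


def read_split(data, offset, length):
    f = io.BytesIO(data)
    f.seek(offset)
    bytes_read = 0
    if offset > 0:
        bytes_read += len(f.readline())  # discarded: read by the previous split's reader
    records = []
    while bytes_read <= length:  # same stop condition as Reader.next above
        line = f.readline()
        if not line:  # end of file
            break
        bytes_read += len(line)
        records.append(line)
    return records


if __name__ == "__main__":
    data = b"alpha\nbravo\ncharlie\ndelta\n"
    half = len(data) // 2
    # the two splits together yield every line exactly once
    print(read_split(data, 0, half))
    print(read_split(data, half, len(data) - half))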
def run_task():
    return runTask(Factory(Mapper, Reducer, combiner_class=Reducer))
import struct

from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int


class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only the records for words
    whose count is at least the configured threshold.
    """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.getJobConf()
        jc_configure_int(self, jc, "filter.occurrence.threshold", "threshold")

    def map(self, context):
        word, occurrence = (context.getInputKey(), context.getInputValue())
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):

    def reduce(self, context):
        pass


if __name__ == "__main__":
    runTask(Factory(FilterMapper, FilterReducer))
# END_COPYRIGHT

import struct

from pydoop.pipes import Mapper, Reducer, Factory, runTask


class WordCountMapper(Mapper):

    def map(self, context):
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")


class WordCountReducer(Reducer):

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), struct.pack(">i", s))


if __name__ == "__main__":
    runTask(Factory(WordCountMapper, WordCountReducer))
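# A small standalone check (run outside Hadoop; the value is made up) of the
# binary convention shared by WordCountReducer above and FilterMapper earlier
# in this section: counts travel between the two jobs as 4-byte big-endian
# integers, so what one side packs with struct.pack(">i", ...) the other side
# recovers with struct.unpack(">i", ...).

import struct

count = 42
packed = struct.pack(">i", count)          # the value WordCountReducer emits
unpacked = struct.unpack(">i", packed)[0]  # the value FilterMapper reads back
assert unpacked == count
print("%r -> %d" % (packed, unpacked))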