def runner(job):
    inout_opts = [("inputformat", "text"), ("outputformat", "text")]
    o1 = job.additer(Parse_comunidad_contratos_mapper,
                     join_comunidad_contratos_reduce, opts=inout_opts)

if __name__ == "__main__":
    main(runner)
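# A minimal sketch of the Parse_comunidad_contratos_mapper the runner above
# assumes. Hypothetical: the semicolon-separated column layout
# (comunidad;contratos_mujeres;contratos_hombres) is an assumption, not the
# original file's format.
def Parse_comunidad_contratos_mapper(key, value):
    fields = value.split(";")
    if len(fields) >= 3:
        comunidad, mujeres, hombres = fields[0], fields[1], fields[2]
        yield comunidad, (mujeres, hombres)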
class join_comunidades_provincias_contratos_reduce:
    def __init__(self):
        self.provincia = load_comunidades_provincias('./Comunidades_y_provincias.csv')

    def __call__(self, key, values):
        # The key is a provincia; look up its comunidad and aggregate the
        # per-gender contract counts across all values for that provincia.
        provincia = key
        if provincia not in self.provincia:
            return
        comunidad = self.provincia.get(provincia)
        acc_mujeres = 0
        acc_hombres = 0
        for v in values:
            try:
                contratos_mujeres, contratos_hombres = v
                acc_mujeres += int(contratos_mujeres)
                acc_hombres += int(contratos_hombres)
            except (ValueError, TypeError):
                continue  # skip malformed rows instead of silently dropping all output
        yield comunidad, (acc_mujeres, acc_hombres)

from dumbo import main

def runner(job):
    inout_opts = [("inputformat", "text"), ("outputformat", "text")]
    o1 = job.additer(Parse_contratos_municipio_mapper,
                     join_comunidades_provincias_contratos_reduce, opts=inout_opts)

if __name__ == "__main__":
    main(runner)
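# A sketch of the load_comunidades_provincias helper the reducer assumes: it
# builds a provincia -> comunidad dict from the CSV. The delimiter and column
# order are assumptions.
def load_comunidades_provincias(path):
    mapping = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(";")
            if len(parts) >= 2:
                comunidad, provincia = parts[0], parts[1]
                mapping[provincia] = comunidad
    return mapping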
from dumbo import main, opt

@opt("addpath", "yes")
def mapper0(key, value):
    yield "N", key[0]

def reducer0(key, values):
    yield len(set(values)), ""

def runner(job):
    job.additer(mapper0, reducer0)

def starter(prog):
    return

if __name__ == "__main__":
    main(runner, starter)
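# Illustration (hypothetical, runs locally): assuming dumbo's "addpath" option
# prepends the input path to each key, mapper0's key[0] is a file path, so the
# job above emits the number of distinct input files.
sample_keys = [("/in/a.txt", 0), ("/in/a.txt", 9), ("/in/b.txt", 0)]
assert len(set(path for path, offset in sample_keys)) == 2  # two distinct files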
def runner(job):
    compute_svd = gopts.getintkey('svd')
    mapper = mrmc.ID_MAPPER
    reducer = full.FullTSQRRed2(compute_svd)
    job.additer(mapper=mapper, reducer=reducer,
                opts=[('numreducetasks', '1')])

def starter(prog):
    # set the global opts
    gopts.prog = prog

    mat = mrmc.starter_helper(prog, True)
    if not mat:
        return "'mat' not specified"

    matname, matext = os.path.splitext(mat)
    output = prog.getopt('output')
    if not output:
        prog.addopt('output', '%s-full-tsqr-2%s' % (matname, matext))

    gopts.getintkey('svd', 0)
    gopts.save_params()

if __name__ == '__main__':
    dumbo.main(runner, starter)
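# Example invocation for this second TSQR stage (the file and matrix names are
# hypothetical; only the -mat and -svd options are implied by the starter above):
#
#   dumbo start full_tsqr_2.py -mat A-full-tsqr-1.mseq -svd 1 \
#       -hadoop $HADOOP_INSTALL
#
# With no -output given, the starter derives one from the matrix name,
# e.g. A-full-tsqr-1-full-tsqr-2.mseq.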
class TimestampMapper:
    def __call__(self, key, value):
        # value = json.loads(value)
        try:
            key = value['prb_id']
        except KeyError:
            key = 'NO_PROBE_ID'
        measurement_type = value['type']
        value['is_failure'] = None
        if measurement_type == 'ping':
            detect_ping_failure(value)        # mutates the dictionary
        elif measurement_type == 'dns':
            detect_dns_failure(value)         # mutates the dictionary
        elif measurement_type == 'traceroute':
            detect_traceroute_failure(value)  # mutates the dictionary
        try:
            yield (key, value['timestamp']), (value['timestamp'], value['is_failure'], value)
        except KeyError:
            yield (key, 'NO_TIMESTAMP'), ('NO_TIMESTAMP', value['is_failure'], value)

def runner(job):
    job.additer(TimestampMapper)

if __name__ == "__main__":
    main(runner)
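# A sketch of one of the detectors the mapper assumes; each detector mutates
# value['is_failure'] in place. The 'avg' field is an assumption (RIPE
# Atlas-style ping results report an average RTT of -1 when no reply came back).
def detect_ping_failure(value):
    value['is_failure'] = value.get('avg', -1) == -1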
            if word not in self.stopwords:
                yield word, 1

    def normalize(self, word):
        word = word.lower()
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, sentence):
        for word in wordpunct_tokenize(sentence):
            yield self.normalize(word)

    @property
    def stopwords(self):
        if not self._stopwords:
            self._stopwords = nltk.corpus.stopwords.words('english')
        return self._stopwords

def reducer(key, values):
    yield key, sum(values)

def runner(job):
    job.additer(Mapper, reducer, reducer)

def starter(prog):
    excludes = prog.delopt("stopwords")
    if excludes:
        prog.addopt("param", "stopwords=" + excludes)

if __name__ == "__main__":
    import dumbo
    dumbo.main(runner, starter)
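# A sketch of the missing head of the Mapper class above; hypothetical, since
# the original __init__ and the start of __call__ fall outside this excerpt.
# It assumes NLTK's WordNetLemmatizer and the imports the fragment relies on.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

class Mapper:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self._stopwords = None

    def __call__(self, key, value):
        # Body continues as in the fragment above:
        for word in self.tokenize(value):
            if word not in self.stopwords:
                yield word, 1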
from dumbo import main, opt
from dumbo.lib import sumreducer
from aqualab.dumbo.util import *
import json

class ProbeMapper:
    #@aquaflows.lib.parsers.Json
    def __call__(self, key, value):
        # Records are sometimes JSON-encoded more than once; keep decoding
        # until we get an actual object.
        while isinstance(value, basestring):
            value = json.loads(value)
        try:
            yield value['prb_id'], value
        except KeyError:
            yield 'NO_PROBE_ID', value

def runner(job):
    job.additer(ProbeMapper)

if __name__ == "__main__":
    main(runner)
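# Illustration of why the mapper loops on isinstance (hypothetical input): a
# doubly-encoded record decodes to a string the first time and to a dict the
# second time.
double_encoded = json.dumps(json.dumps({"prb_id": 1}))
once = json.loads(double_encoded)   # still a string
twice = json.loads(once)            # now a dict
assert isinstance(once, basestring) and twice["prb_id"] == 1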
        status = v[0]
    # Emit the joined record
    if status != "" and len(info) > 0:
        yield info, status

# Job workflow function
def runner(job):
    # Step 1: prepare users and delivery details
    opts = [("inputformat", "text"), ("outputformat", "text")]
    multimapper = MultiMapper()
    multimapper.add("users", users_parser)
    multimapper.add("details", deliveries_parser)
    o1 = job.additer(multimapper, reducer1, opts=opts)

    # Step 2: get the status descriptions
    multimapper = MultiMapper()
    multimapper.add("status", status_parser)
    o2 = job.additer(multimapper, identityreducer, opts=opts, input=[job.root])

    # Step 3: join the results
    o3 = job.additer(identitymapper, reducer2, opts=opts, input=[o1, o2])

if __name__ == "__main__":
    from dumbo import main
    main(runner)
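# A sketch of the kind of parser MultiMapper dispatches to by path substring;
# hypothetical: the semicolon-separated layout (id;description) of the status
# file is an assumption.
def status_parser(key, value):
    fields = value.split(";")
    if len(fields) >= 2:
        yield fields[0], fields[1]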
def reducer1(key, values):
    """
    Output: K=>(row1, 'earthquake', 1), V=>10, K=>(row2, 'strike', 1), V=>5
    """
    yield key, sum(values)

def mapper2(key, value):
    doc_num, issue_id, word = key
    occurrences = value
    yield (doc_num, issue_id), (word, occurrences)

def reducer2(key, values):
    # Pulls everything into memory, but that should be OK
    word_occurrences = list(values)
    yield key, word_occurrences

def runner(job):
    job.additer(mapper1, reducer1)
    job.additer(mapper2, reducer2)

if __name__ == '__main__':
    """
    Usage: dumbo start issues_from_text.py -input /some/hdfs/input
           -output /some/hdfs/output -hadoop /path/to/hadoop
    """
    import dumbo
    dumbo.main(runner)
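# A sketch of the mapper1 the runner above assumes; hypothetical: it supposes
# the first iteration's input keys carry (doc_num, issue_id) and emits one
# count per word so reducer1 can sum occurrences per (doc, issue, word) key.
def mapper1(key, value):
    doc_num, issue_id = key
    for word in value.split():
        yield (doc_num, issue_id, word), 1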
"country_code", "country_name", "region", "region_name", "city", "postal_code", "latitude", "longitude", "area_code", "time_zone", "metro_code" ] saved_fields = [ "cc", "country", "st", "state", "city", "postcode", "lat", "lng", "areacode", "tz", "metrocode" ] for field, save_as in zip(fields, saved_fields): val = gir[field] if val is None: continue d[save_as] = val value["geo"] = d yield key, value def runner(job): job.additer(Mapper) def starter(prog): parseArgs(prog) if __name__ == "__main__": main(runner, starter)