Example #1
from dumbo import main

# Parse_comunidad_contratos_mapper and join_comunidad_contratos_reduce are
# defined elsewhere in the original module.
def runner(job):
    inout_opts = [("inputformat", "text"), ("outputformat", "text")]

    o1 = job.additer(Parse_comunidad_contratos_mapper,
                     join_comunidad_contratos_reduce,
                     opts=inout_opts)

if __name__ == "__main__":
    main(runner)
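Scripts like this one are not run directly; dumbo's CLI launches them on Hadoop. The usage string in Example #9 shows the pattern; generically (all paths are placeholders):

dumbo start script.py -input /some/hdfs/input -output /some/hdfs/output -hadoop /path/to/hadoop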
Example #2
class join_comunidades_provincias_contratos_reduce:

    def __init__(self):
        # provincia -> comunidad lookup, loaded once per reduce task
        self.provincia = load_comunidades_provincias('./Comunidades_y_provincias.csv')

    def __call__(self, key, values):
        try:
            acc_mujeres = 0
            acc_hombres = 0

            provincia = key[:]
            comunidad = self.provincia.get(provincia)

            for v in values:
                contratos_mujeres, contratos_hombres = v[:]
                if provincia in self.provincia:
                    acc_mujeres += int(contratos_mujeres)
                    acc_hombres += int(contratos_hombres)

            yield comunidad, (acc_mujeres, acc_hombres)
        except (KeyError, ValueError):
            # Skip malformed records rather than failing the whole reduce
            pass

from dumbo import main

# Parse_contratos_municipio_mapper is defined elsewhere in the original module.
def runner(job):
    inout_opts = [("inputformat", "text"), ("outputformat", "text")]
    o1 = job.additer(Parse_contratos_municipio_mapper,
                     join_comunidades_provincias_contratos_reduce,
                     opts=inout_opts)

if __name__ == "__main__":
    main(runner)
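The helper load_comunidades_provincias never appears in the snippet. A minimal sketch of what it might look like, assuming one provincia per CSV row and a ';' delimiter (both guesses):

import csv

def load_comunidades_provincias(path):
    # Hypothetical helper: build a provincia -> comunidad lookup table
    table = {}
    with open(path) as f:
        for comunidad, provincia in csv.reader(f, delimiter=';'):
            table[provincia] = comunidad
    return table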
Example #3
from dumbo import main, opt

# "addpath" makes dumbo prepend the input path to every key, so key[0]
# below is the file each record came from.
@opt("addpath", "yes")
def mapper0(key, value):
    yield "N", key[0]

def reducer0(key, values):
    # Count the distinct values collected under the single key "N"
    yield len(set(values)), ""

def runner(job):
    job.additer(mapper0, reducer0)

def starter(prog):
    return

if __name__ == "__main__":
    main(runner, starter)
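Since addpath puts the input path in key[0], this mapper/reducer pair effectively counts distinct input files. A quick local simulation of the two phases, with made-up records in the assumed (path, offset) key layout:

records = [(("a.txt", 0), "x"), (("a.txt", 1), "y"), (("b.txt", 0), "z")]
mapped = [kv for k, v in records for kv in mapper0(k, v)]
print(list(reducer0("N", (v for _, v in mapped))))  # -> [(2, '')]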
Example #4

import os
import dumbo

# 'gopts', 'mrmc' and 'full' are project-local modules from the original
# codebase; they are imported in the full script (not shown here).

def runner(job):
    compute_svd = gopts.getintkey('svd')
    mapper = mrmc.ID_MAPPER
    reducer = full.FullTSQRRed2(compute_svd)
    job.additer(mapper=mapper,
                reducer=reducer,
                opts=[('numreducetasks', str(1))])


def starter(prog):
    # set the global opts
    gopts.prog = prog

    mat = mrmc.starter_helper(prog, True)
    if not mat: return "'mat' not specified"

    matname, matext = os.path.splitext(mat)
    output = prog.getopt('output')
    if not output:
        prog.addopt('output', '%s-full-tsqr-2%s' % (matname, matext))

    gopts.getintkey('svd', 0)

    gopts.save_params()


if __name__ == '__main__':
    dumbo.main(runner, starter)
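Judging from starter(), the script expects a -mat option naming the input matrix (starter_helper fails without it) and an optional -svd integer read through gopts. A plausible invocation, with a placeholder script name and paths:

dumbo start full_tsqr2.py -mat A.mseq -svd 1 -hadoop /path/to/hadoop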
Example #5
import json  # needed if the json.loads line below is re-enabled
from dumbo import main

# detect_ping_failure, detect_dns_failure and detect_traceroute_failure are
# helpers defined elsewhere in the original module; each mutates the record
# it receives.
class TimestampMapper:

    def __call__(self, key, value):
        # value = json.loads(value)
        try:
            key = value['prb_id']
        except KeyError:
            key = 'NO_PROBE_ID'

        measurement_type = value['type']
        value['is_failure'] = None

        if measurement_type == 'ping':
            detect_ping_failure(value)  # mutates the dictionary
        elif measurement_type == 'dns':
            detect_dns_failure(value)  # mutates the dictionary
        elif measurement_type == 'traceroute':
            detect_traceroute_failure(value)  # mutates the dictionary

        try:
            yield (key, value['timestamp']), (value['timestamp'],
                                              value['is_failure'], value)
        except KeyError:
            yield (key, 'NO_TIMESTAMP'), ('NO_TIMESTAMP', value['is_failure'],
                                          value)


def runner(job):
    job.additer(TimestampMapper)


if __name__ == "__main__":
    main(runner)
Example #6
import nltk
from nltk.tokenize import wordpunct_tokenize

class Mapper:

    def __init__(self):
        # NOTE: reconstructed __init__; the original snippet starts mid-class.
        self.lemmatizer = nltk.WordNetLemmatizer()
        self._stopwords = None

    def __call__(self, key, value):
        for word in self.tokenize(value):
            if word not in self.stopwords:
                yield word, 1

    def normalize(self, word):
        word = word.lower()
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, sentence):
        for word in wordpunct_tokenize(sentence):
            yield self.normalize(word)

    @property
    def stopwords(self):
        if not self._stopwords:
            self._stopwords = nltk.corpus.stopwords.words('english')
        return self._stopwords

def reducer(key, values):
    yield key, sum(values)

def runner(job):
    # The second 'reducer' argument is used as a combiner
    job.additer(Mapper, reducer, reducer)

def starter(prog):
    # Forward a -stopwords option to the tasks as a "param"
    excludes = prog.delopt("stopwords")
    if excludes: prog.addopt("param", "stopwords=" + excludes)

if __name__ == "__main__":
    import dumbo
    dumbo.main(runner, starter)
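starter() rewrites a -stopwords option into a task parameter, so a stopwords file can be supplied at launch time. A plausible invocation, with placeholder file names:

dumbo start wordcount.py -input corpus.txt -output counts -stopwords stopwords.txt -hadoop /path/to/hadoop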
Example #7
from dumbo import main, opt
from dumbo.lib import sumreducer
from aqualab.dumbo.util import *

import json

class ProbeMapper:
    #@aquaflows.lib.parsers.Json
    def __call__(self, key, value):
        # The payload may be JSON-encoded more than once, so keep decoding
        # until an object comes out (Python 2 code: 'unicode' is the text type).
        while type(value) == str or type(value) == unicode:
            value = json.loads(value)
        try:
            yield value['prb_id'], value
        except KeyError:
            yield 'NO_PROBE_ID', value

def runner(job):
    job.additer(ProbeMapper)

if __name__ == "__main__":
    main(runner)
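The while loop is there because a record can arrive JSON-encoded more than once. A small illustration of why a single json.loads would not be enough (Python 3 spelling, str only):

import json

raw = json.dumps(json.dumps({"prb_id": 123}))  # a doubly encoded record
value = raw
while type(value) == str:
    value = json.loads(value)
print(value)  # {'prb_id': 123}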
Example #8
      status = v[0]

  # Emit the joined record once both pieces are present
  if status != "" and len(info) > 0:
    yield info, status

# Jobs workflow function. MultiMapper, identitymapper and identityreducer
# come from dumbo.lib (imported in the original module's header, not shown).
def runner(job):
  # Step 1: Parse the users and deliveries details
  opts = [("inputformat", "text"),
          ("outputformat", "text")]
  multimapper = MultiMapper()
  multimapper.add("users", users_parser)
  multimapper.add("details", deliveries_parser)
  o1 = job.additer(multimapper, reducer1, opts=opts)

  # Step 2: Get the status description
  multimapper = MultiMapper()
  multimapper.add("status", status_parser)
  o2 = job.additer(multimapper, identityreducer, opts=opts, input=[job.root])

  # Step 3: Join the results
  o3 = job.additer(identitymapper, reducer2, opts=opts, input=[o1, o2])

if __name__ == "__main__":
  from dumbo import main
  main(runner)
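MultiMapper routes each record to the parser registered for the matching input-path pattern, which is how one iteration can parse the "users" and "details" inputs differently. Step 2 re-reads the job's original input via input=[job.root], and step 3 joins the two intermediate outputs by listing them in input=[o1, o2].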
Example #9
def reducer1(key, values):
    """Sum the occurrence counts per key. Output, e.g.:

    K=>(row1, 'earthquake', 1), V=>10
    K=>(row2, 'strike', 1), V=>5
    """
    yield key, sum(values)


def mapper2(key, value):
    doc_num, issue_id, word = key
    occurrences = value
    yield (doc_num, issue_id), (word, occurrences)


def reducer2(key, values):
    # Pulls everything into memory, but the per-key lists should be small
    word_occurrences = list(values)
    yield key, word_occurrences


def runner(job):
    # mapper1 is not part of this snippet; see the sketch after the example
    job.additer(mapper1, reducer1)
    job.additer(mapper2, reducer2)


if __name__ == '__main__':
    """Usage:
        dumbo start issues_from_text.py -input /some/hdfs/input
            -output /some/hdfs/output -hadoop /path/to/hadoop
    """
    import dumbo
    dumbo.main(runner)
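mapper1 never appears in the snippet, but mapper2's key unpacking implies it emits ((doc_num, issue_id, word), 1) per word occurrence. A hypothetical sketch, assuming a tab-separated input line (the real input format is not shown):

def mapper1(key, value):
    # Assumed layout: "doc_num<TAB>issue_id<TAB>text" -- purely a guess
    doc_num, issue_id, text = value.split('\t', 2)
    for word in text.split():
        yield (doc_num, issue_id, word), 1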
Example #10
            "country_code", "country_name", "region", "region_name", "city",
            "postal_code", "latitude", "longitude", "area_code", "time_zone",
            "metro_code"
        ]

        saved_fields = [
            "cc", "country", "st", "state", "city", "postcode", "lat", "lng",
            "areacode", "tz", "metrocode"
        ]

        for field, save_as in zip(fields, saved_fields):
            val = gir[field]
            if val is None: continue
            d[save_as] = val

        value["geo"] = d

        yield key, value


def runner(job):
    job.additer(Mapper)


def starter(prog):
    parseArgs(prog)


if __name__ == "__main__":
    main(runner, starter)
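The snippet never shows where gir comes from. One common source for such a record is a GeoIP city-database lookup; a sketch assuming the pygeoip library (the client actually used here is unknown, and its field names may differ):

import pygeoip  # assumption only; any GeoIP city client would do

geo = pygeoip.GeoIP('GeoLiteCity.dat')
gir = geo.record_by_addr('8.8.8.8') or {}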