Example #1
# map.py
#
# Part of the mr.py library.  Run on worker machines by mr.py when a
# map-only job is called.

import itertools, mr_lib, sys

mr_lib.set_flag("map_done",False) # map phase on this worker not yet done
filename,input_dict = sys.argv[1:]
exec("from "+filename+" import mapper") # import map job
i = mr_lib.read_pickle(input_dict) # Read the input dictionary
mapper_params = mr_lib.read_pickle("mapper_params.mr") # Get the parameters for the mapper
for k in i.keys(): i[k] = mapper(k,i[k],mapper_params) # Run the mapper
mr_lib.write_pickle(i,input_dict) # Write the input dictionary back out
mr_lib.set_flag("map_done",True) # map phase is done
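
The listing above calls three mr_lib helpers whose source is not shown on this page: set_flag, read_pickle, and write_pickle.  Below is a minimal sketch of what they might look like, assuming they do nothing more than pickle objects to files and record flags as small files the client can poll; the real mr_lib may differ.

import pickle

def write_pickle(obj, filename):
  # Serialize obj to the named file
  with open(filename, "wb") as f:
    pickle.dump(obj, f)

def read_pickle(filename):
  # Load and return the object pickled in the named file
  with open(filename, "rb") as f:
    return pickle.load(f)

def set_flag(name, value):
  # Record a named flag in a small file the client can read back
  # (the ".flag" suffix is an assumed convention, not taken from mr.py)
  write_pickle(value, name + ".flag")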

Example #2
# map_combine.py
#
# Part of the mr.py library.
#
# This program is automatically run on worker machines by mr.py, at the start of
# a MapReduce job.

import itertools, mr_lib, sys

mr_lib.set_flag("map_combine_done",False) # flag indicating map_combine not yet done

# Read in the IP address of the current worker, and the description of the whole cluster
my_number,my_ip = mr_lib.read_pickle("my_details.mr")
ip = mr_lib.read_pickle("cluster_description.mr")

filename,input_dict = sys.argv[1:]
exec("from "+filename+" import mapper, reducer")

# Get the parameters for the MapReduce job
mapper_params,reducer_params = mr_lib.read_pickle("params.mr")

# Read the input dictionary into local memory
i = mr_lib.read_pickle(input_dict)

# Run MapReduce over the local input dictionary.
intermediate = []
for (key,value) in i.items():
  intermediate.extend(mapper(key,value,mapper_params))
groups = {}
for key, group in itertools.groupby(sorted(intermediate),
                                    lambda x: x[0]):
  groups[key] = [y for x, y in group]
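
Both map.py and map_combine.py import the user's MapReduce job with an exec statement, so the job module must expose functions named mapper and reducer.  A hypothetical word-count job illustrating that interface is sketched below: mapper(key, value, params) returning a list of (key, value) pairs matches the calls in the listings, while the reducer signature reducer(key, values, params) is an assumption based on how reduce.py imports it.

# wordcount.py -- hypothetical job module, not part of mr.py

def mapper(key, value, params):
  # key: document name, value: document text; params is unused here
  # Emit one (word, 1) pair for every word in the document
  return [(word, 1) for word in value.split()]

def reducer(key, values, params):
  # key: a word, values: the list of counts emitted for that word
  return sum(values)
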
Example #3
# reduce.py
#
# Part of the mr.py library.
#
# This program is automatically run on worker machines by mr.py, to conclude
# a MapReduce job.

import itertools, mr_lib, os, sys

# Sets a flag, visible to the client, saying that the reduce phase on this worker 
# is not yet done.
mr_lib.set_flag("reduce_done",False)

# Read in the IP address of the current worker, and the description of the whole cluster
my_number,my_ip = mr_lib.read_pickle("my_details.mr")
ip = mr_lib.read_pickle("cluster_description.mr")

# Read the filename and input dictionary
filename,output_dict,output_field = sys.argv[1:]

# Get the parameters for the MapReduce job
mapper_params,reducer_params = mr_lib.read_pickle("params.mr")

# Import the reducer, stripping the trailing ".py" to get the module name
module = filename[:-3]
exec("from "+module+" import reducer")

# read in all the intermediate data
intermediate = []
for machine in xrange(len(ip)):
  name = "inter.dict."+str(machine)+"."+str(my_number)
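
The listing stops inside the loop that builds the intermediate file names.  What follows is a rough sketch of how the rest of the reduce phase might plausibly continue, reusing the imports and variables defined above; it assumes each "inter.dict.<sender>.<receiver>" file holds a pickled list of (key, value) pairs destined for this worker, and it ignores output_field, whose role is not visible in the excerpt.

# Hypothetical continuation, not the actual tail of reduce.py
intermediate = []
for machine in xrange(len(ip)):
  name = "inter.dict."+str(machine)+"."+str(my_number)
  intermediate.extend(mr_lib.read_pickle(name))  # assumed file contents

# Group the intermediate values by key and run the reducer on each group
groups = {}
for key, group in itertools.groupby(sorted(intermediate), lambda x: x[0]):
  groups[key] = [value for k, value in group]
result = dict((key, reducer(key, values, reducer_params))
              for key, values in groups.items())

# Write the reduced dictionary out and signal that this worker is done
mr_lib.write_pickle(result, output_dict)
mr_lib.set_flag("reduce_done", True)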