# map.py
#
# Part of the mr.py library.  Run on worker machines by mr.py, when a map
# job only is called.

import itertools, mr_lib, sys

mr_lib.set_flag("map_done",False)          # map phase on this worker not yet done

filename,input_dict = sys.argv[1:]

exec("from "+filename+" import mapper")    # import map job

i = mr_lib.read_pickle(input_dict)         # Read the input dictionary

mapper_params = mr_lib.read_pickle("mapper_params.mr")  # Get the parameters for the mapper

for k in i.keys():
    i[k] = mapper(k,i[k],mapper_params)    # Run the mapper

mr_lib.write_pickle(i,input_dict)          # Write the input dictionary back out

mr_lib.set_flag("map_done",True)           # map phase is done
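The map job imported above is user-supplied. Below is a minimal sketch of such a job, assuming only the interface implied by map.py's call site: the mapper receives a key, its value, and the shared mapper parameters, and its return value replaces the original value in the input dictionary. The module name and its behaviour are illustrative, not part of mr.py.

# hypothetical_map_job.py (illustrative only, not part of mr.py)
#
# A map-only job for map.py: replace each block of text with its word count.

def mapper(key, value, params):
    # params is the object read from mapper_params.mr; unused in this sketch
    return len(value.split())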
# map_combine.py
#
# Part of the mr.py library.
#
# This program is automatically run on worker machines by mr.py, at the start of
# a MapReduce job.

import itertools, mr_lib, sys

mr_lib.set_flag("map_combine_done",False)  # flag indicating map_combine not yet done

# Read in ip address of current worker, and description of whole cluster
my_number,my_ip = mr_lib.read_pickle("my_details.mr")
ip = mr_lib.read_pickle("cluster_description.mr")

filename,input_dict = sys.argv[1:]

exec("from "+filename+" import mapper, reducer")

# Get the parameters for the MapReduce job
mapper_params,reducer_params = mr_lib.read_pickle("params.mr")

# Read the input dictionary into local memory
i = mr_lib.read_pickle(input_dict)

# Run MapReduce over the local input dictionary.
intermediate = []
for (key,value) in i.items():
    intermediate.extend(mapper(key,value,mapper_params))

groups = {}
for key, group in itertools.groupby(sorted(intermediate), lambda x: x[0]):
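For a full MapReduce job the mapper must instead return a list of (key, value) pairs, since map_combine.py extends the intermediate list with the mapper's result and then groups by the pairs' first element. A hedged sketch of such a job follows; the reducer signature (key, values, params) is an assumption made for symmetry with the mapper and is not confirmed by the code shown in this section.

# hypothetical_wordcount_job.py (illustrative only, not part of mr.py)

def mapper(key, value, mapper_params):
    # emit one (word, 1) pair for every word in this block of text
    return [(word, 1) for word in value.split()]

def reducer(key, values, reducer_params):
    # assumed signature: sum the per-word counts produced by the mappers
    return (key, sum(values))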
# reduce.py
#
# Part of the mr.py library.
#
# This program is automatically run on worker machines by mr.py, to conclude
# a MapReduce job.

import itertools, mr_lib, os, sys

# Sets a flag, visible to the client, saying that the reduce phase on this worker
# is not yet done.
mr_lib.set_flag("reduce_done",False)

# Read in ip address of current worker, and description of whole cluster
my_number,my_ip = mr_lib.read_pickle("my_details.mr")
ip = mr_lib.read_pickle("cluster_description.mr")

# Read the job filename, the name of the output dictionary, and the output field
filename,output_dict,output_field = sys.argv[1:]

# Get the parameters for the MapReduce job
mapper_params,reducer_params = mr_lib.read_pickle("params.mr")

# import the reducer
module = filename[:-3]
exec("from "+module+" import reducer")

# read in all the intermediate data
intermediate = []
for machine in xrange(len(ip)):
    name = "inter.dict."+str(machine)+"."+str(my_number)
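All three programs rely on the mr_lib helpers read_pickle, write_pickle, and set_flag, whose implementation is not shown in this section. The following is only a minimal sketch consistent with the call signatures used above, assuming the helpers are thin wrappers around pickle files; the ".flag" suffix is invented for illustration.

# Minimal sketch of the mr_lib helpers, assuming pickle-file wrappers.
# Not the real mr_lib: only the call signatures are taken from the code above.
import pickle

def read_pickle(name):
    f = open(name, "rb")
    try:
        return pickle.load(f)
    finally:
        f.close()

def write_pickle(obj, name):
    f = open(name, "wb")
    try:
        pickle.dump(obj, f)
    finally:
        f.close()

def set_flag(name, value):
    # assumed: flags are pickled booleans stored in files the client can poll
    write_pickle(value, name + ".flag")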