def create_dict(self, name, keys):
    # assign each key to a worker by hashing, then build and ship one
    # (empty) sub-dictionary per worker
    worker_key_pairs = sorted([(hash(key) % self.size, key) for key in keys])
    for machine, local_group in itertools.groupby(worker_key_pairs, lambda x: x[0]):
        local_dict = {}
        for _, key in local_group:
            local_dict[key] = {}
        mr_lib.write_pickle(local_dict, name)
        self.send(name, machine)
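create_dict leans on two helpers that aren't shown in this section, mr_lib.write_pickle and mr_lib.read_pickle, which the worker scripts below also use. A minimal sketch, assuming they are thin wrappers around cPickle (the real mr_lib may differ in detail):

# hypothetical mr_lib pickling helpers
import cPickle

def write_pickle(obj, filename):
    # serialize obj to the named file
    f = open(filename, "wb")
    cPickle.dump(obj, f)
    f.close()

def read_pickle(filename):
    # load and return the object pickled in the named file
    f = open(filename, "rb")
    obj = cPickle.load(f)
    f.close()
    return obj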
def map(self, filename, input_dict, mapper_params):
    # distribute the file defining the mapper functions through the cluster,
    # as well as the corresponding parameters
    self.distribute_public(filename)
    mr_lib.write_pickle(mapper_params, "mapper_params.mr")
    self.distribute_public("mapper_params.mr")
    # Launch the map phase, and wait until it's done
    self.exec_public("python map.py " + filename + " " + input_dict)
    self.wait_until_task_done("map_done")
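The filename argument names a Python file each worker imports a mapper from. Reading the call off map.py below, the signature is mapper(key, value, params), where value is the dictionary of fields stored under key, and the return value is written back in place. A hypothetical job file for this map-only path (the file name and the "text" field are illustrative, not part of mr.py):

# lowercase.py (hypothetical map-only job file)

def mapper(key, fields, params):
    # return the updated field dictionary; map.py stores it back under key
    fields["text"] = fields.get("text", "").lower()
    return fields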
def mr(self, filename, mapper_params, reducer_params, input_dict, output_dict, output_field):
    # distribute the file defining the mapper and reducer functions through
    # the cluster, as well as the corresponding parameters
    self.distribute_public(filename)
    mr_lib.write_pickle([mapper_params, reducer_params], "params.mr")
    self.distribute_public("params.mr")
    # Launch the map phase, with combining, and wait until it's done
    self.exec_public("python map_combine.py " + filename + " " + input_dict)
    self.wait_until_task_done("map_combine_done")
    # Launch the reduce phase, and wait until it's done
    self.exec_public("python reduce.py " + filename + " " + output_dict + " " + output_field)
    self.wait_until_task_done("reduce_done")
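Put together, a client session might look like the sketch below. Everything in it is an assumption for illustration: the module and class names, the job file wordcount.py (its reducer is sketched after reduce.py below), and the dictionary and field names; loading actual data into the distributed dictionary is elided.

# hypothetical client session; "mr_cluster" stands in for whatever mr.py
# actually names the class these methods belong to
import mr

c = mr.mr_cluster(4)                    # start a 4-worker EC2 cluster (__init__ above)
keys = ["doc%d" % j for j in xrange(1000)]
c.create_dict("docs.dict", keys)        # shard an empty dictionary over the workers
# ... load data into docs.dict here ...
c.mr("wordcount.py", {}, {},            # one full map-combine-reduce round
     "docs.dict", "docs.dict", "count")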
def __init__(self, n):
    # set the size attribute
    self.size = n
    # set up an EC2 connection, and grab an Ubuntu 8.04 image (http://alestic.com)
    connection = boto.connect_ec2()
    image = connection.get_image("ami-1c5db975")
    # create a keypair to use with the image, save it to disk, and set
    # permissions so ssh will be happy
    self.keypair = connection.create_key_pair("mr_keypair")
    mr_lib.write_file(self.keypair.material, mr_lib.mr_keypair_filename())
    os.system("chmod 600 " + mr_lib.mr_keypair_filename())
    # tell EC2 to start the instances running, set the self.workers attribute
    # to the corresponding reservation, and wait for all the workers to start
    # running
    self.workers = image.run(n, n, "mr_keypair")
    for instance in self.workers.instances:
        instance.update()
        while instance.state != u"running":
            instance.update()
            time.sleep(5)
    # Delay before we start distributing files, so all instances are running
    # properly.
    time.sleep(10)
    # distribute a list of all the private ip addresses
    private_ip_list = [instance.private_dns_name for instance in self.workers.instances]
    mr_lib.write_pickle(private_ip_list, "cluster_description.mr")
    self.distribute_public("cluster_description.mr")
    # tell each worker its number and its own private ip address
    for j in xrange(n):
        mr_lib.write_pickle([j, self.workers.instances[j].private_dns_name],
                            "my_details.mr")
        self.send("my_details.mr", j)
    # distribute the files necessary to run map and mapreduce jobs
    self.distribute_public("map.py")
    self.distribute_public("map_combine.py")
    self.distribute_public("reduce.py")
    self.distribute_public("mr_lib.py")
    # Distribute the ssh keypair and config file, so the workers can copy
    # files to one another
    for instance in self.workers.instances:
        mr_lib.scp(mr_lib.mr_keypair_filename(),
                   "root@" + instance.public_dns_name + ":.ssh/id_rsa-mr_keypair")
        mr_lib.ssh("root@" + instance.public_dns_name,
                   "chmod 600 /root/.ssh/id_rsa-mr_keypair")
        mr_lib.scp(os.environ.get("HOME") + "/.ssh/config",
                   "root@" + instance.public_dns_name + ":.ssh/config")
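Three helper methods used throughout (send, distribute_public, and exec_public) aren't shown in this section, but their behaviour can be read off the call sites: send copies a file to one worker, distribute_public copies it to all of them, and exec_public runs a shell command everywhere. A minimal sketch, assuming they live on the same class and reuse mr_lib.scp and mr_lib.ssh as above; the real implementations may differ, and exec_public presumably runs the command in the background so all workers proceed in parallel:

def send(self, filename, machine):
    # copy filename into the home directory of a single worker
    instance = self.workers.instances[machine]
    mr_lib.scp(filename, "root@" + instance.public_dns_name + ":" + filename)

def distribute_public(self, filename):
    # copy filename to every worker in the cluster
    for machine in xrange(self.size):
        self.send(filename, machine)

def exec_public(self, command):
    # run a shell command on every worker
    for instance in self.workers.instances:
        mr_lib.ssh("root@" + instance.public_dns_name, command)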
# map.py
#
# Part of the mr.py library.  Run on worker machines by mr.py, when a map
# job only is called.

import itertools, mr_lib, sys

mr_lib.set_flag("map_done", False)      # map phase on this worker not yet done
filename, input_dict = sys.argv[1:]
# strip any ".py" extension before importing, as reduce.py does
module = filename[:-3] if filename.endswith(".py") else filename
exec("from " + module + " import mapper")   # import the map job
i = mr_lib.read_pickle(input_dict)      # Read the input dictionary
mapper_params = mr_lib.read_pickle("mapper_params.mr")  # Get the parameters for the mapper
for k in i.keys():
    i[k] = mapper(k, i[k], mapper_params)   # Run the mapper on every key
mr_lib.write_pickle(i, input_dict)      # Write the input dictionary back out
mr_lib.set_flag("map_done", True)       # map phase is done
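set_flag pairs with the client's wait_until_task_done to form a simple polling protocol: each worker records when its phase is finished, and the client polls until every worker has reported in. A plausible sketch, assuming each flag is a pickled boolean in a file named after the flag, and that get_flag (also hypothetical) fetches and unpickles a worker's copy of that file; the actual mechanism in mr_lib may differ:

# worker side, in mr_lib: record the state of a named flag
def set_flag(flag, value):
    write_pickle(value, flag + ".mr")

# client side, a method of the cluster class: block until every worker
# reports the flag as True
def wait_until_task_done(self, flag):
    while not all(self.get_flag(flag, machine)
                  for machine in xrange(self.size)):
        time.sleep(2)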
# reduce.py
#
# Part of the mr.py library.  Run on worker machines by mr.py to carry out
# the reduce phase.

import itertools, mr_lib, os, sys

mr_lib.set_flag("reduce_done", False)   # reduce phase on this worker not yet done

filename, output_dict, output_field = sys.argv[1:]

# Get this worker's number, and the list of all the workers' private ip
# addresses, as distributed by the client in __init__
my_number, my_ip = mr_lib.read_pickle("my_details.mr")
ip = mr_lib.read_pickle("cluster_description.mr")

# Get the parameters for the MapReduce job
mapper_params, reducer_params = mr_lib.read_pickle("params.mr")

# import the reducer
module = filename[:-3]                  # strip the ".py" extension
exec("from " + module + " import reducer")

# read in all the intermediate data destined for this worker
intermediate = []
for machine in xrange(len(ip)):
    name = "inter.dict." + str(machine) + "." + str(my_number)
    if os.path.exists(name):
        intermediate.extend(mr_lib.read_pickle(name))

# group the intermediate (key, value) pairs by key
groups = {}
for key, group in itertools.groupby(sorted(intermediate), lambda x: x[0]):
    groups[key] = [y for x, y in group]

# run the reducer over each group of values
result = [reducer(inter_key, groups[inter_key], reducer_params)
          for inter_key in groups]

# load the existing output dictionary, and modify it to include the results
# obtained in the reduce step, saving the resulting file
o = mr_lib.read_pickle(output_dict)
for key, value in result:
    o[key][output_field] = value
mr_lib.write_pickle(o, output_dict)

# Set a flag, visible to the client, saying that the reduce phase on this
# worker is done
mr_lib.set_flag("reduce_done", True)
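reduce.py calls reducer(key, values, params) once per intermediate key, and stores each returned (key, value) pair in the output dictionary under output_field, so reducers must return such pairs. A hypothetical wordcount reducer in that shape (the file name is illustrative; the matching mapper for the map_combine path would have emitted (word, count) pairs as intermediate data):

# wordcount.py (hypothetical MapReduce job file)

def reducer(key, values, params):
    # values holds every count emitted for this word across the cluster
    return (key, sum(values))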