Example #1
File: mr.py Project: mnielsen/mr_py
    def create_dict(self, name, keys):
        # Assign each key to a worker by hashing, then sort so groupby
        # sees each worker's keys contiguously.
        worker_key_pairs = sorted((hash(key) % self.size, key) for key in keys)
        for machine, local_group in itertools.groupby(worker_key_pairs, lambda x: x[0]):
            # Build the dictionary of keys destined for this worker.
            local_dict = {}
            for _, key in local_group:
                local_dict[key] = {}
            # Pickle the local dictionary and ship it to the worker.
            mr_lib.write_pickle(local_dict, name)
            self.send(name, machine)
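
For context, here is a minimal standalone sketch of the same hash-partitioning idea, outside the cluster; the three-worker size and the sample keys are made up for illustration:

import itertools

size = 3  # pretend cluster size
keys = ["apple", "banana", "cherry", "date"]

# Pair each key with the worker it hashes to; sorting makes each worker's
# keys contiguous, so groupby yields one group per worker.
pairs = sorted((hash(k) % size, k) for k in keys)
for machine, group in itertools.groupby(pairs, lambda x: x[0]):
    print machine, [k for _, k in group]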
Example #2
File: mr.py Project: mnielsen/mr_py
    def map(self, filename, input_dict, mapper_params):
        # Distribute the file defining the mapper function through the cluster,
        # as well as the corresponding parameters.
        self.distribute_public(filename)
        mr_lib.write_pickle(mapper_params, "mapper_params.mr")
        self.distribute_public("mapper_params.mr")

        # Launch the map phase, and wait until it's done.
        self.exec_public("python map.py " + filename + " " + input_dict)
        self.wait_until_task_done("map_done")
Example #3
File: mr.py Project: mnielsen/mr_py
    def mr(self, filename, mapper_params, reducer_params, input_dict, output_dict, output_field):
        # Distribute the file defining the mapper and reducer functions through
        # the cluster, as well as the corresponding parameters.
        self.distribute_public(filename)
        mr_lib.write_pickle([mapper_params, reducer_params], "params.mr")
        self.distribute_public("params.mr")

        # Launch the map phase, with combining, and wait until it's done
        self.exec_public("python map_combine.py " + filename + " " + input_dict)
        self.wait_until_task_done("map_combine_done")

        # Launch the reduce phase, and wait until it's done
        self.exec_public("python reduce.py " + filename + " " + output_dict + " " + output_field)
        self.wait_until_task_done("reduce_done")
Example #4
File: mr.py Project: mnielsen/mr_py
    def __init__(self, n):

        # set the size attribute
        self.size = n

        # set up an EC2 connection, and grab an Ubuntu 8.04 image (http://alestic.com)
        connection = boto.connect_ec2()
        image = connection.get_image("ami-1c5db975")

        # create a keypair to use with the image, save to disk, and set permissions
        # so ssh will be happy
        self.keypair = connection.create_key_pair("mr_keypair")
        mr_lib.write_file(self.keypair.material, mr_lib.mr_keypair_filename())
        os.system("chmod 600 " + mr_lib.mr_keypair_filename())

        # tell EC2 to start the instances running, set the self.workers attribute to the
        # corresponding reservation, and wait for all the workers to start running
        self.workers = image.run(n, n, "mr_keypair")
        for instance in self.workers.instances:
            instance.update()
            while instance.state != u"running":
                instance.update()
                time.sleep(5)

        # Delay before we start distributing files, so all instances are running properly.
        time.sleep(10)

        # distribute a list of all the private ip addresses
        private_ip_list = [instance.private_dns_name for instance in self.workers.instances]
        mr_lib.write_pickle(private_ip_list, "cluster_description.mr")
        self.distribute_public("cluster_description.mr")
        for j in xrange(n):
            mr_lib.write_pickle([j, self.workers.instances[j].private_dns_name], "my_details.mr")
            self.send("my_details.mr", j)

        # distribute the files necessary to run map and mapreduce jobs
        self.distribute_public("map.py")
        self.distribute_public("map_combine.py")
        self.distribute_public("reduce.py")
        self.distribute_public("mr_lib.py")

        # Distribute the ssh keypairs and config file
        for instance in self.workers.instances:
            mr_lib.scp(mr_lib.mr_keypair_filename(), "root@" + instance.public_dns_name + ":.ssh/id_rsa-mr_keypair")
            mr_lib.ssh("root@" + instance.public_dns_name, "chmod 600 /root/.ssh/id_rsa-mr_keypair")
            mr_lib.scp(os.environ.get("HOME") + "/.ssh/config", "root@" + instance.public_dns_name + ":.ssh/config")
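
A hypothetical client session, assuming the class above is named Cluster (its actual name does not appear in these snippets), that boto can find EC2 credentials in the environment, and that input.mr and output.mr were created on the workers beforehand (e.g. with create_dict above):

# Illustrative only: start a four-worker cluster and run a full MapReduce job.
cluster = Cluster(4)
cluster.mr("wordcount.py", {}, {}, "input.mr", "output.mr", "counts")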
Example #5
File: map.py Project: mnielsen/mr_py
# map.py
#
# Part of the mr.py library.  Run on worker machines by mr.py, when a
# map-only job is called.

import itertools, mr_lib, sys

mr_lib.set_flag("map_done",False) # map phase on this worker not yet done
filename,input_dict = sys.argv[1:]
exec("from "+filename+" import mapper") # import map job
i = mr_lib.read_pickle(input_dict) # Read the input dictionary
mapper_params = mr_lib.read_pickle("mapper_params.mr") # Get the parameters for the mapper
for k in i.keys(): i[k] = mapper(k,i[k],mapper_params) # Run the mapper
mr_lib.write_pickle(i,input_dict) # Write the input dictionary back out
mr_lib.set_flag("map_done",True) # map phase is done

Example #6
File: reduce.py Project: mnielsen/mr_py
import itertools, mr_lib, os, sys

filename, output_dict, output_field = sys.argv[1:]

# Read the cluster description and this worker's details, both distributed
# to every worker by mr.py at cluster startup.
ip = mr_lib.read_pickle("cluster_description.mr")
my_number, my_ip = mr_lib.read_pickle("my_details.mr")

# Get the parameters for the MapReduce job
mapper_params,reducer_params = mr_lib.read_pickle("params.mr")

# Import the reducer from the job module, stripping the ".py" extension
module = filename[:-3]
exec("from " + module + " import reducer")

# Read in all the intermediate data destined for this worker, one file per
# machine that produced any
intermediate = []
for machine in xrange(len(ip)):
  name = "inter.dict." + str(machine) + "." + str(my_number)
  if os.path.exists(name): intermediate.extend(mr_lib.read_pickle(name))

# Group the intermediate (key, value) pairs by key, then apply the reducer
# to each group
groups = {}
for key, group in itertools.groupby(sorted(intermediate), lambda x: x[0]):
  groups[key] = [y for x, y in group]
result = [reducer(inter_key, groups[inter_key], reducer_params) for inter_key in groups]

# Load the existing output dictionary, modify it to include the results
# obtained in the reduce step, and save the resulting file.
o = mr_lib.read_pickle(output_dict)
for key, value in result: o[key][output_field] = value
mr_lib.write_pickle(o, output_dict)

# Set a flag, visible to the client, saying that the reduce phase on this
# worker is done.
mr_lib.set_flag("reduce_done", True)