Пример #1
0
    def __init__(self,
                 job_name,
                 server_names,
                 server_id,
                 max_number_dumped_items_shuffler=500000,
                 simultaneous_files_in_redis=10):
        """Prepare per-server file references and Redis connections.

        Args:
            job_name: stored as ``self.job_name``.
            server_names: mapping of server name to a record providing at
                least the ``'SERVER_ID'`` and ``'LOCATION'`` keys.
            server_id: id of the server this instance runs on; it must
                match one of the ``'SERVER_ID'`` values in ``server_names``.
            max_number_dumped_items_shuffler: stored unchanged.
            simultaneous_files_in_redis: stored unchanged.

        Raises:
            KeyError: if ``server_id`` is not among the ``'SERVER_ID'``
                values found in ``server_names``.
        """
        # configs
        settings = get_configs(self.__module__)
        self.base_dir = settings['base_dir']
        self.code_dir = settings['code_dir']
        self.working_dir = settings['working_dir']
        self.max_hash_cache_size = settings['max_hash_cache_size']
        self.redis_auth = settings['redis_auth']

        # variables
        self.job_name = job_name
        self.server_names = server_names
        self.my_server_id = server_id
        self.number_of_servers = len(self.server_names)
        self.max_number_dumped_items_shuffler = max_number_dumped_items_shuffler
        self.simultaneous_files_in_redis = simultaneous_files_in_redis

        # shared references
        self.redis_token = None
        self.pending_file_names = set()
        self.pending_file_name2server_name = {}
        self.processed_file_names = {}
        self.processed_server_names = set()
        self.hash_cache = {}
        self.files = {}
        self.file_names = {}
        self.have_seen_file_names = set()
        self.number_dumped_items = {}
        self.server_id2server_name = {}
        self.location2connection = {}

        for name, record in self.server_names.items():
            sid = record['SERVER_ID']
            first_file_name, handle = self._get_new_file_reference(sid)
            self.files[sid] = handle
            self.file_names[sid] = [first_file_name]
            self.number_dumped_items[sid] = 0
            self.server_id2server_name[sid] = name

            # One Redis client per distinct location, shared by every
            # server name that lives at that location.
            location = record['LOCATION']
            if location not in self.location2connection:
                self.location2connection[location] = redis.Redis(
                    location, password=self.redis_auth)

        self.my_server_name = self.server_id2server_name[self.my_server_id]
        self.my_location = self.server_names[self.my_server_name]['LOCATION']

        # Map every server name to the client of its location.
        self.connections = {
            name: self.location2connection[record['LOCATION']]
            for name, record in self.server_names.items()
        }
        self.my_connection = self.connections[self.my_server_name]

        self._make_dir()
Пример #2
0
    def __init__(self):
        """Read the local working directory and initialise empty state.

        ``job_submitter`` is set to the expansion of ``'~'`` as returned by
        ``os.path.expanduser``; ``scheduler_token`` starts out as ``None``.
        """
        # configs
        settings = get_configs(self.__module__)
        self.local_working_dir = settings['local_working_dir']
        home_dir = os.path.expanduser('~')
        self.job_submitter = home_dir

        # shared references
        self.scheduler_token = None
Пример #3
0
 def __init__(self):
     """Open a fresh, uniquely named backing file in the working dir.

     The file is named ``LIST_<uuid4>.data`` inside the configured
     ``working_dir`` and is opened for writing. The instance starts in
     ``'PUSH'`` mode with a length of zero and no current item.
     """
     self.config_data = get_configs(self.__module__)
     base = self.config_data['working_dir']
     unique_part = str(uuid.uuid4())
     self.working_file = ''.join([base, '/LIST_', unique_part, '.data'])
     # The handle is kept on the instance, so it is deliberately not
     # wrapped in a ``with`` block here.
     self.file = open(self.working_file, 'w')
     self.mode = 'PUSH'
     self.len = 0
     self.current_item = None
Пример #4
0
    def __init__(self):
        """Create the cluster manager, file transfer and scheduler helpers.

        A ``ManageCluster`` is created and its ``start_cluster()`` is called
        before the ``FileTransfer`` and ``Scheduler`` objects are built.
        ``finished_jobs`` starts out as ``None``.
        """
        # configs
        settings = get_configs(self.__module__)
        self.local_working_dir = settings['local_working_dir']

        # shared references
        cluster = ManageCluster()
        self.manage_cluster = cluster
        cluster.start_cluster()

        self.file_transfer = FileTransfer()
        self.scheduler = Scheduler()
        self.finished_jobs = None
Пример #5
0
    def __init__(self,
                 reduce_function_name,
                 project_name,
                 job_name,
                 server_id,
                 max_number_dumped_items,
                 disk_based_input=False,
                 disk_based_output=False,
                 auxiliary_data_name=None,
                 compress=False):
        """Load the project's reduce function and prepare input/output state.

        Side effects: the project directory
        (``base_projects_dir + '/' + project_name``) is moved to the front
        of ``sys.path`` and the ``reduce_functions`` module is imported and
        reloaded.

        Args:
            reduce_function_name: passed to ``ReduceFunctions(...)``; the
                result of its ``get_reduce_function()`` is stored as
                ``self.reduce_function``.
            project_name: name of the project directory under
                ``base_projects_dir``.
            job_name: stored as ``self.job_name``.
            server_id: stored as ``self.server_id``.
            max_number_dumped_items: stored unchanged.
            disk_based_input: stored unchanged.
            disk_based_output: stored unchanged.
            auxiliary_data_name: forwarded to ``self._get_auxiliary_data``.
            compress: stored unchanged.
        """
        # ``reload`` is a builtin only on Python 2; on Python 3 it lives in
        # ``importlib``. Resolve it before touching ``sys.path`` so the
        # lookup cannot be shadowed by anything in the project directory.
        try:
            from importlib import reload as reload_module
        except ImportError:
            reload_module = reload  # noqa: F821 - Python 2 builtin

        # configs
        configs = get_configs(self.__module__)
        self.base_dir = configs['base_dir']
        self.base_projects_dir = configs['base_projects_dir']
        self.auxiliary_dir = configs['auxiliary_dir']
        self.number_of_servers_per_location = configs[
            'number_of_servers_per_location']

        # variables
        # Make sure the project path appears exactly once, at the front.
        project_path = self.base_projects_dir + '/' + project_name
        if project_path in sys.path:
            sys.path.remove(project_path)

        sys.path.insert(0, project_path)

        # NOTE(review): the reload presumably refreshes a
        # ``reduce_functions`` module imported earlier for another
        # project - confirm against the callers.
        import reduce_functions
        reload_module(reduce_functions)
        from reduce_functions import ReduceFunctions
        self.reduce_function = ReduceFunctions(
            reduce_function_name).get_reduce_function()
        self.job_name = job_name
        self.server_id = server_id
        self.max_number_dumped_items = max_number_dumped_items
        self.disk_based_input = disk_based_input
        self.disk_based_output = disk_based_output
        self.compress = compress

        # shared references
        self.sorted_file = None
        self.output_file_name = None
        self.number_dumped_items = 0
        self.output_file = self._get_output_file()
        self.input_file_names = self._get_input_file_names()
        self.auxiliary_data = self._get_auxiliary_data(auxiliary_data_name)
Пример #6
0
    def __init__(self):
        """Read size limits from the config and reset all bookkeeping.

        The instance starts empty, in write mode, not finished, and with
        no open file. ``max_number_items`` is left as ``None``.
        """
        # configs
        settings = get_configs(self.__module__)
        self.working_dir = settings['working_dir']
        byte_limit = settings['max_number_of_bytes_in_memory']
        self.max_number_of_bytes_in_memory = byte_limit
        servers_per_location = settings['number_of_servers_per_location']
        self.number_of_servers_per_location = servers_per_location
        self.max_number_items = None

        # shared references
        self.number_items = 0
        # Three independent lists - they must not share one object.
        self.current_list = list()
        self.file_names = list()
        self.item_sizes = list()
        self.current_item = None
        self.write_mode = True
        self.finished = False
        self.file = None
Пример #7
0
    def __init__(self, server_name, server_id, location, server_names):
        """Record this server's identity and open its Redis connection.

        Args:
            server_name: stored as ``self.server_name``.
            server_id: stored as ``self.server_id``.
            location: stored as ``self.location`` and passed as the first
                argument to ``redis.Redis``.
            server_names: a shallow copy (``copy(server_names)``) is stored
                as ``self.server_names``.

        Every job, mapper, shuffler and reducer setting starts as ``None``.
        """
        # who am i and where am i?
        self.server_name = server_name
        self.server_id = server_id
        self.location = location
        self.server_names = copy(server_names)

        # configs
        password = get_configs(self.__module__)['redis_auth']
        self.redis = redis.Redis(self.location, password=password)

        # job/project
        self.project_name = self.job_name = None
        self.input_dirs = self.delete_job_data = None

        # mapper
        self.map_function_name = self.auxiliary_data_name_mapper = None
        self.hold_state = self.downsample = None

        # shuffler
        self.max_number_dumped_items_shuffler = None
        self.simultaneous_files_in_redis = None

        # reduce
        self.reduce_function_name = self.auxiliary_data_name_reducer = None
        self.max_number_dumped_items_reducer = None
        self.disk_based_input = self.disk_based_output = None
        self.compress = None

        # mapreduce
        self.mapper = self.reducer = None

        # shared objects
        self.job = None
Пример #8
0
    def __init__(self,
                 map_function_name,
                 project_name,
                 input_dirs,
                 server_id,
                 job_name,
                 server_names,
                 hold_state=False,
                 downsample=1.0,
                 auxiliary_data_name=None,
                 max_number_dumped_items_shuffler=500000,
                 simultaneous_files_in_redis=10):
        """Store the map job parameters and build the helper objects.

        Args:
            map_function_name: stored unchanged.
            project_name: stored unchanged.
            input_dirs: stored unchanged.
            server_id: stored and forwarded to ``Shuffle``.
            job_name: stored and forwarded to ``Shuffle``.
            server_names: forwarded to ``Shuffle``; not stored here.
            hold_state: stored unchanged.
            downsample: stored unchanged.
            auxiliary_data_name: forwarded to ``self._get_auxiliary_data``.
            max_number_dumped_items_shuffler: stored and forwarded to
                ``Shuffle``.
            simultaneous_files_in_redis: stored and forwarded to
                ``Shuffle``.
        """
        # configs
        settings = get_configs(self.__module__)
        self.base_dir = settings['base_dir']
        self.base_projects_dir = settings['base_projects_dir']
        self.auxiliary_dir = settings['auxiliary_dir']
        servers_per_location = settings['number_of_servers_per_location']
        self.number_of_servers_per_location = servers_per_location

        # variables
        self.map_function_name = map_function_name
        self.project_name = project_name
        self.input_dirs = input_dirs
        self.server_id = server_id
        self.job_name = job_name
        self.hold_state = hold_state
        self.downsample = downsample
        self.max_number_dumped_items_shuffler = max_number_dumped_items_shuffler
        self.simultaneous_files_in_redis = simultaneous_files_in_redis

        # shared references
        self.map_function = None
        shuffle_args = (job_name, server_names, server_id,
                        max_number_dumped_items_shuffler,
                        simultaneous_files_in_redis)
        self.shuffler = Shuffle(*shuffle_args)
        # The helpers below run after all plain attributes are set; keep
        # this order in case they read those attributes.
        self.state = self._read_state()
        self.file_names = self._get_file_names()
        self.auxiliary_data = self._get_auxiliary_data(auxiliary_data_name)
Пример #9
0
    def __init__(self):
        """Copy the cluster settings onto the instance and reset state.

        Each config key listed below becomes an attribute of the same
        name. All project, job and connection attributes start as ``None``.
        """
        # configs
        settings = get_configs(self.__module__)
        config_keys = (
            'cluster_name',
            'locations',
            'base_port',
            'management_port',
            'number_of_servers_per_location',
            'start_command',
            'kill_command',
            'redis_restart_command',
            'management_start_command',
            'push_code_command',
            'create_configs_command',
            'code_dir',
            'base_dir',
            'working_dir',
            'base_projects_dir',
            'logs_dir',
            'auxiliary_dir',
            'poll_every',
            'redis_auth',
        )
        for key in config_keys:
            setattr(self, key, settings[key])

        # project and job
        self.project_dir = self.project_name = self.job_name = None
        self.delete_job_data = self.current_phase = None

        # other
        self.server_names = self.connections = None
        self.management_connections = self.processes = None
        self.redis = self.management_redis = None
        self.exceptions = None