def __init__(self, job_name, server_names, server_id,
             max_number_dumped_items_shuffler=500000,
             simultaneous_files_in_redis=10):
    # configs
    configs = get_configs(self.__module__)
    self.base_dir = configs['base_dir']
    self.code_dir = configs['code_dir']
    self.working_dir = configs['working_dir']
    self.max_hash_cache_size = configs['max_hash_cache_size']
    self.redis_auth = configs['redis_auth']
    # variables
    self.job_name = job_name
    self.server_names = server_names
    self.my_server_id = server_id
    self.number_of_servers = len(self.server_names)
    self.max_number_dumped_items_shuffler = max_number_dumped_items_shuffler
    self.simultaneous_files_in_redis = simultaneous_files_in_redis
    # shared references
    self.redis_token = None
    self.pending_file_names = set()
    self.pending_file_name2server_name = {}
    self.processed_file_names = {}
    self.processed_server_names = set()
    self.hash_cache = {}
    self.files = {}
    self.file_names = {}
    self.have_seen_file_names = set()
    self.number_dumped_items = {}
    self.server_id2server_name = {}
    self.location2connection = {}
    # open one output file per destination server; servers that share a
    # LOCATION reuse a single Redis connection
    for server_name in self.server_names:
        # shadows the constructor argument, which was already saved as
        # self.my_server_id above
        server_id = self.server_names[server_name]['SERVER_ID']
        file_name, file = self._get_new_file_reference(server_id)
        self.files[server_id] = file
        self.file_names[server_id] = [file_name]
        self.number_dumped_items[server_id] = 0
        self.server_id2server_name[server_id] = server_name
        location = server_names[server_name]['LOCATION']
        if location not in self.location2connection:
            connection = redis.Redis(location, password=self.redis_auth)
            self.location2connection[location] = connection
    self.my_server_name = self.server_id2server_name[self.my_server_id]
    self.my_location = self.server_names[self.my_server_name]['LOCATION']
    self.connections = {}
    for server_name in self.server_names:
        location = self.server_names[server_name]['LOCATION']
        connection = self.location2connection[location]
        self.connections[server_name] = connection
        if server_name == self.my_server_name:
            self.my_connection = connection
    self._make_dir()
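# A minimal sketch (an assumption, not taken from the codebase) of the
# server_names mapping the constructor above indexes: each entry must carry
# a 'SERVER_ID' and a 'LOCATION' (the Redis host), and servers that share a
# LOCATION end up sharing one connection via location2connection. Host names
# and ids below are hypothetical placeholders.
EXAMPLE_SERVER_NAMES = {
    'server_0': {'SERVER_ID': 0, 'LOCATION': 'host-a.example.com'},
    'server_1': {'SERVER_ID': 1, 'LOCATION': 'host-a.example.com'},  # reuses host-a's connection
    'server_2': {'SERVER_ID': 2, 'LOCATION': 'host-b.example.com'},
}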
def __init__(self):
    # configs
    configs = get_configs(self.__module__)
    self.local_working_dir = configs['local_working_dir']
    self.job_submitter = os.path.expanduser('~')
    # shared references
    self.scheduler_token = None
def __init__(self):
    self.config_data = get_configs(self.__module__)
    working_dir = self.config_data['working_dir']
    self.working_file = working_dir + '/LIST_' + str(uuid.uuid4()) + '.data'
    self.file = open(self.working_file, 'w')
    self.mode = 'PUSH'
    self.len = 0
    self.current_item = None
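# Hypothetical illustration of the working-file naming scheme above: each
# instance gets its own LIST_<uuid4>.data file under working_dir, so
# concurrent list objects never collide on disk. The directory is a
# placeholder.
import uuid

example_working_file = '/tmp/working' + '/LIST_' + str(uuid.uuid4()) + '.data'
# e.g. '/tmp/working/LIST_1b4e28ba-2fa1-11d2-883f-0016d3cca427.data'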
def __init__(self):
    # configs
    configs = get_configs(self.__module__)
    self.local_working_dir = configs['local_working_dir']
    # shared references
    self.manage_cluster = ManageCluster()
    self.manage_cluster.start_cluster()
    self.file_transfer = FileTransfer()
    self.scheduler = Scheduler()
    self.finished_jobs = None
def __init__(self, reduce_function_name, project_name, job_name, server_id,
             max_number_dumped_items, disk_based_input=False,
             disk_based_output=False, auxiliary_data_name=None,
             compress=False):
    # configs
    configs = get_configs(self.__module__)
    self.base_dir = configs['base_dir']
    self.base_projects_dir = configs['base_projects_dir']
    self.auxiliary_dir = configs['auxiliary_dir']
    self.number_of_servers_per_location = configs[
        'number_of_servers_per_location']
    # variables
    # put this project's directory at the front of sys.path so its
    # reduce_functions module shadows any previously imported project's
    project_path = self.base_projects_dir + '/' + project_name
    if project_path in sys.path:
        sys.path.remove(project_path)
    sys.path.insert(0, project_path)
    import reduce_functions
    reload(reduce_functions)  # builtin on Python 2; use importlib.reload on Python 3
    from reduce_functions import ReduceFunctions
    self.reduce_function = ReduceFunctions(
        reduce_function_name).get_reduce_function()
    self.job_name = job_name
    self.server_id = server_id
    self.max_number_dumped_items = max_number_dumped_items
    self.disk_based_input = disk_based_input
    self.disk_based_output = disk_based_output
    self.compress = compress
    # shared references
    self.sorted_file = None
    self.output_file_name = None
    self.number_dumped_items = 0
    self.output_file = self._get_output_file()
    self.input_file_names = self._get_input_file_names()
    self.auxiliary_data = self._get_auxiliary_data(auxiliary_data_name)
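# A minimal sketch, assuming the interface implied by the import above: each
# project ships a reduce_functions.py whose ReduceFunctions class maps a
# function name to a callable and hands it back via get_reduce_function().
# The 'sum_values' name and the reducer body are hypothetical.
class ReduceFunctions(object):

    def __init__(self, reduce_function_name):
        self.reduce_functions = {
            'sum_values': self.sum_values,
        }
        self.reduce_function = self.reduce_functions[reduce_function_name]

    def get_reduce_function(self):
        return self.reduce_function

    @staticmethod
    def sum_values(key, values):
        # collapse every value observed for a key into a single pair
        return key, sum(values)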
def __init__(self):
    # configs
    configs = get_configs(self.__module__)
    self.working_dir = configs['working_dir']
    self.max_number_of_bytes_in_memory = configs[
        'max_number_of_bytes_in_memory']
    self.number_of_servers_per_location = configs[
        'number_of_servers_per_location']
    self.max_number_items = None
    # shared references
    self.number_items = 0
    self.current_list = []
    self.file_names = []
    self.item_sizes = []
    self.current_item = None
    self.write_mode = True
    self.finished = False
    self.file = None
def __init__(self, server_name, server_id, location, server_names):
    # who am i and where am i?
    self.server_name = server_name
    self.server_id = server_id
    self.location = location
    self.server_names = copy(server_names)
    # configs
    configs = get_configs(self.__module__)
    redis_auth = configs['redis_auth']
    self.redis = redis.Redis(self.location, password=redis_auth)
    # job/project
    self.project_name = None
    self.job_name = None
    self.input_dirs = None
    self.delete_job_data = None
    # mapper
    self.map_function_name = None
    self.auxiliary_data_name_mapper = None
    self.hold_state = None
    self.downsample = None
    # shuffler
    self.max_number_dumped_items_shuffler = None
    self.simultaneous_files_in_redis = None
    # reduce
    self.reduce_function_name = None
    self.auxiliary_data_name_reducer = None
    self.max_number_dumped_items_reducer = None
    self.disk_based_input = None
    self.disk_based_output = None
    self.compress = None
    # mapreduce
    self.mapper = None
    self.reducer = None
    # shared objects
    self.job = None
def __init__(self, map_function_name, project_name, input_dirs, server_id,
             job_name, server_names, hold_state=False, downsample=1.0,
             auxiliary_data_name=None,
             max_number_dumped_items_shuffler=500000,
             simultaneous_files_in_redis=10):
    # configs
    configs = get_configs(self.__module__)
    self.base_dir = configs['base_dir']
    self.base_projects_dir = configs['base_projects_dir']
    self.auxiliary_dir = configs['auxiliary_dir']
    self.number_of_servers_per_location = configs[
        'number_of_servers_per_location']
    # variables
    self.map_function_name = map_function_name
    self.project_name = project_name
    self.input_dirs = input_dirs
    self.server_id = server_id
    self.job_name = job_name
    self.hold_state = hold_state
    self.downsample = downsample
    self.max_number_dumped_items_shuffler = max_number_dumped_items_shuffler
    self.simultaneous_files_in_redis = simultaneous_files_in_redis
    # shared references
    self.map_function = None
    self.shuffler = Shuffle(job_name, server_names, server_id,
                            max_number_dumped_items_shuffler,
                            simultaneous_files_in_redis)
    self.state = self._read_state()
    self.file_names = self._get_file_names()
    self.auxiliary_data = self._get_auxiliary_data(auxiliary_data_name)
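# Hypothetical usage sketch; the enclosing class name (assumed here to be
# Map) and every argument value are illustrative, not taken from the source.
# mapper = Map(
#     map_function_name='tokenize_line',
#     project_name='word_count',
#     input_dirs=['/data/word_count/input'],
#     server_id=0,
#     job_name='word_count_job',
#     server_names=EXAMPLE_SERVER_NAMES,  # shape sketched after the shuffler constructor above
# )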
def __init__(self):
    # configs
    configs = get_configs(self.__module__)
    self.cluster_name = configs['cluster_name']
    self.locations = configs['locations']
    self.base_port = configs['base_port']
    self.management_port = configs['management_port']
    self.number_of_servers_per_location = configs[
        'number_of_servers_per_location']
    self.start_command = configs['start_command']
    self.kill_command = configs['kill_command']
    self.redis_restart_command = configs['redis_restart_command']
    self.management_start_command = configs['management_start_command']
    self.push_code_command = configs['push_code_command']
    self.create_configs_command = configs['create_configs_command']
    self.code_dir = configs['code_dir']
    self.base_dir = configs['base_dir']
    self.working_dir = configs['working_dir']
    self.base_projects_dir = configs['base_projects_dir']
    self.logs_dir = configs['logs_dir']
    self.auxiliary_dir = configs['auxiliary_dir']
    self.poll_every = configs['poll_every']
    self.redis_auth = configs['redis_auth']
    # project and job
    self.project_dir = None
    self.project_name = None
    self.job_name = None
    self.delete_job_data = None
    self.current_phase = None
    # other
    self.server_names = None
    self.connections = None
    self.management_connections = None
    self.processes = None
    self.redis = None
    self.management_redis = None
    self.exceptions = None
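# A minimal sketch (an assumption) of the dict get_configs(self.__module__)
# must return for the cluster manager above: the keys are exactly those read
# by the constructor, while every value is a hypothetical placeholder.
EXAMPLE_CLUSTER_CONFIGS = {
    'cluster_name': 'dev_cluster',
    'locations': ['host-a.example.com', 'host-b.example.com'],
    'base_port': 6379,
    'management_port': 6380,
    'number_of_servers_per_location': 4,
    'start_command': 'python start_server.py',
    'kill_command': 'pkill -f start_server.py',
    'redis_restart_command': 'service redis-server restart',
    'management_start_command': 'python start_management_server.py',
    'push_code_command': 'rsync -az /opt/mapreduce/code/',
    'create_configs_command': 'python create_configs.py',
    'code_dir': '/opt/mapreduce/code',
    'base_dir': '/opt/mapreduce',
    'working_dir': '/opt/mapreduce/working',
    'base_projects_dir': '/opt/mapreduce/projects',
    'logs_dir': '/opt/mapreduce/logs',
    'auxiliary_dir': '/opt/mapreduce/auxiliary',
    'poll_every': 5,
    'redis_auth': 'change-me',
}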