import os
import struct
import subprocess
import sys
import time

# dfs_mkdir, dfs_get_local_paths, SCRIPT_DIR, and JAVA_HOME are assumed to be
# defined elsewhere in this script.


def generate_synthetic_click_logs(directory, num_input_disks, data_size,
                                  max_clicks):
    # Create the directory on DFS if it doesn't exist
    dfs_mkdir(directory, True)

    # Get local paths corresponding to the DFS directory for the first
    # num_input_disks disks
    local_paths = dfs_get_local_paths(directory)[:num_input_disks]

    local_files = []

    # Make a path for a local file on each disk storing click logs, dividing
    # the number of bytes to generate evenly over those files
    for local_path in local_paths:
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        local_file = os.path.join(local_path, "click_logs")
        # Integer division; the remainder is handled below
        local_files.append([local_file, data_size // num_input_disks])

    # Add the extra few bytes arbitrarily to the first local file
    local_files[0][1] += data_size % num_input_disks

    generator_path = os.path.abspath(
        os.path.join(SCRIPT_DIR, os.pardir, os.pardir,
                     "gen_synthetic_click_logs", "gen_synthetic_click_logs"))

    if not os.path.exists(generator_path):
        sys.exit(
            "Can't find '%s'. It's possible you didn't build it, or haven't "
            "symlinked it if you're doing an out-of-source build" %
            (generator_path))

    # Template for "<generator> <output_file> <max_clicks> <num_bytes>"; the
    # %%(...)s escapes are filled in per-file below
    cmd_template = "%s %%(output_file)s %d %%(num_bytes)s" % (
        generator_path, max_clicks)

    pending_cmds = []

    # Launch one generator process per file so the disks are written in
    # parallel
    for (filename, num_bytes) in local_files:
        command = cmd_template % {
            "output_file": filename,
            "num_bytes": num_bytes
        }
        pending_cmds.append(subprocess.Popen(command, shell=True))

    # Wait for all generators, returning the worst (highest) return code
    returncode = 0
    for pending_cmd in pending_cmds:
        pending_cmd.communicate()
        returncode = max(pending_cmd.returncode, returncode)

    return returncode
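# A sketch of how this function might be invoked; the directory name and
# sizes below are hypothetical, and dfs_mkdir/dfs_get_local_paths must be
# supplied by the surrounding repo:
#
#   rc = generate_synthetic_click_logs(
#       "/click_logs_input",     # DFS directory (hypothetical name)
#       num_input_disks=8,       # spread the data over 8 local disks
#       data_size=10 * 2 ** 30,  # 10 GiB total, divided evenly per disk
#       max_clicks=1000)         # forwarded to the generator binary
#   if rc != 0:
#       sys.exit("click log generation failed")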
def generate_pagerank_initial_tuples(directory, num_input_disks):
    # Create directory on DFS (recursively creating sub-directories) if it
    # doesn't already exist
    dfs_mkdir(directory, True)

    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(directory)[:num_input_disks]
    num_local_paths = len(local_paths)

    for (local_path_id, local_path) in enumerate(local_paths):
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        local_file = os.path.join(local_path, "input")

        with open(local_file, 'wb+') as fp:
            packed_tuple = struct.pack(
                "IIQQ", 8, 8, local_path_id, num_local_paths)
            fp.write(packed_tuple)
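# For illustration, reading back the single tuple written above (a sketch,
# not part of the original script). "IIQQ" packs to 24 bytes with native
# alignment: two 4-byte lengths, then the 8-byte key (the disk's ID) and the
# 8-byte value (the total number of disks).
#
#   with open(local_file, 'rb') as fp:
#       key_len, value_len, key, value = struct.unpack("IIQQ", fp.read(24))
#       assert key_len == 8 and value_len == 8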
def generate_disk_path_tuples(directory):
    # Create directory on DFS (recursively creating sub-directories) if it
    # doesn't already exist
    dfs_mkdir(directory, True)

    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(directory)

    for local_path in local_paths:
        benchmark_files_path = os.path.join(local_path, "benchmark_files")

        if not os.path.exists(benchmark_files_path):
            os.makedirs(benchmark_files_path)

        benchmark_files_path_length = len(benchmark_files_path)

        local_file = os.path.join(local_path, "input")

        # Write a single tuple whose key is the benchmark_files path and
        # whose value is empty: a 4-byte key length, a 4-byte value length
        # (0), and then the path bytes themselves
        with open(local_file, 'wb+') as fp:
            packed_tuple = struct.pack(
                "II%ds" % (benchmark_files_path_length),
                benchmark_files_path_length, 0, benchmark_files_path)
            fp.write(packed_tuple)
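# For illustration, parsing one of these variable-length records back (a
# sketch): the two 4-byte integers give the key length and the (empty) value
# length, and the key bytes are the benchmark_files path itself.
#
#   with open(local_file, 'rb') as fp:
#       key_len, value_len = struct.unpack("II", fp.read(8))
#       path = fp.read(key_len)  # value_len is 0, so nothing follows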
def generate_cloudBurst_input(
        num_splits, num_input_disks, file_offset_index, reference_file_path,
        reference_file_total_sequences, query_file_path,
        query_file_total_sequences, output_file_path, host_input_directory,
        executable_path):
    """
    Generate binary-encoded input files for the CloudBurst MapReduce job.

    Takes raw sequence files (in FASTA format) as input, splits and
    binary-encodes them, and copies the results to an intermediate
    directory. ConvertFastaForThemis.jar performs the split-and-encode step.
    The jar is executed on the master node (the node with file_offset_index
    == 0); the remaining experiment nodes sleep until the split is done.
    Once the split is complete, each host copies its subset of the
    intermediate files to its local directories.
    """
    print "exec path", executable_path, "num_splits", str(num_splits),
    print "output_file_path", output_file_path, "num disks", num_input_disks,
    print "file_offset_index", file_offset_index

    # Create directory on DFS (recursively creating sub-directories) if it
    # doesn't already exist
    dfs_mkdir(host_input_directory, True)

    equal_size = 1
    dummy_file = os.path.join(output_file_path, "dummy")

    if not os.path.exists(dummy_file):
        # If the dummy file does not exist, either wait for it if you are a
        # slave node, or run the converter jar and create the dummy file if
        # you are the master node.
        if file_offset_index != 0:
            # Wait for the dummy file, whose existence indicates that the
            # master node finished the split process
            while not os.path.exists(dummy_file):
                time.sleep(1)
        else:
            file_names = [
                (reference_file_path, reference_file_total_sequences),
                (query_file_path, query_file_total_sequences)]

            for (file_path, seq_size) in file_names:
                output_filename = "output_" + os.path.basename(file_path)
                output_file = os.path.join(output_file_path, output_filename)
                print file_path, output_filename

                cloudburst_file_converter_hosts_cmd = (
                    "%s/bin/java -jar %s %s %s %s %s %s" % (
                        JAVA_HOME, executable_path, file_path, output_file,
                        equal_size, seq_size, num_splits))
                print cloudburst_file_converter_hosts_cmd

                running_cmd = subprocess.Popen(
                    cloudburst_file_converter_hosts_cmd,
                    universal_newlines=True, shell=True,
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                (stdout, stderr) = running_cmd.communicate()

                if running_cmd.returncode != 0:
                    sys.exit("Command '%s' failed: %s" % (
                        cloudburst_file_converter_hosts_cmd, stderr))

            # Create the dummy file to signal the slave nodes
            open(dummy_file, 'w').close()

    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(host_input_directory)[:num_input_disks]
    print local_paths

    for (local_path_id, local_path) in enumerate(local_paths):
        if os.path.exists(local_path):
            # Each host copies the intermediate files whose split index it
            # owns; the [a-zA-Z] in the glob forces a letter immediately
            # before the index, so index 1 doesn't also match files ending
            # in 11, 21, ...
            file_index = (local_path_id +
                          num_input_disks * file_offset_index + 1)
            input_file = os.path.join(
                output_file_path, "output_*[a-zA-Z]" + str(file_index))

            cloudburst_scp_input_file_command = "scp %s %s/" % (
                input_file, local_path)
            print cloudburst_scp_input_file_command

            running_cmd = subprocess.Popen(
                cloudburst_scp_input_file_command, universal_newlines=True,
                shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = running_cmd.communicate()

            if running_cmd.returncode != 0:
                sys.exit("Command '%s' failed: %s" % (
                    cloudburst_scp_input_file_command, stderr))
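# A sketch of a master-node invocation (file_offset_index == 0 runs the
# converter jar; every path, count, and directory name here is
# hypothetical):
#
#   generate_cloudBurst_input(
#       num_splits=25, num_input_disks=8, file_offset_index=0,
#       reference_file_path="/data/reference.fa",
#       reference_file_total_sequences=1200,
#       query_file_path="/data/query.fa",
#       query_file_total_sequences=3400,
#       output_file_path="/mnt/shared/cloudburst",
#       host_input_directory="/cloudburst_input",
#       executable_path="/opt/ConvertFastaForThemis.jar")
#
# Slave nodes would call the same function with their own file_offset_index
# (1, 2, ...) and block until the master creates the dummy file.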