Example #1
import os
import subprocess
import sys


# dfs_mkdir, dfs_get_local_paths, and SCRIPT_DIR are provided by the
# surrounding project.
def generate_synthetic_click_logs(directory, num_input_disks, data_size,
                                  max_clicks):

    # Create the directory on DFS if it doesn't exist
    dfs_mkdir(directory, True)

    # Get local paths corresponding to DFS directory for the first
    # num_input_disks disks
    local_paths = dfs_get_local_paths(directory)[:num_input_disks]

    local_files = []

    # Make a path for a local file on each disk storing click logs, dividing
    # the number of bytes to generate over those files evenly

    for local_path in local_paths:
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        local_file = os.path.join(local_path, "click_logs")
        local_files.append([local_file, data_size // num_input_disks])

    # Add the extra few bytes arbitrarily to the first local file
    local_files[0][1] += data_size % num_input_disks

    generator_path = os.path.abspath(
        os.path.join(SCRIPT_DIR, os.pardir, os.pardir,
                     "gen_synthetic_click_logs", "gen_synthetic_click_logs"))

    if not os.path.exists(generator_path):
        sys.exit(
            "Can't find '%s'. It's possible you didn't build it, or "
            "haven't symlinked it if you're doing an out-of-source build" %
            (generator_path))

    cmd_template = "%s %%(output_file)s %d %%(num_bytes)s" % (generator_path,
                                                              max_clicks)

    pending_cmds = []

    for (filename, num_bytes) in local_files:
        command = cmd_template % {
            "output_file": filename,
            "num_bytes": num_bytes
        }

        pending_cmds.append(subprocess.Popen(command, shell=True))

    returncode = 0

    for pending_cmd in pending_cmds:
        pending_cmd.communicate()

        returncode = max(pending_cmd.returncode, returncode)

    return returncode
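
A minimal sketch of how this helper might be driven. The directory name, disk count, and sizes below are hypothetical; the function itself is the one defined above.

import sys

# Hypothetical invocation: spread 1 GiB of synthetic click logs over 4
# disks, capping each record at 1000 clicks. All argument values are
# illustrative.
rc = generate_synthetic_click_logs("/click_logs", 4, 1024 ** 3, 1000)
if rc != 0:
    sys.exit("click log generation failed with exit code %d" % rc)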
Example #2
import os
import struct


# dfs_mkdir and dfs_get_local_paths are provided by the surrounding project.
def generate_pagerank_initial_tuples(directory, num_input_disks):
    # Create directory on DFS (recursively creating sub-directories) if it
    # doesn't already exist
    dfs_mkdir(directory, True)

    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(directory)[:num_input_disks]

    num_local_paths = len(local_paths)

    for (local_path_id, local_path) in enumerate(local_paths):
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        local_file = os.path.join(local_path, "input")

        with open(local_file, 'wb+') as fp:
            packed_tuple = struct.pack("IIQQ", 8, 8, local_path_id,
                                       num_local_paths)
            fp.write(packed_tuple)
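
Each file written above holds one fixed 24-byte record: two 4-byte unsigned lengths (both 8) followed by two 8-byte unsigned values. A small read-back sketch, assuming a file produced by the function above (the path is illustrative):

import struct

# Read the fixed-size record back with the same format string. In native
# order, "IIQQ" is 24 bytes: key length, value length, then the 8-byte
# disk id and the 8-byte total disk count.
with open("input", "rb") as fp:
    raw = fp.read(struct.calcsize("IIQQ"))

key_len, value_len, disk_id, num_disks = struct.unpack("IIQQ", raw)
assert key_len == 8 and value_len == 8
print "disk", disk_id, "of", num_disks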
Example #4
import os
import struct
import subprocess
import sys
import time


# dfs_mkdir, dfs_get_local_paths, and JAVA_HOME are provided by the
# surrounding project.
def generate_disk_path_tuples(directory):
    # Create directory on DFS (recursively creating sub-directories) if it
    # doesn't already exist
    dfs_mkdir(directory, True)

    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(directory)

    for local_path in local_paths:
        benchmark_files_path = os.path.join(local_path, "benchmark_files")

        if not os.path.exists(benchmark_files_path):
            os.makedirs(benchmark_files_path)

        benchmark_files_path_length = len(benchmark_files_path)

        local_file = os.path.join(local_path, "input")

        with open(local_file, 'wb+') as fp:
            packed_tuple = struct.pack("II%ds" % (benchmark_files_path_length),
                                       benchmark_files_path_length, 0,
                                       benchmark_files_path)
            fp.write(packed_tuple)
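
Here the record is variable-length: an 8-byte header holding the path length and a value length of zero, followed by the path bytes themselves. A matching read-side sketch, under the same assumptions as above:

import struct

# Read one variable-length record: two 4-byte lengths, then key_len bytes
# of path. The function above always writes a value length of 0.
with open("input", "rb") as fp:
    key_len, value_len = struct.unpack("II", fp.read(8))
    path = fp.read(key_len)

assert value_len == 0
print "benchmark files live in", path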
def generate_cloudBurst_input(
        num_splits, num_input_disks, file_offset_index,
        reference_file_path, reference_file_total_sequences, query_file_path,
        query_file_total_sequences, output_file_path, host_input_directory,
        executable_path):
    """
      Generate binary coded input files for cloudburst MapReduce job. The
      script takes raw sequence file as input (in FASTA FORMAT) and split
      and binary code it and copies file to intermediate directory.

      ConvertFastaForThemis.jar splits and binary code raw sequence file.
      Jar file (split n binary code process) is executed on master node,
      with file_offset_index = 0. Rest of experiment-nodes sleep till file
      is being splitted. Once split process is complete each host copies
      some subset of intermediate file to local directory.
    """
    print "exec path", executable_path, "num_splits", num_splits, \
        "output_file_path", output_file_path, "num_input_disks", \
        num_input_disks, "file_offset_index", file_offset_index

    # Create the directory on DFS (recursively creating sub-directories) if
    # it doesn't already exist
    dfs_mkdir(host_input_directory, True)
    equal_size = 1

    dummy_file = os.path.join(output_file_path, "dummy")
    if not os.path.exists(dummy_file):
        # If the dummy file does not exist, either wait for it if you are a
        # slave node, or run the converter jar and create the dummy file if you
        # are the master node.

        if file_offset_index != 0:
            # Wait for the dummy file, which indicates that the master node
            # has finished the split process.
            while not os.path.exists(dummy_file):
                time.sleep(1)
        else:
            file_names = [(reference_file_path, reference_file_total_sequences),
                          (query_file_path, query_file_total_sequences)]
            for (file_path, seq_size) in file_names:
                output_filename = "output_" + os.path.basename(file_path)
                output_file = os.path.join(output_file_path, output_filename)
                print file_path, output_filename
                cloudburst_file_converter_hosts_cmd = (
                    "%s/bin/java -jar %s %s %s %s %s %s" % (
                        JAVA_HOME, executable_path, file_path, output_file,
                        equal_size, seq_size, num_splits))

                print cloudburst_file_converter_hosts_cmd
                running_cmd = subprocess.Popen(
                    cloudburst_file_converter_hosts_cmd,
                    universal_newlines=True,
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                (stdout, stderr) = running_cmd.communicate()

                if running_cmd.returncode != 0:
                    sys.exit("Command '%s' failed: %s" %
                             (cloudburst_file_converter_hosts_cmd, stderr))
            # Create the dummy file to signal workers that the split is done.
            open(dummy_file, 'w').close()
    # Get the local disks corresponding to that directory
    local_paths = dfs_get_local_paths(host_input_directory)[:num_input_disks]

    print local_paths

    for (local_path_id, local_path) in enumerate(local_paths):
        if os.path.exists(local_path):
            # scp the input files to this local disk
            file_index = local_path_id + num_input_disks * file_offset_index + 1
            # Copy all converter outputs whose names end in this host's
            # file index
            input_file = os.path.join(output_file_path,
                                      "output_*[a-zA-Z]" + str(file_index))
            cloudburst_scp_input_file_command = "scp %s %s/" % (
                input_file, local_path)
            print cloudburst_scp_input_file_command
            running_cmd = subprocess.Popen(cloudburst_scp_input_file_command,
                                           universal_newlines=True,
                                           shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            (stdout, stderr) = running_cmd.communicate()
            if running_cmd.returncode != 0:
                sys.exit("Command '%s' failed: %s" % (
                    cloudburst_scp_input_file_command, stderr))
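
A hedged sketch of how the master/worker protocol above might be driven. Every path, count, and role below is hypothetical.

# Hypothetical driver. The master (file_offset_index == 0) runs the
# converter jar and drops the dummy file; each worker passes its own
# nonzero offset and polls for the dummy file instead.
generate_cloudBurst_input(
    num_splits=64,
    num_input_disks=4,
    file_offset_index=0,                       # 0 on the master, >0 on workers
    reference_file_path="/data/reference.fa",  # hypothetical FASTA inputs
    reference_file_total_sequences=1000000,
    query_file_path="/data/query.fa",
    query_file_total_sequences=500000,
    output_file_path="/shared/cloudburst",     # shared intermediate directory
    host_input_directory="/cloudburst_input",
    executable_path="/opt/ConvertFastaForThemis.jar")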