Example 1
  def MigrateCrossCloud(self, source_location, destination_location):
    """Method to copy data cross cloud using a distributed job on the cluster.

    Currently the only supported destination cloud is AWS.
    TODO(user): Add support for other destination clouds.

    Args:
      source_location: The source GCS path to migrate.
      destination_location: The destination S3 location.

    Returns:
      A dictionary with key 'success' and a boolean value indicating whether
      the data migration command succeeded.
    """
    cmd = self.DataprocGcloudCommand('jobs', 'submit', 'hadoop')
    if self.project is not None:
      cmd.flags['project'] = self.project
    cmd.flags['cluster'] = self.cluster_id
    cmd.flags['class'] = 'org.apache.hadoop.tools.DistCp'
    s3_access_key, s3_secret_key = aws_credentials.GetCredentials()
    cmd.flags['properties'] = 'fs.s3a.access.key=%s,fs.s3a.secret.key=%s' % (
        s3_access_key, s3_secret_key)
    cmd.additional_flags = ['--'] + [
        'gs://' + source_location, 's3a://' + destination_location
    ]
    _, _, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
    return {dpb_service.SUCCESS: retcode == 0}
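Roughly speaking, the method above assembles a gcloud dataproc jobs submit hadoop invocation that runs Hadoop DistCp from the gs:// source to the s3a:// destination. A minimal usage sketch follows; dataproc stands in for an initialized service instance exposing this method (not shown in the snippet), and the bucket paths are placeholders.

# Hypothetical usage sketch; 'dataproc' is assumed to be an initialized
# service object with the MigrateCrossCloud method shown above.
result = dataproc.MigrateCrossCloud(
    'my-gcs-bucket/terasort-input',    # placeholder path, becomes gs://...
    'my-s3-bucket/terasort-input')     # placeholder path, becomes s3a://...
if not result[dpb_service.SUCCESS]:
    raise RuntimeError('Cross-cloud copy failed.')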
Example 2
def GetConfiguration(driver_memory_mb: int,
                     worker_memory_mb: int,
                     worker_cores: int,
                     num_workers: int,
                     configure_s3: bool = False) -> Dict[str, str]:
    """Calculate Spark configuration. Shared between VMs and k8s."""
    conf = {
        SPARK_DRIVER_MEMORY: f'{driver_memory_mb}m',
        SPARK_WORKER_MEMORY: f'{worker_memory_mb}m',
        SPARK_WORKER_VCPUS: str(worker_cores),
        'spark.executor.instances': str(num_workers),
        # Tell Spark not to run the job unless it can schedule all workers;
        # running with fewer would silently degrade performance.
        'spark.scheduler.minRegisteredResourcesRatio': '1'
    }
    if configure_s3:
        # Configure S3A, Hadoop's S3 filesystem.
        aws_access_key, aws_secret_key = aws_credentials.GetCredentials()
        conf.update({
            # Use s3:// scheme to be consistent with EMR
            'spark.hadoop.fs.s3.impl':
            'org.apache.hadoop.fs.s3a.S3AFileSystem',
            'spark.hadoop.fs.s3a.access.key': aws_access_key,
            'spark.hadoop.fs.s3a.secret.key': aws_secret_key,
        })
    return conf
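As a sketch of how the returned mapping could be consumed, the helper below (hypothetical, not part of the snippet) serializes it into spark-defaults.conf syntax, which Spark reads as one whitespace-separated key/value pair per line. It assumes the module constants referenced by GetConfiguration are in scope, and the worker sizes are placeholder numbers.

from typing import Dict


def _RenderSparkDefaults(conf: Dict[str, str]) -> str:
    """Hypothetical helper: serialize the mapping as spark-defaults.conf lines."""
    # spark-defaults.conf expects one "key value" pair per line.
    return '\n'.join(f'{key} {value}' for key, value in sorted(conf.items()))


# Placeholder sizing: 4 workers with 4 vCPUs and ~12 GiB of Spark memory each.
spark_defaults = _RenderSparkDefaults(
    GetConfiguration(driver_memory_mb=12288,
                     worker_memory_mb=12288,
                     worker_cores=4,
                     num_workers=4))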
Example 3
    def MigrateCrossCloud(self,
                          source_location,
                          destination_location,
                          dest_cloud='AWS'):
        """Method to copy data cross cloud using a distributed job on the cluster.

    Currently the only supported destination cloud is AWS.
    TODO(user): Add support for other destination clouds.

    Args:
      source_location: The source GCS path to migrate.
      destination_location: The destination path.
      dest_cloud: The cloud to copy data to.

    Returns:
      A dictionary with key 'success' and boolean value set to the status of
      data migration command.
    """
        if dest_cloud == 'AWS':
            dest_prefix = 's3a://'
        else:
            raise ValueError('Unsupported destination cloud.')
        s3_access_key, s3_secret_key = aws_credentials.GetCredentials()
        return self.DistributedCopy('gs://' + source_location,
                                    dest_prefix + destination_location,
                                    properties={
                                        'fs.s3a.access.key': s3_access_key,
                                        'fs.s3a.secret.key': s3_secret_key,
                                    })
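A short usage sketch of this variant (dataproc again stands in for an initialized service instance, and the paths are placeholders): only dest_cloud='AWS' is accepted, and anything else raises ValueError before any copy is attempted.

# Hypothetical usage; 'dataproc' is assumed to expose the method above.
dataproc.MigrateCrossCloud('my-gcs-bucket/input', 'my-s3-bucket/input',
                           dest_cloud='AWS')    # copies gs://... to s3a://...
dataproc.MigrateCrossCloud('my-gcs-bucket/input', 'my-container/input',
                           dest_cloud='Azure')  # raises ValueError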
Example 4
def _RenderConfig(vm,
                  leader,
                  workers,
                  memory_fraction=SPARK_MEMORY_FRACTION,
                  configure_s3=False):
    """Load Spark Condfiguration on VM."""
    # Use first worker to get worker configuration
    worker = workers[0]
    worker_cores = worker.NumCpusForBenchmark()
    worker_memory_mb = int((worker.total_memory_kb / 1024) * memory_fraction)
    driver_memory_mb = int((leader.total_memory_kb / 1024) * memory_fraction)

    if vm.scratch_disks:
        # TODO(pclay): support multiple scratch disks. A current suboptimal
        # workaround is RAID0 local_ssds with --num_striped_disks.
        scratch_dir = posixpath.join(vm.GetScratchDir(), 'spark')
    else:
        scratch_dir = posixpath.join('/tmp/pkb/local_scratch', 'spark')

    aws_access_key = None
    aws_secret_key = None
    if configure_s3:
        aws_access_key, aws_secret_key = aws_credentials.GetCredentials()

    context = {
        'leader_ip': leader.internal_ip,
        'worker_ips': [vm.internal_ip for vm in workers],
        'scratch_dir': scratch_dir,
        'worker_vcpus': worker_cores,
        'spark_private_key': SPARK_PRIVATE_KEY,
        'worker_memory_mb': worker_memory_mb,
        'driver_memory_mb': driver_memory_mb,
        'hadoop_cmd': hadoop.HADOOP_CMD,
        'python_cmd': 'python3',
        'aws_access_key': aws_access_key,
        'aws_secret_key': aws_secret_key,
    }

    for file_name in DATA_FILES:
        file_path = data.ResourcePath(file_name)
        if file_name == 'spark/workers.j2':
            # Spark calls its worker list slaves.
            file_name = 'spark/slaves.j2'
        remote_path = posixpath.join(SPARK_CONF_DIR,
                                     os.path.basename(file_name))
        if file_name.endswith('.j2'):
            vm.RenderTemplate(file_path,
                              os.path.splitext(remote_path)[0], context)
        else:
            vm.RemoteCopy(file_path, remote_path)
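A worked example of the memory sizing above; the 32 GiB figure and the 0.9 fraction are placeholder assumptions (the real SPARK_MEMORY_FRACTION constant is defined elsewhere in the module).

# Placeholder values for illustration only.
total_memory_kb = 32 * 1024 * 1024                                  # 32 GiB VM
memory_fraction = 0.9                                               # assumed fraction
worker_memory_mb = int((total_memory_kb / 1024) * memory_fraction)
print(worker_memory_mb)  # 29491 -> passed to the template as worker_memory_mb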
Example 5
def _RenderConfig(vm,
                  master,
                  workers,
                  memory_fraction=YARN_MEMORY_FRACTION,
                  configure_s3=False):
    """Load Hadoop Condfiguration on VM."""
    # Use first worker to get worker configuration
    worker = workers[0]
    num_workers = len(workers)
    worker_cores = worker.NumCpusForBenchmark()
    yarn_memory_mb = int((vm.total_memory_kb / 1024) * memory_fraction)
    # Reserve 1 GB per worker for AppMaster containers.
    usable_memory_mb = yarn_memory_mb - 1024

    # YARN generally schedules based on memory (and ignores cores). We invert
    # this by calculating memory in terms of cores. This means that changing
    # machine memory will not change scheduling; it simply changes the memory
    # given to each task.
    maps_per_node = int(worker_cores * MAP_SLOTS_PER_CORE)
    map_memory_mb = usable_memory_mb // maps_per_node
    map_heap_mb = int(map_memory_mb * HEAP_MEMORY_RATIO)

    reduces_per_node = int(worker_cores * REDUCE_SLOTS_PER_CORE)
    reduce_memory_mb = usable_memory_mb // reduces_per_node
    reduce_heap_mb = int(reduce_memory_mb * HEAP_MEMORY_RATIO)

    # This property is only used for generating data like teragen.
    # Divide by 2 to avoid tiny files on large clusters.
    num_map_tasks = maps_per_node * num_workers
    # This determines the number of reduce tasks in Terasort, and it is
    # critical that it scales with the cluster.
    num_reduce_tasks = reduces_per_node * num_workers

    if vm.scratch_disks:
        # TODO(pclay): support multiple scratch disks. A current suboptimal
        # workaround is RAID0 local_ssds with --num_striped_disks.
        scratch_dir = posixpath.join(vm.GetScratchDir(), 'hadoop')
    else:
        scratch_dir = posixpath.join('/tmp/pkb/local_scratch', 'hadoop')

    aws_access_key = None
    aws_secret_key = None
    if configure_s3:
        aws_access_key, aws_secret_key = aws_credentials.GetCredentials()

    context = {
        'master_ip': master.internal_ip,
        'worker_ips': [vm.internal_ip for vm in workers],
        'scratch_dir': scratch_dir,
        'worker_vcpus': worker_cores,
        'hadoop_private_key': HADOOP_PRIVATE_KEY,
        'user': vm.user_name,
        'yarn_memory_mb': yarn_memory_mb,
        'map_memory_mb': map_memory_mb,
        'map_heap_mb': map_heap_mb,
        'num_map_tasks': num_map_tasks,
        'reduce_memory_mb': reduce_memory_mb,
        'reduce_heap_mb': reduce_heap_mb,
        'num_reduce_tasks': num_reduce_tasks,
        'aws_access_key': aws_access_key,
        'aws_secret_key': aws_secret_key,
    }

    for file_name in DATA_FILES:
        file_path = data.ResourcePath(file_name)
        if (file_name == 'hadoop/workers.j2'
                and FLAGS.hadoop_version.split('.')[0] < '3'):
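            # Hadoop 2 calls its worker list 'slaves'.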
            file_name = 'hadoop/slaves.j2'
        remote_path = posixpath.join(HADOOP_CONF_DIR,
                                     os.path.basename(file_name))
        if file_name.endswith('.j2'):
            vm.RenderTemplate(file_path,
                              os.path.splitext(remote_path)[0], context)
        else:
            vm.RemoteCopy(file_path, remote_path)
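A worked example of the slot and memory arithmetic above. The constants used here (a memory fraction of 0.8, 1.5 map slots and 0.75 reduce slots per core, a heap ratio of 0.8) are placeholder assumptions; the real YARN_MEMORY_FRACTION, MAP_SLOTS_PER_CORE, REDUCE_SLOTS_PER_CORE, and HEAP_MEMORY_RATIO values are defined elsewhere in the module.

# Placeholder constants for illustration only.
MAP_SLOTS_PER_CORE = 1.5
REDUCE_SLOTS_PER_CORE = 0.75
HEAP_MEMORY_RATIO = 0.8
memory_fraction = 0.8

worker_cores = 16
num_workers = 8
total_memory_kb = 64 * 1024 * 1024                                   # 64 GiB worker

yarn_memory_mb = int((total_memory_kb / 1024) * memory_fraction)     # 52428
usable_memory_mb = yarn_memory_mb - 1024                             # 51404

maps_per_node = int(worker_cores * MAP_SLOTS_PER_CORE)               # 24
map_memory_mb = usable_memory_mb // maps_per_node                    # 2141
map_heap_mb = int(map_memory_mb * HEAP_MEMORY_RATIO)                 # 1712

reduces_per_node = int(worker_cores * REDUCE_SLOTS_PER_CORE)         # 12
reduce_memory_mb = usable_memory_mb // reduces_per_node              # 4283
reduce_heap_mb = int(reduce_memory_mb * HEAP_MEMORY_RATIO)           # 3426

num_map_tasks = maps_per_node * num_workers                          # 192
num_reduce_tasks = reduces_per_node * num_workers                    # 96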