Example #1
    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
        self._create_fs_tmp_bucket(bucket_name)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._gce_zone.lower(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._api_cluster_get(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google_errors.HttpError as e:
            if e.resp.status != 404:
                raise

            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_args()

            self._api_cluster_create(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id
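The comment above quotes the clusterName regex from an API error message. A quick self-contained check shows why IDs of the form mrjob-<zone>-<hex> satisfy it; the zone and hex suffix below are hypothetical stand-ins for self._gce_zone.lower() and random_identifier() (see the format test at the end of this page):

import re

# anchored version of the pattern quoted in the comment above; the
# trailing '.' in the quoted form is the error message's own period
CLUSTER_NAME_RE = re.compile(r'[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?$')

# hypothetical stand-ins: a lowercased GCE zone plus the 16-char hex
# string random_identifier() produces; 36 chars total, within the limit
cluster_id = '-'.join(['mrjob', 'us-central1-b', '0123456789abcdef'])

assert CLUSTER_NAME_RE.match(cluster_id)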
Example #2
    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        mrjob_buckets = self.fs.list_buckets(self._gcp_project,
                                             prefix='mrjob-')

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None
        gce_lower_location = self._gce_region.lower()
        for tmp_bucket in mrjob_buckets:
            tmp_bucket_name = tmp_bucket['name']

            # NOTE - GCP ambiguous Behavior - Bucket location is being
            # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
            # suggest lowercase
            lower_location = tmp_bucket['location'].lower()
            if lower_location == gce_lower_location:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', gce_lower_location,
                 random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name
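The NOTE above explains why both sides are lowercased before comparing. A minimal illustration of the mismatch, with hypothetical values shaped like what the API and mrjob's configuration would hold:

api_location = 'US-CENTRAL1'   # hypothetical: GCS reports uppercase
gce_region = 'us-central1'     # hypothetical: mrjob's configured region

assert api_location != gce_region                  # naive comparison fails
assert api_location.lower() == gce_region.lower()  # lowercasing fixes it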
Example #3
    def _get_tmpdir(self, given_tmpdir):
        """Helper for _fix_tmpdir"""
        if given_tmpdir:
            return given_tmpdir

        # Loop over buckets until we find one that matches region
        # NOTE - because this is a tmpdir, we look for a GCS bucket in the
        # same GCE region
        chosen_bucket_name = None

        # determine region for bucket
        region = self._region()

        for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
                prefix='mrjob-'):
            tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)

            # NOTE - GCP ambiguous Behavior - Bucket location is being
            # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
            # suggest lowercase. (As of Feb. 12, 2018, this is still true,
            # observed on google-cloud-sdk)
            if tmp_bucket.location.lower() == region:
                # Regions are both specified and match
                log.info("using existing temp bucket %s" % tmp_bucket_name)
                chosen_bucket_name = tmp_bucket_name
                break

        # Example default - "mrjob-us-central1-RANDOMHEX"
        if not chosen_bucket_name:
            chosen_bucket_name = '-'.join(
                ['mrjob', region, random_identifier()])

        return 'gs://%s/tmp/' % chosen_bucket_name
Example #4
    def _launch_cluster(self):
        """Create an empty cluster on Dataproc, and set self._cluster_id to
        its ID."""
        self.fs.mkdir(self._job_tmpdir)

        # clusterName must be a match of
        # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
        # as documented in an API error message
        # (not currently documented in the Dataproc docs)
        if not self._cluster_id:
            self._cluster_id = '-'.join(
                ['mrjob', self._region(),
                 random_identifier()])

        # Create the cluster if it's missing, otherwise join an existing one
        try:
            self._get_cluster(self._cluster_id)
            log.info('Adding job to existing cluster - %s' % self._cluster_id)
        except google.api_core.exceptions.NotFound:
            log.info('Creating Dataproc Hadoop cluster - %s' %
                     self._cluster_id)

            cluster_data = self._cluster_create_kwargs()
            self._create_cluster(cluster_data)

            self._wait_for_cluster_ready(self._cluster_id)

        self._set_up_ssh_tunnel()

        # keep track of when we launched our job
        self._dataproc_job_start = time.time()
        return self._cluster_id
Example #5
def _create_mrjob_role_with_attached_policy(conn, role_document, policy_arn):
    # create role
    role_name = 'mrjob-' + random_identifier()

    conn.create_role(role_name, json.dumps(role_document))
    _attach_role_policy(conn, role_name, policy_arn)

    return role_name
Example #6
    def _simulate_progress(self, mock_job):
        state = _job_state_name(mock_job.status.state)

        if state == 'SETUP_DONE':
            mock_job.status.state = _job_state_value('PENDING')
        elif state == 'PENDING':
            mock_job.status.state = _job_state_value('RUNNING')
            # for now, we just need this to be set
            mock_job.driver_output_resource_uri = (
                'gs://mock-bucket-%s/google-cloud-dataproc-metainfo/'
                'mock-cluster-id-%s/jobs/mock-job-%s/driveroutput' %
                (random_identifier(), random_identifier(),
                 random_identifier()))
        elif state == 'RUNNING':
            if self.mock_jobs_succeed:
                mock_job.status.state = _job_state_value('DONE')
            else:
                mock_job.status.state = _job_state_value('ERROR')
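Read as a state machine, each call advances the mock job one step; a sketch of the progression it implements:

# SETUP_DONE -> PENDING -> RUNNING -> DONE   (when mock_jobs_succeed)
# SETUP_DONE -> PENDING -> RUNNING -> ERROR  (otherwise)
# the fake driveroutput URI is filled in on the PENDING -> RUNNING step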
Example #7
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += [
            '%s!%s' % (master_addr, slave_addr) for slave_addr in slave_addrs
        ]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
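A hedged usage sketch; the output directory and command are illustrative, and it assumes a live EMRJobRunner with slave SSH access already enabled, per the docstring:

# hypothetical invocation: dump each node's core-site.xml under
# /tmp/mrboss-out/master/ and /tmp/mrboss-out/slave <addr>/
_run_on_all_nodes(runner, '/tmp/mrboss-out',
                  ['cat', '/etc/hadoop/conf/core-site.xml'])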
Example #8
    def test_read_large_bz2_file(self):
        # catch incorrect use of bz2 library (Issue #814)

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')

        # can't just repeat same value, because we need the file to be
        # compressed! 50000 lines is too few to catch the bug.
        with random_seed(0):
            for _ in range(100000):
                input_bz2.write((random_identifier() + '\n').encode('ascii'))
            input_bz2.close()

        # now expect to read back the same bytes
        with random_seed(0):
            num_lines = 0
            for line in read_file(input_bz2_path):
                self.assertEqual(line,
                                 (random_identifier() + '\n').encode('ascii'))
                num_lines += 1

            self.assertEqual(num_lines, 100000)
Example #9
def _create_mrjob_role_with_attached_policy(client, role_document, policy_arn):
    """Create a new role with a random name starting with ``mrjob-`` that
    has the given policy document and the given policy ARN attached.

    (Roles can have up to two policy ARNs attached, but we don't need this
    functionality.)
    """
    # create role
    role_name = 'mrjob-' + random_identifier()

    client.create_role(AssumeRolePolicyDocument=json.dumps(role_document),
                       RoleName=role_name)
    client.attach_role_policy(PolicyArn=policy_arn, RoleName=role_name)

    return role_name
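A hedged usage sketch, assuming boto3; the trust policy is a standard EC2 assume-role document, and the ARN is AWS's managed EMR-for-EC2 service-role policy (both are illustrative choices, not something this function requires):

import boto3

# hypothetical caller: create an instance-profile-style role for EMR
client = boto3.client('iam')
role_document = {
    'Version': '2012-10-17',
    'Statement': [{
        'Effect': 'Allow',
        'Principal': {'Service': 'ec2.amazonaws.com'},
        'Action': 'sts:AssumeRole',
    }],
}
policy_arn = ('arn:aws:iam::aws:policy/service-role/'
              'AmazonElasticMapReduceforEC2Role')
role_name = _create_mrjob_role_with_attached_policy(
    client, role_document, policy_arn)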
Example #10
    def _key_filename_for(self, addr):
        """If *addr* is a !-separated pair of hosts like ``master!slave``,
        get the name of the copy of our keypair file on ``master``. If there
        isn't one, pick a random name, and copy the key file there.

        Otherwise, return ``None``."""
        # don't need to copy a key if we're SSHing directly
        if '!' not in addr:
            return None

        host = addr.split('!')[0]

        if host not in self._host_to_key_filename:
            # copy the key if we haven't already
            keyfile = 'mrjob-%s.pem' % random_identifier()
            ssh_copy_key(self._ssh_bin, host, self._ec2_key_pair_file, keyfile)
            # don't set above; ssh_copy_key() may throw an IOError
            self._host_to_key_filename[host] = keyfile

        return self._host_to_key_filename[host]
Example #11
    def __init__(self, ssh_bin, ec2_key_pair_file):
        """
        :param ssh_bin: path to ``ssh`` binary
        :param ec2_key_pair_file: path to an SSH keyfile
        """
        super(SSHFilesystem, self).__init__()
        self._ssh_bin = ssh_bin
        self._ec2_key_pair_file = ec2_key_pair_file
        if self._ec2_key_pair_file is None:
            raise ValueError('ec2_key_pair_file must be a path')

        # use this name for all remote copies of the key pair file
        self._remote_key_pair_file = '.mrjob-%s.pem' % random_identifier()

        # keep track of hosts we've already copied the key pair to
        self._hosts_with_key_pair_file = set()

        # keep track of which hosts we've copied our key to, and
        # what the (random) name of the key file is on that host
        self._host_to_key_filename = {}

        # should we use sudo (for EMR)? Enable with use_sudo_over_ssh().
        self._sudo = False
Example #12
    def test_no_collisions_possible_ever(self):
        # heh
        with random_seed(0):
            self.assertNotEqual(random_identifier(), random_identifier())
Example #13
    def test_format(self):
        with random_seed(0):
            random_id = random_identifier()
        self.assertEqual(len(random_id), 16)
        self.assertFalse(set(random_id) - set('0123456789abcdef'))
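Together, these two tests pin down the contract: 16 lowercase hex characters, effectively collision-free. A minimal stand-in that satisfies both, not necessarily mrjob's actual implementation:

import random

def random_identifier_sketch():
    # 16 hex digits = 64 random bits; zero-padded so small values
    # still format to exactly 16 characters
    return '%016x' % random.randint(0, 2 ** 64 - 1)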