def _launch_cluster(self):
    """Create an empty cluster on Dataproc, and set self._cluster_id to
    its ID."""
    bucket_name, _ = parse_gcs_uri(self._job_tmpdir)
    self._create_fs_tmp_bucket(bucket_name)

    # clusterName must be a match of
    # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
    # as documented in an API error message
    # (not currently documented in the Dataproc docs)
    if not self._cluster_id:
        self._cluster_id = '-'.join(
            ['mrjob', self._gce_zone.lower(), random_identifier()])

    # Create the cluster if it's missing, otherwise join an existing one
    try:
        self._api_cluster_get(self._cluster_id)
        log.info('Adding job to existing cluster - %s' % self._cluster_id)
    except google_errors.HttpError as e:
        if not e.resp.status == 404:
            raise

        log.info(
            'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

        cluster_data = self._cluster_create_args()

        self._api_cluster_create(cluster_data)

        self._wait_for_cluster_ready(self._cluster_id)

    # keep track of when we launched our job
    self._dataproc_job_start = time.time()

    return self._cluster_id


def _get_tmpdir(self, given_tmpdir):
    """Helper for _fix_tmpdir"""
    if given_tmpdir:
        return given_tmpdir

    mrjob_buckets = self.fs.list_buckets(self._gcp_project, prefix='mrjob-')

    # Loop over buckets until we find one that matches region
    # NOTE - because this is a tmpdir, we look for a GCS bucket in the
    # same GCE region
    chosen_bucket_name = None
    gce_lower_location = self._gce_region.lower()
    for tmp_bucket in mrjob_buckets:
        tmp_bucket_name = tmp_bucket['name']

        # NOTE - GCP ambiguous Behavior - Bucket location is being
        # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
        # suggest lowercase
        lower_location = tmp_bucket['location'].lower()
        if lower_location == gce_lower_location:
            # Regions are both specified and match
            log.info("using existing temp bucket %s" % tmp_bucket_name)
            chosen_bucket_name = tmp_bucket_name
            break

    # Example default - "mrjob-us-central1-RANDOMHEX"
    if not chosen_bucket_name:
        chosen_bucket_name = '-'.join(
            ['mrjob', gce_lower_location, random_identifier()])

    return 'gs://%s/tmp/' % chosen_bucket_name


def _get_tmpdir(self, given_tmpdir):
    """Helper for _fix_tmpdir"""
    if given_tmpdir:
        return given_tmpdir

    # Loop over buckets until we find one that matches region
    # NOTE - because this is a tmpdir, we look for a GCS bucket in the
    # same GCE region
    chosen_bucket_name = None

    # determine region for bucket
    region = self._region()

    for tmp_bucket_name in self.fs.gcs.get_all_bucket_names(
            prefix='mrjob-'):
        tmp_bucket = self.fs.gcs.get_bucket(tmp_bucket_name)

        # NOTE - GCP ambiguous Behavior - Bucket location is being
        # returned as UPPERCASE, ticket filed as of Apr 23, 2016 as docs
        # suggest lowercase. (As of Feb. 12, 2018, this is still true,
        # observed on google-cloud-sdk)
        if tmp_bucket.location.lower() == region:
            # Regions are both specified and match
            log.info("using existing temp bucket %s" % tmp_bucket_name)
            chosen_bucket_name = tmp_bucket_name
            break

    # Example default - "mrjob-us-central1-RANDOMHEX"
    if not chosen_bucket_name:
        chosen_bucket_name = '-'.join(
            ['mrjob', region, random_identifier()])

    return 'gs://%s/tmp/' % chosen_bucket_name


def _launch_cluster(self):
    """Create an empty cluster on Dataproc, and set self._cluster_id to
    its ID."""
    self.fs.mkdir(self._job_tmpdir)

    # clusterName must be a match of
    # regex '(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?).'
    # as documented in an API error message
    # (not currently documented in the Dataproc docs)
    if not self._cluster_id:
        self._cluster_id = '-'.join(
            ['mrjob', self._region(), random_identifier()])

    # Create the cluster if it's missing, otherwise join an existing one
    try:
        self._get_cluster(self._cluster_id)
        log.info('Adding job to existing cluster - %s' % self._cluster_id)
    except google.api_core.exceptions.NotFound:
        log.info(
            'Creating Dataproc Hadoop cluster - %s' % self._cluster_id)

        cluster_data = self._cluster_create_kwargs()

        self._create_cluster(cluster_data)

        self._wait_for_cluster_ready(self._cluster_id)

    self._set_up_ssh_tunnel()

    # keep track of when we launched our job
    self._dataproc_job_start = time.time()

    return self._cluster_id


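# Hypothetical sanity check, not part of mrjob: verify that a generated
# cluster ID satisfies the clusterName regex quoted in the comments above.
# Assumes random_identifier() returns lowercase hex and that the region or
# zone is a lowercase string like 'us-central1'.
import re

_CLUSTER_NAME_RE = re.compile(r'(?:[a-z](?:[-a-z0-9]{0,53}[a-z0-9])?)')


def _check_cluster_name(cluster_id):
    if not _CLUSTER_NAME_RE.fullmatch(cluster_id):
        raise ValueError('invalid Dataproc cluster name: %r' % cluster_id)

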
def _create_mrjob_role_with_attached_policy(conn, role_document, policy_arn):
    # create role
    role_name = 'mrjob-' + random_identifier()
    conn.create_role(role_name, json.dumps(role_document))

    _attach_role_policy(conn, role_name, policy_arn)

    return role_name


def _simulate_progress(self, mock_job):
    state = _job_state_name(mock_job.status.state)

    if state == 'SETUP_DONE':
        mock_job.status.state = _job_state_value('PENDING')
    elif state == 'PENDING':
        mock_job.status.state = _job_state_value('RUNNING')

        # for now, we just need this to be set
        mock_job.driver_output_resource_uri = (
            'gs://mock-bucket-%s/google-cloud-dataproc-metainfo/'
            'mock-cluster-id-%s/jobs/mock-job-%s/driveroutput' % (
                random_identifier(), random_identifier(),
                random_identifier()))
    elif state == 'RUNNING':
        if self.mock_jobs_succeed:
            mock_job.status.state = _job_state_value('DONE')
        else:
            mock_job.status.state = _job_state_value('ERROR')


def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr
    of each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()`
    on the runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)


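# Hypothetical usage sketch for _run_on_all_nodes() above; the cluster ID,
# output directory, and file path are illustrative, not from the original
# source. Assumes an already-running EMR cluster and valid AWS credentials.
from mrjob.emr import EMRJobRunner

runner = EMRJobRunner(cluster_id='j-CLUSTERID')
runner._enable_slave_ssh_access()  # per the docstring above

_run_on_all_nodes(runner, '/tmp/mrboss-output',
                  ['cat', '/home/hadoop/conf/core-site.xml'])

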
def test_read_large_bz2_file(self):
    # catch incorrect use of bz2 library (Issue #814)
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    with random_seed(0):
        for _ in range(100000):
            input_bz2.write((random_identifier() + '\n').encode('ascii'))

    input_bz2.close()

    # now expect to read back the same bytes
    with random_seed(0):
        num_lines = 0
        for line in read_file(input_bz2_path):
            self.assertEqual(line,
                             (random_identifier() + '\n').encode('ascii'))
            num_lines += 1

        self.assertEqual(num_lines, 100000)


def _create_mrjob_role_with_attached_policy(client, role_document, policy_arn):
    """Create a new role with a random name starting with ``mrjob-`` that
    has the given policy document and the given policy ARN attached.

    (Roles can have up to two policy ARNs attached, but we don't need this
    functionality.)
    """
    # create role
    role_name = 'mrjob-' + random_identifier()
    client.create_role(AssumeRolePolicyDocument=json.dumps(role_document),
                       RoleName=role_name)

    client.attach_role_policy(PolicyArn=policy_arn, RoleName=role_name)

    return role_name


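# Hypothetical usage sketch for the boto3 version above; the trust policy
# and managed policy ARN are illustrative (AWS's managed EMR-for-EC2 policy),
# not necessarily what the surrounding code actually passes in.
import boto3

_EC2_TRUST_POLICY = {
    'Version': '2008-10-17',
    'Statement': [{
        'Effect': 'Allow',
        'Principal': {'Service': 'ec2.amazonaws.com'},
        'Action': 'sts:AssumeRole',
    }],
}

iam = boto3.client('iam')
role_name = _create_mrjob_role_with_attached_policy(
    iam, _EC2_TRUST_POLICY,
    'arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role')

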
def _key_filename_for(self, addr):
    """If *addr* is a !-separated pair of hosts like ``master!slave``,
    get the name of the copy of our keypair file on ``master``. If there
    isn't one, pick a random name, and copy the key file there.

    Otherwise, return ``None``."""
    # don't need to copy a key if we're SSHing directly
    if '!' not in addr:
        return None

    host = addr.split('!')[0]

    if host not in self._host_to_key_filename:
        # copy the key if we haven't already
        keyfile = 'mrjob-%s.pem' % random_identifier()
        ssh_copy_key(self._ssh_bin, host, self._ec2_key_pair_file, keyfile)
        # don't set above; ssh_copy_key() may throw an IOError
        self._host_to_key_filename[host] = keyfile

    return self._host_to_key_filename[host]


def __init__(self, ssh_bin, ec2_key_pair_file):
    """
    :param ssh_bin: path to ``ssh`` binary
    :param ec2_key_pair_file: path to an SSH keyfile
    """
    super(SSHFilesystem, self).__init__()

    self._ssh_bin = ssh_bin
    self._ec2_key_pair_file = ec2_key_pair_file

    if self._ec2_key_pair_file is None:
        raise ValueError('ec2_key_pair_file must be a path')

    # use this name for all remote copies of the key pair file
    self._remote_key_pair_file = '.mrjob-%s.pem' % random_identifier()

    # keep track of hosts we've already copied the key pair to
    self._hosts_with_key_pair_file = set()

    # keep track of which hosts we've copied our key to, and
    # what the (random) name of the key file is on that host
    self._host_to_key_filename = {}

    # should we use sudo (for EMR)? Enable with use_sudo_over_ssh().
    self._sudo = False


def test_no_collisions_possible_ever(self):
    # heh
    with random_seed(0):
        self.assertNotEqual(random_identifier(), random_identifier())


def test_format(self):
    with random_seed(0):
        random_id = random_identifier()
        self.assertEqual(len(random_id), 16)
        self.assertFalse(set(random_id) - set('0123456789abcdef'))
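

# A minimal sketch of what random_identifier() presumably returns, inferred
# from test_format() above (16 lowercase hex digits); the real mrjob
# implementation may differ.
import random


def random_identifier():
    """Return a random 16-digit hex string."""
    return '%016x' % random.randint(0, 2 ** 64 - 1)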