Example #1
  def stage_file(self, gcs_or_local_path, file_name, stream,
                 mime_type='application/octet-stream', total_size=None):
    """Stages a file at a GCS or local path with stream-supplied contents."""
    if not gcs_or_local_path.startswith('gs://'):
      local_path = FileSystems.join(gcs_or_local_path, file_name)
      _LOGGER.info('Staging file locally to %s', local_path)
      with open(local_path, 'wb') as f:
        f.write(stream.read())
      return
    gcs_location = FileSystems.join(gcs_or_local_path, file_name)
    bucket, name = gcs_location[5:].split('/', 1)

    request = storage.StorageObjectsInsertRequest(
        bucket=bucket, name=name)
    start_time = time.time()
    _LOGGER.info('Starting GCS upload to %s...', gcs_location)
    upload = storage.Upload(stream, mime_type, total_size)
    try:
      response = self._storage_client.objects.Insert(request, upload=upload)
    except exceptions.HttpError as e:
      reportable_errors = {
          403: 'access denied',
          404: 'bucket not found',
      }
      if e.status_code in reportable_errors:
        raise IOError(('Could not upload to GCS path %s: %s. Please verify '
                       'that credentials are valid and that you have write '
                       'access to the specified path.') %
                      (gcs_or_local_path, reportable_errors[e.status_code]))
      raise
    _LOGGER.info('Completed GCS upload to %s in %s seconds.', gcs_location,
                 int(time.time() - start_time))
    return response
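
The method above branches on the destination scheme: local paths are written straight to disk, while gs:// paths are split into a bucket and an object name for the insert request. A minimal sketch of that split, using a placeholder destination that is not from the original code:

import io

gcs_location = 'gs://my-bucket/staging/pipeline.pb'  # placeholder destination
bucket, name = gcs_location[5:].split('/', 1)
print(bucket)  # my-bucket
print(name)    # staging/pipeline.pb

# The stream argument is any readable file-like object, for example:
stream = io.BytesIO(b'serialized pipeline bytes')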
Example #2
  def stage_file(self, gcs_or_local_path, file_name, stream,
                 mime_type='application/octet-stream'):
    """Stages a file at a GCS or local path with stream-supplied contents."""
    if not gcs_or_local_path.startswith('gs://'):
      local_path = FileSystems.join(gcs_or_local_path, file_name)
      logging.info('Staging file locally to %s', local_path)
      with open(local_path, 'wb') as f:
        f.write(stream.read())
      return
    gcs_location = FileSystems.join(gcs_or_local_path, file_name)
    bucket, name = gcs_location[5:].split('/', 1)

    request = storage.StorageObjectsInsertRequest(
        bucket=bucket, name=name)
    logging.info('Starting GCS upload to %s...', gcs_location)
    upload = storage.Upload(stream, mime_type)
    try:
      response = self._storage_client.objects.Insert(request, upload=upload)
    except exceptions.HttpError as e:
      reportable_errors = {
          403: 'access denied',
          404: 'bucket not found',
      }
      if e.status_code in reportable_errors:
        raise IOError(('Could not upload to GCS path %s: %s. Please verify '
                       'that credentials are valid and that you have write '
                       'access to the specified path.') %
                      (gcs_or_local_path, reportable_errors[e.status_code]))
      raise
    logging.info('Completed GCS upload to %s', gcs_location)
    return response
Example #3
    def __init__(self, options, proto_pipeline):
        self.options = options
        self.proto_pipeline = proto_pipeline
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        if not self.google_cloud_options.job_name:
            self.google_cloud_options.job_name = self.default_job_name(
                self.google_cloud_options.job_name)

        required_google_cloud_options = [
            'project', 'job_name', 'temp_location'
        ]
        missing = [
            option for option in required_google_cloud_options
            if not getattr(self.google_cloud_options, option)
        ]
        if missing:
            raise ValueError('Missing required configuration parameters: %s' %
                             missing)

        if not self.google_cloud_options.staging_location:
            logging.info(
                'Defaulting to the temp_location as staging_location: %s',
                self.google_cloud_options.temp_location)
            (self.google_cloud_options.staging_location
             ) = self.google_cloud_options.temp_location

        # Make the staging and temp locations job name and time specific. This is
        # needed to avoid clashes between job submissions using the same staging
        # area or team members using the same job names. This method is not
        # entirely foolproof since two job submissions with the same name can
        # happen at exactly the same time. However, the window is extremely small
        # given that time.time() has at least microsecond granularity. We add the
        # suffix only for GCS staging locations where the potential for such
        # clashes is high.
        if self.google_cloud_options.staging_location.startswith('gs://'):
            path_suffix = '%s.%f' % (self.google_cloud_options.job_name,
                                     time.time())
            self.google_cloud_options.staging_location = FileSystems.join(
                self.google_cloud_options.staging_location, path_suffix)
            self.google_cloud_options.temp_location = FileSystems.join(
                self.google_cloud_options.temp_location, path_suffix)

        self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
        if self.options.view_as(StandardOptions).streaming:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
        else:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH

        # Labels.
        if self.google_cloud_options.labels:
            self.proto.labels = dataflow.Job.LabelsValue()
            for label in self.google_cloud_options.labels:
                parts = label.split('=', 1)
                key = parts[0]
                value = parts[1] if len(parts) > 1 else ''
                self.proto.labels.additionalProperties.append(
                    dataflow.Job.LabelsValue.AdditionalProperty(key=key,
                                                                value=value))

        self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
        self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
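
A small sketch (not from the original source) of the job-name-and-timestamp suffix described in the comment above; the bucket and job name are placeholders, and resolving gs:// paths through FileSystems assumes apache_beam[gcp] is installed:

import time
from apache_beam.io.filesystems import FileSystems

staging_location = 'gs://my-bucket/staging'  # placeholder bucket
job_name = 'wordcount'                       # placeholder job name
path_suffix = '%s.%f' % (job_name, time.time())
print(FileSystems.join(staging_location, path_suffix))
# e.g. gs://my-bucket/staging/wordcount.1714000000.123456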
Example #4
def bigquery_export_destination_uri(
    gcs_location_vp: Optional[ValueProvider],
    temp_location: Optional[str],
    unique_id: str,
    directory_only: bool = False,
) -> str:
    """Returns the fully qualified Google Cloud Storage URI where the
  extracted table should be written.
  """
    file_pattern = 'bigquery-table-dump-*.json'

    gcs_location = None
    if gcs_location_vp is not None:
        gcs_location = gcs_location_vp.get()

    if gcs_location is not None:
        gcs_base = gcs_location
    elif temp_location is not None:
        gcs_base = temp_location
        _LOGGER.debug("gcs_location is empty, using temp_location instead")
    else:
        raise ValueError(
            'ReadFromBigQuery requires a GCS location to be provided. Neither '
            'gcs_location in the constructor nor the fallback option '
            '--temp_location is set.')

    if not unique_id:
        unique_id = uuid.uuid4().hex

    if directory_only:
        return FileSystems.join(gcs_base, unique_id)
    else:
        return FileSystems.join(gcs_base, unique_id, file_pattern)
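
For illustration, a hedged sketch of the two URI shapes the function above can return; the bucket is a placeholder and gs:// support assumes apache_beam[gcp] is installed:

import uuid
from apache_beam.io.filesystems import FileSystems

gcs_base = 'gs://my-bucket/temp'  # placeholder for gcs_location or temp_location
unique_id = uuid.uuid4().hex
print(FileSystems.join(gcs_base, unique_id))
# directory_only=True  -> gs://my-bucket/temp/<unique_id>
print(FileSystems.join(gcs_base, unique_id, 'bigquery-table-dump-*.json'))
# directory_only=False -> gs://my-bucket/temp/<unique_id>/bigquery-table-dump-*.json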
Example #5
    def _stage_beam_sdk(self, sdk_remote_location, staging_location, temp_dir):
        """Stages a Beam SDK file with the appropriate version.

      Args:
        sdk_remote_location: A URL from which the file can be downloaded or a
          remote file location. The SDK file can be a tarball or a wheel. Set
          to 'pypi' to download and stage a wheel and source SDK from PyPI.
        staging_location: Location where the SDK file should be copied.
        temp_dir: path to temporary location where the file should be
          downloaded.

      Returns:
        A list of SDK files that were staged to the staging location.

      Raises:
        RuntimeError: if staging was not successful.
      """
        if sdk_remote_location == 'pypi':
            sdk_local_file = Stager._download_pypi_sdk_package(temp_dir)
            sdk_sources_staged_name = Stager.\
                _desired_sdk_filename_in_staging_location(sdk_local_file)
            staged_path = FileSystems.join(staging_location,
                                           sdk_sources_staged_name)
            logging.info('Staging SDK sources from PyPI to %s', staged_path)
            self.stage_artifact(sdk_local_file, staged_path)
            staged_sdk_files = [sdk_sources_staged_name]
            try:
                # Stage binary distribution of the SDK, for now on a best-effort basis.
                sdk_local_file = Stager._download_pypi_sdk_package(
                    temp_dir, fetch_binary=True)
                sdk_binary_staged_name = Stager.\
                    _desired_sdk_filename_in_staging_location(sdk_local_file)
                staged_path = FileSystems.join(staging_location,
                                               sdk_binary_staged_name)
                logging.info(
                    'Staging binary distribution of the SDK from PyPI to %s',
                    staged_path)
                self.stage_artifact(sdk_local_file, staged_path)
                staged_sdk_files.append(sdk_binary_staged_name)
            except RuntimeError as e:
                logging.warning(
                    'Failed to download requested binary distribution '
                    'of the SDK: %s', repr(e))

            return staged_sdk_files
        elif Stager._is_remote_path(sdk_remote_location):
            local_download_file = os.path.join(temp_dir, 'beam-sdk.tar.gz')
            Stager._download_file(sdk_remote_location, local_download_file)
            staged_name = Stager._desired_sdk_filename_in_staging_location(
                sdk_remote_location)
            staged_path = FileSystems.join(staging_location, staged_name)
            logging.info('Staging Beam SDK from %s to %s', sdk_remote_location,
                         staged_path)
            self.stage_artifact(local_download_file, staged_path)
            return [staged_name]
        else:
            raise RuntimeError(
                'The --sdk_location option was used with an unsupported '
                'type of location: %s' % sdk_remote_location)
Example #6
 def test_windows_path_join(self, *unused_mocks):
   # Test joining of Windows paths.
   localfilesystem.os.path.join.side_effect = _gen_fake_join('\\')
   self.assertEqual(r'C:\tmp\path\to\file',
                    FileSystems.join(r'C:\tmp\path', 'to', 'file'))
   self.assertEqual(r'C:\tmp\path\to\file',
                    FileSystems.join(r'C:\tmp\path', r'to\file'))
   self.assertEqual(r'C:\tmp\path\to\file',
                    FileSystems.join(r'C:\tmp\path\\', 'to', 'file'))
Example #7
 def test_windows_path_join(self, *unused_mocks):
     # Test joining of Windows paths.
     localfilesystem.os.path.join.side_effect = _gen_fake_join('\\')
     self.assertEqual(r'C:\tmp\path\to\file',
                      FileSystems.join(r'C:\tmp\path', 'to', 'file'))
     self.assertEqual(r'C:\tmp\path\to\file',
                      FileSystems.join(r'C:\tmp\path', r'to\file'))
     self.assertEqual(r'C:\tmp\path\to\file',
                      FileSystems.join(r'C:\tmp\path\\', 'to', 'file'))
Example #8
  def __init__(self, options, proto_pipeline):
    self.options = options
    self.proto_pipeline = proto_pipeline
    self.google_cloud_options = options.view_as(GoogleCloudOptions)
    if not self.google_cloud_options.job_name:
      self.google_cloud_options.job_name = self.default_job_name(
          self.google_cloud_options.job_name)

    required_google_cloud_options = ['project', 'job_name', 'temp_location']
    missing = [
        option for option in required_google_cloud_options
        if not getattr(self.google_cloud_options, option)]
    if missing:
      raise ValueError(
          'Missing required configuration parameters: %s' % missing)

    if not self.google_cloud_options.staging_location:
      logging.info('Defaulting to the temp_location as staging_location: %s',
                   self.google_cloud_options.temp_location)
      (self.google_cloud_options
       .staging_location) = self.google_cloud_options.temp_location

    # Make the staging and temp locations job name and time specific. This is
    # needed to avoid clashes between job submissions using the same staging
    # area or team members using the same job names. This method is not entirely
    # foolproof since two job submissions with the same name can happen at
    # exactly the same time. However, the window is extremely small given that
    # time.time() has at least microsecond granularity. We add the suffix only
    # for GCS staging locations where the potential for such clashes is high.
    if self.google_cloud_options.staging_location.startswith('gs://'):
      path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
      self.google_cloud_options.staging_location = FileSystems.join(
          self.google_cloud_options.staging_location, path_suffix)
      self.google_cloud_options.temp_location = FileSystems.join(
          self.google_cloud_options.temp_location, path_suffix)

    self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
    if self.options.view_as(StandardOptions).streaming:
      self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
    else:
      self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH
    if self.google_cloud_options.update:
      self.proto.replaceJobId = self.job_id_for_name(self.proto.name)

    # Labels.
    if self.google_cloud_options.labels:
      self.proto.labels = dataflow.Job.LabelsValue()
      for label in self.google_cloud_options.labels:
        parts = label.split('=', 1)
        key = parts[0]
        value = parts[1] if len(parts) > 1 else ''
        self.proto.labels.additionalProperties.append(
            dataflow.Job.LabelsValue.AdditionalProperty(key=key, value=value))

    self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
    self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
Example #9
  def _stage_beam_sdk(self, sdk_remote_location, staging_location, temp_dir):
    """Stages a Beam SDK file with the appropriate version.

      Args:
        sdk_remote_location: A URL from which the file can be downloaded or a
          remote file location. The SDK file can be a tarball or a wheel. Set
          to 'pypi' to download and stage a wheel and source SDK from PyPI.
        staging_location: Location where the SDK file should be copied.
        temp_dir: path to temporary location where the file should be
          downloaded.

      Returns:
        A list of SDK files that were staged to the staging location.

      Raises:
        RuntimeError: if staging was not successful.
      """
    if sdk_remote_location == 'pypi':
      sdk_local_file = Stager._download_pypi_sdk_package(temp_dir)
      sdk_sources_staged_name = Stager.\
          _desired_sdk_filename_in_staging_location(sdk_local_file)
      staged_path = FileSystems.join(staging_location, sdk_sources_staged_name)
      logging.info('Staging SDK sources from PyPI to %s', staged_path)
      self.stage_artifact(sdk_local_file, staged_path)
      staged_sdk_files = [sdk_sources_staged_name]
      try:
        # Stage binary distribution of the SDK, for now on a best-effort basis.
        sdk_local_file = Stager._download_pypi_sdk_package(
            temp_dir, fetch_binary=True)
        sdk_binary_staged_name = Stager.\
            _desired_sdk_filename_in_staging_location(sdk_local_file)
        staged_path = FileSystems.join(staging_location, sdk_binary_staged_name)
        logging.info('Staging binary distribution of the SDK from PyPI to %s',
                     staged_path)
        self.stage_artifact(sdk_local_file, staged_path)
        staged_sdk_files.append(sdk_binary_staged_name)
      except RuntimeError as e:
        logging.warning(
            'Failed to download requested binary distribution '
            'of the SDK: %s', repr(e))

      return staged_sdk_files
    elif Stager._is_remote_path(sdk_remote_location):
      local_download_file = os.path.join(temp_dir, 'beam-sdk.tar.gz')
      Stager._download_file(sdk_remote_location, local_download_file)
      staged_name = Stager._desired_sdk_filename_in_staging_location(
          sdk_remote_location)
      staged_path = FileSystems.join(staging_location, staged_name)
      logging.info('Staging Beam SDK from %s to %s', sdk_remote_location,
                   staged_path)
      self.stage_artifact(local_download_file, staged_path)
      return [staged_name]
    else:
      raise RuntimeError(
          'The --sdk_location option was used with an unsupported '
          'type of location: %s' % sdk_remote_location)
Example #10
 def test_unix_path_join(self, *unused_mocks):
     # Test joining of Unix paths.
     localfilesystem.os.path.join.side_effect = _gen_fake_join('/')
     self.assertEqual('/tmp/path/to/file',
                      FileSystems.join('/tmp/path', 'to', 'file'))
     self.assertEqual('/tmp/path/to/file',
                      FileSystems.join('/tmp/path', 'to/file'))
     self.assertEqual('/tmp/path/to/file',
                      FileSystems.join('/', 'tmp/path', 'to/file'))
     self.assertEqual('/tmp/path/to/file',
                      FileSystems.join('/tmp/', 'path', 'to/file'))
Example #11
 def test_unix_path_join(self, *unused_mocks):
   # Test joining of Unix paths.
   localfilesystem.os.path.join.side_effect = _gen_fake_join('/')
   self.assertEqual('/tmp/path/to/file',
                    FileSystems.join('/tmp/path', 'to', 'file'))
   self.assertEqual('/tmp/path/to/file',
                    FileSystems.join('/tmp/path', 'to/file'))
   self.assertEqual('/tmp/path/to/file',
                    FileSystems.join('/', 'tmp/path', 'to/file'))
   self.assertEqual('/tmp/path/to/file',
                    FileSystems.join('/tmp/', 'path', 'to/file'))
Example #12
        def write_orphaned_file(temp_dir, writer_key):
            temp_dir_path = FileSystems.join(dir, temp_dir)

            file_prefix_dir = FileSystems.join(temp_dir_path,
                                               str(abs(hash(writer_key))))

            file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
            with FileSystems.create(file_name) as f:
                f.write(b'Hello y\'all')

            return file_name
Example #13
    def _stage_jar_packages(self, jar_packages, staging_location, temp_dir):
        # type: (...) -> List[str]
        """Stages a list of local jar packages for Java SDK Harness.

    :param jar_packages: Ordered list of local paths to jar packages to be
      staged. Only packages on the local file system and GCS are supported.
    :param staging_location: Staging location for the packages.
    :param temp_dir: Temporary folder where the resource building can happen.
    :return: A list of file names (no paths) for the resources staged. All the
      files are assumed to be staged in staging_location.
    :raises:
      RuntimeError: If files specified are not found or do not have expected
        name patterns.
    """
        resources = []  # type: List[str]
        staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
        local_packages = []  # type: List[str]
        for package in jar_packages:
            if not os.path.basename(package).endswith('.jar'):
                raise RuntimeError(
                    'The --experiment=\'jar_packages=\' option expects a full path '
                    'ending with ".jar" instead of %s' % package)

            if not os.path.isfile(package):
                if Stager._is_remote_path(package):
                    # Download remote package.
                    _LOGGER.info(
                        'Downloading jar package: %s locally before staging',
                        package)
                    _, last_component = FileSystems.split(package)
                    local_file_path = FileSystems.join(staging_temp_dir,
                                                       last_component)
                    Stager._download_file(package, local_file_path)
                else:
                    raise RuntimeError(
                        'The file %s cannot be found. It was specified in the '
                        '--experiment=\'jar_packages=\' command line option.' %
                        package)
            else:
                local_packages.append(package)

        local_packages.extend([
            FileSystems.join(staging_temp_dir, f)
            for f in os.listdir(staging_temp_dir)
        ])

        for package in local_packages:
            basename = os.path.basename(package)
            staged_path = FileSystems.join(staging_location, basename)
            self.stage_artifact(package, staged_path)
            resources.append(basename)

        return resources
Example #14
def run(argv=None):
    pipeline_options = PipelineOptions(argv)
    options = pipeline_options.view_as(ParkdataPipelineOptions)
    # Save the main session so that global imports, functions and variables are
    # preserved during serialization. For details see
    # https://cloud.google.com/dataflow/docs/resources/faq#how_do_i_handle_nameerrors
    pipeline_options.view_as(
        SetupOptions).save_main_session = options.save_session
    with beam.Pipeline(options=pipeline_options) as p:
        wikidata_data, commons_ids = (
            p
            | "wikidata_query/create" >> beam.Create(wd_queries())
            | "wikidata/query" >> wikidata.Query(
                FileSystems.join(options.base_path,
                                 "wikidata_query_cache.sqlite"),
                user_agent=options.user_agent,
            )
            | "wikidata/group" >> beam.GroupByKey()
            | "wikidata/fetch" >> wikidata.Transform(
                options.supported_languages(),
                cache_file=FileSystems.join(options.base_path,
                                            "wikidata_cache.sqlite"),
                user_agent=options.user_agent,
            ))

        commons_data = commons_ids | "commons" >> commons.Transform(
            FileSystems.join(options.base_path, "commons_cache.sqlite"),
            user_agent=options.user_agent)

        wikipedia_data = wikidata_data | "wikipedia" >> wikipedia.Transform(
            FileSystems.join(options.base_path, "wikipedia_qache.sqlite"),
            user_agent=options.user_agent)

        changed_places = (
            {
                Combine.TAG_COMMONS: commons_data,
                Combine.TAG_WIKIDATA: wikidata_data,
                Combine.TAG_WIKIPEDIA: wikipedia_data,
            }
            | "combine/group_by_key" >> beam.CoGroupByKey()
            | "combine/combine" >> beam.ParDo(Combine())
            | "combine/changed" >> beam.ParDo(
                OutputNewOrChangedEntires(
                    FileSystems.join(options.base_path, "output.sqlite"))))

        (changed_places
         |
         "firestore_output/convert_types" >> beam.MapTuple(use_firestore_types)
         | "firestore_output/write" >> beam.ParDo(
             FirestoreWrite(project=options.project_id,
                            collection="places_v4",
                            credentials="gcp-service-account.json")))
Example #15
  def test_store_fileio_file_small_buffer_flush(self, FakeClient):
    input_dict = {}
    input_dict['project_id'] = "test_project"
    input_dict['region'] = "test_region"
    input_dict['dataset_id'] = "test_dataset_id"
    input_dict['dicom_store_id'] = "test_dicom_store_id"

    fc = FakeHttpClient()
    FakeClient.return_value = fc

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    dict_input_1 = {
        'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'
    }
    str_input_1 = json.dumps(dict_input_1)
    self._create_temp_file(dir=temp_dir, content=str_input_1)
    dict_input_2 = {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'}
    str_input_2 = json.dumps(dict_input_2)
    self._create_temp_file(dir=temp_dir, content=str_input_2)
    dict_input_3 = {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'}
    str_input_3 = json.dumps(dict_input_3)
    self._create_temp_file(dir=temp_dir, content=str_input_3)

    with TestPipeline() as p:
      results = (
          p
          | beam.Create([FileSystems.join(temp_dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | UploadToDicomStore(input_dict, 'fileio', buffer_size=1)
          | beam.Map(lambda x: x['success']))
      assert_that(results, equal_to([True] * 3))
    self.assertTrue(dict_input_1 in fc.dicom_metadata)
    self.assertTrue(dict_input_2 in fc.dicom_metadata)
    self.assertTrue(dict_input_3 in fc.dicom_metadata)
Example #16
    def create_job(self, job):
        """Creates job description. May stage and/or submit for remote execution."""
        self.create_job_description(job)

        # Stage and submit the job when necessary
        dataflow_job_file = job.options.view_as(DebugOptions).dataflow_job_file
        template_location = (
            job.options.view_as(GoogleCloudOptions).template_location)

        job_location = template_location or dataflow_job_file
        if job_location:
            gcs_or_local_path = os.path.dirname(job_location)
            file_name = os.path.basename(job_location)
            self.stage_file(gcs_or_local_path, file_name,
                            io.BytesIO(job.json().encode('utf-8')))

        if job.options.view_as(DebugOptions).lookup_experiment('upload_graph'):
            self.stage_file(
                job.options.view_as(GoogleCloudOptions).staging_location,
                "dataflow_graph.json", io.BytesIO(job.json().encode('utf-8')))
            del job.proto.steps[:]
            job.proto.stepsLocation = FileSystems.join(
                job.options.view_as(GoogleCloudOptions).staging_location,
                "dataflow_graph.json")

        if not template_location:
            return self.submit_job_description(job)

        _LOGGER.info('A template was just created at location %s',
                     template_location)
        return None
Example #17
    def test_write_to_dynamic_destination(self):

        sink_params = [
            fileio.TextSink,  # pass a type signature
            fileio.TextSink()  # pass a FileSink object
        ]

        for sink in sink_params:
            dir = self._new_tempdir()

            with TestPipeline() as p:
                _ = (p
                     | "Create" >> beam.Create(range(100))
                     | beam.Map(lambda x: str(x))
                     | fileio.WriteToFiles(
                         path=dir,
                         destination=lambda n: "odd" if int(n) % 2 else "even",
                         sink=sink,
                         file_naming=fileio.destination_prefix_naming("test")))

            with TestPipeline() as p:
                result = (
                    p
                    | fileio.MatchFiles(FileSystems.join(dir, '*'))
                    | fileio.ReadMatches()
                    | beam.Map(lambda f: (
                        os.path.basename(f.metadata.path).split('-')[0],
                        sorted(map(int,
                                   f.read_utf8().strip().split('\n'))))))

                assert_that(
                    result,
                    equal_to([('odd', list(range(1, 100, 2))),
                              ('even', list(range(0, 100, 2)))]))
Example #18
 def test_valid(self):
     file_pattern = FileSystems.join(self.test_data_dir, 'detail.json')
     expected_valid = [
         (1, {
             'error': [],
             'first_name': 'Bart',
             'last_name': 'Bruck',
             'email': '*****@*****.**',
             'id': 1
         }),
         (3, {
             'error': [u"email 'wtuppeny2bandcamp.com' is invalid"],
             'first_name': 'Winny',
             'last_name': 'Tuppeny',
             'email': None,
             'id': 3
         }),
     ]
     expected_broken = [{
         'error': 'id is missing',
         'element': '{"first_name":"Alfonso","last_name":"Koenen","email":"*****@*****.**"}'
     }]
     # Make use of the TestPipeline from the Beam testing util.
     with TestPipeline() as p:
         actual_valid, actual_broken = (p | Prepare(file_pattern))
         # The labels are required because otherwise the assert_that Transform does not have a stable unique label.
         assert_that(actual_valid, equal_to(expected_valid), label='valid')
         assert_that(actual_broken,
                     equal_to(expected_broken),
                     label='broken')
Example #19
  def stage_job_resources(self,
                          resources,  # type: List[Tuple[str, str]]
                          staging_location=None  # type: Optional[str]
                         ):
    """For internal use only; no backwards-compatibility guarantees.

        Stages job resources to staging_location.

        Args:
          resources: A list of tuples of local file paths and file names (no
            paths) to be used for staging resources.
          staging_location: Location to stage the file.

        Returns:
          A list of file names (no paths) for the resources staged. All the
          files are assumed to be staged at staging_location.

        Raises:
          RuntimeError: If files specified are not found or error encountered
          while trying to create the resources (e.g., build a setup package).
        """
    # Make sure that all required options are specified.
    if staging_location is None:
      raise RuntimeError('The staging_location must be specified.')

    staged_resources = []
    for file_path, staged_path in resources:
      self.stage_artifact(
          file_path, FileSystems.join(staging_location, staged_path))
      staged_resources.append(staged_path)

    return staged_resources
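
A minimal usage sketch, with placeholder paths, of the (local path, staged name) tuples the method above expects and the destinations it derives from them:

from apache_beam.io.filesystems import FileSystems

staging_location = '/tmp/staging'  # placeholder; a gs:// location works the same way
resources = [('/tmp/build/workflow.tar.gz', 'workflow.tar.gz'),
             ('/tmp/build/extra_packages.txt', 'extra_packages.txt')]
for file_path, staged_path in resources:
    print(file_path, '->', FileSystems.join(staging_location, staged_path))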
Example #20
  def test_write_to_different_file_types_some_spilling(self):

    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming(),
              max_writers_per_bundle=1))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          |
          "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')
Example #21
class TestPrepare(unittest.TestCase):

    test_data_dir = FileSystems.join(os.path.dirname(os.path.realpath(__file__)), 'testdata')

    def test_valid(self):
        file_pattern = FileSystems.join(self.test_data_dir, 'order.json')
        expected_valid = [
            (880, {
                'id': 880,
                'customer_id': 1,
                'total_price': Decimal('287.69'),
                'error': [],
            }),
            (1342, {
                'id': 1342,
                'customer_id': 2,
                'total_price': Decimal('194.52'),
                'error': [],
            }),
            (1766, {
                'id': 1766,
                'customer_id': 2,
                'total_price': Decimal('985.00'),
                'error': [],
            }),
            (2924, {
                'id': 2924,
                'customer_id': 2,
                'total_price': Decimal('837.23'),
                'error': [],
            }),
            (3607, {
                'id': 3607,
                'customer_id': 3,
                'total_price': Decimal('373.02'),
                'error': [],
            }),
            (3949, {
                'id': 3949,
                'customer_id': 3,
                'total_price': Decimal('702.88'),
                'error': [],
            }),
        ]
        expected_broken = [
            {
                'error': 'id is missing',
                'element': '{"customer_id":3,"total_price":"707.16"}'
            }
        ]
        # Make use of the TestPipeline from the Beam testing util.
        with TestPipeline() as p:
            actual_valid, actual_broken = (
                p | Prepare(file_pattern)
            )
            # The labels are required because otherwise the assert_that Transform does not have a stable unique label.
            assert_that(actual_valid, equal_to(expected_valid), label='valid')
            assert_that(actual_broken, equal_to(expected_broken), label='broken')
Example #22
 def file_copy(from_path, to_path):
   if not from_path.endswith(names.PICKLED_MAIN_SESSION_FILE):
     self.assertEqual(expected_from_path, from_path)
     self.assertEqual(FileSystems.join(expected_to_dir,
                                       names.DATAFLOW_SDK_TARBALL_FILE),
                      to_path)
   if from_path.startswith('gs://') or to_path.startswith('gs://'):
     logging.info('Faking file_copy(%s, %s)', from_path, to_path)
   else:
     shutil.copyfile(from_path, to_path)
Example #23
 def open_writer(self, init_result, uid):
     # A proper suffix is needed for AUTO compression detection.
     # We also ensure there will be no collisions with uid and a
     # (possibly unsharded) file_path_prefix and a (possibly empty)
     # file_name_suffix.
     file_path_prefix = self.file_path_prefix.get()
     file_name_suffix = self.file_name_suffix.get()
     suffix = ('.' + os.path.basename(file_path_prefix) + file_name_suffix)
     writer_path = FileSystems.join(init_result, uid) + suffix
     return FileBasedSinkWriter(self, writer_path)
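
A rough illustration of the suffix constructed above so that AUTO compression detection can key off the final writer path; all values are placeholders, not taken from the original sink:

import os

file_path_prefix = '/data/out/records'
file_name_suffix = '.csv.gz'
init_result = '/data/out/beam-temp-records-abc123'  # assumed temp dir from initialization
uid = 'writer-uid'
suffix = '.' + os.path.basename(file_path_prefix) + file_name_suffix
print(os.path.join(init_result, uid) + suffix)
# /data/out/beam-temp-records-abc123/writer-uid.records.csv.gz (.gz -> gzip)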
Example #24
 def _create_temp_dir(self, file_path_prefix):
   base_path, last_component = FileSystems.split(file_path_prefix)
   if not last_component:
     # Trying to re-split the base_path to check if it's a root.
     new_base_path, _ = FileSystems.split(base_path)
     if base_path == new_base_path:
       raise ValueError('Cannot create a temporary directory for root path '
                        'prefix %s. Please specify a file path prefix with '
                        'at least two components.' % file_path_prefix)
   path_components = [base_path,
                      'beam-temp-' + last_component + '-' + uuid.uuid1().hex]
   return FileSystems.join(*path_components)
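
A quick sketch of the temporary-directory name produced above, using a made-up local prefix; FileSystems handles local paths out of the box:

import uuid
from apache_beam.io.filesystems import FileSystems

file_path_prefix = '/tmp/output/results'  # placeholder prefix
base_path, last_component = FileSystems.split(file_path_prefix)
print(FileSystems.join(
    base_path, 'beam-temp-' + last_component + '-' + uuid.uuid1().hex))
# e.g. /tmp/output/beam-temp-results-2d5a1c3e...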
Example #25
    def _check_state_for_finalize_write(self, writer_results, num_shards):
        """Checks writer output files' states.

    Returns:
      src_files, dst_files: Lists of files to rename. For each i, finalize_write
        should rename(src_files[i], dst_files[i]).
      delete_files: Src files to delete. These could be leftovers from an
        incomplete (non-atomic) rename operation.
      num_skipped: Tally of writer results files already renamed, such as from
        a previous run of finalize_write().
    """
        if not writer_results:
            return [], [], [], 0

        src_glob = FileSystems.join(
            FileSystems.split(writer_results[0])[0], '*')
        dst_glob = self._get_final_name_glob(num_shards)
        src_glob_files = set(file_metadata.path
                             for mr in FileSystems.match([src_glob])
                             for file_metadata in mr.metadata_list)
        dst_glob_files = set(file_metadata.path
                             for mr in FileSystems.match([dst_glob])
                             for file_metadata in mr.metadata_list)

        src_files = []
        dst_files = []
        delete_files = []
        num_skipped = 0
        for shard_num, src in enumerate(writer_results):
            final_name = self._get_final_name(shard_num, num_shards)
            dst = final_name
            src_exists = src in src_glob_files
            dst_exists = dst in dst_glob_files
            if not src_exists and not dst_exists:
                raise BeamIOError(
                    'src and dst files do not exist. src: %s, dst: %s' %
                    (src, dst))
            if not src_exists and dst_exists:
                logging.debug('src: %s -> dst: %s already renamed, skipping',
                              src, dst)
                num_skipped += 1
                continue
            if (src_exists and dst_exists and FileSystems.checksum(src)
                    == FileSystems.checksum(dst)):
                logging.debug('src: %s == dst: %s, deleting src', src, dst)
                delete_files.append(src)
                continue

            src_files.append(src)
            dst_files.append(dst)
        return src_files, dst_files, delete_files, num_skipped
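
To make the glob construction above concrete, a small sketch with a placeholder writer result; the source glob is simply the directory of the first shard plus a wildcard:

from apache_beam.io.filesystems import FileSystems

writer_results = ['/tmp/output/beam-temp-results-abc123/shard-00000-of-00003']
print(FileSystems.join(FileSystems.split(writer_results[0])[0], '*'))
# /tmp/output/beam-temp-results-abc123/*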
Example #26
  def test_basic_file_name_provided(self):
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

      assert_that(content_pc, equal_to(content.splitlines()))
Example #27
    def test_basic_two_files(self):
        files = []
        tempdir = '%s%s' % (self._new_tempdir(), os.sep)

        # Create a couple files to be matched
        files.append(self._create_temp_file(dir=tempdir))
        files.append(self._create_temp_file(dir=tempdir))

        with TestPipeline() as p:
            files_pc = (p
                        | fileio.MatchFiles(FileSystems.join(tempdir, '*'))
                        | beam.Map(lambda x: x.path))

            assert_that(files_pc, equal_to(files))
Example #28
  def test_csv_file_source(self):
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    rows = [r.split(',') for r in content.split('\n')]

    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (p
                    | beam.Create([FileSystems.join(dir, '*')])
                    | fileio.MatchAll()
                    | fileio.ReadMatches()
                    | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(content_pc, equal_to(rows))
Example #29
    def test_find_orphaned_files(self):
        dir = self._new_tempdir()

        write_transform = beam.io.fileio.WriteToFiles(path=dir)

        def write_orphaned_file(temp_dir, writer_key):
            temp_dir_path = FileSystems.join(dir, temp_dir)

            file_prefix_dir = FileSystems.join(temp_dir_path,
                                               str(abs(hash(writer_key))))

            file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
            with FileSystems.create(file_name) as f:
                f.write(b'Hello y\'all')

            return file_name

        with TestPipeline() as p:
            _ = (p
                 | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
                 | "Serialize" >> beam.Map(json.dumps)
                 | write_transform)

            # Pre-create the temp directory.
            temp_dir_path = FileSystems.mkdirs(
                FileSystems.join(dir, write_transform._temp_directory.get()))
            write_orphaned_file(write_transform._temp_directory.get(),
                                (None, GlobalWindow()))
            f2 = write_orphaned_file(write_transform._temp_directory.get(),
                                     ('other-dest', GlobalWindow()))

        temp_dir_path = FileSystems.join(dir,
                                         write_transform._temp_directory.get())
        leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
        found_files = [m.path for m in leftovers[0].metadata_list]
        self.assertListEqual(found_files, [f2])
Example #30
  def _check_state_for_finalize_write(self, writer_results, num_shards):
    """Checks writer output files' states.

    Returns:
      src_files, dst_files: Lists of files to rename. For each i, finalize_write
        should rename(src_files[i], dst_files[i]).
      delete_files: Src files to delete. These could be leftovers from an
        incomplete (non-atomic) rename operation.
      num_skipped: Tally of writer results files already renamed, such as from
        a previous run of finalize_write().
    """
    if not writer_results:
      return [], [], [], 0

    src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
    dst_glob = self._get_final_name_glob(num_shards)
    src_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([src_glob])
                         for file_metadata in mr.metadata_list)
    dst_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([dst_glob])
                         for file_metadata in mr.metadata_list)

    src_files = []
    dst_files = []
    delete_files = []
    num_skipped = 0
    for shard_num, src in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      dst = final_name
      src_exists = src in src_glob_files
      dst_exists = dst in dst_glob_files
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)
    return src_files, dst_files, delete_files, num_skipped
Example #31
        def pip_fake(args):
            """Fakes fetching a package from pip by creating a temporary file.

          Args:
            args: a complete list of command line arguments to invoke pip.
              The fake is sensitive to the order of the arguments.
              Supported commands:

              1) Download SDK sources file:
              python -m pip download --dest /tmp/dir apache-beam==2.0.0 \
                  --no-deps --no-binary :all:

              2) Download SDK binary wheel file:
              python -m pip download --dest /tmp/dir apache-beam==2.0.0 \
                  --no-deps --no-binary :all: --python-version 27 \
                  --implementation cp --abi cp27mu --platform manylinux1_x86_64
          """
            package_file = None
            if len(args) >= 8:
                # package_name==x.y.z
                if '==' in args[6]:
                    distribution_name = args[6][0:args[6].find('==')]
                    distribution_version = args[6][args[6].find('==') + 2:]

                    if args[8] == '--no-binary':
                        package_file = '%s-%s.zip' % (distribution_name,
                                                      distribution_version)
                    elif args[8] == '--only-binary' and len(args) >= 18:
                        if not has_wheels:
                            # Imitate the case when desired wheel distribution is not in PyPI.
                            raise RuntimeError('No matching distribution.')

                        # Per PEP-0427 in wheel filenames non-alphanumeric characters
                        # in distribution name are replaced with underscore.
                        distribution_name = distribution_name.replace('-', '_')
                        package_file = '%s-%s-%s%s-%s-%s.whl' % (
                            distribution_name,
                            distribution_version,
                            args[13],  # implementation
                            args[11],  # python version
                            args[15],  # abi tag
                            args[17]  # platform
                        )

            assert package_file, 'Pip fake does not support the command: ' + str(
                args)
            self.create_temp_file(FileSystems.join(args[5], package_file),
                                  'Package content.')
Example #32
  def test_run_example_with_setup_file(self):
    pipeline = TestPipeline(is_integration_test=True)
    coordinate_output = FileSystems.join(
        pipeline.get_option('output'),
        'juliaset-{}'.format(str(uuid.uuid4())),
        'coordinates.txt')
    extra_args = {
        'coordinate_output': coordinate_output,
        'grid_size': self.GRID_SIZE,
        'setup_file': os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
        'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
    }
    args = pipeline.get_full_options_as_args(**extra_args)

    juliaset.run(args)
Example #33
    def pip_fake(args):
      """Fakes fetching a package from pip by creating a temporary file.

          Args:
            args: a complete list of command line arguments to invoke pip.
              The fake is sensitive to the order of the arguments.
              Supported commands:

              1) Download SDK sources file:
              python -m pip download --dest /tmp/dir apache-beam==2.0.0 \
                  --no-deps --no-binary :all:

              2) Download SDK binary wheel file:
              python -m pip download --dest /tmp/dir apache-beam==2.0.0 \
                  --no-deps --no-binary :all: --python-version 27 \
                  --implementation cp --abi cp27mu --platform manylinux1_x86_64
          """
      package_file = None
      if len(args) >= 8:
        # package_name==x.y.z
        if '==' in args[6]:
          distribution_name = args[6][0:args[6].find('==')]
          distribution_version = args[6][args[6].find('==') + 2:]

          if args[8] == '--no-binary':
            package_file = '%s-%s.zip' % (distribution_name,
                                          distribution_version)
          elif args[8] == '--only-binary' and len(args) >= 18:
            if not has_wheels:
              # Imitate the case when desired wheel distribution is not in PyPI.
              raise RuntimeError('No matching distribution.')

            # Per PEP-0427 in wheel filenames non-alphanumeric characters
            # in distribution name are replaced with underscore.
            distribution_name = distribution_name.replace('-', '_')
            package_file = '%s-%s-%s%s-%s-%s.whl' % (
                distribution_name,
                distribution_version,
                args[13],  # implementation
                args[11],  # python version
                args[15],  # abi tag
                args[17]  # platform
            )

      assert package_file, 'Pip fake does not support the command: ' + str(args)
      self.create_temp_file(
          FileSystems.join(args[5], package_file), 'Package content.')
Example #34
  def test_match_files_one_directory_failure(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)]

    files = list()
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))
Example #35
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'


  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))

    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
        known_args.representative_header_file))
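
A hedged sketch of the time-prefixed header path built above; the job name and file name stand in for _MERGE_HEADERS_JOB_NAME and _MERGE_HEADERS_FILE_NAME, which are defined elsewhere in that project:

import datetime
import tempfile
from apache_beam.io.filesystems import FileSystems

temp_directory = tempfile.mkdtemp()  # plays the role of temp_location
temp_merged_headers_file_name = '-'.join([
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
    'vcf-to-bigquery',        # placeholder job name
    'merged_headers.vcf'])    # placeholder file name
print(FileSystems.join(temp_directory, temp_merged_headers_file_name))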
Example #36
  def create_job_description(self, job):
    """Creates a job described by the workflow proto."""

    # Stage the pipeline for the runner harness
    self.stage_file(job.google_cloud_options.staging_location,
                    names.STAGED_PIPELINE_FILENAME,
                    io.BytesIO(job.proto_pipeline.SerializeToString()))

    # Stage other resources for the SDK harness
    resources = self._stage_resources(job.options)

    job.proto.environment = Environment(
        pipeline_url=FileSystems.join(job.google_cloud_options.staging_location,
                                      names.STAGED_PIPELINE_FILENAME),
        packages=resources, options=job.options,
        environment_version=self.environment_version).proto
    logging.debug('JOB: %s', job)
Example #37
  def create_job_description(self, job):
    """Creates a job described by the workflow proto."""

    # Stage the pipeline for the runner harness
    self.stage_file(job.google_cloud_options.staging_location,
                    shared_names.STAGED_PIPELINE_FILENAME,
                    io.BytesIO(job.proto_pipeline.SerializeToString()))

    # Stage other resources for the SDK harness
    resources = self._stage_resources(job.options)

    job.proto.environment = Environment(
        pipeline_url=FileSystems.join(job.google_cloud_options.staging_location,
                                      shared_names.STAGED_PIPELINE_FILENAME),
        packages=resources, options=job.options,
        environment_version=self.environment_version).proto
    logging.debug('JOB: %s', job)
Example #38
    def test_read_gzip_compressed_file_without_suffix(self):
        dir = '%s%s' % (self._new_tempdir(), os.sep)

        file_contents = b'compressed_contents!'
        import gzip
        with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
            f.write(file_contents)

        with TestPipeline() as p:
            content_pc = (
                p
                | beam.Create([FileSystems.join(dir, '*')])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.Map(lambda rf: rf.open(
                    compression_type=CompressionTypes.GZIP).read(
                        len(file_contents))))

            assert_that(content_pc, equal_to([file_contents]))
Example #39
def _stage_beam_sdk(sdk_remote_location, staging_location, temp_dir):
  """Stages a Beam SDK file with the appropriate version.

  Args:
    sdk_remote_location: A GCS path to a SDK file or a URL from which
      the file can be downloaded. The SDK file can be a tarball or a wheel.
      Set to 'pypi' to download and stage a wheel and source SDK from PyPI.
    staging_location: A GCS bucket where the SDK file should be copied.
    temp_dir: path to temporary location where the file should be downloaded.

  Returns:
    A list of SDK files that were staged to the staging location.

  Raises:
    RuntimeError: if staging was not successful.
  """
  if (sdk_remote_location.startswith('http://') or
      sdk_remote_location.startswith('https://')):
    local_download_file = _dependency_file_download(
        sdk_remote_location, temp_dir)
    staged_name = _desired_sdk_filename_in_staging_location(local_download_file)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info(
        'Staging Beam SDK from %s to %s',
        sdk_remote_location, staged_path)
    _dependency_file_copy(local_download_file, staged_path)
    return [staged_name]
  elif sdk_remote_location.startswith('gs://'):
    # Stage the file to the GCS staging area.
    staged_name = _desired_sdk_filename_in_staging_location(sdk_remote_location)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info(
        'Staging Beam SDK from %s to %s',
        sdk_remote_location, staged_path)
    _dependency_file_copy(sdk_remote_location, staged_path)
    return [staged_name]
  elif sdk_remote_location == 'pypi':
    sdk_local_file = _download_pypi_sdk_package(temp_dir)
    sdk_sources_staged_name = _desired_sdk_filename_in_staging_location(
        sdk_local_file)
    staged_path = FileSystems.join(staging_location, sdk_sources_staged_name)
    logging.info('Staging SDK sources from PyPI to %s', staged_path)
    _dependency_file_copy(sdk_local_file, staged_path)
    staged_sdk_files = [sdk_sources_staged_name]
    try:
      # Stage binary distribution of the SDK, for now on a best-effort basis.
      sdk_local_file = _download_pypi_sdk_package(temp_dir, fetch_binary=True)
      sdk_binary_staged_name = _desired_sdk_filename_in_staging_location(
          sdk_local_file)
      staged_path = FileSystems.join(staging_location, sdk_binary_staged_name)
      logging.info('Staging binary distribution of the SDK from PyPI to %s',
                   staged_path)
      _dependency_file_copy(sdk_local_file, staged_path)
      staged_sdk_files.append(sdk_binary_staged_name)
    except RuntimeError as e:
      logging.warning(
          'Failed to download requested binary distribution '
          'of the SDK: %s', repr(e))

    return staged_sdk_files
  else:
    raise RuntimeError(
        'The --sdk_location option was used with an unsupported '
        'type of location: %s' % sdk_remote_location)
Example #40
def stage_job_resources(
    options, file_copy=_dependency_file_copy, build_setup_args=None,
    temp_dir=None, populate_requirements_cache=_populate_requirements_cache):
  """For internal use only; no backwards-compatibility guarantees.

  Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will expect
      staging_location, requirements_file, setup_file, and save_main_session
      options to be present.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a setup
      package. Used only if options.setup_file is not None. Used only for
      testing.
    temp_dir: Temporary folder where the resource building can happen. If None
      then a unique temp directory will be created. Used only for testing.
    populate_requirements_cache: Callable for populating the requirements cache.
      Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered while
      trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  google_cloud_options = options.view_as(GoogleCloudOptions)
  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified. There are a few that have
  # defaults to support local running scenarios.
  if google_cloud_options.staging_location is None:
    raise RuntimeError(
        'The --staging_location option must be specified.')
  if google_cloud_options.temp_location is None:
    raise RuntimeError(
        'The --temp_location option must be specified.')

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--requirements_file command line option.' %
                         setup_options.requirements_file)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   REQUIREMENTS_FILE)
    file_copy(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None
        else setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    populate_requirements_cache(
        setup_options.requirements_file, requirements_cache_path)
    for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
      file_copy(pkg, FileSystems.join(google_cloud_options.staging_location,
                                      os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a GCS path and the file cannot be
  # created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--setup_file command line option.' %
                         setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                        build_setup_args)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   WORKFLOW_TARBALL_FILE)
    file_copy(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        _stage_extra_packages(setup_options.extra_packages,
                              google_cloud_options.staging_location,
                              temp_dir=temp_dir, file_copy=file_copy))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a GCS path and the file
  # cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   names.PICKLED_MAIN_SESSION_FILE)
    file_copy(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location'):
    if setup_options.sdk_location == 'default':
      stage_tarball_from_remote_location = True
    elif (setup_options.sdk_location.startswith('gs://') or
          setup_options.sdk_location.startswith('http://') or
          setup_options.sdk_location.startswith('https://')):
      stage_tarball_from_remote_location = True
    else:
      stage_tarball_from_remote_location = False

    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   names.DATAFLOW_SDK_TARBALL_FILE)
    if stage_tarball_from_remote_location:
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is performed and the exact URL or
      # path is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage a tarball.
      if setup_options.sdk_location == 'default':
        sdk_remote_location = 'pypi'
      else:
        sdk_remote_location = setup_options.sdk_location
      _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
      resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
    else:
      # Check if we have a local Beam SDK tarball present. This branch is
      # used by tests running with the SDK built at head.
      if setup_options.sdk_location == 'default':
        module_path = os.path.abspath(__file__)
        sdk_path = os.path.join(
            os.path.dirname(module_path), '..', '..', '..',
            names.DATAFLOW_SDK_TARBALL_FILE)
      elif os.path.isdir(setup_options.sdk_location):
        sdk_path = os.path.join(
            setup_options.sdk_location, names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
        file_copy(sdk_path, staged_path)
        resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError('Cannot find default Beam SDK tar file "%s"' %
                             sdk_path)
        elif not setup_options.sdk_location:
          logging.info('Beam SDK will not be staged since --sdk_location '
                       'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' %
              sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  return resources
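
A hedged sketch of how stage_job_resources might be driven end to end. The project and bucket names are hypothetical, and the PipelineOptions import path shown is the one used by recent Beam releases; older versions may expose it elsewhere:

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    '--project=my-project',                       # hypothetical project
    '--staging_location=gs://my-bucket/staging',  # hypothetical buckets
    '--temp_location=gs://my-bucket/temp',
    '--requirements_file=requirements.txt',
    '--save_main_session',
])
# Returns bare file names; every file is placed under --staging_location.
resources = stage_job_resources(options)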
Exemplo n.º 41
0
def _stage_extra_packages(extra_packages, staging_location, temp_dir,
                          file_copy=_dependency_file_copy):
  """Stages a list of local extra packages.

  Args:
    extra_packages: Ordered list of local paths to extra packages to be staged.
    staging_location: Staging location for the packages.
    temp_dir: Temporary folder where the resource building can happen. Caller
      is responsible for cleaning up this folder after this function returns.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in staging_location.

  Raises:
    RuntimeError: If files specified are not found or do not have expected
      name patterns.
  """
  resources = []
  staging_temp_dir = None
  local_packages = []
  for package in extra_packages:
    if not (os.path.basename(package).endswith('.tar') or
            os.path.basename(package).endswith('.tar.gz') or
            os.path.basename(package).endswith('.whl') or
            os.path.basename(package).endswith('.zip')):
      raise RuntimeError(
          'The --extra_package option expects a full path ending with '
          '".tar", ".tar.gz", ".whl" or ".zip" instead of %s' % package)
    if os.path.basename(package).endswith('.whl'):
      logging.warning(
          'The .whl package "%s" is provided in --extra_package. '
          'This functionality is not officially supported. Since wheel '
          'packages are binary distributions, this package must be '
          'binary-compatible with the worker environment (e.g. Python 2.7 '
          'running on an x64 Linux host).', package)

    if not os.path.isfile(package):
      if package.startswith('gs://'):
        if not staging_temp_dir:
          staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
        logging.info('Downloading extra package: %s locally before staging',
                     package)
        if os.path.isfile(staging_temp_dir):
          local_file_path = staging_temp_dir
        else:
          _, last_component = FileSystems.split(package)
          local_file_path = FileSystems.join(staging_temp_dir, last_component)
        _dependency_file_copy(package, local_file_path)
      else:
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % package)
    else:
      local_packages.append(package)

  if staging_temp_dir:
    local_packages.extend(
        [FileSystems.join(staging_temp_dir, f) for f in os.listdir(
            staging_temp_dir)])

  for package in local_packages:
    basename = os.path.basename(package)
    staged_path = FileSystems.join(staging_location, basename)
    file_copy(package, staged_path)
    resources.append(basename)
  # Create a file containing the list of extra packages and stage it.
  # The file is important so that in the worker the packages are installed
  # exactly in the order specified. This approach will avoid extra PyPI
  # requests. For example if package A depends on package B and package A
  # is installed first then the installer will try to satisfy the
  # dependency on B by downloading the package from PyPI. If package B is
  # installed first this is avoided.
  with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f:
    for package in local_packages:
      f.write('%s\n' % os.path.basename(package))
  staged_path = FileSystems.join(staging_location, EXTRA_PACKAGES_FILE)
  # Note that the caller of this function is responsible for deleting the
  # temporary folder where all temp files are created, including this one.
  file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path)
  resources.append(EXTRA_PACKAGES_FILE)

  return resources
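
A short sketch of staging two local extra packages with the helper above; the file names and bucket are hypothetical, and the caller cleans up temp_dir as the docstring requires:

import shutil
import tempfile

temp_dir = tempfile.mkdtemp()
try:
  staged = _stage_extra_packages(
      ['dist/my_helper-0.1.tar.gz',                       # hypothetical paths
       'dist/my_binary_dep-0.1-py2-none-any.whl'],
      'gs://my-bucket/staging',
      temp_dir=temp_dir)
  # 'staged' contains both package names plus EXTRA_PACKAGES_FILE, which the
  # worker reads to install the packages in exactly this order.
finally:
  shutil.rmtree(temp_dir)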
Exemplo n.º 42
0
  def stage_job_resources(self,
                          options,
                          build_setup_args=None,
                          temp_dir=None,
                          populate_requirements_cache=None,
                          staging_location=None):
    """For internal use only; no backwards-compatibility guarantees.

        Creates (if needed) and stages job resources to staging_location.

        Args:
          options: Command line options. More specifically the function will
            expect requirements_file, setup_file, and save_main_session options
            to be present.
          build_setup_args: A list of command line arguments used to build a
            setup package. Used only if options.setup_file is not None. Used
            only for testing.
          temp_dir: Temporary folder where the resource building can happen. If
            None then a unique temp directory will be created. Used only for
            testing.
          populate_requirements_cache: Callable for populating the requirements
            cache. Used only for testing.
          staging_location: Location to stage the file.

        Returns:
          A list of file names (no paths) for the resources staged. All the
          files are assumed to be staged at staging_location.

        Raises:
          RuntimeError: If files specified are not found or error encountered
          while trying to create the resources (e.g., build a setup package).
        """
    temp_dir = temp_dir or tempfile.mkdtemp()
    resources = []

    setup_options = options.view_as(SetupOptions)
    # Make sure that all required options are specified.
    if staging_location is None:
      raise RuntimeError('The staging_location must be specified.')

    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
      if not os.path.isfile(setup_options.requirements_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--requirements_file command line option.' %
            setup_options.requirements_file)
      staged_path = FileSystems.join(staging_location, REQUIREMENTS_FILE)
      self.stage_artifact(setup_options.requirements_file, staged_path)
      resources.append(REQUIREMENTS_FILE)
      requirements_cache_path = (
          os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
          if setup_options.requirements_cache is None else
          setup_options.requirements_cache)
      # Populate cache with packages from requirements and stage the files
      # in the cache.
      if not os.path.exists(requirements_cache_path):
        os.makedirs(requirements_cache_path)
      (populate_requirements_cache if populate_requirements_cache else
       Stager._populate_requirements_cache)(setup_options.requirements_file,
                                            requirements_cache_path)
      for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
        self.stage_artifact(
            pkg, FileSystems.join(staging_location, os.path.basename(pkg)))
        resources.append(os.path.basename(pkg))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a remote path and the file cannot
    # be created directly there.
    if setup_options.setup_file is not None:
      if not os.path.isfile(setup_options.setup_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % setup_options.setup_file)
      if os.path.basename(setup_options.setup_file) != 'setup.py':
        raise RuntimeError(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of %s' % setup_options.setup_file)
      tarball_file = Stager._build_setup_package(setup_options.setup_file,
                                                 temp_dir, build_setup_args)
      staged_path = FileSystems.join(staging_location, WORKFLOW_TARBALL_FILE)
      self.stage_artifact(tarball_file, staged_path)
      resources.append(WORKFLOW_TARBALL_FILE)

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
      resources.extend(
          self._stage_extra_packages(
              setup_options.extra_packages, staging_location,
              temp_dir=temp_dir))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a remote path and the
    # file cannot be created directly there.
    if setup_options.save_main_session:
      pickled_session_file = os.path.join(temp_dir,
                                          names.PICKLED_MAIN_SESSION_FILE)
      pickler.dump_session(pickled_session_file)
      staged_path = FileSystems.join(staging_location,
                                     names.PICKLED_MAIN_SESSION_FILE)
      self.stage_artifact(pickled_session_file, staged_path)
      resources.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):

      if (setup_options.sdk_location == 'default') or Stager._is_remote_path(
          setup_options.sdk_location):
        # If --sdk_location is not specified then the appropriate package
        # will be obtained from PyPI (https://pypi.python.org) based on the
        # version of the currently running SDK. If the option is
        # present then no version matching is performed and the exact URL or
        # path is expected.
        #
        # Unit tests running in the 'python setup.py test' context will
        # not have the sdk_location attribute present and therefore we
        # will not stage the SDK.
        sdk_remote_location = (
            'pypi' if setup_options.sdk_location == 'default'
            else setup_options.sdk_location)
        resources.extend(
            self._stage_beam_sdk(sdk_remote_location, staging_location,
                                 temp_dir))
      else:
        # This branch is also used by internal tests running with the SDK built
        # at head.
        if os.path.isdir(setup_options.sdk_location):
          # TODO(angoenka): remove reference to Dataflow
          sdk_path = os.path.join(setup_options.sdk_location,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
        else:
          sdk_path = setup_options.sdk_location

        if os.path.isfile(sdk_path):
          logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
          staged_path = FileSystems.join(
              staging_location,
              Stager._desired_sdk_filename_in_staging_location(
                  setup_options.sdk_location))
          self.stage_artifact(sdk_path, staged_path)
          _, sdk_staged_filename = FileSystems.split(staged_path)
          resources.append(sdk_staged_filename)
        else:
          if setup_options.sdk_location == 'default':
            raise RuntimeError('Cannot find default Beam SDK tar file "%s"'
                               % sdk_path)
          elif not setup_options.sdk_location:
            logging.info('Beam SDK will not be staged since --sdk_location '
                         'is empty.')
          else:
            raise RuntimeError(
                'The file "%s" cannot be found. Its location was specified by '
                'the --sdk_location command-line option.' % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    self.commit_manifest()
    return resources
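
A minimal sketch of a concrete Stager subclass, to show how the method above is meant to be called. The subclass name, the parameter names of the overrides, the local staging directory, and the no-op manifest are assumptions for illustration rather than anything defined in the snippet:

import shutil

class LocalDirStager(Stager):
  """Copies each artifact to a local staging path (illustrative sketch)."""

  def stage_artifact(self, local_path_to_artifact, artifact_name):
    # artifact_name is the full staged path built by stage_job_resources;
    # this sketch assumes the target directory already exists.
    shutil.copyfile(local_path_to_artifact, artifact_name)

  def commit_manifest(self):
    pass  # nothing to finalize for plain file copies

# 'options' is a PipelineOptions instance; '/tmp/staging' is a hypothetical
# local directory used here in place of a gs:// path.
resources = LocalDirStager().stage_job_resources(
    options, staging_location='/tmp/staging')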