Example #1
 def test_match_file_exception(self):
   # Match files with None so that it throws an exception
   with self.assertRaises(BeamIOError) as error:
     FileSystems.match([None])
   self.assertTrue(
       str(error.exception).startswith('Unable to get the Filesystem'))
   self.assertEqual(list(error.exception.exception_details.keys()), [None])
Example #2
  def _check_state_for_finalize_write(self, writer_results, num_shards):
    """Checks writer output files' states.

    Returns:
      src_files, dst_files: Lists of files to rename. For each i, finalize_write
        should rename(src_files[i], dst_files[i]).
      delete_files: Src files to delete. These could be leftovers from an
        incomplete (non-atomic) rename operation.
      num_skipped: Tally of writer results files already renamed, such as from
        a previous run of finalize_write().
    """
    if not writer_results:
      return [], [], [], 0

    src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
    dst_glob = self._get_final_name_glob(num_shards)
    src_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([src_glob])
                         for file_metadata in mr.metadata_list)
    dst_glob_files = set(file_metadata.path
                         for mr in FileSystems.match([dst_glob])
                         for file_metadata in mr.metadata_list)

    src_files = []
    dst_files = []
    delete_files = []
    num_skipped = 0
    for shard_num, src in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      dst = final_name
      src_exists = src in src_glob_files
      dst_exists = dst in dst_glob_files
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)
    return src_files, dst_files, delete_files, num_skipped
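
Not part of the snippet above, but as a hedged sketch of how the four return values might be consumed by a finalize_write step: batch-rename the pending shards with FileSystems.rename and clean up leftovers with FileSystems.delete (the method name below is illustrative).

  def _finalize_write_sketch(self, writer_results, num_shards):
    # Illustrative caller for _check_state_for_finalize_write.
    src_files, dst_files, delete_files, num_skipped = (
        self._check_state_for_finalize_write(writer_results, num_shards))
    if delete_files:
      # Sources whose destinations already exist with matching checksums.
      FileSystems.delete(delete_files)
    if src_files:
      # Batch-rename the remaining shards to their final names.
      FileSystems.rename(src_files, dst_files)
    return num_skipped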
Example #3
  def test_match_file_empty(self):
    path = os.path.join(self.tmpdir, 'f2')  # Does not exist

    # Match files in the temp directory
    result = FileSystems.match([path])[0]
    files = [f.path for f in result.metadata_list]
    self.assertEqual(files, [])
Example #4
  def _get_concat_source(self):
    if self._concat_source is None:
      pattern = self._pattern.get()

      single_file_sources = []
      match_result = FileSystems.match([pattern])[0]
      files_metadata = match_result.metadata_list

      # We create a reference for FileBasedSource that will be serialized along
      # with each _SingleFileSource. To prevent this FileBasedSource from having
      # a reference to ConcatSource (resulting in quadratic space complexity)
      # we clone it here.
      file_based_source_ref = pickler.loads(pickler.dumps(self))

      for file_metadata in files_metadata:
        file_name = file_metadata.path
        file_size = file_metadata.size_in_bytes
        if file_size == 0:
          continue  # Ignoring empty file.

        # We determine splittability of this specific file.
        splittable = (
            self.splittable and
            _determine_splittability_from_compression_type(
                file_name, self._compression_type))

        single_file_source = _SingleFileSource(
            file_based_source_ref, file_name,
            0,
            file_size,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable)
        single_file_sources.append(single_file_source)
      self._concat_source = concat_source.ConcatSource(single_file_sources)
    return self._concat_source
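
The helper _determine_splittability_from_compression_type is not included in this snippet; judging from the inline logic in Example #23 further down, it presumably behaves roughly like the sketch below (an assumption, not the verbatim implementation).

from apache_beam.io.filesystem import CompressionTypes

def _determine_splittability_from_compression_type(file_path, compression_type):
  # A file is only treated as splittable if it is (or is detected to be)
  # uncompressed; any real compression disables splitting.
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)
  return compression_type == CompressionTypes.UNCOMPRESSED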
Example #5
 def process(self, unused_element, signal):
     gcs_location = self.get_destination_uri()
     match_result = FileSystems.match([gcs_location])[0].metadata_list
     logging.debug("%s: matched %s files", self.__class__.__name__,
                   len(match_result))
     paths = [x.path for x in match_result]
     FileSystems.delete(paths)
Example #6
    def test_match_file(self):
        path = os.path.join(self.tmpdir, 'f1')
        open(path, 'a').close()

        # Match files in the temp directory
        result = FileSystems.match([path])[0]
        files = [f.path for f in result.metadata_list]
        self.assertEqual(files, [path])
Example #7
  def test_match_file(self):
    path = os.path.join(self.tmpdir, 'f1')
    open(path, 'a').close()

    # Match files in the temp directory
    result = FileSystems.match([path])[0]
    files = [f.path for f in result.metadata_list]
    self.assertEqual(files, [path])
Example #8
    def _validate(self):
        """Validate if there are actual files in the specified glob pattern
    """
        pattern = self._pattern.get()

        # Limit the responses as we only want to check if something exists
        match_result = FileSystems.match([pattern], limits=[1])[0]
        if len(match_result.metadata_list) <= 0:
            raise IOError('No files found based on the file pattern %s' %
                          pattern)
Example #9
  def _validate(self):
    """Validate if there are actual files in the specified glob pattern
    """
    pattern = self._pattern.get()

    # Limit the responses as we only want to check if something exists
    match_result = FileSystems.match([pattern], limits=[1])[0]
    if len(match_result.metadata_list) <= 0:
      raise IOError(
          'No files found based on the file pattern %s' % pattern)
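
The limits argument caps how many matches are returned per pattern, which keeps this existence check cheap even on large globs. A standalone sketch of the same idea (pattern_has_matches is a hypothetical helper, not part of the example):

from apache_beam.io.filesystems import FileSystems

def pattern_has_matches(pattern):
  # Request at most one match; that is enough to prove something exists.
  match_result = FileSystems.match([pattern], limits=[1])[0]
  return bool(match_result.metadata_list)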
Example #10
  def pre_finalize(self, init_result, writer_results):
    num_shards = len(list(writer_results))
    dst_glob = self._get_final_name_glob(num_shards)
    dst_glob_files = [file_metadata.path
                      for mr in FileSystems.match([dst_glob])
                      for file_metadata in mr.metadata_list]

    if dst_glob_files:
      logging.warning('Deleting %d existing files in target path matching: %s',
                      len(dst_glob_files), self.shard_name_glob_format)
      FileSystems.delete(dst_glob_files)
Example #11
  def test_match_directory(self):
    path1 = os.path.join(self.tmpdir, 'f1')
    path2 = os.path.join(self.tmpdir, 'f2')
    open(path1, 'a').close()
    open(path2, 'a').close()

    # Match both the files in the directory
    path = os.path.join(self.tmpdir, '*')
    result = FileSystems.match([path])[0]
    files = [f.path for f in result.metadata_list]
    self.assertEqual(files, [path1, path2])
Example #12
    def test_match_directory(self):
        path1 = os.path.join(self.tmpdir, 'f1')
        path2 = os.path.join(self.tmpdir, 'f2')
        open(path1, 'a').close()
        open(path2, 'a').close()

        # Match both the files in the directory
        path = os.path.join(self.tmpdir, '*')
        result = FileSystems.match([path])[0]
        files = [f.path for f in result.metadata_list]
        self.assertEqual(files, [path1, path2])
Example #13
  def pre_finalize(self, init_result, writer_results):
    num_shards = len(list(writer_results))
    dst_glob = self._get_final_name_glob(num_shards)
    dst_glob_files = [file_metadata.path
                      for mr in FileSystems.match([dst_glob])
                      for file_metadata in mr.metadata_list]

    if dst_glob_files:
      logging.warning('Deleting %d existing files in target path matching: %s',
                      len(dst_glob_files), self.shard_name_glob_format)
      FileSystems.delete(dst_glob_files)
Example #14
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list
    ]
    logging.info("Reading %i files from %s.", len(sentence_files),
                 args.sentence_files)
    assert len(sentence_files) > 0
    sentence_files = p | beam.Create(sentence_files)
    examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts))

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (serialized_examples | ("write " + name) >> write_sink(
            os.path.join(args.output_dir, name),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        ))

    result = p.run()
    result.wait_until_finish()
Example #15
  def _export_files(
      self,
      bq: bigquery_tools.BigQueryWrapper,
      element: 'ReadFromBigQueryRequest',
      table_reference: TableReference):
    """Runs a BigQuery export job.

    Returns:
      bigquery.TableSchema instance, a list of FileMetadata instances
    """
    job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
        self.bigquery_job_labels)
    export_job_name = bigquery_tools.generate_bq_job_name(
        self._job_name,
        self._source_uuid,
        bigquery_tools.BigQueryJobTypes.EXPORT,
        element.obj_id)
    temp_location = self.options.view_as(GoogleCloudOptions).temp_location
    gcs_location = bigquery_export_destination_uri(
        self.gcs_location,
        temp_location,
        '%s%s' % (self._source_uuid, element.obj_id))
    if self.use_json_exports:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.JSON,
                                       project=self._get_project(),
                                       job_labels=job_labels,
                                       include_header=False)
    else:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.AVRO,
                                       project=self._get_project(),
                                       include_header=False,
                                       job_labels=job_labels,
                                       use_avro_logical_types=True)
    bq.wait_for_bq_job(job_ref)
    metadata_list = FileSystems.match([gcs_location])[0].metadata_list

    if isinstance(table_reference, ValueProvider):
      table_ref = bigquery_tools.parse_table_reference(
          element.table, project=self._get_project())
    else:
      table_ref = table_reference
    table = bq.get_table(
        table_ref.projectId, table_ref.datasetId, table_ref.tableId)

    return table.schema, metadata_list
Example #16
 def estimate_size(self):
     try:
         pattern = self._pattern.get()
     except Exception:
         # The pattern may not be resolvable yet (e.g. a runtime ValueProvider).
         return None
     match_result = FileSystems.match([pattern])[0]
     # size = 0
     # for f in match_result.metadata_list:
     #     if f.path[-4:] in ['.mp4', '.MP4']:
     #         size += 100*f.size_in_bytes
     #     else:
     #         size += f.size_in_bytes
     # return int(size)
     return sum([f.size_in_bytes for f in match_result.metadata_list])
Example #17
    def _match_files(
            self, file_pattern: str) -> List[beam.io.filesystem.FileMetadata]:
        """Fetch files based on the file pattern.
        Args:
         file_pattern: Full path of the files containing data

        Returns:
         List of matching FileMetadata instances
        """
        match_results = FileSystems.match([file_pattern])
        match_result = match_results[0]

        if match_result.metadata_list:
            return match_result.metadata_list
Example #18
  def _read_with_retry(self):
    """Read path with retry if I/O failed"""
    read_lines = []
    match_result = FileSystems.match([self.file_path])[0]
    matched_path = [f.path for f in match_result.metadata_list]
    if not matched_path:
      raise IOError('No such file or directory: %s' % self.file_path)

    logging.info('Found %d files in %s: \n%s',
                 len(matched_path), self.file_path, '\n'.join(matched_path))
    for path in matched_path:
      with FileSystems.open(path, 'r') as f:
        for line in f:
          read_lines.append(line)
    return read_lines
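
The docstring promises a retry, but no retry logic is visible in the snippet; in Beam-based code this is commonly layered on with a decorator. A sketch under that assumption, using apache_beam.utils.retry (the retry count, filter, and function name are illustrative):

from apache_beam.io.filesystems import FileSystems
from apache_beam.utils import retry

@retry.with_exponential_backoff(
    num_retries=3, retry_filter=lambda exn: isinstance(exn, IOError))
def read_lines_with_retry(file_path):
  # The whole read is re-attempted a few times if an IOError escapes.
  # (In this simple sketch the 'not found' error below is retried too.)
  match_result = FileSystems.match([file_path])[0]
  paths = [f.path for f in match_result.metadata_list]
  if not paths:
    raise IOError('No such file or directory: %s' % file_path)
  lines = []
  for path in paths:
    with FileSystems.open(path) as f:
      lines.extend(f)
  return lines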
Example #19
    def process(self, element, *args, **kwargs):
        match_results = FileSystems.match([element])
        for metadata in match_results[0].metadata_list:
            splittable = (self._splittable
                          and _determine_splittability_from_compression_type(
                              metadata.path, self._compression_type))

            if splittable:
                for split in OffsetRange(0, metadata.size_in_bytes).split(
                        self._desired_bundle_size, self._min_bundle_size):
                    yield (metadata, split)
            else:
                yield (metadata,
                       OffsetRange(
                           0,
                           range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
Example #20
  def process(self, element, *args, **kwargs):
    match_results = FileSystems.match([element])
    for metadata in match_results[0].metadata_list:
      splittable = (
          self._splittable and
          _determine_splittability_from_compression_type(
              metadata.path, self._compression_type))

      if splittable:
        for split in OffsetRange(
            0, metadata.size_in_bytes).split(
                self._desired_bundle_size, self._min_bundle_size):
          yield (metadata, split)
      else:
        yield (metadata, OffsetRange(
            0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
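
For reference, OffsetRange.split is what turns a single matched file into several bundles. A tiny standalone illustration with made-up sizes:

from apache_beam.io.restriction_trackers import OffsetRange

# Split a 100-byte file into bundles of about 30 bytes (at least 10 each).
# This yields contiguous sub-ranges covering [0, 100), e.g. (0, 30), (30, 60), ...
for piece in OffsetRange(0, 100).split(30, 10):
  print(piece.start, piece.stop)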
Example #21
def _get_pipeline_mode(known_args):
  """Returns the mode the pipeline should operate in based on input size."""
  if known_args.optimize_for_large_inputs:
    return PipelineModes.LARGE

  match_results = FileSystems.match([known_args.input_pattern])
  if not match_results:
    raise ValueError('No files matched input_pattern: {}'.format(
        known_args.input_pattern))

  total_files = len(match_results[0].metadata_list)
  if total_files > _LARGE_DATA_THRESHOLD:
    return PipelineModes.LARGE
  elif total_files > _SMALL_DATA_THRESHOLD:
    return PipelineModes.MEDIUM

  return PipelineModes.SMALL
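
PipelineModes and the two thresholds are not part of this snippet; a minimal set of definitions that would make it self-contained might look like the following (the enum shape is implied by the code, the numbers are purely illustrative):

import enum

class PipelineModes(enum.Enum):
  SMALL = 1
  MEDIUM = 2
  LARGE = 3

# Illustrative cut-offs, measured in number of matched input files.
_SMALL_DATA_THRESHOLD = 100
_LARGE_DATA_THRESHOLD = 50000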
Example #22
    def _export_files(self, bq):
        """Runs a BigQuery export job.

        Returns:
          bigquery.TableSchema instance, a list of FileMetadata instances
        """
        job_id = uuid.uuid4().hex
        gcs_location = self.get_destination_uri()
        job_ref = bq.perform_extract_job([gcs_location],
                                         job_id,
                                         self.table_reference,
                                         bigquery_tools.FileFormat.JSON,
                                         include_header=False)
        bq.wait_for_bq_job(job_ref)
        metadata_list = FileSystems.match([gcs_location])[0].metadata_list

        table = bq.get_table(self.table_reference.projectId,
                             self.table_reference.datasetId,
                             self.table_reference.tableId)

        return table.schema, metadata_list
Example #23
    def _get_concat_source(self):
        if self._concat_source is None:
            pattern = self._pattern.get()

            single_file_sources = []
            match_result = FileSystems.match([pattern])[0]
            files_metadata = match_result.metadata_list

            # We create a reference for FileBasedSource that will be serialized along
            # with each _SingleFileSource. To prevent this FileBasedSource from having
            # a reference to ConcatSource (resulting in quadratic space complexity)
            # we clone it here.
            file_based_source_ref = pickler.loads(pickler.dumps(self))

            for file_metadata in files_metadata:
                file_name = file_metadata.path
                file_size = file_metadata.size_in_bytes
                if file_size == 0:
                    continue  # Ignoring empty file.

                # We determine splittability of this specific file.
                splittable = self.splittable
                if (splittable
                        and self._compression_type == CompressionTypes.AUTO):
                    compression_type = CompressionTypes.detect_compression_type(
                        file_name)
                    if compression_type != CompressionTypes.UNCOMPRESSED:
                        splittable = False

                single_file_source = _SingleFileSource(
                    file_based_source_ref,
                    file_name,
                    0,
                    file_size,
                    min_bundle_size=self._min_bundle_size,
                    splittable=splittable)
                single_file_sources.append(single_file_source)
            self._concat_source = concat_source.ConcatSource(
                single_file_sources)
        return self._concat_source
Example #24
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list
    ]
    logging.info("Reading %i files from %s.", len(sentence_files),
                 args.sentence_files)
    assert len(sentence_files) > 0
    sentence_files = p | beam.Create(sentence_files)
    serialized_examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts))

    serialized_examples = _shuffle_examples(serialized_examples)

    serialized_examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    (serialized_examples[_TrainTestSplitFn.TRAIN_TAG]
     | "write train" >> WriteToTFRecord(os.path.join(args.output_dir, "train"),
                                        file_name_suffix=".tfrecords",
                                        num_shards=args.num_shards_train))
    (serialized_examples[_TrainTestSplitFn.TEST_TAG]
     | "write test" >> WriteToTFRecord(os.path.join(args.output_dir, "test"),
                                       file_name_suffix=".tfrecords",
                                       num_shards=args.num_shards_test))

    result = p.run()
    result.wait_until_finish()
Example #25
    def test_find_orphaned_files(self):
        dir = self._new_tempdir()

        write_transform = beam.io.fileio.WriteToFiles(path=dir)

        def write_orphaned_file(temp_dir, writer_key):
            temp_dir_path = FileSystems.join(dir, temp_dir)

            file_prefix_dir = FileSystems.join(temp_dir_path,
                                               str(abs(hash(writer_key))))

            file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
            with FileSystems.create(file_name) as f:
                f.write(b'Hello y\'all')

            return file_name

        with TestPipeline() as p:
            _ = (p
                 | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
                 | "Serialize" >> beam.Map(json.dumps)
                 | write_transform)

            # Pre-create the temp directory.
            temp_dir_path = FileSystems.mkdirs(
                FileSystems.join(dir, write_transform._temp_directory.get()))
            write_orphaned_file(write_transform._temp_directory.get(),
                                (None, GlobalWindow()))
            f2 = write_orphaned_file(write_transform._temp_directory.get(),
                                     ('other-dest', GlobalWindow()))

        temp_dir_path = FileSystems.join(dir,
                                         write_transform._temp_directory.get())
        leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
        found_files = [m.path for m in leftovers[0].metadata_list]
        self.assertListEqual(found_files, [f2])
Example #26
 def _InferArrowSchema(self):
     match_result = FileSystems.match([self._file_pattern])[0]
     files_metadata = match_result.metadata_list[0]
     with FileSystems.open(files_metadata.path) as f:
         return pq.read_schema(f)
Example #27
def find_matching_filenames(pattern):
    return (x.path for x in FileSystems.match([pattern])[0].metadata_list)
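
A quick usage note: since this returns a generator, callers can iterate over large match results lazily or materialize them on demand (the bucket path below is made up):

for filename in find_matching_filenames('gs://my-bucket/logs/*.json'):
  print(filename)

# Or materialize everything at once:
all_paths = list(find_matching_filenames('gs://my-bucket/logs/*.json'))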
Example #28
 def test_match_directory(self):
     result = FileSystems.match([self.tmpdir])[0]
     files = [f.path for f in result.metadata_list]
     self.assertEqual(files, [self.tmpdir])
Example #29
def _compute_target_info_blob(
    path: str, depth: Union[int, float], return_generators=False
) -> TargetInfo:
    """Computes target info for a file that is externalized on Blob Storage, meaning
    that it's contained within an indexed archive file.

    Args:
        path (str): The path that refers to the specified target.
        depth (Union[int, float]): Depth until which directory contents are resolved.
        return_generators (bool, optional): If set to True, the 'contents' key of directories is equal to a generator instead of a list. Defaults to False.

    Raises:
        PathException: Path not found or invalid.

    Returns:
        TargetInfo: Target info of specified path.
    """

    linked_bundle_path = parse_linked_bundle_url(path)
    if not FileSystems.exists(linked_bundle_path.bundle_path):
        raise PathException(linked_bundle_path.bundle_path)
    if not linked_bundle_path.is_archive:
        # Single file
        raise PathException(
            "Single files on Blob Storage are not supported; only a path within an archive file is supported."
        )

    # process_contents is used to process the value of the 'contents' key (which is a generator) before it is returned.
    # If return_generators is False, it resolves the given generator into a list; otherwise, it just returns
    # the generator unchanged.
    process_contents = list if return_generators is False else lambda x: x

    with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
        islink = lambda finfo: stat.S_ISLNK(finfo.mode)
        readlink = lambda finfo: finfo.linkname
        isfile = lambda finfo: not stat.S_ISDIR(finfo.mode)
        isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
        listdir = lambda path: cast(Dict[str, FileInfo], tf.listDir(path) or {})

        def _get_info(path: str, depth: Union[int, float]) -> TargetInfo:
            """This function is called to get the target info of the specified path.
            If the specified path is a directory and additional depth is requested, this
            function is recursively called to retrieve the target info of files within
            the directory, much like _compute_target_info_local.
            """
            if not path.startswith("/"):
                path = "/" + path
            finfo = cast(FileInfo, tf.getFileInfo(path))
            if finfo is None:
                # Not found
                raise PathException("File not found.")
            result: TargetInfo = {
                'name': os.path.basename(path),  # get last part of path
                'size': finfo.size,
                'perm': finfo.mode & 0o777,
                'type': '',
            }
            if islink(finfo):
                result['type'] = 'link'
                result['link'] = readlink(finfo)
            elif isfile(finfo):
                result['type'] = 'file'
            elif isdir(finfo):
                result['type'] = 'directory'
                if depth > 0:
                    result['contents'] = process_contents(
                        _get_info(path + "/" + file_name, depth - 1)
                        for file_name in listdir(path)
                        if file_name != "."
                    )
            return result

        if not linked_bundle_path.is_archive_dir:
            # Return the contents of the single .gz file.
            # The entry returned by ratarmount for a single .gz file is not technically part of a tar archive
            # and has a name hardcoded as "contents," so we modify the type, name, and permissions of
            # the output accordingly.
            return cast(
                TargetInfo,
                dict(
                    _get_info("/contents", depth),
                    type="file",
                    name=linked_bundle_path.bundle_uuid,
                    perm=0o755,
                ),
            )
        if linked_bundle_path.archive_subpath:
            # Return the contents of a subpath within a directory.
            return _get_info(linked_bundle_path.archive_subpath, depth)
        else:
            # No subpath, return the entire directory with the bundle
            # contents in it. The permissions of this directory
            # cannot be set by the user (the user can only set permissions
            # of files *within* this directory that are part of the bundle
            # itself), so we just return a placeholder value of 0o755
            # for this directory's permissions.
            file = FileSystems.match([path])[0].metadata_list[0]
            result: TargetInfo = {
                'name': linked_bundle_path.bundle_uuid,
                'type': 'directory',
                'size': file.size_in_bytes,
                'perm': 0o755,
            }
            if depth > 0:
                result['contents'] = process_contents(
                    _get_info(file_name, depth - 1)
                    for file_name in listdir("/")
                    if file_name != "."
                )
            return result
Example #30
 def test_match_file_exception(self):
     # Match files with None so that it throws an exception
     with self.assertRaisesRegex(BeamIOError,
                                 r'^Unable to get the Filesystem') as error:
         FileSystems.match([None])
     self.assertEqual(list(error.exception.exception_details), [None])
Example #31
 def estimate_size(self):
     pattern = self._pattern.get()
     match_result = FileSystems.match([pattern])[0]
     return sum([f.size_in_bytes for f in match_result.metadata_list])
Example #32
 def estimate_size(self):
   pattern = self._pattern.get()
   match_result = FileSystems.match([pattern])[0]
   return sum([f.size_in_bytes for f in match_result.metadata_list])
Example #33
 def test_match_directory(self):
   result = FileSystems.match([self.tmpdir])[0]
   files = [f.path for f in result.metadata_list]
   self.assertEqual(files, [self.tmpdir])
Example #34
input_files = app_args.input
output_filename = 'output.txt'

options = PipelineOptions()
gcloud_options = options.view_as(GoogleCloudOptions)
# gcloud_options.project = project_id
gcloud_options.job_name = 'import-citybikes'

# Dataflow runner
runner = os.environ['DATAFLOW_RUNNER']
options.view_as(StandardOptions).runner = runner

with apache_beam.Pipeline(options=options) as p:

    inputs = []
    for match in FileSystems.match([input_files]):
        for file in match.metadata_list:
            inputs.append(file.path)

    files = (p | apache_beam.Create(inputs))

    utils = Utils()
    read = (
        files | ReadAllFromText()
        | apache_beam.Map(lambda x, utils=utils, inputs=inputs:
                          (utils.get_basename(inputs.pop(0)), x)
                          if len(inputs) > 0 else ("", ""))
        #apache_beam.Map(lambda x: (get_basename(inputs.pop(0)), x))
    )

    rows = (read | apache_beam.ParDo(Split()))
Example #35
 def test_match_file_exception(self):
   # Match files with None so that it throws an exception
   with self.assertRaisesRegex(BeamIOError,
                               r'^Unable to get the Filesystem') as error:
     FileSystems.match([None])
   self.assertEqual(list(error.exception.exception_details), [None])