def test_match_file_exception(self):
  # Match files with None so that it throws an exception
  with self.assertRaises(BeamIOError) as error:
    FileSystems.match([None])
  self.assertTrue(
      error.exception.message.startswith('Unable to get the Filesystem'))
  self.assertEqual(error.exception.exception_details.keys(), [None])
def _check_state_for_finalize_write(self, writer_results, num_shards):
  """Checks writer output files' states.

  Returns:
    src_files, dst_files: Lists of files to rename. For each i, finalize_write
      should rename(src_files[i], dst_files[i]).
    delete_files: Src files to delete. These could be leftovers from an
      incomplete (non-atomic) rename operation.
    num_skipped: Tally of writer results files already renamed, such as from
      a previous run of finalize_write().
  """
  if not writer_results:
    return [], [], [], 0

  src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
  dst_glob = self._get_final_name_glob(num_shards)
  src_glob_files = set(
      file_metadata.path for mr in FileSystems.match([src_glob])
      for file_metadata in mr.metadata_list)
  dst_glob_files = set(
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list)

  src_files = []
  dst_files = []
  delete_files = []
  num_skipped = 0
  for shard_num, src in enumerate(writer_results):
    final_name = self._get_final_name(shard_num, num_shards)
    dst = final_name
    src_exists = src in src_glob_files
    dst_exists = dst in dst_glob_files
    if not src_exists and not dst_exists:
      raise BeamIOError(
          'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
    if not src_exists and dst_exists:
      logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
      num_skipped += 1
      continue
    if (src_exists and dst_exists and
        FileSystems.checksum(src) == FileSystems.checksum(dst)):
      logging.debug('src: %s == dst: %s, deleting src', src, dst)
      delete_files.append(src)
      continue

    src_files.append(src)
    dst_files.append(dst)

  return src_files, dst_files, delete_files, num_skipped
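# Hedged caller-side sketch (not from the source): how a caller such as
# finalize_write might consume the four return values above. The call
# sequence is an assumption; only FileSystems.rename and FileSystems.delete
# are real Beam filesystem APIs.
src_files, dst_files, delete_files, num_skipped = (
    self._check_state_for_finalize_write(writer_results, num_shards))
if delete_files:
  # Remove leftovers of a previously interrupted (non-atomic) rename.
  FileSystems.delete(delete_files)
if src_files:
  # Batch-rename temporary shard files to their final names.
  FileSystems.rename(src_files, dst_files)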
def test_match_file_empty(self):
  path = os.path.join(self.tmpdir, 'f2')  # Does not exist

  # Match files in the temp directory
  result = FileSystems.match([path])[0]
  files = [f.path for f in result.metadata_list]
  self.assertEqual(files, [])
def _get_concat_source(self):
  if self._concat_source is None:
    pattern = self._pattern.get()

    single_file_sources = []
    match_result = FileSystems.match([pattern])[0]
    files_metadata = match_result.metadata_list

    # We create a reference for FileBasedSource that will be serialized along
    # with each _SingleFileSource. To prevent this FileBasedSource from having
    # a reference to ConcatSource (resulting in quadratic space complexity)
    # we clone it here.
    file_based_source_ref = pickler.loads(pickler.dumps(self))

    for file_metadata in files_metadata:
      file_name = file_metadata.path
      file_size = file_metadata.size_in_bytes
      if file_size == 0:
        continue  # Ignoring empty file.

      # We determine splittability of this specific file.
      splittable = (
          self.splittable and _determine_splittability_from_compression_type(
              file_name, self._compression_type))

      single_file_source = _SingleFileSource(
          file_based_source_ref,
          file_name,
          0,
          file_size,
          min_bundle_size=self._min_bundle_size,
          splittable=splittable)
      single_file_sources.append(single_file_source)
    self._concat_source = concat_source.ConcatSource(single_file_sources)
  return self._concat_source
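# Hedged sketch (an assumption, not copied from the source): what
# _determine_splittability_from_compression_type plausibly does, consistent
# with the logic inlined in the second _get_concat_source variant further
# down. A file is treated as splittable only when it is, or is detected to
# be, uncompressed.
def _determine_splittability_from_compression_type(file_path, compression_type):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)
  return compression_type == CompressionTypes.UNCOMPRESSED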
def process(self, unused_element, signal):
  gcs_location = self.get_destination_uri()
  match_result = FileSystems.match([gcs_location])[0].metadata_list
  logging.debug(
      "%s: matched %s files", self.__class__.__name__, len(match_result))
  paths = [x.path for x in match_result]
  FileSystems.delete(paths)
def test_match_file(self):
  path = os.path.join(self.tmpdir, 'f1')
  open(path, 'a').close()

  # Match files in the temp directory
  result = FileSystems.match([path])[0]
  files = [f.path for f in result.metadata_list]
  self.assertEqual(files, [path])
def _validate(self):
  """Validate if there are actual files in the specified glob pattern."""
  pattern = self._pattern.get()

  # Limit the responses as we only want to check if something exists
  match_result = FileSystems.match([pattern], limits=[1])[0]
  if len(match_result.metadata_list) <= 0:
    raise IOError('No files found based on the file pattern %s' % pattern)
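# Usage sketch for the `limits` argument of FileSystems.match: it is a list
# parallel to the patterns list and caps the number of results returned per
# pattern, so a single result suffices for an existence check. The bucket
# pattern below is illustrative only.
from apache_beam.io.filesystems import FileSystems

result = FileSystems.match(['gs://my-bucket/data/*.csv'], limits=[1])[0]
exists = bool(result.metadata_list)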
def pre_finalize(self, init_result, writer_results):
  num_shards = len(list(writer_results))
  dst_glob = self._get_final_name_glob(num_shards)
  dst_glob_files = [
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list
  ]

  if dst_glob_files:
    logging.warn(
        'Deleting %d existing files in target path matching: %s',
        len(dst_glob_files), self.shard_name_glob_format)
    FileSystems.delete(dst_glob_files)
def test_match_directory(self):
  path1 = os.path.join(self.tmpdir, 'f1')
  path2 = os.path.join(self.tmpdir, 'f2')
  open(path1, 'a').close()
  open(path2, 'a').close()

  # Match both the files in the directory
  path = os.path.join(self.tmpdir, '*')
  result = FileSystems.match([path])[0]
  files = [f.path for f in result.metadata_list]
  self.assertEqual(files, [path1, path2])
def run(argv=None):
  """Run the beam pipeline."""
  args, pipeline_args = _parse_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  sentence_files_match = FileSystems.match([args.sentence_files])[0]
  sentence_files = [
      file_metadata.path
      for file_metadata in sentence_files_match.metadata_list
  ]
  logging.info(
      "Reading %i files from %s.", len(sentence_files), args.sentence_files)
  assert len(sentence_files) > 0
  sentence_files = p | beam.Create(sentence_files)

  examples = sentence_files | "create examples" >> beam.FlatMap(
      partial(
          _create_examples_from_file,
          min_length=args.min_length,
          max_length=args.max_length,
          num_extra_contexts=args.num_extra_contexts))

  examples = _shuffle_examples(examples)

  examples |= "split train and test" >> beam.ParDo(
      _TrainTestSplitFn(args.train_split)).with_outputs(
          _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

  if args.dataset_format == _JSON_FORMAT:
    write_sink = WriteToText
    file_name_suffix = ".json"
    serialize_fn = json.dumps
  else:
    assert args.dataset_format == _TF_FORMAT
    write_sink = WriteToTFRecord
    file_name_suffix = ".tfrecord"
    serialize_fn = _features_to_serialized_tf_example

  for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                    ("test", _TrainTestSplitFn.TEST_TAG)]:
    serialized_examples = examples[tag] | (
        "serialize {} examples".format(name) >> beam.Map(serialize_fn))
    (serialized_examples | ("write " + name) >> write_sink(
        os.path.join(args.output_dir, name),
        file_name_suffix=file_name_suffix,
        num_shards=args.num_shards_train,
    ))

  result = p.run()
  result.wait_until_finish()
def _export_files(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest',
    table_reference: TableReference):
  """Runs a BigQuery export job.

  Returns:
    bigquery.TableSchema instance, a list of FileMetadata instances
  """
  job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
      self.bigquery_job_labels)
  export_job_name = bigquery_tools.generate_bq_job_name(
      self._job_name,
      self._source_uuid,
      bigquery_tools.BigQueryJobTypes.EXPORT,
      element.obj_id)
  temp_location = self.options.view_as(GoogleCloudOptions).temp_location
  gcs_location = bigquery_export_destination_uri(
      self.gcs_location,
      temp_location,
      '%s%s' % (self._source_uuid, element.obj_id))
  if self.use_json_exports:
    job_ref = bq.perform_extract_job([gcs_location],
                                     export_job_name,
                                     table_reference,
                                     bigquery_tools.FileFormat.JSON,
                                     project=self._get_project(),
                                     job_labels=job_labels,
                                     include_header=False)
  else:
    job_ref = bq.perform_extract_job([gcs_location],
                                     export_job_name,
                                     table_reference,
                                     bigquery_tools.FileFormat.AVRO,
                                     project=self._get_project(),
                                     include_header=False,
                                     job_labels=job_labels,
                                     use_avro_logical_types=True)
  bq.wait_for_bq_job(job_ref)
  metadata_list = FileSystems.match([gcs_location])[0].metadata_list

  if isinstance(table_reference, ValueProvider):
    table_ref = bigquery_tools.parse_table_reference(
        element.table, project=self._get_project())
  else:
    table_ref = table_reference

  table = bq.get_table(
      table_ref.projectId, table_ref.datasetId, table_ref.tableId)

  return table.schema, metadata_list
def estimate_size(self):
  try:
    pattern = self._pattern.get()
  except Exception:
    # The pattern is not yet accessible (e.g. a deferred ValueProvider).
    return None
  match_result = FileSystems.match([pattern])[0]
  return sum([f.size_in_bytes for f in match_result.metadata_list])
def _match_files(
    self, file_pattern: str) -> List[beam.io.filesystem.FileMetadata]:
  """Fetch files based on the file pattern.

  Args:
    file_pattern: Full path of the files containing data.

  Returns:
    List of matching FileMetadata instances.
  """
  match_results = FileSystems.match([file_pattern])
  match_result = match_results[0]
  if match_result.metadata_list:
    return match_result.metadata_list
  # Match the declared return type instead of implicitly returning None.
  return []
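# Hypothetical caller-side sketch (names and pattern are illustrative):
# always check for an empty result before using the metadata.
metadata_list = self._match_files('gs://my-bucket/input/*.tfrecord')
if not metadata_list:
  raise ValueError('No files matched the given pattern.')
total_bytes = sum(m.size_in_bytes for m in metadata_list)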
def _read_with_retry(self):
  """Read path with retry if I/O failed"""
  read_lines = []
  match_result = FileSystems.match([self.file_path])[0]
  matched_path = [f.path for f in match_result.metadata_list]
  if not matched_path:
    raise IOError('No such file or directory: %s' % self.file_path)

  logging.info(
      'Find %d files in %s: \n%s',
      len(matched_path), self.file_path, '\n'.join(matched_path))
  for path in matched_path:
    with FileSystems.open(path, 'r') as f:
      for line in f:
        read_lines.append(line)
  return read_lines
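# The method name suggests the original source wraps this read in a retry
# policy. A hedged sketch using Beam's retry helper; the decorator arguments
# and the standalone function are assumptions, not taken from the source.
from apache_beam.io.filesystems import FileSystems
from apache_beam.utils import retry


@retry.with_exponential_backoff(num_retries=5, initial_delay_secs=1)
def read_lines_with_retry(file_path):
  """Reads every line of every file matched by file_path, with retries."""
  lines = []
  for mr in FileSystems.match([file_path]):
    for metadata in mr.metadata_list:
      with FileSystems.open(metadata.path) as f:
        lines.extend(f)
  return lines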
def process(self, element, *args, **kwargs):
  match_results = FileSystems.match([element])
  for metadata in match_results[0].metadata_list:
    splittable = (
        self._splittable and _determine_splittability_from_compression_type(
            metadata.path, self._compression_type))

    if splittable:
      for split in OffsetRange(0, metadata.size_in_bytes).split(
          self._desired_bundle_size, self._min_bundle_size):
        yield (metadata, split)
    else:
      yield (
          metadata,
          OffsetRange(0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY))
def _get_pipeline_mode(known_args):
  """Returns the mode the pipeline should operate in based on input size."""
  if known_args.optimize_for_large_inputs:
    return PipelineModes.LARGE

  match_results = FileSystems.match([known_args.input_pattern])
  if not match_results:
    raise ValueError(
        'No files matched input_pattern: {}'.format(known_args.input_pattern))

  total_files = len(match_results[0].metadata_list)
  if total_files > _LARGE_DATA_THRESHOLD:
    return PipelineModes.LARGE
  elif total_files > _SMALL_DATA_THRESHOLD:
    return PipelineModes.MEDIUM
  return PipelineModes.SMALL
def _export_files(self, bq):
  """Runs a BigQuery export job.

  Returns:
    bigquery.TableSchema instance, a list of FileMetadata instances
  """
  job_id = uuid.uuid4().hex
  gcs_location = self.get_destination_uri()
  job_ref = bq.perform_extract_job([gcs_location],
                                   job_id,
                                   self.table_reference,
                                   bigquery_tools.FileFormat.JSON,
                                   include_header=False)
  bq.wait_for_bq_job(job_ref)
  metadata_list = FileSystems.match([gcs_location])[0].metadata_list

  table = bq.get_table(
      self.table_reference.projectId, self.table_reference.datasetId,
      self.table_reference.tableId)

  return table.schema, metadata_list
def _get_concat_source(self):
  if self._concat_source is None:
    pattern = self._pattern.get()

    single_file_sources = []
    match_result = FileSystems.match([pattern])[0]
    files_metadata = match_result.metadata_list

    # We create a reference for FileBasedSource that will be serialized along
    # with each _SingleFileSource. To prevent this FileBasedSource from having
    # a reference to ConcatSource (resulting in quadratic space complexity)
    # we clone it here.
    file_based_source_ref = pickler.loads(pickler.dumps(self))

    for file_metadata in files_metadata:
      file_name = file_metadata.path
      file_size = file_metadata.size_in_bytes
      if file_size == 0:
        continue  # Ignoring empty file.

      # We determine splittability of this specific file.
      splittable = self.splittable
      if (splittable and
          self._compression_type == CompressionTypes.AUTO):
        compression_type = CompressionTypes.detect_compression_type(
            file_name)
        if compression_type != CompressionTypes.UNCOMPRESSED:
          splittable = False

      single_file_source = _SingleFileSource(
          file_based_source_ref,
          file_name,
          0,
          file_size,
          min_bundle_size=self._min_bundle_size,
          splittable=splittable)
      single_file_sources.append(single_file_source)
    self._concat_source = concat_source.ConcatSource(single_file_sources)
  return self._concat_source
def run(argv=None):
  """Run the beam pipeline."""
  args, pipeline_args = _parse_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  sentence_files_match = FileSystems.match([args.sentence_files])[0]
  sentence_files = [
      file_metadata.path
      for file_metadata in sentence_files_match.metadata_list
  ]
  logging.info(
      "Reading %i files from %s.", len(sentence_files), args.sentence_files)
  assert len(sentence_files) > 0
  sentence_files = p | beam.Create(sentence_files)

  serialized_examples = sentence_files | "create examples" >> beam.FlatMap(
      partial(
          _create_examples_from_file,
          min_length=args.min_length,
          max_length=args.max_length,
          num_extra_contexts=args.num_extra_contexts))

  serialized_examples = _shuffle_examples(serialized_examples)

  serialized_examples |= "split train and test" >> beam.ParDo(
      _TrainTestSplitFn(args.train_split)).with_outputs(
          _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

  (serialized_examples[_TrainTestSplitFn.TRAIN_TAG]
   | "write train" >> WriteToTFRecord(
       os.path.join(args.output_dir, "train"),
       file_name_suffix=".tfrecords",
       num_shards=args.num_shards_train))
  (serialized_examples[_TrainTestSplitFn.TEST_TAG]
   | "write test" >> WriteToTFRecord(
       os.path.join(args.output_dir, "test"),
       file_name_suffix=".tfrecords",
       num_shards=args.num_shards_test))

  result = p.run()
  result.wait_until_finish()
def test_find_orphaned_files(self):
  dir = self._new_tempdir()

  write_transform = beam.io.fileio.WriteToFiles(path=dir)

  def write_orphaned_file(temp_dir, writer_key):
    temp_dir_path = FileSystems.join(dir, temp_dir)

    file_prefix_dir = FileSystems.join(
        temp_dir_path, str(abs(hash(writer_key))))

    file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
    with FileSystems.create(file_name) as f:
      f.write(b'Hello y\'all')

    return file_name

  with TestPipeline() as p:
    _ = (
        p
        | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
        | "Serialize" >> beam.Map(json.dumps)
        | write_transform)

    # Pre-create the temp directory.
    temp_dir_path = FileSystems.mkdirs(
        FileSystems.join(dir, write_transform._temp_directory.get()))
    write_orphaned_file(
        write_transform._temp_directory.get(), (None, GlobalWindow()))
    f2 = write_orphaned_file(
        write_transform._temp_directory.get(), ('other-dest', GlobalWindow()))

  temp_dir_path = FileSystems.join(dir, write_transform._temp_directory.get())
  leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
  found_files = [m.path for m in leftovers[0].metadata_list]
  self.assertListEqual(found_files, [f2])
def _InferArrowSchema(self):
  match_result = FileSystems.match([self._file_pattern])[0]
  files_metadata = match_result.metadata_list[0]
  with FileSystems.open(files_metadata.path) as f:
    return pq.read_schema(f)
def find_matching_filenames(pattern):
  return (x.path for x in FileSystems.match([pattern])[0].metadata_list)
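# Hedged usage sketch; the pattern is illustrative. The helper returns a
# generator, so paths are produced lazily and can be iterated only once.
for filename in find_matching_filenames('gs://my-bucket/logs/*.json'):
  print(filename)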
def test_match_directory(self):
  result = FileSystems.match([self.tmpdir])[0]
  files = [f.path for f in result.metadata_list]
  self.assertEqual(files, [self.tmpdir])
def _compute_target_info_blob(
    path: str, depth: Union[int, float], return_generators=False
) -> TargetInfo:
    """Computes target info for a file that is externalized on Blob Storage,
    meaning that it's contained within an indexed archive file.

    Args:
        path (str): The path that refers to the specified target.
        depth (Union[int, float]): Depth until which directory contents are resolved.
        return_generators (bool, optional): If set to True, the 'contents' key of
            directories is equal to a generator instead of a list. Defaults to False.

    Raises:
        PathException: Path not found or invalid.

    Returns:
        TargetInfo: Target info of specified path.
    """
    linked_bundle_path = parse_linked_bundle_url(path)
    if not FileSystems.exists(linked_bundle_path.bundle_path):
        raise PathException(linked_bundle_path.bundle_path)
    if not linked_bundle_path.is_archive:
        # Single file
        raise PathException(
            "Single files on Blob Storage are not supported; only a path within an archive file is supported."
        )

    # process_contents is used to process the value of the 'contents' key (which
    # is a generator) before it is returned. If return_generators is False, it
    # resolves the given generator into a list; otherwise, it just returns the
    # generator unchanged.
    process_contents = list if return_generators is False else lambda x: x

    with OpenIndexedArchiveFile(linked_bundle_path.bundle_path) as tf:
        islink = lambda finfo: stat.S_ISLNK(finfo.mode)
        readlink = lambda finfo: finfo.linkname
        isfile = lambda finfo: not stat.S_ISDIR(finfo.mode)
        isdir = lambda finfo: stat.S_ISDIR(finfo.mode)
        listdir = lambda path: cast(Dict[str, FileInfo], tf.listDir(path) or {})

        def _get_info(path: str, depth: Union[int, float]) -> TargetInfo:
            """This function is called to get the target info of the specified
            path. If the specified path is a directory and additional depth is
            requested, this function is recursively called to retrieve the
            target info of files within the directory, much like
            _compute_target_info_local.
            """
            if not path.startswith("/"):
                path = "/" + path
            finfo = cast(FileInfo, tf.getFileInfo(path))
            if finfo is None:
                # Not found
                raise PathException("File not found.")
            result: TargetInfo = {
                'name': os.path.basename(path),  # get last part of path
                'size': finfo.size,
                'perm': finfo.mode & 0o777,
                'type': '',
            }
            if islink(finfo):
                result['type'] = 'link'
                result['link'] = readlink(finfo)
            elif isfile(finfo):
                result['type'] = 'file'
            elif isdir(finfo):
                result['type'] = 'directory'
                if depth > 0:
                    result['contents'] = process_contents(
                        _get_info(path + "/" + file_name, depth - 1)
                        for file_name in listdir(path)
                        if file_name != "."
                    )
            return result

        if not linked_bundle_path.is_archive_dir:
            # Return the contents of the single .gz file. The entry returned by
            # ratarmount for a single .gz file is not technically part of a tar
            # archive and has a name hardcoded as "contents," so we modify the
            # type, name, and permissions of the output accordingly.
            return cast(
                TargetInfo,
                dict(
                    _get_info("/contents", depth),
                    type="file",
                    name=linked_bundle_path.bundle_uuid,
                    perm=0o755,
                ),
            )
        if linked_bundle_path.archive_subpath:
            # Return the contents of a subpath within a directory.
            return _get_info(linked_bundle_path.archive_subpath, depth)
        else:
            # No subpath, return the entire directory with the bundle contents
            # in it. The permissions of this directory cannot be set by the user
            # (the user can only set permissions of files *within* this
            # directory that are part of the bundle itself), so we just return a
            # placeholder value of 0o755 for this directory's permissions.
            file = FileSystems.match([path])[0].metadata_list[0]
            result: TargetInfo = {
                'name': linked_bundle_path.bundle_uuid,
                'type': 'directory',
                'size': file.size_in_bytes,
                'perm': 0o755,
            }
            if depth > 0:
                result['contents'] = process_contents(
                    _get_info(file_name, depth - 1)
                    for file_name in listdir("/")
                    if file_name != "."
                )
            return result
def test_match_file_exception(self):
  # Match files with None so that it throws an exception
  with self.assertRaisesRegex(BeamIOError,
                              r'^Unable to get the Filesystem') as error:
    FileSystems.match([None])
  self.assertEqual(list(error.exception.exception_details), [None])
def estimate_size(self):
  pattern = self._pattern.get()
  match_result = FileSystems.match([pattern])[0]
  return sum([f.size_in_bytes for f in match_result.metadata_list])
input_files = app_args.input
output_filename = 'output.txt'

options = PipelineOptions()
gcloud_options = options.view_as(GoogleCloudOptions)
# gcloud_options.project = project_id
gcloud_options.job_name = 'import-citybikes'

# Dataflow runner
runner = os.environ['DATAFLOW_RUNNER']
options.view_as(StandardOptions).runner = runner

with apache_beam.Pipeline(options=options) as p:
    inputs = []
    for match in FileSystems.match([input_files]):
        for file in match.metadata_list:
            inputs.append(file.path)

    files = (p | apache_beam.Create(inputs))

    utils = Utils()
    read = (
        files
        | ReadAllFromText()
        | apache_beam.Map(
            lambda x, utils=utils, inputs=inputs:
                (utils.get_basename(inputs.pop(0)), x)
                if len(inputs) > 0 else ("", "")))

    rows = (read | apache_beam.ParDo(Split()))