def test_dicom_store_instance(self):
    # Upload DICOM files from a GCS bucket into an empty DICOM store, then
    # search the store and check that the returned metadata matches.
    search_request = {
        'project_id': self.project,
        'region': REGION,
        'dataset_id': DATA_SET_ID,
        'dicom_store_id': self.temp_dicom_store,
        'search_type': "instances",
    }
    expected_response = {
        'result': self.expected_output_metadata,
        'status': 200,
        # The search output echoes the request dict it was given.
        'input': search_request,
        'success': True,
    }

    # First pipeline: store every matched file and check each upload succeeded.
    with TestPipeline() as p:
        gcs_path = DICOM_FILES_PATH + "/*"
        upload_flags = (
            p
            | fileio.MatchFiles(gcs_path)
            | fileio.ReadMatches()
            | UploadToDicomStore(search_request, 'fileio')
            | beam.Map(lambda x: x['success']))
        assert_that(upload_flags, equal_to([True] * NUM_INSTANCE))

    # Second pipeline: query the store with the same request dict.
    with TestPipeline() as p:
        search_output = p | beam.Create([search_request]) | DicomSearch()
        assert_that(search_output, equal_to([expected_response]))
def test_transform_on_gcs(self):
    # Run two independent branches in one pipeline: one checks which paths
    # MatchAll resolves, the other checks the hash of a single file's lines.
    pipeline_args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=pipeline_args) as p:
        # Branch 1: expand both input patterns and keep only the paths.
        matched_paths = (
            p
            | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
            | fileio.MatchAll()
            | 'GetPath' >> beam.Map(lambda metadata: metadata.path))
        expected_paths = [self.INPUT_FILE] + self.WIKI_FILES
        assert_that(
            matched_paths, equal_to(expected_paths), label='Matched Files')

        # Branch 2: read one file, split it into lines and hash them.
        file_checksums = (
            p
            | 'SingleFile' >> beam.Create([self.INPUT_FILE])
            | 'MatchOneAll' >> fileio.MatchAll()
            | fileio.ReadMatches()
            | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
            | 'Checksums' >> beam.Map(compute_hash))
        assert_that(
            file_checksums,
            equal_to([self.KINGLEAR_CHECKSUM]),
            label='Assert Checksums')
def test_store_fileio_file_small_buffer_flush(self, FakeClient):
    # Upload three JSON files through UploadToDicomStore with buffer_size=1
    # and check that every record reached the fake HTTP client.
    store_config = {
        'project_id': "test_project",
        'region': "test_region",
        'dataset_id': "test_dataset_id",
        'dicom_store_id': "test_dicom_store_id",
    }

    fake_http = FakeHttpClient()
    FakeClient.return_value = fake_http

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    records = [
        {'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'},
        {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'},
        {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'},
    ]
    # One temp file per record, each holding the record serialized as JSON.
    for record in records:
        self._create_temp_file(dir=temp_dir, content=json.dumps(record))

    with TestPipeline() as p:
        upload_flags = (
            p
            | beam.Create([FileSystems.join(temp_dir, '*')])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | UploadToDicomStore(store_config, 'fileio', buffer_size=1)
            | beam.Map(lambda x: x['success']))
        assert_that(upload_flags, equal_to([True] * 3))

    # The pipeline has run by now; inspect what the fake client recorded.
    for record in records:
        self.assertTrue(record in fake_http.dicom_metadata)
def expand(self, root):
    """Read every file matching ``self.path`` and return a deferred dataframe.

    One matching file is opened eagerly at construction time and read with
    ``self.reader`` to obtain a sample; an empty slice of that sample is used
    to build the proxy handed to ``convert.to_dataframe``.

    Args:
        root: The pipeline root this transform is applied to.

    Returns:
        The result of ``convert.to_dataframe`` over the PCollection produced
        by ``_ReadFromPandasDoFn``.

    Raises:
        FileNotFoundError: If no file matches ``self.path``.
    """
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])

    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
        # Without this check the indexing below fails with a bare IndexError
        # that says nothing about the pattern that matched no files.
        raise FileNotFoundError(
            'Found no files that match %r' % (self.path,))
    first = match.metadata_list[0].path

    with io.filesystems.FileSystems.open(first) as handle:
        if not self.binary:
            handle = TextIOWrapper(handle)
        if self.incremental:
            # Only the first chunk is needed to learn the columns.
            sample = next(
                self.reader(
                    handle, *self.args, **dict(self.kwargs, chunksize=100)))
        else:
            sample = self.reader(handle, *self.args, **self.kwargs)

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(
                self.reader,
                self.args,
                self.kwargs,
                self.binary,
                self.incremental,
                self.splitter)))

    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', sample[:0]))
def expand(self, root):
    """Read every file matching ``self.path`` and return a deferred dataframe.

    One matching file is opened eagerly and read with ``self.reader`` to get
    a sample; an empty slice of the sample is used to build the proxy passed
    to ``convert.to_dataframe``.

    Raises:
        FileNotFoundError: If no file matches ``self.path``.
    """
    paths_pcoll = root | beam.Create([self.path])

    first_match = io.filesystems.FileSystems.match(
        [self.path], limits=[1])[0]
    if not first_match.metadata_list:
        # TODO(BEAM-12031): This should be allowed for streaming pipelines if
        # user provides an explicit schema.
        raise FileNotFoundError(f"Found no files that match {self.path!r}")
    sample_path = first_match.metadata_list[0].path

    with io.filesystems.FileSystems.open(sample_path) as raw_handle:
        # Text readers get a decoding wrapper around the raw handle.
        source = raw_handle if self.binary else TextIOWrapper(raw_handle)
        if self.incremental:
            # Only the first chunk is needed for the sample.
            sample_kwargs = dict(self.kwargs, chunksize=100)
            sample = next(self.reader(source, *self.args, **sample_kwargs))
        else:
            sample = self.reader(source, *self.args, **self.kwargs)

    read_fn = _ReadFromPandasDoFn(
        self.reader,
        self.args,
        self.kwargs,
        self.binary,
        self.incremental,
        self.splitter)
    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(read_fn))

    from apache_beam.dataframe import convert
    proxy = _prefix_range_index_with(':', sample[:0])
    return convert.to_dataframe(pcoll, proxy=proxy)
def expand(self, pcoll):
    """Turn a PCollection of file patterns into CSV rows.

    Each matched file is opened, wrapped for text decoding and fed to
    ``csv.DictReader``; every row it yields becomes one output element.
    """
    readable_files = (
        pcoll
        | 'MatchAll' >> fileio.MatchAll()
        | beam.Reshuffle()
        | 'ReadEach' >> fileio.ReadMatches())
    return readable_files | beam.FlatMap(
        lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open())))
def ReadImagesFromDisk(pipeline: beam.Pipeline,
                       base_path: Text) -> beam.pvalue.PCollection:
    """
    Beam PTransform that loads images together with their labels and
    metadata from a local file system or a remote cloud storage bucket.

    Args:
        pipeline (beam.Pipeline): Input beam.Pipeline object coming from a
         TFX Executor.
        base_path (Text): Base directory containing images and labels.

    Returns:
        The PCollection produced by mapping ``add_label_and_metadata`` over
        the image partition, with the label partition as a side input.
    """
    # A trailing wildcard ingests every file directly under the base path.
    file_pattern = os.path.join(base_path, "*")
    allowed_ext = [".jpg", ".json", ".png", ".txt", ".jpeg"]

    file_contents = (
        pipeline
        | fileio.MatchFiles(file_pattern)
        | fileio.ReadMatches()
        | beam.Map(read_file_content))

    # Drop unsupported extensions, then split into two partitions.
    images, label_file = (
        file_contents
        | "FilterOutFiles" >> beam.Filter(lambda x: x[FILE_EXT] in allowed_ext)
        | "SplitLabelFile" >> beam.Partition(SplitByFileName, 2))

    # label_file is actually a dict; it is passed along as a singleton
    # side input.
    label_dict = beam.pvalue.AsSingleton(label_file)

    return (
        images
        | "AddLabelAndMetadata" >> beam.Map(add_label_and_metadata, label_dict))
def test_dicom_store_instance_from_gcs(self):
    # Upload DICOM files from a GCS bucket into an empty DICOM store, then
    # query the store through the HTTP client and compare the metadata.
    store_config = {
        'project_id': self.project,
        'region': REGION,
        'dataset_id': DATA_SET_ID,
        'dicom_store_id': self.temp_dicom_store,
    }
    expected_flags = [True] * NUM_INSTANCE

    with self.test_pipeline as p:
        gcs_path = DICOM_FILES_PATH + "/io_test_files/*"
        upload_flags = (
            p
            | fileio.MatchFiles(gcs_path)
            | fileio.ReadMatches()
            | UploadToDicomStore(store_config, 'fileio')
            | beam.Map(lambda x: x['success']))
        assert_that(
            upload_flags, equal_to(expected_flags), label='store first assert')

    # The pipeline has finished; check the stored metadata using the client.
    client = DicomApiHttpClient()
    result, status_code = client.qido_search(
        self.project, REGION, DATA_SET_ID, self.temp_dicom_store, 'instances')

    self.assertEqual(status_code, 200)
    # Order-insensitive comparison, since ordering may differ between
    # Python versions.
    self.assertCountEqual(result, self.expected_output_all_metadata)
def test_write_to_dynamic_destination(self):
    # Write 0..99 split into "odd"/"even" destinations and read the files
    # back, once with the sink given as a type and once as an instance.
    sink_variants = (
        fileio.TextSink,  # pass a type signature
        fileio.TextSink(),  # pass a FileSink object
    )

    for sink in sink_variants:
        out_dir = self._new_tempdir()

        with TestPipeline() as p:
            _ = (
                p
                | "Create" >> beam.Create(range(100))
                | beam.Map(lambda x: str(x))
                | fileio.WriteToFiles(
                    path=out_dir,
                    destination=lambda n: "odd" if int(n) % 2 else "even",
                    sink=sink,
                    file_naming=fileio.destination_prefix_naming("test")))

        expected = [
            ('odd', list(range(1, 100, 2))),
            ('even', list(range(0, 100, 2))),
        ]
        with TestPipeline() as p:
            # Map each file to (destination prefix, sorted numbers in it).
            result = (
                p
                | fileio.MatchFiles(FileSystems.join(out_dir, '*'))
                | fileio.ReadMatches()
                | beam.Map(
                    lambda f: (
                        os.path.basename(f.metadata.path).split('-')[0],
                        sorted(
                            map(int, f.read_utf8().strip().split('\n'))))))
            assert_that(result, equal_to(expected))
def test_write_to_different_file_types_some_spilling(self):
    # Write 'apache' records as CSV and all others as JSON with at most one
    # writer per bundle, then read both kinds of file back and compare.
    out_dir = self._new_tempdir()

    with TestPipeline() as p:
        _ = (
            p
            | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
            | beam.io.fileio.WriteToFiles(
                path=out_dir,
                destination=lambda record: record['foundation'],
                sink=lambda dest: (
                    WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                    if dest == 'apache' else WriteFilesTest.JsonSink()),
                file_naming=fileio.destination_prefix_naming(),
                max_writers_per_bundle=1))

    with TestPipeline() as p:
        # JSON branch: one JSON document per line.
        cncf_res = (
            p
            | fileio.MatchFiles(FileSystems.join(out_dir, 'cncf*'))
            | fileio.ReadMatches()
            | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
            | beam.Map(json.loads))

        # CSV branch: labels are required here to avoid clashing with the
        # unlabeled transforms of the JSON branch.
        apache_res = (
            p
            | "MatchApache" >> fileio.MatchFiles(
                FileSystems.join(out_dir, 'apache*'))
            | "ReadApache" >> fileio.ReadMatches()
            | "MapApache" >> beam.FlatMap(
                lambda rf: csv.reader(_get_file_reader(rf))))

        expected_cncf = [
            row for row in self.SIMPLE_COLLECTION
            if row['foundation'] == 'cncf'
        ]
        expected_apache = [
            [row['project'], row['foundation']]
            for row in self.SIMPLE_COLLECTION
            if row['foundation'] == 'apache'
        ]
        assert_that(cncf_res, equal_to(expected_cncf), label='verifyCNCF')
        assert_that(
            apache_res, equal_to(expected_apache), label='verifyApache')
def test_basic_file_name_provided(self):
    # Match on the directory path itself and check the single file's content
    # round-trips through ReadMatches.
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    temp_dir = '%s/' % self._new_tempdir()
    self._create_temp_file(dir=temp_dir, content=content)

    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([temp_dir])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.Map(lambda f: f.read().decode('utf-8')))
        assert_that(content_pc, equal_to([content]))
def test_basic_file_name_provided(self):
    # Match every file under a temp directory via a glob and check that the
    # lines read back equal the lines written.
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=temp_dir, content=content)

    expected_lines = content.splitlines()
    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([FileSystems.join(temp_dir, '*')])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))
        assert_that(content_pc, equal_to(expected_lines))
def file_process_pattern_access_metadata():
    """Snippet: pair each matched file's path with its text content.

    Matches files with ``fileio.MatchFiles``, turns the matches into
    readable files with ``fileio.ReadMatches``, reshuffles them, and maps
    each one to a ``(path, contents)`` tuple.

    NOTE(review): the code below the START marker looks like a region that
    is extracted for documentation -- confirm before editing it.
    """
    import apache_beam as beam
    from apache_beam.io import fileio

    # [START FileProcessPatternAccessMetadataSnip1]
    with beam.Pipeline() as p:
        readable_files = (
            p
            | fileio.MatchFiles('hdfs://path/to/*.txt')
            | fileio.ReadMatches()
            | beam.Reshuffle())
        files_and_contents = (
            readable_files
            | beam.Map(lambda x: (x.metadata.path, x.read_utf8())))
def test_csv_file_source(self):
    # Write a small CSV file and check that reading it through csv.reader
    # yields the same rows as splitting the raw text.
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    expected_rows = [line.split(',') for line in content.split('\n')]

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=temp_dir, content=content)

    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([FileSystems.join(temp_dir, '*')])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))
        assert_that(content_pc, equal_to(expected_rows))
def test_fail_on_directories(self):
    # With skip_directories=False, a directory among the inputs must make
    # the pipeline fail with BeamIOError.
    content = 'thecontent\n'
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files = [
        self._create_temp_file(dir=tempdir, content=content),
        self._create_temp_file(dir=tempdir, content=content),
    ]

    with self.assertRaises(beam.io.filesystem.BeamIOError), \
            TestPipeline() as p:
        _ = (
            p
            | beam.Create(files + ['%s/' % tempdir])
            | fileio.ReadMatches(skip_directories=False)
            | beam.Map(lambda x: x.read_utf8()))
def test_string_filenames_and_skip_directory(self):
    # Feed plain string paths (two files plus their directory) to
    # ReadMatches and expect only the two file contents back.
    content = 'thecontent\n'
    tempdir = '%s/' % self._new_tempdir()

    # Create a couple files to be matched
    files = [
        self._create_temp_file(dir=tempdir, content=content),
        self._create_temp_file(dir=tempdir, content=content),
    ]

    with TestPipeline() as p:
        contents_pc = (
            p
            | beam.Create(files + [tempdir])
            | fileio.ReadMatches()
            | beam.Map(lambda x: x.read().decode('utf-8')))
        assert_that(contents_pc, equal_to([content] * 2))
def expand(self, root):
    """Read every file matching ``self.path`` and return a deferred dataframe.

    One matching file is opened eagerly and its first chunk is read with
    ``self.reader``; an empty slice of that chunk is used to build the proxy
    handed to ``convert.to_dataframe``.

    Args:
        root: The pipeline root this transform is applied to.

    Returns:
        The result of ``convert.to_dataframe`` over the PCollection produced
        by ``_ReadFromPandasDoFn``.

    Raises:
        FileNotFoundError: If no file matches ``self.path``.
    """
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])

    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
        # Without this check the indexing below fails with a bare IndexError
        # that says nothing about the pattern that matched no files.
        raise FileNotFoundError(
            'Found no files that match %r' % (self.path,))
    first = match.metadata_list[0].path

    with io.filesystems.FileSystems.open(first) as handle:
        # Merge chunksize into the kwargs rather than passing it alongside
        # them: a user-supplied ``chunksize`` would otherwise raise
        # "got multiple values for keyword argument".
        sample_kwargs = dict(self.kwargs, chunksize=100)
        df = next(self.reader(handle, *self.args, **sample_kwargs))

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(self.reader, self.args, self.kwargs)))

    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', df[:0]))
def test_read_gzip_compressed_file_without_suffix(self):
    # A gzip file whose name has no suffix can still be read when the
    # compression type is given explicitly to open().
    import gzip

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    file_contents = b'compressed_contents!'

    compressed_path = os.path.join(temp_dir, 'compressed')
    with gzip.GzipFile(compressed_path, 'w') as gz_file:
        gz_file.write(file_contents)

    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([FileSystems.join(temp_dir, '*')])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.Map(
                lambda rf: rf.open(
                    compression_type=CompressionTypes.GZIP).read(
                        len(file_contents))))
        assert_that(content_pc, equal_to([file_contents]))
def test_write_to_single_file_batch(self):
    # Serialize the collection as JSON lines with default WriteToFiles
    # settings, then read the files back and compare with the input.
    out_dir = self._new_tempdir()

    with TestPipeline() as p:
        _ = (
            p
            | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
            | "Serialize" >> beam.Map(json.dumps)
            | beam.io.fileio.WriteToFiles(path=out_dir))

    with TestPipeline() as p:
        result = (
            p
            | fileio.MatchFiles(FileSystems.join(out_dir, '*'))
            | fileio.ReadMatches()
            | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
            | beam.Map(json.loads))
        assert_that(result, equal_to(list(self.SIMPLE_COLLECTION)))
def test_infer_compressed_file(self):
    # Write one .gz and one .bz2 file and read both back through open()
    # without passing a compression type.
    import bz2
    import gzip

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    with gzip.GzipFile(os.path.join(temp_dir, 'compressed.gz'), 'w') as out:
        out.write(file_contents)

    file_contents2 = b'compressed_contents_bz2!'
    with bz2.BZ2File(os.path.join(temp_dir, 'compressed2.bz2'), 'w') as out:
        out.write(file_contents2)

    expected = [file_contents, file_contents2]
    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([FileSystems.join(temp_dir, '*')])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.Map(lambda rf: rf.open().readline()))
        assert_that(content_pc, equal_to(expected))
def test_csv_file_source(self):
    # Write a small CSV file and check that reading it through csv.reader
    # yields the same rows as splitting the raw text.
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    expected_rows = [line.split(',') for line in content.split('\n')]

    temp_dir = '%s/' % self._new_tempdir()
    self._create_temp_file(dir=temp_dir, content=content)

    def get_csv_reader(readable_file):
        # On Python 3 the opened stream is wrapped in a text layer before
        # it is handed to csv.reader; on Python 2 it is used as-is.
        stream = readable_file.open()
        if sys.version_info >= (3, 0):
            stream = io.TextIOWrapper(stream)
        return csv.reader(stream)

    with TestPipeline() as p:
        content_pc = (
            p
            | beam.Create([temp_dir])
            | fileio.MatchAll()
            | fileio.ReadMatches()
            | beam.FlatMap(get_csv_reader))
        assert_that(content_pc, equal_to(expected_rows))
def expand(self, root):
    """Read every file matching ``self.path`` and return a deferred dataframe.

    One matching file is opened eagerly and read with ``self.reader`` to get
    a sample; an empty slice of the sample is the proxy passed to
    ``convert.to_dataframe``. The read DoFn also receives, as a side input,
    a dict mapping each matched path to its position in the sorted list of
    all matched paths.

    Raises:
        FileNotFoundError: If no file matches ``self.path``.
    """
    paths_pcoll = root | beam.Create([self.path])
    # Only one match is needed here: it is just used to read a sample.
    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
        # TODO(BEAM-12031): This should be allowed for streaming pipelines if
        # user provides an explicit schema.
        raise FileNotFoundError(f"Found no files that match {self.path!r}")
    first_path = match.metadata_list[0].path
    with io.filesystems.FileSystems.open(first_path) as handle:
        if not self.binary:
            handle = TextIOWrapper(handle)
        if self.incremental:
            # Incremental readers yield chunks; the first chunk is the sample.
            sample = next(
                self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
        else:
            sample = self.reader(handle, *self.args, **self.kwargs)

    matches_pcoll = paths_pcoll | fileio.MatchAll()
    # Single-element PCollection holding {path: index}, where the index is
    # the path's rank among all matched paths in sorted order.
    indices_pcoll = (
        matches_pcoll.pipeline
        | 'DoOnce' >> beam.Create([None])
        | beam.Map(
            lambda _, paths: {path: ix for ix, path in enumerate(sorted(paths))},
            paths=beam.pvalue.AsList(
                matches_pcoll | beam.Map(lambda match: match.path))))

    pcoll = (
        matches_pcoll
        | beam.Reshuffle()
        | fileio.ReadMatches()
        | beam.ParDo(
            _ReadFromPandasDoFn(
                self.reader,
                self.args,
                self.kwargs,
                self.binary,
                self.incremental,
                self.splitter),
            # The whole path -> index dict is handed to the DoFn as a
            # singleton side input.
            path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(pcoll, proxy=sample[:0])
def run(argv=None):
    """Main entry point; defines and runs the report-loading pipeline.

    Matches the files given by ``--input``, reads each one whole, parses its
    name and content, unfolds the result with ``MetaAndContent`` and writes
    the rows both to a BigQuery table and to the text file given by
    ``--output``.

    Args:
        argv: Command-line arguments. Arguments not recognised by this
            function's parser are forwarded to the pipeline as options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://dataflow-samples/shakespeare/kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    from apache_beam.io import fileio

    # Build the pipeline from the options configured above. Previously a
    # fresh PipelineOptions() was used here, which dropped save_main_session
    # and any pipeline arguments passed in through ``argv``.
    with beam.Pipeline(options=pipeline_options) as p:
        output = (
            p
            | "match" >> fileio.MatchFiles(known_args.input)
            | "read match" >> fileio.ReadMatches()
            | "read_file" >> beam.Map(
                lambda x: (x.metadata.path, x.read_utf8()))
            | "parse file" >> beam.Map(
                lambda x: (parse_filename(x[0]),
                           parse_file(x[1].split('\n'))))
            | "unfold" >> beam.ParDo(MetaAndContent()))

        table_spec = 'brainscode-140622:tf2up.conversions'
        table_schema = {'fields': [
            {'name': 'date', 'type': 'DATE'},
            {'name': 'file_hash', 'type': 'STRING'},
            {'name': 'line', 'type': 'INT64'},
            {'name': 'position', 'type': 'INT64'},
            {'name': 'severity', 'type': 'STRING'},
            {'name': 'message', 'type': 'STRING'},
            {'name': 'ops', 'type': 'STRING', 'mode': 'REPEATED'}
        ]}

        # Append to an existing table: CREATE_NEVER means the table must
        # already exist. (The alternative setup is CREATE_IF_NEEDED together
        # with WRITE_TRUNCATE.)
        # pylint: disable=expression-not-assigned
        output | 'store to BQ' >> beam.io.WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            method="STREAMING_INSERTS"
        )

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write' >> WriteToText(known_args.output)
def run(argv=None):
    """Load the ATS CSV exports from GCS into a BigQuery table.

    Matches the CSV files under a fixed GCS prefix, reads them line by line
    (skipping one header line per file), converts each line into a row dict
    and writes the rows to the table given by ``--output``, truncating any
    existing content.

    Args:
        argv: Command-line arguments. Arguments not recognised by this
            function's parser are forwarded to the pipeline, followed by a
            fixed set of Dataflow options.
    """
    # Local import so the block does not depend on a module-level import.
    import csv

    class DataIngestion:
        """Converts one CSV line into a BigQuery row dict."""

        # Column names, in the order the fields appear in each CSV line.
        FIELD_NAMES = (
            'Requisition_Number', 'Opportunity_Title', 'Opportunity_Status',
            'Featured', 'Company_Code', 'Company', 'Entity', 'Entity_Desc',
            'Source_Job_Code', 'Job_Title', 'FullTime_Or_PartTime',
            'Salary_Or_Hourly', 'Recruiter', 'Location_Name', 'Date_Applied',
            'Source', 'Step', 'Step_Date', 'Recruiting_Hire_Date',
            'Start_Date', 'Candidate', 'Candidate_Email_Address',
            'Candidate_Primary_Phone', 'First_Published_Date',
            'Average_Days_Between_Publish_Hire_Dates')

        def parse_method(self, string_input):
            """Return a dict mapping column names to the fields of one line.

            The line is parsed with the ``csv`` module so that a quoted
            field containing a comma stays in one column. Stripping the
            quotes and splitting on every comma would shift all following
            columns for such a line. An empty line yields an empty dict.
            """
            values = next(csv.reader([string_input]), [])
            return dict(zip(self.FIELD_NAMES, values))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # default='gs://hc_crackerbarrel_ats/Historical'
    )
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='chs.ats_master')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=hireclix',
        '--region=us-east1',
        '--staging_location=gs://hc_chs_ats/File_Temp/Source',
        '--temp_location=gs://hc_chs_ats/File_Temp/Staging',
        '--job_name=chstest1'
    ])

    data_ingestion = DataIngestion()

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        readable_files = (
            p
            | 'Matching .csv files' >> fileio.MatchFiles(
                'gs://hc_chs_ats/File_Temp/Temp_File/*.csv')
            | 'Read Matches' >> fileio.ReadMatches()
            | 'Rebalance data inputs' >> beam.Reshuffle())

        # Only the path is kept; the content is read again line by line
        # below by ReadAllFromText.
        files_and_content = (
            readable_files
            | 'Determine FilePath' >> beam.Map(lambda x: x.metadata.path))

        _ = (
            files_and_content
            | 'Read from a File' >> beam.io.ReadAllFromText(
                skip_header_lines=1)
            | 'String To BigQuery Row' >> beam.Map(
                lambda s: data_ingestion.parse_method(s))
            | 'Write to BigQuery' >> beam.io.Write(
                beam.io.WriteToBigQuery(
                    known_args.output,
                    schema='Requisition_Number:STRING,'
                           'Opportunity_Title:STRING,'
                           'Opportunity_Status:STRING,'
                           'Featured:STRING,'
                           'Company_Code:STRING,'
                           'Company:STRING,'
                           'Entity:STRING,'
                           'Entity_Desc:STRING,'
                           'Source_Job_Code:STRING,'
                           'Job_Title:STRING,'
                           'FullTime_Or_PartTime:STRING,'
                           'Salary_Or_Hourly:STRING,'
                           'Recruiter:STRING,'
                           'Location_Name:STRING,'
                           'Date_Applied:DATE,'
                           'Source:STRING,'
                           'Step:STRING,'
                           'Step_Date:DATE,'
                           'Recruiting_Hire_Date:DATE,'
                           'Start_Date:DATE,'
                           'Candidate:STRING,'
                           'Candidate_Email_Address:STRING,'
                           'Candidate_Primary_Phone:STRING,'
                           'First_Published_Date:DATE,'
                           'Average_Days_Between_Publish_Hire_Dates:STRING',
                    create_disposition=(
                        beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                    write_disposition=(
                        beam.io.BigQueryDisposition.WRITE_TRUNCATE))))