Python ReadMatches示例，apache_beam.io.fileio.ReadMatches Python示例

示例#1

0

显示文件

    def test_dicom_store_instance(self):
        """Upload DICOM files from GCS into an empty store, then search it.

        The first pipeline uploads every file under ``DICOM_FILES_PATH`` and
        expects one successful result per instance. The second pipeline runs
        an "instances" search on the same store and compares the response
        with the expected metadata.
        """
        search_request = {
            'project_id': self.project,
            'region': REGION,
            'dataset_id': DATA_SET_ID,
            'dicom_store_id': self.temp_dicom_store,
            'search_type': "instances",
        }
        search_response = {
            'result': self.expected_output_metadata,
            'status': 200,
            'input': search_request,
            'success': True,
        }

        with TestPipeline() as p:
            upload_flags = (
                p
                | fileio.MatchFiles(DICOM_FILES_PATH + "/*")
                | fileio.ReadMatches()
                | UploadToDicomStore(search_request, 'fileio')
                | beam.Map(lambda outcome: outcome['success']))
            assert_that(upload_flags, equal_to([True] * NUM_INSTANCE))

        with TestPipeline() as p:
            found = p | beam.Create([search_request]) | DicomSearch()
            assert_that(found, equal_to([search_response]))

示例#2

0

显示文件

    def test_transform_on_gcs(self):
        """Exercise MatchAll and ReadMatches on the configured input files.

        One branch asserts the paths matched for both input patterns; the
        other reads the single input file and asserts the hash computed over
        its lines.
        """
        pipeline_args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=pipeline_args) as pipeline:
            matched_paths = (
                pipeline
                | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
                | fileio.MatchAll()
                | 'GetPath' >> beam.Map(lambda file_md: file_md.path))
            expected_paths = [self.INPUT_FILE] + self.WIKI_FILES
            assert_that(
                matched_paths, equal_to(expected_paths), label='Matched Files')

            line_hashes = (
                pipeline
                | 'SingleFile' >> beam.Create([self.INPUT_FILE])
                | 'MatchOneAll' >> fileio.MatchAll()
                | fileio.ReadMatches()
                | 'ReadIn' >> beam.Map(
                    lambda readable: readable.read_utf8().split('\n'))
                | 'Checksums' >> beam.Map(compute_hash))
            assert_that(
                line_hashes,
                equal_to([self.KINGLEAR_CHECKSUM]),
                label='Assert Checksums')

示例#3

0

显示文件

文件： dicomio_test.py 项目： fernando-wizeline/beam

  def test_store_fileio_file_small_buffer_flush(self, FakeClient):
    """Upload three JSON files with ``buffer_size=1`` and verify each arrives.

    Args:
      FakeClient: Mock of the DICOM HTTP client class (presumably injected
        by a ``patch`` decorator outside this view); its return value is
        replaced with a ``FakeHttpClient`` so uploads can be inspected.
    """
    input_dict = {
        'project_id': "test_project",
        'region': "test_region",
        'dataset_id': "test_dataset_id",
        'dicom_store_id': "test_dicom_store_id",
    }

    fc = FakeHttpClient()
    FakeClient.return_value = fc

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    records = [
        {'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'},
        {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'},
        {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'},
    ]
    # One temp file per record so the pipeline sees separate uploads.
    for record in records:
      self._create_temp_file(dir=temp_dir, content=json.dumps(record))

    with TestPipeline() as p:
      results = (
          p
          | beam.Create([FileSystems.join(temp_dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | UploadToDicomStore(input_dict, 'fileio', buffer_size=1)
          | beam.Map(lambda x: x['success']))
      assert_that(results, equal_to([True] * len(records)))

    # assertIn reports both the missing record and the container on failure,
    # whereas assertTrue(x in y) only reports "False is not true".
    for record in records:
      self.assertIn(record, fc.dicom_metadata)

示例#4

0

显示文件

文件： io.py 项目： henrik680/RawImporterRenewableStatsSwe

    def expand(self, root):
        """Read every file matching ``self.path`` into a deferred dataframe.

        The first matching file is opened eagerly to obtain a sample; the
        empty slice of that sample is used as the proxy of the result.

        Args:
            root: Pipeline root the read is attached to.

        Returns:
            The value produced by ``convert.to_dataframe`` for the read.

        Raises:
            FileNotFoundError: If no file matches ``self.path``.
        """
        # TODO(robertwb): Handle streaming (with explicit schema).
        paths_pcoll = root | beam.Create([self.path])
        match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not match.metadata_list:
            # Without this guard an empty match surfaces as an opaque
            # IndexError from metadata_list[0].
            raise FileNotFoundError(
                'Found no files that match %r' % (self.path,))
        first = match.metadata_list[0].path
        with io.filesystems.FileSystems.open(first) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                # Only the first chunk is needed to learn the schema.
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        pcoll = (paths_pcoll
                 | fileio.MatchFiles(self.path)
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll,
                                    proxy=_prefix_range_index_with(
                                        ':', sample[:0]))

示例#5

0

显示文件

    def expand(self, root):
        """Build a deferred dataframe over all files matching ``self.path``.

        A sample is read eagerly from the first matching file and its empty
        slice becomes the proxy of the returned dataframe.

        Raises:
            FileNotFoundError: If nothing matches ``self.path``.
        """
        paths_pcoll = root | beam.Create([self.path])
        probe = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not probe.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines
            # if user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")

        sample_path = probe.metadata_list[0].path
        with io.filesystems.FileSystems.open(sample_path) as handle:
            source = handle if self.binary else TextIOWrapper(handle)
            if self.incremental:
                # Only the first chunk is needed for the sample.
                chunked_kwargs = dict(self.kwargs, chunksize=100)
                sample = next(
                    self.reader(source, *self.args, **chunked_kwargs))
            else:
                sample = self.reader(source, *self.args, **self.kwargs)

        read_fn = _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                      self.binary, self.incremental,
                                      self.splitter)
        pcoll = (
            paths_pcoll
            | fileio.MatchFiles(self.path)
            | beam.Reshuffle()
            | fileio.ReadMatches()
            | beam.ParDo(read_fn))
        from apache_beam.dataframe import convert
        proxy = _prefix_range_index_with(':', sample[:0])
        return convert.to_dataframe(pcoll, proxy=proxy)

示例#6

0

显示文件

 def expand(self, pcoll):
     """Expand file patterns into CSV rows, one dict per row.

     Each input element is matched as a file pattern, the matches are
     reshuffled and opened, and every file is parsed with csv.DictReader.
     """
     readable_files = (
         pcoll
         | 'MatchAll' >> fileio.MatchAll()
         | beam.Reshuffle()
         | 'ReadEach' >> fileio.ReadMatches())
     return readable_files | beam.FlatMap(
         lambda readable: csv.DictReader(io.TextIOWrapper(readable.open())))

示例#7

0

显示文件

文件： image_data_step.py 项目： sjoerdteunisse/zenml

def ReadImagesFromDisk(pipeline: beam.Pipeline,
                       base_path: Text) -> beam.pvalue.PCollection:
    """
    Load images and their label file from a directory into a PCollection.

    Everything matching ``<base_path>/*`` is read; entries whose extension
    is not in the allowed list are dropped, the rest is split into two
    partitions (images and the label file), and each image is passed
    through ``add_label_and_metadata`` with the label file as a side input.

    Args:
        pipeline (beam.Pipeline): Input beam.Pipeline object coming
         from a TFX Executor.
        base_path (Text): Base directory containing images and labels.

    Returns:
        beam.pvalue.PCollection: Output of the "AddLabelAndMetadata" step.
    """
    supported_extensions = [".jpg", ".json", ".png", ".txt", ".jpeg"]

    # A trailing wildcard ingests every entry under the base path.
    glob_pattern = os.path.join(base_path, "*")

    partitions = (
        pipeline
        | fileio.MatchFiles(glob_pattern)
        | fileio.ReadMatches()
        | beam.Map(read_file_content)
        | "FilterOutFiles" >> beam.Filter(
            lambda item: item[FILE_EXT] in supported_extensions)
        | "SplitLabelFile" >> beam.Partition(SplitByFileName, 2))
    images, label_file = partitions

    # The label partition is consumed as a singleton side input.
    labels_side_input = beam.pvalue.AsSingleton(label_file)
    return (
        images
        | "AddLabelAndMetadata" >> beam.Map(add_label_and_metadata,
                                            labels_side_input))

示例#8

0

显示文件

文件： dicomio_integration_test.py 项目： fernando-wizeline/beam

    def test_dicom_store_instance_from_gcs(self):
        """Upload DICOM files from GCS and verify the store via the client.

        After the upload pipeline finishes, the store is queried directly
        with ``DicomApiHttpClient.qido_search`` and the returned instances
        are compared with the expected metadata, ignoring order.
        """
        store_target = {
            'project_id': self.project,
            'region': REGION,
            'dataset_id': DATA_SET_ID,
            'dicom_store_id': self.temp_dicom_store,
        }

        with self.test_pipeline as p:
            upload_flags = (
                p
                | fileio.MatchFiles(DICOM_FILES_PATH + "/io_test_files/*")
                | fileio.ReadMatches()
                | UploadToDicomStore(store_target, 'fileio')
                | beam.Map(lambda outcome: outcome['success']))
            assert_that(
                upload_flags,
                equal_to([True] * NUM_INSTANCE),
                label='store first assert')

        # Query the store directly to check what was actually written.
        client = DicomApiHttpClient()
        result, status_code = client.qido_search(
            self.project, REGION, DATA_SET_ID, self.temp_dicom_store,
            'instances')

        self.assertEqual(status_code, 200)

        # Order-insensitive comparison of the returned instances.
        self.assertCountEqual(result, self.expected_output_all_metadata)

示例#9

0

显示文件

文件： fileio_test.py 项目： mahak/beam

    def test_write_to_dynamic_destination(self):
        """Write numbers to "odd"/"even" destinations and read them back.

        The scenario runs twice: once with the sink given as a type and once
        with it given as a FileSink instance.
        """
        for sink in (fileio.TextSink, fileio.TextSink()):
            out_dir = self._new_tempdir()

            with TestPipeline() as p:
                _ = (
                    p
                    | "Create" >> beam.Create(range(100))
                    | beam.Map(lambda x: str(x))
                    | fileio.WriteToFiles(
                        path=out_dir,
                        destination=lambda n: "odd" if int(n) % 2 else "even",
                        sink=sink,
                        file_naming=fileio.destination_prefix_naming("test")))

            with TestPipeline() as p:
                # Each output file becomes (destination prefix, sorted ints).
                per_destination = (
                    p
                    | fileio.MatchFiles(FileSystems.join(out_dir, '*'))
                    | fileio.ReadMatches()
                    | beam.Map(lambda readable: (
                        os.path.basename(
                            readable.metadata.path).split('-')[0],
                        sorted(map(
                            int,
                            readable.read_utf8().strip().split('\n'))))))

                expected = [
                    ('odd', list(range(1, 100, 2))),
                    ('even', list(range(0, 100, 2))),
                ]
                assert_that(per_destination, equal_to(expected))

示例#10

0

显示文件

  def test_write_to_different_file_types_some_spilling(self):
    """Route records to CSV or JSON sinks with one writer per bundle.

    Records are keyed by their 'foundation' value: 'apache' records go to a
    CSV sink and all others to a JSON sink. Both outputs are read back and
    compared with the corresponding rows of the source collection.
    """
    out_dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=out_dir,
              destination=lambda rec: rec['foundation'],
              sink=lambda destination: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if destination == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming(),
              max_writers_per_bundle=1))

    with TestPipeline() as p:
      cncf_rows = (
          p
          | fileio.MatchFiles(FileSystems.join(out_dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(
              lambda readable: readable.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_rows = (
          p
          | "MatchApache" >> fileio.MatchFiles(
              FileSystems.join(out_dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >> beam.FlatMap(
              lambda readable: csv.reader(_get_file_reader(readable))))

      expected_cncf = [
          row for row in self.SIMPLE_COLLECTION
          if row['foundation'] == 'cncf'
      ]
      assert_that(cncf_rows, equal_to(expected_cncf), label='verifyCNCF')

      expected_apache = [
          [row['project'], row['foundation']]
          for row in self.SIMPLE_COLLECTION
          if row['foundation'] == 'apache'
      ]
      assert_that(
          apache_rows, equal_to(expected_apache), label='verifyApache')

示例#11

0

显示文件

文件： fileio_test.py 项目： l2pg/beam_moremmr

    def test_basic_file_name_provided(self):
        """Read a single temp file via MatchAll/ReadMatches as one string."""
        expected_text = 'TestingMyContent\nIn multiple lines\nhaha!'
        tmp_dir = '%s/' % self._new_tempdir()
        self._create_temp_file(dir=tmp_dir, content=expected_text)

        with TestPipeline() as p:
            decoded = (
                p
                | beam.Create([tmp_dir])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.Map(lambda readable: readable.read().decode('utf-8')))

            assert_that(decoded, equal_to([expected_text]))

示例#12

0

显示文件

  def test_basic_file_name_provided(self):
    """Read a single temp file via a wildcard pattern, one element per line."""
    expected_text = 'TestingMyContent\nIn multiple lines\nhaha!'
    tmp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=tmp_dir, content=expected_text)

    with TestPipeline() as p:
      lines = (
          p
          | beam.Create([FileSystems.join(tmp_dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(
              lambda readable: readable.read().decode('utf-8').splitlines()))

      assert_that(lines, equal_to(expected_text.splitlines()))

示例#13

0

显示文件

def file_process_pattern_access_metadata():
  """Documentation snippet: pair each matched file's path with its content.

  NOTE(review): the closing [END ...] marker for the snippet region is not
  visible in this view; confirm it exists in the full file.
  """

  import apache_beam as beam
  from apache_beam.io import fileio

  # [START FileProcessPatternAccessMetadataSnip1]
  with beam.Pipeline() as p:
    # Match the pattern and turn every match into a readable file.
    readable_files = (p
                      | fileio.MatchFiles('hdfs://path/to/*.txt')
                      | fileio.ReadMatches()
                      | beam.Reshuffle())
    # Emit (path, UTF-8 decoded content) for each readable file.
    files_and_contents = (readable_files
                          | beam.Map(lambda x: (x.metadata.path,
                                                x.read_utf8())))

示例#14

0

显示文件

  def test_csv_file_source(self):
    """Parse a temp CSV file through ReadMatches and csv.reader."""
    csv_text = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    expected_rows = [line.split(',') for line in csv_text.split('\n')]

    tmp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=tmp_dir, content=csv_text)

    with TestPipeline() as p:
      parsed_rows = (
          p
          | beam.Create([FileSystems.join(tmp_dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(
              lambda readable: csv.reader(_get_file_reader(readable))))

      assert_that(parsed_rows, equal_to(expected_rows))

示例#15

0

显示文件

    def test_fail_on_directories(self):
        """ReadMatches(skip_directories=False) must fail on a directory."""
        content = 'thecontent\n'
        tempdir = '%s%s' % (self._new_tempdir(), os.sep)

        # Create a couple files to be matched
        files = [
            self._create_temp_file(dir=tempdir, content=content)
            for _ in range(2)
        ]

        # The pipeline runs when its context exits, inside assertRaises.
        with self.assertRaises(beam.io.filesystem.BeamIOError), \
                TestPipeline() as p:
            _ = (
                p
                | beam.Create(files + ['%s/' % tempdir])
                | fileio.ReadMatches(skip_directories=False)
                | beam.Map(lambda readable: readable.read_utf8()))

示例#16

0

显示文件

文件： fileio_test.py 项目： l2pg/beam_moremmr

    def test_string_filenames_and_skip_directory(self):
        """ReadMatches accepts plain path strings and skips the directory."""
        content = 'thecontent\n'
        tempdir = '%s/' % self._new_tempdir()

        # Create a couple files to be matched
        files = [
            self._create_temp_file(dir=tempdir, content=content)
            for _ in range(2)
        ]

        with TestPipeline() as p:
            decoded = (
                p
                | beam.Create(files + [tempdir])
                | fileio.ReadMatches()
                | beam.Map(lambda readable: readable.read().decode('utf-8')))

            assert_that(decoded, equal_to([content] * 2))

示例#17

0

显示文件

  def expand(self, root):
    """Read every file matching ``self.path`` into a deferred dataframe.

    The first chunk of the first matching file is read eagerly; its empty
    slice is used as the proxy of the result.

    Args:
      root: Pipeline root the read is attached to.

    Returns:
      The value produced by ``convert.to_dataframe`` for the read.

    Raises:
      FileNotFoundError: If no file matches ``self.path``.
    """
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])
    match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
    if not match.metadata_list:
      # Without this guard an empty match surfaces as an opaque IndexError.
      raise FileNotFoundError('Found no files that match %r' % (self.path, ))
    first = match.metadata_list[0].path
    with io.filesystems.FileSystems.open(first) as handle:
      # Merging through dict() lets the sampling chunksize override one given
      # in self.kwargs; passing both as keywords would raise TypeError.
      df = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | fileio.ReadMatches()
        | beam.ParDo(_ReadFromPandasDoFn(self.reader, self.args, self.kwargs)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', df[:0]))

示例#18

0

显示文件

    def test_read_gzip_compressed_file_without_suffix(self):
        """Open a gzip file lacking a .gz suffix with an explicit GZIP type."""
        import gzip

        tmp_dir = '%s%s' % (self._new_tempdir(), os.sep)
        payload = b'compressed_contents!'
        with gzip.GzipFile(os.path.join(tmp_dir, 'compressed'), 'w') as out:
            out.write(payload)

        with TestPipeline() as p:
            decompressed = (
                p
                | beam.Create([FileSystems.join(tmp_dir, '*')])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.Map(
                    lambda readable: readable.open(
                        compression_type=CompressionTypes.GZIP).read(
                            len(payload))))

            assert_that(decompressed, equal_to([payload]))

示例#19

0

显示文件

  def test_write_to_single_file_batch(self):
    """Write JSON-serialized records with WriteToFiles and read them back."""
    out_dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | beam.io.fileio.WriteToFiles(path=out_dir))

    with TestPipeline() as p:
      decoded = (
          p
          | fileio.MatchFiles(FileSystems.join(out_dir, '*'))
          | fileio.ReadMatches()
          | beam.FlatMap(
              lambda readable: readable.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      assert_that(decoded, equal_to(list(self.SIMPLE_COLLECTION)))

示例#20

0

显示文件

    def test_infer_compressed_file(self):
        """Read .gz and .bz2 files through ReadableFile.open() with defaults."""
        import gzip
        import bz2

        tmp_dir = '%s%s' % (self._new_tempdir(), os.sep)

        gzip_payload = b'compressed_contents!'
        with gzip.GzipFile(os.path.join(tmp_dir, 'compressed.gz'), 'w') as out:
            out.write(gzip_payload)

        bz2_payload = b'compressed_contents_bz2!'
        with bz2.BZ2File(os.path.join(tmp_dir, 'compressed2.bz2'), 'w') as out:
            out.write(bz2_payload)

        with TestPipeline() as p:
            first_lines = (
                p
                | beam.Create([FileSystems.join(tmp_dir, '*')])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.Map(lambda readable: readable.open().readline()))

            assert_that(first_lines, equal_to([gzip_payload, bz2_payload]))

示例#21

0

显示文件

文件： fileio_test.py 项目： l2pg/beam_moremmr

    def test_csv_file_source(self):
        """Parse a temp CSV file through ReadMatches and csv.reader."""
        csv_text = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
        expected_rows = [line.split(',') for line in csv_text.split('\n')]

        tmp_dir = '%s/' % self._new_tempdir()
        self._create_temp_file(dir=tmp_dir, content=csv_text)

        def get_csv_reader(readable_file):
            # On Python 3 the opened handle is wrapped in a text layer
            # before being handed to csv.reader.
            handle = readable_file.open()
            if sys.version_info >= (3, 0):
                handle = io.TextIOWrapper(handle)
            return csv.reader(handle)

        with TestPipeline() as p:
            parsed_rows = (
                p
                | beam.Create([tmp_dir])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.FlatMap(get_csv_reader))

            assert_that(parsed_rows, equal_to(expected_rows))

示例#22

0

显示文件

文件： io.py 项目： fernando-wizeline/beam

    def expand(self, root):
        """Build a deferred dataframe over all files matching ``self.path``.

        A sample read eagerly from the first matching file supplies the
        proxy. A side input mapping each matched path to its position in
        sorted order is passed to the reading DoFn as ``path_indices``.

        Raises:
            FileNotFoundError: If nothing matches ``self.path``.
        """
        paths_pcoll = root | beam.Create([self.path])
        probe = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not probe.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines
            # if user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")

        sample_path = probe.metadata_list[0].path
        with io.filesystems.FileSystems.open(sample_path) as handle:
            source = handle if self.binary else TextIOWrapper(handle)
            if self.incremental:
                # Only the first chunk is needed for the sample.
                chunked_kwargs = dict(self.kwargs, chunksize=100)
                sample = next(
                    self.reader(source, *self.args, **chunked_kwargs))
            else:
                sample = self.reader(source, *self.args, **self.kwargs)

        matches_pcoll = paths_pcoll | fileio.MatchAll()
        # A single-element PCollection holding {path: index in sorted order}.
        indices_pcoll = (
            matches_pcoll.pipeline
            | 'DoOnce' >> beam.Create([None])
            | beam.Map(
                lambda _, paths: {
                    path: ix
                    for ix, path in enumerate(sorted(paths))
                },
                paths=beam.pvalue.AsList(
                    matches_pcoll
                    | beam.Map(lambda match: match.path))))

        read_fn = _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                      self.binary, self.incremental,
                                      self.splitter)
        pcoll = (
            matches_pcoll
            | beam.Reshuffle()
            | fileio.ReadMatches()
            | beam.ParDo(
                read_fn,
                path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll, proxy=sample[:0])

示例#23

0

显示文件

def run(argv=None):
  """Main entry point; defines and runs the file-parsing pipeline.

  Files matching ``--input`` are read whole, passed through
  ``parse_filename``/``parse_file`` and ``MetaAndContent``, and the resulting
  elements are written both to BigQuery and to the ``--output`` text file.

  Args:
    argv: Command-line arguments. ``--input`` and ``--output`` are consumed
      here; everything else is forwarded to the pipeline as options.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  from apache_beam.io import fileio

  # Build the pipeline from pipeline_options so that the forwarded arguments
  # and save_main_session actually take effect.
  with beam.Pipeline(options=pipeline_options) as p:
    output = (p
              | "match" >> fileio.MatchFiles(known_args.input)
              | "read match" >> fileio.ReadMatches()
              | "read_file" >> beam.Map(lambda x: (x.metadata.path,
                                                   x.read_utf8()))
              | "parse file" >> beam.Map(lambda x: (parse_filename(x[0]),
                                                    parse_file(x[1].split('\n'))
                                                   ))
              | "unfold" >> beam.ParDo(MetaAndContent())
              # | "debug" >> beam.FlatMap(lambda x: print(x))
    )

    table_spec = 'brainscode-140622:tf2up.conversions'
    table_schema = {'fields': [
        {'name': 'date', 'type': 'DATE'},
        {'name': 'file_hash', 'type': 'STRING'},
        {'name': 'line', 'type': 'INT64'},
        {'name': 'position', 'type': 'INT64'},
        {'name': 'severity', 'type': 'STRING'},
        {'name': 'message', 'type': 'STRING'},
        {'name': 'ops', 'type': 'STRING', 'mode': 'REPEATED'}
    ]}

    # Rows are appended (WRITE_APPEND) to a table that must already exist
    # (CREATE_NEVER).
    # pylint: disable=expression-not-assigned
    output | 'store to BQ' >> beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
        method="STREAMING_INSERTS"
    )

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

示例#24

0

显示文件

文件： HC_CHS_ATS_DF.py 项目： superfonz/BQ_Projects

def run(argv=None):
    """Load the ATS CSV exports from GCS into a BigQuery table on Dataflow.

    Every ``*.csv`` file under the hard-coded GCS folder is read line by
    line (skipping one header line per file), each line is converted into a
    row dictionary, and the rows replace the contents of the output table.

    Args:
        argv: Command-line arguments. ``--output`` names the destination
            BigQuery table (default ``chs.ats_master``). ``--input`` is
            accepted but not used; the file pattern is hard-coded.
            Unrecognised arguments are forwarded as pipeline options,
            followed by the fixed Dataflow settings below.
    """
    class DataIngestion:
        """Converts raw CSV lines into BigQuery row dictionaries."""

        # Column names in the order they appear in the CSV export.
        COLUMNS = (
            'Requisition_Number', 'Opportunity_Title', 'Opportunity_Status',
            'Featured', 'Company_Code', 'Company', 'Entity', 'Entity_Desc',
            'Source_Job_Code', 'Job_Title', 'FullTime_Or_PartTime',
            'Salary_Or_Hourly', 'Recruiter', 'Location_Name', 'Date_Applied',
            'Source', 'Step', 'Step_Date', 'Recruiting_Hire_Date',
            'Start_Date', 'Candidate', 'Candidate_Email_Address',
            'Candidate_Primary_Phone', 'First_Published_Date',
            'Average_Days_Between_Publish_Hire_Dates')

        def parse_method(self, string_input):
            """Turn one CSV line into a dict keyed by column name.

            Quoted fields may contain commas; they are kept as one value
            instead of shifting every later column. Quote characters are
            removed from the values. If the line cannot be parsed as CSV,
            it is split on every comma after removing quotes.

            Args:
                string_input: One line of the CSV file.

            Returns:
                dict mapping column names to the values found on the line;
                columns beyond the number of values are omitted.
            """
            # Imported locally so the method carries its own dependency.
            import csv

            line = re.sub('\r\n', '', string_input)
            try:
                parsed = next(csv.reader([line]), [])
            except csv.Error:
                parsed = []

            if parsed:
                values = [re.sub(u'"', '', field) for field in parsed]
            else:
                # Empty or unparsable line: plain split on commas.
                values = re.split(",", re.sub(u'"', '', line))

            return dict(zip(self.COLUMNS, values))

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # default='gs://hc_crackerbarrel_ats/Historical'
    )
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='chs.ats_master')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=hireclix',
        '--region=us-east1',
        '--staging_location=gs://hc_chs_ats/File_Temp/Source',
        '--temp_location=gs://hc_chs_ats/File_Temp/Staging',
        '--job_name=chstest1'
    ])

    data_ingestion = DataIngestion()

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        readable_files = (p
                          | 'Matching .csv files' >> fileio.MatchFiles('gs://hc_chs_ats/File_Temp/Temp_File/*.csv')
                          | 'Read Matches' >> fileio.ReadMatches()
                          | 'Rebalance data inputs' >> beam.Reshuffle())
        # Only the path is kept; the content is read by ReadAllFromText.
        files_and_content = (readable_files
                             | 'Determine FilePath' >> beam.Map(lambda x: x.metadata.path))
        writebq = (files_and_content
                   | 'Read from a File' >> beam.io.ReadAllFromText(skip_header_lines=1)
                   | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s))
                   | 'Write to BigQuery' >> beam.io.Write(
                    beam.io.WriteToBigQuery(known_args.output,
                                            schema='Requisition_Number:STRING,'
                                                   'Opportunity_Title:STRING,'
                                                   'Opportunity_Status:STRING,'
                                                   'Featured:STRING,'
                                                   'Company_Code:STRING,'
                                                   'Company:STRING,'
                                                   'Entity:STRING,'
                                                   'Entity_Desc:STRING,'
                                                   'Source_Job_Code:STRING,'
                                                   'Job_Title:STRING,'
                                                   'FullTime_Or_PartTime:STRING,'
                                                   'Salary_Or_Hourly:STRING,'
                                                   'Recruiter:STRING,'
                                                   'Location_Name:STRING,'
                                                   'Date_Applied:DATE,'
                                                   'Source:STRING,'
                                                   'Step:STRING,'
                                                   'Step_Date:DATE,'
                                                   'Recruiting_Hire_Date:DATE,'
                                                   'Start_Date:DATE,'
                                                   'Candidate:STRING,'
                                                   'Candidate_Email_Address:STRING,'
                                                   'Candidate_Primary_Phone:STRING,'
                                                   'First_Published_Date:DATE,'
                                                   'Average_Days_Between_Publish_Hire_Dates:STRING',
                                            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))