Example #1
def run_pipeline(in_file):
    import csv

    import apache_beam as beam
    from apache_beam.io.textio import ReadFromText
    from apache_beam.io.textio import WriteToText

    # A simple Apache Beam pipeline executed with the DirectRunner
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "lines" will include pcollections of each line
        # Options
        # file_pattern: File path to file
        # skip_header_lines: First line will be skipped. Set to "1".

        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | 'ReadAirportInfo' >> ReadFromText(
            file_pattern=in_file[0], skip_header_lines=1)

        #
        # Pipeline(1): Create side input
        # The final PCollection will be used as a side input for the date/time conversion in the next transformation
        # 1. Parse each line and return its fields as a list. Use the csv module to strip any double quotes inside fields
        # 2. Filter out records with missing coordinates
        # 3. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26), and add the timezone for the corresponding coordinates
        #
        airports = (collections
                    | 'airports:Extract' >> beam.Map(
                        lambda x: next(csv.reader([x], delimiter=',')))
                    | 'airports:Filter' >> beam.Filter(
                        lambda x: x[21] and x[26])
                    | 'airports:Timezone' >> beam.Map(
                        lambda x: (x[0], addtimezone(x[21], x[26]))))

        #
        # Pipeline(2): Correct timezone
        # 1. Read flight data
        # 2. Convert times into UTC
        flights = (p | 'flights:read' >> ReadFromText(file_pattern=in_file[1],
                                                      skip_header_lines=1)
                   | 'flights:tzcorr' >> beam.FlatMap(
                       tz_correct, beam.pvalue.AsDict(airports)))

        # Write results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (flights | 'flights:out' >> WriteToText(file_path_prefix='flights'))

        # Pipeline(3): Generate departed and arrived events
        # 1. Emit a departed/arrived event for each flight record
        events = flights | 'flights:events' >> beam.FlatMap(get_next_event)

        #
        # Pipeline(Final)
        #
        # Write results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (events | 'event:out' >> WriteToText(file_path_prefix='events'))
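The snippet above relies on helpers (addtimezone, tz_correct, get_next_event) that are defined elsewhere in its source file. Below is a minimal sketch of what they might look like; the column positions and field layout are assumptions for illustration, not the original implementation.

import csv
import datetime

import pytz
from timezonefinder import TimezoneFinder

_TF = TimezoneFinder()

def addtimezone(lat, lon):
    """Return (lat, lon, timezone_name) for the given coordinates."""
    try:
        tz = _TF.timezone_at(lng=float(lon), lat=float(lat))
        return (lat, lon, tz or 'UTC')
    except ValueError:
        return (lat, lon, 'TIMEZONE')  # coordinates could not be parsed

def _as_utc(date, hhmm, tzone):
    """Convert a local 'YYYY-MM-DD' date and 'HHMM' time into a UTC timestamp string."""
    if not hhmm:
        return ''
    local = pytz.timezone(tzone)
    naive = datetime.datetime.strptime(date + ' ' + hhmm, '%Y-%m-%d %H%M')
    return local.localize(naive).astimezone(pytz.utc).strftime('%Y-%m-%d %H:%M:%S')

def tz_correct(line, airport_timezones):
    """Yield the flight record with its local times converted to UTC.

    airport_timezones is the AsDict side input: airport id -> (lat, lon, tz).
    The column positions used here are assumptions.
    """
    fields = next(csv.reader([line]))
    dep_tz = airport_timezones[fields[8]][2]
    arr_tz = airport_timezones[fields[12]][2]
    fields[13] = _as_utc(fields[0], fields[13], dep_tz)  # departure time
    fields[21] = _as_utc(fields[0], fields[21], arr_tz)  # arrival time
    yield ','.join(fields)

def get_next_event(line):
    """Emit a departed and an arrived event for a UTC-corrected flight record."""
    fields = line.split(',')
    if fields[13]:
        yield ','.join(fields) + ',departed,' + fields[13]
    if fields[21]:
        yield ','.join(fields) + ',arrived,' + fields[21]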
Example #2
 def test_read_gzip_empty_file(self):
   with TempDir() as tempdir:
     file_name = tempdir.create_temp_file()
     with TestPipeline() as pipeline:
       pcoll = pipeline | 'Read' >> ReadFromText(
           file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
       assert_that(pcoll, equal_to([]))
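The positional arguments in the ReadFromText call above are easy to misread. Assuming the 2.x signature (file_pattern, min_bundle_size, compression_type, strip_trailing_newlines, coder, ...), the keyword-argument equivalent would look roughly like this ('input.txt.gz' is a hypothetical path):

from apache_beam import coders
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.textio import ReadFromText

# Keyword form of ReadFromText(file_name, 0, CompressionTypes.GZIP, True,
# coders.StrUtf8Coder())
read_transform = ReadFromText(
    file_pattern='input.txt.gz',
    min_bundle_size=0,
    compression_type=CompressionTypes.GZIP,
    strip_trailing_newlines=True,
    coder=coders.StrUtf8Coder())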
Example #3
def run_pipeline(in_file, out_file):
    # A simple Apache Beam pipeline executed with the DirectRunner
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "lines" will include pcollections of each line
        # Options
        # file_pattern: File path to file
        # skip_header_lines: First line will be skipped. Set to "1".

        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | ReadFromText(file_pattern=in_file,
                                       skip_header_lines=1)

        #
        # Pipeline(n): Detailed Transformation
        # 1. Parse each line and return its fields as a list. Use the csv module to strip any double quotes inside fields
        # 2. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26)
        #
        airports = (collections
                    | 'Extract_Into_Fields' >> beam.Map(
                        lambda x: next(csv.reader([x], delimiter=',')))
                    | 'Set_Fields' >> beam.Map(
                        lambda x: (x[0], (x[21], x[26]))))

        #
        # Pipeline(Final)
        #
        # Format each (airport, (latitude, longitude)) pair as a CSV line and write the results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (airports
         | beam.Map(lambda kv: "{0},{1}".format(kv[0], ','.join(kv[1])))
         | WriteToText(file_path_prefix=out_file))
Example #4
  def test_read_from_text_single_file_with_coder(self):

    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
      assert_that(pcoll, equal_to([record * 2 for record in expected_data]))
Example #5
 def test_read_gzip_empty_file(self):
     file_name = self._create_temp_file()
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #6
def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument("--file_location", dest="file_location", required=True)
    parser.add_argument("--schema_location",
                        dest="schema_location",
                        required=True)
    parser.add_argument("--output_table", dest="output_table", required=True)

    known_args, pipeline_args = parser.parse_known_args(argv)

    file_location = known_args.file_location
    schema_location = known_args.schema_location
    output_table = known_args.output_table

    cloud_storage_to_bq = CloudStorageToBigQuery()

    with open("./schemas/customers.json") as f:
        schema_dict = json.load(f)
        table_schema = json_to_schema(schema_dict)

    p = beam.Pipeline(options=_options)
    (p
     | "Read CSV file from cloud storage" >> ReadFromText(
         file_pattern=file_location, skip_header_lines=True)
     | 'CSV Row To BigQuery Row' >>
     beam.Map(lambda s: cloud_storage_to_bq.process_row(s, schema_dict))
     | "Write to BigQuery" >> bigquery.WriteToBigQuery(
         table=f"precis-digital-case-interview:part_1.{output_table}",
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    p.run().wait_until_finish()
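This snippet depends on helpers that are not shown (json_to_schema and CloudStorageToBigQuery.process_row). A possible shape for json_to_schema, assuming the JSON file follows BigQuery's {'fields': [...]} layout, is sketched below; the field handling is an assumption.

def json_to_schema(schema_dict):
    """Convert a {'fields': [{'name', 'type', 'mode'}, ...]} JSON definition
    into the schema dict accepted by beam.io.WriteToBigQuery."""
    return {
        'fields': [
            {
                'name': field['name'],
                'type': field.get('type', 'STRING'),
                'mode': field.get('mode', 'NULLABLE'),
            }
            for field in schema_dict['fields']
        ]
    }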
Example #7
    def get(self):
        """
        Flask view that triggers the execution of the pipeline
        """
        input_filename = 'data/input/titanic.txt'
        output_filename = 'data/output/titanic.txt'

        # project_id = os.environ['DATASTORE_PROJECT_ID']
        # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        # client = datastore.Client.from_service_account_json(credentials_file)

        options = PipelineOptions()
        gcloud_options = options.view_as(GoogleCloudOptions)
        # gcloud_options.project = project_id
        gcloud_options.job_name = 'test-job'

        # Dataflow runner
        runner = os.environ['DATAFLOW_RUNNER']
        options.view_as(StandardOptions).runner = runner

        with apache_beam.Pipeline(options=options) as p:
            rows = (p | ReadFromText(input_filename)
                    | apache_beam.ParDo(Split()))

            survived = (rows | apache_beam.ParDo(CollectSurvived())
                        | apache_beam.GroupByKey()
                        | apache_beam.ParDo(WriteToCSV())
                        | WriteToText(output_filename))

        return 'All Titanic survivors are written to data/output/titanic.txt-00000-of-00001'
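The Split, CollectSurvived and WriteToCSV DoFns are defined elsewhere in that project; a minimal sketch of what they might look like is below (the CSV column layout is an assumption).

import apache_beam

class Split(apache_beam.DoFn):
    """Parse one CSV line into a dict; assumed layout: name,survived,..."""
    def process(self, element):
        name, survived = element.split(',')[:2]
        yield {'name': name, 'survived': int(survived)}

class CollectSurvived(apache_beam.DoFn):
    """Key survivors by a constant so GroupByKey gathers them together."""
    def process(self, element):
        if element['survived'] == 1:
            yield ('survived', element['name'])

class WriteToCSV(apache_beam.DoFn):
    """Format a grouped (key, names) pair as a single CSV line."""
    def process(self, element):
        key, names = element
        yield '{},{}'.format(key, ';'.join(names))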
Example #8
 def test_read_from_text_file_pattern(self):
   pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_data) == 40
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(pattern)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
Example #9
def main():
    args, pipeline_args = get_args()

    # First, PipelineOptions lets you configure the pipeline.
    # For example, you can set the pipeline runner to choose what will execute the pipeline.

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Input data file -> TextIO.Read Transform -> PCollection(lines)
        lines = p | ReadFromText(args.input)

        counts = (
                lines
                | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                              .with_output_types(str))
                | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(args.output)
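get_args is not shown in the snippet; a plausible sketch that provides the input/output flags used above is given below (the flag names are assumptions).

import argparse

def get_args(argv=None):
    """Parse the known pipeline flags and return (args, remaining_pipeline_args)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input file pattern to read.')
    parser.add_argument('--output', required=True, help='Output path prefix for WriteToText.')
    return parser.parse_known_args(argv)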
Example #10
 def test_read_from_text_single_file(self):
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(file_name)
   assert_that(pcoll, equal_to(expected_data))
   pipeline.run()
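Several of these tests call a write_data helper that is not part of the snippet (it lives in Beam's textio test module). A rough, simplified sketch of what such a helper could look like:

import tempfile

def write_data(num_lines):
    """Write num_lines lines to a temp file and return (file_name, lines)."""
    lines = ['line%d' % i for i in range(num_lines)]
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
        f.write('\n'.join(lines))
    return f.name, lines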
Example #11
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )
    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
                "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (serialized_examples
         | ("write " + name) >> write_sink(
             os.path.join(args.output_dir, name),
             file_name_suffix=file_name_suffix,
             num_shards=args.num_shards_train,
         ))

    result = p.run()
    result.wait_until_finish()
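_TrainTestSplitFn is assumed here to be a DoFn that randomly routes each example to a "train" or "test" tagged output; a minimal sketch under that assumption:

import random

import apache_beam as beam
from apache_beam import pvalue

class _TrainTestSplitFn(beam.DoFn):
    TRAIN_TAG = 'train'
    TEST_TAG = 'test'

    def __init__(self, train_split):
        self._train_split = train_split

    def process(self, element):
        # Route each element to the train or test output with probability train_split.
        tag = self.TRAIN_TAG if random.random() < self._train_split else self.TEST_TAG
        yield pvalue.TaggedOutput(tag, element)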
Example #12
def execute():
    with beam.Pipeline('DirectRunner') as p:
        (p 
            | 'ReadFile'      >> ReadFromText(file_pattern='./data/SMSSpamCollection')
            | 'Deduplicate'   >> beam.RemoveDuplicates()
            | 'Parse'         >> beam.FlatMap(parse)
            | 'Write'         >> beam.io.WriteToText('./data/Output.jsonl') 
        )
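The parse function is not shown. The SMSSpamCollection file is tab-separated ("label<TAB>message"), so a plausible sketch that produces the JSON lines written to Output.jsonl is:

import json

def parse(line):
    """Turn one 'label<TAB>message' line into a JSON string, skipping malformed lines."""
    label, _, text = line.partition('\t')
    if text:
        yield json.dumps({'label': label, 'text': text})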
Example #13
 def process(self, element):
     file = {
         'name': element['name'],
         'content': ReadFromText(element['file']) | apache_beam.combiners.ToList()
     }
     return [file]
Example #14
 def test_read_gzip_empty_file(self):
     filename = tempfile.NamedTemporaryFile(delete=False,
                                            prefix=tempfile.template).name
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #15
def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one pcollection per input file or file pattern. And then flatten
    # them into one pcollection. The duplicated names need to be removed as the
    # file name is used to create unique labels for the PTransform.
    readers = []
    for pattern in list(
            set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Setup reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)

        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Setup the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))
    # Write prediction errors counts to output files.
    _ = (
        errors
        | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
        | "WRITE_ERRORS" >> WriteToText(
            os.path.join(args.output_location, OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
Example #16
    def get(self):
        """
        Flask view that triggers the execution of the pipeline
        """
        input_filename = 'input.txt'
        output_filename = 'output.txt'

        # project_id = os.environ['DATASTORE_PROJECT_ID']
        # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        # client = datastore.Client.from_service_account_json(credentials_file)

        options = PipelineOptions()
        gcloud_options = options.view_as(GoogleCloudOptions)
        # gcloud_options.project = project_id
        gcloud_options.job_name = 'test-job'

        # Dataflow runner
        runner = os.environ['DATAFLOW_RUNNER']
        options.view_as(StandardOptions).runner = runner

        with apache_beam.Pipeline(options=options) as p:
            rows = (
                p |
                ReadFromText(input_filename) |
                apache_beam.ParDo(Split())
            )

            timings = (
                rows |
                apache_beam.ParDo(CollectTimings()) |
                "Grouping timings" >> apache_beam.GroupByKey() |
                "Calculating average" >> apache_beam.CombineValues(
                    apache_beam.combiners.MeanCombineFn()
                )
            )

            users = (
                rows |
                apache_beam.ParDo(CollectUsers()) |
                "Grouping users" >> apache_beam.GroupByKey() |
                "Counting users" >> apache_beam.CombineValues(
                    apache_beam.combiners.CountCombineFn()
                )
            )

            to_be_joined = (
                {
                    'timings': timings,
                    'users': users
                } |
                apache_beam.CoGroupByKey() |
                apache_beam.ParDo(WriteToCSV()) |
                WriteToText(output_filename)
            )

        return 'ok'
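As in Example #7, the DoFns used here (Split, CollectTimings, CollectUsers, WriteToCSV) are defined elsewhere. A minimal sketch of the two collectors, assuming Split() yields dicts with 'source', 'duration' and 'user_id' keys (the field names are assumptions):

import apache_beam

class CollectTimings(apache_beam.DoFn):
    """Emit (source, duration) pairs so MeanCombineFn can average per source."""
    def process(self, element):
        yield (element['source'], float(element['duration']))

class CollectUsers(apache_beam.DoFn):
    """Emit (source, user_id) pairs so CountCombineFn can count users per source."""
    def process(self, element):
        yield (element['source'], element['user_id'])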
Example #17
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.bz2')
      with bz2.BZ2File(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
Example #18
    def test_read_auto_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file(suffix='.bz2')
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #19
  def test_read_auto_deflate(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.deflate')
      with open(file_name, 'wb') as f:
        f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
Example #20
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    serialized_examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: create_example(*args).SerializeToString()
    )
    serialized_examples = _shuffle_examples(serialized_examples)

    serialized_examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    (
        serialized_examples[_TrainTestSplitFn.TRAIN_TAG]
        | "write train"
        >> WriteToTFRecord(
            os.path.join(args.output_dir, "train"),
            file_name_suffix=".tfrecords",
            num_shards=args.num_shards_train,
        )
    )
    (
        serialized_examples[_TrainTestSplitFn.TEST_TAG]
        | "write test"
        >> WriteToTFRecord(
            os.path.join(args.output_dir, "test"),
            file_name_suffix=".tfrecords",
            num_shards=args.num_shards_test,
        )
    )

    result = p.run()
    result.wait_until_finish()
Example #21
    def test_read_bzip2(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with bz2.BZ2File(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with TestPipeline() as pipeline:
                pcoll = pipeline | 'Read' >> ReadFromText(
                    file_name, compression_type=CompressionTypes.BZIP2)
                assert_that(pcoll, equal_to(lines))
Example #22
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #23
    def test_read_gzip(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #24
  def test_read_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
Example #25
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #26
    def test_read_gzip_large(self):
        _, lines = write_data(10000)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #27
  def test_read_auto_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.gz')

      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(file_name)
      assert_that(pcoll, equal_to(lines))
      pipeline.run()
Example #28
    def test_read_auto_gzip(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template,
                                                suffix='.gz').name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #29
  def test_read_gzip_with_skip_lines(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name, 0, CompressionTypes.GZIP,
          True, coders.StrUtf8Coder(), skip_header_lines=2)
      assert_that(pcoll, equal_to(lines[2:]))
      pipeline.run()
Example #30
  def test_read_from_text_single_file_with_coder(self):
    class DummyCoder(coders.Coder):
      def encode(self, x):
        raise ValueError

      def decode(self, x):
        return (x * 2).decode('utf-8')

    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
      assert_that(pcoll, equal_to([record * 2 for record in expected_data]))