def expand(self, pvalue):
  return (
      pvalue
      | 'ExpandIntoRanges' >> ParDo(
          _ExpandIntoRanges(
              self._splittable,
              self._compression_type,
              self._desired_bundle_size,
              self._min_bundle_size))
      | 'Reshard' >> Reshuffle()
      | 'ReadRange' >> ParDo(_ReadRange(self._source_from_file)))
def expand(self, pcoll):
  class ReifyTimestamps(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield element[0], TimestampedValue(element[1], timestamp)

  class RestoreTimestamps(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      yield windowed_value.WindowedValue(
          (element[0], element[1].value), element[1].timestamp, [window])

  windowing_saved = pcoll.windowing
  # The linter is confused.
  # pylint: disable=abstract-class-instantiated
  result = (
      pcoll
      | ParDo(ReifyTimestamps())
      | 'IdentityWindow' >> WindowInto(
          _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
          trigger=AfterCount(1),
          accumulation_mode=AccumulationMode.DISCARDING,
          timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
      )
      | GroupByKey()
      | 'ExpandIterable' >> FlatMap(
          lambda e: [(e[0], value) for value in e[1]])
      | ParDo(RestoreTimestamps()))
  result._windowing = windowing_saved
  return result
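A minimal usage sketch for the expansion above, assuming it backs the public apache_beam.Reshuffle transform; the file names and fan-out count are hypothetical, chosen only to show where a reshard helps break fusion:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a.txt', 'b.txt'])  # hypothetical inputs
      # Fan out each element; without a reshard, the step below would stay
      # fused to this one and run on the same workers.
      | beam.FlatMap(lambda name: ((name, i) for i in range(1000)))
      | beam.Reshuffle()  # redistribute elements across workers
      | beam.Map(lambda kv: kv))  # stand-in for expensive per-element work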
def test_fixed_after_count_accumulating(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, two-second windows with a trigger that fires repeatedly after
  # every two elements, accumulating the fired panes.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 2), [1, 1]),
            ('k2', IntervalWindow(0, 2), [1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
        ]))
def expand(self, pcoll):
  if getattr(pcoll.pipeline.runner, 'is_streaming', False):
    raise NotImplementedError("Requires stateful processing (BEAM-2687)")
  elif pcoll.windowing.is_default():
    # This is the same logic as _WindowAwareBatchingDoFn, but optimized
    # for the simpler global-windows case.
    return pcoll | ParDo(
        _GlobalWindowsBatchingDoFn(self._batch_size_estimator))
  else:
    return pcoll | ParDo(_WindowAwareBatchingDoFn(self._batch_size_estimator))
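A minimal usage sketch, assuming the expand above backs apache_beam's BatchElements transform; batch sizes are tuned at runtime, so the grouping shown in the comment is illustrative, not guaranteed:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(10))
      | beam.BatchElements(min_batch_size=2, max_batch_size=4)
      | beam.Map(print))  # e.g. [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]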
def expand(self, pvalue):
  pvalue = (
      pvalue
      | 'ExpandIntoRanges' >> ParDo(
          _ExpandIntoRanges(
              self._splittable,
              self._compression_type,
              self._desired_bundle_size,
              self._min_bundle_size)))
  if self._is_reshuffle:
    pvalue = pvalue | 'Reshard' >> Reshuffle()
  return (
      pvalue
      | 'ReadRange' >> ParDo(
          _ReadRange(
              self._source_from_file, with_filename=self._with_filename)))
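A minimal usage sketch: file-reading composites built on this kind of expansion, such as apache_beam.io.textio.ReadAllFromText, consume a PCollection of file patterns. The bucket path below is hypothetical:

import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['gs://my-bucket/logs/*.txt'])  # hypothetical pattern
      | ReadAllFromText()  # expands patterns into ranges, then reads them
      | beam.Map(print))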
def expand(self, pcoll):
  sdf = self._ptransform.fn
  signature = DoFnSignature(sdf)
  restriction_coder = signature.get_restriction_coder()
  element_coder = typecoders.registry.get_coder(pcoll.element_type)

  keyed_elements = (
      pcoll
      | 'pair' >> ParDo(PairWithRestrictionFn(sdf))
      | 'split' >> ParDo(SplitRestrictionFn(sdf))
      | 'explode' >> ParDo(ExplodeWindowsFn())
      | 'random' >> ParDo(RandomUniqueKeyFn()))

  return keyed_elements | ProcessKeyedElements(
      sdf,
      element_coder,
      restriction_coder,
      pcoll.windowing,
      self._ptransform.args,
      self._ptransform.kwargs,
      self._ptransform.side_inputs)
def expand(self, pcoll):
  input_coder = coders.registry.get_coder(pcoll)
  return pcoll | ParDo(
      _pardo_group_into_batches(
          input_coder,
          self.params.batch_size,
          self.params.max_buffering_duration_secs,
          self.clock))
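A minimal usage sketch, assuming this is the expansion of apache_beam's GroupIntoBatches; the input must be keyed, and batches are emitted per key and window:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', 1), ('a', 2), ('a', 3), ('b', 4)])
      | beam.GroupIntoBatches(batch_size=2)
      | beam.Map(print))  # e.g. ('a', [1, 2]), ('a', [3]), ('b', [4])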
def expand(self, pcoll):
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.coders import typecoders

  input_type = pcoll.element_type
  if input_type is not None:
    # Initialize type-hints used below to enforce type-checking and to
    # pass downstream to further PTransforms.
    key_type, value_type = trivial_inference.key_value_types(input_type)
    # Enforce that the input to a GBK has a KV element type.
    pcoll.element_type = typehints.typehints.coerce_to_kv_type(
        pcoll.element_type)
    typecoders.registry.verify_deterministic(
        typecoders.registry.get_coder(key_type),
        'GroupByKey operation "%s"' % self.label)

    reify_output_type = typehints.KV[
        key_type, typehints.WindowedValue[value_type]]  # type: ignore[misc]
    gbk_input_type = (
        typehints.KV[
            key_type,
            typehints.Iterable[typehints.WindowedValue[  # type: ignore[misc]
                value_type]]])
    gbk_output_type = typehints.KV[key_type, typehints.Iterable[value_type]]

    # pylint: disable=bad-continuation
    return (
        pcoll
        | 'ReifyWindows' >> (
            ParDo(beam.GroupByKey.ReifyWindows()).with_output_types(
                reify_output_type))
        | 'GroupByKey' >> (
            _GroupByKeyOnly().with_input_types(
                reify_output_type).with_output_types(gbk_input_type))
        | (
            'GroupByWindow' >>
            _GroupAlsoByWindow(pcoll.windowing).with_input_types(
                gbk_input_type).with_output_types(gbk_output_type)))
  else:
    # The input_type is None; run the default, untyped path.
    return (
        pcoll
        | 'ReifyWindows' >> ParDo(beam.GroupByKey.ReifyWindows())
        | 'GroupByKey' >> _GroupByKeyOnly()
        | 'GroupByWindow' >> _GroupAlsoByWindow(pcoll.windowing))
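A minimal usage sketch of the GroupByKey expansion above; grouped iterables are unordered, so they are sorted here before printing:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('k', 1), ('k', 2), ('j', 3)])
      | beam.GroupByKey()
      | beam.MapTuple(lambda k, vs: (k, sorted(vs)))
      | beam.Map(print))  # ('j', [3]) and ('k', [1, 2])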
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(custom_windowing or window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  plain_actual = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k_values: k_values[1][1]))

  if custom_windowing:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

  plain_actual = plain_actual | "Match" >> Map(matcher)
def expand(self, pcoll):
  sdf = self._ptransform.fn
  signature = DoFnSignature(sdf)
  invoker = DoFnInvoker.create_invoker(signature, process_invocation=False)
  element_coder = typecoders.registry.get_coder(pcoll.element_type)
  restriction_coder = invoker.invoke_restriction_coder()

  keyed_elements = (
      pcoll
      | 'pair' >> ParDo(PairWithRestrictionFn(sdf))
      | 'split' >> ParDo(SplitRestrictionFn(sdf))
      | 'explode' >> ParDo(ExplodeWindowsFn())
      | 'random' >> ParDo(RandomUniqueKeyFn()))

  return keyed_elements | ProcessKeyedElements(
      sdf,
      element_coder,
      restriction_coder,
      pcoll.windowing,
      self._ptransform.args,
      self._ptransform.kwargs)
def inner(fn):
  sentry_init(default_integrations=False, integrations=[BeamIntegration()])
  # Little hack to avoid having to run the whole pipeline.
  pardo = ParDo(fn)
  signature = pardo._signature
  output_processor = _OutputProcessor()
  return DoFnInvoker.create_invoker(
      signature, output_processor, DoFnContext("test"))
def test_fixed_windows_simple_watermark(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                     tsv('k1', 2, 0), tsv('k2', 2, 0)])
      .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
      .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
      .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
      .advance_watermark_to(1)
      .add_elements([tsv('k1', 6, 0)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with DefaultTrigger (after watermark)
  windowing = Windowing(
      FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
        ]))
def test_sessions_and_complex_trigger_accumulating(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2)])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a 10-second gap and an early/late-firing
  # AfterWatermark trigger, accumulating fired panes.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1]))))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
            ('k1', IntervalWindow(30, 40), {4}),  # on time
            ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
            ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
        ]))
def test_sliding_windows_simple_watermark(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k2', 1)])
      .advance_watermark_to(1)
      .add_elements([('k1', 2), ('k2', 2)])
      .add_elements([('k1', 2), ('k2', 2)])
      .advance_watermark_to(2)
      .add_elements([('k1', 3), ('k2', 3)])
      .add_elements([('k1', 3), ('k2', 3)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Sliding windows of two seconds, sliding every second, with the
  # DefaultTrigger (after watermark).
  windowing = Windowing(SlidingWindows(2, 1))
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k1', IntervalWindow(2, 4), [3, 3]),
            ('k2', IntervalWindow(2, 4), [3, 3]),
        ]))
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  keyed_singleton = pcoll.pipeline | Create([(None, None)])

  if use_global_window:
    pcoll = pcoll | WindowInto(window.GlobalWindows())

  keyed_actual = pcoll | "ToVoidKey" >> Map(lambda v: (None, v))

  # This is a CoGroupByKey so that the matcher always runs, even if the
  # PCollection is empty.
  plain_actual = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k_values: k_values[1][1]))

  if not use_global_window:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

  plain_actual = plain_actual | "Match" >> Map(matcher)
def test_with_trigger_window_that_finish(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
      .add_elements([tsv('k1', 3, 0)])
      .advance_watermark_to(2)
      .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with an AfterWatermark trigger and zero
  # allowed lateness, so the window finishes and late data is dropped.
  windowing = Windowing(
      FixedWindows(1),
      triggerfn=AfterWatermark(),
      allowed_lateness=0,
      accumulation_mode=AccumulationMode.DISCARDING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ]))
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  # We must have at least a single element to ensure the matcher
  # code gets run even if the input pcollection is empty.
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  _ = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k___actual_values: k___actual_values[1][1])
      | "Match" >> Map(matcher))
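A minimal usage sketch of the matcher plumbing above, via the public assert_that / equal_to helpers from apache_beam.testing.util:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  pcoll = p | beam.Create([1, 2, 3])
  # Internally this keys everything to None and CoGroupByKeys with a
  # singleton, so the matcher runs even when the input is empty.
  assert_that(pcoll, equal_to([1, 2, 3]))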
def expand(self, pcoll):
  return pcoll | ParDo(self.add_timestamp_info)
def expand(self, pcoll):
  return pcoll | ParDo(self.add_window_info)
def expand(self, pcoll):
  input_coder = coders.registry.get_coder(pcoll)
  return pcoll | ParDo(
      _pardo_group_into_batches(self.batch_size, input_coder))
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument( '--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read from PubSub into a PCollection. if known_args.input_subscription: lines = p | beam.io.ReadFromPubSub( subscription=known_args.input_subscription) else: lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn()) | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn')) | 'Split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(5, 0)) | 'GroupByKey' >> beam.GroupByKey() | 'CountOnes' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = counts | 'format' >> beam.Map(format_result) # Write to PubSub. # pylint: disable=expression-not-assigned output | beam.io.WriteStringsToPubSub(known_args.output_topic) def check_gbk_format(): # A matcher that checks that the output of GBK is of the form word: count. def matcher(elements): # pylint: disable=unused-variable actual_elements_in_window, window = elements for elm in actual_elements_in_window: assert re.match(r'\S+:\s+\d+', elm) is not None return matcher # Check that the format of the output is correct. assert_that( output, check_gbk_format(), use_global_window=False, label='Assert word:count format.') # Check also that elements are ouput in the right window. # This expects exactly 1 occurrence of any subset of the elements # 150, 151, 152, 153, 154 in the window [150, 155) # or exactly 1 occurrence of any subset of the elements # 210, 211, 212, 213, 214 in the window [210, 215). expected_window_to_elements = { window.IntervalWindow(150, 155): [ ('150: 1'), ('151: 1'), ('152: 1'), ('153: 1'), ('154: 1'), ], window.IntervalWindow(210, 215): [ ('210: 1'), ('211: 1'), ('212: 1'), ('213: 1'), ('214: 1'), ], } # To pass, publish numbers in [150-155) or [210-215) with no repeats. # To fail, publish a repeated number in the range above range. # For example: '210 213 151 213' assert_that( output, equal_to_per_window(expected_window_to_elements), use_global_window=False, label='Assert correct streaming windowing.')
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default=default_input,
      help='Input file to process.')
  parser.add_argument(
      '--table',
      dest='table',
      default=default_table,
      help='Table to upload.')
  parser.add_argument(
      '--dataset',
      dest='dataset',
      default=default_dataset,
      help='Dataset where the table is stored. Needs to exist beforehand.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend(['--project={}'.format(project)])
  pipeline_options = PipelineOptions(pipeline_args)

  with beam.Pipeline(options=pipeline_options) as p:

    def QuestionAPI(tags):
      import datetime
      import pandas as pd
      import requests

      # Calculate the Unix timestamps for today and yesterday.
      today = datetime.date.today() - datetime.date(1970, 1, 1)
      yesterday = today - datetime.timedelta(1)
      from_date = int(yesterday.total_seconds())
      to_date = int(today.total_seconds())

      logging.info('Calling API for tag: "{}"'.format(tags))
      api_url = (
          'http://api.stackexchange.com/2.2/search/advanced'
          '?fromdate={0}&todate={1}&order=desc&sort=activity'
          '&tagged={2}&site=stackoverflow').format(from_date, to_date, tags)
      api_call = requests.get(api_url)
      # Parse the JSON response (safer than eval on the raw content).
      api_call_dict = api_call.json()

      # Create a DataFrame to simplify data processing.
      try:
        so_api_call_DF = pd.DataFrame(api_call_dict['items'])
        if so_api_call_DF.empty:
          logging.info('Tag "{}" does not have questions'.format(tags))
          return []
        else:
          so_api_call_DF = so_api_call_DF[[
              'creation_date',
              'question_id',
              'title',
              'link',
              'tags',
              'is_answered'
          ]]
          # Fix the tags field: flatten the list into a comma-separated
          # string.
          so_api_call_DF['tags'] = so_api_call_DF.tags.apply(
              lambda x: ', '.join(x).replace('[', '').replace(']', ''))
          so_api_call_dict = so_api_call_DF.to_dict('records')
          return so_api_call_dict
      except Exception:
        logging.warning(
            'Unexpected API request output: \n {}'.format(api_call_dict))
        return []

    schema = (
        'creation_date:timestamp,question_id:integer,is_answered:boolean,'
        'title:string,tags:string,link:string')

    api_call = (
        p
        | 'read' >> ReadFromText(known_args.input)
        | 'API call for each Tag' >> ParDo(QuestionAPI)
        | 'Writing to BQ' >> WriteToBigQuery(
            table=known_args.table,
            dataset=known_args.dataset,
            project=project,
            schema=schema,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_APPEND))