def _AddCrossSliceMetrics(  # pylint: disable=invalid-name
    sliced_combiner_outputs: beam.pvalue.PCollection,
    cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]]
) -> beam.pvalue.PCollection:
    """Generates CrossSlice metrics from SingleSlices."""
    def is_slice_applicable(
        sliced_combiner_output: Tuple[slicer.SliceKeyType,
                                      metric_types.MetricsDict],
        slicing_specs: Union[config.SlicingSpec, Iterable[config.SlicingSpec]]
    ) -> bool:
        slice_key, _ = sliced_combiner_output
        for slicing_spec in slicing_specs:
            if slicer.SingleSliceSpec(
                    spec=slicing_spec).is_slice_applicable(slice_key):
                return True
        return False

    def compute_cross_slices(
        baseline_slice: Tuple[slicer.SliceKeyType, metric_types.MetricsDict],
        comparison_slices: Iterable[Tuple[slicer.SliceKeyType,
                                          Dict[metric_types.MetricKey, Any]]]
    ) -> Iterator[Tuple[slicer.CrossSliceKeyType, Dict[metric_types.MetricKey,
                                                       Any]]]:
        baseline_slice_key, baseline_metrics = baseline_slice
        for (comparison_slice_key, comparison_metrics) in comparison_slices:
            result = {}
            for (comparison_metric_key,
                 comparison_metric_value) in comparison_metrics.items():
                if comparison_metric_key not in baseline_metrics:
                    continue
                result[comparison_metric_key] = (
                    baseline_metrics[comparison_metric_key] -
                    comparison_metric_value)
            yield ((baseline_slice_key, comparison_slice_key), result)

    cross_slice_outputs = []
    for cross_slice_ind, cross_slice_spec in enumerate(cross_slice_specs):
        baseline_slices = (
            sliced_combiner_outputs
            | 'FilterBaselineSlices(%d)' % cross_slice_ind >> beam.Filter(
                is_slice_applicable, [cross_slice_spec.baseline_spec]))

        slicing_specs = list(cross_slice_spec.slicing_specs)
        comparison_slices = (
            sliced_combiner_outputs
            | 'FilterComparisonSlices(%d)' % cross_slice_ind >> beam.Filter(
                is_slice_applicable, slicing_specs))

        cross_slice_outputs.append(
            baseline_slices
            | 'GenerateCrossSlices(%d)' % cross_slice_ind >> beam.FlatMap(
                compute_cross_slices,
                comparison_slices=beam.pvalue.AsIter(comparison_slices)))

    if cross_slice_outputs:
        cross_slice_outputs = (cross_slice_outputs
                               | 'FlattenCrossSliceResults' >> beam.Flatten())
        return ([sliced_combiner_outputs, cross_slice_outputs]
                | 'CombineSingleSlicesWithCrossSlice' >> beam.Flatten())
    else:
        return sliced_combiner_outputs
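
A minimal, self-contained sketch of the side-input pattern used above: a baseline PCollection is cross-joined against the full set of slices via beam.pvalue.AsIter, and each shared metric is diffed against the baseline. The slice keys and metric values here are illustrative, not part of the original.

import apache_beam as beam

def _diff_against_baseline(baseline, comparisons):
    base_key, base_metrics = baseline
    for comp_key, comp_metrics in comparisons:
        yield (base_key, comp_key), {
            k: base_metrics[k] - v
            for k, v in comp_metrics.items() if k in base_metrics
        }

with beam.Pipeline() as p:
    slices = p | beam.Create([
        (('country', 'US'), {'auc': 0.80}),
        (('country', 'CA'), {'auc': 0.75}),
    ])
    baseline = slices | beam.Filter(lambda kv: kv[0] == ('country', 'US'))
    _ = (baseline
         | beam.FlatMap(_diff_against_baseline,
                        comparisons=beam.pvalue.AsIter(slices))
         | beam.Map(print))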
Example #2
  def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:

    def _sum_pairwise(
        iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int, float]]]
    ) -> Tuple[Union[int, float], Union[int, float]]:
      """Computes sum of counts and weights."""
      # We take advantage of the fact that constructing a np array from a list
      # is much faster as the length is known beforehand.
      if isinstance(iter_of_pairs, list):
        arr = np.array(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      else:
        arr = np.fromiter(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      return arr['c'].sum(), arr['w'].sum()

    if self._weight_feature is not None:
      sum_fn = _sum_pairwise
    else:
      # For non-weighted case, use sum combine fn over integers to allow Beam
      # to use Cython combiner.
      sum_fn = sum
    top_k_tuples_combined = (
        pcoll
        | 'ToTopKTuples' >> beam.FlatMap(
            _to_topk_tuples,
            bytes_features=self._bytes_features,
            categorical_features=self._categorical_features,
            weight_feature=self._weight_feature)
        | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn)
        | 'Rearrange' >> beam.MapTuple(lambda k, v: ((k[0], k[1]), (v, k[2]))))
    # (slice_key, feature), (count_and_maybe_weight, value)

    top_k = top_k_tuples_combined
    if self._weight_feature is not None:
      top_k |= 'Unweighted_DropWeightsAndRearrange' >> beam.MapTuple(
          lambda k, v: (k, (v[0][0], v[1])))
      # (slice_key, feature), (count, value)
    top_k = (
        top_k
        | 'Unweighted_TopK' >> beam.combiners.Top().PerKey(
            max(self._num_top_values, self._num_rank_histogram_buckets))
        | 'Unweighted_ToFeatureValueCount' >> beam.MapTuple(
            lambda k, v: (k, [FeatureValueCount(t[1], t[0]) for t in v]))
        | 'Unweighted_ToProto' >> beam.Map(
            _make_dataset_feature_stats_proto_with_topk_for_single_feature,
            categorical_features=self._categorical_features,
            is_weighted_stats=False,
            num_top_values=self._num_top_values,
            frequency_threshold=self._frequency_threshold,
            num_rank_histogram_buckets=self._num_rank_histogram_buckets))
    uniques = (
        top_k_tuples_combined
        | 'Uniques_Keys' >> beam.Keys()
        | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
        | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
            _make_dataset_feature_stats_proto_with_uniques_for_single_feature,
            categorical_features=self._categorical_features))
    result_protos = [top_k, uniques]

    if self._weight_feature is not None:
      weighted_top_k = (
          top_k_tuples_combined
          | 'Weighted_DropCountsAndRearrange'
          >> beam.MapTuple(lambda k, v: (k, (v[0][1], v[1])))
          # (slice_key, feature), (weight, value)
          | 'Weighted_TopK' >> beam.combiners.Top().PerKey(
              max(self._num_top_values, self._num_rank_histogram_buckets))
          | 'Weighted_ToFeatureValueCount' >> beam.MapTuple(
              lambda k, v: (k, [FeatureValueCount(t[1], t[0]) for t in v]))
          | 'Weighted_ToProto' >> beam.Map(
              _make_dataset_feature_stats_proto_with_topk_for_single_feature,
              categorical_features=self._categorical_features,
              is_weighted_stats=True,
              num_top_values=self._num_top_values,
              frequency_threshold=self._weighted_frequency_threshold,
              num_rank_histogram_buckets=self._num_rank_histogram_buckets))
      result_protos.append(weighted_top_k)

    return (result_protos
            | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten())
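
The Rearrange and TopK steps above lean on tuple ordering: beam.combiners.Top.PerKey keeps the largest tuples per key, so placing the count (or weight) first sorts by it. A small sketch with made-up data:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([(('slice', 'color'), (5, 'red')),
                        (('slice', 'color'), (2, 'blue')),
                        (('slice', 'color'), (9, 'green'))])
         | beam.combiners.Top.PerKey(2)
         | beam.Map(print))
    # Prints: (('slice', 'color'), [(9, 'green'), (5, 'red')])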
Example #3
import datetime
import re

import apache_beam as beam
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)
from google.cloud.bigtable import row

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project_id
google_cloud_options.job_name = job_id
google_cloud_options.staging_location = bucket_id
google_cloud_options.temp_location = bucket_id
options.view_as(StandardOptions).runner = 'DataflowRunner'


class FormatAsRow(beam.DoFn):
    def process(self, element):
        cf = 'cf'
        column_names = ['prediction', 'time', 'prob_0', 'prob_1']
        direct_row = row.DirectRow(row_key=element['event_id'])
        for name in column_names:
            direct_row.set_cell(column_family_id=cf,
                                column=name,
                                value=element[name],
                                timestamp=datetime.datetime.now())
        yield direct_row


with beam.Pipeline(options=options) as p:
    _ = p | beam.io.ReadFromText(bucket_id+'kinglear.txt')\
        | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))\
        | beam.combiners.Count.PerElement()\
        | beam.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))\
        | beam.io.WriteToText(bucket_id+'counts.txt')
    # The with-block runs the pipeline and waits for it to finish on exit.

Example #4
 def expand(self, pcoll):
     return (pcoll
             | beam.FlatMap(lambda line: map(int, line.split(',')))
             | beam.Map(lambda num: num * 10))
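
A hypothetical usage of the transform above, assuming its input is a PCollection of comma-separated lines:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['1,2,3', '4,5'])
         | beam.FlatMap(lambda line: map(int, line.split(',')))
         | beam.Map(lambda num: num * 10)
         | beam.Map(print))  # 10, 20, 30, 40, 50 (in some order)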
Example #5
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.RemoveDuplicates()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps the above mentioned tag strings to an iterable containing the
        # word total for that URI and word and count respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the lambda (called total) receives the value we listed after the lambda
        # in Map(). Additional side inputs (and ordinary Python values, too) can
        # be provided to MapFns and DoFns in this way.
        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                lambda word_count, total:
                (word_count[0], float(word_count[1]) / total),
                AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
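
A minimal sketch of the two Beam features the comments above describe: CoGroupByKey over a dict of keyed PCollections, and an AsSingleton side input passed as an extra argument. The keys and counts are made up.

import apache_beam as beam

with beam.Pipeline() as p:
    totals = p | 'Totals' >> beam.Create([('doc1', 3), ('doc2', 2)])
    counts = p | 'Counts' >> beam.Create([('doc1', ('cat', 2)),
                                          ('doc1', ('dog', 1)),
                                          ('doc2', ('cat', 2))])
    n_docs = p | 'NumDocs' >> beam.Create([2])
    _ = ({'word totals': totals, 'word counts': counts}
         | beam.CoGroupByKey()
         # The second lambda argument receives the singleton side input.
         | beam.Map(lambda kv, total:
                    (kv[0], {tag: list(vs) for tag, vs in kv[1].items()}, total),
                    beam.pvalue.AsSingleton(n_docs))
         | beam.Map(print))
    # e.g. ('doc1', {'word totals': [3], 'word counts': [('cat', 2), ('dog', 1)]}, 2)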
Example #6
import sys

def my_grep(line, term):
   if line.startswith(term):
      yield line

if __name__ == '__main__':
   p = beam.Pipeline(argv=sys.argv)
   input = '../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/*.java'
   output_prefix = '/tmp/output'
   searchTerm = 'import'

   # find all lines that contain the searchTerm
   (p
      | 'GetJava' >> beam.io.ReadFromText(input)
      | 'Grep' >> beam.FlatMap(lambda line: my_grep(line, searchTerm) )
      | 'write' >> beam.io.WriteToText(output_prefix)
   )

   p.run().wait_until_finish()

Example #7
    def test_typed_flatmap(self):
        def fn(element: int) -> typehints.Iterable[int]:
            yield element * 2

        result = [1, 2, 3] | beam.FlatMap(fn)
        self.assertCountEqual([2, 4, 6], result)
Example #8
import apache_beam as beam
project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))

tornadoesMonths = beam.FlatMap(lambda row: [(int(row['month']), 1)]
                               if row['tornado'] else [])

monthlyCount = beam.CombinePerKey(sum)
frmat = beam.Map(lambda kv: {'month': kv[0], 'tornado_count': kv[1]})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)

p.run()
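
The same FlatMap → CombinePerKey(sum) → Map logic, sketched against a few in-memory rows instead of BigQuery and without the old Read/Write wrappers used above; the rows are made up.

import apache_beam as beam

rows = [{'month': 1, 'tornado': True},
        {'month': 1, 'tornado': False},
        {'month': 2, 'tornado': True}]

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(rows)
         | beam.FlatMap(lambda row: [(int(row['month']), 1)]
                        if row['tornado'] else [])
         | beam.CombinePerKey(sum)
         | beam.Map(lambda kv: {'month': kv[0], 'tornado_count': kv[1]})
         | beam.Map(print))  # {'month': 1, 'tornado_count': 1}, {'month': 2, ...}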
Example #9
def make_beam_pipeline(
    root, input_filenames, sample_rate, debug, embedding_names,
    embedding_modules, module_output_keys, audio_key, sample_rate_key,
    label_key, speaker_id_key, average_over_time, delete_audio_from_output,
    output_filename, input_format='tfrecord', output_format='tfrecord',
    suffix='Main'):
  """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    sample_rate: Python int, or `None`. The sample rate for all embeddings,
      or `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embeddings.
    embedding_modules: Python list of TF-Hub modules.
    module_output_keys: Python list of strings, names of output modules.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key for the sample rate.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from
      outputs.
    output_filename: Python string. Output filename.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix to stage names to make them unique.
  """
  tf_examples_key_ = 'tf_examples'
  assert tf_examples_key_ not in embedding_names
  s = suffix  # for code brevity.

  # Read from input.
  input_examples = reader_functions[input_format](root, input_filenames, s)

  # In debug mode, take one input example.
  if debug:
    input_examples = (
        input_examples
        | f'TakeOne{s}' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
        # Sampling generates lists, so flatten back into one collection.
        | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x))

  # Compute all the embeddings simultaneously.
  embedding_tables = {}
  for name, mod, out_key in zip(
      embedding_names, embedding_modules, module_output_keys):
    logging.info('Adding signal: %s %s, %s', name, mod, out_key)
    tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
        ComputeEmbeddingMapFn(
            name=name,
            module=mod,
            output_key=out_key,
            audio_key=audio_key,
            sample_rate_key=sample_rate_key,
            sample_rate=sample_rate,
            average_over_time=average_over_time))
    embedding_tables[name] = tbl
  assert tf_examples_key_ not in embedding_tables
  embedding_tables[tf_examples_key_] = input_examples
  logging.info('embedding_tables: %s', embedding_tables)

  # Combine embeddings and tf.train.Example, using the common key.
  combined_tbl = (
      embedding_tables
      | f'CombineEmbeddingTables-{s}' >> beam.CoGroupByKey()
      | f'AddEmbeddings-{s}' >> beam.Map(
          _add_embedding_column_map_fn,
          original_example_key=tf_examples_key_,
          delete_audio_from_output=delete_audio_from_output,
          audio_key=audio_key,
          label_key=label_key,
          speaker_id_key=speaker_id_key))

  output_filename = f'{output_filename}@*'
  logging.info('Writing to %s', output_filename)
  writer_functions[output_format](combined_tbl, output_filename, s)
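
The debug branch above uses a pattern worth isolating: Sample.FixedSizeGlobally yields a single list, so a FlatMap is needed to flatten it back into individual elements. A standalone sketch:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(100))
         | beam.combiners.Sample.FixedSizeGlobally(1)
         # The sample is one list; flatten it back into a PCollection of elements.
         | beam.FlatMap(lambda xs: xs)
         | beam.Map(print))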
Example #10
    def test_progress_metrics(self):
        p = self.create_pipeline()
        if not isinstance(p.runner, fn_api_runner.FnApiRunner):
            # This test is inherited by others that may not support the same
            # internal way of accessing progress metrics.
            self.skipTest('Progress metrics not supported.')
            return

        _ = (p
             | beam.Create([0, 0, 0, 5e-3 * DEFAULT_SAMPLING_PERIOD_MS])
             | beam.Map(time.sleep)
             | beam.Map(lambda x: ('key', x))
             | beam.GroupByKey()
             | 'm_out' >> beam.FlatMap(lambda x: [
                 1, 2, 3, 4, 5,
                 beam.pvalue.TaggedOutput('once', x),
                 beam.pvalue.TaggedOutput('twice', x),
                 beam.pvalue.TaggedOutput('twice', x)
             ]))

        res = p.run()
        res.wait_until_finish()

        def has_mi_for_ptransform(monitoring_infos, ptransform):
            for mi in monitoring_infos:
                if ptransform in mi.labels['PTRANSFORM']:
                    return True
            return False

        try:
            # TODO(ajamato): Delete this block after deleting the legacy metrics code.
            # Test the DEPRECATED legacy metrics
            pregbk_metrics, postgbk_metrics = list(
                res._metrics_by_stage.values())
            if 'Create/Read' not in pregbk_metrics.ptransforms:
                # The metrics above are actually unordered. Swap.
                pregbk_metrics, postgbk_metrics = postgbk_metrics, pregbk_metrics
            self.assertEqual(
                4, pregbk_metrics.ptransforms['Create/Read'].
                processed_elements.measured.output_element_counts['out'])
            self.assertEqual(
                4, pregbk_metrics.ptransforms['Map(sleep)'].processed_elements.
                measured.output_element_counts['None'])
            self.assertLessEqual(
                4e-3 * DEFAULT_SAMPLING_PERIOD_MS,
                pregbk_metrics.ptransforms['Map(sleep)'].processed_elements.
                measured.total_time_spent)
            self.assertEqual(
                1, postgbk_metrics.ptransforms['GroupByKey/Read'].
                processed_elements.measured.output_element_counts['None'])

            # The actual stage name ends up being something like 'm_out/lambda...'
            m_out, = [
                metrics
                for name, metrics in list(postgbk_metrics.ptransforms.items())
                if name.startswith('m_out')
            ]
            self.assertEqual(
                5, m_out.processed_elements.measured.
                output_element_counts['None'])
            self.assertEqual(
                1, m_out.processed_elements.measured.
                output_element_counts['once'])
            self.assertEqual(
                2, m_out.processed_elements.measured.
                output_element_counts['twice'])

            # Test the new MonitoringInfo monitoring format.
            self.assertEqual(2, len(res._monitoring_infos_by_stage))
            pregbk_mis, postgbk_mis = list(
                res._monitoring_infos_by_stage.values())

            if not has_mi_for_ptransform(pregbk_mis, 'Create/Read'):
                # The monitoring infos above are actually unordered. Swap.
                pregbk_mis, postgbk_mis = postgbk_mis, pregbk_mis

            def assert_has_monitoring_info(monitoring_infos,
                                           urn,
                                           labels,
                                           value=None,
                                           ge_value=None):
                def contains_labels(monitoring_info, labels):
                    return len([
                        x for x in labels.items()
                        if x[0] in monitoring_info.labels
                        and monitoring_info.labels[x[0]] == x[1]
                    ]) == len(labels)

                # TODO(ajamato): Consider adding a matcher framework
                found = 0
                for mi in monitoring_infos:
                    if contains_labels(mi, labels) and mi.urn == urn:
                        if (ge_value is not None
                                and mi.metric.counter_data.int64_value >=
                                ge_value):
                            found = found + 1
                        elif (value is not None
                              and mi.metric.counter_data.int64_value == value):
                            found = found + 1
                ge_value_str = {'ge_value': ge_value} if ge_value else ''
                value_str = {'value': value} if value else ''
                self.assertEqual(
                    1, found,
                    "Found (%s) Expected only 1 monitoring_info for %s." % (
                        found,
                        (urn, labels, value_str, ge_value_str),
                    ))

            # pregbk monitoring infos
            labels = {'PCOLLECTION': 'ref_PCollection_PCollection_1'}
            assert_has_monitoring_info(pregbk_mis,
                                       monitoring_infos.ELEMENT_COUNT_URN,
                                       labels,
                                       value=4)
            labels = {'PCOLLECTION': 'ref_PCollection_PCollection_2'}
            assert_has_monitoring_info(pregbk_mis,
                                       monitoring_infos.ELEMENT_COUNT_URN,
                                       labels,
                                       value=4)
            labels = {'PTRANSFORM': 'Map(sleep)'}
            assert_has_monitoring_info(pregbk_mis,
                                       monitoring_infos.TOTAL_MSECS_URN,
                                       labels,
                                       ge_value=4 * DEFAULT_SAMPLING_PERIOD_MS)

            # postgbk monitoring infos
            labels = {'PCOLLECTION': 'ref_PCollection_PCollection_6'}
            assert_has_monitoring_info(postgbk_mis,
                                       monitoring_infos.ELEMENT_COUNT_URN,
                                       labels,
                                       value=1)
            labels = {'PCOLLECTION': 'ref_PCollection_PCollection_7'}
            assert_has_monitoring_info(postgbk_mis,
                                       monitoring_infos.ELEMENT_COUNT_URN,
                                       labels,
                                       value=5)
        except:
            print(res._monitoring_infos_by_stage)
            raise
Example #11
  publicdata.samples.natality
WHERE year > 2000
AND weight_pounds > 0
AND mother_age > 0
AND plurality > 0
AND gestation_weeks > 0
AND month > 0
    """

  if in_test_mode:
    query = query + ' LIMIT 100'

  for step in ['train', 'eval']:
    if step == 'train':
      selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) < 3'.format(query)
    else:
      selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) = 3'.format(query)

    (p
     | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query = selquery, use_standard_sql = True))
     | '{}_csv'.format(step) >> beam.FlatMap(to_csv)
     | '{}_out'.format(step) >> beam.io.Write(beam.io.WriteToText(os.path.join(OUTPUT_DIR, '{}.csv'.format(step))))
    )

  job = p.run()
  if in_test_mode:
    job.wait_until_finish()
    print("Done!")

preprocess(in_test_mode = False)
Example #12
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   copy_job_name_pcv, singleton_pc):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the
         appropriate destination table. This ensures atomicity when some of
         the load jobs fail but not others: if any of them fails, the copy
         jobs are not triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format), load_job_name_pcv, *
                self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            singleton_pc
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(temp_tables_load_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client),
                copy_job_name_pcv))

        finished_copy_jobs_pc = (
            singleton_pc
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format), load_job_name_pcv, *
                self.schema_side_inputs))

        _ = (singleton_pc
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 beam.pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
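
TriggerLoadJobs above emits a main output plus a tagged TEMP_TABLES output. A minimal sketch of that multi-output ParDo pattern; the DoFn and tag name here are illustrative only.

import apache_beam as beam

class SplitEvenOdd(beam.DoFn):
    ODD = 'odd'  # illustrative tag name

    def process(self, element):
        if element % 2:
            yield beam.pvalue.TaggedOutput(self.ODD, element)
        else:
            yield element  # untagged values go to the main output

with beam.Pipeline() as p:
    outputs = (p
               | beam.Create([1, 2, 3, 4])
               | beam.ParDo(SplitEvenOdd()).with_outputs(SplitEvenOdd.ODD,
                                                         main='main'))
    _ = outputs['main'] | 'PrintEven' >> beam.Map(print)           # 2, 4
    _ = outputs[SplitEvenOdd.ODD] | 'PrintOdd' >> beam.Map(print)  # 1, 3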
Example #13
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())

class ComputeWordLengthFn(beam.DoFn):
    def process(self,element):
        element = element.split(' ')
        element = [(len(i),i) for i in element if len(i)]
        return element

def find_rangeFn(element, lower, upper):
    if lower <= element[0] <= upper:
        return [element]
    return []

(p 
   | "Read the file" >> beam.io.ReadFromText('/mybeam/beam_programs/create_pipeline.py')
   | "lenght of line" >> beam.ParDo(ComputeWordLengthFn())
   | "find word between range" >> beam.FlatMap(find_rangeFn,10,100)
   | "Write Output" >> beam.io.WriteToText('mybeam/sideinputs.txt')
)


p.run().wait_until_finish()


Example #14
def make_beam_pipeline(root,
                       input_filenames,
                       sample_rate,
                       debug,
                       embedding_names,
                       embedding_modules,
                       module_output_keys,
                       audio_key,
                       sample_rate_key,
                       label_key,
                       speaker_id_key,
                       average_over_time,
                       delete_audio_from_output,
                       output_filename,
                       split_embeddings_into_separate_tables=False,
                       use_frontend_fn=False,
                       input_format='tfrecord',
                       output_format='tfrecord',
                       suffix='Main'):
    """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    sample_rate: Python int, or `None`. The sample rate for all embeddings,
      or `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embeddings.
    embedding_modules: Python list of TF-Hub modules.
    module_output_keys: Python list of strings, names of output modules.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key for the sample rate.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from
      outputs.
    output_filename: Python string. Output filename.
    split_embeddings_into_separate_tables: Python bool. If true, write each
      embedding to a separate table.
    use_frontend_fn: If `true`, call frontend fn on audio before passing to the
      model.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix to stage names to make them unique.
  """
    tf_examples_key_ = 'tf_examples'
    assert tf_examples_key_ not in embedding_names
    s = suffix  # for code brevity.

    # Read from input.
    input_examples = reader_functions[input_format](root, input_filenames, s)

    # In debug mode, take one input example.
    if debug:
        input_examples = (
            input_examples
            | f'TakeOne{s}' >>
            beam.transforms.combiners.Sample.FixedSizeGlobally(1)
            # Sampling generates lists, so flatten back into one collection.
            | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x))

    # Compute all the embeddings simultaneously.
    embedding_tables = {}
    for name, mod, out_key in zip(embedding_names, embedding_modules,
                                  module_output_keys):
        logging.info('Adding signal: %s %s, %s', name, mod, out_key)
        tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
            ComputeEmbeddingMapFn(
                name=name,
                module=mod,
                output_key=out_key,
                audio_key=audio_key,
                sample_rate_key=sample_rate_key,
                sample_rate=sample_rate,
                average_over_time=average_over_time,
                feature_fn=_default_feature_fn if use_frontend_fn else None))
        embedding_tables[name] = tbl
    assert tf_examples_key_ not in embedding_tables
    embedding_tables[tf_examples_key_] = input_examples
    logging.info('embedding_tables: %s', embedding_tables)

    # Either write to one table with all embeddings, or one table per embedding.
    if split_embeddings_into_separate_tables:
        output_table_dicts = [(k, {
            k: v,
            tf_examples_key_: input_examples
        }) for k, v in embedding_tables.items() if k != tf_examples_key_]
    else:
        output_table_dicts = [('all', embedding_tables)]

    # Combine embeddings and tf.train.Example, using the common key.
    writer_function = writer_functions[output_format]
    for name, embedding_tables in output_table_dicts:
        if split_embeddings_into_separate_tables:
            cur_s = f'{name}-{s}'
            # Add `name` as a subdir.
            dirname, basename = os.path.split(output_filename)
            cur_output_filename = os.path.join(dirname, name, f'{basename}@*')
        else:
            cur_s = s
            cur_output_filename = f'{output_filename}@*'
        combined_tbl = (
            embedding_tables
            | f'CombineEmbeddingTables-{cur_s}' >> beam.CoGroupByKey()
            | f'AddEmbeddings-{cur_s}' >> beam.Map(
                _add_embedding_column_map_fn,
                original_example_key=tf_examples_key_,
                delete_audio_from_output=delete_audio_from_output,
                audio_key=audio_key,
                label_key=label_key,
                speaker_id_key=speaker_id_key))
        logging.info('Writing to %s', cur_output_filename)
        writer_function(combined_tbl, cur_output_filename, cur_s)
Example #15
def ComputeQueryBasedMetrics(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    prediction_key: str,
    query_id: str,
    combine_fns: List[beam.CombineFn],
) -> beam.pvalue.PCollection:
    """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    prediction_key: Key in predictions dictionary to use as the prediction (for
      sorting examples within the query). Use the empty string if the Estimator
      returns a predictions Tensor (not a dictionary).
    query_id: Key of query ID column in the features dictionary.
    combine_fns: List of query based metrics combine functions.

  Returns:
    PCollection of (slice key, query-based metrics).
  """

    missing_query_id_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'missing_query_id')

    def key_by_query_id(extract: types.Extracts,
                        query_id: str) -> Iterator[Tuple[str, types.Extracts]]:
        """Extract the query ID from the extract and key by that."""
        features = extract[constants.FEATURES_PREDICTIONS_LABELS_KEY].features
        if query_id not in features:
            missing_query_id_counter.inc()
            return
        feature_value = features[query_id][encoding.NODE_SUFFIX]
        if isinstance(feature_value, tf.compat.v1.SparseTensorValue):
            feature_value = feature_value.values
        if feature_value.size != 1:
            raise ValueError(
                'Query ID feature "%s" should have exactly 1 value, but '
                'found %d instead. Values were: %s' %
                (query_id, feature_value.size, feature_value))
        yield ('{}'.format(np.asscalar(feature_value)), extract)

    def merge_dictionaries(
            dictionaries: Tuple[Dict[str, Any], ...]) -> Dict[str, Any]:
        """Merge dictionaries in a tuple into a single dictionary."""
        result = dict()
        for d in dictionaries:
            intersection = set(d.keys()) & set(result.keys())
            if intersection:
                raise ValueError(
                    'Overlapping keys found when merging dictionaries. '
                    'Intersection was: %s. Keys up to this point: %s '
                    'keys from next dictionary: %s' %
                    (intersection, result.keys(), d.keys()))
            result.update(d)
        return result

    # pylint: disable=no-value-for-parameter
    return (
        extracts
        | 'KeyByQueryId' >> beam.FlatMap(key_by_query_id, query_id)
        | 'CreateQueryExamples' >> beam.CombinePerKey(
            CreateQueryExamples(prediction_key=prediction_key))
        | 'DropQueryId' >> beam.Map(lambda kv: kv[1]._replace(query_id=kv[0]))
        | 'CombineGlobally' >> beam.CombineGlobally(
            beam.combiners.SingleInputTupleCombineFn(*combine_fns))
        | 'MergeDictionaries' >> beam.Map(merge_dictionaries)
        | 'AddOverallSliceKey' >> beam.Map(lambda v: ((), v)))
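
A small sketch of the SingleInputTupleCombineFn step used above: it feeds the same input to several CombineFns and returns a tuple of their results. The combine fns chosen here are just for illustration.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([1.0, 2.0, 3.0, 4.0])
         | beam.CombineGlobally(
             beam.combiners.SingleInputTupleCombineFn(
                 beam.combiners.CountCombineFn(),
                 beam.combiners.MeanCombineFn()))
         | beam.Map(print))  # (4, 2.5)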
Example #16
# pipeline3.py: Read data from a file and write the results to another file
import apache_beam as beam
from apache_beam.io import WriteToText, ReadFromText

with beam.Pipeline() as pipeline:
    lines = pipeline | ReadFromText('sample1.txt')

    subjects = (lines | 'Subjects' >> beam.FlatMap(str.split))

    subjects | WriteToText(file_path_prefix='subjects',
                           file_name_suffix='.txt',
                           shard_name_template='')
Example #17
"""
This Dataflow program finds all the wineries in California based on the provided data.
"""
import apache_beam as beam
import sys


def find_wineries(line, searchText):
    if searchText in line:
        yield line


if __name__ == "__main__":
    p = beam.Pipeline(argv=sys.argv)
    input = '../data/spikey_winery_list.csv'
    output = '../output/calWineries'

    searchText = 'California'

    (p
     | 'ReadData' >> beam.io.ReadFromText(input)
     | 'GrepSearchText' >>
     beam.FlatMap(lambda line: find_wineries(line, searchText))
     | 'WriteOutput' >> beam.io.WriteToText(output))

    p.run().wait_until_finish()
Example #18
  def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict.
        output_metadata, pcollections = output_metadata
        schema = output_metadata.schema
        # Update DatasetMetadata to remove excluded outputs
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))
        # Update pcollections to keep only pcollections that resolve futures in
        # the updated metadata.
        unresolved_future_names = set(
            future.name for future in output_metadata.substitute_futures({}))
        pcollections = {
            name: pcollection
            for name, pcollection in six.iteritems(pcollections)
            if name in unresolved_future_names
        }
        # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            output_metadata, pcollections)
      else:
        schema = output_metadata.schema
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))

    def convert_and_unbatch(batch_dict):
      return impl_helper.to_instance_dicts(output_metadata.schema, batch_dict)

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            self.pipeline.runner))
    output_instances = (
        input_values
        | 'Batch' >> _BatchElements()
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_metadata.schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
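
_BatchElements above is a tf.Transform-internal wrapper; the overall shape of the pipeline, batch then process each batch then FlatMap to unbatch, can be sketched with Beam's own BatchElements. Batch sizes and data are arbitrary.

import apache_beam as beam
from apache_beam.transforms.util import BatchElements

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(list(range(10)))
         | BatchElements(min_batch_size=3, max_batch_size=3)
         | 'ProcessBatch' >> beam.Map(lambda batch: [x * 2 for x in batch])
         # FlatMap turns each processed batch back into individual elements.
         | 'Unbatch' >> beam.FlatMap(lambda batch: batch)
         | beam.Map(print))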
Example #19
def prepare_tfrecord(input_audio_paths,
                     output_tfrecord_path,
                     num_shards=None,
                     sample_rate=16000,
                     frame_rate=250,
                     window_secs=4,
                     hop_secs=1,
                     eval_split_fraction=0.0,
                     coarse_chunk_secs=20.0,
                     pipeline_options=''):
  """Prepares a TFRecord for use in training, evaluation, and prediction.

  Args:
    input_audio_paths: An iterable of paths to audio files to include in
      TFRecord.
    output_tfrecord_path: The prefix path to the output TFRecord. Shard numbers
      will be added to actual path(s).
    num_shards: The number of shards to use for the TFRecord. If None, this
      number will be determined automatically.
    sample_rate: The sample rate to use for the audio.
    frame_rate: The frame rate to use for f0 and loudness features. If set to
      None, these features will not be computed.
    window_secs: The size of the sliding window (in seconds) to use to split the
      audio and features. If 0, they will not be split.
    hop_secs: The number of seconds to hop when computing the sliding windows.
    eval_split_fraction: Fraction of the dataset to reserve for eval split. If
      set to 0, no eval split is created.
    coarse_chunk_secs: Chunk size in seconds used to split the input audio
      files. This is used to split large audio files into manageable chunks
      for better parallelization and to enable non-overlapping train/eval
      splits.
    pipeline_options: An iterable of command line arguments to be used as
      options for the Beam Pipeline.
  """
  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      pipeline_options)
  with beam.Pipeline(options=pipeline_options) as pipeline:
    examples = (
        pipeline
        | beam.Create(input_audio_paths)
        | beam.Map(_load_audio, sample_rate))

    if frame_rate:
      examples = (
          examples
          | beam.Map(_add_f0_estimate, sample_rate, frame_rate)
          | beam.Map(add_loudness, sample_rate, frame_rate))

    if coarse_chunk_secs:
      examples |= beam.FlatMap(split_example, sample_rate, frame_rate,
                               coarse_chunk_secs, coarse_chunk_secs)

    def postprocess_pipeline(examples, output_path, stage_name=''):
      if stage_name:
        stage_name = f'_{stage_name}'

      if window_secs:
        examples |= f'create_batches{stage_name}' >> beam.FlatMap(
            split_example, sample_rate, frame_rate, window_secs, hop_secs)
      _ = (
          examples
          | f'reshuffle{stage_name}' >> beam.Reshuffle()
          | f'make_tfexample{stage_name}' >> beam.Map(float_dict_to_tfexample)
          | f'write{stage_name}' >> beam.io.tfrecordio.WriteToTFRecord(
              output_path,
              num_shards=num_shards,
              coder=beam.coders.ProtoCoder(tf.train.Example)))

    if eval_split_fraction:
      examples |= beam.Map(add_key)
      keys = examples | beam.Keys()
      splits = examples | beam.Partition(eval_split_partition_fn, 2,
                                         eval_split_fraction,
                                         beam.pvalue.AsList(keys))

      # Remove ids.
      eval_split = splits[0] | 'remove_id_eval' >> beam.Map(lambda x: x[1])
      train_split = splits[1] | 'remove_id_train' >> beam.Map(lambda x: x[1])

      postprocess_pipeline(eval_split, f'{output_tfrecord_path}-eval', 'eval')
      postprocess_pipeline(train_split, f'{output_tfrecord_path}-train',
                           'train')
    else:
      postprocess_pipeline(examples, output_tfrecord_path)
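
The eval/train split above relies on beam.Partition, which routes each element using a user-supplied partition function. A self-contained sketch with an illustrative partition function (not the eval_split_partition_fn used above):

import apache_beam as beam

def by_first_letter(element, num_partitions, eval_letters):
    # Partition 0 plays the role of "eval", partition 1 of "train".
    return 0 if element[0] in eval_letters else 1

with beam.Pipeline() as p:
    splits = (p
              | beam.Create(['apple', 'banana', 'cherry', 'avocado'])
              | beam.Partition(by_first_letter, 2, 'a'))
    _ = splits[0] | 'PrintEval' >> beam.Map(print)   # apple, avocado
    _ = splits[1] | 'PrintTrain' >> beam.Map(print)  # banana, cherry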
Example #20
def split(element):
    try:
        list_words = element.split()
        list_words_clean = list(map(cleanpunc, list_words))
        list_words_lower = list(
            map(lambda x: x.lower().strip(), list_words_clean))
        list_word_key = list(map(lambda x: (x, 1), list_words_lower))
        return list_word_key
    except:
        pass
    # try:
    #     return element.split()
    # except:
    #     pass


def cleanpunc(
    sentence
):  #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|%|!|+|*|@|&|^|`|~|\'|"|#|=]', r'', sentence)
    cleaned = re.sub(r'[:|;|.|)|(|,|\|/|_|-]', r'', cleaned)
    #cleaned = re.sub(r'\s+',r' ',cleaned)
    return cleaned


p1 = beam.Pipeline()

Word_Count = (p1
              | beam.io.ReadFromText(input_file)
              | beam.FlatMap(split)
              | beam.CombinePerKey(sum)
              | beam.io.WriteToText(output_file))

p1.run()
Example #21
    def _build_pcollection(self, pipeline, filepaths, language):
        def _extract_content(filepath):
            # Extracts article content from a single WikiMedia XML file.
            context = etree.iterparse(filepath,
                                      events=("end", ),
                                      encoding="utf-8")
            context = iter(context)
            # To clear root, to free-up more memory than just `elem.clear()`.
            _, root = next(context)
            for _, elem in context:
                if not elem.tag.endswith("page"):
                    continue
                namespace = elem.tag[:-4]
                title = elem.find("./{0}title".format(namespace)).text
                ns = elem.find("./{0}ns".format(namespace)).text
                id_ = elem.find("./{0}id".format(namespace)).text

                # Filter pages that are not in the "main" namespace.
                if ns != "0":
                    root.clear()
                    continue

                raw_content = elem.find(
                    "./{0}revision/{0}text".format(namespace)).text
                root.clear()

                # Filter redirects.
                if raw_content is None or raw_content.lower().startswith(
                        "#redirect"):
                    beam.metrics.Metrics.counter(language,
                                                 "filtered-redirects").inc()
                    continue

                beam.metrics.Metrics.counter(language,
                                             "extracted-examples").inc()

                yield (id_, title, raw_content)

        def _clean_content(inputs):
            id_, title, raw_content = inputs

            try:
                text = _parse_and_clean_wikicode(raw_content)
            except (mwparserfromhell.parser.ParserError) as e:
                beam.metrics.Metrics.counter(language, "parser-error").inc()
                return

            if not text:
                beam.metrics.Metrics.counter(language,
                                             "empty-clean-examples").inc()
                return

            beam.metrics.Metrics.counter(language, "cleaned-examples").inc()

            yield id_, {"title": title, "text": text}

        feedback("Creating pipeline: extract => shuffle => parse/clean...")

        return (pipeline
                | beam.Create(filepaths)
                | beam.FlatMap(_extract_content)
                | beam.transforms.Reshuffle()
                | beam.FlatMap(_clean_content))
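
The per-language counters above use Beam's metrics API. A tiny sketch of counting dropped elements inside a FlatMap; the namespace and counter name are made up.

import apache_beam as beam

def keep_short(word):
    dropped = beam.metrics.Metrics.counter('example', 'dropped-words')
    if len(word) > 5:
        dropped.inc()
        return  # a bare return in a generator simply yields nothing
    yield word

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['cat', 'elephant', 'dog'])
         | beam.FlatMap(keep_short)
         | beam.Map(print))  # cat, dog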
Example #22
import re
import apache_beam as beam

# Run Dataflow pipeline
pipeline = beam.Pipeline('DirectRunner')

(pipeline
 | 'read file' >>
 beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
 | 'get words' >>
 beam.FlatMap(lambda x: re.findall(r'\w+', x)).with_output_types(str)
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.io.WriteToText('./wordcount_output'))
pipeline.run()
Example #23
            def expand(self, pcolls):

                scalar_inputs = [
                    expr for expr in self.stage.inputs if is_scalar(expr)
                ]
                tabular_inputs = [
                    expr for expr in self.stage.inputs if not is_scalar(expr)
                ]

                if len(tabular_inputs) == 0:
                    partitioned_pcoll = next(iter(
                        pcolls.values())).pipeline | beam.Create([{}])

                elif self.stage.partitioning != partitionings.Nothing():
                    # Partitioning required for these operations.
                    # Compute the number of partitions to use for the inputs based on
                    # the estimated size of the inputs.
                    if self.stage.partitioning == partitionings.Singleton():
                        # Always a single partition, don't waste time computing sizes.
                        num_partitions = 1
                    else:
                        # Estimate the sizes from the outputs of a *previous* stage such
                        # that using these estimates will not cause a fusion break.
                        input_sizes = [
                            estimate_size(input, same_stage_ok=False)
                            for input in tabular_inputs
                        ]
                        if None in input_sizes:
                            # We were unable to (cheaply) compute the size of one or more
                            # inputs.
                            num_partitions = DEFAULT_PARTITIONS
                        else:
                            num_partitions = beam.pvalue.AsSingleton(
                                input_sizes
                                | 'FlattenSizes' >> beam.Flatten()
                                | 'SumSizes' >> beam.CombineGlobally(sum)
                                | 'NumPartitions' >> beam.Map(lambda size: max(
                                    MIN_PARTITIONS,
                                    min(MAX_PARTITIONS, size //
                                        TARGET_PARTITION_SIZE))))

                    partition_fn = self.stage.partitioning.partition_fn

                    class Partition(beam.PTransform):
                        def expand(self, pcoll):
                            return (
                                pcoll
                                # Attempt to create batches of reasonable size.
                                | beam.ParDo(_PreBatch())
                                # Actually partition.
                                | beam.FlatMap(partition_fn, num_partitions)
                                # Don't bother shuffling empty partitions.
                                | beam.Filter(lambda k_df: len(k_df[1])))

                    # Arrange such that partitioned_pcoll is properly partitioned.
                    main_pcolls = {
                        expr._id: pcolls[expr._id] | 'Partition_%s_%s' %
                        (self.stage.partitioning, expr._id) >> Partition()
                        for expr in tabular_inputs
                    } | beam.CoGroupByKey()
                    partitioned_pcoll = main_pcolls | beam.ParDo(_ReBatch())

                else:
                    # Already partitioned, or no partitioning needed.
                    assert len(tabular_inputs) == 1
                    tag = tabular_inputs[0]._id
                    partitioned_pcoll = pcolls[tag] | beam.Map(
                        lambda df: {tag: df})

                side_pcolls = {
                    expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
                    for expr in scalar_inputs
                }

                # Actually evaluate the expressions.
                def evaluate(partition, stage=self.stage, **side_inputs):
                    def lookup(expr):
                        # Use proxy if there's no data in this partition
                        return expr.proxy().iloc[:0] if partition[
                            expr._id] is None else partition[expr._id]

                    session = expressions.Session(
                        dict([(expr, lookup(expr))
                              for expr in tabular_inputs] +
                             [(expr, side_inputs[expr._id])
                              for expr in scalar_inputs]))
                    for expr in stage.outputs:
                        yield beam.pvalue.TaggedOutput(
                            expr._id, expr.evaluate_at(session))

                return partitioned_pcoll | beam.FlatMap(
                    evaluate, **side_pcolls).with_outputs()
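
A minimal standalone sketch of the same idea, showing how a partition count derived from the data's size can be fed to the partitioning step as an AsSingleton side input; the constants and partition_fn below are illustrative stand-ins, not the DataFrame API's own values:

import apache_beam as beam

# Illustrative bounds only; not the values used by the code above.
MIN_PARTITIONS, MAX_PARTITIONS, TARGET_PARTITION_SIZE = 1, 100, 10

def partition_fn(element, num_partitions):
    # Route each element to one of a runtime-chosen number of partitions.
    yield (hash(element) % num_partitions, element)

with beam.Pipeline() as p:
    data = p | beam.Create(['x' * n for n in range(50)])
    num_partitions = beam.pvalue.AsSingleton(
        data
        | 'Sizes' >> beam.Map(len)
        | 'SumSizes' >> beam.CombineGlobally(sum)
        | 'NumPartitions' >> beam.Map(
            lambda size: max(MIN_PARTITIONS,
                             min(MAX_PARTITIONS, size // TARGET_PARTITION_SIZE))))
    partitioned = (
        data
        | 'KeyByPartition' >> beam.FlatMap(partition_fn, num_partitions)
        | 'GroupPartitions' >> beam.GroupByKey())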
Example #24
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions()) as p:
    table_schema = {'fields': [
        {'name': 'key', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'value', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    ]}

    table_spec = bigquery.TableReference(
        projectId='XXXXX',
        datasetId='XXXXx',
        tableId='word_cnt')

    def sum_val(tup):
        (key, val) = tup
        return {'key': key, 'value': sum(val)}

    out = (
        p
        | 'read from txt' >> ReadFromText(
            r"F:\codebase\Dataengineering_stuff\Dataflow\dee.txt.txt")
        | beam.FlatMap(lambda x: x.split(' '))
        | beam.Map(lambda x: (x, 1))
        | beam.GroupByKey()
        | beam.Map(sum_val)
        # | WriteToText(r"F:\codebase\Dataengineering_stuff\Dataflow\dee1.txt")
    )
    out | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
    )
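
As a variant, the sink could take Beam's string schema shorthand instead of the schema dict; this sketch reuses `out` and `table_spec` from the example above:

out | beam.io.WriteToBigQuery(
    table_spec,
    schema='key:STRING,value:INTEGER',
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)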
Example #25
def run(argv=None):
    """Runs the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BigQuery table: PROJECT:DATASET.TABLE')
    known_args, pipeline_args = parser.parse_known_args(argv)

    schema = bigquery.TableSchema()
    schema.fields.append(field('Alexa_rank', 'integer'))
    schema.fields.append(field('Alexa_domain'))

    schema.fields.append(field('DMOZ_title'))
    schema.fields.append(field('DMOZ_description'))
    schema.fields.append(field('DMOZ_url'))
    schema.fields.append(field('DMOZ_topic', 'string', 'repeated'))

    schema.fields.append(field('Host'))
    schema.fields.append(field('FinalLocation'))
    schema.fields.append(field('HTTPOk', 'boolean'))
    schema.fields.append(field('HTTPSOk', 'boolean'))
    schema.fields.append(field('HTTPSOnly', 'boolean'))

    schema.fields.append(build_response_schema('HTTPResponses'))
    schema.fields.append(build_response_schema('HTTPSResponses'))
    schema.fields.append(field('Error'))

    options = PipelineOptions(pipeline_args)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    # https://cloud.google.com/dataflow/pipelines/specifying-exec-params
    gc_options = options.view_as(GoogleCloudOptions)
    gc_options.project = 'httparchive'
    gc_options.job_name = 'host-scan-import-' + str(datetime.date.today())
    gc_options.staging_location = 'gs://httparchive/dataflow-binaries'
    gc_options.temp_location = 'gs://httparchive/dataflow-tmp'

    wk_options = options.view_as(WorkerOptions)
    wk_options.num_workers = 10

    # options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
    options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

    p = beam.Pipeline(options=options)
    (p
     | 'read' >> beam.Read(
         beam.io.TextFileSource(known_args.input, coder=JsonCoder()))
     | 'process' >> beam.FlatMap(process_record)
     # | 'local-write' >> beam.Write(beam.io.TextFileSink('./results')))
     | 'bq-write' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run()
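
The read/write steps here appear to target a pre-2.x SDK (beam.Read, TextFileSource, BigQuerySink, the 'DataflowPipelineRunner' name). A rough equivalent of the tail of run() on current Beam, reusing JsonCoder, process_record, schema, options and known_args from the example above, might look like this sketch:

    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = beam.Pipeline(options=options)
    (p
     | 'read' >> beam.io.ReadFromText(known_args.input, coder=JsonCoder())
     | 'process' >> beam.FlatMap(process_record)
     | 'bq-write' >> beam.io.WriteToBigQuery(
         known_args.output,
         schema=schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
    p.run().wait_until_finish()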
Example #26
 def expand(self, pc):
     return (pc
             | beam.Map(rotate_key)
             | beam.GroupByKey()
             | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v)
                                                      for v in elm[1]]))
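
A self-contained way to exercise the transform above on the DirectRunner; rotate_key is not shown in the excerpt, so the swap-key-and-value version below is an assumption:

import apache_beam as beam

def rotate_key(kv):
    # Hypothetical helper: swap key and value so grouping happens on the former value.
    k, v = kv
    return (v, k)

class RotateAndRegroup(beam.PTransform):
    def expand(self, pc):
        return (pc
                | beam.Map(rotate_key)
                | beam.GroupByKey()
                | 'Ungroup' >> beam.FlatMap(
                    lambda elm: [(elm[0], v) for v in elm[1]]))

with beam.Pipeline() as p:
    (p
     | beam.Create([('a', 1), ('b', 1), ('c', 2)])
     | RotateAndRegroup()
     | beam.Map(print))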
Example #27
 def expand(self, pcoll):
     return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(
         self.parse_element)
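
parse_element is not shown in the excerpt; a hypothetical version, assuming the elements are JSON payloads of Cloud Storage change notifications, could look like this:

import json
import apache_beam as beam

class ReadGCSNotifications(beam.PTransform):
    def parse_element(self, message):
        # Assumed input: the JSON body of a GCS change notification.
        # Emit (bucket, object name) pairs; skip messages without an object name.
        payload = json.loads(message)
        if 'name' in payload:
            yield (payload.get('bucket'), payload['name'])

    def expand(self, pcoll):
        return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(
            self.parse_element)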
Example #28
 def expand(self, pc):
     return (pc
             | beam.Map(rotate_key)
             | beam.Map(
                 lambda elem, ignored: elem,
                 beam.pvalue.AsIter(pc | beam.FlatMap(lambda elem: None))))
Example #29
      # convert all times to UTC
      dep_airport_id = fields[6]
      arr_airport_id = fields[10]
      dep_timezone = airport_timezones[dep_airport_id][2]
      arr_timezone = airport_timezones[arr_airport_id][2]

      for f in [13, 14, 17]: #crsdeptime, deptime, wheelsoff
         fields[f] = as_utc(fields[0], fields[f], dep_timezone)
      for f in [18, 20, 21]: #wheelson, crsarrtime, arrtime
         fields[f] = as_utc(fields[0], fields[f], arr_timezone)

      yield ','.join(fields)

if __name__ == '__main__':
   with beam.Pipeline('DirectRunner') as pipeline:

      airports = (pipeline
         | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
         | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
         | 'airports:tz' >> beam.Map(lambda fields: (fields[0], addtimezone(fields[21], fields[26])))
      )

      flights = (pipeline
         | 'flights:read' >> beam.io.ReadFromText('201501_part.csv')
         | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports))
      )

      flights | beam.io.textio.WriteToText('all_flights')

      # No explicit pipeline.run() is needed: the `with` block runs the pipeline on exit.
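
addtimezone and as_utc (and the opening of tz_correct, which the FlatMap call suggests takes a CSV line plus the airports dict) are not shown in the excerpt. One plausible shape for the two helpers, using timezonefinder and pytz and assuming the flight date column is YYYY-MM-DD, is sketched below; it is not necessarily the original implementation:

import datetime
import pytz
import timezonefinder

_TF = timezonefinder.TimezoneFinder()

def addtimezone(lat, lon):
    # Attach the IANA timezone name for the airport's coordinates.
    return (lat, lon, _TF.timezone_at(lng=float(lon), lat=float(lat)))

def as_utc(date, hhmm, tzone):
    # Convert a local 'HHMM' time on the given date to a UTC timestamp string.
    if not hhmm or tzone is None:
        return ''
    loc_dt = pytz.timezone(tzone).localize(
        datetime.datetime.strptime(date, '%Y-%m-%d'))
    loc_dt += datetime.timedelta(hours=int(hhmm[:2]), minutes=int(hhmm[2:]))
    return loc_dt.astimezone(pytz.utc).strftime('%Y-%m-%d %H:%M:%S')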
Example #30
    def expand(self, pcoll):
        """Computes top-k most frequent values and number of uniques."""
        # Convert input example to tuples of form
        # (slice_key, feature_name, feature_value_list, optional weight)
        # corresponding to each example.
        feature_values_with_weights = (
            pcoll
            | 'TopKUniques_ConvertInputToFeatureValuesWithWeights' >>
            beam.FlatMap(_convert_input_to_feature_values_with_weights,
                         categorical_features=self._categorical_features,
                         weight_feature=self._weight_feature))

        # Lambda to convert from ((slice_key, feature_name, feature_value), count)
        # to ((slice_key, feature_name), (feature_value, count))
        modify_key = (lambda x:
                      ((x[0][0], x[0][1]), FeatureValueCount(x[0][2], x[1])))

        sliced_feature_name_value_count = (
            feature_values_with_weights
            # Flatten (slice_key, feature_name, feature_value_list, optional weight)
            # to (slice_key, feature_name, feature_value)
            | 'TopKUniques_FlattenToSlicedFeatureNameValueTuples' >>
            beam.FlatMap(_flatten_value_list)
            # Compute the frequency of each feature_value per slice. Output is a
            # PCollection of ((slice_key, feature_name, feature_value), count)
            | 'TopKUniques_CountSlicedFeatureNameValueTuple' >>
            beam.combiners.Count().PerElement()
            # Convert from ((slice_key, feature_name, feature_value), count) to
            # ((slice_key, feature_name), (feature_value, count))
            |
            'TopKUniques_ModifyKeyToSlicedFeatureName' >> beam.Map(modify_key))

        result_protos = []
        # Find topk values for each feature.
        topk = (
            sliced_feature_name_value_count
            # Obtain the top-k most frequent feature value for each feature in a
            # slice.
            | 'TopK_GetTopK' >> beam.combiners.Top().PerKey(
                max(self._num_top_values, self._num_rank_histogram_buckets),
                _feature_value_count_comparator)
            | 'TopK_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                categorical_features=self._categorical_features,
                is_weighted_stats=False,
                num_top_values=self._num_top_values,
                frequency_threshold=self._frequency_threshold,
                num_rank_histogram_buckets=self._num_rank_histogram_buckets))

        result_protos.append(topk)

        # If a weight feature is provided, find the weighted topk values for each
        # feature.
        if self._weight_feature is not None:
            weighted_topk = (
                # Flatten (slice_key, feature_name, feature_value_list, weight) to
                # ((slice_key, feature_name, feature_value), weight)
                feature_values_with_weights
                | 'TopKWeighted_FlattenToSlicedFeatureNameValueTuples' >>
                beam.FlatMap(_flatten_weighted_value_list)
                # Sum the weights of each feature_value per slice. Output is a
                # PCollection of
                # ((slice_key, feature_name, feature_value), weighted_count)
                | 'TopKWeighted_CountSlicedFeatureNameValueTuple' >>
                beam.CombinePerKey(sum)
                # Convert from
                # ((slice_key, feature_name, feature_value), weighted_count) to
                # ((slice_key, feature_name), (feature_value, weighted_count))
                | 'TopKWeighted_ModifyKeyToSlicedFeatureName' >>
                beam.Map(modify_key)
                # Obtain the top-k most frequent feature value for each feature in a
                # slice.
                | 'TopKWeighted_GetTopK' >> beam.combiners.Top().PerKey(
                    max(self._num_top_values,
                        self._num_rank_histogram_buckets),
                    _feature_value_count_comparator)
                | 'TopKWeighted_ConvertToSingleFeatureStats' >> beam.Map(
                    _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=True,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._weighted_frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets
                ))
            result_protos.append(weighted_topk)

        uniques = (
            sliced_feature_name_value_count
            # Drop the values to only have the slice_key and feature_name with
            # each repeated the number of unique values times.
            | 'Uniques_DropValues' >> beam.Keys()
            | 'Uniques_CountPerFeatureName' >>
            beam.combiners.Count().PerElement()
            | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_uniques_for_single_feature,
                categorical_features=self._categorical_features))
        result_protos.append(uniques)

        def _deserialize_sliced_feature_stats_proto(entry):
            feature_stats_proto = statistics_pb2.DatasetFeatureStatistics()
            feature_stats_proto.ParseFromString(entry[1])
            return entry[0], feature_stats_proto

        return (
            result_protos
            | 'FlattenTopKUniquesResults' >> beam.Flatten()
            # TODO(b/121152126): This deserialization stage is a workaround.
            # Remove this once it is no longer needed.
            | 'DeserializeTopKUniquesFeatureStatsProto' >>
            beam.Map(_deserialize_sliced_feature_stats_proto))
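
FeatureValueCount and _feature_value_count_comparator are defined elsewhere in the library; the shapes below are only a sketch inferred from how they are used above, not copied from the source:

import collections

# Positional order matches FeatureValueCount(x[0][2], x[1]) in modify_key above.
FeatureValueCount = collections.namedtuple('FeatureValueCount',
                                           ['feature_value', 'count'])

def _feature_value_count_comparator(a, b):
    # Order by count, breaking ties on the feature value so top-k results are stable.
    return (a.count < b.count or
            (a.count == b.count and a.feature_value > b.feature_value))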