def expand(self, pcoll):
     if self._preserve_sample_order:
         return (pcoll
                 | 'GetSampleIds' >> beam.Map(self._get_sample_ids)
                 | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                 | 'Combine' >> beam.combiners.ToList()
                 | 'ExtractUniqueSampleIds' >> beam.ParDo(
                     self._extract_unique_sample_ids))
     else:
         return (pcoll
                 | 'GetSampleIds' >> beam.FlatMap(self._get_sample_ids)
                 | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                 | 'Combine' >> beam.combiners.ToList()
                 | 'SortSampleIds' >> beam.ParDo(sorted))
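All of the snippets on this page use beam.RemoveDuplicates(), which more recent Apache Beam releases also expose under the newer name beam.Distinct(). A minimal, self-contained sketch of the deduplicate-then-collect pattern above (the sample IDs below are made up for illustration and are not part of the original transform):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | 'CreateSampleIds' >> beam.Create(['s1', 's2', 's1', 's3', 's2'])
         | 'RemoveDuplicates' >> beam.RemoveDuplicates()
         | 'Combine' >> beam.combiners.ToList()
         | 'SortSampleIds' >> beam.Map(sorted)
         | 'Print' >> beam.Map(print))  # prints ['s1', 's2', 's3']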
Example 2
  def expand(self, uri_to_content):

    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.RemoveDuplicates()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.

    def split_into_words(uri_line):
      uri, line = uri_line
      return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

    # Count, for each (URI, word) pair, the number of occurrences of that word
    # in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection to a mapping from (URI, word) pairs to counts
    # into an isomorphic mapping from URI to (word, count) pairs, to prepare
    # for a join by the URI key.
    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.Map(
            lambda uri_word_and_count: (uri_word_and_count[0][0],
                                        (uri_word_and_count[0][1],
                                         uri_word_and_count[1]))))
Example 3
 def expand(self, pcoll):
     if self._preserve_call_names_order:
         return (pcoll
                 | 'GetCallNames' >> beam.Map(self._get_call_names)
                 | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                 | 'Combine' >> beam.combiners.ToList()
                 | 'ExtractUniqueCallNames' >> beam.ParDo(
                     self._extract_unique_call_names)
                 | beam.combiners.ToList())
     else:
         return (pcoll
                 | 'GetCallNames' >> beam.FlatMap(self._get_call_names)
                 | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                 | 'Combine' >> beam.combiners.ToList()
                 | 'SortCallNames' >> beam.ParDo(sorted)
                 | beam.combiners.ToList())
Example 4
  def expand(self, inputs):
    if self._top_k is not None and self._top_k < 0:
      raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                       '{}.'.format(self._top_k))
    if self._frequency_threshold is not None and self._frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold for VocabularyImpl should be >= 0 or None, '
          'got {}.'.format(self._frequency_threshold))
    if self._coverage_top_k is not None and self._coverage_top_k < 0:
      raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                       'None, got {}.'.format(self._coverage_top_k))
    if (self._coverage_frequency_threshold is not None and
        self._coverage_frequency_threshold < 0):
      raise ValueError(
          'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
          'None, got {}.'.format(self._coverage_frequency_threshold))
    pcoll, = inputs

    result = (
        pcoll | 'ApplyFrequencyThresholdAndTopK' >> (
            _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                self._frequency_threshold, self._top_k, None)))

    if self._key_fn:
      coverage_counts = (
          pcoll | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
              _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                  self._coverage_frequency_threshold, self._coverage_top_k,
                  self._key_fn)))

      result = ((result, coverage_counts)
                | 'MergeStandardAndCoverageArms' >> beam.Flatten()
                | 'RemoveDuplicates' >> beam.RemoveDuplicates())

    return result
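The final step above (flatten the standard and coverage arms, then drop duplicates) is a general pattern that can be isolated from _ApplyFrequencyThresholdAndTopK entirely. A stripped-down sketch with made-up inputs:

import apache_beam as beam

with beam.Pipeline() as p:
    standard = p | 'CreateStandard' >> beam.Create(['a', 'b', 'c'])
    coverage = p | 'CreateCoverage' >> beam.Create(['b', 'c', 'd'])
    _ = ((standard, coverage)
         | 'MergeStandardAndCoverageArms' >> beam.Flatten()
         | 'RemoveDuplicates' >> beam.RemoveDuplicates()
         | 'Print' >> beam.Map(print))  # a, b, c, d (in some order)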
Example 5
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> textio.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> textio.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data.  Note that the data does in fact contain duplicate reviews
  # for reasons that are unclear.  This means that NUM_TRAIN_INSTANCES and
  # NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'RemoveDuplicates' >> beam.RemoveDuplicates()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_COLUMN: p[0], LABEL_COLUMN: p[1]})
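The Shuffle transform referenced above is not defined in this snippet. A sketch of one common way to implement it (and roughly what the original example is believed to do): key each element by a random number, group by that key, then discard the keys.

import random

import apache_beam as beam


@beam.ptransform_fn
def Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping on random keys (sketch)."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda value: (random.random(), value))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda key_values: key_values[1]))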
Example 6
 def expand(self, pcoll):
   return (pcoll
           | 'GetCallNames' >> beam.Map(self._get_call_names)
           | 'RemoveDuplicates' >> beam.RemoveDuplicates()
           | 'Combine' >> beam.combiners.ToList()
           | 'CombineUniqueCallNames'
           >> beam.ParDo(self._combine_unique_call_names)
           | beam.combiners.ToList())
Example 7
def execute():
    with beam.Pipeline('DirectRunner') as p:
        (p 
            | 'ReadFile'      >> ReadFromText(file_pattern='./data/SMSSpamCollection')
            | 'Deduplicate'   >> beam.RemoveDuplicates()
            | 'Parse'         >> beam.FlatMap(parse)
            | 'Write'         >> beam.io.WriteToText('./data/Output.jsonl') 
        )
Example 8
    def expand(self, pcoll):
        # Get a list of all call names across variants.
        call_names = (pcoll
                      | 'GetCallNames' >> beam.FlatMap(self._get_call_names)
                      | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                      | 'Combine' >> beam.combiners.ToList())

        # Extend each variant's list of calls to contain all samples.
        return (pcoll
                | 'DensifyVariants' >> beam.Map(
                    self._densify_variants,
                    all_call_names=beam.pvalue.AsSingleton(call_names)))
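The beam.pvalue.AsSingleton(call_names) argument above turns the single-element call_names PCollection into a plain Python value that is passed to every _densify_variants invocation as a side input. A stand-alone sketch of that mechanism, with made-up variants and call names standing in for the real ones:

import apache_beam as beam

with beam.Pipeline() as p:
    all_names = (p
                 | 'CreateNames' >> beam.Create(['NA001', 'NA002', 'NA001'])
                 | 'RemoveDuplicates' >> beam.RemoveDuplicates()
                 | 'Combine' >> beam.combiners.ToList())

    variants = p | 'CreateVariants' >> beam.Create([{'id': 'v1'}, {'id': 'v2'}])

    _ = (variants
         | 'DensifyVariants' >> beam.Map(
             lambda variant, names: dict(variant, all_call_names=sorted(names)),
             names=beam.pvalue.AsSingleton(all_names))
         | 'Print' >> beam.Map(print))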
Example 9
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Gathers slice key telemetry post slicing."""
    def increment_counter(element):  # pylint: disable=invalid-name
        num_distinct_slice_keys = beam.metrics.Metrics.counter(
            constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
        num_distinct_slice_keys.inc(element)
        return element

    return (slice_keys_and_values
            | 'ExtractSliceKeys' >> beam.Keys()
            | 'RemoveDuplicates' >> beam.RemoveDuplicates()
            | 'Size' >> beam.combiners.Count.Globally()
            | 'IncrementCounter' >> beam.Map(increment_counter))
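The num_distinct_slice_keys counter above only becomes readable once the pipeline has finished running. A sketch of the same counting pattern plus querying the metric afterwards through the standard Beam metrics API (the namespace string and input elements below are made up):

import apache_beam as beam
from apache_beam.metrics.metric import MetricsFilter


def increment_counter(element):
    # Same idea as above, with a made-up namespace instead of constants.METRICS_NAMESPACE.
    counter = beam.metrics.Metrics.counter('example_namespace',
                                           'num_distinct_slice_keys')
    counter.inc(element)
    return element


with beam.Pipeline() as p:
    _ = (p
         | 'Create' >> beam.Create([('slice_a', 1), ('slice_b', 2), ('slice_a', 3)])
         | 'ExtractSliceKeys' >> beam.Keys()
         | 'RemoveDuplicates' >> beam.RemoveDuplicates()
         | 'Size' >> beam.combiners.Count.Globally()
         | 'IncrementCounter' >> beam.Map(increment_counter))

result = p.result  # Set by the context manager once the run completes.
query = result.metrics().query(
    MetricsFilter().with_name('num_distinct_slice_keys'))
for counter in query['counters']:
    print(counter.key.metric.name, counter.committed)  # -> num_distinct_slice_keys 2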
Example 10
  def _combine_unique_call_names(self, call_names):
    # type: (List[Tuple[str]]) -> List[str]
    """Combines unique call names from all variants.

    If there is only one unique call name tuple in `call_names`, it means that
    the call names from all variants are the same. For this case, return this
    call name tuple directly. Otherwise, return the call names in sorted order.
    """
    if len(call_names) == 1:
      return list(call_names[0])
    return (call_names
            | 'FlattenCallNames' >> beam.Flatten()
            | 'RemoveDuplicates' >> beam.RemoveDuplicates()
            | 'Combine' >> beam.combiners.ToList()
            | 'SortCallNames' >> beam.ParDo(sorted))
Example 11
def write_total_distinct_keys_to_file(data, filename, key):
    """
    Counts how many distinct values of "key" are present in data. The key here
    is either sku or customer_id.

    Args
    ----
      data: pcollection.
      filename: where to write results to.
      key: the field whose distinct values are counted.
    """
    _ = (data
         | 'get {}'.format(key) >> beam.Map(lambda x: x[key])
         | 'group {}'.format(key) >> beam.RemoveDuplicates()
         | 'count {}'.format(key) >> beam.combiners.Count.Globally()
         | 'write {}'.format(key) >> beam.io.WriteToText(filename))
Example 12
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Remove duplicate lines.
        uniq = (lines | 'RemoveDuplicates' >> beam.RemoveDuplicates())

        uniq | WriteToText(known_args.output)
Example 13
 def expand(self, pcoll):
     """Computes number of unique values for string features."""
     # Count the number of appearances of each feature_value. The output is a
     # PCollection of DatasetFeatureStatistics protos.
     return (
         pcoll
         | 'Uniques_ConvertToFeatureNameValueTuples' >> beam.FlatMap(
             self._convert_to_feature_name_value_tuples)
         | 'Uniques_RemoveDuplicateFeatureNameValueTuples' >>
         beam.RemoveDuplicates()
         # Drop the values to only have the slice_key and feature_name with each
         # repeated the number of unique values times.
         | 'Uniques_DropValues' >> beam.Map(lambda entry:
                                            (entry[0], entry[1]))
         | 'Uniques_CountPerFeatureName' >>
         beam.combiners.Count().PerElement()
         | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
             _make_dataset_feature_stats_proto_with_single_feature,
             categorical_features=self._categorical_features))
Example 14
 def expand(self, pcoll):
   """Computes number of unique values for string features."""
   # Count the number of appearances of each feature_value. The output is a
   # PCollection of DatasetFeatureStatistics protos.
   return (
       pcoll
       | 'Uniques_FilterIrrelevantFeatures' >>
       (beam.FlatMap(self._filter_irrelevant_features).with_output_types(
           beam.typehints.KV[types.BeamFeatureName, np.ndarray]))
       | 'Uniques_FlattenToFeatureNameValueTuples' >>
       beam.FlatMap(lambda name_and_value_list:  # pylint: disable=g-long-lambda
                    [(name_and_value_list[0], value)
                     for value in name_and_value_list[1]])
       | 'Uniques_RemoveDuplicateFeatureNameValueTuples' >>
       beam.RemoveDuplicates()
       # Drop the values to only have the feature_name with each repeated the
       # number of unique values times.
       | 'Uniques_DropValues' >> beam.Keys()
       | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
       | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
           _make_dataset_feature_stats_proto_with_single_feature,
           categorical_features=self._categorical_features))
Example 15
    def expand(self, inputs):
        pcoll, = inputs
        if self._top_k is not None and self._top_k < 0:
            raise ValueError(
                'top_k for VocabularyImpl should be >= 0 or None, got '
                '{}.'.format(self._top_k))
        if self._frequency_threshold is not None and self._frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold for VocabularyImpl should be >= 0 or None, '
                'got {}.'.format(self._frequency_threshold))
        if self._coverage_top_k is not None and self._coverage_top_k < 0:
            raise ValueError(
                'coverage_top_k for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_top_k))
        if (self._coverage_frequency_threshold is not None
                and self._coverage_frequency_threshold < 0):
            raise ValueError(
                'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_frequency_threshold))

        # Create a PCollection of (count, element) pairs, then iterate over
        # it to create a single-element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and b'\n' not in string and b'\r' not in string

        if (self._vocab_ordering_type ==
                tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            flatten_map_fn = _flatten_to_key_and_means_accumulator_list
            combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
                self._use_adjusted_mutual_info, self._min_diff_from_avg)
        elif (self._vocab_ordering_type ==
              tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
            flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = _flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        raw_counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeightsLabels' >>
            beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        counts = (
            raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
                _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                    self._frequency_threshold, self._top_k, None)))

        if self._key_fn:
            coverage_counts = (
                raw_counts | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
                    _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                        self._coverage_frequency_threshold,
                        self._coverage_top_k, self._key_fn)))

            counts = ((counts, coverage_counts)
                      | 'MergeStandardAndCoverageArms' >> beam.Flatten()
                      | 'RemoveDuplicates' >> beam.RemoveDuplicates())

        return counts | 'WriteVocabFile' >> (
            _WriteVocabFile(  # pylint: disable=no-value-for-parameter
                self._base_temp_dir, self._vocab_filename,
                self._store_frequency))
Example 16
                       fields[3]))
                   | 'weather:reducing for max temperature' >> beam.CombinePerKey(max)
                   )
        #########################################
        # Writing to file system the dictionary #
        #########################################
        weather \
            | "weather:cleaning" >> beam.Map(lambda counter: '%s, %s' % (counter[0], counter[1])) \
            | 'weather:write' >> beam.io.textio.WriteToText('weather_dictionary')

        #####################################
        # Starting Pipeline for the flights #
        #####################################
        flights = (pipeline
                   | 'flights:read'  >> beam.io.ReadFromText('flights_large.csv')
                   | 'flights:removeduplicates' >> beam.RemoveDuplicates()
                   | 'flights:lines' >> beam.Map(lambda line: next(csv.reader([line])))
                   | 'flights:remove header' >> beam.Filter(lambda row: row[0] != 'Date')
                   | 'flights:fields' >> beam.Map(lambda fields: (
                    (str(fields[5]) + '-' + str(fields[0]) + '-' + str(hour_(fields[7]))),
                    fields[0],
                    fields[7],
                    fields[1],
                    fields[5],
                    georefe(fields[16], fields[15]),
                    str(fields[5]) + '-->' + str(fields[3]),
                    delaymarker(fields[8]),
                    fields[18].ljust(10, '0')))
                   | 'flights:adding temperature' >> beam.FlatMap(temp_dict, beam.pvalue.AsDict(weather))
                   | 'flights:compact' >> beam.Map(lambda data_temp: '{},{}'.format(','.join(data_temp[0]), data_temp[1]))
                   )
Example 17
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.RemoveDuplicates()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps the above-mentioned tag strings to iterables containing the
        # word total for that URI and its (word, count) pairs, respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our map function. The second argument to
        # the function (called total; note that the first argument is a tuple)
        # receives the value we passed as the extra argument to Map(). Additional side
        # inputs (and ordinary Python values, too) can be provided to MapFns and
        # DoFns in this way.
        def div_word_count_by_total(word_count, total):
            (word, count) = word_count
            return (word, float(count) / total)

        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                div_word_count_by_total, AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that multiplies the term frequency by the log of the
        # inverse document frequency.

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
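For reference, each element the transform emits has the shape (word, (uri, tf * log(1/df))), where tf is the word's count in a document divided by that document's total word count and df is the fraction of documents containing the word. Assuming the expand method above belongs to a PTransform subclass (called TfIdf below purely for illustration), it could be driven with an in-memory corpus along these lines:

import apache_beam as beam

with beam.Pipeline() as p:
    uri_to_content = p | 'CreateCorpus' >> beam.Create([
        ('doc1.txt', 'the quick brown fox'),
        ('doc2.txt', 'the lazy dog'),
    ])
    # TfIdf is a hypothetical wrapper class whose expand() is shown above.
    word_to_uri_and_tfidf = uri_to_content | 'TfIdf' >> TfIdf()
    _ = word_to_uri_and_tfidf | 'Print' >> beam.Map(print)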