Example #1
    def expand(self, pcoll):
        top_k = self._spec.top_k
        frequency_threshold = self._spec.frequency_threshold
        assert top_k is None or top_k >= 0
        assert frequency_threshold is None or frequency_threshold >= 0

        # Creates a PCollection of (count, element) pairs, then iterates over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).
        counts = (
            pcoll
            | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
            | 'CountWithinList' >>
            # Specification of with_output_types allows for combiner optimizations.
            (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
             with_output_types(KV[common.PRIMITIVE_TYPE, int]))
            | 'CountGlobally' >> beam.CombinePerKey(sum))

        counts = (
            counts
            | 'FilterProblematicStrings' >> beam.Filter(
                lambda kv: kv[0] and '\n' not in kv[0] and '\r' not in kv[0])
            | 'SwapElementsAndCounts' >> beam.KvSwap())

        # Filter is cheaper than TopK computation and the two commute, so
        # filter first.
        if frequency_threshold is not None:
            counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold
                       >> beam.Filter(lambda kv: kv[0] >= frequency_threshold))

        if top_k is not None:
            counts = (counts
                      | 'Top(%s)' % top_k >>
                      beam.transforms.combiners.Top.Largest(top_k)
                      | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

        # Performance optimization to obviate reading from finely sharded files
        # via AsIter. By forcing all data into a single group we end up reading
        # from a single file.
        #
        @beam.ptransform_fn
        def Reshard(pcoll):  # pylint: disable=invalid-name
            return (pcoll
                    | 'PairWithNone' >> beam.Map(lambda x: (None, x))
                    | 'GroupByNone' >> beam.GroupByKey()
                    | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

        counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

        # Using AsIter instead of AsList below in order to reduce max memory
        # usage (due to AsList caching).
        def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
            """Sort the vocabulary by frequency count."""
            del ignored
            counts = list(counts_iter)
            if not counts:
                counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
            counts.sort(reverse=True)  # Largest first.
            if store_frequency:
                # Returns ['count1 element1', ... ]
                return [
                    '{} {}'.format(count, element) for count, element in counts
                ]
            else:
                return [element for _, element in counts]

        vocabulary_file = os.path.join(self._temp_assets_dir,
                                       self._spec.vocab_filename)
        vocab_is_written = (pcoll.pipeline
                            | 'Prepare' >> beam.Create([None])
                            | 'OrderByDecreasingCounts' >> beam.FlatMap(
                                order_by_decreasing_counts,
                                counts_iter=beam.pvalue.AsIter(counts),
                                store_frequency=self._spec.store_frequency)
                            | 'WriteToFile' >> beam.io.WriteToText(
                                vocabulary_file, shard_name_template=''))
        # Return the vocabulary path.
        wait_for_vocabulary_transform = (
            pcoll.pipeline
            | 'CreatePath' >> beam.Create([[vocabulary_file]])
            # Ensure that the analysis returns only after the file is written.
            | 'WaitForVocabularyFile' >> beam.Map(
                lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
        return wait_for_vocabulary_transform
Example #2
    def expand(self, pcoll):
        return pcoll | "IsAuction" >> beam.Filter(is_auction)
        fields["DEP_AIRPORT_LON"] = airport_timezones[dep_airport_id][1]
        fields["DEP_AIRPORT_TZOFFSET"] = deptz
        fields["ARR_AIRPORT_LAT"] = airport_timezones[arr_airport_id][0]
        fields["ARR_AIRPORT_LON"] = airport_timezones[arr_airport_id][1]
        fields["ARR_AIRPORT_TZOFFSET"] = arrtz
        yield json.dumps(fields)
    except KeyError as e:
        logging.exception(" Ignoring " + line +
                          " because airport is not known")


if __name__ == '__main__':
    with beam.Pipeline('DirectRunner') as pipeline:
        airports = (pipeline
                    |
                    'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
                    | beam.Filter(lambda line: "United States" in line)
                    | 'airports:fields' >>
                    beam.Map(lambda line: next(csv.reader([line])))
                    | 'airports:tz' >>
                    beam.Map(lambda fields:
                             (fields[0], addtimezone(fields[21], fields[26]))))

        flights = (
            pipeline
            | 'flights:read' >> beam.io.ReadFromText('flights_sample.json')
            | 'flights:tzcorr' >> beam.FlatMap(tz_correct,
                                               beam.pvalue.AsDict(airports)))

        flights | beam.io.textio.WriteToText('all_flights')
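
addtimezone and tz_correct are defined elsewhere in the original program; the point of the snippet is that the airports collection is handed to beam.FlatMap as a dict-valued side input. A minimal sketch of that pattern with made-up data:

import apache_beam as beam

def tag_with_timezone(event, airport_tz):
    # Look the airport up in the side-input dict; skip unknown airports.
    airport, value = event
    if airport in airport_tz:
        yield (airport, value, airport_tz[airport])

with beam.Pipeline() as p:
    airports = p | 'MakeLookup' >> beam.Create([('ATL', -5.0), ('LHR', 0.0)])
    events = p | 'MakeEvents' >> beam.Create([('ATL', 'dep'), ('XXX', 'arr')])
    _ = (events
         | 'Join' >> beam.FlatMap(tag_with_timezone,
                                  beam.pvalue.AsDict(airports))
         | beam.Map(print))
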
Example #4
    def expand(self, inputs):
        pcoll, = inputs
        if self._top_k is not None and self._top_k < 0:
            raise ValueError(
                'top_k for VocabularyImpl should be >= 0 or None, got '
                '{}.'.format(self._top_k))
        if self._frequency_threshold is not None and self._frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold for VocabularyImpl should be >= 0 or None, '
                'got {}.'.format(self._frequency_threshold))

        # Create a PCollection of (count, element) pairs, then iterate over
        # this to create a single-element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and b'\n' not in string and b'\r' not in string

        if (self._vocab_ordering_type ==
                tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            flatten_map_fn = (
                _flatten_positive_label_weights_total_weights_and_counts)

            # count_and_means is a pcollection that contains a
            # _CountAndWeightsMeansAccumulator where:
            #   `weighted_mean` is the weighted mean of positive labels
            #       for all features.
            #   `count` is the count for all features.
            #   `weights_mean` is the mean of the weights for all features.
            count_and_means = (
                pcoll
                | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means)
                | 'ComputeCountAndWeightsMeansGlobally' >>
                beam.CombineGlobally(CountAndWeightsMeansCombineFn()))

            # CountAndWeightsMeansCombineFn returns a tuple of the form:
            # (feature,_CountAndWeightsMeansAccumulator) where:
            #   `feature` is a single string, which is the word in the vocabulary
            #       whose mutual information with the label is being computed.
            #   `weighted_mean` is the weighted mean of y positive given x.
            #   `count` is the count of weights for a feature.
            #   `weights_mean` is the mean of the weights for a feature.
            combine_transform = (
                'ComputeCountAndWeightsMeansPerUniqueWord' >>
                beam.CombinePerKey(CountAndWeightsMeansCombineFn())
                | 'CalculateMutualInformationPerUniqueWord' >> beam.Map(
                    _calculate_mutual_information,
                    global_accumulator=beam.pvalue.AsSingleton(
                        count_and_means),
                    use_adjusted_mutual_info=self._use_adjusted_mutual_info))
        elif (self._vocab_ordering_type ==
              tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
            flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = _flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        raw_counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeightsLabels' >>
            beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        counts = (
            raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
                _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                    self._frequency_threshold, self._top_k)))

        return counts | 'WriteVocabFile' >> (
            _WriteVocabFile(  # pylint: disable=no-value-for-parameter
                self._base_temp_dir, self._vocab_filename,
                self._store_frequency))
Example #5
    'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'
]


def make_string(array):
    return list(map(lambda tup: '{},{}'.format(tup[0], round(tup[1], 2)), array))


print('\n-----Starting Pipeline-----\n\n')

pipeline = beam.Pipeline('DirectRunner')

(pipeline
 | beam.io.ReadFromText('headless_battingext.csv')
 | beam.Map(lambda line: next(csv.reader([line])))
 | beam.Map(lambda d_array: dict(zip(header, d_array)))
 | beam.Map(lambda d_dict: (d_dict['playerID'], int(d_dict['HR'])))
 | beam.combiners.Count.PerKey()
 | beam.Filter(lambda d_tup: int(d_tup[1]) >= 20)
 | beam.combiners.ToList()
 | beam.Map(lambda tup: sorted(tup, key=lambda tup: tup[1], reverse=True))
 | beam.Map(make_string)
 | beam.Map(lambda t_array: ['playerID,SEASONS'] + t_array)
 | beam.FlatMap(lambda x: x)
 | beam.io.WriteToText('output', num_shards=1))

result = pipeline.run()
result.wait_until_finish()

print('\n\n-----Ending Pipeline-----\n')
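
The pipeline above depends on a local CSV and a header list defined elsewhere. The same Count.PerKey / Filter / ToList / sort shape, as a self-contained sketch on in-memory pairs (the sample rows and the threshold of 2 are assumptions):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('ruthba01', 54), ('ruthba01', 59), ('aaronha01', 44)])
         | beam.combiners.Count.PerKey()        # (playerID, number of rows)
         | beam.Filter(lambda kv: kv[1] >= 2)   # keep players with >= 2 rows
         | beam.combiners.ToList()              # single-element list of pairs
         | beam.FlatMap(lambda pairs: sorted(pairs, key=lambda kv: kv[1],
                                             reverse=True))
         | beam.Map(print))
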
Example #6
def pipeline(root):
    """Beam pipeline.

  Args:
    root: the root of the pipeline.
  """
    stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Create a collection of conformers with duplicate information
    equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
    equivalent_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equivalent_files)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge by bond_topology_id
    merged_results = (
        (stage1_matched_conformers, stage2_matched_conformers,
         equivalent_conformers)
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged_conformers = merged_results['conformers']

    # Write out the merge conflicts
    _ = (merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
         | 'ConflictsCSVFormat' >> beam.Map(csv_format)
         | 'ConflictsReshuffle' >> beam.Reshuffle()
         | 'WriteConflictsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_conflicts',
             header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
             num_shards=1,
             file_name_suffix='.csv'))

    # Get the bond length distributions
    unused_bond_length_dists_pcoll = (
        merged_conformers
        | 'FilterForBondLengths' >> beam.Filter(
            smu_utils_lib.should_include_in_standard)
        | 'ExtractBondLengths' >> beam.FlatMap(
            extract_bond_lengths, dist_sig_digits=3, unbonded_max=2.0)
        | 'CountBondLengths' >> beam.combiners.Count.PerElement()
        | 'ToListBondLengths' >> beam.combiners.ToList()
        | 'WriteBondLengths' >> beam.ParDo(
            write_bond_lengths,
            filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

    # Various per conformer processing
    update_results = (
        merged_conformers
        | 'UpdateConformers' >> beam.ParDo(UpdateConformerFn()).with_outputs(
            UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers'))
    updated_conformers = update_results['conformers']

    # Output SMILES mismatches
    _ = (
        update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header=
            'conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information
    final_conformers = (
        updated_conformers
        | 'KeyedForDuplicates' >>
        beam.FlatMap(generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts write to a file
    _ = (final_conformers
         | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
         | 'CountStats' >> beam.combiners.Count.PerElement()
         | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
         | 'WriteStatsCSV' >> beam.io.WriteToText(
             FLAGS.output_stem + '_stats',
             header='primary_key,secondary_key,count',
             num_shards=1,
             file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_bt_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >>
        beam.FlatMap(bond_topology_summaries_from_csv))
    real_bt_summaries = (
        final_conformers
        |
        'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = ((bare_bt_summaries, real_bt_summaries)
         | 'FlattenAllBTSummaries' >> beam.Flatten()
         | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset
    complete_conformers = (final_conformers
                           |
                           'MakeComplete' >> beam.Map(make_complete_conformer))

    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in TFRecord.
    for id_str, collection in [['complete', complete_conformers],
                               ['standard', standard_conformers]]:
        _ = (collection
             | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
             |
             ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                 f'{FLAGS.output_stem}_{id_str}_tfrecord',
                 coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                 num_shards=FLAGS.output_shards))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing out
    # the JSON for the complete conformers. So we just hard code a tripling of the
    # shards to get more parallelism.
    for id_str, collection, num_shards in [[
            'complete', complete_conformers, FLAGS.output_shards * 3
    ], ['standard', standard_conformers, FLAGS.output_shards]]:
        _ = (collection
             | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
             | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
             | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                 f'{FLAGS.output_stem}_{id_str}_json',
                 num_shards=num_shards,
                 file_name_suffix='.json.gz'))
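
MergeConformersFn and UpdateConformerFn above are project-specific DoFns that emit a tagged side output which the pipeline then indexes out of the result. A minimal sketch of that multi-output ParDo pattern, with a made-up DoFn and tag (assumptions, not the SMU code):

import apache_beam as beam

class SplitParityFn(beam.DoFn):
    OUTPUT_TAG_ODD = 'odd'

    def process(self, element):
        if element % 2 == 0:
            yield element  # goes to the main output
        else:
            yield beam.pvalue.TaggedOutput(self.OUTPUT_TAG_ODD, element)

with beam.Pipeline() as p:
    results = (p
               | beam.Create([1, 2, 3, 4])
               | beam.ParDo(SplitParityFn()).with_outputs(
                   SplitParityFn.OUTPUT_TAG_ODD, main='evens'))
    _ = results['evens'] | 'PrintEvens' >> beam.Map(print)
    _ = results[SplitParityFn.OUTPUT_TAG_ODD] | 'PrintOdds' >> beam.Map(print)
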
Example #7

# Setup options for pipe
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
custom_options = options.view_as(CustomPipelineOptions)
google_cloud_options.project = 'freightwaves-engineering-prod'
google_cloud_options.job_name = f"clean-intra-bk-{datetime.now().strftime('%Y%m%d%H%M%S')}"
google_cloud_options.staging_location = 'gs://fw-etl-tmp-prod/'
google_cloud_options.temp_location = 'gs://fw-etl-tmp-prod/'
options.view_as(StandardOptions).runner = 'DataFlowRunner'
#options.view_as(StandardOptions).runner = 'DirectRunner'

# Create pipeline object
p = beam.Pipeline(options=options)

# Define the pipeline steps
out = (
    p | "Input" >> beam.io.ReadFromText(
        f"gs://fw-etl-raw-prod/inttra/{custom_options.file_to_clean}")
    | "Remove Invalid Imos" >> beam.Filter(is_valid_imo)
    | "Remove Empty Strings" >> beam.Map(replace_empty_str)
    | "Output" >> beam.io.WriteToText(
        f"gs://fw-etl-load-prod/inttra/{custom_options.file_to_clean}",
        shard_name_template='')
    #| beam.Map(print)
)

# Run the pipeline
result = p.run()
Example #8
    def test_row_coder_in_pipeline(self):
        with TestPipeline() as p:
            res = (p
                   | beam.Create(self.PEOPLE)
                   | beam.Filter(lambda person: person.name == "Jon Snow"))
            assert_that(res, equal_to([self.JON_SNOW]))
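
The PEOPLE and JON_SNOW fixtures live elsewhere in the test class. A minimal, self-contained sketch of the same schema-aware filter, assuming a NamedTuple row type registered with RowCoder:

import typing
import apache_beam as beam

class Person(typing.NamedTuple):
    name: str
    age: int

# Encode Person elements with the schema-aware RowCoder.
beam.coders.registry.register_coder(Person, beam.coders.RowCoder)

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([Person("Jon Snow", 23), Person("Arya Stark", 18)])
         | beam.Filter(lambda person: person.name == "Jon Snow")
         | beam.Map(print))
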
Example #9
            element['company_name'] = 'default-name-' + element['company_id']
        # async return (yield)
        yield (element['company_name'] + '_' + element['company_id'])


main = (
    p
    |
    'data source ' >> beam.io.ReadFromMongoDB(uri='mongodb://localhost:27017',
                                              db='conekta',
                                              coll='data_stagin',
                                              projection={
                                                  'company_name': 1,
                                                  'company_id': 1
                                              }))

prov = (main
        | 'filter by company identifier' >>
        beam.Filter(lambda row: len(row['company_id']) > 24)
        | 'prepare data' >> beam.ParDo(PrepareDataProv())
        | 'group by provider' >> beam.combiners.Count.PerElement()
        | 'split unique fields' >> beam.Map(lambda row: row[0].split('_'))
        | 'build the record' >> beam.Map(lambda row: {
            'id': row[1],
            'company_name': row[0]
        })
        | 'print' >> beam.Map(imprime)
        | 'Writing to DB table' >> relational_db.Write(
            source_config=source_config, table_config=table_config))

p.run().wait_until_finish()
Example #10
def transform_data(train_data_file, eval_data_file,
                   transformed_train_data_base, transformed_eval_data_base,
                   transformed_metadata_dir):
    """Transform the cleaned data and write out as a TFRecord of Example protos.

  Read in the cleaned data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 value indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    eval_data_file: File containing evaluation data
    transformed_train_data_base: Base filename for transformed training data
        shards
    transformed_eval_data_base: Base filename for cleaned evaluation data
        shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written.
  """
    raw_data_schema = {
        key: dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.string),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key: dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        dataset_schema.LogicalColumnSchema(dataset_schema.Domain(tf.string),
                                           dataset_schema.LogicalShape([])),
        dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # Update outputs of both kinds to convert from shape (batch,), i.e. a batch
        # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1.
        # This is needed so the output can be easily wrapped in `FeatureColumn`s.
        for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
            outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1),
                                   outputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as p:
        # Create a coder to read the census data with the schema.  To do this we
        # need to list all columns in order since the schema doesn't specify the
        # order of columns in the csv.
        ordered_columns = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
            'label'
        ]
        converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

        # Read in raw data and convert using CSV converter.  Note that we apply some
        # Beam transformations here, which will not be encoded in the TF graph since
        # we don't do them from within tf.Transform's methods (AnalyzeDataset,
        # TransformDataset etc.).  These transformations are just to get data into
        # a format that the CSV converter can read, in particular removing empty
        # lines and removing spaces after commas.
        raw_data = (p
                    | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                    | 'FilterTrainData' >> beam.Filter(lambda line: line)
                    | 'FixCommasTrainData' >>
                    beam.Map(lambda line: line.replace(', ', ','))
                    | 'DecodeTrainData' >> beam.Map(converter.decode))

        # Combine data and schema into a dataset tuple.  Note that we already used
        # the schema to read the CSV data, but we also need it to interpret
        # raw_data.
        raw_dataset = (raw_data, raw_data_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn, output_dir=os.path.join(tempfile.mkdtemp())))
        transformed_data, transformed_metadata = transformed_dataset

        _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            transformed_train_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        # Now apply transform function to eval data.  In this case we also remove
        # the header line from the CSV file and the trailing period at the end of
        # each line.
        raw_eval_data = (
            p
            | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
            | 'FilterEvalData' >>
            beam.Filter(lambda line: line and line != '|1x3 Cross validator')
            | 'FixCommasEvalData' >>
            beam.Map(lambda line: line.replace(', ', ','))
            |
            'RemoveTrailingPeriodsEvalData' >> beam.Map(lambda line: line[:-1])
            | 'DecodeEvalData' >> beam.Map(converter.decode))

        raw_eval_dataset = (raw_eval_data, raw_data_metadata)

        transformed_eval_dataset = ((raw_eval_dataset, transform_fn)
                                    | beam_impl.TransformDataset())
        # Don't need transformed data schema, it's the same as before.
        transformed_eval_data, _ = transformed_eval_dataset

        _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
            transformed_eval_data_base,
            coder=example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema))

        _ = (transformed_metadata
             | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                 transformed_metadata_dir, pipeline=p))
Example #11
    def process_hub(self,
                    hub_name,
                    pk,
                    bkey_list,
                    field_list,
                    foreign_keys=None):
        ext_field_list = \
            [CONST_BK_FIELD, CONST_SOURCE_FIELD, CONST_LOADDTM_FIELD, CONST_STATUS_FIELD] + \
            field_list

        with beam.Pipeline(options=self.pipeline_options) as p:
            # First set up a stream for the data
            data = read_file(
                p, hub_name,
                self.get_psa_location('public.{0}'.format(hub_name)) + '*', pk)

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(
                    p, '{0}index'.format(hub_name),
                    self.get_source_index('hub_{0}*'.format(hub_name)), pk)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            # Generate business keys, checksum, dv_source, load_dtm
            preproc_data = data | 'preprocess_' + hub_name >> \
                beam.Map(add_hub_dv_details, bkey_list, self.source)

            if foreign_keys:
                preproc_data = self.resolve_foreign_keys(
                    hub_name=hub_name,
                    pk=pk,
                    data=preproc_data,
                    foreign_keys=foreign_keys,
                    pipeline=p)

            # Group with index to be able to identify new, updated, deleted
            merge = ({
                'data': preproc_data,
                'index': index
            }) | 'grouped_by_' + pk >> beam.CoGroupByKey()

            # Extract the data out of the records (still has index/data dict in there)
            extract = merge \
                | 'filter_' + hub_name >> beam.Filter(filter_data_rows) \
                | 'extract_' + hub_name >> beam.Map(extract_data)

            # Write them out to disk in loading area
            extract | 'Write_' + hub_name >> beam.io.Write(
                CsvFileSink(self.get_loading_location(
                    'public.{0}'.format(hub_name)),
                            header=ext_field_list))

            # Update the index
            updated_index = merge | 'updated_index_' + hub_name >> beam.Map(
                hub_select_index_or_data, pk)
            updated_index | 'Write_index_' + hub_name >> beam.io.Write(
                CsvFileSink(self.get_target_index('hub_{0}'.format(hub_name)),
                            header=[CONST_BK_FIELD, CONST_CKSUM_FIELD, pk]))
Example #12
                   )
        #########################################
        # Writing to file system the dictionary #
        #########################################
        weather \
            | "weather:cleaning" >> beam.Map(lambda counter: '%s, %s' % (counter[0], counter[1])) \
            | 'weather:write' >> beam.io.textio.WriteToText('weather_dictionary')

        #####################################
        # Starting Pipeline for the flights #
        #####################################
        flights = (pipeline
                   | 'flights:read'  >> beam.io.ReadFromText('flights_large.csv')
                   | 'flights:removeduplicates' >> beam.RemoveDuplicates()
                   | 'flights:lines' >> beam.Map(lambda line: next(csv.reader([line])))
                   | 'flight:remove heads' >> beam.Filter(lambda row: row[0] != 'Date')
                   | 'flights:fields' >> beam.Map(lambda fields: (
                    (str(fields[5]) + '-' + str(fields[0]) + '-' + str(hour_(fields[7]))),
                    fields[0],
                    fields[7],
                    fields[1],
                    fields[5],
                    georefe(fields[16], fields[15]),
                    str(fields[5]) + '-->' + str(fields[3]),
                    delaymarker(fields[8]),
                    fields[18].ljust(10, '0')))
                   | 'flights:adding temperature' >> beam.FlatMap(temp_dict, beam.pvalue.AsDict(weather))
                   | 'flights:compact' >> beam.Map(
                       lambda data_temp: '{},{}'.format(','.join(data_temp[0]), data_temp[1]))
                   )

        flights | 'flights:write' >> beam.io.textio.WriteToText('flights_full_details')
Example #13
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the story migration.

        Returns:
            PCollection. A PCollection of results from the story migration.
        """

        unmigrated_story_models = (
            self.pipeline
            | 'Get all non-deleted story models' >>
            (ndb_io.GetModels(story_models.StoryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda story_model: story_model.id))
        story_summary_models = (
            self.pipeline
            | 'Get all non-deleted story summary models' >>
            (ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story summary keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda story_summary_model: story_summary_model.id))
        topics = (
            self.pipeline
            | 'Get all non-deleted topic models' >>
            (ndb_io.GetModels(topic_models.TopicModel.get_all()))
            | 'Transform model into domain object' >> beam.Map(
                topic_fetchers.get_topic_from_model)
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add topic keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda topic: topic.id))
        topic_id_to_topic = beam.pvalue.AsDict(topics)

        migrated_story_results = (
            unmigrated_story_models
            | 'Transform and migrate model' >> beam.MapTuple(
                self._migrate_story, topic_id_to_topic=topic_id_to_topic))
        migrated_stories = (
            migrated_story_results
            | 'Filter oks' >>
            beam.Filter(lambda result_item: result_item.is_ok())
            |
            'Unwrap ok' >> beam.Map(lambda result_item: result_item.unwrap()))
        migrated_story_job_run_results = (
            migrated_story_results
            | 'Generate results for migration' >>
            (job_result_transforms.ResultsToJobRunResults('STORY PROCESSED')))

        story_changes = (unmigrated_story_models
                         | 'Generate story changes' >> beam.FlatMapTuple(
                             self._generate_story_changes))

        story_objects_list = (
            {
                'story_model': unmigrated_story_models,
                'story_summary_model': story_summary_models,
                'story': migrated_stories,
                'story_change': story_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated stories' >> beam.Filter(
                lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
            | 'Reorganize the story objects' >> beam.Map(
                lambda objects: {
                    'story_model': objects['story_model'][0],
                    'story_summary_model': objects['story_summary_model'][0],
                    'story': objects['story'][0],
                    'story_change': objects['story_change'][0]
                }))

        story_objects_list_job_run_results = (
            story_objects_list
            | 'Transform story objects into job run results' >>
            (job_result_transforms.CountObjectsToJobRunResult('STORY MIGRATED')
             ))

        cache_deletion_job_run_results = (
            story_objects_list
            | 'Delete story from cache' >>
            beam.Map(lambda story_objects: self._delete_story_from_cache(
                story_objects['story']))
            | 'Generate results for cache deletion' >>
            (job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

        story_models_to_put = (
            story_objects_list
            | 'Generate story models to put' >>
            beam.FlatMap(lambda story_objects: self._update_story(
                story_objects['story_model'],
                story_objects['story'],
                story_objects['story_change'],
            )))

        story_summary_models_to_put = (
            story_objects_list
            | 'Generate story summary models to put' >>
            beam.Map(lambda story_objects: self._update_story_summary(
                story_objects['story'], story_objects['story_summary_model'])))

        unused_put_results = (
            (story_models_to_put, story_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (
            (cache_deletion_job_run_results, migrated_story_job_run_results,
             story_objects_list_job_run_results)
            | beam.Flatten())
Example #14
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating SkillOpportunityModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating SkillOpportunityModel.
        """
        question_skill_link_models = (
            self.pipeline
            | 'Get all non-deleted QuestionSkillLinkModels' >>
            (ndb_io.GetModels(
                question_models.QuestionSkillLinkModel.get_all(
                    include_deleted=False)))
            | 'Group QuestionSkillLinkModels by skill ID' >>
            beam.GroupBy(lambda n: n.skill_id))

        skills = (
            self.pipeline
            | 'Get all non-deleted SkillModels' >> (ndb_io.GetModels(
                skill_models.SkillModel.get_all(include_deleted=False)))
            | 'Get skill object from model' >> beam.Map(
                skill_fetchers.get_skill_from_model)
            |
            'Group skill objects by skill ID' >> beam.GroupBy(lambda m: m.id))

        skills_with_question_counts = (
            {
                'skill': skills,
                'question_skill_links': question_skill_link_models
            }
            | 'Merge by skill ID' >> beam.CoGroupByKey()
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove skill IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # We are using itertools.chain.from_iterable to flatten
            # question_skill_links from a 2D list into a 1D list.
            | 'Flatten skill and question_skill_links' >> beam.Map(
                lambda object: {
                    'skill':
                    list(object['skill'][0])[0],
                    'question_skill_links':
                    list(
                        itertools.chain.from_iterable(object[
                            'question_skill_links']))
                }))

        opportunities_results = (
            skills_with_question_counts
            | beam.Map(lambda object: self._create_skill_opportunity_model(
                object['skill'], object['question_skill_links'])))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with OK status' >>
            beam.Filter(lambda result: result.is_ok())
            | 'Fetch the models to be put' >>
            beam.Map(lambda result: result.unwrap())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Transform Results to JobRunResults' >>
                (job_result_transforms.ResultsToJobRunResults()))
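
Both jobs above join several keyed collections with beam.CoGroupByKey and then unpack the per-key lists. A minimal sketch of that join shape with made-up keys and values:

import apache_beam as beam

with beam.Pipeline() as p:
    models = p | 'Models' >> beam.Create([('id1', 'model1'), ('id2', 'model2')])
    summaries = p | 'Summaries' >> beam.Create([('id1', 'summary1')])
    _ = ({'model': models, 'summary': summaries}
         | 'Merge by ID' >> beam.CoGroupByKey()
         | 'Drop IDs' >> beam.Values()
         # Keep only keys that had both a model and a summary.
         | beam.Filter(lambda d: len(d['model']) > 0 and len(d['summary']) > 0)
         | beam.Map(lambda d: {'model': d['model'][0],
                               'summary': d['summary'][0]})
         | beam.Map(print))
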
Example #15
    def test_bad_types(self):
        p = TestPipeline()
        evens = None  # pylint: disable=unused-variable

        # [START type_hints_missing_define_numbers]
        numbers = p | beam.Create(['1', '2', '3'])
        # [END type_hints_missing_define_numbers]

        # Consider the following code.
        # pylint: disable=expression-not-assigned
        # pylint: disable=unused-variable
        # [START type_hints_missing_apply]
        evens = numbers | beam.Filter(lambda x: x % 2 == 0)
        # [END type_hints_missing_apply]

        # Now suppose numbers was defined as [snippet above].
        # When running this pipeline, you'd get a runtime error,
        # possibly on a remote machine, possibly very late.

        with self.assertRaises(TypeError):
            p.run()

        # To catch this early, we can assert what types we expect.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_takes]
            p.options.view_as(TypeOptions).pipeline_type_check = True
            evens = numbers | beam.Filter(
                lambda x: x % 2 == 0).with_input_types(int)
            # [END type_hints_takes]

        # Type hints can be declared on DoFns and callables as well, rather
        # than where they're used, to be more self contained.
        with self.assertRaises(typehints.TypeCheckError):
            # [START type_hints_do_fn]
            @beam.typehints.with_input_types(int)
            class FilterEvensDoFn(beam.DoFn):
                def process(self, element):
                    if element % 2 == 0:
                        yield element

            evens = numbers | beam.ParDo(FilterEvensDoFn())
            # [END type_hints_do_fn]

        words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
        # One can assert outputs and apply them to transforms as well.
        # Helps document the contract and checks it at pipeline construction time.
        # [START type_hints_transform]
        T = beam.typehints.TypeVariable('T')

        @beam.typehints.with_input_types(T)
        @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
        class MyTransform(beam.PTransform):
            def expand(self, pcoll):
                return pcoll | beam.Map(lambda x: (len(x), x))

        words_with_lens = words | MyTransform()
        # [END type_hints_transform]

        # pylint: disable=expression-not-assigned
        with self.assertRaises(typehints.TypeCheckError):
            words_with_lens | beam.Map(lambda x: x).with_input_types(
                beam.typehints.Tuple[int, int])
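
The test deliberately leaves the mistyped pipeline in place to show the late failure. As a small sketch of one way to satisfy the declared int input type (the conversion step is an assumption, not part of the original test):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['1', '2', '3'])
         # Convert the strings before the int-typed filter so the
         # declared input type is actually met.
         | beam.Map(int)
         | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
         | beam.Map(print))
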
Example #16
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic',
                      type=str,
                      help='Pub/Sub topic to read from')
  parser.add_argument('--subscription',
                      type=str,
                      help='Pub/Sub subscription to read from')
  parser.add_argument('--dataset',
                      type=str,
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name',
                      type=str,
                      default='game_stats',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--fixed_window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration for user '
                           'analysis, in minutes')
  parser.add_argument('--session_gap',
                      type=int,
                      default=5,
                      help='Numeric value of gap between user sessions, '
                           'in minutes')
  parser.add_argument('--user_activity_window_duration',
                      type=int,
                      default=30,
                      help='Numeric value of fixed window for finding mean of '
                           'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which
    # are extracted from the data elements, and parse the data.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          topic=args.topic)
    raw_events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream
    user_events = (
        raw_events
        | 'ExtractUserScores' >> beam.Map(
            lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        # These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used as
        # a side input in calculating the team score sums, below
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit cumulative
    # updates for late data. Uses the side input derived above --the set of
    # suspected robots-- to filter out scores from those users from the sum.
    # Write the results to BigQuery.
    (raw_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoFixedWindows' >> beam.WindowInto(
         beam.window.FixedWindows(fixed_window_duration))

     # Filter out the detected spammer users, using the side input derived above
     | 'FilterOutSpammers' >> beam.Filter(
         lambda elem, spammers: elem['user'] not in spammers,
         spammers_view)
     # Extract and sum teamname/score pairs from the event data.
     | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
     # [END filter_and_calc]
     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name + '_teams', args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         }, options.view_as(GoogleCloudOptions).project))

    # [START session_calc]
    # Detect user sessions-- that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track the changing user
    # engagement as their set of game changes.
    (user_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoSessions' >> beam.WindowInto(
         beam.window.Sessions(session_gap),
         timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

     # For this use, we care only about the existence of the session, not any
     # particular information aggregated over it, so we can just group by key
     # and assign a "dummy value" of None.
     | beam.CombinePerKey(lambda _: None)

     # Get the duration of the session
     | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
     # [END session_calc]

     # [START rewindow]
     # Re-window to process groups of session sums according to when the
     # sessions complete
     | 'WindowToExtractSessionMean' >> beam.WindowInto(
         beam.window.FixedWindows(user_activity_window_duration))

     # Find the mean session duration in each window
     | beam.CombineGlobally(beam.combiners.MeanCombineFn()).without_defaults()
     | 'FormatAvgSessionLength' >> beam.Map(
         lambda elem: {'mean_duration': float(elem)})
     | 'WriteAvgSessionLength' >> WriteToBigQuery(
         args.table_name + '_sessions', args.dataset, {
             'mean_duration': 'FLOAT',
         }, options.view_as(GoogleCloudOptions).project))
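
ParseGameEventFn, CalculateSpammyUsers, WriteToBigQuery and the other helpers above are not shown. The heart of the [START filter_and_calc] block is filtering against a dict-shaped singleton side input; a minimal sketch of just that piece, with made-up scores and a made-up spam rule:

import apache_beam as beam

with beam.Pipeline() as p:
    scores = p | beam.Create([('alice', 10), ('bot1', 9000), ('bob', 25)])

    # Build a {user: total} dict view; here "spammer" just means total > 100.
    spammers_view = (scores
                     | 'SumPerUser' >> beam.CombinePerKey(sum)
                     | 'KeepSuspicious' >> beam.Filter(lambda kv: kv[1] > 100)
                     | 'AsDictView' >> beam.CombineGlobally(
                         beam.combiners.ToDictCombineFn()).as_singleton_view())

    _ = (scores
         | 'DropSpammers' >> beam.Filter(
             lambda elem, spammers: elem[0] not in spammers, spammers_view)
         | beam.Map(print))
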
Example #17
def run(args, pipeline_args):
    # INSERT YOUR CODE HERE
    key_field_index = 0
    if args.director_copies_sold or args.director_dollars_sold:
        key_field_index = 5

    def SplitLine(line):
        # split to extract each field in the .csv file
        line_modified = line.replace(', ', '_')
        return line_modified.split(',')

    def PairWithCopies(fields):
        id = fields[key_field_index]
        purchase_method = fields[12]
        amount = fields[11]
        return (id, (amount if purchase_method == 'buy' else 0,
                     amount if purchase_method == 'rent' else 0))

    def PairWithRevenue(fields):
        id = fields[key_field_index]
        revenue = int(fields[9]) if fields[12] == 'buy' else int(fields[10])
        return (id, revenue)

    def PairWithTransaction(fields):
        movie_id = fields[0]
        user_name = fields[13]
        date_time = fields[14]
        return ((user_name, date_time), movie_id)

    def Sum(group):
        from operator import add
        buy_tot = 0
        rent_tot = 0
        (id, records) = group
        for record in records:
            (buy_amt, rent_amt) = record
            buy_tot = buy_tot + int(buy_amt)
            rent_tot = rent_tot + int(rent_amt)
        return (id, buy_tot, rent_tot)

    def Permute(transaction):
        ((user_name, date_time), movie_list) = transaction
        li = []
        position = 0
        for movie_id in movie_list:
            if len(movie_list) > 1:
                for movie_id_other in movie_list:
                    if (movie_id_other != movie_id):
                        li.append(((movie_id, movie_id_other), 1))
            else:
                li.append(((movie_id, None), 0))
        return li

    def ChangeKey(movie_combination):
        #print(movie_combination)
        (movie_id, movie_id_other), count = movie_combination
        return (movie_id, (movie_id_other, count))

    def Sort(movie_and_list):
        from operator import itemgetter
        (movie_id, purchased_together_tuples) = movie_and_list
        highest_list = []
        sorted_list = sorted(purchased_together_tuples,
                             key=itemgetter(1),
                             reverse=True)
        if sorted_list[0][1] == 0:
            highest_list.append(('None', str(0)))
        else:
            i = 0
            while i < len(
                    sorted_list) and sorted_list[i][1] == sorted_list[0][1]:
                highest_list.append(sorted_list[i])
                i = i + 1
        return (movie_id, highest_list)

    def FormatMovieNumbers(result):
        (id, buy_tot, rent_tot) = result
        return '%s\t%s\t%s' % (id, str(buy_tot), str(rent_tot))

    def FormatMovieRevenue(result):
        (id, revenue_tot) = result
        return '%s\t%s' % (id, str(revenue_tot))

    def FormatHighestList(result):
        movie_id, highest_list = result
        li = []
        #print(movie_id)
        li.append(str(movie_id))
        for highest_movie in highest_list:
            #print(highest_movie)
            li.append(str(highest_movie[0]))
        frequency = highest_list[0][1]
        li.append(str(frequency))
        result_formatted = '\t'.join(li)
        return result_formatted

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as pipeline:
        lines = pipeline | beam.io.ReadFromText(args.input)
        fields = (lines | 'Split' >> beam.Map(SplitLine))
        filtered_fields = (
            fields
            | 'Filter' >>
            beam.Filter(lambda field: args.genre is None and field is not None
                        or args.genre is not None and field[4] == args.genre))

        if args.copies_sold or args.director_copies_sold:
            movie_numbers = (filtered_fields
                             | 'PairWithCopies' >> beam.Map(PairWithCopies)
                             | 'GroupAndSum' >> beam.GroupByKey()
                             | 'MergeAmount' >> beam.Map(Sum)
                             |
                             'FormatRenvenue' >> beam.Map(FormatMovieNumbers))
            movie_numbers | 'WriteMovieNumbers' >> beam.io.WriteToText(
                args.output)

        if args.dollars_sold or args.director_dollars_sold:
            movie_revenue = (filtered_fields
                             | 'PairWithRevenue' >> beam.Map(PairWithRevenue)
                             | 'CombineRevenue' >> beam.CombinePerKey(sum)
                             | 'FormatRevenue' >> beam.Map(FormatMovieRevenue))
            movie_revenue | 'WriteMovieRevenue' >> beam.io.WriteToText(
                args.output)

        if args.purchased_together:
            highest_list = (
                filtered_fields
                | 'PairWithTransaction' >> beam.Map(
                    PairWithTransaction)  # (user_name, date_time), movie_id
                | 'GroupByTransaction' >> beam.GroupByKey()
                | 'Permute' >> beam.FlatMap(
                    Permute)  # (movie_id, movie_id_other), 1
                | 'CombineMovieCombo' >> beam.CombinePerKey(sum)
                | 'ChangeKey' >> beam.Map(
                    ChangeKey)  # movie_id, (movie_id_other, count)
                | 'GroupByMovie' >>
                beam.GroupByKey()  # movie_id, [(movie_id_other, count), ... ]
                | 'SortList' >> beam.Map(Sort)
                | 'FormatHighestList' >> beam.Map(FormatHighestList))
            highest_list | 'WriteHighestList' >> beam.io.WriteToText(
                args.output)

    pass
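
The purchased_together branch is the most involved part: transactions are grouped by (user, date_time), every ordered pair of distinct movies within a transaction is emitted with a count of 1, and the pair counts are summed. A tiny worked sketch of that core on in-memory records (the data is made up; the sort and format steps are omitted):

import apache_beam as beam

transactions = [
    (('alice', '2020-01-01 10:00'), 'm1'),
    (('alice', '2020-01-01 10:00'), 'm2'),
    (('bob', '2020-01-02 09:30'), 'm1'),
]

def permute(kv):
    _, movie_ids = kv
    movie_ids = list(movie_ids)
    if len(movie_ids) > 1:
        for a in movie_ids:
            for b in movie_ids:
                if a != b:
                    yield ((a, b), 1)
    else:
        yield ((movie_ids[0], None), 0)

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(transactions)
         | 'GroupByTransaction' >> beam.GroupByKey()
         | 'Permute' >> beam.FlatMap(permute)
         | 'CountPairs' >> beam.CombinePerKey(sum)
         | beam.Map(print))
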
Example #18
pipeline = beam.Pipeline(argv=argv)

side = (
    pipeline
    | 'read roster' >> beam.io.ReadFromText('gs://justinminsk_bucket/retrosheet/roster')
    | beam.Map(lambda line: next(csv.reader([line])))
    | beam.Map(lambda array: (array[0], array[2] + ' ' + array[1]))
)

(
    pipeline
    | beam.io.ReadFromText('gs://justinminsk_bucket/retrosheet/events')
    | beam.Map(lambda line: next(csv.reader([line])))
    | beam.Filter(lambda tuple: int(tuple[2]) == 23)
    | beam.Map(lambda tuple: dict(zip(header, tuple)))
    | beam.Map(lambda dict: (dict['playerID'], int(dict['HRTotal'])))
    | beam.combiners.Count.PerKey()
    | beam.Map(lambda tuple, d: (tuple[0], tuple[1], d[tuple[0].split(' ')[0]]),
               beam.pvalue.AsDict(side))
    | beam.combiners.ToList()
    | beam.Map(make_string)
    | beam.Map(lambda array: ['playerID,Name,HRTotal'] + array)
    | beam.FlatMap(lambda x: x)
    | beam.io.WriteToText('gs://justinminsk_bucket/retrosheet/Minsk', num_shards=1)
)

result = pipeline.run()
result.wait_until_finish()
Example #19
    return name, 0


def return_tuple(element):
    thisTuple = element.split(',')
    return (thisTuple[0], thisTuple[1:])


p1 = beam.Pipeline()

card_defaulter = (
    p1
    | beam.io.ReadFromText('cards.txt', skip_header_lines=1)
    | beam.Map(default_score)
    | beam.CombinePerKey(sum)
    | beam.Filter(lambda x: x[1] > 0)
    #|beam.io.WriteToText('./output/card_skip')
)

medical_loan_defaulter = (
    p1
    | 'Read_medical' >> beam.io.ReadFromText('loan.txt', skip_header_lines=1)
    | 'Split Row' >> beam.Map(lambda row: row.split(','))
    | 'Filter medical loan' >> beam.Filter(
        lambda element: (element[5]).rstrip().lstrip() == 'Medical Loan')
    | 'Calculate late payment' >> beam.Map(calculate_late_payment)
    | 'Make key value pairs' >> beam.Map(
        lambda elements: (elements[0] + ', ' + elements[1] + ' ' + elements[2],
                          int(elements[9])))
    | 'Group medical loan based on month' >> beam.CombinePerKey(sum)
    | 'Check for medical loan defaulter' >>
    beam.Filter(lambda element: element[1] >= 3))
Example #20
def run():
    address_scd = """SELECT * FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ADDRESS_SCD`"""

    upd_addrorg_data = """SELECT
  CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
  a.ADDRESS_NAME AS ADDR_NAME,
  'CLIC' AS ETL_SOURCE_SYSTEM,
  a.FILE_SET_DATE AS ETL_END_EFFECTIVE_DT,
  '0' AS ETL_CURRENT_IND,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR
FROM
  `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a"""

    primary_pipeline_1 = 'p1'
    p1 = p | 'AddressSCD Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=address_scd, use_standard_sql=True))

    join_pipeline_1 = 'j1'
    j1 = p | 'AddressORG Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=upd_addrorg_data, use_standard_sql=True))

    common_key = {'CUSTOMER_ID', 'ADDR_NAME', 'ETL_SOURCE_SYSTEM'}
    pipelines_dictionary_1 = {primary_pipeline_1: p1, join_pipeline_1: j1}

    p1j1 = (pipelines_dictionary_1 | 'Updating addrs Fields' >> LeftJoin(
        primary_pipeline_1, p1, join_pipeline_1, j1, common_key))

    ins_addrorg_data = """SELECT
  (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_ADDRESS_KEY,
  '' AS CUSTOMER_KEY,
  CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
  a.ADDRESS_NAME AS ADDR_NAME,
  'CLIC' AS ETL_SOURCE_SYSTEM,
  CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT,
  a.ADDRESS_LINE_1 AS ADDR_LINE1_TXT,
  a.ADDRESS_LINE_2 AS ADDR_LINE2_TXT,
  a.CITY AS CITY_NAME,
  a.STATE AS STATE_CODE,
  a.COUNTRY AS COUNTRY_CODE,
  SUBSTR(a.ZIP_CODE,1,5) AS POSTAL_ZIP,
  SUBSTR(a.ZIP_CODE,6,9) AS POSTAL_ZIP4,
  CASE WHEN a.DISABLE_CLEANSING_FLAG = 'N' THEN 1
          ELSE 0 
          END AS ADDR_CLEANSING_IND,
  CASE WHEN a.FRAUD_BAD_ACCT_FLAG = 'Y' THEN 1
          ELSE 0 
          END AS ADDR_FRAUD_IND,
  CASE WHEN a.AGENT_VERIFIED_ADDRESS = 'Y' THEN 1
          ELSE 0 
          END AS ADDR_QAS_VERIFIED_IND,
  a.ADDRESS_TYPE_CODE AS ADDR_TYPE_CODE,
  a.SHIP_TO_FIRST_NAME AS SHIPTO_FIRST_NAME,
  a.SHIP_TO_LAST_NAME AS SHIPTO_LAST_NAME,
  TIMESTAMP_ADD(a.FILE_SET_DATE, INTERVAL 1 DAY) AS ETL_BEGIN_EFFECTIVE_DT,
  CAST('2099-12-31 00:00:00' AS TIMESTAMP) AS ETL_END_EFFECTIVE_DT,
  '1' AS ETL_CURRENT_IND,
  '2' AS ETL_VERSION_NBR, --should be a sequential number
  '0' AS VOID_IND,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S',
    CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S',
    CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR
FROM
  `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a,
  `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key
          WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ADDRESS_SCD"
   """

    Attribute_ref_query = """SELECT
    CUSTOMER_KEY,
    CUSTOMER_ID
  FROM
    `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF` b"""

    lookup_data = p1 | 'Get Cust_Ids' >> beam.Map(
        lambda row: (str(row['CUSTOMER_ID']) + row['ADDR_NAME'] +
                     row['ETL_SOURCE_SYSTEM'], row))

    primary_pipeline_2 = 'p2'
    p2 = (p | 'Read from addrorg' >> beam.io.Read(
        beam.io.BigQuerySource(query=ins_addrorg_data, use_standard_sql=True))
          | 'Lookup' >> beam.Map(lookup, AsDict(lookup_data))
          | 'Filter' >> beam.ParDo(filter_out_nones))

    join_pipeline_2 = 'j2'
    j2 = p | 'Read From Attribute Ref Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=Attribute_ref_query,
                               use_standard_sql=True))

    common_key = 'CUSTOMER_ID'
    pipelines_dictionary_2 = {primary_pipeline_2: p2, join_pipeline_2: j2}

    p2j2 = (pipelines_dictionary_2
            | 'Left join' >> LeftJoin2(primary_pipeline_2, p2, join_pipeline_2,
                                       j2, common_key)
            | 'Filter Nulls' >> beam.Filter(filter_null))

    ((p1j1, p2j2) | 'Merge PCollections' >> beam.Flatten()
     | 'Write to IM_CUSTOMER_ADDRESS_SCD' >> beam.io.WriteToBigQuery(
         output_table,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
         create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))

    p.run().wait_until_finish()
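LeftJoin and LeftJoin2 are composite transforms defined elsewhere in the original module, as are lookup, filter_out_nones and filter_null. As a minimal sketch (an assumption, not the project's actual implementation), a left join over a dictionary of keyed PCollections can be built on CoGroupByKey roughly like this:

# Sketch of a CoGroupByKey-based left join; the real LeftJoin/LeftJoin2
# used above are defined elsewhere and may differ.
class LeftJoinSketch(beam.PTransform):
    def __init__(self, primary_name, primary, join_name, join, common_key):
        # The primary/join PCollections arrive through the input dictionary,
        # so only the names and the join key are kept here.
        super(LeftJoinSketch, self).__init__()
        self.primary_name = primary_name
        self.join_name = join_name
        # Accept either a single column name or a set of column names.
        self.keys = sorted(common_key) if isinstance(common_key, set) else [common_key]

    def expand(self, pcolls):
        def key_row(row, keys):
            return tuple(str(row[k]) for k in keys), row

        def merge(grouped):
            _, per_source = grouped
            right_rows = per_source[self.join_name]
            for left_row in per_source[self.primary_name]:
                merged = dict(left_row)
                if right_rows:
                    merged.update(right_rows[0])  # keep the left row even without a match
                yield merged

        return ({name: pcoll | 'Key %s' % name >> beam.Map(key_row, self.keys)
                 for name, pcoll in pcolls.items()}
                | 'CoGroupByKey' >> beam.CoGroupByKey()
                | 'MergeRows' >> beam.FlatMap(merge))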
Exemplo n.º 21
0
def run_job(
    output_loc,
    policies,
    true_rewards,
    num_trials,
    num_contexts,
    num_logs,
    slate_depth,
    cut_off,
    dataflow_args,
):
    def init_target_policies_fn():
        target_policies = [(p[0], p[1]) for p in policies.items() if p[0] != "logging_policy"]
        return target_policies

    target_policy_names = list(zip(*init_target_policies_fn()))[0]

    pipeline_options = PipelineOptions(dataflow_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline = beam.Pipeline(options=pipeline_options)

    _init_env_fn = partial(
        init_env_fn,
        num_logs=num_logs,
        num_contexts=num_contexts,
        true_rewards=true_rewards,
        policies=policies,
        depth=slate_depth,
    )
    _init_estimators_fn = partial(init_estimators_fn, cutoffs=cut_off)
    _flatten_onpolicy_results = partial(flatten_onpolicy_results, cutoffs=cut_off)

    for trial in range(num_trials):
        logs = pipeline | "LogSimulation[T-{}]".format(trial) >> BeamRankerSimulator(num_logs, _init_env_fn)

        (
            logs
            | "FilterLoggingPolicyLog[T-{}]".format(trial) >> beam.Filter(lambda x: x[0] == "logging_policy")
            | "AddPredictions[T-{}]".format(trial)
            >> beam.Map(lambda x: addTargetPolicies(x[1], target_policy_names, policies=policies))
            | "ListwiseMetricRunner[T-{}]".format(trial)
            >> BeamListwiseMetricRunner(
                _init_estimators_fn,
                init_target_policies_fn,
                max_cutoff=max(cut_off),
            )
            | "WriteToFile[T-{}]".format(trial)
            >> beam.io.WriteToText(
                join(output_loc, "trial-{}-results".format(trial)),
                file_name_suffix=".json",
                coder=JsonCoder,
            )
        )
        (
            logs
            | "SumRewards[T-{}]".format(trial)
            >> beam.FlatMap(lambda l: [(l[0] + ":" + str(c), sum(l[1].slate_rewards[:c])) for c in cut_off])
            | "ComputeMean[T-{}]".format(trial) >> beam.transforms.combiners.Mean.PerKey()
            | "GroupAll[T-{}]".format(trial) >> GroupAll()
            | "FlattenResultIntoSingleMap[T-{}]".format(trial) >> beam.Map(_flatten_onpolicy_results)
            | "WriteToOnPolicyFile[T-{}]".format(trial)
            >> beam.io.WriteToText(
                join(output_loc, "trial-{}-onpolicy".format(trial)),
                file_name_suffix=".json",
                coder=JsonCoder,
            )
        )
    results = pipeline.run()
    results.wait_until_finish()
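The JsonCoder passed to WriteToText comes from the surrounding project; WriteToText is normally handed a coder instance (e.g. coder=JsonCoder()), so the project's class presumably exposes encode accordingly. A minimal JSON line coder in that spirit, offered only as an assumption, could be:

# Hypothetical JSON coder for WriteToText; the project's JsonCoder may differ.
import json

from apache_beam.coders import Coder


class JsonCoder(Coder):
    """Encodes each element as one JSON line."""

    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, encoded):
        return json.loads(encoded.decode('utf-8'))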
Exemplo n.º 22
0
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# beam-playground:
#   name: Filter
#   description: Task from katas to implement a filter function that filters out odd numbers.
#   multifile: false
#   categories:
#     - Filtering

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.Filter(lambda num: num % 2 == 0)
     | LogElements())
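With the input range(1, 11), the filter keeps only the even numbers, so LogElements would print 2, 4, 6, 8 and 10.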

Exemplo n.º 23
0

def FormatText(elem):
    return elem[0] + ' has received ' + str(elem[1]) + ' marks'


p1 = beam.Pipeline()

input_collection = (
    p1
    | beam.io.ReadFromText('../Apache_Beam_Data/students_marks.txt')
    | beam.Map(SplitRow))

US_pipeline = (
    input_collection
    | beam.Filter(lambda record: FilterBasedonCountry('US', record))
    | "Composite Transformation for US" >> MyTransform()
    | 'Writing results to US File' >> beam.io.WriteToText('output/US_Result'))

India_pipeline = (
    input_collection
    | beam.Filter(lambda record: FilterBasedonCountry('IN', record))
    | "Composite Transformation for IN" >> MyTransform()
    | 'Writing results to India File' >> beam.io.WriteToText('output/IN_Result'))

p1.run()

print('')
print("US Result: ")
os.system('cat output/US*')
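SplitRow, FilterBasedonCountry and MyTransform are defined earlier in the original file. The otherwise unused FormatText helper hints at what the composite does; one hypothetical shape, with column positions that are pure assumptions about students_marks.txt, would be:

# Hypothetical composite; the real MyTransform is defined elsewhere and the
# column indices below are assumptions about the input layout.
class MyTransform(beam.PTransform):
    def expand(self, pcoll):
        return (pcoll
                | 'PairStudentAndMarks' >> beam.Map(lambda rec: (rec[1], int(rec[-1])))
                | 'SumMarks' >> beam.CombinePerKey(sum)
                | 'FormatOutput' >> beam.Map(FormatText))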
Exemplo n.º 24
0
def transform_data(train_data_file, test_data_file, working_dir, pipeline):
    def pre_processing_fun(inputs):
        outputs = {}

        for fea in NUMERIC_FEATURE_KEYS:
            outputs[fea] = tft.scale_to_0_1(inputs[fea])

        for fea in CATEGORICAL_FEATURE_KEYS:
            outputs[fea] = tft.string_to_int(inputs[fea])

        def convert_label(label):
            table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_KEY] = tft.apply_function(convert_label,
                                                inputs[LABEL_KEY])

        return outputs

    converter = csv_coder.CsvCoder(ORDERED_COLUMNS, RAW_DATA_META.schema)
    '''
    Transform and save train data
    '''
    raw_train_data = (
        pipeline
        | "Read raw train input" >> beam.io.textio.ReadFromText(train_data_file)
        | "Filter train line" >> beam.Filter(lambda x: x)
        | "Fix commas train data" >> beam.Map(lambda x: x.replace(', ', ','))
        | "Decode train as csv" >> beam.Map(converter.decode))

    raw_train_dataset = (raw_train_data, RAW_DATA_META)

    transformed_train_dataset, transform_fn = (
        raw_train_dataset
        | beam_impl.AnalyzeAndTransformDataset(pre_processing_fun))

    transformed_train_data, transformed_train_meta = transformed_train_dataset

    # Save transformed training data
    (transformed_train_data
     | "Save transformed train data" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
         coder=example_proto_coder.ExampleProtoCoder(
             transformed_train_meta.schema)))
    '''
    Transform and save test data
    '''
    raw_test_data = (
        pipeline
        | "Read raw test input" >> beam.io.textio.ReadFromText(test_data_file)
        | "Filter test line" >> beam.Filter(lambda x: x)
        | "Fix commas test data" >> beam.Map(lambda x: x.replace(', ', ','))
        | "Decode test as csv" >> beam.Map(converter.decode))

    raw_test_dataset = (raw_test_data, RAW_DATA_META)

    transformed_test_dataset = (raw_test_dataset,
                                transform_fn) | beam_impl.TransformDataset()

    transformed_test_data, _ = transformed_test_dataset

    # Save transformed test data
    (transformed_test_data
     | "Save transformed test data" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
         coder=example_proto_coder.ExampleProtoCoder(
             transformed_train_meta.schema)))
    '''
    Save transform function
    '''
    (transform_fn
     | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
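transform_data expects the caller to supply the pipeline, and tf.Transform's Beam implementation also needs a Context with a temporary directory for its analyzers. A hypothetical driver (file names and working directory are placeholders, not from the original) would look like:

# Hypothetical invocation of transform_data; paths are placeholders.
import tempfile

with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transform_data('train.csv', 'test.csv', '/tmp/tft_working_dir', pipeline)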
Exemplo n.º 25
0
    def expand(self, pcoll):
        return pcoll | "IsBid" >> beam.Filter(is_bid)
Exemplo n.º 26
0
        format='[%(asctime)s][%(name)s][%(levelname)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO)
    return log_file


if __name__ == "__main__":

    args, log_file = init(), config()
    logging.info('Starting job...')
    logging.info(f'Input: "{args.input}"')
    logging.info(f'Output: "{args.output}"')

    with beam.Pipeline(runner="DirectRunner") as pipeline:
        (pipeline
         | 'Read Data' >> beam.io.ReadFromText(args.input)
         | 'Parse JSON' >> beam.ParDo(JSONParser())
         | 'Remove Invalid' >> beam.Filter(lambda data: 'id' in data)
         | 'Key/Value Pair' >> beam.Map(lambda data: (data['id'], data))
         | 'Group by Key' >> beam.GroupByKey()
         | 'Remove Duplicates' >> beam.Map(lambda data: data[1][0])
         | 'Show IDs' >> beam.ParDo(Printer())
         | 'Parse Dates' >> beam.ParDo(DateParser())
         | 'Write Output' >> beam.io.WriteToParquet(
             f'{args.output}/{uuid4()}',
             schema.jokes(),
             codec='snappy',
             file_name_suffix='.snappy.parquet'))

    logging.info(f'Job finished... Log file saved at "{log_file}"')
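schema.jokes() comes from the project's schema module; WriteToParquet expects a pyarrow schema, so a minimal stand-in (every field except id is an assumption) might be:

# Hypothetical pyarrow schema; only the id field is implied by the pipeline.
import pyarrow as pa


def jokes():
    return pa.schema([
        ('id', pa.string()),
        ('joke', pa.string()),
        ('created_at', pa.timestamp('ms')),
    ])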
Exemplo n.º 27
0
    def expand(self, pcoll):
        return pcoll | "IsPerson" >> beam.Filter(is_person)
Exemplo n.º 28
0
! pip install apache-beam
import apache_beam as beam

# !mkdir data
p1 = beam.Pipeline()

def SplitRow(element):
  return element.split(',')

def filtering(record):
  return record[3] == "Accounts"
attendance_count = (
    p1
    |"Read from data" >> beam.io.ReadFromText('dept_data.txt')
    | "Split data" >> beam.Map(lambda line: line.split(","))
    | "Filter" >> beam.Filter(filtering)
    | "Adding Key and value" >> beam.Map(lambda l: (l[1], 1))
    | "Combine by key" >> beam.CombinePerKey(sum)
    | "Write_File" >> beam.io.WriteToText('data/sample_data12121')
)

p1.run()

!head -n 20 data/*

Exemplo n.º 29
0
import apache_beam as beam
def SplitRow(element):
    return element.split(',')

def filtering(record):
    return record[3] == 'Accounts'

def listing(record):
    return (record[1],1)

p1 = beam.Pipeline()

attendance_count = (
    p1
    | "Read" >> beam.io.ReadFromText('dept-data.txt')
    | "Split" >> beam.Map(lambda element: element.split(','))
    | "Filter" >> beam.Filter(lambda record: record[3] == 'Accounts')
    | "Map Name" >> beam.Map(lambda record: (record[1], 1))
    | "Combine name" >> beam.CombinePerKey(sum)
    | "Write to beam" >> beam.io.WriteToText('data/output_new2')
)

p1.run()
Exemplo n.º 30
0
                                                     skip_header_lines=1)
    | "Text to list (rainfall)" >> beam.Map(text_to_list, delimeter=',')
    | "Create uf-year-month key" >> beam.Map(key_uf_year_month)
    | "Sum of rainfall cases" >> beam.CombinePerKey(sum)
    | "Rounding rain results" >> beam.Map(round_results)
    #| "Show rain results" >> beam.Map(print)
)

results = (
    # (rain, dengue)
    # | "Join dengue and rainfall pcollections" >> beam.Flatten()
    # | "Group by key" >> beam.GroupByKey()
    # | "Show final results" >> beam.Map(print)
    ({
        'chuvas': rain,
        'dengue': dengue
    })
    | "Merge results" >> beam.CoGroupByKey()
    | "Filter data" >> beam.Filter(filter_fields)
    | "Unpack element" >> beam.Map(descompct_element)
    | "Build csv row" >> beam.Map(buil_csv)
    # | "Show final results" >> beam.Map(print)
)

header = 'UF;YEAR;MONTH;RAINFALL;DENGUE'

results | "Write results to csv" >> WriteToText(
    './basedb/resultado', file_name_suffix='.csv', header=header)

pipeline.run()
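filter_fields, descompct_element and buil_csv are defined earlier in the original script. After CoGroupByKey each element looks like ('UF-YYYY-MM', {'chuvas': [...], 'dengue': [...]}), so hypothetical versions of those helpers (assumptions, not the original code) could be:

# Hypothetical helpers; the real ones are defined earlier in the script.
def filter_fields(element):
    # Keep only keys that have both a rainfall and a dengue value.
    _, data = element
    return all([data['chuvas'], data['dengue']])


def descompct_element(element):
    # ('UF-YYYY-MM', {...}) -> ('UF', 'YYYY', 'MM', rainfall, dengue)
    key, data = element
    uf, year, month = key.split('-')
    return uf, year, month, str(data['chuvas'][0]), str(data['dengue'][0])


def buil_csv(element, delimiter=';'):
    # Join the unpacked fields with the same separator used in the header.
    return delimiter.join(element)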