Example No. 1
  def test_read_display_data(self):
    file_name = 'some_parquet_source'
    read = \
      ReadFromParquet(
          file_name,
          validate=False)
    dd = DisplayData.create_from(read)

    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example No. 2
 def test_sink_transform(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example No. 3
 def test_sink_transform(self):
   with TemporaryDirectory() as tmp_dirname:
     path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example No. 4
 def test_sink_transform_compressed(self, compression_type):
   if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
     raise unittest.SkipTest(
         "Writing with LZ4 compression is not supported in "
         "pyarrow 1.x")
   with TemporaryDirectory() as tmp_dirname:
     path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, codec=compression_type,
           num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path + '*') \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example No. 5
 def test_sink_transform_compliant_nested_type(self):
   if ARROW_MAJOR_VERSION < 4:
     raise unittest.SkipTest(
         'Writing with compliant nested type is only '
         'supported in pyarrow 4.x and above')
   with TemporaryDirectory() as tmp_dirname:
     path = os.path.join(tmp_dirname, 'tmp_filename')
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS_NESTED) \
       | WriteToParquet(
           path, self.SCHEMA_NESTED, num_shards=1,
           shard_name_template='', use_compliant_nested_type=True)
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(
           readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
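The tests above always read back complete records. ReadFromParquet also accepts a columns argument to project a subset of fields; the sketch below is not part of the test suite above, and the 'name'/'age' fields and the test name are illustrative assumptions.

import json
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import apache_beam as beam
from apache_beam.io.parquetio import ReadFromParquet
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def test_read_with_column_projection():
  # Illustrative sketch (not from the examples above): write a small parquet
  # file with pyarrow, then read back only the 'name' column with Beam.
  with tempfile.TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'records.parquet')
    pq.write_table(pa.table({'name': ['a', 'b'], 'age': [1, 2]}), path)
    with TestPipeline() as p:
      # Each projected record comes back as a dict with only the requested keys.
      readback = (
          p
          | ReadFromParquet(path, columns=['name'])
          | beam.Map(json.dumps))
      assert_that(
          readback,
          equal_to([json.dumps({'name': 'a'}), json.dumps({'name': 'b'})]))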
    # This function computes the schema for the parquet output, applying the column renames to the original schema
    def getSchema():
        df_schema = pyarrow.Schema.from_pandas(
            pd.read_parquet(user_options.schema_source.get()))
        for (key, value) in ast.literal_eval(
                user_options.rename_columns.get()).items():
            df_schema = df_schema.set(
                df_schema.get_field_index(key),
                pyarrow.field(value,
                              df_schema.types[df_schema.get_field_index(key)]))
        return df_schema

    # Read the source parquet files and build the dictionary that maps the columns to rename
    map_rename_cols = (
        p | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
        | "Map rename cols" >> beam.Map(mapRenameCols)
        | "Rename cols to string" >> beam.Map(str)
        | "Deduplicate elements" >> beam.Distinct())
    # Read the data from the source files
    data = (p
            | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))
    # Apply the column-renaming function, receiving the result of the previous step as the rename dictionary
    rename_data = (data | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))
    # Write the data to the destination path, obtaining the schema from the getSchema function
    _ = (rename_data | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
    if "timePartitioning" in metadata_table_bq.keys(
    ) or "rangePartitioning" in metadata_table_bq.keys():
        if "timePartitioning" in metadata_table_bq.keys():
            print("time Partitioning")
            bq_param = metadata_table_bq["timePartitioning"]
#             bq_param = {"timePartitioning": {"type": metadata_table_bq["timePartitioning"]["type"],"field": metadata_table_bq["timePartitioning"]["field"]}}
        elif "rangePartitioning" in metadata_table_bq.keys():
            print("range Partitioning")
            bq_param2 = metadata_table_bq["rangePartitioning"]
            bq_param = {
                "rangePartitioning": {
                    "field": metadata_table_bq["rangePartitioning"]["field"],
                    "range": metadata_table_bq["rangePartitioning"]["range"]
                }
            }
        data = p | "Read from storage HOM" >> ReadFromParquet(
            user_options.url_hom)
        print(bq_param)
        #         data | beam.Map(print)
        data | "Write to BQ HOM" >> WriteToBigQuery(
            #                                             table=user_options.table_id,
            table='yas-dev-sii-pid:sii_yas_de_hom.hom_cuentas_test',
            #                                             project=metadata_table_bq['tableReference']['projectId'],
            additional_bq_parameters=bq_param,
            schema=metadata_table_bq['schema'],
            #                                             method='FILE_LOADS',
            create_disposition='CREATE_IF_NEEDED',
            #                                             create_disposition='CREATE_NEVER',
            write_disposition='WRITE_TRUNCATE',
            #                                             write_disposition='WRITE_APPEND',
            #                                             custom_gcs_temp_location=user_options.custom_bq_temp_loc.get()