Example #1
import datetime
import json

import boto3
import requests
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import date_format
from pyspark.sql.types import StructField, StructType, StringType

# INFLUX_URL, S3_BUCKET_FOR_BOOKMARK, BOOKMARK_FILE and TABLE1/TABLE2/TABLE3 are
# assumed to be defined elsewhere in the job script.

def get_influx_dataframe(table, influx_url=INFLUX_URL,
                         s3_bucket_for_bookmark=S3_BUCKET_FOR_BOOKMARK,
                         bookmark_file=BOOKMARK_FILE):
    """
    Reads table from URL of influx and returns it as PySpark DataFrame

    Arguments
    ---------
        table (str):
            Table to be retrieved from Influx Database
        influx_url (str):
            URL of influx database
        s3_bucket_for_bookmark (str):
            Name of s3 bucket containing bookmark
        bookmark_file (str):
            Location of file in s3 bucket containing bookmark

    Returns
    -------
        influx_df (pyspark.sql.DataFrame):
            Influx database retrieved as pyspark.sql.DataFrame where time
            is datetime-encoded
    """
    s3 = boto3.resource('s3')
    obj = s3.Object(s3_bucket_for_bookmark, bookmark_file)
    insert_into_timestamp = obj.get()['Body'].read().decode('utf-8')
    print("Inserted into timestamp: " + insert_into_timestamp)

    # Converting insert_into_timestamp into datetime format.
    # NOTE: Python's `datetime` module only supports microsecond precision, so
    # insert_into_timestamp is trimmed from nanoseconds to microseconds before
    # parsing with datetime.strptime()
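    # (e.g. "2021-03-01T12:34:56.123456789Z"[:-4] -> "2021-03-01T12:34:56.123456",
    #  assuming the bookmark stores an RFC 3339 timestamp with a trailing "Z")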
    temp = insert_into_timestamp[:-4]
    datetime_insert_timestamp = datetime.datetime.strptime(temp, '%Y-%m-%dT%H:%M:%S.%f')

    glueContext = GlueContext(SparkContext.getOrCreate())

    if table == TABLE1:
        query = f"SELECT * FROM {table} WHERE event =~ /card_viewed|card_created|card_marked_as_complete|channel_followed|group_user_added|card_assigned/ and time > now() - 3d and time < now() - 1d"
    elif table == TABLE2:
        query = f"SELECT * FROM {table} WHERE event =~ /card_viewed|card_created|card_marked_as_complete|channel_followed|group_user_added|card_assigned/ and time > now() - 3d and time < now() - 1d"
    elif table == TABLE3:
        query = f"SELECT * FROM {table} WHERE time > now() - 3d and time < now() - 1d"
    else:
        raise ValueError(f"Unsupported table: {table}")

    params = {"pretty": "false", "q": query}


    #params = {"pretty": "false", "q": "SELECT * FROM "+influxTable+" WHERE time >'"+last_processed_timestamp+"' order by time desc"}

    r = requests.get(influx_url, params=params)
    data = json.loads(r.text)

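    # Expected shape of the Influx /query JSON response (illustrative):
    # {"results": [{"series": [{"columns": [...], "values": [[...], ...]}]}]}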
    # Retrieving data column names and values from JSON
    values = data["results"][0]["series"][0]["values"]
    columns = data["results"][0]["series"][0]["columns"]

    # Timestamp of the first value row; data was inserted after this point
    inserted_into_timestamp = values[0][0]

    # Defining schema for the data
    column_structfields = [StructField(column, StringType(), True) for column in columns]
    schema = StructType(column_structfields)

    # Creating new dataframe with above-defined schema
    df = glueContext.createDataFrame(values, schema)

    # Cast 'time' in 'df' to a proper timestamp and add a 24-hour formatted string column
    new_df = df.withColumn("time", df["time"].cast("timestamp"))
    new_df = new_df.withColumn("time_string",
                               date_format(new_df.time, "yyyy-MM-dd HH:mm:ss"))

    return new_df
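
# A minimal usage sketch, assuming the module-level constants (INFLUX_URL,
# S3_BUCKET_FOR_BOOKMARK, BOOKMARK_FILE, TABLE1) are defined and the job can
# reach the Influx endpoint. The call below is hypothetical, for illustration only.
events_df = get_influx_dataframe(TABLE1)
events_df.printSchema()
events_df.show(5, truncate=False)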
Example #2
job.init(args['JOB_NAME'], args)

### CODE STARTS HERE ###

# Load the contents of the JSON file taken from "Referenced files path":
import json

with open('areas.json', 'r') as inputfile:
    filejson = json.load(inputfile)
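
# For illustration, areas.json is assumed to be a JSON array of objects like:
# [{"id": 1, "name": "North"}, {"id": 2, "name": "South"}, ...]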

# Create a Spark DataFrame with the (id, name) pairs of the first ten areas from the JSON file:
df = glueContext.createDataFrame([(area["id"], area["name"]) for area in filejson[:10]],
                                 ['id', 'name'])

# Check 'df' contents:
df.printSchema()
df.show()

# Convert 'df' to Glue DynamicFrame:
dyf = DynamicFrame.fromDF(df, glueContext, "dftodyf")

# Repartition into 12 partitions for maximum parallelism given my DPU configuration:
dyf_rep = dyf.repartition(12)
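
# A possible next step would be writing the repartitioned DynamicFrame to S3
# as Parquet; the bucket path below is a hypothetical placeholder.
glueContext.write_dynamic_frame.from_options(
    frame=dyf_rep,
    connection_type="s3",
    connection_options={"path": "s3://example-bucket/areas/"},
    format="parquet")

job.commit()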
Example #3
  StructField("steve_ross", LongType()),
  StructField("structure", LongType()),
  StructField("sun", LongType()),
  StructField("tomb_frame", LongType()),
  StructField("tree", LongType()),
  StructField("trees", LongType()),
  StructField("triple_frame", LongType()),
  StructField("waterfall", LongType()),
  StructField("waves", LongType()),
  StructField("windmill", LongType()),
  StructField("window_frame", LongType()),
  StructField("winter", LongType()),
  StructField("wood_framed", LongType())
])

paintings_data_frame = glueContext.createDataFrame(paintings_source, schema = paintings_schema)
paintings = DynamicFrame.fromDF(paintings_data_frame, glueContext, 'dyf')

# Cast all "bit" fields (LongTypes) into booleans
# It's easier to use a list of non-bit fields as the majority of fields imported are bit fields
non_bit_fields = ["episode", "title"]
bit_fields_specs = [
    (field.name, "cast:boolean")
    for field in paintings.schema()
    # Type-check to prevent accidentally casting a non-bit column missing from "non_bit_fields"
    if field.name not in non_bit_fields and field.dataType.typeName() == 'long'
]
paintings_with_bool_fields = ResolveChoice.apply(paintings, specs = bit_fields_specs)
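
# Optional sanity check: the former LongType "bit" columns should now appear
# as boolean in the resolved schema.
paintings_with_bool_fields.printSchema()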

# Parse and clean up the season, episode, and episode text fields
def normalize_episode_fields(record):
    # Parse the season and episode numbers