Example #1
import datetime
import json

import boto3
import requests
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import date_format
from pyspark.sql.types import StructField, StructType, StringType

# INFLUX_URL, S3_BUCKET_FOR_BOOKMARK, BOOKMARK_FILE and TABLE1/TABLE2/TABLE3 are
# assumed to be defined elsewhere in the job script.

def get_influx_dataframe(table, influx_url=INFLUX_URL,
                         s3_bucket_for_bookmark=S3_BUCKET_FOR_BOOKMARK,
                         bookmark_file=BOOKMARK_FILE):
    """
    Reads table from URL of influx and returns it as PySpark DataFrame

    Arguments
    ---------
        table (str):
            Table to be retrieved from Influx Database
        influx_url (str):
            URL of influx database
        s3_bucket_for_bookmark (str):
            Name of s3 bucket containing bookmark
        bookmark_file (str):
            Location of file in s3 bucket containing bookmark

    Returns
    -------
        influx_df (pyspark.sql.DataFrame):
            Influx database retrieved as pyspark.sql.DataFrame where time
            is datetime-encoded
    """
    s3 = boto3.resource('s3')
    obj = s3.Object(s3_bucket_for_bookmark, bookmark_file)
    insert_into_timestamp = obj.get()['Body'].read().decode('utf-8')
    print("Inserted into timestamp: " + insert_into_timestamp)

    # Converting insert_into_timestamp into datetime format.
    # NOTE: Python's `datetime` module only supports microsecond precision, so
    # insert_into_timestamp is trimmed from nanoseconds to microseconds before
    # parsing with datetime.strptime()
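    # (e.g. "2021-03-01T12:34:56.123456789Z"[:-4] -> "2021-03-01T12:34:56.123456",
    #  assuming the bookmark stores an RFC 3339 timestamp with a trailing "Z")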
    temp = insert_into_timestamp[:-4]
    datetime_insert_timestamp = datetime.datetime.strptime(temp, '%Y-%m-%dT%H:%M:%S.%f')

    glueContext = GlueContext(SparkContext.getOrCreate())

    if table == TABLE1:
        query = f"SELECT * FROM {table} WHERE event =~ /card_viewed|card_created|card_marked_as_complete|channel_followed|group_user_added|card_assigned/ and time > now() - 3d and time < now() - 1d"
    elif table == TABLE2:
        query = f"SELECT * FROM {table} WHERE event =~ /card_viewed|card_created|card_marked_as_complete|channel_followed|group_user_added|card_assigned/ and time > now() - 3d and time < now() - 1d"
    elif table == TABLE3:
        query = f"SELECT * FROM {table} WHERE time > now() - 3d and time < now() - 1d"
    else:
        raise ValueError(f"Unsupported table: {table}")

    params = {"pretty": "false", "q": query}


    #params = {"pretty": "false", "q": "SELECT * FROM "+influxTable+" WHERE time >'"+last_processed_timestamp+"' order by time desc"}

    r = requests.get(influx_url, params=params)
    data = json.loads(r.text)

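    # Expected shape of the Influx /query JSON response (illustrative):
    # {"results": [{"series": [{"columns": [...], "values": [[...], ...]}]}]}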
    # Retrieving data column names and values from JSON
    values = data["results"][0]["series"][0]["values"]
    columns = data["results"][0]["series"][0]["columns"]

    # Timestamp of the first value row; data was inserted after this point
    inserted_into_timestamp = values[0][0]

    # Defining schema for the data
    column_structfields = [StructField(column, StringType(), True) for column in columns]
    schema = StructType(column_structfields)

    # Creating new dataframe with above-defined schema
    df = glueContext.createDataFrame(values, schema)

    # Cast 'time' in 'df' to a proper timestamp and add a 24-hour formatted string column
    new_df = df.withColumn("time", df["time"].cast("timestamp"))
    new_df = new_df.withColumn("time_string",
                               date_format(new_df.time, "yyyy-MM-dd HH:mm:ss"))

    return new_df
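
# A minimal usage sketch, assuming the module-level constants (INFLUX_URL,
# S3_BUCKET_FOR_BOOKMARK, BOOKMARK_FILE, TABLE1) are defined and the job can
# reach the Influx endpoint. The call below is hypothetical, for illustration only.
events_df = get_influx_dataframe(TABLE1)
events_df.printSchema()
events_df.show(5, truncate=False)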
Example #2
job.init(args['JOB_NAME'], args)

### CODE STARTS HERE ###

# Load the contents of the JSON file taken from "Referenced files path":
import json

with open('areas.json', 'r') as inputfile:
    filejson = json.load(inputfile)
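
# For illustration, areas.json is assumed to be a JSON array of objects like:
# [{"id": 1, "name": "North"}, {"id": 2, "name": "South"}, ...]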

# Create a Spark DataFrame with the (id, name) pairs of the first ten areas from the JSON file:
df = glueContext.createDataFrame([(area["id"], area["name"]) for area in filejson[:10]],
                                 ['id', 'name'])

# Check 'df' contents:
df.printSchema()
df.show()

# Convert 'df' to Glue DynamicFrame:
dyf = DynamicFrame.fromDF(df, glueContext, "dftodyf")

# Repartition into 12 partitions for maximum parallelism given my DPU configuration:
dyf_rep = dyf.repartition(12)
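
# A possible next step would be writing the repartitioned DynamicFrame to S3
# as Parquet; the bucket path below is a hypothetical placeholder.
glueContext.write_dynamic_frame.from_options(
    frame=dyf_rep,
    connection_type="s3",
    connection_options={"path": "s3://example-bucket/areas/"},
    format="parquet")

job.commit()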
Example #3
  StructField("steve_ross", LongType()),
  StructField("structure", LongType()),
  StructField("sun", LongType()),
  StructField("tomb_frame", LongType()),
  StructField("tree", LongType()),
  StructField("trees", LongType()),
  StructField("triple_frame", LongType()),
  StructField("waterfall", LongType()),
  StructField("waves", LongType()),
  StructField("windmill", LongType()),
  StructField("window_frame", LongType()),
  StructField("winter", LongType()),
  StructField("wood_framed", LongType())
])

paintings_data_frame = glueContext.createDataFrame(paintings_source, schema = paintings_schema)
paintings = DynamicFrame.fromDF(paintings_data_frame, glueContext, 'dyf')

# Cast all "bit" fields (LongTypes) into booleans
# It's easier to use a list of non-bit fields as the majority of fields imported are bit fields
non_bit_fields = ["episode", "title"]
bit_fields_specs = [
    (field.name, "cast:boolean")
    for field in paintings.schema()
    # Type-check to prevent accidentally casting a non-bit column missing from "non_bit_fields"
    if field.name not in non_bit_fields and field.dataType.typeName() == 'long'
]
paintings_with_bool_fields = ResolveChoice.apply(paintings, specs = bit_fields_specs)
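
# Optional sanity check: the former LongType "bit" columns should now appear
# as boolean in the resolved schema.
paintings_with_bool_fields.printSchema()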

# Parse and clean up the season, episode, and episode text fields
def normalize_episode_fields(record):
    # Parse the season and episode numbers