Example #1

from extract.utils import snowflake_engine_factory, execute_query


def extract_from_stage(import_table, stage, schema, path, pattern, env):
    """Bulk-load files matching `pattern` from a Snowflake stage into raw.<schema>.<import_table>."""
    engine = snowflake_engine_factory(env, 'LOADER', schema)

    query = f"""
        COPY INTO raw.{schema}.{import_table}
        FROM @{stage}/{path}
        PATTERN = '{pattern}'
        ON_ERROR = 'CONTINUE';
    """
    execute_query(engine, query)
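
# A minimal usage sketch; the stage, path, and pattern values below are hypothetical:
# extract_from_stage(
#     import_table="log_entries",
#     stage="raw.releases.release_stage",
#     schema="releases",
#     path="2021",
#     pattern=".*[.]csv",
#     env=os.environ,
# )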

Example #2

import os

from extract.utils import snowflake_engine_factory, execute_query


def update_chronological_sequence():
    """Recompute each user's chronological_sequence and seconds_after_prev_event
    in ANALYTICS.EVENTS.USER_EVENTS_BY_DATE."""
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

    query = '''
    UPDATE ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
    SET chronological_sequence = a.chronological_sequence,
        seconds_after_prev_event = a.seconds_after_prev_event
    FROM (
        SELECT id,
            updated_at,
            ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp) as chronological_sequence,
            datediff(second, lag(min_timestamp) over (partition by user_id order by min_timestamp), min_timestamp) as seconds_after_prev_event
        FROM ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
        WHERE length(user_id) < 36
        AND user_id IS NOT NULL
    ) a
    WHERE user_events_by_date.updated_at::timestamp = (SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date)
    AND a.id = user_events_by_date.id;
    '''

    execute_query(engine, query)
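
# Illustrative output of the window functions above for a single user_id
# (chronological_sequence from ROW_NUMBER, seconds_after_prev_event from LAG):
#   min_timestamp         chronological_sequence  seconds_after_prev_event
#   2021-01-01 00:00:00   1                       NULL
#   2021-01-01 00:00:45   2                       45
#   2021-01-01 00:02:00   3                       75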
Example #3

import math
import os

import pandas as pd
from user_agents import parse  # assumed source of parse(); matches the .browser/.os/.device attributes used below

from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe


def parse_user_agent():
    """Search for and parse all available user agents received via telemetry data
    that are not currently in the analytics.web.user_agent_registry table."""
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

    # CREATE USER_AGENT_REGISTRY IF IT DOES NOT ALREADY EXIST.
    query = f"""
    CREATE TABLE IF NOT EXISTS analytics.WEB.user_agent_registry
(
    context_useragent VARCHAR,
    browser           VARCHAR,
    browser_version   VARCHAR,
    operating_system  VARCHAR,
    os_version        VARCHAR,
    device_type       VARCHAR,
    device_brand      VARCHAR,
    device_model      VARCHAR
);"""

    execute_query(engine, query)

    # UNION ALL SOURCES OF CONTEXT_USERAGENT DATA THAT ARE NOT CURRENTLY IN THE USER_AGENT_REGISTRY TABLE.
    query = f"""
    SELECT *
    FROM (
        SELECT COALESCE(ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USER_AGENT, ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USERAGENT) AS CONTEXT_USERAGENT
        FROM ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON COALESCE(ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USER_AGENT, ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USERAGENT) = a.JOIN_KEY
        WHERE COALESCE(ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USER_AGENT, ANALYTICS.EVENTS.USER_EVENTS_TELEMETRY.CONTEXT_USERAGENT) IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        UNION ALL
        SELECT USERAGENT AS CONTEXT_USERAGENT
        FROM RAW.RELEASES.LOG_ENTRIES
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON RAW.RELEASES.LOG_ENTRIES.USERAGENT = a.JOIN_KEY
        WHERE USERAGENT IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        UNION ALL
        SELECT USERAGENT AS CONTEXT_USERAGENT
        FROM RAW.DIAGNOSTICS.LOG_ENTRIES
        WHERE USERAGENT IS NOT NULL
            AND USERAGENT NOT IN (SELECT CONTEXT_USERAGENT FROM analytics.WEB.user_agent_registry GROUP BY 1)
            AND LOGDATE::date >= CURRENT_DATE - INTERVAL '1 DAY'
        GROUP BY 1
        UNION ALL
        SELECT CONTEXT_USERAGENT
        FROM ANALYTICS.WEB.DAILY_WEBSITE_TRAFFIC
        LEFT JOIN (SELECT CONTEXT_USERAGENT as JOIN_KEY FROM analytics.WEB.user_agent_registry GROUP BY 1) a
            ON ANALYTICS.WEB.DAILY_WEBSITE_TRAFFIC.CONTEXT_USERAGENT = a.JOIN_KEY
        WHERE CONTEXT_USERAGENT IS NOT NULL
            AND a.JOIN_KEY IS NULL
        GROUP BY 1
        )
    GROUP BY 1;
    """

    df = execute_dataframe(engine, query)
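    # execute_dataframe is assumed to return the result set as a pandas DataFrame,
    # here a single CONTEXT_USERAGENT column with one row per not-yet-registered agent.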

    if len(df) == 0:  # CHECKS TO SEE IF THERE ARE ANY NEW CONTEXT_USERAGENTS TO INSERT INTO THE TABLE
        print("Nothing to do.")
    else:  # PARSES USERAGENT COMPONENTS AND APPENDS EACH COMPONENT AS A COLUMN TO THE EXISTING DATAFRAME.
        browser = []
        browser_family = []
        browser_version = []
        browser_version_string = []
        operating_system = []
        os_family = []
        os_version = []
        os_version_string = []
        device = []
        device_family = []
        device_brand = []
        device_model = []

        for index, row in df.iterrows():
            ua_string = row["CONTEXT_USERAGENT"]
            user_agent = parse(ua_string)
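            # Illustrative parse (values from the user_agents package): a desktop
            # Chrome-on-Windows UA yields browser.family == "Chrome",
            # os.family == "Windows", and device.family == "Other".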

            browser.append(user_agent.browser)
            browser_family.append(user_agent.browser.family)
            browser_version.append(user_agent.browser.version)
            browser_version_string.append(user_agent.browser.version_string)

            # Accessing user agent's operating system properties
            operating_system.append(user_agent.os)
            os_family.append(user_agent.os.family)
            os_version.append(user_agent.os.version)
            os_version_string.append(user_agent.os.version_string)

            # Accessing user agent's device properties
            device.append(user_agent.device)
            device_family.append(user_agent.device.family)
            device_brand.append(user_agent.device.brand)
            device_model.append(user_agent.device.model)

        browser = pd.Series(browser_family, name="browser")
        browser_version = pd.Series(browser_version_string,
                                    name="browser_version")
        op_sys = pd.Series(os_family, name="operating_system")
        os_version = pd.Series(os_version_string, name="os_version")
        device_type = pd.Series(device_family, name="device_type")
        device_brand = pd.Series(device_brand, name="device_brand")
        device_model = pd.Series(device_model, name="device_model")

        agent_lists = [
            browser,
            browser_version,
            op_sys,
            os_version,
            device_type,
            device_brand,
            device_model,
        ]
        for item in agent_lists:
            df = df.join(item)

        connection = engine.connect()

        # 16,384 is the Snowflake INSERT statement row limit. To ensure the job executes
        # successfully, we check that the data being inserted does not exceed that limit;
        # if it does, we load the dataframe incrementally.
        df[0:16384 if len(df) > 16384 else len(df)].to_sql(
            "user_agent_registry",
            con=connection,
            index=False,
            schema="WEB",
            if_exists="append",
        )
        i = 2  # The default number of times to increment. Will autoincrement if more than 2 inserts are required.

        if i <= math.ceil(len(df) / 16384):
            x = 16384  # The start row of the dataframe slice to be inserted. Will autoincrement if more than 2 inserts are required.
            y = (
                16384 * 2
            )  # The end row of the dataframe slice to be inserted. Will autoincrement if more than 2 inserts are required.

            # Loops through the remaining insert statements required to finish the job i.e. load all new user agents found in the mattermostcom.pages table.
            for n in range(math.ceil(len(df) / 16384) - 1):
                df[x:y if y < len(df) else len(df)].to_sql(
                    "user_agent_registry",
                    con=connection,
                    index=False,
                    schema="WEB",
                    if_exists="append",
                )
                x = y
                y += 16384
                i += 1
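        # Equivalent sketch: the chunked load above could be a single loop over
        # fixed-size slices, e.g.
        #   for start in range(0, len(df), 16384):
        #       df[start:start + 16384].to_sql("user_agent_registry", con=connection,
        #                                      index=False, schema="WEB", if_exists="append")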
        print(
            f"Successfully uploaded {len(df)} records to analytics.web.user_agent_registry!"
        )

Example #4

    print(f'''Success. Uploaded {len(onprem_df2)} rows to ANALYTICS.MATTERMOST.ONPREM_CLEARBIT''')

    clearbit_onprem_exceptions.to_sql("clearbit_onprem_exceptions", con=connection, index=False, schema="STAGING", if_exists="append")
    print(f'''Success. Uploaded {len(clearbit_onprem_exceptions)} rows to ANALYTICS.STAGING.CLEARBIT_ONPREM_EXCEPTIONS''')


    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
    connection = engine.connect()
    try:
        query = '''SELECT * FROM ANALYTICS.STAGING.CLEARBIT_ONPREM_EXCEPTIONS'''
        exceptions_test = execute_dataframe(engine, query=query)
    except Exception:
        exceptions_test = None

    if exceptions_test is None:
        query = '''CREATE OR REPLACE TABLE ANALYTICS.STAGING.CLEARBIT_ONPREM_EXCEPTIONS AS SELECT DISTINCT SERVER_ID, CURRENT_TIMESTAMP AS received_at FROM ANALYTICS.MATTERMOST.ONPREM_CLEARBIT WHERE FUZZY IS NULL;'''
        execute_query(engine, query)

        q2 = '''DELETE FROM ANALYTICS.MATTERMOST.ONPREM_CLEARBIT WHERE FUZZY IS NULL;'''
        execute_query(engine, query=q2)
    else:
        query = '''INSERT INTO ANALYTICS.STAGING.CLEARBIT_ONPREM_EXCEPTIONS(SERVER_ID, received_at)
                        SELECT DISTINCT SERVER_ID, CURRENT_TIMESTAMP AS received_at FROM ANALYTICS.MATTERMOST.ONPREM_CLEARBIT WHERE FUZZY IS NULL;'''
        execute_query(engine, query)

        q2 = '''DELETE FROM ANALYTICS.MATTERMOST.ONPREM_CLEARBIT WHERE FUZZY IS NULL;'''
        execute_query(engine, query=q2)
else:
    print("Nothing to do.")

Example #5

import os

import pandas as pd
import tweepy
from tweepy import OAuthHandler

from extract.utils import snowflake_engine_factory, execute_query


def get_twitter_mentions():
    # Twitter credentials
    # Obtain them from your twitter developer account

    # Need to add this or another Twitter Developer API Key to SysVars - This is currently my personal API Key
    consumer_key = os.getenv("TWITTER_CONSUMER_KEY")
    consumer_secret = os.getenv("TWITTER_CONSUMER_SECRET")
    access_key = os.getenv("TWITTER_ACCESS_KEY")
    access_secret = os.getenv("TWITTER_ACCESS_SECRET")

    # Pass your twitter credentials to tweepy via its OAuthHandler
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # Create database connection and cursor
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
    connection = engine.connect()

    # Create empty dataframe with required columns
    db_tweets = pd.DataFrame(columns=[
        "username",
        "text",
        "full_name",
        "user_url",
        "url",
        "retweet_text",
        "original_tweet_date",
        "retweeted_status",
        "retweet_count",
        "created_at",
        "location",
        "followers",
        "user_id",
        "favorite_count",
        "lang",
        "verified",
        "hashtags",
        "following_count",
        "is_tweet_reply",
        "id",
        "longitude_latitude",
    ])

    # Fetch latest data from existing ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    query = f"""
    SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE, 
    MAX(CREATED_AT)::VARCHAR AS TIMESTAMP 
    FROM analytics.social_mentions.twitter
    """

    try:
        results = execute_query(engine, query)
    except Exception as e:
        print(f"Oh no! There was an error executing your query: {e}")
        raise

    # Retrieve all tweets >= Max Created At in ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    tweets = tweepy.Cursor(api.search,
                           q="mattermost",
                           since=f"{results[0][0]}").items(5000)
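    # results is assumed to hold a single row, e.g. [('2021-06-01', '2021-06-02 13:45:07')]
    # (illustrative values): results[0][0] seeded the since-date above and results[0][1]
    # is the high-water CREATED_AT used to drop duplicates before the final append.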

    # Loop through new tweets and extract relevant fields to populate dataframe.
    for tweet in tweets:
        is_tweet_reply = tweet.in_reply_to_screen_name is not None
        username = tweet.user.screen_name
        full_name = tweet.user.name
        user_url = tweet.user.url
        url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"
        retweet_count = tweet.retweet_count
        verified = tweet.user.verified
        user_id = tweet.user.id
        favorite_count = tweet.favorite_count
        acctdesc = tweet.user.description
        location = tweet.user.location
        following = tweet.user.friends_count
        followers = tweet.user.followers_count
        totaltweets = tweet.user.statuses_count
        usercreatedts = tweet.user.created_at
        created_at = tweet.created_at.strftime("%Y-%m-%d %H:%M:%S")
        lang = tweet.lang
        hashtags = str(tweet.entities["hashtags"])
        longitude_latitude = tweet.coordinates
        tweet_id = tweet.id

        try:
            text = tweet.text
            retweet_text = tweet.retweeted_status.text
            original_tweet_date = tweet.retweeted_status.created_at.strftime(
                "%Y-%m-%d %H:%M:%S")
            is_retweet = tweet.retweeted_status.text is not None
        except AttributeError:  # Not a Retweet
            text = tweet.text
            original_tweet_date = None
            retweet_text = None
            is_retweet = False

        # Add variables to tweet list to be inserted into dataframe:
        ith_tweet = [
            username,
            text,
            full_name,
            user_url,
            url,
            retweet_text,
            original_tweet_date,
            is_retweet,
            retweet_count,
            created_at,
            location,
            followers,
            user_id,
            favorite_count,
            lang,
            verified,
            hashtags,
            following,
            is_tweet_reply,
            tweet_id,
            longitude_latitude,
        ]
        # Append to dataframe - db_tweets
        db_tweets.loc[len(db_tweets)] = ith_tweet
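        # Note: .loc[len(db_tweets)] assigns ith_tweet positionally, so the list
        # order above must match the declared column order of db_tweets
        # (e.g. is_retweet populates the "retweeted_status" column).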

    # Append dataframe to ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
    db_tweets[db_tweets["created_at"] > results[0][1]].to_sql(
        "twitter",
        con=connection,
        index=False,
        schema="SOCIAL_MENTIONS",
        if_exists="append",
    )
Example #6

#!/usr/bin/python

import argparse
import os
import sys

from extract.utils import snowflake_engine_factory, execute_query


parser = argparse.ArgumentParser()
parser.add_argument("queries_file", nargs="?", type=argparse.FileType("r"),
                    default=sys.stdin)
parser.add_argument("role", help="The role to run the queries for")
parser.add_argument("schema", help="Default schema to use for queries")


if __name__ == "__main__":
    args = parser.parse_args()

    engine = snowflake_engine_factory(os.environ.copy(), args.role, args.schema)

    for line in args.queries_file:
        line = line.rstrip("\n")
        execute_query(engine, line)
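
# A minimal invocation sketch (the script and file names are hypothetical):
#   python run_queries.py queries.sql TRANSFORMER util
#   cat queries.sql | python run_queries.py TRANSFORMER util   # queries_file falls back to stdin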