def __init__(self, source_table, destination_table, post_process_file):
    self.source_schema, self.source_table = source_table.split(".")
    self.destination_schema, self.destination_table = destination_table.split(".")
    self.sf_engine = snowflake_engine_factory(
        os.environ, "TRANSFORMER", self.source_schema
    )
    self.post_process_file = post_process_file
def extract_from_stage(import_table, stage, schema, path, pattern, env):
    engine = snowflake_engine_factory(env, "LOADER", schema)
    query = f"""
        COPY INTO raw.{schema}.{import_table}
        FROM @{stage}/{path}
        PATTERN = '{pattern}'
        ON_ERROR = 'CONTINUE';
    """
    execute_query(engine, query)
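# Hypothetical invocation of extract_from_stage; the table, stage, path, and
# pattern values below are illustrative only and are not taken from the source.
extract_from_stage(
    import_table="log_entries",
    stage="releases_stage",
    schema="releases",
    path="2023/01",
    pattern=".*[.]csv",
    env=os.environ,
)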
def update_chronological_sequence():
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
    query = """
        UPDATE analytics.events.user_events_by_date
        SET chronological_sequence = a.chronological_sequence,
            seconds_after_prev_event = a.seconds_after_prev_event
        FROM (
            SELECT
                id,
                updated_at,
                ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp) AS chronological_sequence,
                DATEDIFF(
                    second,
                    LAG(min_timestamp) OVER (PARTITION BY user_id ORDER BY min_timestamp),
                    min_timestamp
                ) AS seconds_after_prev_event
            FROM analytics.events.user_events_by_date
            WHERE LENGTH(user_id) < 36
              AND user_id IS NOT NULL
        ) a
        WHERE user_events_by_date.updated_at::timestamp =
              (SELECT MAX(updated_at)::timestamp FROM analytics.events.user_events_by_date)
          AND a.id = user_events_by_date.id;
    """
    execute_query(engine, query)
def __init__(self, processing_table="analytics.util.pg_imports"):
    self.processing_table = processing_table
    self.sf_engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
import os
import sys
from datetime import datetime

import clearbit
import pandas as pd
import psycopg2
import snowflake.connector
from sqlalchemy import create_engine

from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe

# Create database connection and cursor
engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
connection = engine.connect()

# Set the Clearbit API key
clearbit.key = os.getenv("CLEARBIT_KEY")

# Retrieve column names if the table already exists
try:
    col_q = """
        SELECT column_name
        FROM analytics.information_schema.columns
        WHERE table_name = 'CLOUD_CLEARBIT'
          AND table_schema = 'MATTERMOST'
        ORDER BY ordinal_position
    """
    col_df = execute_dataframe(engine, query=col_q)
    clearbit_cols = []
    if len(col_df) > 0:
        for index, row in col_df.iterrows():
            clearbit_cols.append(row["COLUMN_NAME"].lower())
except Exception as e:
    # Assumed fallback (the original except clause is not shown): if the table
    # does not exist yet, start with an empty column list.
    print(e)
    clearbit_cols = []
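# A minimal sketch of how the columns retrieved above might drive a Clearbit
# enrichment call. clearbit.Company.find is part of the clearbit Python client;
# the helper name, the example domain, and the column filtering are assumptions
# for illustration, not the pipeline's actual logic.
def enrich_company(domain):
    try:
        company = clearbit.Company.find(domain=domain, stream=True)
    except Exception as e:
        print(e)
        return None
    if company is None:  # Clearbit has no record for this domain
        return None
    # Keep only attributes that already exist as columns in CLOUD_CLEARBIT,
    # so an insert would match the current table layout.
    return {k: v for k, v in company.items() if k.lower() in clearbit_cols}

# Example (hypothetical domain): enrich_company("example.com")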
def parse_user_agent():
    """Search for and parse all available user agents received via telemetry
    data that are not currently in the analytics.web.user_agent_registry table."""
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

    # Create user_agent_registry if it does not already exist.
    query = """
        CREATE TABLE IF NOT EXISTS analytics.web.user_agent_registry (
            context_useragent VARCHAR,
            browser VARCHAR,
            browser_version VARCHAR,
            operating_system VARCHAR,
            os_version VARCHAR,
            device_type VARCHAR,
            device_brand VARCHAR,
            device_model VARCHAR
        );
    """
    execute_query(engine, query)

    # Union all sources of context_useragent data that are not currently in the
    # user_agent_registry table.
    query = """
        SELECT *
        FROM (
            SELECT COALESCE(
                       analytics.events.user_events_telemetry.context_user_agent,
                       analytics.events.user_events_telemetry.context_useragent
                   ) AS context_useragent
            FROM analytics.events.user_events_telemetry
            LEFT JOIN (
                SELECT context_useragent AS join_key
                FROM analytics.web.user_agent_registry
                GROUP BY 1
            ) a
                ON COALESCE(
                       analytics.events.user_events_telemetry.context_user_agent,
                       analytics.events.user_events_telemetry.context_useragent
                   ) = a.join_key
            WHERE COALESCE(
                      analytics.events.user_events_telemetry.context_user_agent,
                      analytics.events.user_events_telemetry.context_useragent
                  ) IS NOT NULL
              AND a.join_key IS NULL
            GROUP BY 1

            UNION ALL

            SELECT useragent AS context_useragent
            FROM raw.releases.log_entries
            LEFT JOIN (
                SELECT context_useragent AS join_key
                FROM analytics.web.user_agent_registry
                GROUP BY 1
            ) a
                ON raw.releases.log_entries.useragent = a.join_key
            WHERE useragent IS NOT NULL
              AND a.join_key IS NULL
            GROUP BY 1

            UNION ALL

            SELECT useragent AS context_useragent
            FROM raw.diagnostics.log_entries
            WHERE useragent IS NOT NULL
              AND useragent NOT IN (
                  SELECT context_useragent
                  FROM analytics.web.user_agent_registry
                  GROUP BY 1
              )
              AND logdate::date >= CURRENT_DATE - INTERVAL '1 DAY'
            GROUP BY 1

            UNION ALL

            SELECT context_useragent
            FROM analytics.web.daily_website_traffic
            LEFT JOIN (
                SELECT context_useragent AS join_key
                FROM analytics.web.user_agent_registry
                GROUP BY 1
            ) a
                ON analytics.web.daily_website_traffic.context_useragent = a.join_key
            WHERE context_useragent IS NOT NULL
              AND a.join_key IS NULL
            GROUP BY 1
        )
        GROUP BY 1;
    """
    df = execute_dataframe(engine, query)

    # Check whether there are any new context_useragents to insert into the table.
    if len(df) == 0:
        print("Nothing to do.")
    else:
        # Parse the useragent components and append each component as a column
        # to the existing dataframe.
        browser = []
        browser_family = []
        browser_version = []
        browser_version_string = []
        operating_system = []
        os_family = []
        os_version = []
        os_version_string = []
        device = []
        device_family = []
        device_brand = []
        device_model = []

        for index, row in df.iterrows():
            ua_string = row["CONTEXT_USERAGENT"]
            user_agent = parse(ua_string)

            # Accessing user agent's browser properties
            browser.append(user_agent.browser)
            browser_family.append(user_agent.browser.family)
            browser_version.append(user_agent.browser.version)
            browser_version_string.append(user_agent.browser.version_string)

            # Accessing user agent's operating system properties
            operating_system.append(user_agent.os)
            os_family.append(user_agent.os.family)
            os_version.append(user_agent.os.version)
            os_version_string.append(user_agent.os.version_string)

            # Accessing user agent's device properties
            device.append(user_agent.device)
            device_family.append(user_agent.device.family)
            device_brand.append(user_agent.device.brand)
            device_model.append(user_agent.device.model)

        browser = pd.Series(browser_family, name="browser")
        browser_version = pd.Series(browser_version_string, name="browser_version")
        op_sys = pd.Series(os_family, name="operating_system")
        os_version = pd.Series(os_version_string, name="os_version")
        device_type = pd.Series(device_family, name="device_type")
        device_brand = pd.Series(device_brand, name="device_brand")
        device_model = pd.Series(device_model, name="device_model")

        agent_lists = [
            browser,
            browser_version,
            op_sys,
            os_version,
            device_type,
            device_brand,
            device_model,
        ]
        for item in agent_lists:
            df = df.join(item)

        connection = engine.connect()

        # 16,384 is Snowflake's row limit for a single INSERT statement. To ensure
        # the job executes successfully, check that the data being inserted does
        # not exceed that limit; if it does, load the dataframe incrementally.
        df[0:16384 if len(df) > 16384 else len(df)].to_sql(
            "user_agent_registry",
            con=connection,
            index=False,
            schema="WEB",
            if_exists="append",
        )

        i = 2  # Default number of inserts. Autoincrements if more than 2 inserts are required.
        if i <= math.ceil(len(df) / 16384):
            x = 16384  # Start row of the dataframe slice to insert. Autoincrements if more than 2 inserts are required.
            y = 16384 * 2  # End row of the dataframe slice to insert. Autoincrements if more than 2 inserts are required.
            # Loop through the remaining insert statements required to finish the
            # job, i.e. load all new user agents found above.
            for n in range(math.ceil(len(df) / 16384) - 1):
                df[x:y if y < len(df) else len(df)].to_sql(
                    "user_agent_registry",
                    con=connection,
                    index=False,
                    schema="WEB",
                    if_exists="append",
                )
                x = y
                y += 16384
                i += 1

        return print(
            f"Successfully uploaded {len(df)} records to analytics.web.user_agent_registry!"
        )
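# A standalone sketch of the parse() call used in parse_user_agent(). It assumes
# parse comes from the user_agents package (pip install user-agents); the sample
# user-agent string and the expected output values are illustrative only.
from user_agents import parse

sample_ua = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
)
user_agent = parse(sample_ua)
print(user_agent.browser.family, user_agent.browser.version_string)  # e.g. Chrome 96.0.4664
print(user_agent.os.family, user_agent.os.version_string)            # e.g. Windows 10
print(user_agent.device.family)                                      # e.g. Other (desktop)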
def contributors():
    org = "mattermost"
    data = []
    repo = []
    records = []
    has_next = True
    cursor = ""

    # Page through all repositories in the organization.
    while has_next:
        try:
            result = graphql_query(gen_repo_query(org, cursor))
        except Exception as e:
            print(e)
            return
        repo_results = result["data"]["organization"]["repositories"]["nodes"]
        has_next = result["data"]["organization"]["repositories"]["pageInfo"][
            "hasNextPage"
        ]
        cursor = result["data"]["organization"]["repositories"]["pageInfo"][
            "endCursor"
        ]
        for i in repo_results:
            repo.append(i["name"])

    # Page through the pull requests of each repository and record merged PRs.
    for one_repo in repo:
        has_next = True
        cursor = ""
        while has_next:
            query = gen_query(org, one_repo, cursor)
            try:
                result = graphql_query(query)
            except Exception as e:
                print(e)
                return
            pull_requests = result["data"]["repository"]["pullRequests"]
            has_next = pull_requests["pageInfo"]["hasNextPage"]
            cursor = pull_requests["pageInfo"]["endCursor"]
            for node in pull_requests["nodes"]:
                if node and node["author"]:
                    records.append(
                        {
                            "PR_NUMBER": node["number"],
                            "MERGED_AT": node["mergedAt"],
                            "AUTHOR": node["author"]["login"],
                            "REPO": one_repo,
                        }
                    )

    df = pd.DataFrame.from_records(records)
    try:
        engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
        connection = engine.connect()
        connection.execute("DELETE FROM staging.github_contributions_all")
        print(f"Preparing to load results to Snowflake. Records: {len(records)}")
        df.to_sql(
            "github_contributions_all",
            con=connection,
            index=False,
            schema="STAGING",
            if_exists="append",
            method=pd_writer,
        )
    except Exception as e:
        print(e)
    connection.close()
    return
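# One possible shape of the graphql_query helper used in contributors(), assuming
# it posts the query string to GitHub's GraphQL endpoint with a personal access
# token. The GITHUB_TOKEN variable name and the use of requests are assumptions.
import os
import requests

def graphql_query(query):
    response = requests.post(
        "https://api.github.com/graphql",
        json={"query": query},
        headers={"Authorization": f"Bearer {os.getenv('GITHUB_TOKEN')}"},
    )
    response.raise_for_status()  # Surface HTTP errors to the caller
    return response.json()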
def get_twitter_mentions():
    # Twitter credentials, obtained from a Twitter developer account.
    # TODO: add this or another Twitter Developer API key to SysVars; this is
    # currently a personal API key.
    consumer_key = os.getenv("TWITTER_CONSUMER_KEY")
    consumer_secret = os.getenv("TWITTER_CONSUMER_SECRET")
    access_key = os.getenv("TWITTER_ACCESS_KEY")
    access_secret = os.getenv("TWITTER_ACCESS_SECRET")

    # Pass the Twitter credentials to tweepy via its OAuthHandler
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Create database connection and cursor
    engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")
    connection = engine.connect()

    # Create an empty dataframe with the required columns
    db_tweets = pd.DataFrame(
        columns=[
            "username",
            "text",
            "full_name",
            "user_url",
            "url",
            "retweet_text",
            "original_tweet_date",
            "retweeted_status",
            "retweet_count",
            "created_at",
            "location",
            "followers",
            "user_id",
            "favorite_count",
            "lang",
            "verified",
            "hashtags",
            "following_count",
            "is_tweet_reply",
            "id",
            "longitude_latitude",
        ]
    )

    # Fetch the latest data from the existing analytics.social_mentions.twitter relation
    query = """
        SELECT
            MAX(created_at - INTERVAL '1 day')::date::varchar AS date,
            MAX(created_at)::varchar AS timestamp
        FROM analytics.social_mentions.twitter
    """
    try:
        results = execute_query(engine, query)
    except Exception as e:
        print(f"Oh no! There was an error executing your query: {e}")

    # Retrieve all tweets created on or after the max created_at in the
    # analytics.social_mentions.twitter relation
    tweets = tweepy.Cursor(
        api.search, q="mattermost", since=f"{results[0][0]}"
    ).items(5000)

    # Loop through new tweets and extract the relevant fields to populate the dataframe.
    for tweet in tweets:
        is_tweet_reply = tweet.in_reply_to_screen_name is not None
        username = tweet.user.screen_name
        full_name = tweet.user.name
        user_url = tweet.user.url
        url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"
        retweet_count = tweet.retweet_count
        verified = tweet.user.verified
        user_id = tweet.user.id
        favorite_count = tweet.favorite_count
        acctdesc = tweet.user.description
        location = tweet.user.location
        following = tweet.user.friends_count
        followers = tweet.user.followers_count
        totaltweets = tweet.user.statuses_count
        usercreatedts = tweet.user.created_at
        created_at = tweet.created_at.strftime("%Y-%m-%d %H:%M:%S")
        lang = tweet.lang
        hashtags = str(tweet.entities["hashtags"])
        longitude_latitude = tweet.coordinates
        tweet_id = tweet.id

        try:
            text = tweet.text
            retweet_text = tweet.retweeted_status.text
            original_tweet_date = tweet.retweeted_status.created_at.strftime(
                "%Y-%m-%d %H:%M:%S"
            )
            is_retweet = tweet.retweeted_status.text is not None
        except AttributeError:  # Not a retweet
            text = tweet.text
            original_tweet_date = None
            retweet_text = None
            is_retweet = False

        # Add the variables to the tweet list to be inserted into the dataframe
        ith_tweet = [
            username,
            text,
            full_name,
            user_url,
            url,
            retweet_text,
            original_tweet_date,
            is_retweet,
            retweet_count,
            created_at,
            location,
            followers,
            user_id,
            favorite_count,
            lang,
            verified,
            hashtags,
            following,
            is_tweet_reply,
            tweet_id,
            longitude_latitude,
        ]

        # Append to the db_tweets dataframe
        db_tweets.loc[len(db_tweets)] = ith_tweet

    # Append new rows to the analytics.social_mentions.twitter relation
    db_tweets[db_tweets["created_at"] > results[0][1]].to_sql(
        "twitter",
        con=connection,
        index=False,
        schema="SOCIAL_MENTIONS",
        if_exists="append",
    )
#!/usr/bin/python
import argparse
import os
import sys

from extract.utils import snowflake_engine_factory, execute_query

parser = argparse.ArgumentParser()
parser.add_argument("sql_file", help="The SQL file to run on Snowflake")
parser.add_argument(
    "role", default="TRANSFORMER", help="The role to run the queries for"
)
parser.add_argument("schema", help="Default schema to use for queries")

if __name__ == "__main__":
    args = parser.parse_args()
    engine = snowflake_engine_factory(os.environ.copy(), args.role, args.schema)

    with open(f"transform/sql/snowflake/{args.sql_file}.sql") as f:
        content = f.read()

    # Preserve escaped semicolons while splitting the file into individual
    # statements, then restore them before execution.
    content = content.replace(r"\;", "SEMICOLONTEMPFIX")
    queries = content.split(";")

    with engine.begin() as conn:
        for query in queries:
            conn.execute(query.replace("SEMICOLONTEMPFIX", ";"))
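# Hypothetical invocation (the script name, SQL file, and schema below are
# illustrative only): assuming the script above is saved as manual_run.py, the
# following would execute every statement in transform/sql/snowflake/cleanup.sql:
#
#   python manual_run.py cleanup TRANSFORMER util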