def _run_bigquery(config, start_time_t, time_interval_seconds,
                  time_table_expiration):
    """config is as described in logs_bridge.config.json."""
    # First, create a temporary table that's just the rows from
    # start_time_t to start_time_t + time_interval_seconds.
    # We'll give it a random name so we can run multiple copies of
    # this script at the same time.
    temp_table_name = (
        'khan-academy:logs_streaming_tmp_analysis.logs_bridge_%d_%04d'
        % (start_time_t, random.randint(0, 9999)))

    # We assume that this script will not run for longer than
    # time_interval_seconds; if it did, it would continually be
    # falling behind!
    temp_table_query = _query_for_rows_in_time_range(config, start_time_t,
                                                     time_interval_seconds)

    # Apparently the commandline doesn't like newlines in the script.
    # Reformat for the commandline.
    temp_table_query = temp_table_query.replace('\n', ' ')
    logging.debug("Creating the temporary table for querying over "
                  "by running " + temp_table_query)

    bq_util.call_bq(['mk', '--expiration', str(time_table_expiration),
                     temp_table_name],
                    project='khan-academy',
                    return_output=False,
                    stdout=open(os.devnull, 'w'))
    bq_util.call_bq(['query', '--destination_table', temp_table_name,
                     '--allow_large_results', temp_table_query],
                    project='khanacademy.org:deductive-jet-827',
                    return_output=False)
    logging.info("Step 1: Created temp table with all loglines. "
                 "View it on bigquery:")
    logging.info("   https://bigquery.cloud.google.com/table/%s?tab=preview"
                 % (temp_table_name))

    subqueries = [_create_subquery(entry, start_time_t,
                                   time_interval_seconds, temp_table_name)
                  for entry in config]

    # num_requests is the total number of requests in the specified
    # time period (either `now` or some other time).  In order to get
    # this value, we sum the total number of requests for each field
    # (e.g. we sum the total number of requests for each browser)
    # partitioned by the time, `when`.
    query = ('SELECT *, SUM(num_requests_by_field) OVER(PARTITION BY when)'
             ' as num_requests FROM %s' % ',\n'.join(subqueries))
    logging.debug('BIGQUERY QUERY: %s' % query)

    job_name = 'logs_bridge_query_%s' % random.randint(0, sys.maxint)
    r = bq_util.query_bigquery(query, job_name=job_name)
    logging.info('Step 2: Counting # of logs that match metric:')
    logging.info('   https://bigquery.cloud.google.com/results/'
                 'khanacademy.org:deductive-jet-827:%s' % job_name)
    logging.debug('BIGQUERY RESULTS: %s' % r)

    return r
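
# Illustrative usage sketch for _run_bigquery (not called anywhere in this
# module; how `config` is actually loaded and which window/expiration the
# real logs_bridge driver uses are assumptions -- the values below are only
# placeholders):
#
#     import json
#     import time
#     with open('logs_bridge.config.json') as f:
#         config = json.load(f)
#     # Analyze the five minutes ending now, letting the temp table expire
#     # after an hour.
#     results = _run_bigquery(config,
#                             start_time_t=int(time.time()) - 300,
#                             time_interval_seconds=300,
#                             time_table_expiration=3600)
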
def dos_detect(start, end):
    query = QUERY_TEMPLATE.format(
        fastly_log_tables=_fastly_log_tables(start, end),
        start_timestamp=start.strftime(TS_FORMAT),
        end_timestamp=end.strftime(TS_FORMAT),
        max_count=(MAX_REQS_SEC * PERIOD))
    results = bq_util.call_bq(['query', query], project=BQ_PROJECT)
    for row in results:
        msg = ALERT_TEMPLATE.format(**row)
        alertlib.Alert(msg).send_to_slack(ALERT_CHANNEL)
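
# Illustrative usage sketch for dos_detect (hypothetical; assumes the real
# cron entry point passes a datetime window and that PERIOD, defined
# elsewhere in this file, is the polling interval in seconds):
#
#     import datetime
#     end = datetime.datetime.utcnow()
#     start = end - datetime.timedelta(seconds=PERIOD)
#     dos_detect(start, end)
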
def scratchpad_detect(start, end):
    scratchpad_query = SCRATCHPAD_QUERY_TEMPLATE.format(
        fastly_log_tables=_fastly_log_tables(start, end),
        start_timestamp=start.strftime(TS_FORMAT),
        end_timestamp=end.strftime(TS_FORMAT),
        max_count=MAX_SCRATCHPADS)
    scratchpad_results = bq_util.call_bq(['query', scratchpad_query],
                                         project=BQ_PROJECT)

    if len(scratchpad_results) != 0:
        msg = SCRATCHPAD_ALERT_INTRO_TEMPLATE.format(
            max_count=MAX_SCRATCHPADS)
        msg += '\n'.join(
            SCRATCHPAD_ALERT_ENTRY_TEMPLATE.format(**row)
            for row in scratchpad_results)
        alertlib.Alert(msg).send_to_slack(ALERT_CHANNEL)
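
# Illustrative usage sketch for scratchpad_detect: it takes the same kind of
# datetime window as dos_detect, so a caller could reuse the window built
# above (again, only a placeholder for whatever the real driver does):
#
#     scratchpad_detect(start, end)
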
def _send_campaign_report(status, start_date, end_date, temp_file, verbose,
                          dry_run, keep_temp):
    """Export data about all campaigns in a date range to Bigquery.

    This selects campaigns that started between start_date and end_date
    inclusive.

    Arguments:
        status: Export only the details of campaigns with this status.
            Options are 'sent', 'sending', 'scheduled' and 'draft'.
        start_date: Start date of blasts (format example: 'January 1 2017')
        end_date: End date of blasts (format example: 'January 1 2017')
        temp_file: A file to store the data, to be used by 'bq load'.
        verbose: True if you want to show debug messages, else False.
        dry_run: True if we should skip writing to bq.
        keep_temp: True if we should keep the temp_file that we write.

    Returns:
        A Python set of the blast IDs for the blasts that started within
        the 7 days before end_date, inclusive of both end_date and the day
        seven days before it.  The returned set has nothing to do with
        start_date.
    """
    recent_blast_ids = set()

    response = _get('blast', status=status, start_date=start_date,
                    end_date=end_date, limit=0)
    blasts_info_json = response.get_body().get('blasts')
    all_blasts_length = len(blasts_info_json)

    try:
        with open(temp_file, "wb") as json_file:
            for i in range(all_blasts_length):
                # Get the date a blast was started.
                date = datetime.datetime.strptime(
                    blasts_info_json[i]['start_time'],
                    '%a, %d %b %Y %H:%M:%S -%f')
                # Store a list of all blast IDs that started in the
                # last 7 days of end_date.
                if date >= datetime.datetime.strptime(
                        end_date, '%B %d %Y') - datetime.timedelta(days=7):
                    recent_blast_ids.add(blasts_info_json[i]['blast_id'])

                json.dump(blasts_info_json[i], json_file)
                if i != len(blasts_info_json) - 1:
                    json_file.write("\n")

        table_name = "sailthru_blasts.campaigns"
        if dry_run:
            print(
                "DRY RUN: if this was for real, we would write data at path"
                " '%s' to bq table '%s'" % (temp_file, table_name))
        else:
            if verbose:
                print("Writing json file with %s lines to bigquery table %s"
                      % (all_blasts_length, table_name))
            bq_util.call_bq([
                'load', '--source_format=NEWLINE_DELIMITED_JSON', '--replace',
                table_name, temp_file,
                os.path.join(os.path.dirname(__file__),
                             'sailthru_campaign_export_schema.json')
            ], project='khanacademy.org:deductive-jet-827',
                return_output=False)
    finally:
        if not keep_temp:
            os.unlink(temp_file)

    return recent_blast_ids
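
# Illustrative usage sketch for _send_campaign_report (the date range, temp
# path, and flag values are hypothetical; the real driver presumably builds
# them from command-line flags):
#
#     recent_ids = _send_campaign_report(
#         status='sent',
#         start_date='January 1 2017',
#         end_date='January 31 2017',
#         temp_file='/tmp/sailthru_campaigns.json',
#         verbose=True,
#         dry_run=True,
#         keep_temp=False)
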
def _send_blast_details_to_bq(blast_id, temp_file, verbose, dry_run,
                              keep_temp):
    """Export blast data to BigQuery.

    Arguments:
        blast_id: ID of the blast to fetch data for.
        temp_file: A file to store the data, to be used by 'bq load'.
        verbose: True if you want to show debug messages, else False.
        dry_run: True if we should skip writing to bq, and instead log
            what would have happened.  For normal behavior, set False.
        keep_temp: True if we should keep the temp_file that we write.
    """
    # Map associating blast_query response column names with separators.
    #
    # The blast_query response contains columns that contain multiple items.
    # These columns have different separators.
    #
    # The absence of a column name in this associative array indicates that
    # cells under this column each only contain a single item.
    blast_report_list_column_separators = {
        "first_ten_clicks": " ",
        "first_ten_clicks_time": "|"
    }

    # Map associating Sailthru blast report header names with bq column names.
    blast_report_header_corrections = {
        "email hash": "email_hash",   # Spaces are annoying in SQL!
        "extid": "kaid",              # Might as well be precise.
    }

    # Fields for which we should append timezone information.
    blast_report_timestamp_columns = None
    with open(
            os.path.join(os.path.dirname(__file__),
                         "sailthru_blast_export_schema.json")) as f:
        blast_report_timestamp_columns = {
            column.get("name"): True
            for column in json.load(f)
            if column.get("type") == "TIMESTAMP"
        }

    tz_utc_offset = _get_sailthru_timezone_utc_offset()

    response_1 = _post('job', job="blast_query", blast_id=blast_id,
                       verbose=verbose)
    job_id = response_1.get_body().get('job_id')

    if job_id is None:
        print(
            "WARNING: For the blast_query job with blast_id = %s, "
            "the job_id returned from Sailthru's job=blast_query is "
            "None" % blast_id)
        return

    if verbose:
        print(
            "For the blast_query job with blast_id = %s, calling "
            "sailthru's job status API for job_id = %s" % (blast_id, job_id))

    response_2 = _get('job', job_id=job_id)
    while response_2.get_body().get('status') != "completed":
        if verbose:
            print(
                "For the blast_query job with blast_id = %s, polled "
                "sailthru's job status API for job_id = %s "
                % (blast_id, job_id))
            print("Will poll again in 5 seconds.")
        time.sleep(5)
        response_2 = _get('job', job_id=job_id)
        if response_2.get_body().get('status') == "expired":
            raise SailthruAPIException(response_2)

    filename_url = response_2.get_body().get('export_url')

    if verbose:
        print(
            "For the blast_query job with blast_id = %s, creating a jsonl "
            "file from the sailthru data" % blast_id)

    with _CPU_LOCK:
        try:
            with open(temp_file, "wb") as f:
                with contextlib.closing(
                        urllib.urlopen(filename_url)) as csvdata:
                    # Take the csv data from the Sailthru API and
                    # convert it to JSON.  bq can read columns in
                    # REPEATED mode from JSON files, but not from
                    # CSVs, and we have cells that contain multiple
                    # items.
                    reader = csv.reader(csvdata, delimiter=',', quotechar='"')
                    headers = reader.next()

                    # Correct confusing header names.
                    headers = [
                        blast_report_header_corrections[hdr]
                        if hdr in blast_report_header_corrections else hdr
                        for hdr in headers
                    ]

                    for row_csv in reader:
                        row_object = {}
                        for idx, column_name in enumerate(headers):
                            cell_content = row_csv[idx].strip()
                            if cell_content == "":
                                row_object[column_name] = None
                            elif (column_name in
                                    blast_report_list_column_separators):
                                sep = blast_report_list_column_separators[
                                    column_name]
                                row_object[column_name] = (
                                    cell_content.split(sep))
                            else:
                                row_object[column_name] = cell_content

                            # Append timezone information to TIMESTAMP cells.
                            if column_name in blast_report_timestamp_columns:
                                if isinstance(row_object[column_name], str):
                                    row_object[column_name] += " %s" % (
                                        tz_utc_offset)
                                elif isinstance(row_object[column_name],
                                                list):
                                    row_object[column_name] = [
                                        "%s %s" % (date, tz_utc_offset)
                                        for date in row_object[column_name]
                                    ]
                                else:
                                    assert (row_object[column_name] is None)

                        # Append the blast ID to each row.  This way
                        # we can join/union this blast table with
                        # other tables while preserving blast_ids.
                        # Otherwise, the blast_id would only be
                        # accessible from the table name.
                        row_object["blast_id"] = str(blast_id)

                        # Write each row.
                        # In JSON mode, bq expects a JSON object on each line.
                        f.write("%s\n" % (json.dumps(row_object)))

            table_name = "sailthru_blasts.blast_%s" % str(blast_id)
            # (TODO: Update schema to port dates in TIMESTAMP format in bq)
            if dry_run:
                print(
                    "DRY RUN: if this was for real, for the blast_query "
                    "job with blast_id = %s, we would write data at path "
                    "'%s' to bq table '%s'"
                    % (blast_id, temp_file, table_name))
            else:
                if verbose:
                    print(
                        "For the blast_query job with blast_id = %s, "
                        "writing jsonl file to bigquery" % blast_id)
                bq_util.call_bq([
                    'load', '--source_format=NEWLINE_DELIMITED_JSON',
                    '--replace', table_name, temp_file,
                    os.path.join(os.path.dirname(__file__),
                                 'sailthru_blast_export_schema.json')
                ], project='khanacademy.org:deductive-jet-827',
                    return_output=False)
        finally:
            if not keep_temp:
                os.unlink(temp_file)
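
# Illustrative usage sketch for _send_blast_details_to_bq: the recent blast
# IDs returned by _send_campaign_report() can be fed through this exporter
# one at a time.  The temp-file naming below is a placeholder, not
# necessarily what the real driver does:
#
#     for blast_id in recent_ids:
#         _send_blast_details_to_bq(
#             blast_id, '/tmp/sailthru_blast_%s.json' % blast_id,
#             verbose=False, dry_run=True, keep_temp=False)
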
def build_page_load_temp_table(start_date=None, end_date=None):
    """Builds the temp table with all relevant page load performance data.

    Returns the name of the table so it can be used in other functions which
    perform the queries to get the data we want for this report.

    The temp table is built from all request logs in the time from start_date
    to end_date.  start_date and end_date are expected to be datetime objects
    if given.  They default to eight days ago and yesterday, respectively.

    If the table already exists, this function exits successfully and returns
    the name of the table without rebuilding it.
    """
    start_date = (start_date or _DEFAULT_START_TIME).strftime("%Y%m%d")
    end_date = (end_date or _DEFAULT_END_TIME).strftime("%Y%m%d")

    temp_table_name = (
        'khan-academy:logs_streaming_tmp_analysis.email_reliability_%s_%s'
        % (start_date, end_date))

    # If the temp table already exists, don't redo the expensive query.
    try:
        bq_util.call_bq(['show', temp_table_name], project='khan-academy')
        return temp_table_name
    except subprocess.CalledProcessError:   # means 'table does not exist'
        pass

    bq_util.call_bq(['mk', temp_table_name], project='khan-academy',
                    return_output=False)

    query = """\
SELECT * FROM (
  SELECT
    FLOAT(REGEXP_EXTRACT(app_logs.message,
        r'stats.time.client.sufficiently_usable_ms\.\w+\.\w+:(\d+);'
    ))/1000 AS page_load_time,
    REGEXP_EXTRACT(app_logs.message,
        r'stats.time.client.sufficiently_usable_ms\.\w+\.(\w+):\d+;'
    ) AS page_load_page,
    REGEXP_EXTRACT(app_logs.message,
        r'stats.time.client.sufficiently_usable_ms\.(server|client).+;'
    ) AS page_load_nav_type,
    elog_country
  FROM TABLE_DATE_RANGE(
      [khanacademy.org:deductive-jet-827:logs.requestlogs_],
      TIMESTAMP('%(start_date)s'),
      TIMESTAMP('%(end_date)s'))
  WHERE elog_device_type IS NOT NULL AND elog_device_type != 'bot/dev'
)
WHERE page_load_time IS NOT NULL AND (
  (page_load_nav_type = 'server' AND page_load_time < 30)
  OR (page_load_nav_type = 'client' AND page_load_time < 20)
)
""" % {
        "start_date": start_date,
        "end_date": end_date,
    }

    # Modeled from logs_bridge._run_bigquery
    # Apparently the commandline doesn't like newlines in the script.
    # Reformat for the commandline.
    query = query.replace('\n', ' ')

    bq_util.call_bq([
        'query', '--destination_table', temp_table_name,
        '--allow_large_results', query
    ], project='khanacademy.org:deductive-jet-827', return_output=False)

    return temp_table_name
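
# Illustrative usage sketch for build_page_load_temp_table (hypothetical
# window; by default the table covers eight days ago through yesterday):
#
#     import datetime
#     end = datetime.datetime.now() - datetime.timedelta(days=1)
#     start = end - datetime.timedelta(days=7)
#     temp_table = build_page_load_temp_table(start, end)
#     # temp_table can now be interpolated into the follow-up report queries.
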