def _telem_parsed_to_sql(date_str, channel, measure_iter, version=None,
                         query_file=_UPDATE_SQL_FILE):
    """ Loads _INTERMEDIATE_CSV into the sentiment database """
    db = Db('telemetry', is_persistent=True)

    # Version query
    if not version:
        query = """
            SELECT version
            FROM sentiment.release_info ri
            WHERE
                '{date}' >= ri.{channel}_start_date AND
                '{date}' <= ri.{channel}_end_date
            ;""".format(date=date_str, channel=channel)
        version = str(db.execute_sql(query).first()[0])

    # TODO(rrayborn): figure out why this won't work as a temp table with our
    #                 consistency rules + sqlalchemy's implicit transactions
    query = '''DROP TABLE IF EXISTS tmp_weekly_stats;'''
    db.execute_sql(query)
    query = '''CREATE TABLE tmp_weekly_stats (
            os                      ENUM('Windows','Mac','Linux'),
            measure                 VARCHAR(200),
            measure_value           VARCHAR(200),
            users                   INT,
            measure_average         FLOAT,
            measure_nonzero_average FLOAT,
            active_users            FLOAT,
            potential_users         INT
        );'''
    db.execute_sql(query)

    # Column position in each measure_iter row -> destination column name.
    mappings = {
            0: 'measure',
            1: 'os',
            2: 'measure_average',
            3: 'measure_nonzero_average',
            4: 'active_users',
            5: 'potential_users',
            6: 'measure_value'
        }
    db.insert_data_into_table(measure_iter, 'tmp_weekly_stats', mappings)

    with open(query_file, 'r') as query_sql:
        if query_sql:
            query = query_sql.read()
            db.execute_sql(query, {'week':    date_str,
                                   'channel': channel,
                                   'version': version})
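
# Illustrative call, not part of the pipeline: a minimal sketch assuming a
# measure iterator whose rows follow the column order given by `mappings`
# above (measure, os, averages, user counts, measure_value). The date,
# channel, and row values below are hypothetical.
#
#   rows = iter([
#       ('SOME_MEASURE', 'Windows', 1.2, 3.4, 1000.0, 2000, 'some_value'),
#   ])
#   _telem_parsed_to_sql('2015-06-01', 'release', rows)
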
def update(product='both',
           start_date=(date.today() - timedelta(days=15)).strftime('%Y-%m-%d'),
           end_date=(date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
           ua_db=None,
           input_db=None,
           sumo_db=None):

    if not ua_db:
        ua_db = UA_DB('sentiment', is_persistent=True)
        ua_db.execute_sql('SET autocommit=1;')
    if not input_db:
        input_db = Input_DB(is_persistent=True)
    if not sumo_db:
        sumo_db = Sumo_DB(is_persistent=True)

    adis_file   = _DATA_PATH + '.' + product + '_adis.tsv'
    input_file  = _DATA_PATH + '.' + product + '_adis.tsv'
    visits_file = _DATA_PATH + '.' + product + '_visits.tsv'

    params = {'start_date': start_date, 'end_date': end_date}

    # =========== Create tables ================================================
    _execute_query(ua_db, _CREATE_SQL_FILE, params=params, multi=True)

    # =========== Parse heartbeat data =========================================
    #TODO(rrayborn): should this be its own pipeline?

    # Fetch Input data
    data = _execute_query(input_db, _HEARTBEAT_SQL_FILE, params=params)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'daily_heartbeat_stats', header,
                                 has_header=False, is_replace=True)

    # =========== Parse input data =============================================

    # Fetch Input data
    data = _execute_query(input_db, _INPUT_SQL_FILE, params=params)

    # Create tmp_input table
    query = '''CREATE TEMPORARY TABLE tmp_input (
            `date`                     DATE,
            `version`                  INT,
            `is_desktop`               BOOL,
            `input_average`            FLOAT,
            `input_volume`             INT,
            `heartbeat_average`        FLOAT,
            `heartbeat_surveyed_users` INT,
            `heartbeat_volume`         INT
        );'''
    ua_db.execute_sql(query)

    # Insert Input data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_input', header, has_header=False)

    # =========== Create base table ============================================
    _execute_query(ua_db, _BASE_SQL_FILE, params=params)

    # =========== Parse sumo data ==============================================

    # Fetch Sumo data
    data = _execute_query(sumo_db, _SUMO_SQL_FILE, params=params)

    # Create tmp_sumo table
    query = '''CREATE TEMPORARY TABLE tmp_sumo (
            `date`              DATE,
            `version`           INT,
            `is_desktop`        BOOL,
            `num_unanswered_72` INT,
            `num_posts`         INT
        );'''
    ua_db.execute_sql(query)

    # Insert Sumo data
    header = data.keys()
    ua_db.insert_data_into_table(data, 'tmp_sumo', header, has_header=False)

    # =========== Parse ADI data ===============================================

    # Generate query
    # TODO(rrayborn): Need to investigate why part of the end date is missing.
    #                 Doesn't seem to affect the start_date...
    today_minus_three = (date.today() - timedelta(days=3)).strftime('%Y-%m-%d')
    adi_end_date = min(today_minus_three, end_date)
    with open(_ADIS_SQL_FILE, 'r') as adis_sql:
        query = adis_sql.read().replace('\n', ' ') % (start_date, adi_end_date)

    # Generate/execute command line
    cmd = 'echo "%s" | isql -v metrics_dsn -b -x0x09 >%s'  # | tail -n+10'
    check_output(cmd % (query, adis_file), shell=True)

    # Create tmp table
    query = '''CREATE TEMPORARY TABLE tmp_adis (
            `date`     DATE,
            version    INT,
            is_desktop BOOL,
            num_adis   INT
        );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'num_adis']
    ua_db.insert_csv_into_table(adis_file, 'tmp_adis', header, delimiter='\t')

    # =========== Parse Analytics data =========================================

    # Get Google analytics data
    google_analytics.generate_inproduct(
            db=ua_db,
            device_type=product,
            filename=visits_file,
            start_date=start_date,
            end_date=end_date
        )

    # Create tmp table
    query = '''CREATE TEMPORARY TABLE tmp_sumo_visits (
            `date`     DATE,
            version    INT,
            is_desktop BOOL,
            visits     INT
        );'''
    ua_db.execute_sql(query)

    header = ['date', 'version', 'is_desktop', 'visits']
    ua_db.insert_csv_into_table(visits_file, 'tmp_sumo_visits', header,
                                delimiter='\t')

    # =========== Parse Play data ==============================================
    query = '''CREATE TEMPORARY TABLE tmp_play AS
            SELECT
                `date`,
                version,
                AVG(rating) AS play_average,
                COUNT(*)    AS play_volume
            FROM google_play_reviews
            WHERE `date` >= :start_date AND `date` <= :end_date
            GROUP BY 1,2;
        '''
    ua_db.execute_sql(query, params)

    # =========== Run Stats query ==============================================
    query_files = []
    if product == 'both' or product == 'desktop':
        query_files.append(_DESKTOP_FILE_PATTERN)
    if product == 'both' or product == 'mobile':
        query_files.append(_MOBILE_FILE_PATTERN)

    for query_file in query_files:
        _execute_query(ua_db, query_file, params=params)
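
# Illustrative invocation (hypothetical dates; the UA/Input/SUMO database
# handles fall back to the defaults constructed above when omitted):
#
#   update(product='desktop',
#          start_date='2015-06-01',
#          end_date='2015-06-14')
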
def generate_inproduct(
        db=None,
        auth_token=google_services.google_service_connection().get_auth_token(),
        versions=None,
        device_type='both',
        filename=None,
        start_date=(date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
        end_date=(date.today() - timedelta(days=1)).strftime('%Y-%m-%d'),
        ga_ids=None,
        max_results=_MAX_RESULTS
    ):

    if not versions:
        if not db:
            db = Db('sentiment')
        max_version = db.execute_sql(
                'SELECT MAX(version) FROM release_info;').first()[0]
        versions = range(1, max_version)

    version_filters = []
    for version in versions:
        version_filters.append('ga:browserVersion=@%s.0' % version)

    if not filename:
        filename = device_type + '.tsv'

    device_types = []
    if device_type == 'both' or device_type == 'desktop':
        device_types.append(True)
    if device_type == 'both' or device_type == 'mobile':
        device_types.append(False)

    data = {}

    # SET UP REQUEST
    for is_desktop in device_types:
        # Choose the GA property for this device type; a caller-supplied
        # ga_ids overrides both. (Resolved per iteration so the desktop ID is
        # not reused for mobile when device_type is 'both'.)
        if ga_ids:
            ids = ga_ids
        elif is_desktop:
            ids = _DESKTOP_GA_ID
        else:
            ids = _MOBILE_GA_ID

        if is_desktop:
            device_categories = ['ga:deviceCategory==desktop']
        else:
            device_categories = ['ga:deviceCategory==mobile',
                                 'ga:deviceCategory==tablet']

        dimensions = ['ga:date', 'ga:browserVersion']
        filters = [
                ','.join(version_filters),
                ','.join(device_categories),
                'ga:browser==Firefox',
                'ga:source=@inproduct'
            ]
        parameters = {
                'ids':           ids,
                'access_token':  auth_token,
                'max-results':   max_results,
                'filters':       ';'.join(filters),
                'dimensions':    ','.join(dimensions),
                'metrics':       'ga:sessions',
                'samplingLevel': 'HIGHER_PRECISION'
            }

        # MAKE REQUESTS
        # This is done one day at a time since it's easy to hit Google's data
        # limit in a single multi-day request.
        day_datetime = datetime.strptime(start_date, '%Y-%m-%d')
        end_datetime = datetime.strptime(end_date, '%Y-%m-%d')
        while day_datetime <= end_datetime:
            day = day_datetime.strftime('%Y-%m-%d')
            parameters['start-date'] = day
            parameters['end-date'] = day
            _make_request(parameters, is_desktop, data)
            day_datetime += timedelta(days=1)

    with open(filename, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        header = ['day', 'version', 'is_desktop', 'visits']
        spamwriter.writerow(header)
        for day in sorted(data.keys()):
            for version in sorted(data[day].keys()):
                for is_desktop in sorted(data[day][version].keys()):
                    spamwriter.writerow([day, version, is_desktop,
                                         data[day][version][is_desktop]])
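
# Illustrative usage (hypothetical dates and output file; the GA property ids
# and auth token come from the module defaults unless overridden):
#
#   generate_inproduct(device_type='desktop',
#                      filename='desktop_visits.tsv',
#                      start_date='2015-06-01',
#                      end_date='2015-06-07')
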
from subprocess import call

from lib.database.backend_db import Db

_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_OTHER_SQL_FILE  = _PIPELINE_PATH + 'other.sql'
_COUNTS_SQL_FILE = _PIPELINE_PATH + 'counts.sql'
_DUMP_SQL_FILE   = _PIPELINE_PATH + 'dump.sql'

#TODO(rrayborn): Make this a relative path
_OUTPUT_PATH      = '/var/server/server/useradvocacy/data/static_json/'
_OUTPUT_JSON_NAME = 'hello.json'
_OUTPUT_CSV_NAME  = 'hello.csv.gz'

#TODO(rrayborn): make this more elegant
_INPUT_DB = Db('input')


def update(start_date=date.today() - timedelta(days=6 * 7),
           end_date=date.today() - timedelta(days=1),
           last_run_date=date.today() - timedelta(days=1),
           output_path=_OUTPUT_PATH):
    '''
    Updates the Hello files.

    Args:
        start_date (datetime):    start date of data to pull, inclusive
                                  (default: 42 days ago)
        end_date (datetime):      end date of data to pull, inclusive
                                  (default: 1 day ago)
        last_run_date (datetime): last date that the pipeline was run for
                                  (default: 1 day ago)
        output_path (str):        the location where our files should be output
                                  (default: _OUTPUT_PATH)
    '''
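    # Illustrative call with explicit arguments (hypothetical dates and output
    # path); per the docstring, dates are passed as date/datetime objects:
    #
    #   update(start_date=date(2015, 5, 1),
    #          end_date=date(2015, 5, 31),
    #          output_path='/tmp/hello_out/')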
from subprocess import check_output
from collections import Counter

_OVERLAP = 2  # INCREASE THIS IF GOOGLE STARTS BACK POPULATING DATA

_PIPELINE_PATH = path.dirname(path.realpath(__file__)) + '/'
_DATA_PATH  = _PIPELINE_PATH + 'data/'
_TMP_CSV    = _DATA_PATH + '.google_play_tmp.csv'
_LATEST_CSV = _DATA_PATH + 'google_play_latest.csv'
_ALL_CSV    = _DATA_PATH + 'google_play_all.csv'

_REVIEW_FILE_PATTERN = environ['GS_PLAY_BUCKET'] + \
        '/reviews/reviews_org.mozilla.*%s.csv'

#This should be handled better...
_SENTIMENT_DB = Db('sentiment')

#FLAGS = gflags.FLAGS
#
#gflags.DEFINE_integer('my_version', 0, 'Version number.')
#gflags.DEFINE_string('filename', None, 'Input file name', short_name='f')
#
#gflags.RegisterValidator('my_version',
#                         lambda value: value % 2 == 0,
#                         message='--my_version must be divisible by 2')
#gflags.MarkFlagAsRequired('filename')
#
#
def main():