def do_extract_subset_person_course_tables(the_dataset, pc_table):
    '''
    Extract, from the latest person_course table (specified by dataset and table), the rows for which
    viewed = True (into person_course_viewed) and the rows for which verified_enroll_time is not NULL
    (into person_course_idv).
    '''
    the_sql = "SELECT * from [%s.%s] where viewed" % (the_dataset, pc_table)
    tablename = "person_course_viewed"

    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql, 
                                     overwrite=True,
                                     allowLargeResults=True,
        )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise
    print "  --> created %s" % tablename

    the_sql = "SELECT * from [%s.%s] where verified_enroll_time is not NULL" % (the_dataset, pc_table)
    tablename = "person_course_idv"

    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql, 
                                     overwrite=True,
                                     allowLargeResults=True,
        )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise
    
    print "  --> created %s" % tablename
# NOTE: do_table below is a method fragment from a larger analysis class (the
# enclosing class is not included in this excerpt).  The stub class here is an
# assumption, added only so the fragment parses on its own; the method relies
# on self.dataset, self.gsbucket, self.output_project_id and
# self.skip_or_do_step from the real class.
class _AnalysisStub(object):

    def do_table(self, the_sql, tablename, the_dataset=None, sql_for_description=None, check_skip=True):

        if check_skip:
            if self.skip_or_do_step(tablename) < 0:
                return	# skip step

        if the_dataset is None:
            the_dataset = self.dataset

        print("Computing %s in BigQuery" % tablename)
        sys.stdout.flush()
        try:
            ret = bqutil.create_bq_table(the_dataset, tablename, the_sql, 
                                         overwrite=True,
                                         output_project_id=self.output_project_id,
                                         sql_for_description=sql_for_description or the_sql,
                                     )
        except Exception as err:
            print "ERROR! Failed on SQL="
            print the_sql
            raise

        gsfn = "%s/%s.csv" % (self.gsbucket, tablename)
        bqutil.extract_table_to_gs(the_dataset, tablename, gsfn, 
                                   format='csv', 
                                   do_gzip=False,
                                   wait=False)

        msg = "CSV download link: %s" % gsutil.gs_download_link(gsfn)
        print msg
        bqutil.add_description_to_table(the_dataset, tablename, msg, append=True, project_id=self.output_project_id)
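
# Hedged sketch (not part of the original source): the same create-then-export
# pattern used by do_table above, with the class attributes passed explicitly.
# It only uses calls that already appear in this module (bqutil.create_bq_table,
# bqutil.extract_table_to_gs, gsutil.gs_download_link); the argument values a
# caller would pass (dataset, bucket, project id) are hypothetical.
def _example_table_to_gs(the_dataset, tablename, the_sql, gsbucket, project_id):
    bqutil.create_bq_table(the_dataset, tablename, the_sql,
                           overwrite=True,
                           output_project_id=project_id,
                           sql_for_description=the_sql)
    gsfn = "%s/%s.csv" % (gsbucket, tablename)           # destination CSV in Google Cloud Storage
    bqutil.extract_table_to_gs(the_dataset, tablename, gsfn,
                               format='csv',
                               do_gzip=False,
                               wait=False)
    return gsutil.gs_download_link(gsfn)                 # link reported back to the user
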
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    Make person_course_day tables for the specified course_id.  This version
    produces one table for each day.  It is inefficient when there are
    many days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username, 
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo, 
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sum_dt
    from
    (SELECT username, 
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'pcday_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out,
                                                                                                                 table_out_date,
                                                                                                                 log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),
        
        the_sql = PCDAY_SQL.format(course_id=course_id, 
                                   dataset=log_dataset,
                                   table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
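
# Hedged sketch (not part of the original source): how the loop above derives
# the per-day output table name from a daily tracking-log table id.
def _example_pcday_table_name(table_id):
    # e.g. 'tracklog_20130901' -> date suffix '20130901' -> 'pcday_20130901'
    assert table_id.startswith('tracklog')
    date = table_id[9:]            # strip the 9-character 'tracklog_' prefix
    return 'pcday_%s' % date
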
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False, 
                               start_date=None,
                               end_date=None, 
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    Make the specified table (using the given SQL) for the specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument giving the minimum start date for logs to process, in YYYY-MM-DD format.

    newer_than = optional datetime; if specified, any existing destination table must be
    newer than this datetime, else force_recompute is set to True
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates)==0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-','')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-','')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"    	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, 
                                                                                                                  table_max_date,
                                                                                                                  min_date,
                                                                                                                  max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, 
                                                                                                      table_max_date,
                                                                                                      min_date,
                                                                                                      max_date)
            sys.stdout.flush()
            overwrite = 'append'
    
    if overwrite=='append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]"  % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]"  % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            next_date = None
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    next_date = ld
                    break
            if next_date is None:
                break	# no later log date available; fall through to the error below
            min_date = next_date
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB (max=%s MB), using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, 
                                                                                                             tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date, 
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                   )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ( ('Response too large to return.' in str(err)) and has_hash_limit ):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date, 
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                  )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise
                
                nd1 = int(ndays/2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return


            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date, 
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite=='append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
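
# Hedged sketch (not part of the original source): a minimal SQL template and
# call showing the substitution fields run_query_on_tracking_logs expects
# ({course_id}, {DATASETS}, {last_date}, {hash_limit}).  The query, output
# table name, and the row format assumed by get_date_function are hypothetical;
# note the result is ordered by date, as the docstring requires.
EXAMPLE_DAILY_EVENTS_SQL = """
    SELECT "{course_id}" as course_id,
           date(time) as date,
           count(*) as nevents
    FROM {DATASETS}
    WHERE time > TIMESTAMP("{last_date}")
          {hash_limit}
    GROUP BY course_id, date
    ORDER BY date
"""

def _example_run_daily_event_counts(course_id):
    def get_date(row):
        # assumes each returned row has a 'date' field formatted as YYYY-MM-DD
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')
    run_query_on_tracking_logs(EXAMPLE_DAILY_EVENTS_SQL, "stats_events_by_day", course_id,
                               get_date_function=get_date,
                               has_hash_limit=True)
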
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - because there is so little total data, a single enrollday_all table
    should be created instead of one table per day (see the other function below).

    Makes enrollday2_* tables for the specified course_id.
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}] 
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'enrollday2_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out), 
        
        the_sql = SQL.format(course_id=course_id, 
                             dataset=log_dataset,
                             table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
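
# Hedged sketch (not part of the original source): the enrollday2_* rows
# produced above carry +1/-1 diffs per enrollment event, so the net enrollment
# per mode is simply the sum of those diffs; field names follow the SQL above.
def _example_net_enrollment(rows):
    totals = {'honor': 0, 'verified': 0, 'audit': 0}
    for row in rows:
        totals['honor'] += row['diff_enrollment_honor']
        totals['verified'] += row['diff_enrollment_verified']
        totals['audit'] += row['diff_enrollment_audit']
    return totals
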
def CreateForumPerson(course_id,
                      force_recompute=False,
                      use_dataset_latest=False,
                      skip_last_day=False,
                      end_date=None,
                      has_hash_limit=False,
                      hash_limit=HASH):
    '''
    Create the Forum Person table, based on the forum events and forum posts tables.
    This table contains both reads and writes of all forum posts, for all users.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_PERSON

    original_the_sql = """

                  SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username,
			 "{course_id}" as course_id,
                         (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id,
                         (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type,
                         (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id,
                         (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id,
                         (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster,
                         (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to,
                         (case when PP.username_fp is not null then PP.title else FE.title end) as title,
                         (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote,
                         FE.read as read,
                         FE.pin as pinned,
                         FE.upvote as upvoted,
                         FE.unvote as unvoted,
                         #FE.del as deleted,
                         FE.follow as followed,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.first_time is not null
                                               then TIMESTAMP(FE.first_time)
                                              else FE.last_time end) end) end) as first_time,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.last_time is not null
                                               then TIMESTAMP(FE.last_time)
                                              else FE.first_time end) end) end) as last_time,


                  FROM
                  (
                          # Find 1st level posting => "original_post"
                          SELECT username as username_fp,
                                 slug_id,
                                 slug_type,
                                 thread_id,
                                 parent_id,
                                 original_poster,
                                 responded_to,
                                 title,
                                 1 as wrote,
                                 #created_at as first_time,
                                 first_time
                          FROM [{dataset}.{forum_posts}]
                          {hash_limit_where}
                          ORDER by username_fp, first_time
                  ) PP
                  FULL OUTER JOIN EACH
                  (
                          SELECT username as username_fe, 
                                 MIN(TIMESTAMP(time)) as first_time,
                                 MAX(TIMESTAMP(time)) as last_time,
                                 slug_id,
                                 FE.thread_id as thread_id,
                                 FIRST(parent_id) as parent_id,
				 F.slug_type as slug_type,
				 F.original_poster as original_poster,
				 F.responded_to as responded_to,
				 F.title as title,
                                 #1 as read,
                                 sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read,
                                 sum(case when forum_action = "pin" then 1 else 0 end) as pin,
                                 sum(case when forum_action = "upvote" then 1 else 0 end) as upvote,
                                 sum(case when forum_action = "unvote" then 1 else 0 end) as unvote,
                                 #sum(case when forum_action = "delete" then 1 else 0 end) as del,
                                 sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow,      
                          FROM [{dataset}.{forum_events}] FE
                          JOIN EACH 
                          (
                                 SELECT username as username_fe,
                                        slug_id,
                                        slug_type,
                                        thread_id,
                                        parent_id,
                                        original_poster,
                                        responded_to,
                                        title,
                                        first_time,
                                 FROM [{dataset}.{forum_posts}]
                                 {hash_limit_where}
                          ) as F
                          ON F.thread_id = FE.thread_id
                          WHERE ((FE.forum_action = "read") or 
                                (FE.forum_action = "read_inline") or
                                (FE.forum_action = "pin") or 
                                (FE.forum_action = "upvote") or 
                                (FE.forum_action = "unvote") or
                                #(FE.forum_action = "delete") or
                                (FE.forum_action = "follow_thread"))
                                {hash_limit_and}
                          GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title
                  ) as FE
                  ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id)
                  WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '')

              """

    the_sql = original_the_sql.format(dataset=dataset,
                                      course_id=course_id,
                                      forum=TABLE_FORUM,
                                      forum_posts=TABLE_FORUM_POSTS,
                                      forum_events=TABLE_FORUM_EVENTS,
                                      hash_limit_and='',
                                      hash_limit_where='')

    print "[make_forum_analysis] Creating %s.%s table for %s" % (
        dataset, TABLE_FORUM_PERSON, course_id)
    sys.stdout.flush()

    try:

        tinfo_fe = bqutil.get_bq_table_info(dataset, TABLE_FORUM_EVENTS)
        trows_fe = int(tinfo_fe['numRows'])
        print "[make_forum_analysis] %s Forum Events found " % trows_fe
        tinfo_fp = bqutil.get_bq_table_info(dataset, TABLE_FORUM_POSTS)
        trows_fp = int(tinfo_fp['numRows'])
        print "[make_forum_analysis] %s Forum Posts found " % trows_fp

        assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (
            TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS)
        assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (
            TABLE_FORUM_PERSON, TABLE_FORUM_POSTS)

    except (AssertionError, Exception) as err:

        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % (
            dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON)
        sys.stdout.flush()
        return

    # Now try to create table
    try:

        if has_hash_limit:

            overwrite = True
            hash_limit = int(hash_limit)
            for k in range(hash_limit):
                hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (
                    hash_limit, k)
                hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (
                    hash_limit, k)

                retry_the_sql = original_the_sql.format(
                    dataset=dataset,
                    forum=TABLE_FORUM,
                    forum_posts=TABLE_FORUM_POSTS,
                    forum_events=TABLE_FORUM_EVENTS,
                    hash_limit_and=hash_limit_and,
                    hash_limit_where=hash_limit_where)
                print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                sys.stdout.flush()
                bqutil.create_bq_table(dataset,
                                       table,
                                       retry_the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

        else:

            overwrite = True
            bqutil.create_bq_table(dataset,
                                   table,
                                   the_sql,
                                   wait=True,
                                   overwrite=overwrite,
                                   allowLargeResults=True)

    except Exception as err:

        has_hash_limit = True
        if ((('Response too large to return.' in str(err)) or
             ('Resources exceeded during query execution' in str(err)))
                and has_hash_limit):

            # 'Resources exceeded during query execution'
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit

            try:

                for k in range(hash_limit):

                    hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (
                        hash_limit, k)
                    hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (
                        hash_limit, k)

                    retry_the_sql = original_the_sql.format(
                        dataset=dataset,
                        forum=TABLE_FORUM,
                        forum_posts=TABLE_FORUM_POSTS,
                        forum_events=TABLE_FORUM_EVENTS,
                        hash_limit_and=hash_limit_and,
                        hash_limit_where=hash_limit_where)
                    print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                    sys.stdout.flush()
                    bqutil.create_bq_table(dataset,
                                           table,
                                           retry_the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                    overwrite = "append"

            except Exception as err:

                if ((('Response too large to return.' in str(err)) or
                     ('Resources exceeded during query execution' in str(err)))
                        and has_hash_limit):

                    hash_limit = int(hash_limit * 2.0)
                    print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
                    CreateForumPerson(course_id,
                                      force_recompute,
                                      use_dataset_latest,
                                      skip_last_day,
                                      end_date,
                                      has_hash_limit=True,
                                      hash_limit=hash_limit)

                else:

                    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
                    raise

        else:

            print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
            raise

    print "Done with Forum Person for %s (end %s)" % (course_id,
                                                      datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()

    return
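
# Hedged sketch (not part of the original source): the hash-partitioned
# create-then-append pattern used by CreateForumPerson above, factored into a
# helper.  The SQL template must accept {hash_limit_where} and {hash_limit_and};
# any other substitution fields are supplied by the caller via **fmt.
def _example_create_table_in_hash_partitions(dataset, table, sql_template, nparts, **fmt):
    overwrite = True
    for k in range(nparts):
        part_sql = sql_template.format(
            hash_limit_where="WHERE ABS(HASH(username)) %% %d = %d" % (nparts, k),
            hash_limit_and="and ABS(HASH(username)) %% %d = %d" % (nparts, k),
            **fmt)
        bqutil.create_bq_table(dataset, table, part_sql,
                               wait=True, overwrite=overwrite, allowLargeResults=True)
        overwrite = "append"       # partitions after the first append to the table
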
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRACATED - instead of creating one table per day, because there is so little
    total data, create one enrollday_all table (see other function below).

    make enrollday2_* tables for specified course_id
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}] 
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/', '__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)" % (course_id,
                                               datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [
        x['tableReference']['tableId']
        for x in pcday_tables_info.get('tables', [])
    ]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue

        date = table_id[9:]

        table_out = 'enrollday2_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print("Creating %s " % table_out),

        the_sql = SQL.format(course_id=course_id,
                             dataset=log_dataset,
                             table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def CreateForumPerson( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None, has_hash_limit=False, hash_limit=HASH ):
    '''
    Create Forum Person table, based on forum events and forum posts tables. 
    This table contains both read and writes for all forum posts, for all users.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_PERSON

    original_the_sql = """

                  SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username,
			 "{course_id}" as course_id,
                         (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id,
                         (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type,
                         (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id,
                         (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id,
                         (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster,
                         (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to,
                         (case when PP.username_fp is not null then PP.title else FE.title end) as title,
                         (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote,
                         FE.read as read,
                         FE.pin as pinned,
                         FE.upvote as upvoted,
                         FE.unvote as unvoted,
                         #FE.del as deleted,
                         FE.follow as followed,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.first_time is not null
                                               then TIMESTAMP(FE.first_time)
                                              else FE.last_time end) end) end) as first_time,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.last_time is not null
                                               then TIMESTAMP(FE.last_time)
                                              else FE.first_time end) end) end) as last_time,


                  FROM
                  (
                          # Find 1st level posting => "original_post"
                          SELECT username as username_fp,
                                 slug_id,
                                 slug_type,
                                 thread_id,
                                 parent_id,
                                 original_poster,
                                 responded_to,
                                 title,
                                 1 as wrote,
                                 #created_at as first_time,
                                 first_time
                          FROM [{dataset}.{forum_posts}]
                          {hash_limit_where}
                          ORDER by username_fp, first_time
                  ) PP
                  FULL OUTER JOIN EACH
                  (
                          SELECT username as username_fe, 
                                 MIN(TIMESTAMP(time)) as first_time,
                                 MAX(TIMESTAMP(time)) as last_time,
                                 slug_id,
                                 FE.thread_id as thread_id,
                                 FIRST(parent_id) as parent_id,
				 F.slug_type as slug_type,
				 F.original_poster as original_poster,
				 F.responded_to as responded_to,
				 F.title as title,
                                 #1 as read,
                                 sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read,
                                 sum(case when forum_action = "pin" then 1 else 0 end) as pin,
                                 sum(case when forum_action = "upvote" then 1 else 0 end) as upvote,
                                 sum(case when forum_action = "unvote" then 1 else 0 end) as unvote,
                                 #sum(case when forum_action = "delete" then 1 else 0 end) as del,
                                 sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow,      
                          FROM [{dataset}.{forum_events}] FE
                          JOIN EACH 
                          (
                                 SELECT username as username_fe,
                                        slug_id,
                                        slug_type,
                                        thread_id,
                                        parent_id,
                                        original_poster,
                                        responded_to,
                                        title,
                                        first_time,
                                 FROM [{dataset}.{forum_posts}]
                                 {hash_limit_where}
                          ) as F
                          ON F.thread_id = FE.thread_id
                          WHERE ((FE.forum_action = "read") or 
                                (FE.forum_action = "read_inline") or
                                (FE.forum_action = "pin") or 
                                (FE.forum_action = "upvote") or 
                                (FE.forum_action = "unvote") or
                                #(FE.forum_action = "delete") or
                                (FE.forum_action = "follow_thread"))
                                {hash_limit_and}
                          GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title
                  ) as FE
                  ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id)
                  WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '')

              """

    the_sql = original_the_sql.format( dataset=dataset, course_id=course_id, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and='', hash_limit_where='' )

    print "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, TABLE_FORUM_PERSON, course_id)
    sys.stdout.flush()

    try:

        tinfo_fe = bqutil.get_bq_table_info( dataset, TABLE_FORUM_EVENTS )
        trows_fe = int(tinfo_fe['numRows'])
	print "[make_forum_analysis] %s Forum Events found " % trows_fe
        tinfo_fp = bqutil.get_bq_table_info( dataset, TABLE_FORUM_POSTS )
        trows_fp = int(tinfo_fp['numRows'])
	print "[make_forum_analysis] %s Forum Posts found " % trows_fp

        assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS )
        assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_POSTS ) 

    except (AssertionError, Exception) as err:

        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % ( dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON )
        sys.stdout.flush()
        return

    # Now try to create table
    try:

        if has_hash_limit:

            overwrite = True
            hash_limit = int( hash_limit )
            for k in range( hash_limit ):
                hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )
                hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )

                retry_the_sql = original_the_sql.format( dataset=dataset, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where )
                print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                sys.stdout.flush()
                bqutil.create_bq_table( dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True )
                overwrite = "append"

        else:

	    overwrite = True
            bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)

    except Exception as err:

        has_hash_limit = True
        if ( (('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit ):

            # 'Resources exceeded during query execution'
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit

            try:

                for k in range( hash_limit ):

		    hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )
		    hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )

                    retry_the_sql = original_the_sql.format( dataset=dataset, course_id=course_id, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where )
                    print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                    sys.stdout.flush()
                    bqutil.create_bq_table( dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True )
                    overwrite = "append"
  
            except Exception as err:

                if ( (('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit ):

                    hash_limit = int( hash_limit * 2.0 )
                    print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
                    CreateForumPerson( course_id, force_recompute, use_dataset_latest, skip_last_day, end_date, has_hash_limit=True, hash_limit=hash_limit )

                else:

                    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
                    raise

        else:

            print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
            raise

    print "Done with Forum Person for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()

    return
Example #10
    def actual_ajax_get_report_data(self, report_name=None):
        '''
        get data for custom report.
        parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters

        Defined parameters for SQL:

        {person_course} --> person_course table for the specific course
        {dataset} --> dataset for the specific course
        {course_report} --> course_report_* dataset for the ORG or latest
        {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
        {orgname} --> organization name
        
        '''
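        # Illustrative example (names are hypothetical): a custom report whose SQL is
        #   SELECT username, grade FROM {person_course} WHERE grade IS NOT NULL
        # has {person_course} expanded below to something like
        #   [MITx__6_00x__2013_Spring_latest.person_course]
        # before the query is run.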
        crm, pdata, auth_ok, msg = self.custom_report_auth_check(
            report_name)  # crm = CourseReport model
        if not auth_ok:
            return self.no_auth_sorry()
        course_id = pdata['course_id']
        force_query = pdata.get('force_query', False)
        if force_query == 'false':
            force_query = False
        ignore_cache = pdata.get('ignore_cache', False) or force_query

        # project_id specified?
        optargs = {}
        if 'project_id' in (crm.meta_info or {}):
            optargs['project_id'] = crm.meta_info['project_id']

        if course_id:
            dataset = bqutil.course_id2dataset(
                course_id, use_dataset_latest=self.use_dataset_latest())
            pdata['person_course'] = '[%s.person_course]' % dataset
        elif 'dataset' in (crm.meta_info or {}):
            dataset = crm.meta_info['dataset']
        else:
            dataset = self.get_course_report_dataset()
            # using course report dataset; list the tables, to determine which is the latest
            # person_course table, and use that for {person_course}
            pdata[
                'person_course_latest'] = self.find_latest_person_course_table(
                    dataset, project_id=optargs.get('project_id'))
            pdata['person_course'] = '[%s.%s]' % (
                dataset, pdata['person_course_latest'])
        pdata['dataset'] = dataset
        pdata['course_report'] = self.get_course_report_dataset()
        pdata['course_report_org'] = self.get_course_report_dataset(
            force_use_org=True)
        pdata['orgname'] = self.ORGNAME
        pdata['sane_username'] = self.user.replace(' ', '_').replace(
            '.', '_').replace('@', '_')

        if 'module_id' in pdata and pdata['module_id']:
            url_name = pdata['module_id'].rsplit('/', 1)[-1]
            pdata['module_url_name'] = url_name.replace(':', '__').replace(
                '-', '_')

        # what table?  get custom course report configuration metadata for report name as specified
        table = crm.table_name
        if not table or table == "None":
            error = "No table name defined!  Cannot process this custom report"
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        # multiple table names?  use parameters to select one
        if ',' in table:
            tables = table.split(',')
            try:
                table_number = int(pdata.get('table_number', 0) or 0)
                table = tables[table_number]
            except Exception as err:
                raise Exception(
                    "[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s"
                    % (tables, pdata.get('table_number'), err))

        # allow parameters in table name
        if '{' in table:
            table = table.format(**pdata)
            table = table.replace('-', '_').replace(' ', '_')
        if not ('dataset' in (crm.meta_info or {})) and not table.startswith(
                'stats_') and not ((crm.meta_info or {}).get('no_stats_ok')):
            table = "stats_" + table

        # special handling for person_course table from particular dataset
        for m in re.findall('{person_course__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pcd = '[%s.%s]' % (org_dataset,
                               self.find_latest_person_course_table(
                                   org_dataset,
                                   project_id=optargs.get('project_id')))
            pdata['person_course__' + org] = pcd
            logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

        # special handling for course_report tables for specific orgs
        for m in re.findall('{course_report__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pdata['course_report__' + org] = org_dataset

        logging.info("Using %s for custom report %s person_course" %
                     (pdata.get('person_course'), report_name))

        error = None

        def setup_sql_flags():
            if 'sql_flags' in pdata:
                if not type(pdata['sql_flags']) == dict:
                    try:
                        pdata['sql_flags'] = json.loads(pdata['sql_flags'])
                    except Exception as err:
                        msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % pdata[
                            'sql_flags']
                        msg += str(err)
                        logging.error(msg)
                        raise Exception(msg)

        # dynamic sql: the SQL is allowed to change based on input parameters
        # do this by treating the SQL as a jinja2 template
        if crm.meta_info.get('dynamic_sql'):
            setup_sql_flags()
            # a little sanity checking - disallow spaces in any sql_flags values
            sf = pdata['sql_flags']
            for k in sf:
                if ' ' in sf[k]:
                    msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                    msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                    error = "<pre>%s</pre>" % (msg.replace(
                        '<', '&lt;').replace('>', '&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'
                    self.response.out.write(json.dumps(data))
                    return

            try:
                sql_template = Template(crm.sql)
                sql = sql_template.render(pdata)
            except Exception as err:
                msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (
                    pdata, err)
                logging.error(msg)
                logging.error('sql=%s' % crm.sql)
                error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace(
                    '>', '&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                return

            # append username to table name
            table = table + "_%s" % pdata['sane_username']

            force_query = True  # for now, all dynamic_sql is done with force_query
            ignore_cache = True
            the_sql = sql
        else:
            the_sql = crm.sql

        if 1:
            # generate SQL and depends_on
            try:
                sql = the_sql.format(**pdata)
            except Exception as err:
                msg = "Custom report data: failed to prepare SQL, err=%s" % str(
                    err)
                msg += '\npdata = %s' % pdata
                logging.error(msg)
                if self.is_superuser():
                    error = "<pre>%s</pre>" % (str(msg).replace(
                        '<', '&lt;').replace('>', '&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'
                    self.response.out.write(json.dumps(data))
                    logging.error("Returning with error message")
                    return
                raise

        def strip_brackets(x):
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                x = x[1:-1]
                return x
            return x

        if crm.meta_info.get('debug_sql'):
            msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
            msg += sql
            msg += "\n\nwith these parameters:\n"
            msg += json.dumps(pdata, indent=4)
            msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
            error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace(
                '>', '&gt;'))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        try:
            if crm.depends_on and (not crm.depends_on == "None"):
                depends_on = [
                    strip_brackets(x.format(**pdata))
                    for x in (json.loads(crm.depends_on or "[]"))
                ]
            else:
                depends_on = None
        except Exception as err:
            logging.error(
                "for course report %s, cannot process depends_on=%s" %
                (report_name, crm.depends_on))
            raise Exception(
                "Bad format for the 'depends_on' setting in the custom report specification"
            )

        # get the data, and do query if needed

        logging.info(
            'custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s'
            % (report_name, dataset, table, depends_on, pdata))

        the_msg = []

        def my_logger(msg):
            logging.info(msg)
            the_msg.append(msg)

        def output_error(err):
            error = "<pre>%s</pre>" % (str(err).replace('<', '&lt;').replace(
                '>', '&gt;'))
            logging.error('custom report error %s' % error)
            logging.error(err)
            logging.error(traceback.format_exc())
            # raise
            if self.is_superuser():
                msg = ('\n'.join(the_msg))
                msg = msg.replace('<', '&lt;').replace('>', '&gt;')
                error += "<pre>%s</pre>" % msg
                error += "SQL: <pre>%s</pre>" % sql
                error += "Parameters: <pre>%s</pre>" % json.dumps(pdata,
                                                                  indent=4)
                error += "optargs: <pre>%s</pre>" % json.dumps(optargs,
                                                               indent=4)
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))

        # is the request "indexed", meaning only matching rows of the table are to be returned?
        indexed_column = crm.meta_info.get('indexed')
        if indexed_column:
            if type(indexed_column) == list:
                indexed_columns = indexed_column
                try:
                    table_number = int(pdata.get('table_number', 0) or 0)
                    indexed_column = indexed_columns[table_number]
                except Exception as err:
                    raise Exception(
                        "[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s"
                        % (indexed_columns, pdata.get('table_number'), err))
            setup_sql_flags()
            indexed_value = pdata.get('sql_flags', {}).get('indexed_value')
            logging.info(
                "[custom_reports] retrieving %s.%s with indexing on %s to match value %s"
                % (dataset, table, indexed_column, indexed_value))
            if not indexed_value:
                msg = ('Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s'
                       % (indexed_column, dataset, table))
                my_logger(msg)
                data = {'error': msg}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                return
            # ensure existence of indexed version of table.  By convention, that is a table named tablename + "__indexed_" + indexed_column
            # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters
            # of the SHA1 hash.
            indexed_table = table + "__indexed_" + indexed_column
            indexing_sql_template = """select *,
                                  SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch,
                                  ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum},
                              from [{dataset}.{table}]
                              {where_clause}
                              order by index_sha1_2ch, {indexed_column}
                           """
            indexing_sql = indexing_sql_template.format(
                dataset=dataset,
                table=table,
                indexed_column=indexed_column,
                where_clause="",
                subnum="")
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    indexed_table,
                    sql=indexing_sql,
                    logger=my_logger,
                    depends_on=["%s.%s" % (dataset, table)],
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    startIndex=0,
                    maxResults=1,
                    **optargs)
            except Exception as err:
                if "Response too large to return" in str(the_msg):
                    # hmm - table too large!  can't workaround using allowLargeResult because the return results
                    # need to be ordered.  So let's break it up into multiple queries, appending each,
                    # by index_sha1_2ch
                    b64chars = "+/0123456789" + ''.join(
                        map(chr, range(ord('A'),
                                       ord('Z') + 1))) + ''.join(
                                           map(chr,
                                               range(ord('a'),
                                                     ord('z') + 1)))
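                    # b64chars is the base64 alphabet in ASCII order ("+", "/", digits,
                    # upper case, lower case), so the string comparisons against
                    # index_sha1_2ch below carve the table into ordered slices.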
                    # get table size, divide by 64M, to get number of divisions to use
                    tinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
                    nbytes = int(tinfo['numBytes'])
                    ndivs = int(round(nbytes / (64 * 1024 * 1024)))
                    logging.info(
                        "Response too large - nbytes=%s, so trying ndivs=%s" %
                        (nbytes, ndivs))
                    end_idx = None
                    start_idx = None
                    dn = int(64 / ndivs)
                    offset = dn
                    overwrite = True
                    nrows = 0
                    while (offset < 65):
                        start_idx = end_idx
                        last_row_index = nrows  # note ROW_NUMBER() starts with 1 (not zero)
                        if (offset < 64):
                            end_idx = b64chars[offset] + "+"
                        else:
                            end_idx = None  # boundary case
                        wc = "where "
                        if start_idx:
                            wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (
                                indexed_column, start_idx)
                        else:
                            wc += "True "
                        if end_idx:
                            wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (
                                indexed_column, end_idx)
                        logging.info(
                            "--> start_idx=%s, end_idx=%s, starting row %d" %
                            (start_idx, end_idx, last_row_index))
                        tmp_sql = indexing_sql_template.format(
                            dataset=dataset,
                            table=table,
                            indexed_column=indexed_column,
                            where_clause=wc,
                            subnum="_sub")
                        indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (
                            last_row_index, tmp_sql)
                        try:
                            bqutil.create_bq_table(dataset,
                                                   indexed_table,
                                                   sql=indexing_sql,
                                                   overwrite=overwrite,
                                                   logger=my_logger,
                                                   **optargs)
                            cnt = 0
                            tinfo = None
                            while (not tinfo) and (cnt < 10):
                                tinfo = bqutil.get_bq_table_info(
                                    dataset, indexed_table, **optargs)
                                if not tinfo:
                                    logging.info(
                                        "==> ERROR?  got unexpected None for get_bq_table_info %s.%s"
                                        % (dataset, indexed_table))
                                    time.sleep(10)
                                    cnt += 1

                            nrows = int(tinfo['numRows'])
                            logging.info(
                                "--> Result from %s to %s has %d rows" %
                                (start_idx, end_idx, nrows))
                        except Exception as err:
                            bqdata = {'data': None}
                            sql = indexing_sql
                            output_error(err)
                            return
                        overwrite = "append"
                        offset += dn

                else:
                    bqdata = {'data': None}
                    sql = indexing_sql
                    output_error(err)
                    return

            # now ensure table index, and retrieve it.  It has just two columns: index_sha1_2ch, start_row
            tindex_table = table + "__index_for_" + indexed_column
            tindex_sql = """SELECT index_sha1_2ch, 
                                min(index_row_number) as start_row,
                                # max(index_row_number) as end_row,   # don't need this - just take next start_row
                            FROM [{dataset}.{indexed_table}]
                            group by index_sha1_2ch
                            order by index_sha1_2ch
                         """.format(dataset=dataset,
                                    indexed_table=indexed_table)
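            # The __index_for_ table has one row per two-character bucket, recording the
            # first row number of that bucket in the indexed table; the next bucket's
            # start_row acts as the exclusive end of the range.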
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    tindex_table,
                    sql=tindex_sql,
                    logger=my_logger,
                    depends_on=["%s.%s" % (dataset, indexed_table)],
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    startIndex=0,
                    maxResults=10000,
                    **optargs)
            except Exception as err:
                bqdata = {'data': None}
                sql = tindex_sql
                output_error(err)
                return

            # find the start and end rows to retrieve, based on the last characters of the SHA1 hash of the indexed value
            sha1_2ch = base64.b64encode(
                hashlib.sha1(indexed_value).digest())[-3:-1]
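            # b64encode(sha1(...))[-3:-1] mirrors the SQL's SUBSTR(TO_BASE64(SHA1(...)),-3,2):
            # both take the two characters just before the trailing "=" padding.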
            start_row = None
            end_row = None
            for k in bqdata['data']:
                if start_row and not end_row:
                    end_row = int(k['start_row'])
                if (k['index_sha1_2ch'] == sha1_2ch):
                    start_row = int(k['start_row'])
            logging.info(
                "Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" %
                (indexed_value, sha1_2ch, start_row, end_row, dataset,
                 indexed_table))
            if not start_row:
                output_error(
                    "Cannot find %s=%s in %s.%s" %
                    (indexed_column, indexed_value, dataset, indexed_table))
                bqdata = {'data': None}
                return

            max_results = (end_row or (start_row + 4000)) - start_row
            bqdata = self.cached_get_bq_table(dataset,
                                              indexed_table,
                                              ignore_cache=True,
                                              startIndex=start_row - 1,
                                              maxResults=max_results,
                                              **optargs)

            # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one)
            newdata = []
            for k in range(len(bqdata['data'])):
                datum = bqdata['data'][k]
                if (datum[indexed_column] == indexed_value):
                    newdata.append(datum)
            logging.info("--> Result has %d items, of which %d match" %
                         (len(bqdata['data']), len(newdata)))
            bqdata['data'] = newdata

            table = indexed_table  # so that columns are retrieved properly

        if not indexed_column:
            # retrieve full table
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    table,
                    sql=sql,
                    logger=my_logger,
                    depends_on=depends_on,
                    startIndex=int(pdata['start'] or 0),
                    maxResults=int(pdata['length'] or 100000),
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    **optargs)
                self.fix_bq_dates(bqdata)
            except Exception as err:
                bqdata = {'data': None}
                output_error(err)
                return

        tablecolumns = []
        if pdata['get_table_columns']:
            try:
                tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
            except Exception as err:
                error = (error or "\n") + str(err)
                tableinfo = None
                raise

            if tableinfo:
                fields = tableinfo['schema']['fields']
                field_names = [x['name'] for x in fields]
                tablecolumns = [{
                    'data': x,
                    'title': x,
                    'class': 'dt-center'
                } for x in field_names]

        data = self.common_data.copy()
        data.update({
            'data': bqdata['data'],
            'draw': pdata['draw'],
            'last_modified_date': str(bqdata.get('last_modified_date')),
            'fields': bqdata['fields'],
            'recordsTotal': bqdata.get('numRows', 0),
            'recordsFiltered': bqdata.get('numRows', 0),
            'error': error,
            'tablecolumns': tablecolumns,
            'output_table': table,
            'output_dataset': dataset,
        })

        # logging.info('[cr] data=%s' % data)

        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))
Example #11
def run_query_on_tracking_logs(SQL,
                               table,
                               course_id,
                               force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''
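    # Illustrative call (course and table names are hypothetical); SQL is expected to
    # contain {DATASETS}, {course_id}, {last_date} and, when has_hash_limit is used,
    # a {hash_limit} placeholder:
    #
    #   run_query_on_tracking_logs(SQL, "stats_forum_events", "MITx/6.00x/2013_Spring",
    #                              get_date_function=lambda row: dateutil.parser.parse(row['date']),
    #                              has_hash_limit=True)
    #
    # get_date_function should return a datetime; its maximum over the last rows of the
    # existing table becomes {last_date} for the next incremental append.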

    dataset = bqutil.course_id2dataset(
        course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (
            len(existing or []), dataset)

    if log_dates is None:
        log_tables = [
            x for x in bqutil.get_list_of_table_ids(log_dataset)
            if x.startswith('tracklog_20')
        ]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(
            max(log_dates))  # remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (
            old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (
                min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (
            table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-',
                                        '')  # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(
            dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (
                dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"  # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset,
                                        table,
                                        startIndex=-10,
                                        maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (
                dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    add_date = get_date_function(x)
                    last_dates.append(add_date)
                except Exception as err:
                    print "Error with get_date_function occurred. %s" % str(
                        err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (
                    table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date +
                        datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (
                table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (
            table, course_id, min_date, max_date, last_date,
            datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (
            table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date  # save max_date information

        if min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            # advance min_date to the next available log date within the allowed range
            for ld in log_dates:
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(
            log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
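        # e.g. a 2000 MB daily log with table_max_size_mb=800 gives nhashes = ceil(2000/800) = 3,
        # so that day is processed in three username-hash partitions appended in turn.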
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (
            log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (
                    log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes,
                                                                     k)
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset,
                                           table,
                                           the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id,
                                 DATASETS=from_datasets,
                                 last_date=last_date,
                                 hash_limit="")
            try:
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (
            datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(
                SQL,
                table,
                course_id,
                force_recompute=False,
                use_dataset_latest=use_dataset_latest,
                end_date=end_date,
                get_date_function=get_date_function,
                existing=existing,
                log_dates=log_dates,
                has_hash_limit=has_hash_limit,
                table_max_size_mb=table_max_size_mb,
                limit_query_size=limit_query_size,
            )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id,
                         DATASETS=from_datasets,
                         last_date=last_date,
                         hash_limit="")

    try:
        bqutil.create_bq_table(dataset,
                               table,
                               the_sql,
                               wait=True,
                               overwrite=overwrite,
                               allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=max_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates,
                        limit_query_size=True,
                    )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
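                # e.g. 31 days split into nd1=15 and nd2=16; each half is re-run recursively
                # and may be split again if it still exceeds BigQuery's resource limits.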
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (
                    ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date +
                            datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=force_recompute,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=False,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            if False:

                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])

                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month,
                                                   min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=end_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates)
                    force_recompute = False  # after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (
            datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
Example #12
    def actual_ajax_get_report_data(self, report_name=None):
        '''
        get data for custom report.
        parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters

        Defined parameters for SQL:

        {person_course} --> person_course table for the specific course
        {dataset} --> dataset for the specific course
        {course_report} --> course_report_* dataset for the ORG or latest
        {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
        {orgname} --> organization name
        
        '''
        crm, pdata, auth_ok, msg = self.custom_report_auth_check(report_name)	# crm = CourseReport model
        if not auth_ok:
            return self.no_auth_sorry()
        course_id = pdata['course_id']
        force_query = pdata.get('force_query', False)
        if force_query == 'false':
            force_query = False
        ignore_cache = pdata.get('ignore_cache', False) or force_query

        # project_id specified?
        optargs = {}
        if 'project_id' in (crm.meta_info or {}):
            optargs['project_id'] = crm.meta_info['project_id']

        if course_id:
            dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
            pdata['person_course'] = '[%s.person_course]' % dataset
        elif 'dataset' in (crm.meta_info or {}):
            dataset = crm.meta_info['dataset']
        else:
            dataset = self.get_course_report_dataset()
            # using course report dataset; list the tables, to determine which is the latest
            # person_course table, and use that for {person_course}
            pdata['person_course_latest'] = self.find_latest_person_course_table(dataset, project_id=optargs.get('project_id'))
            pdata['person_course'] = '[%s.%s]' % (dataset, pdata['person_course_latest'])
        pdata['dataset'] = dataset
        pdata['course_report'] = self.get_course_report_dataset()
        pdata['course_report_org'] = self.get_course_report_dataset(force_use_org=True)
        pdata['orgname'] = self.ORGNAME
        pdata['sane_username'] = self.user.replace(' ', '_').replace('.', '_').replace('@', '_')

        if 'module_id' in pdata and pdata['module_id']:
            url_name = pdata['module_id'].rsplit('/',1)[-1]
            pdata['module_url_name'] = url_name.replace(':','__').replace('-','_')

        # what table?  get custom course report configuration metadata for report name as specified
        table = crm.table_name
        if not table or table=="None":
            error = "No table name defined!  Cannot process this custom report"
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))
            return

        # multiple table names?  use parameters to select one
        if ',' in table:
            tables = table.split(',')
            try:
                table_number = int(pdata.get('table_number', 0) or 0)
                table = tables[table_number]
            except Exception as err:
                raise Exception("[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s" % (tables, pdata.get('table_number'), err))

        # allow parameters in table name
        if '{' in table:
            table = table.format(**pdata)
            table = table.replace('-', '_').replace(' ', '_')
        if not ('dataset' in (crm.meta_info or {})) and not table.startswith('stats_') and not ((crm.meta_info or {}).get('no_stats_ok')):
            table = "stats_" + table

        # special handling for person_course table from particular dataset
        for m in re.findall('{person_course__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pcd = '[%s.%s]' % (org_dataset, self.find_latest_person_course_table(org_dataset, project_id=optargs.get('project_id')))
            pdata['person_course__' + org] = pcd
            logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

        # special handling for course_report tables for specific orgs
        for m in re.findall('{course_report__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pdata['course_report__' + org] = org_dataset

        logging.info("Using %s for custom report %s person_course" % (pdata.get('person_course'), report_name))

        error = None

        def setup_sql_flags():
            if 'sql_flags' in pdata:
                if not type(pdata['sql_flags'])==dict:
                    try:
                        pdata['sql_flags'] = json.loads(pdata['sql_flags'])
                    except Exception as err:
                        msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % pdata['sql_flags']
                        msg += str(err)
                        logging.error(msg)
                        raise Exception(msg)

        # dynamic sql: the SQL is allowed to change based on input parameters
        # do this by treating the SQL as a jinja2 template
        if crm.meta_info.get('dynamic_sql'):
            setup_sql_flags()
            # a little sanity checking - disallow spaces in any sql_flags values
            sf = pdata['sql_flags']
            for k in sf:
                if ' ' in sf[k]:
                    msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                    msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                    error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'   
                    self.response.out.write(json.dumps(data))
                    return

            try:
                sql_template = Template(crm.sql)
                sql = sql_template.render(pdata)
            except Exception as err:
                msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (pdata, err)
                logging.error(msg)
                logging.error('sql=%s' % crm.sql)
                error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'   
                self.response.out.write(json.dumps(data))
                return

            # append username to table name
            table = table + "_%s" % pdata['sane_username']

            force_query = True		# for now, all dynamic_sql is done with force_query
            ignore_cache = True
            the_sql = sql
        else:
            the_sql = crm.sql

        if 1:
            # generate SQL and depends_on
            try:
                sql = the_sql.format(**pdata)
            except Exception as err:
                msg = "Custom report data: failed to prepare SQL, err=%s" % str(err)
                msg += '\npdata = %s' %  pdata
                logging.error(msg)
                if self.is_superuser():
                    error = "<pre>%s</pre>" % (str(msg).replace('<','&lt;').replace('<','&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'   
                    self.response.out.write(json.dumps(data))
                    logging.error("Returning with error message")
                    return
                raise

        def strip_brackets(x):
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                x = x[1:-1]
                return x
            return x

        if crm.meta_info.get('debug_sql'):
            msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
            msg += sql
            msg += "\n\nwith these parameters:\n"
            msg += json.dumps(pdata, indent=4)
            msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
            error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))
            return

        try:
            if crm.depends_on and (not crm.depends_on=="None"):
                depends_on = [ strip_brackets(x.format(**pdata)) for x in (json.loads(crm.depends_on or "[]")) ]
            else:
                depends_on = None
        except Exception as err:
            logging.error("for course report %s, cannot process depends_on=%s" % (report_name, crm.depends_on))
            raise Exception("Bad format for the 'depends_on' setting in the custom report specification")

        # get the data, and do query if needed

        logging.info('custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s' % (report_name, dataset, table, depends_on, pdata))

        the_msg = []

        def my_logger(msg):
            logging.info(msg)
            the_msg.append(msg)

        def output_error(err):
            error = "<pre>%s</pre>" % (str(err).replace('<','&lt;').replace('<','&gt;'))
            logging.error('custom report error %s' % error)
            logging.error(err)
            logging.error(traceback.format_exc())
            # raise
            if self.is_superuser():
                msg = ('\n'.join(the_msg))
                msg = msg.replace('<','&lt;').replace('>','&gt;')
                error += "<pre>%s</pre>" % msg
                error += "SQL: <pre>%s</pre>" % sql
                error += "Parameters: <pre>%s</pre>" % json.dumps(pdata, indent=4)
                error += "optargs: <pre>%s</pre>" % json.dumps(optargs, indent=4)
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))

        # is the request "indexed", meaning only matching rows of the table are to be returned?
        indexed_column = crm.meta_info.get('indexed')
        if indexed_column:
            if type(indexed_column)==list:
                indexed_columns = indexed_column
                try:
                    table_number = int(pdata.get('table_number', 0) or 0)
                    indexed_column = indexed_columns[table_number]
                except Exception as err:
                    raise Exception("[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s" % (indexed_columns, 
                                                                                                                                        pdata.get('table_number'), 
                                                                                                                                        err))
            setup_sql_flags()
            indexed_value = pdata.get('sql_flags', {}).get('indexed_value')
            logging.info("[custom_reports] retrieving %s.%s with indexing on %s to match value %s" % (dataset, table, indexed_column, indexed_value))
            if not indexed_value:
                msg = 'Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s' % (indexed_column, dataset, table)
                my_logger(msg)
                data = {'error': msg}
                self.response.headers['Content-Type'] = 'application/json'   
                self.response.out.write(json.dumps(data))
                return
            # ensure existence of indexed version of table.  By convention, that is a table named tablename + "__indexed_" + indexed_column
            # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters
            # of the SHA1 hash.
            indexed_table = table + "__indexed_" + indexed_column
            indexing_sql_template = """select *,
                                  SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch,
                                  ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum},
                              from [{dataset}.{table}]
                              {where_clause}
                              order by index_sha1_2ch, {indexed_column}
                           """
            indexing_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, 
                                                        where_clause="",
                                                        subnum="")
            try:
                bqdata = self.cached_get_bq_table(dataset, indexed_table,
                                                  sql=indexing_sql,
                                                  logger=my_logger,
                                                  depends_on=["%s.%s" % (dataset, table)],
                                                  raise_exception=True,
                                                  ignore_cache=ignore_cache,
                                                  force_query=force_query,
                                                  startIndex=0,
                                                  maxResults=1,
                                                  **optargs
                                                )
            except Exception as err:
                if "Response too large to return" in str(the_msg):
                    # hmm - table too large!  We can't work around this with allowLargeResults because the
                    # returned results need to be ordered.  So break the query into multiple pieces,
                    # appending each piece, partitioned by index_sha1_2ch
                    b64chars = "+/0123456789" + ''.join(map(chr,range(ord('A'), ord('Z')+1))) + ''.join(map(chr,range(ord('a'), ord('z')+1)))
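                    # b64chars is the base64 alphabet in ASCII order ('+' < '/' < digits < uppercase < lowercase),
                    # which matches how BigQuery compares strings, so consecutive characters bound contiguous
                    # ranges of index_sha1_2ch values.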
                    # get table size, divide by 64M, to get number of divisions to use
                    tinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
                    nbytes = int(tinfo['numBytes'])
                    ndivs = int(round(nbytes / (64*1024*1024)))
                    logging.info("Response too large - nbytes=%s, so trying ndivs=%s" % (nbytes, ndivs))
                    end_idx = None
                    start_idx = None
                    dn = max(1, int(64 / max(ndivs, 1)))	# step size through b64chars; clamped so the loop below always advances
                    offset = dn
                    overwrite = True
                    nrows = 0
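                    # Walk through the 64 base64 characters in steps of dn, issuing one query per character
                    # range and appending each chunk to the indexed table; index_row_number stays globally
                    # consistent because each chunk's _sub row numbers are offset by the rows written so far.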
                    while (offset < 64 + dn):	# "+ dn" ensures the final chunk (end_idx=None) runs even when dn does not divide 64
                        start_idx = end_idx
                        last_row_index = nrows	# note ROW_NUMBER() starts with 1 (not zero)
                        if (offset < 64):
                            end_idx = b64chars[offset] + "+"
                        else:
                            end_idx = None	# boundary case
                        wc = "where "
                        if start_idx:
                            wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (indexed_column, start_idx)
                        else:
                            wc += "True "
                        if end_idx:
                            wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (indexed_column, end_idx)
                        logging.info("--> start_idx=%s, end_idx=%s, starting row %d" % (start_idx, end_idx, last_row_index))
                        tmp_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, 
                                                               where_clause=wc, subnum="_sub")
                        indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (last_row_index, tmp_sql)
                        try:
                            bqutil.create_bq_table(dataset, indexed_table,
                                                   sql=indexing_sql,
                                                   overwrite=overwrite,
                                                   logger=my_logger,
                                                   **optargs
                                               )
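                            # poll for the freshly written table's metadata, retrying up to
                            # 10 times with a 10 second pause in between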
                            cnt = 0
                            tinfo = None
                            while (not tinfo) and (cnt < 10):
                                tinfo = bqutil.get_bq_table_info(dataset, indexed_table, **optargs)
                                if not tinfo:
                                    logging.info("==> ERROR?  got unexpected None for get_bq_table_info %s.%s" % (dataset, indexed_table))
                                    time.sleep(10)
                                    cnt += 1

                            nrows = int(tinfo['numRows'])
                            logging.info("--> Result from %s to %s has %d rows" % (start_idx, end_idx, nrows))
                        except Exception as err:
                            bqdata = {'data': None}
                            sql = indexing_sql
                            output_error(err)
                            return
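                        # after the first chunk, switch from overwriting the table to appending to it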
                        overwrite = "append"
                        offset += dn
                        
                else:
                    bqdata = {'data': None}
                    sql = indexing_sql
                    output_error(err)
                    return

            # now ensure table index, and retrieve it.  It has just two columns: index_sha1_2ch, start_row
            tindex_table = table + "__index_for_" + indexed_column
            tindex_sql = """SELECT index_sha1_2ch, 
                                min(index_row_number) as start_row,
                                # max(index_row_number) as end_row,   # don't need this - just take next start_row
                            FROM [{dataset}.{indexed_table}]
                            group by index_sha1_2ch
                            order by index_sha1_2ch
                         """.format(dataset=dataset, indexed_table=indexed_table)
            try:
                bqdata = self.cached_get_bq_table(dataset, tindex_table,
                                                    sql=tindex_sql,
                                                    logger=my_logger,
                                                    depends_on=["%s.%s" % (dataset, indexed_table)],
                                                    raise_exception=True,
                                                    ignore_cache=ignore_cache,
                                                    force_query=force_query,
                                                    startIndex=0,
                                                    maxResults=10000,
                                                    **optargs
                                                )
            except Exception as err:
                bqdata = {'data': None}
                sql = tindex_sql
                output_error(err)
                return

            # find the start and end rows to retrieve, based on the last few characters of the SHA1 hash of the indexed value
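            # base64.b64encode(sha1_digest) is 28 characters ending in "=", so [-3:-1] picks the two
            # characters just before the padding -- the same two selected by SUBSTR(TO_BASE64(SHA1(...)), -3, 2)
            # in the indexing SQL above.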
            sha1_2ch = base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1]
            start_row = None
            end_row = None
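            # The index rows come back ordered by index_sha1_2ch, so the matching bucket's start_row
            # marks where its rows begin, and the next bucket's start_row (if any) marks where they end.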
            for k in bqdata['data']:
                if start_row and not end_row:
                    end_row =  int(k['start_row'])
                if (k['index_sha1_2ch']==sha1_2ch):
                    start_row = int(k['start_row'])
            logging.info("Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" % (indexed_value,
                                                                                    sha1_2ch,
                                                                                    start_row, end_row, dataset, indexed_table))
            if not start_row:
                output_error("Cannot find %s=%s in %s.%s" % (indexed_column, indexed_value,
                                                             dataset, indexed_table))
                bqdata = {'data': None}
                return
                
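            # If the matched bucket is the last one there is no following start_row, so fall back to
            # fetching at most 4000 rows.  index_row_number is 1-based while startIndex is 0-based,
            # hence start_row - 1; the cache is bypassed (ignore_cache=True), presumably so a partial
            # slice is not cached under the indexed table's name.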
            max_results = (end_row or (start_row+4000)) - start_row
            bqdata = self.cached_get_bq_table(dataset, indexed_table, ignore_cache=True,
                                              startIndex=start_row-1,
                                              maxResults=max_results,
                                              **optargs
                                            )

            # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one)
            newdata = []
            for k in range(len(bqdata['data'])):
                datum = bqdata['data'][k]
                if (datum[indexed_column]==indexed_value):
                    newdata.append(datum)
            logging.info("--> Result has %d items, of which %d match" % (len(bqdata['data']), len(newdata)))
            bqdata['data'] = newdata                
            
            table = indexed_table	# so that columns are retrieved properly
            
        if not indexed_column:
            # retrieve full table
            try:
                bqdata = self.cached_get_bq_table(dataset, table, 
                                                  sql=sql,
                                                  logger=my_logger,
                                                  depends_on=depends_on,
                                                  startIndex=int(pdata['start'] or 0), 
                                                  maxResults=int(pdata['length'] or 100000),
                                                  raise_exception=True,
                                                  ignore_cache=ignore_cache,
                                                  force_query=force_query,
                                                  **optargs
                )
                self.fix_bq_dates(bqdata)
            except Exception as err:
                bqdata = {'data': None}
                output_error(err)
                return

        tablecolumns = []
        if pdata['get_table_columns']:
            try:
                tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
            except Exception as err:
                error = (error or "\n") + str(err)
                tableinfo = None
                raise

            if tableinfo:
                fields = tableinfo['schema']['fields']
                field_names = [x['name'] for x in fields]
                tablecolumns = [ { 'data': x, 'title': x, 'class': 'dt-center' } for x in field_names ]

        data = self.common_data.copy()
        data.update({'data': bqdata['data'],
                     'draw': pdata['draw'],
                     'last_modified_date': str(bqdata.get('last_modified_date')),
                     'fields': bqdata['fields'],
                     'recordsTotal': bqdata.get('numRows', 0),
                     'recordsFiltered': bqdata.get('numRows', 0),
                     'error': error,
                     'tablecolumns': tablecolumns,
                     'output_table': table,
                     'output_dataset': dataset,
                 })
        
        
        # logging.info('[cr] data=%s' % data)

        self.response.headers['Content-Type'] = 'application/json'   
        self.response.out.write(json.dumps(data))
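

# Illustrative sketch (not part of the handler above): a standalone helper showing how the
# two-character SHA1 bucket for an indexed value is derived on the Python side, mirroring the
# BigQuery expression SUBSTR(TO_BASE64(SHA1(STRING(col))), -3, 2) used in the indexing SQL.
# The function name is hypothetical and only documents the bucketing convention; Python 2
# byte strings are assumed, matching the handler's own use of hashlib.sha1(indexed_value).
import base64
import hashlib

def sha1_bucket_2ch(indexed_value):
    """Return the two base64 characters that locate indexed_value's row range in the indexed table."""
    digest_b64 = base64.b64encode(hashlib.sha1(indexed_value).digest())   # 28 chars, ends with "="
    return digest_b64[-3:-1]   # the two characters just before the padding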