Exemplo n.º 1
0
def already_exists(course_id, use_dataset_latest=False):
    '''
    Return True if course_axis table already exists, and is sufficiently new
    '''
    table = "course_axis"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    # tables = bqutil.get_list_of_table_ids(dataset)
    # return table in tables
    try:
        mdt = bqutil.get_bq_table_last_modified_datetime(dataset, table)
    except Exception as err:
        return False
    if mdt < datetime.datetime(2015, 10, 31, 17, 00):
        return False
    return True
Exemplo n.º 2
0
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    make person_course_day tables for specified course_id.  This version
    produces one table for each day.  It is inefficient when there are 
    many days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username, 
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo, 
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sum_dt
    from
    (SELECT username, 
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'pcday_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out,
                                                                                                                 table_out_date,
                                                                                                                 log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),
        
        the_sql = PCDAY_SQL.format(course_id=course_id, 
                                   dataset=log_dataset,
                                   table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False, 
                               start_date=None,
                               end_date=None, 
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates)==0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-','')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-','')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"    	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, 
                                                                                                                  table_max_date,
                                                                                                                  min_date,
                                                                                                                  max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, 
                                                                                                      table_max_date,
                                                                                                      min_date,
                                                                                                      max_date)
            sys.stdout.flush()
            overwrite = 'append'
    
    if overwrite=='append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]"  % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]"  % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024)
        nhashes =  int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, 
                                                                                                             tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date, 
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                   )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ( ('Response too large to return.' in str(err)) and has_hash_limit ):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date, 
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                  )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise
                
                nd1 = int(ndays/2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return


            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date, 
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite=='append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
Exemplo n.º 4
0
def run_external_script(extcmd, param, ecinfo, course_id):
    """
    Run external script on specified course.

    extcmd = string specifying external command to run
    param = command line parameters, including extparam
    ecinfo = external command info from edx2bigquery_config
    course_id = course_id to run external command on
    """
    # use default for base set of parameters
    ed_name = ecinfo.get('default_parameters', 'DEFAULT')
    settings = ecinfo.get(ed_name, {})
    settings.update(ecinfo.get(extcmd))
    # print "settings: ", json.dumps(settings, indent=4)
    
    print settings['name']
    
    if param.verbose:
        print settings.get('description', '')

    cidns = course_id.replace('/', '__')
    cidns_nodots = course_id.replace('/', '__').replace('.', '_').replace('-', '_')

    mypath = path(os.path.realpath(__file__)).dirname()
    edx2bigquery_context = {'lib': mypath / "lib",
                            'bin': mypath / "bin",
                        }

    the_template = settings['template'].format(**edx2bigquery_context)
    fnpre = settings['filename_prefix']
    lfn = "%s-%s.log" % (fnpre, cidns)
    if settings.get('logs_dir'):
        lfn = path(settings['logs_dir']) / lfn

    try:
        ofn = settings['script_fn'].format(filename_prefix=fnpre, cidns=cidns)
    except Exception as err:
        print "oops, errr %s" % str(err)
        print "settings=", json.dumps(settings, indent=4)
        raise
    cwd = os.getcwd()

    the_date = str(datetime.datetime.now())

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=param.use_dataset_latest)
    table_prefix = dataset

    if param.force_recompute:
        param.force_recompute = 1
    else:
        param.force_recompute = 0

    context = {'course_id': course_id,
               'script_name': ofn,
               'the_date': the_date,
               'cidns': cidns,
               'cidns_nodots': cidns,
               'template_file': the_template,
               'log_file': lfn,
               'filename_prefix': fnpre,
               'filename_prefix_cidns': "%s__%s" % (fnpre, cidns),
               'working_dir': cwd,
               'table_prefix': table_prefix,
               'lib_dir': edx2bigquery_context['lib'],
               'bin_dir': edx2bigquery_context['bin'],
    }
    context.update(settings)
    context.update(param.__dict__)

    rundir = settings['run_dir'].format(**context)
    runcmd = settings['script_cmd'].format(**context)

    tem = codecs.open(the_template).read()
    tem = unicode(tem)
    try:
        # script_file = tem.format(**context)
        script_file = Template(tem).render(**context)
    except Exception as err:
        print "Oops, cannot properly format template %s" % the_template
        print "Error %s" % str(err)
        print "context: ", json.dumps(context, indent=4)
        raise
    ofndir = path(ofn).dirname()
    if not os.path.exists(ofndir):
        print "[Warning] Directory %s doesn't exist - creating it" % ofndir
        os.mkdir(ofndir)
    fp = codecs.open(ofn, 'w', encoding="utf8")
    fp.write(script_file)
    fp.close()
    print "Generated %s" % ofn

    # if depends_on is defined, and force_recompute is not true, then skip
    # run if output already exists and is newer than all depends_on tables.

    depends_on = settings.get('depends_on')
    output_table = settings.get('output_table')
    if depends_on and not type(depends_on)==list:
        depends_on = [ depends_on ]
    do_compute = param.force_recompute
    if (not param.force_recompute) and depends_on and output_table:
        # does output already exist?
        has_output = False
        try:
            tinfo = bqutil.get_bq_table_info(dataset, output_table)
            if tinfo:
                has_output = True
        except:
            pass
        if not has_output:
            print "Output table %s.%s doesn't exist: running" % (dataset, output_table)
            do_compute = True
        else:
            table_date = tinfo['lastModifiedTime']
            for deptab in depends_on:
                try:
                    dtab_date = bqutil.get_bq_table_last_modified_datetime(dataset, deptab)
                except Exception as err:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if not dtab_date:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if table_date and dtab_date > table_date:
                    do_compute = True
                    break
            if not do_compute:
                print "Output table %s.%s exists and is newer than %s, skipping" % (dataset, output_table, depends_on)
            
    if do_compute:
        os.chdir(rundir)
        print "Working directory: %s" % rundir
        print "Logging to %s" % lfn
        print "Run command: %s" % runcmd
        sys.stdout.flush()
        if not param.skiprun:
            start = datetime.datetime.now()

            if param.submit_condor:
                condor_template_fn = settings.get('condor_job_template', '').format(**edx2bigquery_context)
                if not condor_template_fn:
                    raise Exception("[run_external] missing condor_job_template specification for %s" % (extcmd))
                condor_submit_fn = "CONDOR/{filename_prefix}-{cidns}.submit".format(**context)
                context.update({ 'MEMORY': 32768,
                                 'arguments': '{script_name}'.format(**context),
                                 'executable': context['script_cmd'],
                                 'input_file': '',
                                 'filename': condor_submit_fn,
                                 })
                condor_template = Template(open(condor_template_fn).read()).render(**context)
                dirs = ['CONDOR', 'JOBS']
                for dir in dirs:
                    if not os.path.exists(dir):
                        os.mkdir(dir)
                fp = open(condor_submit_fn, 'w')
                fp.write(condor_template)
                fp.close()
                cmd = "condor_submit %s" % condor_submit_fn
                print cmd
                jobid = None
                for k in os.popen(cmd):
                    m = re.search('submitted to cluster ([0-9]+)', k)
                    if m:
                        jobid = m.group(1)
                dt = str(datetime.datetime.now())
                jobfile = 'condor_jobs.csv'
                open(jobfile, 'a').write("%s,%s,%s,%s\n" % (course_id, dt, jobid, lfn))
                print "[%s] Submitted as condor job %s at %s" % (course_id, jobid, dt)
                # print "[run_external] submitted %s, job=%s" % (extcmd, jobnum)
                return
            else:
                os.system(runcmd)

            if settings.get('type')=="stata":
                # cleanup leftover log file after stata batch run
                batch_log = ofn.split('.')[0] + ".log"
                if os.path.exists(batch_log):
                    os.unlink(batch_log)
                    print "Removed old log file %s" % batch_log

            end = datetime.datetime.now()
            has_output = False
            try:
                tinfo = bqutil.get_bq_table_info(dataset, output_table)
                if tinfo:
                    has_output = True
            except:
                pass
            success = has_output
            dt = end-start
            print "[run_external] DONE WITH %s, success=%s, dt=%s" % (extcmd, success, dt)
            sys.stdout.flush()
            if param.parallel and not success:
                raise Exception("[run_external] External command %s failed on %s" % (extcmd, course_id))
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)
  
    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")
  
    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]
  
    if verbose:
        print "-"*77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-"*77
        sys.stdout.flush()
  
    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size'])<=45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-','')	# YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)
  
        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
                    
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue
  
        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)
  
    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
Exemplo n.º 6
0
    def cached_get_bq_table(self, dataset, table, sql=None, key=None, drop=None,
                            logger=None, ignore_cache=False, 
                            depends_on=None,
                            force_query=False,
                            force_newer_than=None,
                            startIndex=0, maxResults=1000000,
                            allowLargeResults=False,
                            raise_exception=False,
                            project_id=None,
    ):
        '''
        Get a dataset from BigQuery; use memcache.

        If "depends_on" is provided (as a list of strings), and if the desired table
        already exists, then check to make sure it is newer than any of the tables
        listed in "depends_on".

        if force_newer_than is set (should be a datetime) then in the depends_on
        testing, use that date as an override, such that the SQL is re-run if
        the existing table is older than this date.

        project_id: if specified, overrides the default BigQuery project ID (for the actual query)
        '''
        if logger is None:
            logger = logging.info
        memset = '%s.%s' % (dataset,table)
        if startIndex:
            memset += '-%d-%d' % (startIndex, maxResults)
        data = mem.get(memset)

        optargs = {}
        if project_id:
            optargs['project_id'] = project_id

        if depends_on is not None:
            # get the latest mod time of tables in depends_on:
            modtimes = [ bqutil.get_bq_table_last_modified_datetime(*(x.split('.',1)), **optargs) for x in depends_on]
            latest = max([x for x in modtimes if x is not None] or [None])
            
            if not latest:
                raise Exception("[datasource.cached_get_bq_table] Cannot get last mod time for %s (got %s), needed by %s.%s" % (depends_on, modtimes, dataset, table))

            if force_newer_than and force_newer_than > latest:
                latest = force_newer_than

            if data and data.get('lastModifiedTime', None):
                # data has a mod time, let's see if that has expired
                if data.get('lastModifiedTime', None) < latest:
                    ignore_cache = True

            # get the mod time of the computed table, if it exists
            try:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, table, **optargs)
            except Exception as err:
                if 'Not Found' in str(err):
                    table_date = None
                    ignore_cache = True
                    logging.info("[datasource.cached_get_bq_table] Table %s.%s doesn't exist, forcing recomputation" % (dataset, table))
                else:
                    raise

            if table_date and table_date < latest:
                ignore_cache = True
                if sql:
                    force_query = True
                    logging.info("[datasource.cached_get_bq_table] Forcing query recomputation of %s.%s, table_date=%s, latest=%s" % (dataset, table,
                                                                                                                                      table_date, latest))
                else:
                    logging.info("[datasource.cached_get_bq_table] Forcing cache reload of %s.%s, table_date=%s, latest=%s" % (dataset, table,
                                                                                                                               table_date, latest))

            # logging.info("[datasource.cached_get_bq_table] %s.%s table_date=%s, latest=%s, force_query=%s" % (dataset, table, table_date, latest, force_query))

        if (not data) or ignore_cache or (not data['data']):	# data['data']=None if table was empty, and in that case try again
            try:
                data = bqutil.get_bq_table(dataset, table, sql, key=key, logger=logger,
                                           force_query=force_query,
                                           startIndex=startIndex, 
                                           maxResults=maxResults, 
                                           allowLargeResults=allowLargeResults,
                                           **optargs)
            except Exception as err:
                logging.error(err)
                if raise_exception:
                    raise
                data = {'fields': {}, 'field_names': [], 'data': [], 'data_by_key': {}}
                return data		# don't cache empty result
            data['depends_on'] = depends_on
            if (drop is not None) and drop:
                for key in drop:
                    data.pop(key)	# because data can be too huge for memcache ("Values may not be more than 1000000 bytes in length")
            try:
                mem.set(memset, data, time=3600*12)
            except Exception as err:
                logging.error('error doing mem.set for %s.%s from bigquery' % (dataset, table))
        self.bqdata[table] = data
        return data
Exemplo n.º 7
0
def run_query_on_tracking_logs(SQL,
                               table,
                               course_id,
                               force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(
        course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (
            len(existing or []), dataset)

    if log_dates is None:
        log_tables = [
            x for x in bqutil.get_list_of_table_ids(log_dataset)
            if x.startswith('tracklog_20')
        ]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(
            max(log_dates))  # remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (
            old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (
                min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (
            table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-',
                                        '')  # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(
            dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (
                dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"  # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset,
                                        table,
                                        startIndex=-10,
                                        maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (
                dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    add_date = get_date_function(x)
                    last_dates.append(add_date)
                except Exception as err:
                    print "Error with get_date_function occurred. %s" % str(
                        err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (
                    table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date +
                        datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (
                table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (
            table, course_id, min_date, max_date, last_date,
            datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (
            table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date  # save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(
            log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (
            log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (
                    log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes,
                                                                     k)
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset,
                                           table,
                                           the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id,
                                 DATASETS=from_datasets,
                                 last_date=last_date,
                                 hash_limit="")
            try:
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (
            datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(
                SQL,
                table,
                course_id,
                force_recompute=False,
                use_dataset_latest=use_dataset_latest,
                end_date=end_date,
                get_date_function=get_date_function,
                existing=existing,
                log_dates=log_dates,
                has_hash_limit=has_hash_limit,
                table_max_size_mb=table_max_size_mb,
                limit_query_size=limit_query_size,
            )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id,
                         DATASETS=from_datasets,
                         last_date=last_date,
                         hash_limit="")

    try:
        bqutil.create_bq_table(dataset,
                               table,
                               the_sql,
                               wait=True,
                               overwrite=overwrite,
                               allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=max_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates,
                        limit_query_size=True,
                    )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (
                    ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date +
                            datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=force_recompute,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=False,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            if False:

                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])

                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month,
                                                   min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=end_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates)
                    force_recompute = False  # after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (
            datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
Exemplo n.º 8
0
def load_all_daily_logs_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   verbose=True,
                                   wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(
        open('%s/schemas/schema_tracking_log.json' %
             mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace(
            '-', '')  # YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (
                        tablename, fn, file_date, table_date)
                    skip = False

            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (
                        tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset,
                                        tablename,
                                        gsfn,
                                        SCHEMA,
                                        wait=wait,
                                        maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
Exemplo n.º 9
0
    def cached_get_bq_table(
        self,
        dataset,
        table,
        sql=None,
        key=None,
        drop=None,
        logger=None,
        ignore_cache=False,
        depends_on=None,
        force_query=False,
        force_newer_than=None,
        startIndex=0,
        maxResults=1000000,
        allowLargeResults=False,
        raise_exception=False,
        project_id=None,
    ):
        '''
        Get a dataset from BigQuery; use memcache.

        If "depends_on" is provided (as a list of strings), and if the desired table
        already exists, then check to make sure it is newer than any of the tables
        listed in "depends_on".

        if force_newer_than is set (should be a datetime) then in the depends_on
        testing, use that date as an override, such that the SQL is re-run if
        the existing table is older than this date.

        project_id: if specified, overrides the default BigQuery project ID (for the actual query)
        '''
        if logger is None:
            logger = logging.info
        memset = '%s.%s' % (dataset, table)
        if startIndex:
            memset += '-%d-%d' % (startIndex, maxResults)
        data = mem.get(memset)

        optargs = {}
        if project_id:
            optargs['project_id'] = project_id

        table_date = None
        if depends_on is not None:
            # get the latest mod time of tables in depends_on:
            modtimes = [
                bqutil.get_bq_table_last_modified_datetime(
                    *(x.split('.', 1)), **optargs) for x in depends_on
            ]
            latest = max([x for x in modtimes if x is not None] or [None])

            if not latest:
                raise Exception(
                    "[datasource.cached_get_bq_table] Cannot get last mod time for %s (got %s), needed by %s.%s"
                    % (depends_on, modtimes, dataset, table))

            if force_newer_than and force_newer_than > latest:
                latest = force_newer_than

            if data and data.get('lastModifiedTime', None):
                # data has a mod time, let's see if that has expired
                if data.get('lastModifiedTime', None) < latest:
                    ignore_cache = True

            # get the mod time of the computed table, if it exists
            try:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, table, **optargs)
            except Exception as err:
                if 'Not Found' in str(err):
                    table_date = None
                    ignore_cache = True
                    logging.info(
                        "[datasource.cached_get_bq_table] Table %s.%s doesn't exist, forcing recomputation"
                        % (dataset, table))
                else:
                    raise

            if table_date and table_date < latest:
                ignore_cache = True
                if sql:
                    force_query = True
                    logging.info(
                        "[datasource.cached_get_bq_table] Forcing query recomputation of %s.%s, table_date=%s, latest=%s"
                        % (dataset, table, table_date, latest))
                else:
                    logging.info(
                        "[datasource.cached_get_bq_table] Forcing cache reload of %s.%s, table_date=%s, latest=%s"
                        % (dataset, table, table_date, latest))

            # logging.info("[datasource.cached_get_bq_table] %s.%s table_date=%s, latest=%s, force_query=%s" % (dataset, table, table_date, latest, force_query))

        if (not data) or ignore_cache or (
                not data['data']
        ):  # data['data']=None if table was empty, and in that case try again
            try:
                data = bqutil.get_bq_table(dataset,
                                           table,
                                           sql,
                                           key=key,
                                           logger=logger,
                                           force_query=force_query,
                                           startIndex=startIndex,
                                           maxResults=maxResults,
                                           allowLargeResults=allowLargeResults,
                                           **optargs)
                if force_query or not table_date:
                    table_date = bqutil.get_bq_table_last_modified_datetime(
                        dataset, table, **optargs)
                data['last_modified_date'] = table_date
            except Exception as err:
                logging.error(err)
                if raise_exception:
                    raise
                data = {
                    'fields': {},
                    'field_names': [],
                    'data': [],
                    'data_by_key': {}
                }
                return data  # don't cache empty result
            data['depends_on'] = depends_on
            if (drop is not None) and drop:
                for key in drop:
                    data.pop(
                        key
                    )  # because data can be too huge for memcache ("Values may not be more than 1000000 bytes in length")
            try:
                mem.set(memset, data, time=3600 * 12)
            except Exception as err:
                logging.error('error doing mem.set for %s.%s from bigquery' %
                              (dataset, table))
        self.bqdata[table] = data
        return data