def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    dataset if needed, then load the data into the table.

    The course-grade file is processed by default; with ``subsection=True``
    the subsection-grade file and table are used instead.  If the csv.gz file
    is not present in the course SQL directory a message is printed and the
    function returns without uploading anything.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest),
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    # The schema file sits next to this module; the table name is also the
    # top-level key inside the JSON document.
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema_fn = '%s/schemas/schema_%s.json' % (mypath, table)
    with open(schema_fn) as schema_fp:  # close the handle deterministically
        the_schema = json.load(schema_fp)[table]

    if not os.path.exists(csvfn):
        print("[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn)
        return

    # Clean the csv before uploading; the subsection file needs a different
    # field fixed than the course-grade file.
    if subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def extract_logs_mongo2gs(course_id, start="2012-09-01", end="2014-09-24", verbose=False,
                          dbname=DBNAME,
                          collection='tracking_log',
                          tracking_logs_directory="TRACKING_LOGS",
                          ):
    """
    Dump per-day tracking logs for one course from mongodb and upload them
    to the course's DAILY directory on Google Storage.

    Days whose output file name is already listed on Google Storage are
    skipped.  A day is dumped with mongoexport only when its local file does
    not exist yet.  Local files handled in this run are deleted again once
    more than 20 of them have accumulated.

    NOTE(review): the exit status of the mongoexport pipeline is discarded,
    so a failed dump is still uploaded.
    """
    print("extracting logs for course %s" % course_id)

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))

    if verbose:
        dstrs = [x['dstr'] for x in dates]
        print("Dates to dump:" + " " + str(dstrs))

    # what files already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    # local working directory: <tracking_logs_directory>/<course path>
    local_dir = tracking_logs_directory
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)
    local_dir += '/' + path_from_course_id(course_id)
    if not os.path.exists(local_dir):
        os.mkdir(local_dir)

    recent_files = []
    # the last entry of dates is never dumped
    for idx in range(len(dates) - 1):
        day = dates[idx]
        day_start = day['start']
        day_end = day['end']
        ofn = '%s/tracklog-%s.json.gz' % (local_dir, day['dstr'])
        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print("Already have %s, skipping" % ofnb)
            sys.stdout.flush()
            continue

        # dump tracking log of certain date using mongoexport, if needed
        if not os.path.exists(ofn):
            # db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring",
            #                       'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
            query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (course_id, day_start, day_end)
            cmd = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s" % (dbname, collection, query, ofn)
            os.system(cmd)

        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        # keep at most 20 local files around; drop the oldest beyond that
        recent_files.append(ofn)
        if len(recent_files) > 20:
            oldest = recent_files.pop(0)
            os.unlink(oldest)
            print("...Deleted %s" % oldest)
            sys.stdout.flush()
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    dataset if needed, then load the data into the table.

    The local file is looked up under ``<basedir>/<cid with '/' -> '__'>/<datedir>/``.
    For the course-grade file (``subsection=False``) nulls are removed from
    the csv before it is uploaded; the subsection file is uploaded as is.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    # Both files live in the same per-course, per-date directory.
    course_dir = '%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir)
    csvfn = '%s/%s' % (course_dir, csv_name)
    tempfn = '%s/%s' % (course_dir, temp_name)

    # The schema file sits next to this module; the table name is also the
    # top-level key inside the JSON document.
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema_fn = '%s/schemas/schema_%s.json' % (mypath, table)
    with open(schema_fn) as schema_fp:  # close the handle deterministically
        the_schema = json.load(schema_fp)[table]

    if not subsection:
        remove_nulls_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def write_geoip_table(self):
    '''
    Write out the geoipdat table if nchanged > 0

    Every value of self.geoipdat is written as one JSON line to a temporary
    file, which is then renamed over self.gipfn while the lock on self.gipfn
    is held.  The resulting file is uploaded to Google Storage and loaded
    into the BigQuery table self.gipdataset.self.giptable.

    A record that cannot be serialized is reported and the exception is
    re-raised.  Failures in renaming, dataset creation and the BigQuery load
    are only reported; processing continues.
    '''
    if not self.nchanged:
        return

    ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
    print("--> new entries added to geoipdat, writing to %s" % (ofn))
    sys.stdout.flush()

    # the context manager closes the temp file even if a record fails to serialize
    with codecs.open(ofn, 'w', encoding='utf8') as ofp:
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print("Error! %s" % err)
                sys.stdout.write(repr(val))
                raise

    lock_file(self.gipfn)
    try:
        print("--> renaming %s to %s" % (ofn, self.gipfn))
        sys.stdout.flush()
        os.rename(ofn, self.gipfn)
    except Exception as err:
        print("Error %s in renaming gipfn" % str(err))
    finally:
        # always give the lock back, whatever happened above
        lock_file(self.gipfn, release=True)

    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_extra_geoip.json' % mypath) as schema_fp:
        the_schema = json.load(schema_fp)['extra_geoip']

    gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
    print("--> Uploading %s to %s" % (self.gipfn, gsp))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

    print("--> Importing %s to %s" % (gsp, self.giptable))
    sys.stdout.flush()
    try:
        bqutil.create_dataset_if_nonexistent(self.gipdataset)
    except Exception as err:
        # report the dataset that could not be created (the GS path was printed before)
        print("--> Warning: failed to create %s, err=%s" % (self.gipdataset, err))
    try:
        bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
    except Exception as err:
        print("---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err))
        print("---> Continuing anyway")
        sys.stdout.flush()
def do_course_listings(course_listings_fn):
    """
    Upload the course listings csv file to Google Storage and load it into
    the BigQuery table courses.listings.

    :param course_listings_fn: local filename of the course listings csv file
    """
    dataset = 'courses'
    table = 'listings'
    bqutil.create_dataset_if_nonexistent(dataset)

    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv'
    gsutil.upload_file_to_gs(course_listings_fn, gsfn)

    # read the schema with a context manager so the file handle is closed
    with open('%s/schemas/schema_course_listings.json' % mypath) as schema_fp:
        schema = json.load(schema_fp)['course_listings']

    # the first row of the csv is skipped by the loader (skiprows=1)
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
def do_user_part_csv(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None):
    """
    Build the user_partitions table from user_api_usercoursetag.csv.gz.

    The "value" column is rewritten so that true / false (any letter case)
    become 1 / 0; other values are kept, lower-cased.  The result is written
    to user_partitions.csv.gz in the course SQL directory, uploaded to Google
    Storage and loaded into the BigQuery table user_partitions.  If the input
    file is missing a message is printed and nothing else is done.

    :param course_id: the course id
    :param basedir: base directory passed to find_course_sql_dir
    :param datedir: date directory passed to find_course_sql_dir
    :param use_dataset_latest: use the "latest" dataset; for locating the
        local directory this is disabled when pin_date is set
    :param verbose: print progress messages
    :param pin_date: when set, the local directory lookup does not use "latest"
    """
    sdir = find_course_sql_dir(course_id,
                               basedir=basedir,
                               datedir=datedir,
                               use_dataset_latest=(use_dataset_latest and not pin_date),
                               )

    dfn = sdir / "user_api_usercoursetag.csv.gz"
    if not os.path.exists(dfn):
        print("[load_user_part] Missing %s, skipping" % dfn)
        return

    # reformat True / False to 1/0 for "value" field
    if verbose:
        print("[load_user_part] extracting user partition data from %s" % dfn)
        sys.stdout.flush()

    bmap = {'true': 1, 'false': 0}
    fixed_data = []
    # read inside a context manager so the gzip handle is closed when done
    with gzip.GzipFile(dfn) as ifp:
        cdr = csv.DictReader(ifp)
        fields = cdr.fieldnames
        if verbose:
            print("fieldnames = %s" % fields)
        for row in cdr:
            vstr = row['value'].lower()
            row['value'] = bmap.get(vstr, vstr)
            fixed_data.append(row)

    ofnb = 'user_partitions.csv.gz'
    odfn = sdir / ofnb
    with gzip.GzipFile(odfn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(fixed_data)

    if verbose:
        print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn))
        sys.stdout.flush()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False)

    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_user_partitions.json' % mypath) as schema_fp:
        schema = json.load(schema_fp)['user_partitions']

    # import into BigQuery
    table = "user_partitions"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def extractResearchData(self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv',
                        output_bucket=None, basedir='', datedir='', do_gzip=True):
    '''
    Get research data output into tables and archive onto server

    Extracts the BigQuery table the_dataset.tablename to a gzipped file in
    the course's Google Storage path (stored on self.gsp), retrying with a
    sharded "-*" file name when the error text contains
    'BQ Error creating table', then copies the result to the course SQL
    directory.  Without a course_id an error is printed and nothing is done.

    NOTE(review): do_gzip is accepted but not forwarded; the extraction is
    always requested with do_gzip=True.  The file name derived from rdp is
    only used in the first log message.
    '''
    # Individual Course Research Data Products need a course_id
    if course_id is None:
        print("ERROR! Must specify course_id's. Aborting.")
        return

    # Archive location
    self.gsp = gsutil.gs_path_from_course_id(course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True)
    gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])

    try:
        # Copy to Google Storage
        print("[researchData]: Copying Research Data table %s to %s" % (tablename, gsfilename))
        gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
        bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
        print("[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename))
        sys.stdout.flush()

    except Exception as err:
        errtxt = str(err)
        print(errtxt)
        if 'BQ Error creating table' in errtxt:
            print("[researchData]: Retrying... by sharding.")
            sys.stdout.flush()
            gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
            print(gsfilename)
            sys.stdout.flush()
            bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
            print("[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename))
            sys.stdout.flush()

    # Copy from Google Storage to Secure Data Warehouse for archiving
    archiveLocation = find_course_sql_dir(course_id=course_id, basedir=basedir, datedir=datedir, use_dataset_latest=True)
    print("[researchData]: Archiving Research Data table %s from %s to %s" % (tablename, gsfilename, archiveLocation))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)
def write_geoip_table(self):
    '''
    Write out the geoipdat table if nchanged > 0

    Steps, in order:
      1. dump every value of self.geoipdat as a JSON line to a temp file
      2. rename the temp file to self.gipfn while holding the file lock
      3. upload self.gipfn to Google Storage
      4. create the dataset if needed and load the file into
         self.gipdataset.self.giptable

    A value that cannot be dumped is printed and the exception re-raised;
    errors in steps 2 and 4 are printed and otherwise ignored.
    '''
    if not self.nchanged:
        return

    ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
    print("--> new entries added to geoipdat, writing to %s" % (ofn))
    sys.stdout.flush()

    # "with" guarantees the temp file is closed when a dump error propagates
    with codecs.open(ofn, 'w', encoding='utf8') as ofp:
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print("Error! %s" % err)
                sys.stdout.write(repr(val))
                raise

    lock_file(self.gipfn)
    try:
        print("--> renaming %s to %s" % (ofn, self.gipfn))
        sys.stdout.flush()
        os.rename(ofn, self.gipfn)
    except Exception as err:
        print("Error %s in renaming gipfn" % str(err))
    finally:
        # release in "finally" so the lock cannot be left held
        lock_file(self.gipfn, release=True)

    mypath = os.path.dirname(os.path.realpath(__file__))
    schema_fn = '%s/schemas/schema_extra_geoip.json' % mypath
    with open(schema_fn) as schema_fp:
        the_schema = json.load(schema_fp)['extra_geoip']

    gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
    print("--> Uploading %s to %s" % (self.gipfn, gsp))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

    print("--> Importing %s to %s" % (gsp, self.giptable))
    sys.stdout.flush()
    try:
        bqutil.create_dataset_if_nonexistent(self.gipdataset)
    except Exception as err:
        # name the dataset in the warning: that is what failed to be created
        print("--> Warning: failed to create %s, err=%s" % (self.gipdataset, err))
    try:
        bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
    except Exception as err:
        print("---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err))
        print("---> Continuing anyway")
        sys.stdout.flush()
def make_gp_table(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None):
    """
    Build the grading_policy table for a course.

    The grading policy is read from the first course content tar file found
    in the course SQL directory, written to grading_policy.csv in that
    directory, uploaded to Google Storage and loaded into the BigQuery table
    grading_policy.

    :param course_id: the course id
    :param basedir: base directory passed to find_course_sql_dir
    :param datedir: date directory; replaced by pin_date when that is set
    :param use_dataset_latest: use the "latest" dataset; for locating the
        local directory this is disabled when pin_date is set
    :param verbose: passed on to load_grading_policy
    :param pin_date: optional date directory that overrides datedir
    :raises Exception: if none of the candidate tar files exists
    """
    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                 'course-prod-analytics.xml.tar.gz',
                 'course-prod-edge-analytics.xml.tar.gz',
                 'course-prod-edx-replica.xml.tar.gz',
                 ]

    # take the first candidate that exists; the else branch runs only when
    # the loop finished without finding one (fn is then the last name tried)
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    else:
        msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)

    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    # context manager closes the csv file even if writing a row fails
    with open(ofn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(gptab)

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)

    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def copy_if_newer(fn, fnset, options='-z csv,json'):
    """
    Upload fn to Google Storage unless the copy listed in fnset is newer.

    :param fn: local file name
    :param fnset: mapping from file base name to an entry whose 'date' value
        is compared against the local modification time converted to UTC
    :param options: options string passed to gsutil.upload_file_to_gs
    """
    # local modification time, converted to UTC so it can be compared with
    # the date recorded for the file on google cloud storage
    mt = datetime.datetime.fromtimestamp(os.stat(fn).st_mtime)
    utc_dt = local.localize(mt, is_dst=None).astimezone(pytz.utc)

    fnb = os.path.basename(fn)
    if fnb in fnset:
        if fnset[fnb]['date'] > utc_dt:
            print("...%s already copied, skipping" % fn)
            sys.stdout.flush()
            return
        print("...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt))

    gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)
def extractResearchData(self,
                        course_id,
                        tablename,
                        the_dataset=None,
                        rdp=None,
                        rdp_format='csv',
                        output_bucket=None,
                        basedir='',
                        datedir='',
                        do_gzip=True):
    '''
    Get research data output into tables and archive onto server

    The table the_dataset.tablename is extracted to "<gsp>/<tablename>.csv.gz"
    on Google Storage, where gsp is the course's storage path (also kept on
    self.gsp).  When that fails with an error mentioning
    'BQ Error creating table', the extraction is retried with the sharded
    name "<gsp>/<tablename>-*.csv.gz".  Any other error is printed only.
    Afterwards the file is copied to the course SQL directory.

    NOTE(review): the do_gzip argument is never read; both extraction calls
    pass do_gzip=True.
    '''

    def export_and_report(target):
        # run the extraction, then print where the result can be downloaded
        bqutil.extract_table_to_gs(the_dataset,
                                   tablename,
                                   target,
                                   format=rdp_format,
                                   do_gzip=True,
                                   wait=True)
        print("[researchData]: CSV download link: %s" % gsutil.gs_download_link(target))
        sys.stdout.flush()

    # Archive location
    if course_id is not None:  # Individual Course Research Data Products
        self.gsp = gsutil.gs_path_from_course_id(course_id=course_id,
                                                 gsbucket=output_bucket,
                                                 use_dataset_latest=True)
        gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])
    else:
        print("ERROR! Must specify course_id's. Aborting.")
        return

    try:
        # Copy to Google Storage
        print("[researchData]: Copying Research Data table %s to %s" % (tablename, gsfilename))
        gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
        export_and_report(gsfilename)
    except Exception as err:
        print(str(err))
        if 'BQ Error creating table' in str(err):
            print("[researchData]: Retrying... by sharding.")
            sys.stdout.flush()
            gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
            print(gsfilename)
            sys.stdout.flush()
            export_and_report(gsfilename)

    # Copy from Google Storage to Secure Data Warehouse for archiving
    archiveLocation = find_course_sql_dir(course_id=course_id,
                                          basedir=basedir,
                                          datedir=datedir,
                                          use_dataset_latest=True)
    print("[researchData]: Archiving Research Data table %s from %s to %s" % (tablename, gsfilename, archiveLocation))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    dataset if needed, then load the data into the table.

    With ``subsection=False`` the course-grade file and the
    grades_persistent table are used; with ``subsection=True`` the
    subsection-grade file and the grades_persistent_subsection table.  When
    the csv.gz file does not exist in the course SQL directory, a message is
    printed and the function returns early.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest),
    )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    # schema file is keyed by table name, both in its file name and inside the JSON
    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_%s.json' % (mypath, table)) as schema_fp:
        # "with" closes the schema file instead of leaving it to the garbage collector
        the_schema = json.load(schema_fp)[table]

    if not os.path.exists(csvfn):
        print("[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn)
        return

    # the subsection file has a different field that needs fixing
    if subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched

    The video axis is (re)built when it does not exist yet, when
    force_recompute is set, or when the course axis table was modified more
    recently than the video axis table.  Rebuilding also fetches video
    durations (getYoutubeDurations), uploads the resulting file to Google
    Storage and loads it into BigQuery.  The per-day and overall video stats
    are created in every case.

    course_id          = the course id
    api_key            = public API key handed to getYoutubeDurations; must not be None
    basedir, datedir   = used to locate the course SQL directory
    force_recompute    = rebuild the video axis and stats even if they exist
    use_dataset_latest = use the "latest" dataset

    Raises AssertionError if api_key is None, or if the video axis table
    returns no data after it was (re)built.
    '''
    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    with open(SCHEMA_FILE) as schema_fp:  # close the schema file once parsed
        the_schema = json.load(schema_fp)[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except Exception as err:
        # AssertionError is an Exception too: a missing table ends up here
        print("%s --> Attempting to process %s table" % (str(err), TABLE_VIDEO_AXIS))
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except Exception:
        # best effort: without a course axis date the staleness check below is skipped
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print("video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date))
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        # from here on everything downstream is recomputed as well
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print("[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS))

    # Lastly, create video stats
    createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
def extract_logs_mongo2gs(
        course_id,
        start="2012-09-01",
        end="2014-09-24",
        verbose=False,
        dbname=DBNAME,
        collection='tracking_log',
        tracking_logs_directory="TRACKING_LOGS",
):
    """
    Export daily tracking logs of a course from mongodb to Google Storage.

    For every day in the date range except the last entry, the file
    tracklog-<day>.json.gz is produced with mongoexport (unless it already
    exists locally) and uploaded to "<course gs path>/DAILY".  Days already
    present on Google Storage are skipped.  Only the 20 most recently
    handled local files are kept; older ones are removed.

    NOTE(review): the return value of os.system is ignored, so a failing
    export pipeline is not detected here.
    """
    print("extracting logs for course %s" % course_id)

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))
    if verbose:
        print("Dates to dump:" + " " + str([x['dstr'] for x in dates]))

    # what files already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    # create the top-level log directory first, then the course directory in it
    outdir = tracking_logs_directory
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    outdir += '/' + path_from_course_id(course_id)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    query_template = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}'
    cmd_template = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s"

    kept = []
    ndays = len(dates) - 1      # the final entry of dates is not dumped
    for k in range(ndays):
        entry = dates[k]
        ofn = '%s/tracklog-%s.json.gz' % (outdir, entry['dstr'])
        t_from = entry['start']
        t_to = entry['end']
        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print("Already have %s, skipping" % ofnb)
            sys.stdout.flush()
            continue

        # dump tracking log of certain date using mongoexport, if needed
        # e.g. db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring",
        #        'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
        if not os.path.exists(ofn):
            query = query_template % (course_id, t_from, t_to)
            os.system(cmd_template % (dbname, collection, query, ofn))

        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        kept.append(ofn)
        if len(kept) > 20:
            ffn = kept.pop(0)
            os.unlink(ffn)
            print("...Deleted %s" % ffn)
            sys.stdout.flush()
def analyze_course_content(course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split('/',1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. 
Error: %s" % (tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses]) sql = "select * from {course_tables}".format(course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. # get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max([ x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by 
course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= 
cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order 
by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: 
bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) def make_key(key): key = key.strip() key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '') return key listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) 
listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion'] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith('listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') 
# read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [ json.loads(x) for x in open(listings_file) ] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = { x['name'] : x for x in the_schema } # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-"*60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" 
% lfn return data = None for k in csv.DictReader(open(lfn)): if k['course_id']==course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id) return print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title'] def walk_tree(elem): if type(elem.tag)==str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: print " -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) continue walk_tree(k) walk_tree(xml) print 
counts # combine some into "qual_axis" and others into "quant_axis" qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti'] quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse'] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = {'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week' : total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks'] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() }) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts for key in 
sorted(counts): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() }) # from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id']==course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table) gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
def analyze_course_content( course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, pin_date=None, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split( '/', 1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. 
Error: %s" % ( tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([ ('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses ]) sql = "select * from {course_tables}".format( course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table( dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. # get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max( [x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date 
order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= 
cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order 
by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime( 2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float( entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci[ 'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = { 'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: 
bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) listings_keys = map(make_key, [ "Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = [ 'discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion' ] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith( 'listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % ( course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [json.loads(x) 
for x in open(listings_file)] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = {x['name']: x for x in the_schema} # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-" * 60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" 
% lfn return data = None if listings_file.endswith('.json'): data_feed = map(json.loads, open(lfn)) else: data_feed = csv.DictReader(open(lfn)) for k in data_feed: if not 'course_id' in k: print "Strange course listings row, no course_id in %s" % k raise Exception("Missing course_id") if k['course_id'] == course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % ( course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) if pin_date: datedir = pin_date course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest and not pin_date) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % ( xbfn, course_id) if use_dataset_latest: # try looking in earlier directories for xbundle file import glob spath = course_dir / ("../*/xbundle_%s.xml" % cfn) files = list(glob.glob(spath)) if files: xbfn = path(files[-1]) if not xbfn.exists(): print " --> also cannot find any %s ; aborting!" 
% spath else: print " --> Found and using instead: %s " % xbfn if not xbfn.exists(): raise Exception("[analyze_content] missing xbundle file %s" % xbfn) # if there is an xbundle*.fixed file, use that instead of the normal one if os.path.exists(str(xbfn) + ".fixed"): xbfn = path(str(xbfn) + ".fixed") print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = [ 'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title' ] problem_stats = defaultdict(int) def does_problem_have_random_script(problem): ''' return 1 if problem has a script with "random." in it else return 0 ''' for elem in problem.findall('.//script'): if elem.text and ('random.' in elem.text): return 1 return 0 # walk through xbundle def walk_tree(elem, policy=None): ''' Walk XML tree recursively. 
elem = current element policy = dict of attributes for children to inherit, with fields like due, graded, showanswer ''' policy = policy or {} if type(elem.tag) == str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 if elem.tag in [ "sequential", "problem", "problemset", "course", "chapter" ]: # very old courses may use inheritance from course & chapter keys = ["due", "graded", "format", "showanswer", "start"] for k in keys: # copy inheritable attributes, if they are specified val = elem.get(k) if val: policy[k] = val if elem.tag == "problem": # accumulate statistics about problems: how many have show_answer = [past_due, closed] ? have random. in script? problem_stats['n_capa_problems'] += 1 if policy.get('showanswer'): problem_stats["n_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118 # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed. 
problem_stats[ 'n_random_script'] += does_problem_have_random_script(elem) if policy.get('graded') == 'true' or policy.get( 'graded') == 'True': problem_stats['n_capa_problems_graded'] += 1 problem_stats[ 'n_graded_random_script'] += does_problem_have_random_script( elem) if policy.get('showanswer'): problem_stats["n_graded_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_graded_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: try: print " -> excluding %s (%s), ncount=%s" % ( k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) except Exception as err: print " -> excluding ", k continue walk_tree(k, policy.copy()) walk_tree(xml) print "--> Count of individual element tags throughout XML: ", counts print "--> problem_stats:", json.dumps(problem_stats, indent=4) # combine some into "qual_axis" and others into "quant_axis" qual_axis = [ 'openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti' ] quant_axis = [ 'formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse' ] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % ( nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): 
ccd[k['course_id']] = k ccd[course_id] = { 'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week': total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = [ 'course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks' ] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update( {make_key('listings_%s' % key): value for key, value in data.items()}) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts cmfields['filename_xbundle'] = xbfn cmfields['filename_listings'] = lfn for key in sorted( counts ): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts for key in sorted(problem_stats): # store problem stats value = problem_stats[key] cmfields['problem_stat_%s' % key] = str(value) cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items() }) # from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in 
csv.DictReader(open(csvfn_overrides)): if not ovent['course_id'] == course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() # build and output course_listings_and_metainfo dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) mypath = os.path.dirname(os.path.realpath(__file__)) clm_table = "course_listing_and_metainfo" clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table) clm_schema = json.loads(open(clm_schema_file).read()) clm = {} for finfo in clm_schema: field = finfo['name'] clm[field] = cmfields.get(field) clm_fnb = clm_table + ".json" clm_fn = course_dir / clm_fnb open(clm_fn, 'w').write(json.dumps(clm)) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / clm_fnb print "--> Course listing + metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, clm_table) sys.stdout.flush() gsutil.upload_file_to_gs(clm_fn, gsfnp) bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False) # output course_metainfo table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, table) sys.stdout.flush() gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] 
bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched

    course_id = course whose BigQuery dataset is processed
    api_key = public API key handed to getYoutubeDurations; must not be None
    basedir, datedir = passed to find_course_sql_dir to locate the local course directory
    force_recompute = if true, rebuild the video axis even when it already exists; also
                      forwarded to the video stats steps
    use_dataset_latest = forwarded to the bqutil / gsutil helpers when naming the dataset and GS path
    use_latest_sql_dir = OR-ed with use_dataset_latest when locating the local course directory

    Raises AssertionError when api_key is None, or when the video axis table yields no data
    after it has been (re)created.
    '''

    # Raised explicitly (instead of via "assert") so the check survives "python -O";
    # the exception type is unchanged for callers.
    if api_key is None:
        raise AssertionError("[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable")

    # Get Course Dir path
    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)

    # get schema (file is closed as soon as it has been read)
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema_file = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    with open(schema_file) as schema_fp:
        the_schema = json.loads(schema_fp.read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        if tinfo is None:
            # explicit raise rather than assert: with asserts stripped, the flag below
            # would be set to True before the subscript on None failed
            raise AssertionError("[analyze videos] %s.%s does not exist. First time creating table" % (dataset, TABLE_VIDEO_AXIS))
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except Exception as err:
        # any failure here just means we go on and (re)build the table
        print("%s --> Attempting to process %s table" % (str(err), TABLE_VIDEO_AXIS))
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except Exception:
        # best effort: without a course axis date the staleness check below is skipped
        pass

    # a course axis newer than the video axis makes the video axis stale
    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print("video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date))
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        # a freshly built video axis also forces the downstream stats to be recomputed
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        if va is None:
            raise AssertionError("[analyze videos] Possibly no data in video axis table. Check course axis table")
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        bqutil.load_data_to_table(dataset, TABLE_VIDEO_AXIS, gsfn, the_schema, wait=True)

    else:
        print("[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS))

    # Lastly, create video stats
    createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    # also create person_course_video_watched
    createPersonCourseVideo(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery

    cid = course_id
    caset_in = list of course axis data in dict format (deep-copied; the caller's list is not modified)
    xbundle = XML bundle of course (everything except static files); not read here -- the
              xbundle file uploaded is the one found at datadir/xbundle_<cid>.xml
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    use_dataset_latest = forwarded to the gsutil / bqutil helpers when naming the GS path and dataset
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_course_axis.json' % mypath) as schema_fp:
        the_schema = json.loads(schema_fp.read())['course_axis']
    dict_schema = schema2dict(the_schema)

    # work on a private copy, since entries are rewritten in place below
    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json'
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/', '__')))

    # "with" guarantees the output file is closed even if check_schema raises
    with open(cafn, 'w') as fp:
        for linecnt, ca in enumerate(caset, 1):
            ca['course_id'] = cid
            data = ca['data']
            if data and not isinstance(data, dict):
                try:
                    ca['data'] = json.loads(data)  # make it native, for mongo
                except Exception as err:
                    print("failed to create json for %s, error=%s" % (data, err))
            if ca['start'] is not None:
                ca['start'] = str(ca['start'])  # datetime to string
            if ca['due'] is not None:
                ca['due'] = str(ca['due'])  # datetime to string
            if (ca['data'] is None) or (ca['data'] == ''):
                ca.pop('data')  # drop empty data rather than emitting it
            check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
            try:
                # one JSON document per line
                fp.write(json.dumps(ca) + '\n')
            except Exception as err:
                # best effort: report the bad record and keep going
                print("Failed to save! Error=%s, data=%s" % (err, ca))

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
    gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    # attach the processing log to the table description
    banner = "=" * 100 + '\n'
    msg = banner + ("Course axis for %s\n" % (cid)) + banner + '\n'.join(log_msg)
    # max message length 16384
    # NOTE(review): the slice keeps 16184 characters, not 16384 -- possibly deliberate
    # headroom for the append, possibly a typo; confirm before changing.
    msg = msg[:16184]
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print(" Done - inserted %s records into course_axis" % len(caset))