Python upload_file_to_gs 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gsutil

메소드/함수: upload_file_to_gs

hotexamples.com에서의 예제들: 20

Python upload_file_to_gs - 20개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 gsutil.upload_file_to_gs에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

파일: make_grades_persistent.py 프로젝트: AbdouSeck/edx2bigquery

def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest),
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)

예제 #2

파일 보기

파일: extract_logs_mongo2gs.py 프로젝트: codyaustun/edx2bigquery

def extract_logs_mongo2gs(course_id, start="2012-09-01", end="2014-09-24", verbose=False,
                          dbname=DBNAME,
                          collection = 'tracking_log',
                          tracking_logs_directory="TRACKING_LOGS",
                          ):

    print "extracting logs for course %s" % course_id

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))
    
    if verbose:
        print "Dates to dump:", [x['dstr'] for x in dates]

    # what files already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    DIR = tracking_logs_directory
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    DIR += '/' + path_from_course_id(course_id)
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    filebuf = []
    for k in range(len(dates)-1):
                          
        d = dates[k]
        ofn = '%s/tracklog-%s.json.gz' % (DIR, d['dstr'])
        start = d['start']
        end = d['end']

        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print "Already have %s, skipping" % ofnb
            sys.stdout.flush()
            continue

        # dump tracking log of certain date using mongoexport, if needed
        if not os.path.exists(ofn):
            # db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring", 
            #                       'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
            query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (course_id, start, end)
            cmd = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s" % (dbname, collection, query, ofn)
            # print cmd
            os.system(cmd)
        
        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        filebuf.append(ofn)

        if len(filebuf)>20:
            ffn = filebuf.pop(0)
            os.unlink(ffn)
            print "...Deleted %s" % ffn
            sys.stdout.flush()

예제 #3

파일 보기

파일: make_grades_persistent.py 프로젝트: lstmemery/edx2bigquery

def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    csvfn = '%s/%s/%s/%s' % (basedir, cid.replace('/',
                                                  '__'), datedir, csv_name)
    tempfn = '%s/%s/%s/%s' % (basedir, cid.replace('/',
                                                   '__'), datedir, temp_name)

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not subsection:
        remove_nulls_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)

예제 #4

파일 보기

파일: make_geoip_table.py 프로젝트: sibycharley/edx2bigquery

    def write_geoip_table(self):
        '''
        Write out the geoipdat table if nchanged > 0
        '''
        if not self.nchanged:
            return

        ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
        print "--> new entries added to geoipdat, writing to %s" % (ofn)
        sys.stdout.flush()

        ofp = codecs.open(ofn, 'w', encoding='utf8')
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print "Error!  %s" % err
                sys.stdout.write(repr(val))
                raise
        ofp.close()

        lock_file(self.gipfn)
        try:
            print "--> renaming %s to %s" % (ofn, self.gipfn)
            sys.stdout.flush()
            os.rename(ofn, self.gipfn)
        except Exception as err:
            print "Error %s in renaming gipfn" % str(err)
        lock_file(self.gipfn, release=True)

        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_extra_geoip.json' %
                 mypath).read())['extra_geoip']

        gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
        print "--> Uploading %s to %s" % (self.gipfn, gsp)
        sys.stdout.flush()
        gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

        print "--> Importing %s to %s" % (gsp, self.giptable)
        sys.stdout.flush()
        try:
            bqutil.create_dataset_if_nonexistent(self.gipdataset)
        except Exception as err:
            print "--> Warning: failed to create %s, err=%s" % (gsp, err)
        try:
            bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp,
                                      the_schema)
        except Exception as err:
            print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (
                gsp, self.gipdataset, self.giptable, err)
            print "---> Continuing anyway"
            sys.stdout.flush()

예제 #5

파일 보기

파일: make_cinfo.py 프로젝트: scmc/edx2bigquery

def do_course_listings(course_listings_fn):
    dataset = 'courses'
    table = 'listings'
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv'
    gsutil.upload_file_to_gs(course_listings_fn, gsfn)

    schema = json.loads(open('%s/schemas/schema_course_listings.json' % mypath).read())['course_listings']
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)

예제 #6

파일 보기

def do_user_part_csv(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):
    sdir = find_course_sql_dir(course_id, 
                               basedir=basedir,
                               datedir=datedir,
                               use_dataset_latest=(use_dataset_latest and not pin_date),
    )
    # upload to google storage
    dfn = sdir / "user_api_usercoursetag.csv.gz"

    if not os.path.exists(dfn):
        print("[load_user_part] Missing %s, skipping" % dfn)
        return

    # reformat True / False to 1/0 for "value" field
    if verbose:
        print("[load_user_part] extracting user partition data from %s" % dfn)
        sys.stdout.flush()

    cdr = csv.DictReader(gzip.GzipFile(dfn))
    fields = cdr.fieldnames
    if verbose:
        print("fieldnames = %s" % fields)
    fixed_data = []
    bmap = {'true': 1, 'false': 0}
    for row in cdr:
        vstr = row['value'].lower()
        row['value'] = bmap.get(vstr, vstr)
        fixed_data.append(row)

    ofnb = 'user_partitions.csv.gz'
    odfn = sdir / ofnb
    with gzip.GzipFile(odfn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(fixed_data)
    if verbose:
        print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn))
        sys.stdout.flush()

    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False)
    
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions']

    # import into BigQuery
    table = "user_partitions"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)

예제 #7

파일 보기

파일: make_research_data_tables.py 프로젝트: AbdouSeck/edx2bigquery

    def extractResearchData( self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv', output_bucket=None, basedir='', datedir='', do_gzip=True):
	'''
		Get research data output into tables and archive onto server
	'''
	
	# Archive location
	if course_id is not None: # Individual Course Research Data Products

		self.gsp = gsutil.gs_path_from_course_id( course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True )
		gsfilename  = "%s/%s" % ( self.gsp, RESEARCH_DATA_PRODUCTS[ rdp ] )

	else: 
		print "ERROR! Must specify course_id's.  Aborting."
		return

	try:
		# Copy to Google Storage
		msg = "[researchData]: Copying Research Data table %s to %s" % ( tablename, gsfilename )
		print msg
		#gsfilename  = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp
		gsfilename  = "%s/%s.csv.gz" % ( self.gsp, tablename ) # temp
		ret = bqutil.extract_table_to_gs( the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
		msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename )
		print msg
		sys.stdout.flush()
	
	except Exception as err:

		print str(err)
		if ('BQ Error creating table' in str(err) ):
			msg = "[researchData]: Retrying... by sharding."
			print msg
			sys.stdout.flush()
			gsfilename  = "%s/%s-*.csv.gz" % ( self.gsp, tablename )
			print gsfilename
			sys.stdout.flush()
			ret = bqutil.extract_table_to_gs( the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
			msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename )
			print msg
			sys.stdout.flush()
	

	# Copy from Google Storage to Secure Data Warehouse for archiving
	archiveLocation = find_course_sql_dir(course_id=course_id, basedir=basedir, datedir=datedir, use_dataset_latest=True)
	#time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
	msg = "[researchData]: Archiving Research Data table %s from %s to %s" % ( tablename, gsfilename, archiveLocation )
	print msg
        sys.stdout.flush()
	gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)

	pass

예제 #8

파일 보기

파일: make_geoip_table.py 프로젝트: AbdouSeck/edx2bigquery

    def write_geoip_table(self):
        '''
        Write out the geoipdat table if nchanged > 0
        '''
        if not self.nchanged:
            return

        ofn = 'tmp_geoip_%08d.json' % random.uniform(0,100000000)
        print "--> new entries added to geoipdat, writing to %s" % (ofn)
        sys.stdout.flush()

        ofp = codecs.open(ofn, 'w', encoding='utf8')
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val)+'\n')
            except Exception as err:
                print "Error!  %s" % err
                sys.stdout.write(repr(val))
                raise
        ofp.close()

        lock_file(self.gipfn)
        try:
            print "--> renaming %s to %s" % (ofn, self.gipfn)
            sys.stdout.flush()
            os.rename(ofn, self.gipfn)
        except Exception as err:
            print "Error %s in renaming gipfn" % str(err)
        lock_file(self.gipfn, release=True)
        
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_extra_geoip.json' % mypath).read())['extra_geoip']

        gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
        print "--> Uploading %s to %s" % (self.gipfn, gsp)
        sys.stdout.flush()
        gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')
        
        print "--> Importing %s to %s" % (gsp, self.giptable)
        sys.stdout.flush()
        try:
            bqutil.create_dataset_if_nonexistent(self.gipdataset)
        except Exception as err:
            print "--> Warning: failed to create %s, err=%s" % (gsp, err)
        try:
            bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
        except Exception as err:
            print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err)
            print "---> Continuing anyway"
            sys.stdout.flush()

예제 #9

파일 보기

파일: make_grading_policy_table.py 프로젝트: nilichen/edx2bigquery

def make_gp_table(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):

    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                'course-prod-analytics.xml.tar.gz',
                'course-prod-edge-analytics.xml.tar.gz',
                'course-prod-edx-replica.xml.tar.gz',
            ]
    
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
    
    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
    
    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)

예제 #10

파일 보기

def make_gp_table(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):

    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                'course-prod-analytics.xml.tar.gz',
                'course-prod-edge-analytics.xml.tar.gz',
                'course-prod-edx-replica.xml.tar.gz',
            ]
    
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
    
    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
    
    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)

예제 #11

파일 보기

파일: load_course_sql.py 프로젝트: AbdouSeck/edx2bigquery

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)
            
            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone (pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)

예제 #12

파일 보기

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)
            
            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone (pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)

예제 #13

파일 보기

파일: make_research_data_tables.py 프로젝트: sibycharley/edx2bigquery

    def extractResearchData(self,
                            course_id,
                            tablename,
                            the_dataset=None,
                            rdp=None,
                            rdp_format='csv',
                            output_bucket=None,
                            basedir='',
                            datedir='',
                            do_gzip=True):
        '''
		Get research data output into tables and archive onto server
	'''

        # Archive location
        if course_id is not None:  # Individual Course Research Data Products

            self.gsp = gsutil.gs_path_from_course_id(course_id=course_id,
                                                     gsbucket=output_bucket,
                                                     use_dataset_latest=True)
            gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])

        else:
            print "ERROR! Must specify course_id's.  Aborting."
            return

        try:
            # Copy to Google Storage
            msg = "[researchData]: Copying Research Data table %s to %s" % (
                tablename, gsfilename)
            print msg
            #gsfilename  = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp
            gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
            ret = bqutil.extract_table_to_gs(the_dataset,
                                             tablename,
                                             gsfilename,
                                             format=rdp_format,
                                             do_gzip=True,
                                             wait=True)
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                gsfilename)
            print msg
            sys.stdout.flush()

        except Exception as err:

            print str(err)
            if ('BQ Error creating table' in str(err)):
                msg = "[researchData]: Retrying... by sharding."
                print msg
                sys.stdout.flush()
                gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
                print gsfilename
                sys.stdout.flush()
                ret = bqutil.extract_table_to_gs(the_dataset,
                                                 tablename,
                                                 gsfilename,
                                                 format=rdp_format,
                                                 do_gzip=True,
                                                 wait=True)
                msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                    gsfilename)
                print msg
                sys.stdout.flush()

        # Copy from Google Storage to Secure Data Warehouse for archiving
        archiveLocation = find_course_sql_dir(course_id=course_id,
                                              basedir=basedir,
                                              datedir=datedir,
                                              use_dataset_latest=True)
        #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
        msg = "[researchData]: Archiving Research Data table %s from %s to %s" % (
            tablename, gsfilename, archiveLocation)
        print msg
        sys.stdout.flush()
        gsutil.upload_file_to_gs(src=gsfilename,
                                 dst=archiveLocation,
                                 verbose=True)

        pass

예제 #14

파일 보기

파일: make_grades_persistent.py 프로젝트: sibycharley/edx2bigquery

def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest),
    )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn,
                                           tempfn,
                                           field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)

예제 #15

파일 보기

파일: make_video_analysis.py 프로젝트: musixhine/edx2bigquery

def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (
            dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % (str(err),
                                                         TABLE_VIDEO_AXIS)
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (
            ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (
            va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(
            course_id,
            use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # Lastly, create video stats
    createVideoStats_day(course_id,
                         force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id,
                     force_recompute=force_recompute,
                     use_dataset_latest=use_dataset_latest)

예제 #16

파일 보기

파일: extract_logs_mongo2gs.py 프로젝트: sibycharley/edx2bigquery

def extract_logs_mongo2gs(
    course_id,
    start="2012-09-01",
    end="2014-09-24",
    verbose=False,
    dbname=DBNAME,
    collection='tracking_log',
    tracking_logs_directory="TRACKING_LOGS",
):

    print "extracting logs for course %s" % course_id

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))

    if verbose:
        print "Dates to dump:", [x['dstr'] for x in dates]

    # what files already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    DIR = tracking_logs_directory
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    DIR += '/' + path_from_course_id(course_id)
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    filebuf = []
    for k in range(len(dates) - 1):

        d = dates[k]
        ofn = '%s/tracklog-%s.json.gz' % (DIR, d['dstr'])
        start = d['start']
        end = d['end']

        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print "Already have %s, skipping" % ofnb
            sys.stdout.flush()
            continue

        # dump tracking log of certain date using mongoexport, if needed
        if not os.path.exists(ofn):
            # db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring",
            #                       'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
            query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (
                course_id, start, end)
            cmd = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s" % (
                dbname, collection, query, ofn)
            # print cmd
            os.system(cmd)

        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        filebuf.append(ofn)

        if len(filebuf) > 20:
            ffn = filebuf.pop(0)
            os.unlink(ffn)
            print "...Deleted %s" % ffn
            sys.stdout.flush()

예제 #17

파일 보기

파일: analyze_content.py 프로젝트: codyaustun/edx2bigquery

def analyze_course_content(course_id, 
                           listings_file=None,
                           basedir="X-Year-2-data-sql", 
                           datedir="2013-09-21", 
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of dicussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content. 
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports,, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/',1)[0]	# extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)


        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([ x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)
        
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course
        
        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", 
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title", 
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()
        
        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 
                          'option', 'lti', 'annotationresponse', 
                          'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 
                          'options', 'comment_prompt', 'conditional', 
                          'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 
                          'customtag', 'transcript', 
                          'split_test', 'word_cloud', 
                          'openended', 'openendedparam', 'answer_display', 'code', 
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 
                          'textannotation', 'imageannotation', 'videosequence', 
                          'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [ json.loads(x) for x in open(listings_file) ]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = { x['name'] : x for x in the_schema }

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry:
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    
    print "-"*60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id']==course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)
    
    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)
    
    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()
    
    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong',
              'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata',
              'grading_policy', 'br', 'center',  'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 
              'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if  type(elem.tag)==str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print "    -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), 
                                                                   midfrag, 
                                                                   mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', 
                 # 'discussion', 
                 'choiceresponse', 'word_cloud', 
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse',
                  'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count
    
    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k
    
    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week' : total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() })	# from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    for key in sorted(counts):	# store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] =  str(value) 	# from content counts

    cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() })	# from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id']==course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
        
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)

예제 #18

파일 보기

파일: analyze_content.py 프로젝트: musixhine/edx2bigquery

def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of dicussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content. 
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports,, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry:
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    for key in sorted(
            counts
    ):  # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)

예제 #19

파일 보기

파일: make_video_analysis.py 프로젝트: AbdouSeck/edx2bigquery

def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)
    
    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS )
    the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS )
	videoAxisExists = True
        va_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS )
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS )
        ca_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

예제 #20

파일 보기

파일: axis2bigquery.py 프로젝트: sibycharley/edx2bigquery

def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery
    
    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data)==dict:
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]		# max message length 16384
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)