def _count_file_lines(path):
    """Return the number of lines in *path* as reported by ``wc -l``."""
    # split() without an argument tolerates the left-padding some ``wc``
    # implementations emit and works whether the output is bytes or str.
    return int(check_output(["wc", "-l", path]).split()[0])


def _current_commit_hash():
    """Return the output of ``git describe --always``, or '' on failure."""
    try:
        process = Popen(['git', 'describe', '--always'], stdout=PIPE)
        output, _ = process.communicate()
    except OSError:
        # The git executable could not be started.
        return ''
    if process.returncode != 0:
        return ''
    if not isinstance(output, str):
        # The pipe yields bytes on Python 3; decode so that the metadata
        # row does not contain a "b'...'" representation.
        output = output.decode('utf-8', 'replace')
    return output.rstrip()


def process_events(cfg_csv_path, cfg_csv_parsing, cfg_open_edx_spec,
                   timestamp_format):
    """Run every tracking-log event through the pipeline and write the output.

    Each event produced by the CSV extractor is filtered and polished by the
    event formatter, then handed to the resource, submission, curation,
    click-event and event managers. Afterwards the managers are serialized
    and a ``metadata.csv`` row (commit description, duration) is written to
    the MOOCdb CSV directory.

    Args:
        cfg_csv_path: Mapping that provides the keys ``moocdb_csv_dir``,
            ``edx_track_event_path``, ``resource_hierarchy_path`` and
            ``problem_hierarchy_path``. It is also passed to the extractor.
        cfg_csv_parsing: Parsing configuration passed to
            ``extractor.CSVExtractor``.
        cfg_open_edx_spec: Passed to ``ClickEventsManager.record`` with
            every event.
        timestamp_format: Passed to ``EventFormatter`` as
            ``TIMESTAMP_FORMAT``.
    """
    print('****** Processing events *******')
    started_at = time.time()

    # MOOCdb storage interface
    moocdb = MOOCdb(cfg_csv_path['moocdb_csv_dir'])
    try:
        # Instantiating the piping architecture
        event_formatter = EventFormatter(moocdb,
                                         TIMESTAMP_FORMAT=timestamp_format)
        resource_manager = ResourceManager(moocdb, HIERARCHY_ROOT='https://')
        event_manager = EventManager(moocdb)
        submission_manager = SubmissionManager(moocdb)
        curation_helper = CurationHelper(cfg_csv_path['moocdb_csv_dir'])
        clickevents_manager = ClickEventsManager(moocdb)

        print("Processing %s" % cfg_csv_path['edx_track_event_path'])
        extract = extractor.CSVExtractor(cfg_csv_path, cfg_csv_parsing)
        num_rows = _count_file_lines(cfg_csv_path['edx_track_event_path'])
        # Guard the progress computation against an empty line count.
        progress_total = float(max(num_rows, 1))

        event_count = 0
        for raw_event in extract:
            event_count += 1
            if event_count % 500 == 0:
                progress = 'Progress: %0.4f%%' % (
                    100.0 * float(event_count) / progress_total)
                # print() appends a newline, so the ANSI "cursor to the
                # previous line" sequence is emitted afterwards; the next
                # progress message then overwrites this one.
                print(progress)
                sys.stdout.write("\033[F")

            # Skip events explicitly not handled by qpipe
            if event_formatter.pass_filter(raw_event) is False:
                continue

            event = event_formatter.polish(raw_event)
            resource_id = resource_manager.create_resource(event)
            event.set_data_attr('resource_id', resource_id)
            submission_manager.update_submission_tables(event)
            curation_helper.record_curation_hints(event)
            clickevents_manager.record(event, cfg_open_edx_spec)
            event_manager.store_event(event)

        print('* All events processed')
        print('* Writing CSV output to : %s' % cfg_csv_path['moocdb_csv_dir'])

        event_formatter.serialize()
        event_manager.serialize()
        resource_manager.serialize(
            pretty_print_to=cfg_csv_path['resource_hierarchy_path'])
        submission_manager.serialize(
            pretty_print_to=cfg_csv_path['problem_hierarchy_path'])
        curation_helper.serialize()

        print('* Writing resource hierarchy to : %s'
              % cfg_csv_path['resource_hierarchy_path'])
        print('* Writing problem hierarchy to : %s'
              % cfg_csv_path['problem_hierarchy_path'])

        metadata_file_path = os.path.join(cfg_csv_path['moocdb_csv_dir'],
                                          'metadata.csv')
        try:
            os.remove(metadata_file_path)
            print('* Removed old metadata file at %s' % metadata_file_path)
        except OSError:
            # Nothing to remove (or not removable); the file is opened in
            # 'w' mode below, which truncates it anyway.
            pass

        print('* Writing metadata row to : %s' % metadata_file_path)
        commit_hash = _current_commit_hash()
        events_processing_duration = (
            int(time.time() - started_at)) / 60  # minutes
        try:
            with open(metadata_file_path, 'w') as metafile:
                metafile.write('%s,%s\n' % (commit_hash,
                                            events_processing_duration))
        except (IOError, OSError) as exc:
            # Metadata is best-effort: report the problem, do not abort.
            print('* Could not write metadata file %s: %s'
                  % (metadata_file_path, exc))
    finally:
        # Close the storage interface even when the pipeline fails part-way.
        moocdb.close()