def test_location_by_course(self):
    self.upload_tracking_log(self.INPUT_FILE, self.START_DATE)
    for fixture_file_name in self.SQL_FIXTURES:
        self.execute_sql_fixture_file(fixture_file_name)

    self.task.launch([
        'InsertToMysqlCourseEnrollByCountryWorkflow',
        '--source', self.test_src,
        '--interval', self.DATE_INTERVAL.to_string(),
        '--user-country-output', url_path_join(self.test_out, 'user'),
        '--course-country-output', url_path_join(self.test_out, 'country'),
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ])

    with self.export_db.cursor() as cursor:
        cursor.execute('SELECT * FROM course_enrollment_location_current ORDER BY country_code')
        results = cursor.fetchall()

    self.maxDiff = None

    # TODO: what happens if the test starts near the UTC day boundary?  The task sees that today is day "X",
    # yet this code sees the following day since the day boundary was crossed between then and now.
    today = datetime.utcnow().date()
    self.assertItemsEqual(
        [row[1:5] for row in results],
        [
            (today, self.COURSE_ID, '', 1),
            (today, self.COURSE_ID, 'IE', 1),
            (today, self.COURSE_ID, 'TH', 1),
            (today, self.COURSE_ID2, 'TH', 1),
        ]
    )
def test_enrollment_trends(self):
    self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 8, 1))

    blacklist_path = url_path_join(self.test_src, 'blacklist')
    blacklist_date = '2014-08-29'
    blacklist_url = url_path_join(blacklist_path, 'dt=' + blacklist_date, 'blacklist.tsv')
    with S3Target(blacklist_url).open('w') as f:
        f.write('edX/Open_DemoX/edx_demo_course3')

    config_override = {
        'enrollments': {
            'blacklist_date': blacklist_date,
            'blacklist_path': blacklist_path,
        }
    }

    self.task.launch([
        'ImportCourseDailyFactsIntoMysql',
        '--credentials', self.export_db.credentials_file_url,
        '--src', self.test_src,
        '--dest', self.test_out,
        '--name', 'test',
        '--include', '"*"',
        '--run-date', '2014-08-06',
        '--manifest', url_path_join(self.test_root, 'manifest.txt'),
        '--lib-jar', self.oddjob_jar,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ], config_override=config_override)

    self.validate_output()
def setUp(self):
    super(DeidentificationAcceptanceTest, self).setUp()
    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)
    self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
def test_enrollment_validation(self):
    # Initial setup.
    self.upload_tracking_log(self.INPUT_FILE, self.START_DATE)
    self.execute_sql_fixture_file(self.SQL_FIXTURE)
    self.test_validate = url_path_join(self.test_root, 'validate')

    # Run once.  This will generate the new validation events, but
    # will not include them in the validation run (because the
    # requirements for the validation run are computed before any
    # validation events are generated).
    self.test_first_run = url_path_join(self.test_out, 'first_run')
    self.launch_task(self.test_first_run, run_with_validation_events=False)

    # Check that validation took place.
    self.check_validation_events()

    # Run again, with the validation events generated by the first run.
    self.test_second_run = url_path_join(self.test_out, 'second_run')
    self.launch_task(self.test_second_run)

    # Check that synthetic events were created.
    self.check_synthetic_events(self.test_second_run)

    # Run again, with the synthetic events generated by the second run.
    self.test_third_run = url_path_join(self.test_out, 'third_run')
    self.launch_task(self.test_third_run, extra_source=self.test_second_run)

    # Check that no events are output.
    self.check_no_synthetic_events(self.test_third_run)
def _get_required_tasks(self):
    """Internal method to actually calculate required tasks once."""
    start_date = self.interval.date_a
    end_date = self.interval.date_b
    table_name = "student_courseenrollment"
    source_root = url_path_join(self.warehouse_path, table_name)
    today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')

    current_date = start_date
    while current_date <= end_date:
        datestring = current_date.strftime('%Y-%m-%d')
        current_date += datetime.timedelta(days=1)

        src_datestring = "dt={}".format(datestring)
        source_dir = url_path_join(source_root, src_datestring)
        target = get_target_from_url(source_dir)
        output_dir = url_path_join(self.output_root, datestring)
        if datestring == today_datestring:
            yield CreateEnrollmentValidationEventsForTodayTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
                credentials=self.credentials,
            )
        elif target.exists():
            yield CreateEnrollmentValidationEventsTask(
                source_dir=source_dir,
                output_root=output_dir,
                n_reduce_tasks=self.n_reduce_tasks,
            )
def setUp(self):
    super(ObfuscationAcceptanceTest, self).setUp()
    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)
    self.dump_root = url_path_join(self.test_src, "course_exports", "raw")
    self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
    self.test_gpg_key_dir = url_path_join(self.test_root, "gpg-keys")
def check_validation_events(self):
    """Confirm that validation data was properly created."""
    validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE))
    outputs = self.s3_client.list(validate_output_dir)
    outputs = [url_path_join(validate_output_dir, p) for p in outputs]

    # There are 2 courses in the test data.
    self.assertEqual(len(outputs), 2)
def requires(self):
    """Runs each task."""
    output_destination = url_path_join(self.destination, self.name, str(self.date))

    if self.manifest_path is not None:
        manifest = url_path_join(self.manifest_path, "executive-reports", self.name, str(self.date))
    else:
        manifest = None

    common_parameters = {
        "name": self.name,
        "src": self.src,
        "include": self.include,
        "manifest": manifest,
        "credentials": self.credentials,
        "blacklist": self.blacklist,
        "mapreduce_engine": self.mapreduce_engine,
        "lib_jar": self.lib_jar,
        "n_reduce_tasks": self.n_reduce_tasks,
        "destination": output_destination,
        "date": self.date,
    }

    yield (
        WeeklyAllUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=TOTAL_USERS_AND_ENROLLMENTS_NUM_WEEKS,
            **common_parameters
        ),
        WeeklyIncrementalUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **common_parameters
        ),
        EnrollmentsByWeek(
            offsets=self.offsets,
            statuses=self.statuses,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **common_parameters
        ),
        DailyRegistrationsEnrollmentsAndCourses(
            days=DEFAULT_NUM_DAYS,
            **common_parameters
        )
    )
def __init__(self, *args, **kwargs):
    super(DeidentifiedCourseDumpTask, self).__init__(*args, **kwargs)

    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    auth_userprofile_targets = PathSetTask(
        [url_path_join(self.dump_root, filename_safe_course_id, 'state')],
        ['*auth_userprofile*']
    ).output()
    # TODO: Refactor out this logic of getting the latest file.  Right now we expect a date, so we use that.
    dates = [re.search(r"\d{4}-\d{2}-\d{2}", target.path).group() for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    latest_date = sorted(dates)[-1]
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def run_obfuscation_task(self):
    """Run ObfuscatedCourseTask."""
    self.task.launch([
        'ObfuscatedCourseTask',
        '--course', self.filename_safe_course_id,
        '--dump-root', self.dump_root,
        '--obfuscated-output-root', url_path_join(self.test_root, 'obfuscated-output'),
        '--format-version', self.FORMAT_VERSION,
        '--pipeline-version', self.PIPELINE_VERSION,
        '--auth-user-path', url_path_join(self.test_root, 'warehouse', 'auth_user'),
        '--auth-userprofile-path', url_path_join(self.test_root, 'warehouse', 'auth_userprofile')
    ])
def test_answer_distribution(self):
    self.task.launch([
        'AnswerDistributionOneFilePerCourseTask',
        '--src', self.test_src,
        '--dest', url_path_join(self.test_root, 'dst'),
        '--name', 'test',
        '--output-root', self.test_out,
        '--include', '"*"',
        '--manifest', url_path_join(self.test_root, 'manifest.txt'),
        '--base-input-format', self.input_format,
        '--lib-jar', self.oddjob_jar,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ])

    self.validate_output()
def setUp(self):
    super(FinancialReportsAcceptanceTest, self).setUp()

    for input_file_name in ('paypal.tsv', 'cybersource_test.tsv'):
        src = url_path_join(self.data_dir, 'input', input_file_name)
        dst = url_path_join(self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, input_file_name)
        self.upload_file(src, dst)

    empty_file_path = url_path_join(
        self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, 'cybersource_empty_test.tsv')
    self.upload_file_with_content(empty_file_path, '')

    self.prepare_database('lms', self.import_db)
    self.prepare_database('otto', self.otto_db)
def test_answer_distribution_mysql(self):
    self.task.launch([
        'AnswerDistributionToMySQLTaskWorkflow',
        '--src', self.test_src,
        '--dest', url_path_join(self.test_root, 'dst'),
        '--name', 'test',
        '--include', '"*"',
        '--manifest', url_path_join(self.test_root, 'manifest.txt'),
        '--base-input-format', self.input_format,
        '--lib-jar', self.oddjob_jar,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--credentials', self.export_db.credentials_file_url,
    ])

    self.validate_output()
def __init__(self, *args, **kwargs):
    super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
    dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
    auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
    # TODO: Refactor out this logic of getting the latest file.  Right now we expect a date, so we use that.
    dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
    # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override?
    # This should raise an error if no data is found, rather than failing later with a cryptic
    # 'index out of range' error.
    if len(dates) == 0:
        raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
    latest_date = sorted(dates)[-1]
    self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
    self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
def test_student_engagement(self):
    self.upload_tracking_log(self.INPUT_FILE, datetime.date(2015, 4, 10))
    self.execute_sql_fixture_file('load_student_engagement.sql')

    self.interval = '2015-04-06-2015-04-20'  # run for exactly two weeks

    for interval_type in ['daily', 'weekly', 'all']:
        self.run_task(interval_type)

        for course_id in self.ALL_COURSES:
            hashed_course_id = hashlib.sha1(course_id).hexdigest()
            course_dir = url_path_join(self.test_out, interval_type, hashed_course_id)
            csv_filenames = list(self.s3_client.list(course_dir))

            # Check expected number of CSV files.
            if interval_type == 'daily':
                self.assertEqual(len(csv_filenames), 14)
            elif interval_type == 'weekly':
                self.assertEqual(len(csv_filenames), 2)
            elif interval_type == 'all':
                self.assertEqual(len(csv_filenames), 1)

            # Check that the CSV files contain the expected data.
            for csv_filename in csv_filenames:
                # Parse expected date from filename.
                if interval_type == 'all':
                    expected_date = '2015-04-19'
                else:
                    csv_pattern = '.*student_engagement_.*_(\\d\\d\\d\\d-\\d\\d-\\d\\d)\\.csv'
                    match = re.match(csv_pattern, csv_filename)
                    expected_date = match.group(1)

                # Build dataframe from csv file generated from events.
                actual_dataframe = []
                with S3Target(url_path_join(course_dir, csv_filename)).open() as csvfile:
                    actual_dataframe = read_csv(csvfile)
                    actual_dataframe.fillna('', inplace=True)

                self.check_engagement_dataframe(actual_dataframe, interval_type, course_id, expected_date)

                # Validate specific values:
                expected_dataframe = self.get_expected_engagement(interval_type, hashed_course_id, csv_filename)
                if expected_dataframe is not None:
                    assert_frame_equal(actual_dataframe, expected_dataframe, check_names=True)
                else:
                    self.assert_zero_engagement(actual_dataframe)
def check_validation_events(self): """Confirm that validation data was properly created.""" validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE)) outputs = self.get_targets_from_remote_path(validate_output_dir) # There are 2 courses in the test data. self.assertEqual(len(outputs), 2)
def test_event_log_exports_using_manifest(self):
    config_override = {
        'manifest': {
            'threshold': 1
        }
    }

    folders = {
        'prod': self.PROD_FOLDER,
        'edge': self.EDGE_FOLDER
    }
    for environment in ['prod', 'edge']:
        self.task.launch([
            'EventExportTask',
            '--source', url_path_join(self.test_src, environment),
            '--output-root', self.test_out,
            '--config', self.test_config,
            '--environment', environment,
            '--interval', '2014-05',
            '--gpg-key-dir', self.test_gpg_key_dir,
            '--gpg-master-key', '*****@*****.**',
            '--required-path-text', folders[environment],
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ], config_override)

    self.validate_output()
def output(self): return get_target_from_url( url_path_join( self.destination, "incremental_users_and_enrollments_{0}.csv".format(self.name) ) )
def output(self):
    return get_target_from_url(url_path_join(
        self.output_root,
        'transaction',
        'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
        'transactions.csv'
    ))
def multi_output_reducer(self, key, values, output_file):
    """
    Write values to the appropriate file as determined by the key.

    Write to the encrypted file by streaming through gzip, which compresses before encrypting.
    """
    _date_string, org_id = key
    recipients = self.recipients_for_org_id[org_id]
    log.info('Encryption recipients: %s', str(recipients))

    def report_progress(num_bytes):
        """Update hadoop counters as the file is written."""
        self.incr_counter('Event Export', 'Bytes Written to Output', num_bytes)

    key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
    with make_encrypted_file(output_file, key_file_targets, progress=report_progress) as encrypted_output_file:
        outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
        try:
            for value in values:
                outfile.write(value.strip())
                outfile.write('\n')
                # WARNING: This line ensures that Hadoop knows that our process is not sitting in an
                # infinite loop.  Do not remove it.
                self.incr_counter('Event Export', 'Raw Bytes Written', len(value) + 1)
        finally:
            outfile.close()
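# A minimal standalone sketch (not part of the pipeline) of the gzip-into-a-file-object pattern used by
# multi_output_reducer above: gzip.GzipFile can wrap any writable file object, so data is compressed
# before it reaches the underlying writer.  Here an in-memory buffer stands in for the encrypted output
# stream produced by make_encrypted_file; the Hadoop counters are omitted.
import gzip
import io

buffer = io.BytesIO()
compressor = gzip.GzipFile(mode='wb', fileobj=buffer)
try:
    for line in [b'first event', b'second event']:
        compressor.write(line.strip())
        compressor.write(b'\n')
finally:
    compressor.close()
compressed_bytes = buffer.getvalue()  # gzip-compressed data, ready to hand to the next layer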
def test_demographic_trends(self):
    self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 8, 1))
    self.execute_sql_fixture_file('load_auth_userprofile.sql')

    blacklist_date = '2014-08-29'
    blacklist_url = url_path_join(
        self.warehouse_path, 'course_enrollment_blacklist', 'dt=' + blacklist_date, 'blacklist.tsv')
    with S3Target(blacklist_url).open('w') as s3_file:
        s3_file.write('edX/Open_DemoX/edx_demo_course3')

    config_override = {
        'enrollments': {
            'blacklist_date': blacklist_date,
        }
    }

    self.task.launch([
        'ImportDemographicsIntoMysql',
        '--interval', '2014-08-01-2014-08-06',
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ], config_override=config_override)

    self.validate_gender()
    self.validate_birth_year()
    self.validate_education_level()
def test_end_to_end_without_vertica(self):
    # Similar to test_end_to_end, but excludes the Vertica part and checks data values,
    # not just data shape.
    table_name = 'reconciled_order_transactions'
    output_root = url_path_join(
        self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
    ) + '/'
    self.task.launch([
        'ReconcileOrdersAndTransactionsTask',
        '--import-date', self.UPPER_BOUND_DATE,
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
        '--output-root', output_root,
    ])

    final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
        import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
    )
    columns = [x[0] for x in final_output_task.columns]

    expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
    expected = pandas.read_csv(expected_output_csv, parse_dates=True)

    raw_output = self.read_dfs_directory(output_root)
    output = StringIO(raw_output.replace('\t\\N', '\t'))
    data = pandas.read_table(output, header=None, names=columns, parse_dates=True)

    # Re-order the dataframes for a consistent comparison:
    for frame in (data, expected):
        frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
        frame.reset_index(drop=True, inplace=True)

    self.assert_data_frames_equal(data, expected)
def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = boto.connect_s3()
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: implement exclude_zero_length to match S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
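# A small illustrative check (hypothetical patterns and paths) of how the include expressions above
# behave with fnmatch: in the local-filesystem branch, patterns are matched against the path relative
# to the src directory, and fnmatch patterns are anchored at both ends.
import fnmatch

assert fnmatch.fnmatch('tracking.log', '*.log')
assert fnmatch.fnmatch('reports/part-00000', '*part-*')
assert not fnmatch.fnmatch('reports/part-00000', 'part-*')  # no leading '*', so a nested path does not match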
def output_path_for_key(self, course_id):
    filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
    filename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
        course_id=filename_safe_course_id,
        dumpdate=self.dump_date,
    )
    return url_path_join(self.output_root, filename)
def partition_location(self):
    """Provides location of Hive database table's partition data."""
    # The actual folder name where the data is stored is expected to be in the format <key>=<value>.
    partition_name = '='.join(self.partition.items()[0])
    # Make sure that input path ends with a slash, to indicate a directory.
    # (This is necessary for S3 paths that are output from Hadoop jobs.)
    return url_path_join(self.table_location, partition_name + '/')
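# A hypothetical illustration of how partition_location composes the Hive-style partition path.
# The partition is assumed to hold a single key/value pair, e.g. {'dt': '2014-08-01'};
# (self.partition.items()[0] is a Python 2 idiom, hence the list() wrapper here).
partition = {'dt': '2014-08-01'}
partition_name = '='.join(list(partition.items())[0])  # 'dt=2014-08-01'
# url_path_join(table_location, partition_name + '/') would then yield something like:
#   s3://bucket/warehouse/student_courseenrollment/dt=2014-08-01/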
def output(self):
    return get_target_from_url(
        url_path_join(
            self.user_country_output,
            'dt={0}/'.format(self.interval.date_b.strftime('%Y-%m-%d'))  # pylint: disable=no-member
        )
    )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.output_root,
            'count-user-activity-per-interval-{interval}.tsv/'.format(interval=self.interval),
        )
    )
def output(self):
    return get_target_from_url(
        url_path_join(
            self.destination,
            'total_users_and_enrollments_{0}-{1}.csv'.format(self.start_date, self.date)
        )
    )
def output(self):
    if len(self.input()['data']) == 0:
        raise IOError("Course File '{filename}' not found for course '{course}'".format(
            filename=self.file_pattern, course=self.course
        ))
    output_filename = os.path.basename(self.input()['data'][0].path)
    return get_target_from_url(url_path_join(self.output_directory, output_filename))
def upload_public_keys(self):
    gpg_key_dir = os.path.join('gpg-keys')
    for key_filename in os.listdir(gpg_key_dir):
        full_local_path = os.path.join(gpg_key_dir, key_filename)
        remote_url = url_path_join(self.test_gpg_key_dir, key_filename)

        if not key_filename.endswith('.key'):
            self.s3_client.put(full_local_path, remote_url)
def requires(self):
    table_name = 'courseware_studentmodule'
    return SqoopImportFromMysql(
        credentials=self.credentials,
        destination=url_path_join(self.dest, table_name),
        table_name=table_name,
        num_mappers=self.num_mappers,
        overwrite=self.sqoop_overwrite,
    )
def upload_data(self): """Puts the test course catalog where the processing task would look for it, bypassing calling the actual API""" src = os.path.join(self.data_dir, 'input', self.INPUT_FILE) # IMPORTANT: this path should be of the same format as the path that DailyPullCatalogTask uses for output. dst = url_path_join(self.warehouse_path, "course_catalog", "catalog", "dt=2015-06-29", self.INPUT_FILE) # Upload mocked results of the API call self.s3_client.put(src, dst)
def requires(self):
    table_name = 'courseware_studentmodule'
    return SqoopImportFromMysql(
        credentials=self.credentials,
        destination=url_path_join(self.dump_root, table_name),
        table_name=table_name,
        num_mappers=self.num_mappers,
        where=self.where,
        verbose=self.verbose
    )
def output_path_for_key(self, course_id): template = "{course_id}-courseware_studentmodule-{suffix}analytics.sql" filename = template.format( course_id=opaque_key_util.get_filename_safe_course_id( course_id, '-'), suffix=(self.output_suffix + '-') if self.output_suffix else '') return url_path_join(self.output_root, filename)
def validate_output_file(self, date, org_id, site, use_master_key=False):
    if use_master_key:
        key_filename = 'insecure_master_secret.key'
    else:
        if org_id == 'edx':
            key_filename = 'insecure_secret.key'
        else:
            key_filename = 'insecure_secret_2.key'

    self.temporary_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.temporary_dir)

    self.downloaded_outputs = os.path.join(self.temporary_dir, 'output')
    os.makedirs(self.downloaded_outputs)

    local_file_name = '{org}-{site}-events-{date}.log'.format(
        org=org_id,
        site=site,
        date=date,
    )
    year = str(date).split("-")[0]
    remote_url = url_path_join(self.test_out, org_id, site, "events", year, local_file_name + '.gz.gpg')

    # Files won't appear in S3 instantaneously; wait for the files to appear.
    # TODO: exponential backoff
    for _index in range(30):
        key = self.s3_client.get_key(remote_url)
        if key is not None:
            break
        else:
            time.sleep(2)

    if key is None:
        self.fail('Unable to find expected output file {0}'.format(remote_url))

    downloaded_output_path = os.path.join(self.downloaded_outputs, remote_url.split('/')[-1])
    key.get_contents_to_filename(downloaded_output_path)

    # First decrypt the file.
    decrypted_file_name = downloaded_output_path[:-len('.gpg')]
    fs.decrypt_file(downloaded_output_path, decrypted_file_name, key_filename)

    # Now decompress the file.
    decompressed_file_name = decrypted_file_name[:-len('.gz')]
    fs.decompress_file(decrypted_file_name, decompressed_file_name)

    shell.run(['diff', decompressed_file_name, os.path.join(self.data_dir, 'output', local_file_name)])
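# A hedged sketch of the exponential backoff mentioned in the TODO above: instead of polling at a fixed
# 2-second interval, the wait doubles (up to a cap) after each miss.  get_key_fn stands in for
# self.s3_client.get_key and is hypothetical; this is not part of the pipeline code.
import time

def wait_for_key(get_key_fn, remote_url, attempts=10, initial_delay=1, max_delay=30):
    """Poll for a remote key, sleeping with exponential backoff between attempts."""
    delay = initial_delay
    for _ in range(attempts):
        key = get_key_fn(remote_url)
        if key is not None:
            return key
        time.sleep(delay)
        delay = min(delay * 2, max_delay)
    return None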
def setUp(self):
    super(FinancialReportsAcceptanceTest, self).setUp()

    if not self.should_reset_state:
        return

    for input_file_name in ('paypal.tsv', 'cybersource_test.tsv'):
        src = url_path_join(self.data_dir, 'input', input_file_name)
        dst = url_path_join(self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, input_file_name)
        self.upload_file(src, dst)

    empty_file_path = url_path_join(
        self.warehouse_path, "payments", "dt=" + self.IMPORT_DATE, 'cybersource_empty_test.tsv')
    self.upload_file_with_content(empty_file_path, '')

    self.prepare_database('lms', self.import_db)
    self.prepare_database('otto', self.otto_db)
def __init__(self, *args, **kwargs):
    super(LoadInternalReportingUserActivityToWarehouse, self).__init__(*args, **kwargs)
    path = url_path_join(self.warehouse_path, 'internal_reporting_user_activity')
    path_targets = PathSetTask([path]).output()
    paths = list(set([os.path.dirname(target.path) for target in path_targets]))
    dates = [path.rsplit('/', 2)[-1] for path in paths]
    latest_date = sorted(dates)[-1]
    self.load_date = datetime.datetime.strptime(latest_date, "dt=%Y-%m-%d").date()
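# A quick standalone illustration (hypothetical value) of the "dt=%Y-%m-%d" parse used above:
# literal text in the format string lets strptime consume the Hive partition prefix directly.
import datetime

load_date = datetime.datetime.strptime('dt=2016-02-01', 'dt=%Y-%m-%d').date()
assert load_date == datetime.date(2016, 2, 1)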
def upload_gpg_keys(self):
    """Uploads test gpg keys, needed for encryption."""
    gpg_key_dir = os.path.join('gpg-keys')
    for key_filename in os.listdir(gpg_key_dir):
        local_filepath = os.path.join(gpg_key_dir, key_filename)
        destination_url = url_path_join(self.test_gpg_key_dir, key_filename)

        if not key_filename.endswith('.key'):
            self.upload_file(local_filepath, destination_url)
def test_student_engagement(self):
    self.upload_tracking_log(self.INPUT_FILE, datetime.date(2015, 4, 10))
    self.execute_sql_fixture_file('load_student_engagement.sql')

    self.interval = '2015-04-06-2015-04-20'  # run for exactly two weeks

    for interval_type in ['daily', 'weekly', 'all']:
        self.run_task(interval_type)

        for course_id in self.ALL_COURSES:
            hashed_course_id = hashlib.sha1(course_id).hexdigest()
            course_dir = url_path_join(self.test_out, interval_type, hashed_course_id)
            csv_targets = self.get_targets_from_remote_path(course_dir)

            # Check expected number of CSV files.
            if interval_type == 'daily':
                self.assertEqual(len(csv_targets), 14)
            elif interval_type == 'weekly':
                self.assertEqual(len(csv_targets), 2)
            elif interval_type == 'all':
                self.assertEqual(len(csv_targets), 1)

            # Check that the CSV files contain the expected data.
            for csv_target in csv_targets:
                # Parse expected date from filename.
                if interval_type == 'all':
                    expected_date = '2015-04-19'
                else:
                    csv_pattern = '.*student_engagement_.*_(\\d\\d\\d\\d-\\d\\d-\\d\\d)\\.csv'
                    match = re.match(csv_pattern, csv_target.path)
                    expected_date = match.group(1)

                # Build dataframe from csv file generated from events.
                actual_dataframe = []
                with csv_target.open('r') as csvfile:
                    actual_dataframe = read_csv(csvfile)
                    actual_dataframe.fillna('', inplace=True)

                self.check_engagement_dataframe(actual_dataframe, interval_type, course_id, expected_date)

                # Validate specific values:
                csv_filename = os.path.basename(csv_target.path)
                expected_dataframe = self.get_expected_engagement(interval_type, hashed_course_id, csv_filename)
                if expected_dataframe is not None:
                    assert_frame_equal(actual_dataframe, expected_dataframe, check_names=True)
                else:
                    self.assert_zero_engagement(actual_dataframe)
def output(self): """Output is in the form {output_root}/cybersource/{CCYY-mm}/cybersource_{merchant}_{CCYYmmdd}.csv""" month_year_string = self.run_date.strftime('%Y-%m') # pylint: disable=no-member date_string = self.run_date.strftime('%Y%m%d') # pylint: disable=no-member filename = "cybersource_{merchant_id}_{date_string}.{report_format}".format( merchant_id=self.merchant_id, date_string=date_string, report_format=self.REPORT_FORMAT, ) url_with_filename = url_path_join(self.output_root, "cybersource", month_year_string, filename) return get_target_from_url(url_with_filename)
def output(self): """ Output is set up so it can be read in as a Hive table with partitions. The form is {output_root}/payments/dt={CCYY-mm-dd}/cybersource_{merchant}.tsv """ date_string = self.run_date.strftime('%Y-%m-%d') # pylint: disable=no-member partition_path_spec = HivePartition('dt', date_string).path_spec filename = "cybersource_{}.tsv".format(self.merchant_id) url_with_filename = url_path_join(self.output_root, "payments", partition_path_spec, filename) return get_target_from_url(url_with_filename)
def run_obfuscated_package_task(self):
    """Run ObfuscatedPackageTask."""
    self.task.launch([
        'ObfuscatedPackageTask',
        '--course', self.filename_safe_course_id,
        '--obfuscated-output-root', url_path_join(self.test_root, 'obfuscated-output'),
        '--gpg-key-dir', self.test_gpg_key_dir,
        '--gpg-master-key', '*****@*****.**',
        '--output-root', self.test_out,
        '--recipient', '*****@*****.**',
        '--format-version', self.FORMAT_VERSION
    ])
def setup_state_files(self):
    """
    Upload input fixture data files, needed to mimic the output produced by course-exporter,
    which is not a part of this test.
    """
    state_files_dir = os.path.join(self.data_dir, 'input', 'obfuscation', 'state')
    for filename in os.listdir(state_files_dir):
        local_filepath = os.path.join(state_files_dir, filename)
        dst_url = url_path_join(self.dump_root, self.filename_safe_course_id, 'state', self.EXPORT_DATE, filename)
        self.upload_file(local_filepath, dst_url)
def output_path_for_key(self, course_id): """ Match the course folder hierarchy that is expected by the instructor dashboard. The instructor dashboard expects the file to be stored in a folder named sha1(course_id). All files in that directory will be displayed on the instructor dashboard for that course. """ hashed_course_id = hashlib.sha1(course_id).hexdigest() filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_') filename = u'{course_id}_answer_distribution.csv'.format(course_id=filename_safe_course_id) return url_path_join(self.output_root, hashed_course_id, filename)
def output_path_for_key(self, datestamp):
    if not self.tuple_output:
        # Match tracking.log-{datestamp}.gz format.
        filename = u'synthetic_enroll.log-{datestamp}.gz'.format(
            datestamp=datestamp.replace('-', ''),
        )
    else:
        # Want to have tsv as extension, rather than date.
        filename = u'synthetic_enroll-{datestamp}.tsv.gz'.format(
            datestamp=datestamp.replace('-', ''),
        )
    return url_path_join(self.output_root, filename)
def run_export_task(self):
    """
    Preconditions: Populated courseware_studentmodule table in the MySQL database.

    External Effect: Generates a single text file with the contents of courseware_studentmodule
    from the MySQL database for the test course and stores it in S3.

    Intermediate output will be stored in s3://<tasks_output_url>/intermediate/.  This directory
    will contain the complete data set from the MySQL database with all courses interleaved in
    the data files.

    The final output file will be stored in
    s3://<tasks_output_url>/edX-E929-2014_T1-courseware_studentmodule-acceptance-analytics.sql
    """
    self.task.launch([
        'StudentModulePerCourseAfterImportWorkflow',
        '--credentials', self.import_db.credentials_file_url,
        '--dump-root', url_path_join(self.test_src, 'intermediate'),
        '--output-root', url_path_join(self.test_src, self.ENVIRONMENT),
        '--output-suffix', self.ENVIRONMENT,
        '--num-mappers', str(self.NUM_MAPPERS),
        '--n-reduce-tasks', str(self.NUM_REDUCERS),
    ])
def setUp(self): """Loads enrollment and course catalog fixtures.""" super(EnrollmentAcceptanceTest, self).setUp() self.upload_tracking_log(self.INPUT_FILE, datetime.date(2014, 7, 30)) self.execute_sql_fixture_file('load_auth_userprofile.sql') self.upload_file( os.path.join(self.data_dir, 'input', 'course_catalog.json'), url_path_join(self.warehouse_path, 'course_catalog_raw', 'dt={}'.format(self.CATALOG_DATE), 'course_catalog.json'))
def requires_hadoop(self):
    # Check first if running locally with Sqoop output.
    target = get_target_from_url(self.source_dir)
    if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
        files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
        for filename in files:
            yield ExternalURL(url_path_join(self.source_dir, filename))
    else:
        yield ExternalURL(self.source_dir)
def output(self):
    config = configuration.get_config()
    base_url = config.get(CONFIG_SECTION, 'path')
    target = get_target_from_url(url_path_join(base_url, str(hash(self))) + '.manifest')
    lib_jar = config.get(CONFIG_SECTION, 'lib_jar', None)
    if lib_jar:
        target.lib_jar = [lib_jar]
    input_format = config.get(CONFIG_SECTION, 'input_format', None)
    if input_format:
        target.input_format = input_format
    return target
def run(self):
    recipients = set(self.recipient)
    if self.gpg_master_key is not None:
        recipients.add(self.gpg_master_key)
    key_file_targets = [
        get_target_from_url(url_path_join(self.gpg_key_dir, recipient))
        for recipient in recipients
    ]

    path_task = PathSetTask([self.course_files_url], ['*.*'])
    with make_temp_directory(prefix='obfuscate-archive.', dir=self.temporary_dir) as tmp_directory:
        for target in path_task.output():
            with target.open('r') as input_file:
                # Get path without urlscheme.
                course_files_path = urlparse.urlparse(self.course_files_url).path
                # Calculates target's relative path to course_files_path by getting the substring that
                # occurs after the course_files_path substring in target's path.
                # Needed as target.path returns path with urlscheme for s3target & without for hdfstarget.
                # Examples:
                # target.path: /pipeline/output/edX_Demo_Course/events/edX_Demo_Course-events-2015-08-30.log.gz
                # relative_path: events/edX_Demo_Course-events-2015-08-30.log.gz
                # target.path: s3://some_bucket/output/edX_Demo_Course/state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                # relative_path: state/2015-11-25/edX-Demo-Course-auth_user-prod-analytics.sql
                r_index = target.path.find(course_files_path) + len(course_files_path)
                relative_path = target.path[r_index:].lstrip('/')

                local_file_path = os.path.join(tmp_directory, relative_path)
                try:
                    os.makedirs(os.path.dirname(local_file_path))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
                with open(local_file_path, 'w') as temp_file:
                    copy_file_to_file(input_file, temp_file)

        def report_encrypt_progress(num_bytes):
            """Log encryption progress."""
            log.info('Encrypted %d bytes', num_bytes)

        with self.output().open('w') as output_file:
            with make_encrypted_file(
                output_file, key_file_targets, progress=report_encrypt_progress, dir=self.temporary_dir
            ) as encrypted_output_file:
                with tarfile.open(mode='w:gz', fileobj=encrypted_output_file) as output_archive_file:
                    output_archive_file.add(tmp_directory, arcname='')
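# A standalone sketch (hypothetical URLs) of the relative-path computation in run() above: the
# scheme-less path of course_files_url is located inside target.path, and everything after it
# becomes the file's path within the archive.
import urlparse  # Python 2; use urllib.parse on Python 3

course_files_url = 's3://some_bucket/output/edX_Demo_Course'
course_files_path = urlparse.urlparse(course_files_url).path   # '/output/edX_Demo_Course'

target_path = 's3://some_bucket/output/edX_Demo_Course/state/2015-11-25/auth_user.sql'
r_index = target_path.find(course_files_path) + len(course_files_path)
relative_path = target_path[r_index:].lstrip('/')              # 'state/2015-11-25/auth_user.sql'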
def upload_data(self): """ Puts the test course structure information where the processing task would look for it, bypassing calling the actual API """ src = os.path.join(self.data_dir, 'input', self.INPUT_FILE) # IMPORTANT: this path should be of the same format as the path that DailyPullCatalogTask uses for output. dst = url_path_join(self.warehouse_path, "courses_raw", self.DATE.strftime('dt=%Y-%m-%d'), self.INPUT_FILE) # Upload mocked results of the API call self.s3_client.put(src, dst)
def _get_required_tasks(self): """Internal method to actually calculate required tasks once.""" start_date = self.interval.date_a end_date = self.interval.date_b table_name = "student_courseenrollment" source_root = url_path_join(self.warehouse_path, table_name) current_date = start_date while current_date < end_date: datestring = current_date.strftime('%Y-%m-%d') current_date += datetime.timedelta(days=1) src_datestring = "dt={}".format(datestring) source_dir = url_path_join(source_root, src_datestring) target = get_target_from_url(source_dir) if target.exists(): output_dir = url_path_join(self.output_root, datestring) yield CreateEnrollmentValidationEventsTask( source_dir=source_dir, output_root=output_dir, n_reduce_tasks=self.n_reduce_tasks, )