def requires(self):
    """
    Yield the report tasks this executive-report job depends on.

    Each downstream task receives the same common parameter set; the
    weekly tasks additionally get offsets/history and a week count.
    """
    output_destination = url_path_join(self.destination, self.name, str(self.date))

    # Only build a manifest location when a manifest path was configured.
    manifest = None
    if self.manifest_path is not None:
        manifest = url_path_join(self.manifest_path, "executive-reports", self.name, str(self.date))

    # Parameters shared by every downstream report task.
    common_parameters = dict(
        name=self.name,
        src=self.src,
        include=self.include,
        manifest=manifest,
        credentials=self.credentials,
        blacklist=self.blacklist,
        mapreduce_engine=self.mapreduce_engine,
        lib_jar=self.lib_jar,
        n_reduce_tasks=self.n_reduce_tasks,
        destination=output_destination,
        date=self.date,
    )

    yield (
        WeeklyAllUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=TOTAL_USERS_AND_ENROLLMENTS_NUM_WEEKS,
            **common_parameters
        ),
        WeeklyIncrementalUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **common_parameters
        ),
        EnrollmentsByWeek(
            offsets=self.offsets,
            statuses=self.statuses,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **common_parameters
        ),
        DailyRegistrationsEnrollmentsAndCourses(
            days=DEFAULT_NUM_DAYS,
            **common_parameters
        )
    )
def test_task_configuration(self):
    """
    Verify the input targets and output path wired up by
    WeeklyAllUsersAndEnrollments.requires()/output().
    """
    # Fix: `01` was a Python 2 octal-style literal (value 1) and is a
    # SyntaxError under Python 3; a plain decimal literal is equivalent.
    date = datetime.date(2013, 1, 20)
    task = WeeklyAllUsersAndEnrollments(
        name='fake_name',
        n_reduce_tasks='fake_n_reduce_tasks',
        offsets='s3://bucket/file.txt',
        destination='s3://path/',
        history='file://path/history/file.gz',
        date=date,
        credentials='s3://bucket/cred.json')

    requires = task.requires()

    # Enrollments arrive as a directory of plain files on HDFS.
    enrollments = requires['enrollments'].output()
    self.assertIsInstance(enrollments, luigi.hdfs.HdfsTarget)
    self.assertEqual(enrollments.format, luigi.hdfs.PlainDir)

    offsets = requires['offsets'].output()
    self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
    self.assertEqual(offsets.format, luigi.hdfs.Plain)

    history = requires['history'].output()
    self.assertIsInstance(history, luigi.File)

    registrations = requires['registrations'].output()
    self.assertIsInstance(requires['registrations'], UserRegistrationsPerDay)
    # Registrations span from the epoch of record keeping up to the day
    # after the report date (end date is exclusive).
    self.assertEqual(
        registrations.path,
        's3://path/user_registrations_1900-01-01-2013-01-21.tsv')
    self.assertIsInstance(registrations, luigi.hdfs.HdfsTarget)
    self.assertEqual(registrations.format, luigi.hdfs.Plain)

    destination = task.output()
    self.assertEqual(
        destination.path,
        's3://path/total_users_and_enrollments_2012-01-22-2013-01-20.csv')

    # NOTE(review): the two assertions below duplicate the offsets checks
    # above -- looks like a copy/paste leftover; kept for behavioral parity.
    self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
    self.assertEqual(offsets.format, luigi.hdfs.Plain)
def test_task_configuration(self):
    """
    Verify the input targets and output path wired up by
    WeeklyAllUsersAndEnrollments.requires()/output().
    """
    # Fix: `01` was a Python 2 octal-style literal (value 1) and is a
    # SyntaxError under Python 3; a plain decimal literal is equivalent.
    date = datetime.date(2013, 1, 20)
    task = WeeklyAllUsersAndEnrollments(
        name='fake_name',
        n_reduce_tasks='fake_n_reduce_tasks',
        offsets='s3://bucket/file.txt',
        destination='s3://path/',
        history='file://path/history/file.gz',
        date=date,
        credentials='s3://bucket/cred.json'
    )

    requires = task.requires()

    # Enrollments arrive as a directory of plain files on HDFS.
    enrollments = requires['enrollments'].output()
    self.assertIsInstance(enrollments, luigi.hdfs.HdfsTarget)
    self.assertEqual(enrollments.format, luigi.hdfs.PlainDir)

    offsets = requires['offsets'].output()
    self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
    self.assertEqual(offsets.format, luigi.hdfs.Plain)

    history = requires['history'].output()
    self.assertIsInstance(history, luigi.File)

    registrations = requires['registrations'].output()
    self.assertIsInstance(requires['registrations'], UserRegistrationsPerDay)
    # Registrations span from the epoch of record keeping up to the day
    # after the report date (end date is exclusive).
    self.assertEqual(registrations.path, 's3://path/user_registrations_1900-01-01-2013-01-21.tsv')
    self.assertIsInstance(registrations, luigi.hdfs.HdfsTarget)
    self.assertEqual(registrations.format, luigi.hdfs.Plain)

    destination = task.output()
    self.assertEqual(destination.path, 's3://path/total_users_and_enrollments_2012-01-22-2013-01-20.csv')

    # NOTE(review): the two assertions below duplicate the offsets checks
    # above -- looks like a copy/paste leftover; kept for behavioral parity.
    self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
    self.assertEqual(offsets.format, luigi.hdfs.Plain)
def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
    """
    Run task with fake targets.

    Returns:
        the task output as a pandas dataframe.
    """
    report_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

    # Optional inputs are switched off by passing None for their paths.
    task = WeeklyAllUsersAndEnrollments(
        name='fake_name',
        n_reduce_tasks="fake_n_reduce_tasks",
        offsets='fake_offsets' if offset else None,
        history='fake_history' if history else None,
        destination='fake_destination',
        date=report_date,
        weeks=weeks,
        credentials=None,
        blacklist=blacklist
    )

    def reformat(string):
        """Reformat string to make it like a TSV."""
        return textwrap.dedent(string).strip().replace(' ', '\t')

    # Fall back to a canned enrollment fixture when none was provided.
    if enrollments is None:
        enrollments = """
        course_1 2013-03-01 1
        course_1 2013-03-30 2
        course_2 2013-03-07 1
        course_2 2013-03-08 1
        course_2 2013-03-10 1
        course_2 2013-03-13 1
        course_3 2013-03-15 1
        course_3 2013-03-18 1
        course_3 2013-03-19 1
        """

    input_targets = {
        'enrollments': FakeTarget(value=reformat(enrollments)),
        'registrations': FakeTarget(value=reformat(registrations)),
    }

    # Wire up each optional input only when the caller supplied data for it.
    for target_name, raw_data in (('offsets', offset), ('history', history), ('blacklist', blacklist)):
        if raw_data:
            input_targets[target_name] = FakeTarget(value=reformat(raw_data))

    task.input = MagicMock(return_value=input_targets)

    output_target = FakeTarget()
    task.output = MagicMock(return_value=output_target)

    # Run the task and parse its CSV output into a pandas dataframe.
    task.run()
    raw_output = output_target.buffer.read()
    return pandas.read_csv(
        StringIO(raw_output),
        na_values=['-'],
        index_col=self.row_label('header')
    )
def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
    """
    Run task with fake targets.

    Returns:
        the task output as a pandas dataframe.
    """
    as_of = datetime.datetime.strptime(date, '%Y-%m-%d').date()

    # Unspecified optional inputs become None paths, which disables them.
    task = WeeklyAllUsersAndEnrollments(
        name='fake_name',
        n_reduce_tasks="fake_n_reduce_tasks",
        offsets='fake_offsets' if offset else None,
        history='fake_history' if history else None,
        destination='fake_destination',
        date=as_of,
        weeks=weeks,
        credentials=None,
        blacklist=blacklist
    )

    def reformat(string):
        """Reformat string to make it like a TSV."""
        return textwrap.dedent(string).strip().replace(' ', '\t')

    # Substitute a default enrollment fixture when the caller passed none.
    if enrollments is None:
        enrollments = """
        course_1 2013-03-01 1
        course_1 2013-03-30 2
        course_2 2013-03-07 1
        course_2 2013-03-08 1
        course_2 2013-03-10 1
        course_2 2013-03-13 1
        course_3 2013-03-15 1
        course_3 2013-03-18 1
        course_3 2013-03-19 1
        """

    input_targets = {
        'enrollments': FakeTarget(value=reformat(enrollments)),
        'registrations': FakeTarget(value=reformat(registrations)),
    }

    # Add fake targets for exactly the optional inputs that were supplied.
    maybe_inputs = {'offsets': offset, 'history': history, 'blacklist': blacklist}
    input_targets.update({
        key: FakeTarget(value=reformat(text))
        for key, text in maybe_inputs.items() if text
    })

    task.input = MagicMock(return_value=input_targets)

    output_target = FakeTarget()
    task.output = MagicMock(return_value=output_target)

    # Execute the task, then load its CSV output into a pandas dataframe.
    task.run()
    contents = output_target.buffer.read()
    result = pandas.read_csv(
        StringIO(contents),
        na_values=['-'],
        index_col=self.row_label('header')
    )
    return result