예제 #1
0
    def requires(self):
        """
        Yield a single tuple holding the four report tasks.

        Every task receives the same shared parameters: the output destination
        is ``destination/name/date`` and, when ``manifest_path`` is set, the
        manifest is ``manifest_path/executive-reports/name/date`` (otherwise
        None).  Task-specific options are added on top of the shared ones.
        """
        report_root = url_path_join(self.destination, self.name, str(self.date))

        # The manifest location is only derived when a manifest path was given.
        manifest_location = None
        if self.manifest_path is not None:
            manifest_location = url_path_join(
                self.manifest_path, "executive-reports", self.name, str(self.date)
            )

        shared = dict(
            name=self.name,
            src=self.src,
            include=self.include,
            manifest=manifest_location,
            credentials=self.credentials,
            blacklist=self.blacklist,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            destination=report_root,
            date=self.date
        )

        all_users = WeeklyAllUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=TOTAL_USERS_AND_ENROLLMENTS_NUM_WEEKS,
            **shared
        )
        incremental_users = WeeklyIncrementalUsersAndEnrollments(
            offsets=self.offsets,
            history=self.history,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **shared
        )
        enrollments_by_week = EnrollmentsByWeek(
            offsets=self.offsets,
            statuses=self.statuses,
            weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
            **shared
        )
        daily_totals = DailyRegistrationsEnrollmentsAndCourses(
            days=DEFAULT_NUM_DAYS,
            **shared
        )

        # All four tasks are handed over together as one tuple.
        yield (all_users, incremental_users, enrollments_by_week, daily_totals)
    def test_task_configuration(self):
        """
        Check the targets exposed by a configured WeeklyAllUsersAndEnrollments.

        Builds the task from fake S3/file URLs, then asserts the type and
        format of the output of each entry in ``task.requires()`` (looked up
        by the keys 'enrollments', 'offsets', 'history' and 'registrations')
        and the path of the task's own output.
        """
        # Plain decimal month: a leading-zero literal such as ``01`` is a
        # syntax error on Python 3 and has the same value (1) on Python 2.
        date = datetime.date(2013, 1, 20)

        task = WeeklyAllUsersAndEnrollments(
            name='fake_name',
            n_reduce_tasks='fake_n_reduce_tasks',
            offsets='s3://bucket/file.txt',
            destination='s3://path/',
            history='file://path/history/file.gz',
            date=date,
            credentials='s3://bucket/cred.json')

        requires = task.requires()

        enrollments = requires['enrollments'].output()
        self.assertIsInstance(enrollments, luigi.hdfs.HdfsTarget)
        self.assertEqual(enrollments.format, luigi.hdfs.PlainDir)

        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        history = requires['history'].output()
        self.assertIsInstance(history, luigi.File)

        registrations = requires['registrations'].output()
        self.assertIsInstance(requires['registrations'],
                              UserRegistrationsPerDay)
        self.assertEqual(
            registrations.path,
            's3://path/user_registrations_1900-01-01-2013-01-21.tsv')
        self.assertIsInstance(registrations, luigi.hdfs.HdfsTarget)
        self.assertEqual(registrations.format, luigi.hdfs.Plain)

        destination = task.output()

        self.assertEqual(
            destination.path,
            's3://path/total_users_and_enrollments_2012-01-22-2013-01-20.csv')
        # NOTE(review): the two assertions below re-check ``offsets`` rather
        # than ``destination``; possibly a copy-paste slip -- confirm intent.
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)
    def test_task_configuration(self):
        """
        Verify the input and output targets of WeeklyAllUsersAndEnrollments.

        The task is created with fake S3/file URLs.  Each entry returned by
        ``task.requires()`` ('enrollments', 'offsets', 'history',
        'registrations') has the type and format of its output checked, and
        the path of the task's own output is compared to the expected name.
        """
        # Month written as ``1``: the leading-zero form ``01`` does not parse
        # on Python 3, while on Python 2 it evaluates to the same value.
        date = datetime.date(2013, 1, 20)

        task = WeeklyAllUsersAndEnrollments(
            name='fake_name',
            n_reduce_tasks='fake_n_reduce_tasks',
            offsets='s3://bucket/file.txt',
            destination='s3://path/',
            history='file://path/history/file.gz',
            date=date,
            credentials='s3://bucket/cred.json'
        )

        requires = task.requires()

        enrollments = requires['enrollments'].output()
        self.assertIsInstance(enrollments, luigi.hdfs.HdfsTarget)
        self.assertEqual(enrollments.format, luigi.hdfs.PlainDir)

        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        history = requires['history'].output()
        self.assertIsInstance(history, luigi.File)

        registrations = requires['registrations'].output()
        self.assertIsInstance(requires['registrations'], UserRegistrationsPerDay)
        self.assertEqual(registrations.path, 's3://path/user_registrations_1900-01-01-2013-01-21.tsv')
        self.assertIsInstance(registrations, luigi.hdfs.HdfsTarget)
        self.assertEqual(registrations.format, luigi.hdfs.Plain)

        destination = task.output()

        self.assertEqual(destination.path, 's3://path/total_users_and_enrollments_2012-01-22-2013-01-20.csv')
        # NOTE(review): these final checks look at ``offsets`` again, not at
        # ``destination``; this may be unintended -- confirm before changing.
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)
    def run_task(self, registrations, enrollments, date, weeks, offset=None, history=None, blacklist=None):
        """
        Execute WeeklyAllUsersAndEnrollments against in-memory fake targets.

        The task's ``input`` and ``output`` are replaced with mocks returning
        FakeTarget objects; after ``run()`` the output buffer is parsed as CSV.

        Arguments:
            registrations: text for the 'registrations' input.
            enrollments: text for the 'enrollments' input, or None to use the
                built-in sample data.
            date: report date as a 'YYYY-MM-DD' string.
            weeks: forwarded to the task's ``weeks`` parameter.
            offset: optional text for the 'offsets' input.
            history: optional text for the 'history' input.
            blacklist: optional text for the 'blacklist' input; also forwarded
                as the task's ``blacklist`` parameter.

        Returns:
            the task output as a pandas dataframe, indexed by the column
            ``self.row_label('header')``.
        """

        def as_tsv(text):
            """Dedent and strip the text, then turn every space into a tab."""
            dedented = textwrap.dedent(text)
            return dedented.strip().replace(' ', '\t')

        def fake_input(text):
            """Wrap TSV-formatted text in a FakeTarget."""
            return FakeTarget(value=as_tsv(text))

        report_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # Offsets and history are only configured when content was supplied.
        task_options = {
            'name': 'fake_name',
            'n_reduce_tasks': "fake_n_reduce_tasks",
            'offsets': 'fake_offsets' if offset else None,
            'history': 'fake_history' if history else None,
            'destination': 'fake_destination',
            'date': report_date,
            'weeks': weeks,
            'credentials': None,
            'blacklist': blacklist,
        }
        task = WeeklyAllUsersAndEnrollments(**task_options)

        if enrollments is None:
            enrollments = """
                course_1 2013-03-01 1
                course_1 2013-03-30 2
                course_2 2013-03-07 1
                course_2 2013-03-08 1
                course_2 2013-03-10 1
                course_2 2013-03-13 1
                course_3 2013-03-15 1
                course_3 2013-03-18 1
                course_3 2013-03-19 1
                """

        # Mandatory inputs first, in the same order as they are always built.
        input_targets = {}
        input_targets['enrollments'] = fake_input(enrollments)
        input_targets['registrations'] = fake_input(registrations)

        # Optional inputs are mocked only when a truthy value was passed.
        optional_inputs = (
            ('offsets', offset),
            ('history', history),
            ('blacklist', blacklist),
        )
        for key, text in optional_inputs:
            if text:
                input_targets[key] = fake_input(text)

        task.input = MagicMock(return_value=input_targets)

        output_target = FakeTarget()
        task.output = MagicMock(return_value=output_target)

        task.run()

        # Parse whatever the task wrote into the fake output buffer.
        csv_buffer = StringIO(output_target.buffer.read())
        index_label = self.row_label('header')
        return pandas.read_csv(csv_buffer, na_values=['-'], index_col=index_label)
    def run_task(self,
                 registrations,
                 enrollments,
                 date,
                 weeks,
                 offset=None,
                 history=None,
                 blacklist=None):
        """
        Run WeeklyAllUsersAndEnrollments with mocked input and output targets.

        Each supplied text is dedented, stripped and has its spaces replaced
        by tabs before being wrapped in a FakeTarget.  'enrollments' and
        'registrations' are always provided; 'offsets', 'history' and
        'blacklist' are provided only when the matching argument is truthy.
        When ``enrollments`` is None a built-in sample is used.  ``date`` is
        parsed with the format '%Y-%m-%d'.

        Returns:
            the task output as a pandas dataframe, read from the fake output
            buffer with '-' treated as NA and ``self.row_label('header')`` as
            the index column.
        """

        def reformat(string):
            """Reformat string to make it like a TSV."""
            without_margin = textwrap.dedent(string).strip()
            return without_margin.replace(' ', '\t')

        day = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # Placeholder locations are used only for inputs that were supplied.
        offsets_location = 'fake_offsets' if offset else None
        history_location = 'fake_history' if history else None

        task = WeeklyAllUsersAndEnrollments(
            name='fake_name',
            n_reduce_tasks="fake_n_reduce_tasks",
            offsets=offsets_location,
            history=history_location,
            destination='fake_destination',
            date=day,
            weeks=weeks,
            credentials=None,
            blacklist=blacklist)

        if enrollments is None:
            enrollments = """
                course_1 2013-03-01 1
                course_1 2013-03-30 2
                course_2 2013-03-07 1
                course_2 2013-03-08 1
                course_2 2013-03-10 1
                course_2 2013-03-13 1
                course_3 2013-03-15 1
                course_3 2013-03-18 1
                course_3 2013-03-19 1
                """

        # Collect (input name, raw text) pairs: required ones first, then any
        # optional ones that were actually specified.
        supplied = [
            ('enrollments', enrollments),
            ('registrations', registrations),
        ]
        supplied.extend(
            (key, text)
            for key, text in (('offsets', offset),
                              ('history', history),
                              ('blacklist', blacklist))
            if text
        )

        input_targets = dict(
            (key, FakeTarget(value=reformat(text))) for key, text in supplied
        )

        task.input = MagicMock(return_value=input_targets)

        sink = FakeTarget()
        task.output = MagicMock(return_value=sink)

        task.run()

        # Turn the text the task wrote into a dataframe.
        written = sink.buffer.read()
        frame = pandas.read_csv(StringIO(written),
                                na_values=['-'],
                                index_col=self.row_label('header'))
        return frame