Exemplo n.º 1
0
    def run_task(self, source, date, weeks, offset=None, statuses=None):
        """
        Execute an EnrollmentsByWeek task against in-memory fake targets.

        Arguments:
            source: space-separated text used as the 'source' input.
            date: the task date as a 'YYYY-MM-DD' string.
            weeks: forwarded unchanged as the task's ``weeks`` parameter.
            offset: optional text for the 'offsets' input. When falsy, the
                task is built with offsets=None and no 'offsets' target.
            statuses: optional text for the 'statuses' input.

        Returns:
            the task output as a pandas dataframe indexed by course_id.
        """

        def as_tsv(text):
            """Dedent and strip the text, then turn every space into a tab."""
            dedented = textwrap.dedent(text)
            return dedented.strip().replace(' ', '\t')

        run_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # The task only gets an offsets location when offset data was given.
        offsets_param = None
        if offset:
            offsets_param = 'fake_offsets'

        task = EnrollmentsByWeek(
            name='fake_name',
            src='fake_source',
            offsets=offsets_param,
            destination='fake_destination',
            date=run_date,
            weeks=weeks,
        )

        # Fake inputs: 'source' is always present, the others only when given.
        fake_inputs = {'source': FakeTarget(as_tsv(source))}
        for key, text in (('offsets', offset), ('statuses', statuses)):
            if text:
                fake_inputs[key] = FakeTarget(as_tsv(text))
        task.input = MagicMock(return_value=fake_inputs)

        # Fake output that captures whatever the task writes.
        fake_output = FakeTarget()
        task.output = MagicMock(return_value=fake_output)

        task.run()

        # Parse the captured output; '-' cells are read as missing values.
        raw_output = fake_output.buffer.read()
        return pandas.read_csv(
            StringIO(raw_output),
            na_values=['-'],
            index_col='course_id',
        )
Exemplo n.º 2
0
    def test_task_urls(self):
        """
        Check the target types the task builds from its URL parameters.

        The 's3://' offsets URL must resolve to a plain-format HdfsTarget and
        the 'file://' destination URL to a luigi.File.
        """
        # Plain decimal month: a leading-zero literal such as ``01`` is a
        # syntax error on Python 3 and means the same value on Python 2.
        date = datetime.date(2013, 1, 20)

        task = EnrollmentsByWeek(name='fake_name',
                                 src='s3://bucket/path/',
                                 offsets='s3://bucket/file.txt',
                                 destination='file://path/file.txt',
                                 date=date)

        requires = task.requires()

        # The source target is not asserted on; the call is kept so that
        # building it is still exercised.
        requires['source'].output()

        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        destination = task.output()
        self.assertIsInstance(destination, luigi.File)
Exemplo n.º 3
0
    def test_task_urls(self):
        """
        Check the target types the task builds from its URL parameters.

        The source is given as a list of URLs. The 's3://' offsets URL must
        resolve to a plain-format HdfsTarget and the 'file://' destination
        URL to a luigi.File.
        """
        # Plain decimal month: a leading-zero literal such as ``01`` is a
        # syntax error on Python 3 and means the same value on Python 2.
        date = datetime.date(2013, 1, 20)

        task = EnrollmentsByWeek(name='fake_name',
                                 src=['s3://bucket/path/'],
                                 offsets='s3://bucket/file.txt',
                                 destination='file://path/file.txt',
                                 date=date)

        requires = task.requires()

        # The source target is not asserted on; the call is kept so that
        # building it is still exercised.
        requires['source'].output()

        offsets = requires['offsets'].output()
        self.assertIsInstance(offsets, luigi.hdfs.HdfsTarget)
        self.assertEqual(offsets.format, luigi.hdfs.Plain)

        destination = task.output()
        self.assertIsInstance(destination, luigi.File)
Exemplo n.º 4
0
    def requires(self):
        """
        Yield, as a single tuple, the four report tasks this task depends on.

        Every task receives the same shared parameters. Its destination is
        this task's destination joined with the name and the date. The
        manifest location is derived the same way under "executive-reports"
        when a manifest path is configured, and is None otherwise.
        """
        run_date = str(self.date)

        output_destination = url_path_join(self.destination, self.name, run_date)

        manifest = (
            url_path_join(self.manifest_path, "executive-reports", self.name, run_date)
            if self.manifest_path is not None
            else None
        )

        # Keyword arguments shared by every report task below.
        shared = dict(
            name=self.name,
            src=self.src,
            include=self.include,
            manifest=manifest,
            credentials=self.credentials,
            blacklist=self.blacklist,
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            n_reduce_tasks=self.n_reduce_tasks,
            destination=output_destination,
            date=self.date,
        )

        report_tasks = (
            WeeklyAllUsersAndEnrollments(
                offsets=self.offsets,
                history=self.history,
                weeks=TOTAL_USERS_AND_ENROLLMENTS_NUM_WEEKS,
                **shared
            ),
            WeeklyIncrementalUsersAndEnrollments(
                offsets=self.offsets,
                history=self.history,
                weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
                **shared
            ),
            EnrollmentsByWeek(
                offsets=self.offsets,
                statuses=self.statuses,
                weeks=WEEKLY_ENROLLMENT_REPORT_WEEKS,
                **shared
            ),
            DailyRegistrationsEnrollmentsAndCourses(
                days=DEFAULT_NUM_DAYS,
                **shared
            ),
        )

        yield report_tasks
Exemplo n.º 5
0
    def run_task(self, source, date, weeks, offset=None, statuses=None):
        """
        Execute an EnrollmentsByWeek task against in-memory fake targets.

        Arguments:
            source: space-separated text used as the 'source' input.
            date: the task date as a 'YYYY-MM-DD' string.
            weeks: forwarded unchanged as the task's ``weeks`` parameter.
            offset: optional text for the 'offsets' input. When falsy, the
                task is built with offsets=None and no 'offsets' target.
            statuses: optional text for the 'statuses' input.

        Returns:
            the task output as a pandas dataframe indexed by course_id.
        """

        def as_tsv(text):
            """Dedent and strip the text, then turn every space into a tab."""
            dedented = textwrap.dedent(text)
            return dedented.strip().replace(' ', '\t')

        run_date = datetime.datetime.strptime(date, '%Y-%m-%d').date()

        # The task only gets an offsets location when offset data was given.
        offsets_param = None
        if offset:
            offsets_param = 'fake_offsets'

        task = EnrollmentsByWeek(
            name='fake_name',
            src=['fake_source'],
            offsets=offsets_param,
            destination='fake_destination',
            date=run_date,
            weeks=weeks,
        )

        # Fake inputs: 'source' is always present, the others only when given.
        fake_inputs = {'source': FakeTarget(as_tsv(source))}
        for key, text in (('offsets', offset), ('statuses', statuses)):
            if text:
                fake_inputs[key] = FakeTarget(as_tsv(text))
        task.input = MagicMock(return_value=fake_inputs)

        # Fake output that captures whatever the task writes.
        fake_output = FakeTarget()
        task.output = MagicMock(return_value=fake_output)

        task.run()

        # Parse the captured output; '-' cells are read as missing values.
        raw_output = fake_output.buffer.read()
        return pandas.read_csv(
            StringIO(raw_output),
            na_values=['-'],
            index_col='course_id',
        )