def _get_required_tasks(self):
     """Internal method to actually calculate required tasks once."""
     start_date = self.interval.date_a  # pylint: disable=no-member
     end_date = self.interval.date_b  # pylint: disable=no-member
     table_name = "student_courseenrollment"
     source_root = url_path_join(self.warehouse_path, table_name)
     today_datestring = datetime.datetime.utcnow().strftime('%Y-%m-%d')
     current_date = start_date
     while current_date <= end_date:
         datestring = current_date.strftime('%Y-%m-%d')
         current_date += datetime.timedelta(days=1)
         src_datestring = "dt={}".format(datestring)
         source_dir = url_path_join(source_root, src_datestring)
         target = get_target_from_url(source_dir)
         output_dir = url_path_join(self.output_root, datestring)
         if datestring == today_datestring:
             yield CreateEnrollmentValidationEventsForTodayTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
                 credentials=self.credentials,
             )
         elif target.exists():
             yield CreateEnrollmentValidationEventsTask(
                 source_dir=source_dir,
                 output_root=output_dir,
                 n_reduce_tasks=self.n_reduce_tasks,
             )
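
The loop above lays out one source directory per day using Hive-style dt=YYYY-MM-DD partition names under the warehouse path. A minimal sketch of that naming scheme, using posixpath.join as a stand-in for url_path_join (the helper name below is hypothetical):

import datetime
import posixpath

def dated_partition_paths(source_root, start_date, end_date):
    """Yield one 'dt=YYYY-MM-DD' directory per day in [start_date, end_date]."""
    current_date = start_date
    while current_date <= end_date:
        yield posixpath.join(source_root, 'dt={}'.format(current_date.strftime('%Y-%m-%d')))
        current_date += datetime.timedelta(days=1)

# Example:
# list(dated_partition_paths('s3://warehouse/student_courseenrollment',
#                            datetime.date(2016, 9, 6), datetime.date(2016, 9, 7)))
# -> ['s3://warehouse/student_courseenrollment/dt=2016-09-06',
#     's3://warehouse/student_courseenrollment/dt=2016-09-07']
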
 def setUp(self):
     super(ObfuscationAcceptanceTest, self).setUp()
     self.temporary_dir = tempfile.mkdtemp()
     self.addCleanup(shutil.rmtree, self.temporary_dir)
     self.dump_root = url_path_join(self.test_src, 'course_exports', 'raw')
     self.filename_safe_course_id = get_filename_safe_course_id(self.COURSE_ID)
     self.test_gpg_key_dir = url_path_join(self.test_root, 'gpg-keys')
    def test_enrollment_validation(self):
        # Initial setup.
        context = {
            'days': lambda n: datetime.timedelta(days=n),
            'start_date': self.START_DATE
        }
        self.upload_tracking_log(self.INPUT_FILE, self.START_DATE, template_context=context)
        self.execute_sql_fixture_file(self.SQL_FIXTURE)
        self.test_validate = url_path_join(self.test_root, 'validate')

        # Run once.  This will generate the new validation events, but
        # will not include them in the validation run (because the
        # requirements for the validation run are computed before any
        # validation events are generated).
        self.test_first_run = url_path_join(self.test_out, 'first_run')
        self.launch_task(self.test_first_run, run_with_validation_events=False)

        # Check that validation took place.
        self.check_validation_events()

        # Run again, with the validation events generated by the first run.
        self.test_second_run = url_path_join(self.test_out, 'second_run')
        self.launch_task(self.test_second_run)

        # Check that synthetic events were created.
        self.check_synthetic_events(self.test_second_run)

        # Run again, with the synthetic events generated by the second run.
        self.test_third_run = url_path_join(self.test_out, 'third_run')
        self.launch_task(self.test_third_run, extra_source=self.test_second_run)

        # Check that no events are output.
        self.check_no_synthetic_events(self.test_third_run)
    def setUp(self):
        """Copy the input data into place."""
        super(CourseListPartitionTaskAcceptanceTest, self).setUp()

        # Copy course list REST API data
        file_name = 'course_list.json'
        daily_partition = self.DATE.strftime(self.DAILY_PARTITION_FORMAT)
        self.upload_file(url_path_join(self.data_dir, 'input', file_name),
                         url_path_join(self.warehouse_path, 'course_list_raw', "dt=" + daily_partition, file_name))
 def setUp(self):
     super(InternalReportingUserCourseLoadAcceptanceTest, self).setUp()
     self.upload_file(
         os.path.join(self.data_dir, 'input', 'course_catalog.json'),
         url_path_join(self.warehouse_path, 'course_catalog_raw', 'dt=' + self.DATE, 'course_catalog.json')
     )
     self.upload_file(
         os.path.join(self.data_dir, 'input', 'programs.json'),
         url_path_join(self.warehouse_path, 'programs_raw', 'dt=' + self.DATE, 'programs.json')
     )
 def run_obfuscation_task(self):
     """Run ObfuscatedCourseTask."""
     self.task.launch([
         'ObfuscatedCourseTask',
         '--course', self.filename_safe_course_id,
         '--dump-root', self.dump_root,
         '--obfuscated-output-root', url_path_join(self.test_root, 'obfuscated-output'),
         '--format-version', self.FORMAT_VERSION,
         '--pipeline-version', self.PIPELINE_VERSION,
         '--auth-user-path', url_path_join(self.test_root, 'warehouse', 'auth_user'),
         '--auth-userprofile-path', url_path_join(self.test_root, 'warehouse', 'auth_userprofile')
     ])
 def test_answer_distribution(self):
     self.task.launch([
         'AnswerDistributionOneFilePerCourseTask',
         '--src', self.test_src,
         '--dest', url_path_join(self.test_root, 'dst'),
         '--name', 'test',
         '--output-root', self.test_out,
         '--include', '"*"',
         '--manifest', url_path_join(self.test_root, 'manifest.txt'),
         '--base-input-format', self.input_format,
         '--lib-jar', self.oddjob_jar,
         '--n-reduce-tasks', str(self.NUM_REDUCERS),
     ])
     self.validate_output()
    def validate_hive(self):
        """Ensure hive partition was created as expected."""
        table_name = 'course_blocks'
        output_dir = url_path_join(self.data_dir, 'output', table_name)
        for file_name in ('_SUCCESS', 'part-00000', 'part-00001'):
            actual_output_file = url_path_join(self.warehouse_path, table_name, self.partition, file_name)
            actual_output_target = get_target_for_local_server(actual_output_file)
            self.assertTrue(actual_output_target.exists(), '{} not created'.format(file_name))
            actual_output = actual_output_target.open('r').read()

            expected_output_file = url_path_join(output_dir, file_name)
            expected_output_target = get_target_for_local_server(expected_output_file)
            expected_output = expected_output_target.open('r').read()
            self.assertEqual(actual_output, expected_output)
    def test_answer_distribution_mysql(self):
        self.task.launch([
            'AnswerDistributionToMySQLTaskWorkflow',
            '--src', self.test_src,
            '--dest', url_path_join(self.test_root, 'dst'),
            '--name', 'test',
            '--include', '"*"',
            '--manifest', url_path_join(self.test_root, 'manifest.txt'),
            '--base-input-format', self.input_format,
            '--lib-jar', self.oddjob_jar,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--credentials', self.export_db.credentials_file_url,
        ])

        self.validate_output()
Example #10
    def __init__(self, *args, **kwargs):
        super(ObfuscatedCourseDumpTask, self).__init__(*args, **kwargs)

        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(self.course)
        dump_path = url_path_join(self.dump_root, filename_safe_course_id, 'state')
        auth_userprofile_targets = PathSetTask([dump_path], ['*auth_userprofile*']).output()
        # TODO: Refactor out this logic for finding the latest dump (see the sketch after this snippet).
        # Right now we expect the directory name to be a date, so we use that.
        dates = [target.path.rsplit('/', 2)[-2] for target in auth_userprofile_targets]
        # TODO: Make the date a parameter that defaults to the most recent, but allows the user to override it.
        # If no data is found, this should raise a clear error rather than a cryptic 'index out of range' error.
        if len(dates) == 0:
            raise Exception('Missing auth_userprofile data file in {}'.format(dump_path))
        latest_date = sorted(dates)[-1]
        self.data_directory = url_path_join(self.dump_root, filename_safe_course_id, 'state', latest_date)
        self.output_directory = url_path_join(self.output_root, filename_safe_course_id, 'state', latest_date)
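
The TODOs above suggest factoring out the latest-date lookup. One possible shape for such a helper, sketched with plain path strings rather than luigi targets (the function name and example paths are hypothetical):

def latest_dump_date(target_paths, dump_path):
    """Return the most recent date directory name found among the dump paths."""
    # Paths look like .../<course>/state/<YYYY-MM-DD>/<filename>, so the date is
    # the second-to-last path component.
    dates = sorted(path.rstrip('/').rsplit('/', 2)[-2] for path in target_paths)
    if not dates:
        raise ValueError('Missing auth_userprofile data file in {}'.format(dump_path))
    return dates[-1]

# Example:
# latest_dump_date(['s3://dump/course/state/2016-09-07/auth_userprofile.sql',
#                   's3://dump/course/state/2016-09-08/auth_userprofile.sql'],
#                  's3://dump/course/state')
# -> '2016-09-08'
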
    def insert_source_task(self):
        """
        We are already exporting Vertica tables to S3 using SqoopImportFromVertica through the
        VerticaSchemaToBigQueryTask workflow, so we specify an ExternalURL here instead. In the
        future we could change this to a SqoopImportFromVertica task.
        """
        partition_path_spec = HivePartition('dt', self.date).path_spec
        intermediate_warehouse_path = url_path_join(self.warehouse_path, 'import/vertica/sqoop/')
        url = url_path_join(intermediate_warehouse_path,
                            self.vertica_warehouse_name,
                            self.vertica_schema_name,
                            self.table_name,
                            partition_path_spec) + '/'

        return ExternalURL(url=url)
Example #12
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('course_list_raw', partition_value=self.partition_value),
             'course_list.json'
         )
     )
 def output_path_for_key(self, course_id):
     filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
     filename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
         course_id=filename_safe_course_id,
         dumpdate=self.dump_date,
     )
     return url_path_join(self.output_root, filename)
Example #14
    def test_event_log_exports_using_manifest(self):
        config_override = {
            'manifest': {
                'threshold': 1
            }
        }

        folders = {
            'edx': self.PROD_FOLDER,
            'edge': self.EDGE_FOLDER
        }
        for environment in ['edx', 'edge']:
            self.task.launch([
                'EventExportTask',
                '--source', as_list_param(url_path_join(self.test_src, environment)),
                '--output-root', self.test_out,
                '--config', self.test_config,
                '--environment', environment,
                '--interval', '2014-05',
                '--gpg-key-dir', self.test_gpg_key_dir,
                '--gpg-master-key', '*****@*****.**',
                '--required-path-text', folders[environment],
                '--n-reduce-tasks', str(self.NUM_REDUCERS),
            ], config_override)

        self.validate_output()
 def insert_source_task(self):
     # Get the columns to request from Sqoop, as a side effect of
     # getting the Vertica columns. The Vertica column names are quoted, so strip the quotes off.
     column_names = [name[1:-1] for (name, _) in self.columns]
     partition_path_spec = HivePartition('dt', self.date.isoformat()).path_spec
     destination = url_path_join(
         self.warehouse_path,
         self.warehouse_subdirectory,
         self.database,
         self.table_name,
         partition_path_spec
     ) + '/'
     # The arguments passed here to SqoopImportFromMysql should be the same as those used for BigQuery.
     # The old format used mysql_delimiters and direct mode. We have now removed direct mode,
     # which gives us more choices for other settings. We have already changed null_string and
     # field termination, and here we hardcode the replacement of delimiters (like newlines) with
     # spaces, using Sqoop's --hive-delims-replacement option. (A small illustration of why this
     # matters follows this method.)
     # We could also set other SqoopImportTask parameters: escaped_by, enclosed_by, optionally_enclosed_by.
     # If we wanted to model 'mysql_delimiters=True', we would set escaped-by: \ and optionally-enclosed-by: '.
     # But instead we use the defaults for them, so that there is no escaping or enclosing.
     return SqoopImportFromMysql(
         table_name=self.table_name,
         credentials=self.db_credentials,
         database=self.database,
         destination=destination,
         overwrite=self.overwrite,
         mysql_delimiters=False,
         fields_terminated_by=self.field_delimiter,
         null_string=self.null_marker,
         delimiter_replacement=' ',
         direct=False,
         columns=column_names,
     )
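
A small, standard-library-only illustration (with made-up data) of why the delimiter_replacement=' ' setting above matters: an embedded newline would otherwise split one record across two lines of the exported file.

field_delimiter = '\t'   # stand-in for self.field_delimiter
record = ['42', 'a free-text value\nwith an embedded newline', 'NULL']

broken = field_delimiter.join(record)
assert broken.count('\n') == 1   # would be read back as two partial rows

fixed = field_delimiter.join(value.replace('\n', ' ') for value in record)
assert fixed.count('\n') == 0    # one row, three cleanly delimited fields
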
    def validate_problem_response_report(self):
        """Run the ProblemResponseReportWorkflow task and test the output."""
        marker_path = url_path_join(self.test_out, 'marker-{}'.format(str(time.time())))
        report_date = self.DATE.strftime('%Y-%m-%d')

        # The test tracking.log file contains problem_check events for 2016-09-06, 09-07, and 09-08.
        # However, to test the interval parameter propagation, we deliberately exclude all but the 2016-09-07 events.
        #
        # This is important because this task can be run multiple times a day, and so must be configurable to have an
        # interval-end of "tomorrow", which will include all events from today.
        interval_start = '2016-09-07'
        interval_end = '2016-09-08'

        self.task.launch([
            'ProblemResponseReportWorkflow',
            '--interval-start', interval_start,
            '--interval-end', interval_end,
            '--date', report_date,
            '--marker', marker_path,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
        ])

        self.maxDiff = None
        self.validate_marker(marker_path)
        self.validate_hive()
        self.validate_reports()
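
A small sketch of the "interval-end of tomorrow" idea mentioned in the comment above: with a half-open [start, end) interval, setting the end to tomorrow's date includes every event stamped with today's date (the helper name is hypothetical).

import datetime

def interval_ending_tomorrow(start_date):
    """Build a [start, end) date interval whose end is tomorrow, so today's events are included."""
    tomorrow = datetime.date.today() + datetime.timedelta(days=1)
    return start_date.isoformat(), tomorrow.isoformat()

# Example: interval_ending_tomorrow(datetime.date(2016, 9, 7))
# -> ('2016-09-07', '<tomorrow's date>')
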
    def check_validation_events(self):
        """Confirm that validation data was properly created."""
        validate_output_dir = url_path_join(self.test_validate, str(self.END_DATE))
        outputs = self.get_targets_from_remote_path(validate_output_dir)

        # There are 2 courses in the test data.
        self.assertEqual(len(outputs), 2)
 def insert_source_task(self):
     hive_table = "user_activity_by_user"
     # User activity data for each day is stored in a dated directory.
     # We want to load all of that data into Vertica in one go, hence the wildcard ('*') here.
     # (A short illustration of the resulting URL follows this method.)
     url = url_path_join(self.warehouse_path, hive_table) + '/dt=*/'
     return ExternalURL(url=url)
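
A tiny illustration (with a made-up warehouse path) of the wildcard URL built above, which lets a single load pick up every dated partition of the Hive table:

warehouse_path = 's3://warehouse'   # made-up value for illustration
url = '/'.join([warehouse_path, 'user_activity_by_user']) + '/dt=*/'
assert url == 's3://warehouse/user_activity_by_user/dt=*/'
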
 def validate_hive(self):
     """Ensure hive partition was created."""
     hourly_partition = self.DATE.strftime(self.HOURLY_PARTITION_FORMAT)
     hive_partition = url_path_join(self.warehouse_path, "problem_response_location",
                                    "dt=" + hourly_partition)
     partition_target = get_target_for_local_server(hive_partition)
     self.assertTrue(partition_target.exists())
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('program_course_order', partition_value=self.date),
             '{0}.tsv'.format('program_course_order')
         )
     )
 def output(self):
     return get_target_from_url(url_path_join(
         self.output_root,
         'transaction',
         'dt=' + self.import_date.isoformat(),  # pylint: disable=no-member
         'transactions.csv'
     ))
Example #22
    def requires(self):
        yield ExternalURL(url=self.vertica_credentials)
        yield ExternalURL(url=self.gcp_credentials)

        if self.bigquery_dataset is None:
            self.bigquery_dataset = self.vertica_schema_name

        intermediate_warehouse_path = url_path_join(self.s3_warehouse_path, 'import/vertica/sqoop/')

        query = "SELECT table_name FROM all_tables WHERE schema_name='{schema_name}' AND table_type='TABLE' " \
                "".format(schema_name=self.vertica_schema_name)
        table_list = [row[0] for row in get_vertica_results(self.vertica_credentials, query)]

        for table_name in table_list:
            if not self.should_exclude_table(table_name):

                yield LoadVerticaTableToBigQuery(
                    date=self.date,
                    overwrite=self.overwrite,
                    intermediate_warehouse_path=intermediate_warehouse_path,
                    dataset_id=self.bigquery_dataset,
                    credentials=self.gcp_credentials,
                    max_bad_records=self.max_bad_records,
                    table_name=table_name,
                    vertica_schema_name=self.vertica_schema_name,
                    vertica_warehouse_name=self.vertica_warehouse_name,
                    vertica_credentials=self.vertica_credentials,
                    exclude=self.exclude,
                )
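
The loop above skips tables via should_exclude_table, which is not shown in this snippet. A plausible sketch of such a check, assuming exclude holds fnmatch-style patterns (an assumption, not taken from the source):

import fnmatch

def should_exclude_table(table_name, exclude_patterns):
    """Return True if the table name matches any of the exclusion patterns."""
    return any(fnmatch.fnmatch(table_name, pattern) for pattern in exclude_patterns)

# Example: should_exclude_table('tmp_scratch_table', ['tmp_*'])  -> True
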
Example #23
 def output(self):
     if len(self.input()['data']) == 0:
         raise IOError("Course File '{filename}' not found for course '{course}'".format(
             filename=self.file_pattern, course=self.course
         ))
     output_filename = os.path.basename(self.input()['data'][0].path)
     return get_target_from_url(url_path_join(self.output_directory, output_filename))
    def test_end_to_end_without_vertica(self):
        # Similar to test_end_to_end, but it skips the Vertica part and checks data values,
        # not just data shape.
        table_name = 'reconciled_order_transactions'
        output_root = url_path_join(
            self.warehouse_path, table_name, 'dt=' + self.UPPER_BOUND_DATE
        ) + '/'
        self.task.launch([
            'ReconcileOrdersAndTransactionsTask',
            '--import-date', self.UPPER_BOUND_DATE,
            '--n-reduce-tasks', str(self.NUM_REDUCERS),
            '--output-root', output_root,
        ])
        final_output_task = LoadInternalReportingOrderTransactionsToWarehouse(
            import_date=luigi.DateParameter().parse(self.UPPER_BOUND_DATE)
        )
        columns = [x[0] for x in final_output_task.columns]

        expected_output_csv = os.path.join(self.data_dir, 'output', 'expected_financial_report.csv')
        expected = pandas.read_csv(expected_output_csv, parse_dates=True)

        raw_output = self.read_dfs_directory(output_root)
        output = StringIO(raw_output.replace('\t\\N', '\t'))
        data = pandas.read_table(output, header=None, names=columns, parse_dates=True)
        # Re-order dataframe for consistent comparison:
        for frame in (data, expected):
            frame.sort(['payment_ref_id', 'transaction_type'], inplace=True, ascending=[True, False])
            frame.reset_index(drop=True, inplace=True)

        self.assert_data_frames_equal(data, expected)
Example #25
    def multi_output_reducer(self, key, values, output_file):
        """
        Write values to the appropriate file as determined by the key.
        Write to the encrypted file by streaming through gzip, which compresses before encrypting
        """
        _date_string, org_id = key
        recipients = self.recipients_for_org_id[org_id]
        log.info('Encryption recipients: %s', str(recipients))

        def report_progress(num_bytes):
            """Update hadoop counters as the file is written"""
            self.event_export_counter(counter_title='Bytes Written to Output', incr_value=num_bytes)

        key_file_targets = [get_target_from_url(url_path_join(self.gpg_key_dir, recipient)) for recipient in recipients]
        try:
            with make_encrypted_file(output_file, key_file_targets, progress=report_progress,
                                     hadoop_counter_incr_func=self.event_export_counter) as encrypted_output_file:
                outfile = gzip.GzipFile(mode='wb', fileobj=encrypted_output_file)
                try:
                    for value in values:
                        outfile.write(value.strip())
                        outfile.write('\n')
                        # WARNING: This line ensures that Hadoop knows that our process is not sitting in an infinite
                        # loop.  Do not remove it.
                        self.event_export_counter(counter_title='Raw Bytes Written', incr_value=(len(value) + 1))
                finally:
                    outfile.close()
        except IOError as err:
            log.error("Error encountered while encrypting and gzipping Organization: %s file: %s Exception: %s",
                      org_id, key_file_targets, err)
            # This counter is set when there is an error during the generation of the encryption file for an
            # organization for any reason, including encryption errors related to an expired GPG key.
            self.event_export_counter(counter_title="{} org with Errors".format(org_id), incr_value=1)
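
A self-contained, standard-library sketch of the gzip-then-write pattern used in the reducer above: GzipFile compresses into whatever file object it is handed, which here stands in for the encrypted output stream.

import gzip
import io

downstream = io.BytesIO()   # stands in for encrypted_output_file
outfile = gzip.GzipFile(mode='wb', fileobj=downstream)
try:
    for value in [b'first event', b'second event']:
        outfile.write(value.strip())
        outfile.write(b'\n')
finally:
    outfile.close()

compressed_bytes = downstream.getvalue()   # gzip stream, ready to be written out
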
Example #26
 def generate_file_list(self):
     """Yield each individual path given a source folder and a set of file-matching expressions."""
     for src in self.src:
         if src.startswith('s3'):
             # connect lazily as needed:
             if self.s3_conn is None:
                 self.s3_conn = ScalableS3Client().s3
             for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                 source = url_path_join(src, path)
                 yield ExternalURL(source)
         elif src.startswith('hdfs'):
             for source, size in luigi.contrib.hdfs.listdir(src, recursive=True, include_size=True):
                 if not self.include_zero_length and size == 0:
                     continue
                 elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                     yield ExternalURL(source)
         else:
             # Apply the include patterns to the relative path below the src directory.
             # TODO: honor include_zero_length here, to match the S3 and HDFS cases.
             # (A standalone sketch of this local branch follows this method.)
             for dirpath, _dirnames, files in os.walk(src):
                 for filename in files:
                     filepath = os.path.join(dirpath, filename)
                     relpath = os.path.relpath(filepath, src)
                     if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                         yield ExternalURL(filepath)
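
A minimal, runnable sketch of the local-filesystem branch above: walk a source directory and yield the files whose path relative to that directory matches any of the include patterns (the directory name in the example is hypothetical).

import fnmatch
import os

def matching_local_files(src, include):
    """Yield paths under src whose relative path matches an include pattern."""
    for dirpath, _dirnames, files in os.walk(src):
        for filename in files:
            filepath = os.path.join(dirpath, filename)
            relpath = os.path.relpath(filepath, src)
            if any(fnmatch.fnmatch(relpath, pattern) for pattern in include):
                yield filepath

# Example: list(matching_local_files('/tmp/tracking-logs', ['*tracking.log*']))
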
 def output(self):
     output_root = url_path_join(
         self.warehouse_path,
         self.partition_task.hive_table_task.table,
         self.partition.path_spec + '/'
     )
     return get_target_from_url(output_root, marker=True)
 def partition_location(self):
     """Provides location of Hive database table's partition data."""
     # The actual folder name where the data is stored is expected to be in the format <key>=<value>
     partition_name = '='.join(self.partition.items()[0])
     # Make sure that input path ends with a slash, to indicate a directory.
     # (This is necessary for S3 paths that are output from Hadoop jobs.)
     return url_path_join(self.table_location, partition_name + '/')
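
A tiny illustration of the <key>=<value> partition folder naming described above, using a plain dict in place of the task's partition object:

partition = {'dt': '2016-09-07'}
partition_name = '='.join(list(partition.items())[0])
assert partition_name == 'dt=2016-09-07'
# The partition's data then lives under <table_location>/dt=2016-09-07/
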
Example #29
    def upload_public_keys(self):
        gpg_key_dir = os.path.join('gpg-keys')
        for key_filename in os.listdir(gpg_key_dir):
            full_local_path = os.path.join(gpg_key_dir, key_filename)
            remote_url = url_path_join(self.test_gpg_key_dir, key_filename)

            if not key_filename.endswith('.key'):
                self.upload_file(full_local_path, remote_url)
Example #30
 def output_path_for_key(self, key):
     date_string = key
     return url_path_join(
         self.hive_partition_path('video_viewing_by_date', date_string),
         'video_viewing_{date}'.format(
             date=date_string,
         ),
     )
 def insert_source_task(self):
     hive_table = "internal_reporting_user_activity"
     partition_location = url_path_join(self.warehouse_path, hive_table,
                                        self.partition.path_spec) + '/'
     return ExternalURL(url=partition_location)
Example #32
 def prepare_database(self):
     sql_fixture_base_url = url_path_join(self.data_dir, 'input',
                                          'enterprise')
     for filename in os.listdir(sql_fixture_base_url):
         self.execute_sql_fixture_file(
             url_path_join(sql_fixture_base_url, filename))
Example #33
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('discovery_api_raw',
                                      partition_value=self.date),
             'programs.json'))
Example #34
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('program_course_order',
                                      partition_value=self.date),
             '{0}.tsv'.format('program_course_order')))
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path(self.table_name,
                                      partition_value=self.date),
             '{0}.tsv'.format(self.table_name)))
 def output_path_for_key(self, key):
     date_string = key
     return url_path_join(
         self.hive_partition_path('user_activity_by_user', date_string),
         'user_activity_{date}'.format(date=date_string, ))
Example #37
 def output(self):
     return get_target_from_url(
         url_path_join(self.output_root,
                       'temp/CountProgramCohortEnrollments/'))
 def output(self):
     output_name = u'answer_distribution_per_course_{name}/'.format(
         name=self.name)
     return get_target_from_url(url_path_join(self.dest, output_name))
Example #39
 def partition_location(self):
     """Returns the full URL of the partition. This allows data to be written to the partition by external systems"""
     return url_path_join(self.hive_table_task.table_location, self.partition.path_spec + '/')
Example #40
 def output(self):  # pragma: no cover
     output_root = url_path_join(self.warehouse_path,
                                 self.partition_task.hive_table_task.table,
                                 self.partition.path_spec + '/')
     return get_target_from_url(output_root, marker=True)
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "daily_registrations_enrollments_{0}.csv".format(self.name)))
    def requires(self):
        yield self.hive_table_task

        yield ExternalURL(
            url=url_path_join(self.warehouse_path, 'course_enrollment_summary',
                              self.partition.path_spec) + '/')
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.hive_partition_path('course_catalog_raw',
                                      partition_value=self.date),
             'course_catalog.json'))
Example #44
 def metadata_output(self):
     """Return target to which metadata about the task execution can be written."""
     return get_target_from_url(
         url_path_join(self.destination, METADATA_FILENAME))
Example #45
 def output_path_for_key(self, key):
     authoring_institution, program_uuid = key
     filename = u'{}__{}.csv'.format(self.report_name, self.date)
     return url_path_join(self.output_root, authoring_institution,
                          program_uuid, filename)
Example #46
 def table_location(self):
     """Provides root location of Hive database table's data."""
     return url_path_join(self.warehouse_path, self.table) + '/'
Example #47
 def output(self):
     return get_target_from_url(
         url_path_join(self.output_root, 'temp', 'CountCourseEnrollments/'))
Example #48
 def partition_location(self):
     """Provides location of Hive database table's partition data."""
     # Make sure that input path ends with a slash, to indicate a directory.
     # (This is necessary for S3 paths that are output from Hadoop jobs.)
     return url_path_join(self.table_location, self.partition.path_spec + '/')
Example #49
 def output_path_for_key(self, key):
     org_key, program_uuid = key
     filename = u'{}__{}.csv'.format(self.report_name, self.date)
     return url_path_join(self.output_root, org_key, program_uuid, filename)
Example #50
 def complete(self):
     if self.overwrite and not self.attempted_removal:
         return False
     else:
         return get_target_from_url(
             url_path_join(self.output_url(), '_SUCCESS')).exists()
Example #51
 def output(self):
     url_with_filename = url_path_join(self.destination, self.filename)
     return get_target_from_url(url_with_filename)
Example #52
 def output_path_for_key(self, key):
     date_string = key
     return url_path_join(
         self.hive_partition_path('last_ip_of_user_id', date_string),
         'last_ip_of_user_{date}'.format(date=date_string),
     )
Example #53
 def output(self):
     output_name = u'seq_open_dist_{name}/'.format(name=self.name)
     return get_target_from_url(url_path_join(self.dest, output_name))
 def insert_source_task(self):
     url = url_path_join(self.hive_partition_path('course_seat', self.date),
                         'course_seat.tsv')
     return ExternalURL(url=url)
Example #55
 def table_location(self):
     return url_path_join(self.destination, self.table_name)
 def output(self):
     return get_target_from_url(
         url_path_join(
             self.destination,
             "incremental_users_and_enrollments_{0}.csv".format(self.name)))
 def output(self):
     return get_target_from_url(
         url_path_join(self.output_root, 'event_type_distribution/'))
Example #58
 def marker_output(self):
     """Return target for _SUCCESS marker indicating the task was successfully completed."""
     return get_target_from_url(url_path_join(self.destination, "_SUCCESS"))
Example #59
 def complete(self):
     """
     The task is complete if the output_root/_SUCCESS file is present.
     """
     return get_target_from_url(url_path_join(self.output_root,
                                              '_SUCCESS')).exists()
 def output(self):
     output_name = u'problem_check_events_{name}/'.format(name=self.name)
     return get_target_from_url(url_path_join(self.dest, output_name))