def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # Connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: honor include_zero_length here to match the S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)
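For reference, a minimal sketch of how such a generator is typically consumed from a luigi task's requires(); everything here other than generate_file_list() itself is an illustrative assumption, not part of the snippet above.

# Hypothetical sketch: treat every matched path as an external dependency of the
# enclosing luigi task, assuming the task defines generate_file_list() as above.
def requires(self):
    for url_target in self.generate_file_list():
        yield url_target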
Example #2
def get_target_class_from_url(url, marker=False):
    """Returns a luigi target class based on the url scheme"""
    parsed_url = urlparse.urlparse(url)

    if marker:
        target_class = URL_SCHEME_TO_MARKER_TARGET_CLASS.get(parsed_url.scheme, DEFAULT_MARKER_TARGET_CLASS)
    else:
        target_class = URL_SCHEME_TO_TARGET_CLASS.get(parsed_url.scheme, DEFAULT_TARGET_CLASS)

    kwargs = {}
    if issubclass(target_class, HdfsTarget) and url.endswith('/'):
        kwargs['format'] = hdfs_format.PlainDir
    if issubclass(target_class, luigi.LocalTarget) or parsed_url.scheme == 'hdfs':
        # LocalTarget and HdfsTarget both expect bare paths, without any scheme or netloc,
        # so strip everything else off the URL and pass just the path to the target.
        url = parsed_url.path
    if issubclass(target_class, S3Target):
        kwargs['client'] = ScalableS3Client()
        kwargs['policy'] = DEFAULT_KEY_ACCESS_POLICY

    url = url.rstrip('/')
    args = (url, )

    return target_class, args, kwargs
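A minimal usage sketch, assuming the scheme maps and target classes referenced above are importable; the wrapper name and example URL are hypothetical.

# Hypothetical sketch: turn the returned (class, args, kwargs) triple into a concrete target.
def get_target_from_url(url, marker=False):
    target_class, args, kwargs = get_target_class_from_url(url, marker=marker)
    return target_class(*args, **kwargs)

report_target = get_target_from_url('s3://example-bucket/reports/report.csv')  # made-up URL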
Example #3

def _get_s3_urls(self, source):
    """Recursively list all files inside the source URL directory."""
    s3_conn = ScalableS3Client().s3
    bucket_name, root = get_s3_bucket_key_names(source)
    bucket = s3_conn.get_bucket(bucket_name)
    for key_metadata in bucket.list(root):
        if key_metadata.size > 0:
            key_path = key_metadata.key[len(root):].lstrip('/')
            yield url_path_join(source, key_path)
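For illustration, a hedged call sketch; the bucket and prefix are made up, and the call is assumed to happen inside another method of the same class.

# Hypothetical sketch: print every non-empty key under a made-up prefix as a full URL.
for url in self._get_s3_urls('s3://example-bucket/logs/tracking'):
    print(url)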
Example #4
def when_s3_available(function):
    """Decorator that skips the wrapped test unless a working S3 connection is available."""
    s3_available = getattr(when_s3_available, 's3_available', None)
    if s3_available is None:
        try:
            connection = ScalableS3Client().s3
            # ^ Merely constructing the client will not fail as long as AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
            # are set, even if their values are invalid, so it can't be used to verify a working S3 connection. Instead:
            connection.get_all_buckets()
        except (boto.exception.S3ResponseError,
                boto.exception.NoAuthHandlerFound):
            s3_available = False
        else:
            s3_available = True
        finally:
            when_s3_available.s3_available = s3_available  # Cache result to avoid having to compute it again
    return unittest.skipIf(not s3_available, 'S3 is not available')(function)
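Typical decorator usage, sketched on a hypothetical test case:

# Hypothetical sketch: the decorated test runs only when an S3 connection actually works.
class ExampleS3AvailabilityTest(unittest.TestCase):

    @when_s3_available
    def test_can_list_buckets(self):
        connection = ScalableS3Client().s3
        self.assertIsNotNone(connection.get_all_buckets())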
Example #5

    def validate_exporter_output(self, org_id, exported_filename):
        """
        Preconditions: A complete data package has been uploaded to S3.
        External Effect: Downloads the complete data package, decompresses it, decrypts it and then compares it to the
            static expected output ignoring the ordering of the records in both files.

        Downloads s3://<exporter_output_bucket>/<output_prefix><org_id>-<year>-<month>-<day>.zip to <temporary_dir>/work/validation/.

        """
        today = datetime.datetime.utcnow().strftime('%Y-%m-%d')
        bucket = ScalableS3Client().s3.get_bucket(
            self.config.get('exporter_output_bucket'))
        export_id = '{org}-{date}'.format(org=org_id, date=today)
        filename = export_id + '.zip'
        key = bucket.lookup(self.output_prefix + filename)
        if key is None:
            self.fail(
                'Expected output from legacy exporter not found. Url = s3://{bucket}/{pre}{filename}'
                .format(bucket=self.config.get('exporter_output_bucket'),
                        pre=self.output_prefix,
                        filename=filename))
        exporter_archive_path = os.path.join(self.validation_dir, filename)
        key.get_contents_to_filename(exporter_archive_path)

        shell.run(['unzip', exporter_archive_path, '-d', self.validation_dir])

        gpg = gnupg.GPG(gnupghome=self.gpg_dir)
        with open(os.path.join('gpg-keys', 'insecure_secret.key'),
                  'r') as key_file:
            gpg.import_keys(key_file.read())

        exported_file_path = os.path.join(self.validation_dir,
                                          exported_filename)
        with open(
                os.path.join(self.validation_dir, export_id,
                             exported_filename + '.gpg'),
                'r') as encrypted_file:
            gpg.decrypt_file(encrypted_file, output=exported_file_path)

        sorted_filename = exported_file_path + '.sorted'
        shell.run(['sort', '-o', sorted_filename, exported_file_path])

        expected_output_path = os.path.join(self.data_dir, 'output',
                                            exported_filename + '.sorted')
        shell.run(['diff', sorted_filename, expected_output_path])
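A hedged example of how a test method might call this helper; the org id and filename are placeholders rather than the fixtures used by the real acceptance tests.

    # Hypothetical sketch: validate today's uploaded package for one made-up org.
    def test_org_export(self):
        self.validate_exporter_output('AcceptanceX', 'AcceptanceX_data.csv')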
Example #6
    def setUp(self):
        try:
            self.s3_client = ScalableS3Client()
        except Exception:
            # S3 may not be configured in this environment (e.g. no credentials); run without a client.
            self.s3_client = None

        self.config = get_test_config()

        for env_var in ('TASKS_REPO', 'TASKS_BRANCH', 'IDENTIFIER',
                        'JOB_FLOW_NAME', 'IS_REMOTE'):
            if env_var in os.environ:
                self.config[env_var.lower()] = os.environ[env_var]

        if 'is_remote' in self.config:
            self.config['is_remote'] = self.config['is_remote'].lower() not in ('0', 'false', 'f')
        else:
            self.config['is_remote'] = True

        if self.config['is_remote']:
            # The name of an existing job flow to run the test on
            assert 'job_flow_name' in self.config or 'host' in self.config
            # The git URL of the pipeline repository to check this code out from.
            assert 'tasks_repo' in self.config
            # The branch of the pipeline repository to test. Note this can differ from the branch that is currently
            # checked out and running this code.
            assert 'tasks_branch' in self.config
            # Where to store logs generated by the pipeline.
            assert 'tasks_log_path' in self.config
            # The user to connect to the job flow over SSH with.
            assert 'connection_user' in self.config

        # Where the pipeline should output data; this should be a URL pointing to a directory.
        assert 'tasks_output_url' in self.config
        # Allow for parallel execution of the test by specifying a different identifier. Using an identical identifier
        # allows for old virtualenvs to be reused etc., which is why a random one is not simply generated with each run.
        assert 'identifier' in self.config
        # A URL to a JSON file that contains most of the connection information for the MySQL database.
        assert 'credentials_file_url' in self.config
        # A URL to a build of the oddjob third party library
        assert 'oddjob_jar' in self.config
        # A URL to a maxmind compatible geolocation database file
        assert 'geolocation_data' in self.config

        self.data_dir = os.path.join(os.path.dirname(__file__), 'fixtures')

        url = self.config['tasks_output_url']
        m = hashlib.md5()
        m.update(self.config['identifier'])
        self.identifier = m.hexdigest()
        self.test_root = url_path_join(url, self.identifier,
                                       self.__class__.__name__)

        self.test_src = url_path_join(self.test_root, 'src')
        self.test_out = url_path_join(self.test_root, 'out')

        # Use a local dir for devstack testing, or s3 for production testing.
        self.report_output_root = self.config.get(
            'report_output_root', url_path_join(self.test_out, 'reports'))

        self.catalog_path = 'http://acceptance.test/api/courses/v2'
        database_name = 'test_' + self.identifier
        schema = 'test_' + self.identifier
        import_database_name = 'acceptance_import_' + database_name
        export_database_name = 'acceptance_export_' + database_name
        otto_database_name = 'acceptance_otto_' + database_name
        elasticsearch_alias = 'alias_test_' + self.identifier
        self.warehouse_path = url_path_join(self.test_root, 'warehouse')
        self.edx_rest_api_cache_root = url_path_join(self.test_src,
                                                     'edx-rest-api-cache')
        task_config_override = {
            'hive': {
                'database': database_name,
                'warehouse_path': self.warehouse_path
            },
            'map-reduce': {
                'marker': url_path_join(self.test_root, 'marker')
            },
            'manifest': {
                'path': url_path_join(self.test_root, 'manifest'),
                'lib_jar': self.config['oddjob_jar'],
            },
            'database-import': {
                'credentials': self.config['credentials_file_url'],
                'destination': self.warehouse_path,
                'database': import_database_name
            },
            'database-export': {
                'credentials': self.config['credentials_file_url'],
                'database': export_database_name
            },
            'otto-database-import': {
                'credentials': self.config['credentials_file_url'],
                'database': otto_database_name
            },
            'course-catalog': {
                'catalog_path': self.catalog_path
            },
            'geolocation': {
                'geolocation_data': self.config['geolocation_data']
            },
            'event-logs': {
                'source': as_list_param(self.test_src, escape_quotes=False),
                'pattern': as_list_param(".*tracking.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
            },
            'segment-logs': {
                'source': as_list_param(self.test_src, escape_quotes=False),
                'pattern': as_list_param(".*segment.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
            },
            'course-structure': {
                'api_root_url': 'acceptance.test',
                'access_token': 'acceptance'
            },
            'module-engagement': {
                'alias': elasticsearch_alias
            },
            'elasticsearch': {},
            'problem-response': {
                'report_fields': (
                    '["username","problem_id","answer_id","location","question","score","max_score",'
                    '"correct","answer","total_attempts","first_attempt_date","last_attempt_date"]'
                ),
                'report_field_list_delimiter': '"|"',
                'report_field_datetime_format': '%Y-%m-%dT%H:%M:%SZ',
                'report_output_root': self.report_output_root,
                'partition_format': '%Y-%m-%dT%H',
            },
            'edx-rest-api': {
                'client_id': 'oauth_id',
                'client_secret': 'oauth_secret',
                'oauth_username': '******',
                'oauth_password': '******',
                'auth_url': 'http://acceptance.test',
            },
            'course-blocks': {
                'api_root_url': 'http://acceptance.test/api/courses/v1/blocks/',
            },
            'course-list': {
                'api_root_url': 'http://acceptance.test/api/courses/v1/courses/',
            },
        }

        if 'elasticsearch_host' in self.config:
            task_config_override['elasticsearch']['host'] = as_list_param(
                self.config['elasticsearch_host'], escape_quotes=False)
        if 'elasticsearch_connection_class' in self.config:
            task_config_override['elasticsearch']['connection_type'] = self.config['elasticsearch_connection_class']
        if 'manifest_input_format' in self.config:
            task_config_override['manifest']['input_format'] = self.config['manifest_input_format']
        if 'hive_version' in self.config:
            task_config_override['hive']['version'] = self.config['hive_version']

        log.info('Running test: %s', self.id())
        log.info('Using executor: %s', self.config['identifier'])
        log.info('Generated Test Identifier: %s', self.identifier)

        self.import_db = db.DatabaseService(self.config, import_database_name)
        self.export_db = db.DatabaseService(self.config, export_database_name)
        self.otto_db = db.DatabaseService(self.config, otto_database_name)
        self.task = task.TaskService(self.config, task_config_override,
                                     self.identifier)
        self.hive = hive.HiveService(self.task, self.config, database_name)
        self.elasticsearch = elasticsearch_service.ElasticsearchService(
            self.config, elasticsearch_alias)

        self.reset_external_state()

        max_diff = os.getenv('MAX_DIFF', None)
        if max_diff is not None:
            if max_diff.lower() == "infinite":
                self.maxDiff = None
            else:
                self.maxDiff = int(max_diff)