def main():
    # Verbose logging makes errors raised during extension loading visible.
    logging.basicConfig(level=logging.DEBUG)

    # Register every task advertised under the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Register the override configuration file with Luigi, if it exists.
    configuration = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('Configuration file %s does not exist', OVERRIDE_CONFIGURATION_FILE)
    else:
        log.debug('Using %s', OVERRIDE_CONFIGURATION_FILE)
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Dependencies shipped to the Hadoop nodes:
    # - boto: all direct interactions with s3.
    # - cjson: all parsing of event logs.
    # - filechunkio: multipart uploads of large files to s3.
    # - opaque_keys: interpreting serialized course_ids
    #   (bson and stevedore are dependencies of opaque_keys).
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi via the default builder, under the profiler when requested
    # through the WORKFLOW_PROFILER* environment variables.
    profiler_name = os.getenv('WORKFLOW_PROFILER', '')
    profiler_path = os.getenv('WORKFLOW_PROFILER_PATH', '')
    with profile_if_necessary(profiler_name, profiler_path):
        luigi.run()
def main():
    # Uncomment to surface errors raised while extensions load:
    # logging.basicConfig(level=logging.DEBUG)

    # Register every task advertised under the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Register the override configuration file with Luigi if present,
    # dumping its contents to the debug log for traceability.
    configuration = luigi.configuration.get_config()
    if not os.path.exists(OVERRIDE_CONFIGURATION_FILE):
        log.debug('override.cfg does not exist')
    else:
        log.debug('Using override.cfg')
        with open(OVERRIDE_CONFIGURATION_FILE, 'r') as override_file:
            log.debug(override_file.read())
        configuration.add_config_path(OVERRIDE_CONFIGURATION_FILE)

    # Dependencies shipped to the Hadoop nodes:
    # - boto: all direct interactions with s3.
    # - cjson: all parsing of event logs.
    # - filechunkio: multipart uploads of large files to s3.
    # - opaque_keys: interpreting serialized course_ids
    #   (bson and stevedore are dependencies of opaque_keys).
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder.
    luigi.run()
def _attach_config_path(configuration, path, description):
    """Add ``path`` to Luigi's configuration search path if the file exists.

    Logs at debug level either way.  ``description`` names the kind of file
    being loaded (e.g. 'override configuration') so the success message reads
    the same as before this logic was factored out of ``main``.
    """
    if os.path.exists(path):
        log.debug("Loading %s '%s'...", description, path)
        configuration.add_config_path(path)
    else:
        log.debug("Configuration file '%s' does not exist!", path)


def main():
    """Parse pipeline arguments, configure Luigi, and launch the workflow."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--additional-config',
        help='additional configuration file to be loaded after default/override',
        default=None,
        action='append',
    )
    arguments, _extra_args = parser.parse_known_args()

    # We get a cleaned command-line arguments list, free of the arguments *we* care
    # about, since Luigi will throw errors when it sees arguments that it or the
    # workflow didn't specify.  We pass these in when invoking Luigi.
    cmdline_args = get_cleaned_command_line_args()

    # Show errors raised during extension loading.
    logging.basicConfig(level=logging.DEBUG)

    # Load tasks configured using entry_points.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Load the override configuration if it exists, then any additional
    # configuration files passed on the command line, in order.
    configuration = luigi.configuration.get_config()
    _attach_config_path(configuration, OVERRIDE_CONFIGURATION_FILE, 'override configuration')
    if arguments.additional_config is not None:
        for additional_config in arguments.additional_config:
            _attach_config_path(configuration, additional_config, 'additional configuration file')

    # Tell luigi what dependencies to pass to the Hadoop nodes:
    # - edx.analytics.tasks is used to load the pipeline code, since we cannot
    #   trust all will be loaded automatically.
    # - boto is used for all direct interactions with s3.
    # - cjson is used for all parsing event logs.
    # - filechunkio is used for multipart uploads of large files to s3.
    # - opaque_keys is used to interpret serialized course_ids
    #   (extension: ccx_keys; dependencies: bson, stevedore).
    luigi.hadoop.attach(edx.analytics.tasks)
    luigi.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, ciso8601, requests)
    if configuration.getboolean('ccx', 'enabled', default=False):
        # ccx_keys is optional, so only import (and ship) it when CCX support
        # is enabled in configuration.
        import ccx_keys
        luigi.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi using the default builder, profiling when requested through
    # the WORKFLOW_PROFILER* environment variables.
    with profile_if_necessary(os.getenv('WORKFLOW_PROFILER', ''), os.getenv('WORKFLOW_PROFILER_PATH', '')):
        luigi.run(cmdline_args)
def test_extract_user_metrics(mocker):
    """The metrics extracted from the sample profile contain two entries under 'root'."""
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')

    # Feed the task a fixed local file instead of its real upstream input.
    mocker.patch('luigi.Task.input', return_value=luigi.LocalTarget("data/user_profile.json"))

    luigi.build([ExtractUserMetricsMock(file_number=0)], local_scheduler=True, no_lock=True, workers=1)

    output = json.loads(MockTarget.fs.get_data('/tmp/a.txt'))
    assert len(output.get("root")) == 2
def test_fetch_user_list(requests_mock):
    """Fetching the user list makes exactly one HTTP call and stores ten users."""
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')

    # Serve the canned publication response for any medium.com URL.
    with open('data/publication_response.json') as input_file:
        medium_url = re.compile("https://medium.com/*.*")
        requests_mock.register_uri(method='GET', url=medium_url, text=input_file.read())
        luigi.build([FetchUserListMock()], local_scheduler=True, no_lock=True, workers=1)

    assert requests_mock.call_count == 1
    users = json.loads(MockTarget.fs.get_data('/tmp/a.txt'))
    assert len(users) == 10
def test_fetch_user_profiles(mocker, requests_mock):
    """Fetching profiles issues one HTTP call per user (8) and stores all of them."""
    MockTarget.fs.clear()
    add_config_path('testconfig/luigi.conf')

    with open('data/user_profile_response.json') as input_file:
        # Replace the task's upstream input with a fixed local user list, and
        # serve the canned profile response for any medium.com URL.
        mocker.patch('luigi.Task.input', return_value=luigi.LocalTarget("data/user_list.json"))
        medium_url = re.compile("https://medium.com/*.*")
        requests_mock.register_uri(method='GET', url=medium_url, text=input_file.read())
        luigi.build([FetchUserProfileMock(file_number=0)], local_scheduler=True, no_lock=True, workers=1)

    assert requests_mock.call_count == 8
    profiles = json.loads(MockTarget.fs.get_data('/tmp/a.txt'))
    assert len(profiles.get("root")) == 8
def main():
    """Parse pipeline-specific arguments, configure Luigi, and launch the workflow."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--additional-config',
        help='additional configuration file to be loaded after default/override',
        default=None,
        action='append'
    )
    arguments, _extra_args = parser.parse_known_args()

    # Luigi rejects command-line options that neither it nor the workflow
    # declared, so strip out the ones handled here before handing the
    # remainder over to it.
    cmdline_args = get_cleaned_command_line_args()

    # Surface errors raised while extensions are loading.
    logging.basicConfig(level=logging.DEBUG)

    # Register tasks advertised through the 'edx.analytics.tasks' entry point.
    # TODO: launch tasks by their entry_point name
    stevedore.ExtensionManager('edx.analytics.tasks')

    # Build the ordered list of candidate configuration files (the override
    # file first, then any --additional-config paths) and register each one
    # that exists with Luigi's configuration.
    configuration = luigi.configuration.get_config()
    candidates = [(OVERRIDE_CONFIGURATION_FILE, "Loading override configuration '%s'...")]
    if arguments.additional_config is not None:
        for extra_path in arguments.additional_config:
            candidates.append((extra_path, "Loading additional configuration file '%s'..."))
    for config_path, found_message in candidates:
        if os.path.exists(config_path):
            log.debug(found_message, config_path)
            configuration.add_config_path(config_path)
        else:
            log.debug("Configuration file '%s' does not exist!", config_path)

    # Dependencies shipped to the Hadoop nodes:
    # - edx.analytics.tasks: the pipeline code itself, since we cannot trust
    #   all of it will be loaded automatically.
    # - boto: all direct interactions with s3.
    # - cjson: all parsing of event logs.
    # - filechunkio: multipart uploads of large files to s3.
    # - opaque_keys: interpreting serialized course_ids
    #   (extensions: ccx_keys; dependencies: bson, stevedore, six).
    # - requests plus its dependencies: chardet, urllib3, certifi, idna.
    luigi.contrib.hadoop.attach(edx.analytics.tasks)
    luigi.contrib.hadoop.attach(boto, cjson, filechunkio, opaque_keys, bson, stevedore, six,
                                ciso8601, chardet, urllib3, certifi, idna, requests)
    if configuration.getboolean('ccx', 'enabled', default=False):
        import ccx_keys
        luigi.contrib.hadoop.attach(ccx_keys)

    # TODO: setup logging for tasks or configured logging mechanism

    # Launch Luigi via the default builder, under the profiler when requested
    # through the WORKFLOW_PROFILER* environment variables.
    with profile_if_necessary(os.getenv('WORKFLOW_PROFILER', ''), os.getenv('WORKFLOW_PROFILER_PATH', '')):
        luigi.retcodes.run_with_retcodes(cmdline_args)
def test_add_without_install(self):
    """add_config_path must raise ImportError when the TOML parser is disabled.

    Fix: the original restored ``LuigiTomlParser.enabled`` only on the success
    path, so a failed assertion leaked the disabled flag into every subsequent
    test.  The restore now happens in a ``finally`` block.
    """
    saved_enabled = LuigiTomlParser.enabled
    LuigiTomlParser.enabled = False
    try:
        with self.assertRaises(ImportError):
            add_config_path('test/testconfig/luigi.toml')
    finally:
        # Restore the class-level flag even if the assertion fails, so later
        # tests see the parser in its original state.
        LuigiTomlParser.enabled = saved_enabled
def setUpClass(cls):
    """Register the shared TOML configuration files once for the whole class."""
    for config_file in ('test/testconfig/luigi.toml', 'test/testconfig/luigi_local.toml'):
        add_config_path(config_file)