def run(self):
    """Run the extractor over every input target of this task.

    Reads the source config named by ``self.sourcefile`` from
    ``WEB_SOURCES_CONFIG_DIR``, asks ``HandlersFactory`` for the handler
    that ``Extractor.handler_name`` selects for that config, and calls
    ``Extractor.extract()`` with the paths of all ``self.input()`` targets
    and ``TEMP_PATH``.
    """
    config_path = os.path.join(WEB_SOURCES_CONFIG_DIR, self.sourcefile)
    raw_config = Utils.read_file(config_path)

    handler_name = Extractor.handler_name(raw_config)
    extract_handler = HandlersFactory.get_handler(handler_name)

    # self.input() is iterated here, so each item must expose ``.path``.
    input_paths = [target.path for target in self.input()]

    extractor = Extractor(raw_config, input_paths, TEMP_PATH, extract_handler)
    extractor.extract()
def output(self):
    """Return a ``luigi.LocalTarget`` for each path the extractor reports.

    Reads the source config named by ``self.sourcefile`` from
    ``WEB_SOURCES_CONFIG_DIR``, builds an ``Extractor`` with the matching
    handler, and wraps every entry of ``Extractor.path(config, TEMP_PATH)``
    in a ``luigi.LocalTarget``.
    """
    config_path = os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile))
    raw_config = Utils.read_file(config_path)

    handler_name = Extractor.handler_name(raw_config)
    extract_handler = HandlersFactory.get_handler(handler_name)

    # NOTE(review): self.input() is used as a single target here (``.path``),
    # whereas the ``run`` method visible in this file iterates it -- confirm
    # which shape ``requires()`` actually produces.
    extractor = Extractor(raw_config, self.input().path, TEMP_PATH,
                          extract_handler)

    targets = []
    for extracted_path in extractor.path(raw_config, TEMP_PATH):
        targets.append(luigi.LocalTarget(extracted_path))
    return targets
def test_kato_file_extract(self):
    """Download with the ``web_statgov_kato.json`` config, then extract.

    Passes when ``Utils.all_exists`` is truthy for the value returned by
    ``Extractor.extract()``.
    """
    config_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                               'web_statgov_kato.json')
    raw_config = Utils.read_file(config_path)

    # Download step.
    downloader = Downloader(
        raw_config,
        HandlersFactory.get_handler(Downloader.handler_name(raw_config)),
    )
    downloaded_path = downloader.download()

    # Extract step.
    # NOTE(review): Extractor receives three positional arguments here, while
    # the other call sites in this file also pass a temp directory before the
    # handler -- confirm against Extractor's current signature.
    extractor = Extractor(
        raw_config,
        downloaded_path,
        HandlersFactory.get_handler(Extractor.handler_name(raw_config)),
    )
    extracted_paths = extractor.extract()

    self.assertTrue(Utils.all_exists(extracted_paths))
def test_companies_files_extract(self):
    """Download with the ``web_statgov_companies.json`` config, then extract.

    The extractor is given the download result and ``TEMP_PATH``; the test
    passes when ``Utils.all_exists`` is truthy for what ``extract()`` returns.
    """
    config_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                               'web_statgov_companies.json')
    raw_config = Utils.read_file(config_path)

    # Download step.
    downloader = Downloader(
        raw_config,
        HandlersFactory.get_handler(Downloader.handler_name(raw_config)),
    )
    downloaded = downloader.download()

    # Extract step.
    extractor = Extractor(
        raw_config,
        downloaded,
        TEMP_PATH,
        HandlersFactory.get_handler(Extractor.handler_name(raw_config)),
    )
    extracted_paths = extractor.extract()

    self.assertTrue(Utils.all_exists(extracted_paths))
def test_companies_parse_to_csv(self):
    """Run download -> extract -> parse for the companies source.

    Uses the ``web_statgov_companies.json`` source config and the
    ``to_csv.json`` job config. Passes when the path reported by
    ``XLSParser.path`` exists on disk and ``XLSParser.parse()`` returns a
    value greater than zero.
    """
    source_conf_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                                    'web_statgov_companies.json')
    job_conf_path = os.path.join(JOBS_CONFIG_DIR, 'to_csv.json')
    source_conf = Utils.read_file(source_conf_path)
    job_conf = Utils.read_file(job_conf_path)

    # Download step.
    downloader = Downloader(
        source_conf,
        HandlersFactory.get_handler(Downloader.handler_name(source_conf)),
    )
    downloaded = downloader.download()

    # Extract step; the extracted paths are queried from the extractor
    # afterwards rather than taken from extract()'s return value.
    extractor = Extractor(
        source_conf,
        downloaded,
        self.temp_path,
        HandlersFactory.get_handler(Extractor.handler_name(source_conf)),
    )
    extractor.extract()
    extracted_paths = extractor.path(source_conf, self.temp_path)

    # Parse step.
    parser = XLSParser(
        source_conf,
        job_conf,
        extracted_paths,
        self.data_path,
        HandlersFactory.get_handler(
            XLSParser.handler_name(source_conf, job_conf)),
    )
    csv_path = parser.path(source_conf, job_conf, self.data_path)
    parsed_rows = parser.parse()

    self.assertTrue(os.path.exists(csv_path))
    self.assertGreater(parsed_rows, 0)