def run(self):
     """Run the extraction step for this task's source configuration.

     Reads the configuration file named by ``self.sourcefile`` from
     ``WEB_SOURCES_CONFIG_DIR``, looks up the handler whose name
     ``Extractor.handler_name`` derives from that configuration, and calls
     ``extract()`` on an ``Extractor`` built from the paths of all input
     targets and ``TEMP_PATH``.
     """
     config_path = os.path.join(WEB_SOURCES_CONFIG_DIR, self.sourcefile)
     raw_config = Utils.read_file(config_path)

     handler_name = Extractor.handler_name(raw_config)
     extract_handler = HandlersFactory.get_handler(handler_name)

     # self.input() is iterated here, i.e. it is treated as a collection of
     # targets that each expose a ``path`` attribute.
     input_paths = [target.path for target in self.input()]

     extractor = Extractor(raw_config, input_paths, TEMP_PATH,
                           extract_handler)
     extractor.extract()
 def output(self):
     """Return the local targets this task is expected to produce.

     Reads the configuration file named by ``self.sourcefile`` from
     ``WEB_SOURCES_CONFIG_DIR``, builds an ``Extractor`` for it and wraps
     every path yielded by ``service.path(json_raw, TEMP_PATH)`` in a
     ``luigi.LocalTarget``.

     Returns:
         list: one ``luigi.LocalTarget`` per path reported by the extractor.
     """
     src_file = os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile))
     json_raw = Utils.read_file(src_file)
     handler = HandlersFactory.get_handler(Extractor.handler_name(json_raw))

     # run() iterates self.input() as a collection of targets, whereas this
     # method used to read ``.path`` directly from it, which raises
     # AttributeError when the upstream output is a list/tuple. Accept both
     # shapes: a collection becomes a list of paths (as in run()), a single
     # target keeps passing its lone path exactly as before.
     upstream = self.input()
     if isinstance(upstream, (list, tuple)):
         input_paths = [target.path for target in upstream]
     else:
         input_paths = upstream.path

     service = Extractor(json_raw, input_paths, TEMP_PATH, handler)
     return [
         luigi.LocalTarget(f) for f in service.path(json_raw, TEMP_PATH)
     ]
예제 #3
0
 def test_kato_file_extract(self):
     """Download the source described by 'web_statgov_kato.json', extract
     it, and check that every path returned by the extractor exists."""
     config_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                                'web_statgov_kato.json')
     raw_config = Utils.read_file(config_path)

     # Download step: the handler is chosen from the source configuration.
     downloader = Downloader(
         raw_config,
         HandlersFactory.get_handler(Downloader.handler_name(raw_config)))
     downloaded_path = downloader.download()

     # Extract step, fed with whatever the download step returned.
     # NOTE(review): Extractor receives three arguments here (no temp path),
     # unlike other call sites that pass four -- confirm the signature.
     extractor = Extractor(
         raw_config,
         downloaded_path,
         HandlersFactory.get_handler(Extractor.handler_name(raw_config)))
     extracted_paths = extractor.extract()

     self.assertTrue(Utils.all_exists(extracted_paths))
예제 #4
0
 def test_companies_files_extract(self):
     """Download the source described by 'web_statgov_companies.json',
     extract it with TEMP_PATH, and check that every returned path exists."""
     config_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                                'web_statgov_companies.json')
     raw_config = Utils.read_file(config_path)

     # Download step: the handler is chosen from the source configuration.
     downloader = Downloader(
         raw_config,
         HandlersFactory.get_handler(Downloader.handler_name(raw_config)))
     downloaded = downloader.download()

     # Extract step, fed with whatever the download step returned.
     extractor = Extractor(
         raw_config,
         downloaded,
         TEMP_PATH,
         HandlersFactory.get_handler(Extractor.handler_name(raw_config)))
     extracted_paths = extractor.extract()

     self.assertTrue(Utils.all_exists(extracted_paths))
 def test_companies_parse_to_csv(self):
     """Run download -> extract -> parse for 'web_statgov_companies.json'
     with the 'to_csv.json' job, then check that the reported CSV path
     exists and that ``parse()`` returned a value greater than zero."""
     source_conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_companies.json'))
     job_conf = Utils.read_file(os.path.join(JOBS_CONFIG_DIR, 'to_csv.json'))

     # Download step.
     downloader = Downloader(
         source_conf,
         HandlersFactory.get_handler(Downloader.handler_name(source_conf)))
     downloaded = downloader.download()

     # Extract step; the extracted paths are then obtained via path().
     extractor = Extractor(
         source_conf,
         downloaded,
         self.temp_path,
         HandlersFactory.get_handler(Extractor.handler_name(source_conf)))
     extractor.extract()
     xls_paths = extractor.path(source_conf, self.temp_path)

     # Parse step; the handler depends on both source and job configuration.
     parser = XLSParser(
         source_conf,
         job_conf,
         xls_paths,
         self.data_path,
         HandlersFactory.get_handler(
             XLSParser.handler_name(source_conf, job_conf)))
     # The output path is queried before parse() runs, as in the original.
     csv_path = parser.path(source_conf, job_conf, self.data_path)
     parsed_rows = parser.parse()

     self.assertTrue(os.path.exists(csv_path))
     self.assertGreater(parsed_rows, 0)