示例#1
0
 def test_kato_file_extract(self):
     """KATO source: download its archive, extract it, verify files exist."""
     conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_kato.json'))
     downloader = Downloader(
         conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
     archive_path = downloader.download()
     extractor = Extractor(
         conf, archive_path,
         HandlersFactory.get_handler(Extractor.handler_name(conf)))
     extracted = extractor.extract()
     self.assertTrue(Utils.all_exists(extracted))
示例#2
0
 def test_companies_files_extract(self):
     """Companies source: download its archives, extract into TEMP_PATH."""
     conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_companies.json'))
     downloader = Downloader(
         conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
     archives = downloader.download()
     extractor = Extractor(
         conf, archives, TEMP_PATH,
         HandlersFactory.get_handler(Extractor.handler_name(conf)))
     extracted = extractor.extract()
     self.assertTrue(Utils.all_exists(extracted))
 def run(self):
     """Extract this task's downloaded input files into TEMP_PATH."""
     # str() for parity with output() below and the other run() in this
     # file: a luigi parameter is not guaranteed to be a plain str, and
     # os.path.join would raise on a non-str component.
     src_file = os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile))
     json_raw = Utils.read_file(src_file)
     handler = HandlersFactory.get_handler(Extractor.handler_name(json_raw))
     service = Extractor(json_raw, [lt.path for lt in self.input()],
                         TEMP_PATH, handler)
     service.extract()
 def output(self):
     """Declare the files the extractor will produce as luigi targets."""
     conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile)))
     handler = HandlersFactory.get_handler(Extractor.handler_name(conf))
     service = Extractor(conf, self.input().path, TEMP_PATH, handler)
     targets = service.path(conf, TEMP_PATH)
     return [luigi.LocalTarget(t) for t in targets]
示例#5
0
 def test_companies_download_by_urllist(self):
     """Companies source: download every file in the URL list."""
     conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_companies.json'))
     handler = HandlersFactory.get_handler(Downloader.handler_name(conf))
     downloaded = Downloader(conf, handler).download()
     self.assertTrue(Utils.all_exists(downloaded))
示例#6
0
 def test_wrong_address_download_by_url(self):
     """The wrong-address source must still produce an existing file."""
     conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR,
                      'web_kgdgov_wrong_address.json'))
     handler = HandlersFactory.get_handler(Downloader.handler_name(conf))
     result_path = Downloader(conf, handler).download()
     self.assertTrue(os.path.exists(result_path))
 def test_kurk_parse_to_csv(self):
     """KURK source: download the XLS file, parse to CSV, check the result."""
     src_json = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_kurk.json'))
     job_json = Utils.read_file(os.path.join(JOBS_CONFIG_DIR, 'to_csv.json'))
     downloader = Downloader(
         src_json,
         HandlersFactory.get_handler(Downloader.handler_name(src_json)))
     xls_path = downloader.download()
     parser = XLSParser(
         src_json, job_json, xls_path, self.data_path,
         HandlersFactory.get_handler(
             XLSParser.handler_name(src_json, job_json)))
     csv_path = parser.path(src_json, job_json, self.data_path)
     row_count = parser.parse()
     self.assertTrue(os.path.exists(csv_path))
     self.assertGreater(row_count, 0)
 def run(self):
     """Parse the configured source into WEB_DATA_PATH per the job config."""
     src_conf = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile)))
     job_conf = Utils.read_file(
         os.path.join(JOBS_CONFIG_DIR, str(self.jobfile)))
     handler = HandlersFactory.get_handler(
         Parser.handler_name(src_conf, job_conf))
     Parser(src_conf, job_conf, WEB_DATA_PATH, handler).parse()
示例#9
0
 def test_pseudo_company_download_by_url(self):
     """Pseudo-company source: download by URL, verify the file exists."""
     src_conf_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                                  'web_kgdgov_pseudo_company.json')
     # Utils.read_file keeps config loading consistent with every other
     # test in this file (old code opened and read the file by hand).
     json_raw = Utils.read_file(src_conf_path)
     handler = HandlersFactory.get_handler(
         Downloader.handler_name(json_raw))
     service = Downloader(json_raw, handler)
     file_path = service.download()
     self.assertTrue(os.path.exists(file_path))
示例#10
0
 def test_address_parse_to_csv(self):
     """Addresses source: parse to CSV and check the output file appears."""
     src_json = Utils.read_file(
         os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_datagov_addresses.json'))
     job_json = Utils.read_file(os.path.join(JOBS_CONFIG_DIR, 'to_csv.json'))
     handler = HandlersFactory.get_handler(
         Parser.handler_name(src_json, job_json))
     parser = Parser(src_json, job_json, self.data_path, handler)
     csv_path = parser.path(src_json, job_json, self.data_path)
     parser.parse()
     self.assertTrue(os.path.exists(csv_path))
        i = -1
        for arch in archives:
            arch_obj = Utils.get_archive_object(arch)
            data_format = Box(json.loads(instance.srconf)).storage.data_format
            file_path = path.abspath(path.dirname(arch))
            for file in arch_obj.namelist():
                if Utils.ext(file) == data_format:
                    i += 1
                    arch_obj.extract(file, file_path)
                    old_path = path.join(file_path, file).replace('/', os.sep)
                    shutil.move(old_path, targets[i])
        return targets

    @staticmethod
    def path(srconf, dpath):
        files_num = Box(json.loads(srconf)).storage.data_files_num
        archives_num = Box(json.loads(srconf)).storage.data_archives_num
        name = Box(json.loads(srconf)).name
        data_format = Box(json.loads(srconf)).storage.data_format
        files = list()
        for i in range(archives_num):
            for j in range(files_num):
                files.append(
                    path.join(dpath,
                              "{}_{}_{}.{}".format(name, i, j, data_format)))
        return files


# Register the extractor implementations under their handler names so
# HandlersFactory.get_handler(Extractor.handler_name(...)) can resolve them.
HandlersFactory.register("extract_file", ExtractorFile)
HandlersFactory.register("extract_files", ExtractorFiles)
示例#12
0
                json_raw = res.group(2)
                break
        return json.loads(json_raw)

    @staticmethod
    def write_data(fpath, data, delimiter=";"):
        with open(fpath, "a", encoding="utf8") as f:
            csv_writer = csv.writer(f, delimiter=delimiter)
            for row in data:
                csv_writer.writerow(row.values())

    @staticmethod
    def parse(instance, fpath):
        """Fetch the rows for *instance* and append them to *fpath* as CSV."""
        ParseJavaScriptJsonToCSV.write_data(
            fpath, ParseJavaScriptJsonToCSV.get_data(instance))

    @staticmethod
    def path(srconf, jobconf, dpath):
        """Build the output path: <dpath>/<source name>.<job data_format>."""
        src_name = Box(json.loads(srconf)).name
        ext = Box(json.loads(jobconf)).data_format
        return os.path.join(dpath, "{}.{}".format(src_name, ext))


# Register every parser implementation under its handler name so
# Parser/XLSParser.handler_name(...) lookups resolve via the factory.
HandlersFactory.register("xlsparse_to_csv", ParseFromExcelToCSV)
HandlersFactory.register("xlsparse_oked_to_csv", ParseOkedToCsv)
HandlersFactory.register("web_api_raw_json_parse_to_csv", ParseFromAPIToCSV)
HandlersFactory.register("web_html_javascript_json_parse_to_csv",
                         ParseJavaScriptJsonToCSV)
HandlersFactory.register("web_html_table_text_parse_to_csv",
                         ParseGosRegisterToCSV)
示例#13
0
            return fpath
        except Exception as e:
            raise e

    @staticmethod
    def urls(conf):
        """Collect the download URLs advertised by the source page.

        Fetches ``url`` from the config, scans the response body with the
        configured ``storage.html.url_regexp`` and prefixes every match
        with ``base_url``.
        """
        cfg = Box(json.loads(conf))  # parse the config once, not per field
        # NOTE(review): TLS verification is disabled here — presumably the
        # endpoint has certificate problems; confirm before keeping this.
        page = requests.get(cfg.url, verify=False).text
        matches = re.findall(cfg.storage.html.url_regexp, page)
        # distinct loop variable: the old code shadowed the match list
        # with its own loop variable (`for res in res`)
        return ["{}{}".format(cfg.base_url, m) for m in matches]

    @staticmethod
    def path(conf):
        """Return one target file path per URL the source exposes.

        Files are named ``<name>_<index>.<storage type>`` inside TEMP_PATH,
        which is created if missing.
        """
        cfg = Box(json.loads(conf))  # single parse instead of one per field
        # urls() is a staticmethod — call it on the class instead of
        # constructing a throwaway instance.
        urls = DownloaderByUrlStatGovCompanies.urls(conf)
        os.makedirs(TEMP_PATH, exist_ok=True)
        return [
            os.path.join(TEMP_PATH,
                         "{}_{}.{}".format(cfg.name, i, cfg.storage.type))
            for i in range(len(urls))
        ]


# Register the downloader implementations under their handler names so
# Downloader.handler_name(...) lookups resolve via the factory.
HandlersFactory.register("download_url", DownloaderByUrlToFile)
HandlersFactory.register("download_urllist_ajax",
                         DownloaderByUrlStatGovCompanies)