def main(args, logger):
    if args.conversion_server is not None:
        TDocConversionClient.DECLARATOR_CONV_URL = args.conversion_server
    conv_tasks = TDocConversionClient(logger)
    conv_tasks.start_conversion_thread()
    # initialize here so the receive_files branch below cannot hit an
    # unbound name if send_files raises
    sent_files = list()
    try:
        sent_files = send_files(args, logger, conv_tasks)
        if args.receive_files and len(sent_files) > 0:
            conv_tasks.wait_doc_conversion_finished(args.conversion_timeout)
        else:
            logger.debug("stop conversion thread (nothing to wait for)")
            conv_tasks.stop_conversion_thread()
    except Exception as exp:
        logger.error("exception: {}, stop_conversion_thread".format(exp))
        conv_tasks.stop_conversion_thread()

    if args.receive_files:
        if not receive_files(logger, conv_tasks, sent_files):
            return 1
    return 0
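
# A minimal entry-point sketch, assuming this module is run as a script and
# that TDocConversionClient exposes a parse_args helper as it does elsewhere
# in the code base; the log file name is hypothetical.
if __name__ == "__main__":
    import sys
    args = TDocConversionClient.parse_args(sys.argv[1:])
    logger = setup_logging(log_file_name="doc_conversion_client.log")
    sys.exit(main(args, logger))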
# Standard-library and third-party imports used below; project-internal names
# (setup_logging, TSourceDocClient, TDocConversionClient,
# TSmartParserCacheClient, TDearchiver, TDlrobotHumanFileDBM, TSourceDocument,
# TDeclaratorReference, build_dislosures_sha256, urlsplit_pro,
# DECLARATOR_DOMAIN, FIX_LIST) are assumed to come from the surrounding
# code base.
import argparse
import glob
import os
import shutil
import tempfile
import time
import urllib.parse

import pymysql
import requests


class TExportHumanFiles:
    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument("--table", dest='table', default="declarations_documentfile")
        parser.add_argument("--document-file-id", dest='document_file_id', required=False)
        parser.add_argument("--tmp-folder", dest='tmp_folder', default=None)
        parser.add_argument("--dlrobot-human-json", dest='dlrobot_human_json', default="human_files.json")
        parser.add_argument("--start-from-an-empty-file", dest='start_from_empty', action="store_true", default=False)
        parser.add_argument("--max-files-count", dest='max_files_count', type=int)
        parser.add_argument("--mysql-port", dest='mysql_port', type=int, default=None)
        parser.add_argument("--pdf-conversion-timeout", dest='pdf_conversion_timeout',
                            default=1*60*60,
                            type=int,
                            help="pdf conversion timeout")
        parser.add_argument("--pdf-conversion-queue-limit", dest='pdf_conversion_queue_limit', type=int,
                            default=100 * 2 ** 20,
                            help="max sum size of all pdf files that are in the pdf conversion queue",
                            required=False)
        return parser.parse_args(arg_list)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="export_human_files.log")
        self.args = args
        if self.args.tmp_folder is None:
            self.args.tmp_folder = tempfile.mkdtemp("export_human")
            self.logger.debug("create folder {}".format(self.args.tmp_folder))
        else:
            self.logger.debug("rm folder {}".format(self.args.tmp_folder))
            shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
            os.mkdir(self.args.tmp_folder)
        self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
        self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
        self.new_pdfs = set()

    def __enter__(self):
        self.pdf_conversion_client.start_conversion_thread()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.pdf_conversion_client.stop_conversion_thread()
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)

    def unarchive(self, input_file):
        base_name, file_extension = os.path.splitext(os.path.basename(input_file))
        output_folder = os.path.dirname(input_file)
        dearchiver = TDearchiver(self.logger, output_folder)
        for _, _, filename in dearchiver.dearchive_one_archive(file_extension, input_file, base_name):
            yield filename

    def download_file_and_unzip(self, file_url, filename):
        file_without_extension, extension = os.path.splitext(filename)
        if not os.path.isfile(filename):
            self.logger.debug("download {0} to {1}".format(file_url, filename))
            result = requests.get(file_url)
            with open(filename, 'wb') as fd:
                fd.write(result.content)
            if extension == '.zip':
                try:
                    for archive_filename in self.unarchive(filename):
                        yield archive_filename
                except Exception as e:
                    self.logger.error("cannot unzip {}, exception={}".format(filename, e))
            else:
                yield filename
        else:
            # the file was downloaded earlier; for zip archives pick up
            # the files that were extracted next to it
            if extension == '.zip':
                for archive_filename in glob.glob("{}_*".format(file_without_extension)):
                    yield archive_filename
            else:
                yield filename

    def get_all_file_sql_records(self):
        # user and password are masked placeholders in this excerpt
        if self.args.mysql_port is None:
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 unix_socket="/var/run/mysqld/mysqld.sock")
        else:
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 port=self.args.mysql_port)
        cursor = db.cursor()
        if self.args.document_file_id is not None:
            where_clause = "where f.id = {}\n".format(self.args.document_file_id)
        else:
            where_clause = ""
        query = ("""
                    select f.id, d.id, f.file, f.link, d.office_id, d.income_year
                    from {} f
                    join declarations_document d on f.document_id=d.id
                    {}
                 """.format(self.args.table, where_clause))
        self.logger.debug(query.replace("\n", " "))
        cursor.execute(query)
        for (document_file_id, document_id, filename, link, office_id, income_year) in cursor:
            if filename is not None and len(filename) > 0:
                yield document_file_id, document_id, filename, link, office_id, income_year
        cursor.close()
        db.close()

    def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
        path, declarator_filename = os.path.split(declarator_url_path)
        _, ext = os.path.splitext(declarator_filename)
        ext = ext.lower()
        temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
        declarator_url = os.path.join(DECLARATOR_DOMAIN, "media", urllib.parse.quote(declarator_url_path))
        declarator_url = declarator_url.replace('\\', '/')
        for file_name in self.download_file_and_unzip(declarator_url, temp_file):
            self.source_doc_client.send_file(file_name)
            if file_name.lower().endswith('.pdf'):
                # pdfs go through the conversion server first; remember them
                # so the converted versions can be fetched later
                _, extension = os.path.splitext(file_name)
                self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
                self.new_pdfs.add(build_dislosures_sha256(file_name))
            else:
                self.smart_parser_server_client.send_file(file_name)
            yield file_name, declarator_url
        # make sure all conversion tasks are queued before the temp folder is cleared
        self.pdf_conversion_client.wait_all_tasks_to_be_sent()
        for f in os.listdir(self.args.tmp_folder):
            os.unlink(os.path.join(self.args.tmp_folder, f))

    def fix_list(self, sha256, office_id):
        # FIX_LIST maps a document sha256 to a manually corrected office id
        fixed_office_id = FIX_LIST.get(sha256)
        if fixed_office_id is not None:
            return fixed_office_id
        else:
            return office_id

    def export_files(self):
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)
        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            # skip files that are already in the dlrobot_human db
            if document_file_id in document_file_ids:
                continue
            # throttle so the pdf conversion queue does not overflow
            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait for the pdf conversion server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length))
                time.sleep(5*60)

            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()

    def send_new_pdfs_to_smart_parser(self):
self.logger.debug("wait pdf conversion for {} seconds".format(self.args.pdf_conversion_timeout)) self.pdf_conversion_client.wait_doc_conversion_finished(self.args.pdf_conversion_timeout) missed_pdf_count = 0 received_pdf_count = 0 for sha256 in self.new_pdfs: self.logger.debug("try to converted file for {}".format(sha256)) handle, temp_filename = tempfile.mkstemp(suffix=".docx") os.close(handle) if self.pdf_conversion_client.retrieve_document(sha256, temp_filename): received_pdf_count += 1 self.logger.debug("send the converted file to smart parser") self.smart_parser_server_client.send_file(temp_filename) else: self.logger.error("converted file is not received") missed_pdf_count += 1 os.unlink(temp_filename) if missed_pdf_count > 0: self.logger.error('received_pdf_count = {}, missed_pdf_count={}'.format(received_pdf_count, missed_pdf_count))
# Standard-library imports used below; project-internal helpers
# (TConvertProcessor, TConvertStorage, TDocConversionClient,
# TExternalConverters, start_server, recreate_folder,
# clear_folder_with_retry, setup_logging, close_logger) are assumed to come
# from the surrounding code base.
import os
import subprocess
import threading
import time
from unittest import TestCase


class TTestConvBase(TestCase):
    def __init__(self, methodName='runTest'):
        super().__init__(methodName)
        self.port = 8081
        self.name = None
        self.data_folder = None
        self.server_address = "localhost:{}".format(self.port)
        self.server = None
        self.server_thread = None
        self.server_process = None
        self.client = None
        self.converters = TExternalConverters(enable_smart_parser=False, enable_calibre=False,
                                              enable_cat_doc=False, enable_xls2csv=False,
                                              enable_office_2_txt=False)
        self.pdf_ocr_folder = os.path.join(os.path.dirname(__file__), "pdf.ocr")
        self.pdf_ocr_out_folder = os.path.join(os.path.dirname(__file__), "pdf.ocr.out")
        if not os.path.exists(self.pdf_ocr_folder) or not os.path.exists(self.pdf_ocr_out_folder):
            raise Exception("run python update_finereader_task.py and upload test.hft to the finereader hot folder")
        self.project_file = "converted_file_storage.json"
        self.server_args = None
        self.client_count = 0

    def start_server_thread(self):
        self.server = TConvertProcessor(TConvertProcessor.parse_args(self.server_args))
        self.server_thread = threading.Thread(target=start_server, args=(self.server,))
        self.server_thread.start()

    def setup_server(self, name, addit_server_args=list(), start_process=False):
        self.name = name
        self.data_folder = os.path.join(os.path.dirname(__file__), "data.{}".format(name))
        recreate_folder(self.data_folder)
        os.chdir(self.data_folder)

        input_files = "input_files"
        recreate_folder(input_files)

        db_converted_files = os.path.join(self.data_folder, "db_converted_files")
        recreate_folder(db_converted_files)

        db_input_files = os.path.join(self.data_folder, "db_input_files")
        recreate_folder(db_input_files)

        log_file = "db_conv.log"
        if os.path.exists(log_file):
            os.unlink(log_file)

        clear_folder_with_retry(self.pdf_ocr_folder)
        clear_folder_with_retry(self.pdf_ocr_out_folder)
        TConvertStorage.create_empty_db(db_input_files, db_converted_files, self.project_file)
        self.server_args = [
            "--server-address", self.server_address,
            '--logfile', log_file,
            '--db-json', self.project_file,
            '--disable-killing-winword',
            '--ocr-input-folder', self.pdf_ocr_folder,
            '--ocr-output-folder', self.pdf_ocr_out_folder,
            '--disable-telegram'
        ] + addit_server_args
        if start_process:
            server_script = os.path.join(os.path.dirname(__file__), "..", "conv_storage_server.py")
            args = ["python", server_script] + self.server_args
            self.server_process = subprocess.Popen(args, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
        else:
            self.start_server_thread()

    def restart_server(self):
        self.server.stop_http_server()
        self.server_thread.join(0)
        self.start_server_thread()

    def process_with_client(self, input_files, timeout=None, rebuild=False, skip_receiving=False,
                            log_name="client", input_task_timeout=5):
        output_files = list(os.path.basename(i) + ".docx" for i in input_files)
        for o in output_files:
            if os.path.exists(o):
                os.unlink(o)
        client_args = [
            "--server-address", self.server_address,
            "--conversion-timeout", "180",
            "--output-folder", ".",
        ] + input_files
        if timeout is not None:
            # the later --conversion-timeout occurrence overrides the default "180"
            client_args.extend(['--conversion-timeout', str(timeout)])
        if rebuild:
            client_args.append('--rebuild')
        if skip_receiving:
            client_args.append('--skip-receiving')
        if self.client_count >= 0 and log_name == "client":
            log_name = log_name + str(self.client_count)
        logger = setup_logging(logger_name=log_name)
        try:
            self.client_count += 1
            self.client = TDocConversionClient(TDocConversionClient.parse_args(client_args), logger=logger)
            self.client.input_task_timeout = input_task_timeout
            self.client.start_conversion_thread()
            self.client.process_files()
            return output_files
        finally:
            close_logger(logger)

    def list2reason(self, exc_list):
        if exc_list and exc_list[-1][0] is self:
            return exc_list[-1][1]

    def tear_down(self):
        result = self.defaultTestResult()
        self._feedErrorsToResult(result, self._outcome.errors)
        error = self.list2reason(result.errors)
        failure = self.list2reason(result.failures)
        delete_temp_files = not error and not failure

        if self.client is not None:
            self.client.stop_conversion_thread(1)
            self.client = None
        if self.server is not None:
            self.server.stop_http_server()
            self.server_thread.join(0)
            self.server = None
        else:
            self.server_process.kill()
            self.server_process = None
        time.sleep(5)
        os.chdir(os.path.dirname(__file__))
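
# A minimal sketch of a concrete test built on top of TTestConvBase; the
# test name and the input file "files/example.pdf" are hypothetical, the
# helper calls follow the methods defined above.
class TTestConvertOneFile(TTestConvBase):
    def setUp(self):
        self.setup_server("convert_one_file")

    def tearDown(self):
        self.tear_down()

    def test_convert_one_file(self):
        output_files = self.process_with_client(["files/example.pdf"], timeout=240)
        self.assertTrue(os.path.exists(output_files[0]))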