def get_text_of_a_document(source_file, keep_txt=False, reuse_txt=False, output_folder=None):
    """Convert *source_file* to plain text and return its contents.

    The converter is picked by file extension (xlsx/xls/docx/pdf/html/rtf/htm/doc).
    The intermediate ``<name>.txt`` file is written next to the source (or into
    *output_folder*), read back, and deleted unless ``keep_txt`` is True.
    If ``reuse_txt`` is True and the txt file already exists, conversion is skipped.
    Returns the document text, or None when the extension is unsupported or
    conversion produced no txt file.
    """
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder, os.path.basename(source_file) + ".txt")
    if reuse_txt and os.path.exists(txt_file):
        # reuse a previously produced txt file, skip conversion entirely
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
            # xls2csv failed; retry by renaming to .xlsx and using the xlsx converter
            # (some .xls files are actually xlsx — TODO confirm this is the intent)
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        # first try to fetch an already-converted docx from the conversion service
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        if TDocConversionClient(
                TDocConversionClient.parse_args([])).retrieve_document(
                    sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
            # the worst case: fall back to calibre on the raw pdf
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
            # catdoc failed; retry via the office converter on a .docx-named copy
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        # unsupported extension
        return None
    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        # converter ran but produced nothing
        return None
def __init__(self, args):
    """Set up logging, (re)create the working temp folder and the service clients.

    :param args: parsed command-line namespace; ``args.tmp_folder`` may be None,
                 in which case a fresh temporary directory is created.
    """
    self.logger = setup_logging(log_file_name="export_human_files.log")
    self.args = args
    if self.args.tmp_folder is None:
        self.args.tmp_folder = tempfile.mkdtemp("export_human")
        self.logger.debug("create folder {}".format(self.args.tmp_folder))
    else:
        self.logger.debug("rm folder {}".format(self.args.tmp_folder))
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
        # Bug fix: os.mkdir raised FileExistsError when rmtree silently failed
        # (ignore_errors=True can leave the folder in place) and FileNotFoundError
        # when the parent directory did not exist; makedirs(..., exist_ok=True)
        # is robust in both cases.
        os.makedirs(self.args.tmp_folder, exist_ok=True)
    self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
    self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
    self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
    # pdfs discovered during this run — presumably identifiers/paths; confirm against callers
    self.new_pdfs = set()
def __init__(self, args):
    """Initialize the dlrobot central server: state, clients, task list and the HTTP server.

    Reads the office table and the remote-call history, determines which web
    sites still need processing, binds the HTTP server socket (via
    ``super().__init__``) and announces startup to telegram.
    """
    self.register_task_result_error_count = 0
    self.logger = setup_logging(log_file_name=args.log_file_name, append_mode=True)
    self.conversion_client = TDocConversionClient(
        TDocConversionClient.parse_args([]), self.logger)
    self.args = args
    # round boundaries define the earliest remote call we still care about
    rounds = TDeclarationRounds(args.round_file)
    self.dlrobot_remote_calls = TRemoteDlrobotCallList(
        logger=self.logger,
        file_name=args.remote_calls_file,
        min_start_time_stamp=rounds.start_time_stamp)
    # per-worker bookkeeping: active tasks and consecutive failure counters
    self.worker_2_running_tasks = defaultdict(list)
    self.worker_2_continuous_failures_count = defaultdict(int)
    offices = TOfficeTableInMemory()
    offices.read_from_local_file(self.args.offices_file)
    self.web_sites_db = TDeclarationWebSiteList(self.logger, offices=offices)
    if not os.path.exists(self.args.result_folder):
        os.makedirs(self.args.result_folder)
    self.web_sites_to_process = self.find_projects_to_process()
    self.cloud_id_to_worker_ip = dict()
    self.config = TRobotConfig.read_by_config_type(
        self.args.dlrobot_config_type)
    self.last_remote_call = None  # for testing
    host, port = self.args.server_address.split(":")
    self.logger.debug("start server on {}:{}".format(host, port))
    # bind the underlying HTTP server socket with our request handler
    super().__init__((host, int(port)), TDlrobotRequestHandler)
    self.last_service_action_time_stamp = time.time()
    self.service_action_count = 0
    self.decl_sender = TDeclarationSender(
        self.logger, self.args.enable_smart_parser, self.args.enable_source_doc_server)
    self.stop_process = False
    if self.args.enable_ip_checking:
        # whitelist the local subnet plus known external hosts
        self.permitted_hosts = set(
            str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
        self.permitted_hosts.add('127.0.0.1')
        self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
    self.logger.debug("init complete")
    self.send_to_telegram("start dlrobot central with {} tasks".format(
        len(self.web_sites_to_process)))
def process_all_tasks(self):
    """One scheduler tick: run conversions, then time-gated maintenance tasks.

    Maintenance (garbage collection, self-ping, ocr restart) is throttled by
    timestamps stored on self; each step bails out early if the HTTP server
    has been stopped. Exits the process if the server cannot ping itself.
    """
    if len(self.ocr_tasks) == 0:
        # remember the last moment the ocr queue was seen empty
        self.ocr_queue_is_empty_last_time_stamp = time.time()
    self.try_convert_with_winword()
    new_files_from_ocr = self.process_docx_from_ocr()
    if new_files_from_ocr:
        self.got_ocred_file_last_time_stamp = time.time()
    # file garbage tasks
    current_time = time.time()
    if current_time - self.file_garbage_collection_timestamp >= 60:  # just not too often
        self.file_garbage_collection_timestamp = current_time
        if not self.http_server_is_working:
            return
        self.process_ocr_logs()
        if not self.http_server_is_working:
            return
        self.process_stalled_files()
    if current_time - self.self_server_ping_timestamp >= 3600:  # just not too often
        # health check: ask our own conversion server whether it is alive
        args = TDocConversionClient.parse_args(
            ["--server-address", self.args.server_address])
        client = TDocConversionClient(args, self.logger)
        if not client.assert_declarator_conv_alive(raise_exception=False):
            self.logger.error("cannot ping itself, exit")
            self.stop_http_server(run_shutdown=False)
            sys.exit(1)
        self.self_server_ping_timestamp = current_time
    current_time = time.time()
    if current_time - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time and \
            current_time - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time:
        self.logger.debug(
            "last ocr file was received long ago and all this time the ocr queue was not empty"
        )
        if not self.http_server_is_working:
            return
        self.restart_ocr()
        self.got_ocred_file_last_time_stamp = time.time()  # otherwise restart will be too often
def process_with_client(self, input_files, timeout=None, rebuild=False, skip_receiving=False,
                        log_name="client", input_task_timeout=5):
    """Convert *input_files* through the conversion client; return expected .docx names.

    Stale output files are removed first, then a client is run with a dedicated
    logger that is always closed, even when the client raises.
    """
    expected_outputs = [os.path.basename(path) + ".docx" for path in input_files]
    for stale in expected_outputs:
        if os.path.exists(stale):
            os.unlink(stale)

    args = [
        "--server-address", self.server_address,
        "--conversion-timeout", "180",
        "--output-folder", ".",
    ]
    args += input_files
    if timeout is not None:
        # a later --conversion-timeout overrides the default "180" above
        args += ['--conversion-timeout', str(timeout)]
    if rebuild:
        args.append('--rebuild')
    if skip_receiving:
        args.append('--skip-receiving')

    # give each default-named client a unique log name
    if log_name == "client" and self.client_count >= 0:
        log_name = "{}{}".format(log_name, self.client_count)
    logger = setup_logging(logger_name=log_name)
    try:
        self.client_count += 1
        client = TDocConversionClient(TDocConversionClient.parse_args(args), logger=logger)
        self.client = client
        client.input_task_timeout = input_task_timeout
        client.start_conversion_thread()
        client.process_files()
        return expected_outputs
    finally:
        close_logger(logger)
import argparse
import json
import logging
import os
import time

from ConvStorage.conversion_client import TDocConversionClient


def parse_args(arg_list=None):
    """Parse command-line options.

    :param arg_list: optional explicit argument list (defaults to sys.argv),
                     added for testability; passing None keeps old behavior.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args(arg_list)


if __name__ == '__main__':
    args = parse_args()
    # Bug fix: `logging` was referenced here but never imported, so this line
    # raised NameError at runtime; the import is now at the top of the file.
    conv_client = TDocConversionClient(TDocConversionClient.parse_args([]), logging)
    stats = conv_client.get_stats()
    if args.history_file is None:
        # no history file: just dump the current stats to stdout
        print(json.dumps(stats))
    else:
        # append a timestamped sample to the history file, keeping the tail only
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]  # retain at most the 400 most recent samples
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
from ConvStorage.conversion_client import TDocConversionClient
from common.logging_wrapper import setup_logging
import sys


def main(argv):
    """Run the document conversion client on *argv* and return its exit code."""
    logger = setup_logging(log_file_name="convert_pdf.log")
    client = TDocConversionClient(TDocConversionClient.parse_args(argv), logger)
    client.start_conversion_thread()
    return client.process_files()


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))