Example #1
 def __init__(self, args, logger):
     self.timeout = 60 * 10
     self.conversion_client = TDocConversionClient(logger)
     self.args = args
     self.logger = logger
     self.dlrobot_remote_calls = defaultdict(list)
     self.input_files = list()
     self.worker_2_running_tasks = defaultdict(list)
     self.initialize_tasks()
     self.cloud_id_to_worker_ip = dict()
     host, port = self.args.server_address.split(":")
     self.logger.debug("start server on {}:{}".format(host, port))
     super().__init__((host, int(port)), TDlrobotRequestHandler)
     self.last_service_action_time_stamp = time.time()
     self.smart_parser_cache_client = None
     if self.args.enable_smart_parser:
         self.smart_parser_cache_client = TSmartParserCacheClient(
             self.logger)
     self.crawl_epoch_id = self.args.crawl_epoch_id
     self.stop_process = False
     if self.args.enable_ip_checking:
         self.permitted_hosts = set(
             str(x)
             for x in ipaddress.ip_network('192.168.100.0/24').hosts())
         self.permitted_hosts.add('127.0.0.1')
         self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
     self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()
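
The constructor above caps task dispatch by the size of the PDF conversion queue. A minimal throttling sketch on top of it, assuming the same single-argument TDocConversionClient constructor and a hypothetical QUEUE_LIMIT constant (the value mirrors the --pdf-conversion-queue-limit default from Example #17):

QUEUE_LIMIT = 100 * 2 ** 20  # hypothetical constant, not part of the API

client = TDocConversionClient(logger)
pending_bytes = client.get_pending_all_file_size()
if pending_bytes >= QUEUE_LIMIT:
    logger.debug("conversion queue is {} bytes, stop sending tasks".format(pending_bytes))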
Example #2
def get_text_of_a_document(source_file,
                           keep_txt=False,
                           reuse_txt=False,
                           output_folder=None):
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder,
                                os.path.basename(source_file) + ".txt")

    if reuse_txt and os.path.exists(txt_file):
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        if TDocConversionClient(
                TDocConversionClient.parse_args([])).retrieve_document(
                    sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
        # the worst case, let's use calibre
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        return None
    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        return None
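
A hypothetical call of the function above; the file name is a placeholder:

# returns None for unsupported extensions or failed conversions
text = get_text_of_a_document("declaration.pdf", keep_txt=False, reuse_txt=True)
if text is None:
    print("could not extract text")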
Example #3
 def __init__(self, args):
     self.logger = setup_logging(log_file_name="export_human_files.log")
     self.args = args
     if self.args.tmp_folder is None:
         self.args.tmp_folder = tempfile.mkdtemp("export_human")
         self.logger.debug("create folder {}".format(self.args.tmp_folder))
     else:
         self.logger.debug("rm folder {}".format(self.args.tmp_folder))
         shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
         os.mkdir(self.args.tmp_folder)
     self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
     self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
     self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
     self.new_pdfs = set()
Example #4
 def __init__(self, args):
     self.register_task_result_error_count = 0
     self.logger = setup_logging(log_file_name=args.log_file_name,
                                 append_mode=True)
     self.conversion_client = TDocConversionClient(
         TDocConversionClient.parse_args([]), self.logger)
     self.args = args
     rounds = TDeclarationRounds(args.round_file)
     self.dlrobot_remote_calls = TRemoteDlrobotCallList(
         logger=self.logger,
         file_name=args.remote_calls_file,
         min_start_time_stamp=rounds.start_time_stamp)
     self.worker_2_running_tasks = defaultdict(list)
     self.worker_2_continuous_failures_count = defaultdict(int)
     offices = TOfficeTableInMemory()
     offices.read_from_local_file(self.args.offices_file)
     self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                 offices=offices)
     if not os.path.exists(self.args.result_folder):
         os.makedirs(self.args.result_folder)
     self.web_sites_to_process = self.find_projects_to_process()
     self.cloud_id_to_worker_ip = dict()
     self.config = TRobotConfig.read_by_config_type(
         self.args.dlrobot_config_type)
     self.last_remote_call = None  # for testing
     host, port = self.args.server_address.split(":")
     self.logger.debug("start server on {}:{}".format(host, port))
     super().__init__((host, int(port)), TDlrobotRequestHandler)
     self.last_service_action_time_stamp = time.time()
     self.service_action_count = 0
     self.decl_sender = TDeclarationSender(
         self.logger, self.args.enable_smart_parser,
         self.args.enable_source_doc_server)
     self.stop_process = False
     if self.args.enable_ip_checking:
         self.permitted_hosts = set(
             str(x)
             for x in ipaddress.ip_network('192.168.100.0/24').hosts())
         self.permitted_hosts.add('127.0.0.1')
         self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
     self.logger.debug("init complete")
     self.send_to_telegram("start dlrobot central with {} tasks".format(
         len(self.web_sites_to_process)))
Example #5
def main(args, logger):
    if args.conversion_server is not None:
        TDocConversionClient.DECLARATOR_CONV_URL = args.conversion_server
    conv_tasks = TDocConversionClient(logger)
    conv_tasks.start_conversion_thread()

    sent_files = list()
    try:
        sent_files = send_files(args, logger, conv_tasks)
        if args.receive_files and len(sent_files) > 0:
            conv_tasks.wait_doc_conversion_finished(args.conversion_timeout)
        else:
            logger.debug("stop conversion finished")
            conv_tasks.stop_conversion_thread()
    except Exception as exp:
        logger.error("exception: {}, stop_conversion_thread".format(exp))
        conv_tasks.stop_conversion_thread()
    if args.receive_files:
        if not receive_files(logger, conv_tasks, sent_files):
            return 1
    return 0
Example #6
    def process_all_tasks(self):
        if len(self.ocr_tasks) == 0:
            self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.try_convert_with_winword()
        new_files_from_ocr = self.process_docx_from_ocr()
        if new_files_from_ocr:
            self.got_ocred_file_last_time_stamp = time.time()
        # file garbage tasks
        current_time = time.time()
        if current_time - self.file_garbage_collection_timestamp >= 60:  # just not too often
            self.file_garbage_collection_timestamp = current_time
            if not self.http_server_is_working:
                return
            self.process_ocr_logs()
            if not self.http_server_is_working:
                return
            self.process_stalled_files()

        if current_time - self.self_server_ping_timestamp >= 3600:  # just not too often
            args = TDocConversionClient.parse_args(
                ["--server-address", self.args.server_address])
            client = TDocConversionClient(args, self.logger)
            if not client.assert_declarator_conv_alive(raise_exception=False):
                self.logger.error("cannot ping itself, exit")
                self.stop_http_server(run_shutdown=False)
                sys.exit(1)
            self.self_server_ping_timestamp = current_time

        current_time = time.time()
        if current_time - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time and \
                current_time - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time:
            self.logger.debug(
                "last ocr file was received long ago and all this time the ocr queue was not empty"
            )
            if not self.http_server_is_working:
                return
            self.restart_ocr()
            self.got_ocred_file_last_time_stamp = time.time()  # otherwise restart would happen too often
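
The self-ping block above is the same pattern a standalone health check could use. A minimal sketch, where the server address is an assumed placeholder:

args = TDocConversionClient.parse_args(["--server-address", "localhost:8091"])
client = TDocConversionClient(args, logger)
if not client.assert_declarator_conv_alive(raise_exception=False):
    logger.error("conversion server is not reachable")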
Example #7
 def process_with_client(self,
                         input_files,
                         timeout=None,
                         rebuild=False,
                         skip_receiving=False,
                         log_name="client",
                         input_task_timeout=5):
     output_files = list(os.path.basename(i) + ".docx" for i in input_files)
     for o in output_files:
         if os.path.exists(o):
             os.unlink(o)
     client_args = [
         "--server-address",
         self.server_address,
         "--conversion-timeout",
         "180",
         "--output-folder",
         ".",
     ] + input_files
     if timeout is not None:
         client_args.extend(['--conversion-timeout', str(timeout)])
     if rebuild:
         client_args.append('--rebuild')
     if skip_receiving:
         client_args.append('--skip-receiving')
     if self.client_count >= 0 and log_name == "client":
         log_name = log_name + str(self.client_count)
     logger = setup_logging(logger_name=log_name)
     try:
         self.client_count += 1
         self.client = TDocConversionClient(
             TDocConversionClient.parse_args(client_args), logger=logger)
         self.client.input_task_timeout = input_task_timeout
         self.client.start_conversion_thread()
         self.client.process_files()
         return output_files
     finally:
         close_logger(logger)
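
A hypothetical call of this helper from inside a test method on the same class (the input path is a placeholder):

def test_rebuild(self):  # hypothetical test method
    docx_files = self.process_with_client(["files/sample.pdf"], timeout=240, rebuild=True)
    for output_file in docx_files:
        self.assertTrue(os.path.exists(output_file))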
Example #8
def get_text_of_a_document(source_file, keep_txt=False, reuse_txt=False):
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    txt_file = source_file + ".txt"
    if reuse_txt and os.path.exists(txt_file):
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        with open(source_file, "rb") as f:
            sha256 = hashlib.sha256(f.read()).hexdigest()
            if TDocConversionClient().retrieve_document(sha256, temp_file):
                ec.run_office2txt(temp_file, txt_file)
            else:
                # the worst case, let's use calibre
                ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        return None
    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        return None
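
This variant hashes the PDF with f.read(), which loads the whole file into memory. A streamed digest is gentler on large files; a sketch using only the standard library:

import hashlib

def sha256_of_file(path, chunk_size=1 << 20):
    # stream the file in 1 MB chunks instead of reading it whole
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()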
Example #9
import json
import logging
from ConvStorage.conversion_client import TDocConversionClient
import argparse
import os
import time


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    conv_client = TDocConversionClient(TDocConversionClient.parse_args([]), logging)
    stats = conv_client.get_stats()
    if args.history_file is None:
        print(json.dumps(stats))
    else:
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
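
The history file written above holds one tab-separated record per run: a unix timestamp and the stats JSON. A sketch that reads it back (the argument is whatever was passed as --history-file):

def read_stats_history(history_file):
    # each line is "<unix_ts>\t<stats_json>"
    records = []
    with open(history_file, encoding="utf-8") as inp:
        for line in inp:
            ts, stats_json = line.rstrip("\n").split("\t", 1)
            records.append((int(ts), json.loads(stats_json)))
    return records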
Example #10
from ConvStorage.conversion_client import TDocConversionClient
from common.logging_wrapper import setup_logging

import sys

if __name__ == '__main__':
    logger = setup_logging(log_file_name="convert_pdf.log")
    client = TDocConversionClient(
        TDocConversionClient.parse_args(sys.argv[1:]), logger)
    client.start_conversion_thread()
    exit_code = client.process_files()
    sys.exit(exit_code)
Example #11
import json
import logging
from ConvStorage.conversion_client import TDocConversionClient
import argparse
import os
import time


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    conv_client = TDocConversionClient(logging)
    stats = conv_client.get_stats()
    if args.history_file is None:
        print(json.dumps(stats))
    else:
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
Example #12
class TTestConvBase(TestCase):
    def __init__(self, methodName='runTest'):
        super().__init__(methodName)
        self.port = 8081
        self.name = None
        self.data_folder = None
        self.server_address = "localhost:{}".format(self.port)
        self.server = None
        self.server_thread = None
        self.server_process = None
        self.client = None
        self.converters = TExternalConverters(enable_smart_parser=False,
                                              enable_calibre=False,
                                              enable_cat_doc=False,
                                              enable_xls2csv=False,
                                              enable_office_2_txt=False)

        self.pdf_ocr_folder = os.path.join(os.path.dirname(__file__),
                                           "pdf.ocr")
        self.pdf_ocr_out_folder = os.path.join(os.path.dirname(__file__),
                                               "pdf.ocr.out")
        if not os.path.exists(self.pdf_ocr_folder) or not os.path.exists(
                self.pdf_ocr_out_folder):
            raise Exception(
                "run python update_finereader_task.py and upload test.hft to the FineReader hot folder"
            )
        self.project_file = "converted_file_storage.json"
        self.client = None
        self.server_args = None
        self.client_count = 0

    def start_server_thread(self):
        self.server = TConvertProcessor(
            TConvertProcessor.parse_args(self.server_args))
        self.server_thread = threading.Thread(target=start_server,
                                              args=(self.server, ))
        self.server_thread.start()

    def setup_server(self,
                     name,
                     addit_server_args=list(),
                     start_process=False):
        self.name = name
        self.data_folder = os.path.join(os.path.dirname(__file__),
                                        "data.{}".format(name))

        recreate_folder(self.data_folder)

        os.chdir(self.data_folder)
        input_files = "input_files"
        recreate_folder(input_files)

        db_converted_files = os.path.join(self.data_folder,
                                          "db_converted_files")
        recreate_folder(db_converted_files)

        db_input_files = os.path.join(self.data_folder, "db_input_files")
        recreate_folder(db_input_files)

        log_file = "db_conv.log"
        if os.path.exists(log_file):
            os.unlink(log_file)

        clear_folder_with_retry(self.pdf_ocr_folder)
        clear_folder_with_retry(self.pdf_ocr_out_folder)
        TConvertStorage.create_empty_db(db_input_files, db_converted_files,
                                        self.project_file)

        self.server_args = [
            "--server-address", self.server_address, '--logfile', log_file,
            '--db-json', self.project_file, '--disable-killing-winword',
            '--ocr-input-folder', self.pdf_ocr_folder, '--ocr-output-folder',
            self.pdf_ocr_out_folder, '--disable-telegram'
        ] + addit_server_args

        if start_process:
            server_script = os.path.join(os.path.dirname(__file__), "..",
                                         "conv_storage_server.py")
            args = ["python", server_script] + self.server_args
            self.server_process = subprocess.Popen(args,
                                                   stderr=subprocess.DEVNULL,
                                                   stdout=subprocess.DEVNULL)
        else:
            self.start_server_thread()

    def restart_server(self):
        self.server.stop_http_server()
        self.server_thread.join(0)
        self.start_server_thread()

    def process_with_client(self,
                            input_files,
                            timeout=None,
                            rebuild=False,
                            skip_receiving=False,
                            log_name="client",
                            input_task_timeout=5):
        output_files = list(os.path.basename(i) + ".docx" for i in input_files)
        for o in output_files:
            if os.path.exists(o):
                os.unlink(o)
        client_args = [
            "--server-address",
            self.server_address,
            "--conversion-timeout",
            "180",
            "--output-folder",
            ".",
        ] + input_files
        if timeout is not None:
            client_args.extend(['--conversion-timeout', str(timeout)])
        if rebuild:
            client_args.append('--rebuild')
        if skip_receiving:
            client_args.append('--skip-receiving')
        if self.client_count >= 0 and log_name == "client":
            log_name = log_name + str(self.client_count)
        logger = setup_logging(logger_name=log_name)
        try:
            self.client_count += 1
            self.client = TDocConversionClient(
                TDocConversionClient.parse_args(client_args), logger=logger)
            self.client.input_task_timeout = input_task_timeout
            self.client.start_conversion_thread()
            self.client.process_files()
            return output_files
        finally:
            close_logger(logger)

    def list2reason(self, exc_list):
        if exc_list and exc_list[-1][0] is self:
            return exc_list[-1][1]

    def tear_down(self):
        result = self.defaultTestResult()
        self._feedErrorsToResult(result, self._outcome.errors)
        error = self.list2reason(result.errors)
        failure = self.list2reason(result.failures)
        delete_temp_files = not error and not failure

        if self.client is not None:
            self.client.stop_conversion_thread(1)
            self.client = None

        if self.server is not None:
            self.server.stop_http_server()
            self.server_thread.join(0)
            self.server = None
        else:
            self.server_process.kill()
            self.server_process = None

        time.sleep(5)

        os.chdir(os.path.dirname(__file__))
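
A hedged sketch of a concrete test built on TTestConvBase; the server name, the test data path and the unittest wiring are assumptions:

class TestOnePdf(TTestConvBase):
    def setUp(self):
        self.setup_server("one_pdf")

    def tearDown(self):
        self.tear_down()

    def test_convert(self):
        output_files = self.process_with_client(["files/sample.pdf"], timeout=240)
        self.assertTrue(os.path.exists(output_files[0]))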
Example #13
class TDlrobotHTTPServer(http.server.HTTPServer):
    def initialize_tasks(self):
        self.dlrobot_remote_calls.clear()
        self.worker_2_running_tasks.clear()
        self.input_files = list(x for x in os.listdir(self.args.input_folder)
                                if x.endswith('.txt'))
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        if self.args.read_previous_results:
            self.read_prev_dlrobot_remote_calls()
        self.logger.debug("there are {} dlrobot projects to process".format(
            len(self.input_files)))

    def __init__(self, args, logger):
        self.timeout = 60 * 10
        self.conversion_client = TDocConversionClient(logger)
        self.args = args
        self.logger = logger
        self.dlrobot_remote_calls = defaultdict(list)
        self.input_files = list()
        self.worker_2_running_tasks = defaultdict(list)
        self.initialize_tasks()
        self.cloud_id_to_worker_ip = dict()
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.smart_parser_cache_client = None
        if self.args.enable_smart_parser:
            self.smart_parser_cache_client = TSmartParserCacheClient(
                self.logger)
        self.crawl_epoch_id = self.args.crawl_epoch_id
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(
                str(x)
                for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()

    def verify_request(self, request, client_address):
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stdout: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def get_dlrobot_remote_calls_filename(self):
        return os.path.join(self.args.result_folder,
                            "dlrobot_remote_calls.dat")

    def have_tasks(self):
        return len(self.input_files) > 0 and not self.stop_process

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        with open(self.get_dlrobot_remote_calls_filename(), "a") as outp:
            outp.write(json.dumps(remote_call.write_to_json()) + "\n")
        self.dlrobot_remote_calls[remote_call.project_file].append(remote_call)
        if remote_call.exit_code != 0:
            max_tries_count = self.args.tries_count
            tries_count = len(
                self.dlrobot_remote_calls[remote_call.project_file])
            if remote_call.project_folder is None and tries_count == max_tries_count:
                # if the last result was not obtained, may be,
                # worker is down, so the problem is not in the task but in the worker
                # so give this task one more chance
                max_tries_count += 1
                self.logger.debug(
                    "increase max_tries_count for {} to {}".format(
                        remote_call.project_file, max_tries_count))

            if tries_count < max_tries_count:
                self.input_files.append(remote_call.project_file)
                self.logger.debug("register retry for {}".format(
                    remote_call.project_file))

    def input_tasks_exist(self):
        with os.scandir(self.args.input_folder) as it:
            for entry in it:
                if entry.name.endswith(".txt"):
                    return True
        return False

    def can_start_new_epoch(self):
        if self.stop_process:
            return False
        if not self.input_tasks_exist():
            return False
        if self.get_running_jobs_count() > 0:
            return False
        return True

    def start_new_epoch(self):
        archive_filename = "{}.{}".format(
            self.get_dlrobot_remote_calls_filename(), self.crawl_epoch_id)
        if os.path.exists(archive_filename):
            self.logger.error("cannot create file {}, already exists".format(
                archive_filename))
            raise Exception("bad crawl epoch id")
        shutil.move(self.get_dlrobot_remote_calls_filename(), archive_filename)
        self.crawl_epoch_id += 1
        self.logger.error("start new epoch {}".format(self.crawl_epoch_id))
        self.initialize_tasks()

    def read_prev_dlrobot_remote_calls(self):
        if os.path.exists(self.get_dlrobot_remote_calls_filename()):
            self.logger.debug("read {}".format(
                self.get_dlrobot_remote_calls_filename()))
            calls = TRemoteDlrobotCall.read_remote_calls_from_file(
                self.get_dlrobot_remote_calls_filename())
            for remote_call in calls:
                self.dlrobot_remote_calls[remote_call.project_file].append(
                    remote_call)
                if remote_call.exit_code == 0 and remote_call.project_file in self.input_files:
                    self.logger.debug(
                        "delete {}, since it is already processed".format(
                            remote_call.project_file))
                    self.input_files.remove(remote_call.project_file)

    def get_running_jobs_count(self):
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        return sum(len(w) for w in self.dlrobot_remote_calls.values())

    def conversion_server_queue_is_short(self):
        return self.pdf_conversion_queue_length < self.args.pdf_conversion_queue_limit

    def get_new_job_task(self, worker_host_name, worker_ip):
        project_file = self.input_files.pop(0)
        self.logger.info(
            "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
            .format(project_file, worker_ip, worker_host_name,
                    len(self.input_files), self.get_running_jobs_count()))
        res = TRemoteDlrobotCall(worker_ip, project_file)
        res.worker_host_name = worker_host_name
        self.worker_2_running_tasks[worker_ip].append(res)
        return project_file

    def untar_file(self, project_file, result_archive):
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(
            self.args.result_folder, base_folder) + ".{}".format(int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception(
                "{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(
            project_file, worker_ip))

    def send_declaraion_files_to_smart_parser(self, dlrobot_project_folder):
        doc_folder = os.path.join(dlrobot_project_folder, "result")
        if os.path.exists(doc_folder):
            for website in os.listdir(doc_folder):
                website_folder = os.path.join(doc_folder, website)
                for doc in os.listdir(website_folder):
                    _, extension = os.path.splitext(doc)
                    if extension in ACCEPTED_DOCUMENT_EXTENSIONS:
                        self.smart_parser_cache_client.send_file(
                            os.path.join(website_folder, doc))

    def register_task_result(self, worker_host_name, worker_ip, project_file,
                             exit_code, result_archive):
        if self.args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            remote_call = self.pop_project_from_running_tasks(
                worker_ip, project_file)
        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        remote_call.project_folder = self.untar_file(project_file,
                                                     result_archive)
        remote_call.calc_project_stats()
        if self.args.enable_smart_parser:
            self.send_declaraion_files_to_smart_parser(
                remote_call.project_folder)
        self.save_dlrobot_remote_call(remote_call)

        self.logger.debug(
            "got exitcode {} for task result {} from worker {}".format(
                exit_code, project_file, worker_ip))

    def forget_old_remote_processes(self, current_time):
        for running_procs in self.worker_2_running_tasks.values():
            for i in range(len(running_procs) - 1, -1, -1):
                rc = running_procs[i]
                if current_time - rc.start_time > self.args.dlrobot_project_timeout:
                    self.logger.debug(
                        "task {} on worker {} takes {} seconds, probably it failed, stop waiting for a result"
                        .format(rc.project_file, rc.worker_ip,
                                current_time - rc.start_time))
                    running_procs.pop(i)
                    rc.exit_code = 126
                    self.save_dlrobot_remote_call(rc)

    def forget_remote_processes_for_yandex_worker(self, cloud_id,
                                                  current_time):
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info(
                "I do not remember ip for cloud_id {}, cannot delete processes"
                .format(cloud_id))
            return

        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug(
                "forget task {} on worker {} since the workstation was stopped"
                .format(rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error(
                    "cannot connect to google dns, probably internet is down")
                return None
            current_time = time.time()
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    self.forget_remote_processes_for_yandex_worker(
                        cloud_id, current_time)
                    self.logger.info(
                        "start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def service_actions(self):
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= self.args.central_heart_rate:
            self.last_service_action_time_stamp = current_time
            self.forget_old_remote_processes(current_time)
            self.check_yandex_cloud()
            if os.path.exists(PITSTOP_FILE):
                self.stop_process = True
                self.logger.debug("stop sending tasks, exit for a pit stop")
                os.unlink(PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                raise Exception("exit for pit stop")
            self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()
            if not self.conversion_server_queue_is_short():
                self.logger.debug(
                    "stop sending tasks, because conversion pdf queue length is {}"
                    .format(self.pdf_conversion_queue_length))

    def get_stats(self):
        workers = dict((k, list(r.write_to_json() for r in v))
                       for (k, v) in self.worker_2_running_tasks.items())

        return {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.input_files),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers
        }
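
A minimal sketch of wiring this server up; serve_forever is inherited from http.server.HTTPServer, and the argument parsing is assumed to mirror Example #14's parse_args:

if __name__ == '__main__':
    # sketch: parse_args is an assumption borrowed from Example #14
    args = TDlrobotHTTPServer.parse_args(sys.argv[1:])
    logger = setup_logging(log_file_name="dlrobot_central.log")
    server = TDlrobotHTTPServer(args, logger)
    server.serve_forever()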
Example #14
class TDlrobotHTTPServer(http.server.HTTPServer):
    max_continuous_failures_count = 7
    PITSTOP_FILE = ".dlrobot_pit_stop"

    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--server-address",
            dest='server_address',
            default=None,
            help=
            "by default read it from environment variable DLROBOT_CENTRAL_SERVER_ADDRESS"
        )
        parser.add_argument("--dlrobot-config-type",
                            dest='dlrobot_config_type',
                            required=False,
                            default="prod",
                            help="can be prod, preliminary or test")
        parser.add_argument("--custom-offices-file",
                            dest='offices_file',
                            required=False)
        parser.add_argument("--log-file-name",
                            dest='log_file_name',
                            required=False,
                            default="dlrobot_central.log")
        parser.add_argument("--remote-calls-file",
                            dest='remote_calls_file',
                            default=None)
        parser.add_argument("--result-folder",
                            dest='result_folder',
                            required=True)
        parser.add_argument("--tries-count",
                            dest='tries_count',
                            required=False,
                            default=2,
                            type=int)
        parser.add_argument("--central-heart-rate",
                            dest='central_heart_rate',
                            required=False,
                            default='60s')
        parser.add_argument(
            "--check-yandex-cloud",
            dest='check_yandex_cloud',
            default=False,
            action='store_true',
            required=False,
            help="check yandex cloud health and restart workstations")
        parser.add_argument(
            "--skip-worker-check",
            dest='skip_worker_check',
            default=False,
            action='store_true',
            required=False,
            help="skip checking that this task was given to this worker")
        parser.add_argument("--enable-ip-checking",
                            dest='enable_ip_checking',
                            default=False,
                            action='store_true',
                            required=False)
        parser.add_argument("--disable-smart-parser-server",
                            dest="enable_smart_parser",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-source-doc-server",
                            dest="enable_source_doc_server",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-search-engines",
                            dest="enable_search_engines",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-telegram",
                            dest="enable_telegram",
                            default=True,
                            required=False,
                            action="store_false")
        parser.add_argument("--disable-pdf-conversion-server-checking",
                            dest="pdf_conversion_server_checking",
                            default=True,
                            required=False,
                            action="store_false")
        parser.add_argument("--web-site-regexp",
                            dest="web_site_regexp",
                            required=False)
        parser.add_argument("--office-source-id",
                            dest="office_source_id",
                            required=False)
        parser.add_argument(
            "--round-file",
            dest="round_file",
            default=TDeclarationRounds.default_dlrobot_round_path)

        args = parser.parse_args(arg_list)
        args.central_heart_rate = convert_timeout_to_seconds(
            args.central_heart_rate)
        if args.server_address is None:
            args.server_address = os.environ['DLROBOT_CENTRAL_SERVER_ADDRESS']
        if args.check_yandex_cloud:
            assert TYandexCloud.get_yc() is not None

        return args

    def __init__(self, args):
        self.register_task_result_error_count = 0
        self.logger = setup_logging(log_file_name=args.log_file_name,
                                    append_mode=True)
        self.conversion_client = TDocConversionClient(
            TDocConversionClient.parse_args([]), self.logger)
        self.args = args
        rounds = TDeclarationRounds(args.round_file)
        self.dlrobot_remote_calls = TRemoteDlrobotCallList(
            logger=self.logger,
            file_name=args.remote_calls_file,
            min_start_time_stamp=rounds.start_time_stamp)
        self.worker_2_running_tasks = defaultdict(list)
        self.worker_2_continuous_failures_count = defaultdict(int)
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.offices_file)
        self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                    offices=offices)
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        self.web_sites_to_process = self.find_projects_to_process()
        self.cloud_id_to_worker_ip = dict()
        self.config = TRobotConfig.read_by_config_type(
            self.args.dlrobot_config_type)
        self.last_remote_call = None  # for testing
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.service_action_count = 0
        self.decl_sender = TDeclarationSender(
            self.logger, self.args.enable_smart_parser,
            self.args.enable_source_doc_server)
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(
                str(x)
                for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.logger.debug("init complete")
        self.send_to_telegram("start dlrobot central with {} tasks".format(
            len(self.web_sites_to_process)))

    def send_to_telegram(self, message):
        if self.args.enable_telegram:
            self.logger.debug("send to telegram: {}".format(message))
            telegram_send.send(messages=[message])

    def stop_server(self):
        self.server_close()
        self.shutdown()

    def verify_request(self, request, client_address):
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stdout: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def have_tasks(self):
        return len(self.web_sites_to_process) > 0 and not self.stop_process

    def project_is_to_process(self, project_file):
        interactions = self.dlrobot_remote_calls.get_interactions(project_file)
        if sum(1 for i in interactions if i.task_was_successful()) > 0:
            return False
        tries_count = self.args.tries_count
        if sum(1 for i in interactions if not i.task_ended()) > 0:
            # if the last result was not obtained, may be,
            # worker is down, so the problem is not in the task but in the worker
            # so give this task one more chance
            tries_count += 1
            self.logger.debug("increase max_tries_count for {} to {}".format(
                project_file, tries_count))
        return len(interactions) < tries_count

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        self.dlrobot_remote_calls.add_dlrobot_remote_call(remote_call)
        if not remote_call.task_was_successful():
            if self.project_is_to_process(remote_call.project_file):
                self.web_sites_to_process.append(remote_call.web_site)
                self.logger.debug("register retry for {}".format(
                    remote_call.web_site))

    def find_projects_to_process(self):
        web_sites_to_process = list()
        self.logger.info("filter web sites")
        web_site_info: TDeclarationWebSite
        for web_site, web_site_info in self.web_sites_db.web_sites.items():
            if self.args.web_site_regexp is not None:
                if re.match(self.args.web_site_regexp, web_site) is None:
                    continue
            if self.args.office_source_id is not None:
                if web_site_info.get_parent_source_id(
                ) != self.args.office_source_id:
                    continue
            if TWebSiteReachStatus.can_communicate(web_site_info.reach_status):
                project_file = TRemoteDlrobotCall.web_site_to_project_file(
                    web_site)
                if self.project_is_to_process(project_file):
                    web_sites_to_process.append(web_site)

        self.logger.info("there are {} sites in the input queue".format(
            len(web_sites_to_process)))
        web_sites_to_process.sort(
            key=(lambda x: self.dlrobot_remote_calls.last_interaction[x]))

        with open("web_sites_to_process_debug.txt", "w") as out:
            for w in web_sites_to_process:
                out.write(w + "\n")
        return web_sites_to_process

    def get_running_jobs_count(self):
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        return len(list(self.dlrobot_remote_calls.get_all_calls()))

    def get_new_project_to_process(self, worker_host_name, worker_ip):
        site_url = self.web_sites_to_process.pop(0)
        project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
        self.logger.info(
            "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
            .format(project_file, worker_ip, worker_host_name,
                    len(self.web_sites_to_process),
                    self.get_running_jobs_count()))
        remote_call = TRemoteDlrobotCall(worker_ip=worker_ip,
                                         project_file=project_file,
                                         web_site=site_url)
        remote_call.worker_host_name = worker_host_name
        web_site_passport = self.web_sites_db.get_web_site(site_url)
        regional_main_pages = list()
        if web_site_passport is None:
            self.logger.error(
                "{} is not registered in the web site db, no office information is available for the site"
                .format(site_url))
        project_content_str = TRobotProject.create_project_str(
            site_url,
            regional_main_pages,
            disable_search_engine=not self.args.enable_search_engines)
        self.worker_2_running_tasks[worker_ip].append(remote_call)
        return remote_call, project_content_str.encode("utf8")

    def untar_file(self, project_file, result_archive):
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(self.args.result_folder,
                                     base_folder) + ".{}".format(
                                         int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception(
                "{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(
            project_file, worker_ip))

    def worker_is_banned(self, worker_ip, host_name):
        return self.worker_2_continuous_failures_count[(worker_ip, host_name)] > \
                        TDlrobotHTTPServer.max_continuous_failures_count

    def update_worker_info(self, worker_host_name, worker_ip, exit_code):
        key = (worker_ip, worker_host_name)
        if exit_code == 0:
            self.worker_2_continuous_failures_count[key] = 0
        else:
            self.worker_2_continuous_failures_count[key] += 1
            if self.worker_is_banned(worker_ip, worker_host_name):
                self.send_to_telegram(
                    "too many dlrobot errors from ip {}, hostname={}, the host is banned, "
                    "you have to restart dlrobot_central to unban it".format(
                        worker_ip, worker_host_name))

    def register_task_result(self, worker_host_name, worker_ip, project_file,
                             exit_code, result_archive):
        if self.args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            try:
                remote_call = self.pop_project_from_running_tasks(
                    worker_ip, project_file)
            except:
                if ipaddress.ip_address(worker_ip).is_private:
                    self.logger.debug(
                        "try to get a result {} from a local ip {}, though this task was not dispatched"
                        .format(project_file, worker_ip))
                    remote_call = TRemoteDlrobotCall(worker_ip, project_file)
                else:
                    raise

        self.update_worker_info(worker_host_name, worker_ip, exit_code)

        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        project_folder = self.untar_file(project_file, result_archive)
        remote_call.calc_project_stats(self.logger, self.web_sites_db,
                                       project_folder, self.config)
        if not TWebSiteReachStatus.can_communicate(remote_call.reach_status):
            remote_call.exit_code = -1
        self.decl_sender.send_declaraion_files_to_other_servers(project_folder)
        self.save_dlrobot_remote_call(remote_call)
        self.last_remote_call = remote_call
        self.logger.debug(
            "got exitcode {} for task result {} from worker {} (host_name = {})"
            .format(exit_code, project_file, worker_ip, worker_host_name))

    def forget_old_remote_processes(self, current_time):
        for running_procs in self.worker_2_running_tasks.values():
            for i in range(len(running_procs) - 1, -1, -1):
                remote_call = running_procs[i]
                elapsed_seconds = current_time - remote_call.start_time
                if elapsed_seconds > self.config.get_kill_timeout_in_central():
                    self.logger.debug(
                        "task {} on worker {}(host={}) takes {} seconds, probably it failed, stop waiting for a result"
                        .format(remote_call.web_site, remote_call.worker_ip,
                                remote_call.worker_host_name, elapsed_seconds))
                    running_procs.pop(i)
                    remote_call.exit_code = 126
                    self.save_dlrobot_remote_call(remote_call)

    def forget_remote_processes_for_yandex_worker(self, cloud_id):
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info(
                "I do not remember ip for cloud_id {}, cannot delete processes"
                .format(cloud_id))
            return

        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug(
                "forget task {} on worker {} since the workstation was stopped"
                .format(rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error(
                    "cannot connect to google dns, probably internet is down")
                return None
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    self.forget_remote_processes_for_yandex_worker(cloud_id)
                    self.logger.info(
                        "start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def check_pdf_conversion_server(self):
        if not self.args.pdf_conversion_server_checking:
            return True
        return not self.conversion_client.server_is_too_busy()

    def service_actions(self):
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= self.args.central_heart_rate:
            self.service_action_count += 1
            if self.service_action_count % 10 == 0:
                self.logger.debug('alive')
            self.last_service_action_time_stamp = current_time
            if os.path.exists(self.PITSTOP_FILE):
                self.stop_process = True
                self.logger.debug(
                    "stop sending tasks, exit for a pit stop after all tasks complete"
                )
                os.unlink(self.PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                self.logger.debug("exit via exception")
                raise Exception("exit for pit stop")
            try:
                self.forget_old_remote_processes(current_time)
            except Exception as exp:
                self.logger.error(exp)
            self.check_yandex_cloud()
            if not self.check_pdf_conversion_server():
                self.logger.debug(
                    "stop sending tasks, because conversion pdf queue length is {}"
                    .format(self.conversion_client.
                            last_pdf_conversion_queue_length))

    def get_stats(self):
        workers = dict((k, list(r.write_to_json() for r in v))
                       for (k, v) in self.worker_2_running_tasks.items())
        stats = {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.web_sites_to_process),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers,
            'last_service_action_time_stamp': self.last_service_action_time_stamp,
            'central_heart_rate': self.args.central_heart_rate,
            'register_task_result_error_count': self.register_task_result_error_count
        }
        if self.stop_process:
            stats['stop_process'] = True
        return stats
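
service_actions above watches for PITSTOP_FILE, so a graceful shutdown can be requested from outside by creating that file; the server stops handing out tasks and exits once all running jobs finish:

# request a "pit stop" from another process or a cron job
open(TDlrobotHTTPServer.PITSTOP_FILE, "w").close()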
Example #15
 def init_conversion():
     TDownloadEnv.CONVERSION_CLIENT = TDocConversionClient()
     TDownloadEnv.CONVERSION_CLIENT.start_conversion_thread()
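
The matching teardown is not shown; a symmetric sketch, assuming the stop_conversion_thread method used in Example #5:

def stop_conversion():
    if TDownloadEnv.CONVERSION_CLIENT is not None:
        TDownloadEnv.CONVERSION_CLIENT.stop_conversion_thread()
        TDownloadEnv.CONVERSION_CLIENT = None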
Example #16
from ConvStorage.conversion_client import TDocConversionClient
from common.logging_wrapper import setup_logging

import sys
import argparse
import os

def parse_args(arg_list):
    parser = argparse.ArgumentParser()
    parser.add_argument('input', nargs='*')
    parser.add_argument("--conversion-server", dest='conversion_server', required=False)
    TDocConversionClient.DECLARATOR_CONV_URL = os.environ.get('DECLARATOR_CONV_URL')  # reread for tests
    return parser.parse_args(arg_list)


if __name__ == '__main__':
    logger = setup_logging(log_file_name="get_docx.log", append_mode=True)
    client = TDocConversionClient(parse_args(sys.argv[1:]), logger)
    for sha256 in client.args.input:
        output_file_path = sha256 + '.docx'
        if client.retrieve_document(sha256, output_file_path, verbose=True):
            logger.info("create {}".format(output_file_path))
        else:
            logger.info("cannot find {}".format(sha256))
            sys.exit(1)
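
A programmatic equivalent of the loop above, with a placeholder digest:

client = TDocConversionClient(parse_args([]), logger)
sha256 = "0" * 64  # placeholder, not a real document hash
if client.retrieve_document(sha256, sha256 + ".docx", verbose=True):
    logger.info("create {}.docx".format(sha256))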

Example #17
class TExportHumanFiles:

    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument("--table", dest='table', default="declarations_documentfile")
        parser.add_argument("--document-file-id", dest='document_file_id', required=False)
        parser.add_argument("--tmp-folder", dest='tmp_folder', default=None)
        parser.add_argument("--dlrobot-human-json", dest='dlrobot_human_json', default="human_files.json")
        parser.add_argument("--start-from-an-empty-file", dest='start_from_empty', action="store_true", default=False)
        parser.add_argument("--max-files-count", dest='max_files_count', type=int)
        parser.add_argument("--mysql-port", dest='mysql_port', type=int, default=None)
        parser.add_argument("--pdf-conversion-timeout", dest='pdf_conversion_timeout',
                                default=1*60*60,
                                type=int,
                                help="pdf conversion timeout")
        parser.add_argument("--pdf-conversion-queue-limit", dest='pdf_conversion_queue_limit', type=int,
                            default=100 * 2 ** 20, help="max sum size of al pdf files that are in pdf conversion queue",
                            required=False)

        return parser.parse_args(arg_list)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="export_human_files.log")
        self.args = args
        if self.args.tmp_folder is None:
            self.args.tmp_folder = tempfile.mkdtemp("export_human")
            self.logger.debug("create folder {}".format(self.args.tmp_folder))
        else:
            self.logger.debug("rm folder {}".format(self.args.tmp_folder))
            shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
            os.mkdir(self.args.tmp_folder)
        self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
        self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
        self.new_pdfs = set()

    def __enter__(self):
        self.pdf_conversion_client.start_conversion_thread()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.pdf_conversion_client.stop_conversion_thread()
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
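
A hedged usage sketch of the class as a context manager (argument values are illustrative):

args = TExportHumanFiles.parse_args([
    "--start-from-an-empty-file",
    "--max-files-count", "10",
])
with TExportHumanFiles(args) as exporter:
    exporter.export_files()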

    def unarchive(self, input_file):
        base_name, file_extension = os.path.splitext(os.path.basename(input_file))
        output_folder = os.path.dirname(input_file)
        dearchiver = TDearchiver(self.logger, output_folder)
        for _, _, filename in dearchiver.dearchive_one_archive(file_extension, input_file, base_name):
            yield filename

    def download_file_and_unzip(self, file_url, filename):
        file_without_extension, extension = os.path.splitext(filename)
        if not os.path.isfile(filename):
            self.logger.debug("download {0}  to {1}".format(file_url, filename))
            result = requests.get(file_url)
            with open(filename, 'wb') as fd:
                fd.write(result.content)
            if extension == '.zip':
                try:
                    for archive_filename in self.unarchive(filename):
                        yield archive_filename
                except Exception as e:
                    self.logger.error("cannot unzip  {}, exception={}".format(filename, e))
            else:
                yield filename
        else:
            if extension == '.zip':
                for archive_filename in glob.glob("{}_*".format(file_without_extension)):
                    yield archive_filename
            else:
                yield filename
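
A sketch of consuming this generator (the URL and paths are illustrative, exporter is the instance from the sketch above): for a .zip it yields each unpacked member, otherwise the downloaded file itself.

for name in exporter.download_file_and_unzip(
        "https://example.org/media/doc.zip",   # illustrative url
        "/tmp/export_human/123.zip"):          # illustrative local path
    print(name)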

    def get_all_file_sql_records(self):
        if self.args.mysql_port is None:
            db = pymysql.connect(db="declarator", user="******", password="******", unix_socket="/var/run/mysqld/mysqld.sock" )
        else:
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 port=self.args.mysql_port)
        cursor = db.cursor()
        if self.args.document_file_id is not None:
            where_clause = "where f.id = {}\n".format(self.args.document_file_id)
        else:
            where_clause = ""
        query = ("""
                    select f.id, d.id, f.file, f.link, d.office_id, d.income_year 
                    from {} f
                    join declarations_document d on f.document_id=d.id
                    {} 
                 """.format(self.args.table, where_clause))
        self.logger.debug(query.replace("\n", " "))
        cursor.execute(query)
        for (document_file_id, document_id, filename, link, office_id, income_year) in cursor:
            if filename is not None and len(filename) > 0:
                yield document_file_id, document_id, filename, link, office_id, income_year

        cursor.close()
        db.close()

    def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
        path, declarator_filename = os.path.split(declarator_url_path)
        _, ext = os.path.splitext(declarator_filename)
        ext = ext.lower()
        temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
        declarator_url = os.path.join(DECLARATOR_DOMAIN, "media", urllib.parse.quote(declarator_url_path))
        declarator_url = declarator_url.replace('\\', '/')

        for file_name in self.download_file_and_unzip(declarator_url, temp_file):
            self.source_doc_client.send_file(file_name)
            if file_name.lower().endswith('.pdf'):
                _, extension = os.path.splitext(file_name)
                self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
                self.new_pdfs.add(build_dislosures_sha256(file_name))
            else:
                self.smart_parser_server_client.send_file(file_name)
            yield file_name, declarator_url

        self.pdf_conversion_client.wait_all_tasks_to_be_sent()
        for f in os.listdir(self.args.tmp_folder):
            os.unlink(os.path.join(self.args.tmp_folder, f))

    def fix_list(self, sha256, office_id):
        fixed_office_id = FIX_LIST.get(sha256)
        if fixed_office_id is not None:
            return fixed_office_id
        else:
            return office_id

    def export_files(self):
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)

        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue

            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5*60)

            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(
                    file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()

    def send_new_pdfs_to_smart_parser(self):
        self.logger.debug("wait pdf conversion for {} seconds".format(self.args.pdf_conversion_timeout))
        self.pdf_conversion_client.wait_doc_conversion_finished(self.args.pdf_conversion_timeout)

        missed_pdf_count = 0
        received_pdf_count = 0
        for sha256 in self.new_pdfs:
            self.logger.debug("try to converted file for {}".format(sha256))
            handle, temp_filename = tempfile.mkstemp(suffix=".docx")
            os.close(handle)
            if self.pdf_conversion_client.retrieve_document(sha256, temp_filename):
                received_pdf_count += 1
                self.logger.debug("send the converted file to smart parser")
                self.smart_parser_server_client.send_file(temp_filename)
            else:
                self.logger.error("converted file is not received")
                missed_pdf_count += 1
            os.unlink(temp_filename)
        if missed_pdf_count > 0:
            self.logger.error('received_pdf_count = {}, missed_pdf_count={}'.format(received_pdf_count, missed_pdf_count))