def export_files(self):
    """Export declarator files that are missing from the dlrobot_human database.

    Opens (or, with ``args.start_from_empty``, creates) the database named by
    ``args.dlrobot_human_json``, skips every SQL record whose
    ``document_file_id`` is already referenced there, downloads/unpacks the
    remaining files and stores one source document per local file. Stops
    after ``args.max_files_count`` files when that limit is set. Afterwards
    the database is closed and ``send_new_pdfs_to_smart_parser`` is called.
    """
    human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
    if self.args.start_from_empty:
        human_files_db.create_db()
    else:
        human_files_db.open_write_mode()

    files_count = 0
    try:
        # Ids of document files that are already registered; these are not exported again.
        document_file_ids = {
            ref.document_file_id
            for _, doc in human_files_db.get_all_documents()
            for ref in doc.decl_references
            if ref.document_file_id is not None
        }
        max_files_count = self.args.max_files_count

        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue

            # Check the limit before (possibly) blocking on the conversion server:
            # there is no point in waiting 5 minutes only to stop right afterwards.
            if max_files_count is not None and files_count >= max_files_count:
                break

            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5 * 60)

            # Web domain of the link without a leading "www."
            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(
                    file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
    finally:
        # Close the database even when an export step raises, so it is not left open.
        human_files_db.close_db()
    self.send_new_pdfs_to_smart_parser()
 def get_all_documents(self):
     """Yield ``(sha256, TSourceDocument)`` for every record stored in ``self.db``.

     The database is walked with ``firstkey()``/``nextkey()``; each key is
     decoded with the 'latin' codec and each value is parsed as JSON.
     """
     key = self.db.firstkey()
     while key is not None:
         document = TSourceDocument().from_json(json.loads(self.db[key]))
         yield key.decode('latin'), document
         key = self.db.nextkey(key)
示例#3
0
    def register_document_in_database(self, sha256, src_doc: TSourceDocument):
        """Create the database rows for one source document and its references.

        Saves a ``models.Source_Document`` whose id comes from
        ``self.permalinks_db_source_document``, then one
        ``models.Declarator_File_Reference`` per declarator reference and one
        ``models.Web_Reference`` per web reference of ``src_doc``.

        Returns the saved ``models.Source_Document`` instance.
        Raises ``AssertionError`` if a document with the obtained id already exists.
        """
        db_document = models.Source_Document(
            sha256=sha256,
            intersection_status=src_doc.build_intersection_status(),
        )
        doc_id, new_file = self.permalinks_db_source_document.get_source_doc_id_by_sha256(sha256)
        db_document.id = doc_id
        # The id must not be taken by an already registered document.
        assert not models.Source_Document.objects.filter(id=doc_id).exists()
        self.logger.debug(
            "register doc sha256={} id={}, new_file={}".format(sha256, doc_id, new_file))
        db_document.file_extension = src_doc.file_extension
        db_document.save()

        for decl_ref in src_doc.decl_references:
            declarator_row = models.Declarator_File_Reference(
                source_document=db_document,
                declarator_documentfile_id=decl_ref.document_file_id,
                declarator_document_id=decl_ref.document_id,
                web_domain=decl_ref._site_url,
                declarator_document_file_url=decl_ref.document_file_url)
            declarator_row.save()

        for web_ref in src_doc.web_references:
            web_row = models.Web_Reference(
                source_document=db_document,
                dlrobot_url=web_ref.url,
                web_domain=web_ref._site_url,
                crawl_epoch=web_ref.crawl_epoch)
            web_row.save()

        return db_document
 def set_office_id(self, sha256, src_doc: TSourceDocument, office_id, method_name: str):
     """Store ``office_id`` as the calculated office of ``src_doc`` and persist it.

     Logs at info level when an already calculated office id is replaced by a
     different one, at debug level otherwise. ``method_name`` only appears in
     the log message.
     """
     previous_office_id = src_doc.calculated_office_id
     office_changed = previous_office_id is not None and office_id != previous_office_id
     if office_changed:
         self.logger.info("change office_id from {} to {} for file {} , ({})".format(
             previous_office_id, office_id, sha256, method_name))
     else:
         self.logger.debug("set file {} office_id={} ({} )".format(
             sha256, office_id, method_name))
     src_doc.calculated_office_id = office_id
     self.dlrobot_human.update_source_document(sha256, src_doc)
示例#5
0
    def calc_income_year(self, input_json, src_doc: TSourceDocument,
                         section_json, section_index):
        """Return the income year of one section as an ``int``.

        The section's own 'year' wins (one file may hold declarants with
        different years); otherwise the year calculated for the whole document
        by ``src_doc.calc_document_income_year(input_json)`` is used.

        Raises ``TSmartParserSectionJson.SerializerException`` when neither
        source provides a year.
        """
        # No default for get(): smart_parser explicitly writes "year": null.
        section_year = section_json.get('year')
        if section_year is None:
            document_year = src_doc.calc_document_income_year(input_json)
            if document_year is None:
                # Without a year the file is useless.
                raise TSmartParserSectionJson.SerializerException(
                    "year is not defined: section No {}".format(section_index))
            return int(document_year)
        return int(section_year)
 def add_dlrobot_file(self,
                      sha256,
                      file_extension,
                      web_refs=None,
                      decl_refs=None):
     """Add references to the source document ``sha256`` in the output database.

     The document is created with ``file_extension`` when the output database
     does not know ``sha256`` yet. Every item of ``web_refs`` and ``decl_refs``
     is added to it, and the document is then written back once.

     ``web_refs`` / ``decl_refs`` default to ``None``, meaning "no references";
     ``None`` replaces the former shared mutable list defaults.
     """
     src_doc = self.output_dlrobot_human.get_document_maybe(sha256)
     if src_doc is None:
         # A single write at the end is enough; the new document is not stored twice.
         src_doc = TSourceDocument(file_extension)
     for web_ref in () if web_refs is None else web_refs:
         src_doc.add_web_reference(web_ref)
     for decl_ref in () if decl_refs is None else decl_refs:
         src_doc.add_decl_reference(decl_ref)
     self.output_dlrobot_human.update_source_document(sha256, src_doc)
    def predict_tax_office(self, sha256, src_doc: TSourceDocument):
        """Try to assign a regional tax office to a document from service.nalog.ru.

        For a web reference whose site url ends with "service.nalog.ru", the
        region id is taken from ``src_doc.region_id`` or, when that is unset,
        from the numeric prefix (the part before the first dot) of the url in
        the first 'document_sheet_props' entry of the smart parser json. The
        office found for that region in ``self.regional_tax_offices`` is stored
        via ``set_office_id``.

        Returns True when an office id was set, False otherwise.
        """
        for web_ref in src_doc.web_references:
            if not web_ref._site_url.endswith("service.nalog.ru"):
                continue

            if src_doc.region_id is None:
                smart_parser_json = self.smart_parser_server_client.retrieve_json_by_sha256(sha256)
                if smart_parser_json is None:
                    return False
                props = smart_parser_json.get('document_sheet_props')
                if props is None or len(props) == 0 or 'url' not in props[0]:
                    return False
                url = props[0]['url']
                if not isinstance(url, str):
                    # e.g. an explicit null in the json: no region can be derived
                    return False
                # partition() instead of url[:url.find('.')]: without a dot find()
                # returns -1 and the slice would silently drop the last character.
                region_str, dot, _ = url.partition('.')
                if not dot or not region_str.isdigit():
                    return False
                src_doc.region_id = int(region_str)

            office_id = self.regional_tax_offices.get(src_doc.region_id)
            if office_id is not None:
                self.set_office_id(sha256, src_doc, office_id, "regional tax office")
                return True
        return False
 def convert_from_json_fle(self, json_path: str):
     """Load every document from a json file into this database.

     The file must contain a top-level 'documents' object that maps keys
     (used as sha256 in ``update_source_document``) to serialized source
     documents.
     """
     # JSON is read as UTF-8 explicitly instead of relying on the locale's
     # default encoding; the stored documents may contain non-ASCII text.
     with open(json_path, encoding="utf-8") as inp:
         documents = json.load(inp)['documents']
     for sha256, doc_json in documents.items():
         self.update_source_document(sha256, TSourceDocument().from_json(doc_json))
 def get_document_maybe(self, sha256):
     """Return the source document stored under ``sha256``, or None if ``self.db.get`` finds nothing."""
     raw = self.db.get(sha256)
     return None if raw is None else TSourceDocument().from_json(json.loads(raw))
示例#10
0
 def get_document(self, sha256) -> TSourceDocument:
     """Return the source document stored under ``sha256``.

     Unlike ``get_document_maybe`` this indexes ``self.db`` directly, so a
     missing key is an error (presumably ``KeyError`` - depends on the db type).
     """
     raw = self.db[sha256]
     return TSourceDocument().from_json(json.loads(raw))
示例#11
0
 def update_source_document(self, sha256, src_doc: TSourceDocument):
     """Serialize ``src_doc`` to JSON (non-ASCII kept as is) and store it under ``sha256``.

     Must not be called when the database access mode is 'r'.
     """
     assert self.access_mode != 'r'
     serialized = json.dumps(src_doc.write_to_json(), ensure_ascii=False)
     self.db[sha256] = serialized