def __init__(self):
    """Configure the converter from the ``*_FOP_FULL`` settings.

    Records are delimited by the ``SUBJECT`` tag. Two empty dictionaries
    and a bulk manager are prepared before the parent initialiser runs.
    """
    # Per-run working state.
    self.new_fops_foptokveds = dict()
    self.new_fops_exchange_data = dict()
    self.bulk_manager = BulkCreateManager()

    # Source file location and parsing parameters.
    self.RECORD_TAG = 'SUBJECT'
    self.CHUNK_SIZE = settings.CHUNK_SIZE_FOP_FULL
    self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_FOP_FULL
    self.LOCAL_FOLDER = settings.LOCAL_FOLDER

    super().__init__()
def __init__(self):
    """Configure the converter from the ``*_FOP`` settings.

    The dataset API address is read from the ``Register`` row whose
    ``source_register_id`` is hard-coded below. Records are delimited by
    the ``RECORD`` tag.
    """
    source_register = Register.objects.get(
        source_register_id="1c7f3815-3259-45e0-bdf1-64dca07ddc10",
    )
    self.API_ADDRESS_FOR_DATASET = source_register.source_api_address

    # Source file location and parsing parameters.
    self.RECORD_TAG = 'RECORD'
    self.CHUNK_SIZE = settings.CHUNK_SIZE_FOP
    self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_FOP
    self.LOCAL_FOLDER = settings.LOCAL_FOLDER

    # Per-run working state.
    self.bulk_manager = BulkCreateManager()
    self.new_fops_foptokveds = dict()
    self.new_fops_exchange_data = dict()

    super().__init__()
def __init__(self):
    """Configure the converter from the ``*_UO`` settings.

    Creates two bulk managers, loads the stored ``Bylaw`` and
    ``Predecessor`` objects into dictionaries keyed by ``name`` and
    prepares empty containers before the parent initialiser runs.
    """
    # Source file location and parsing parameters.
    self.RECORD_TAG = 'RECORD'
    self.CHUNK_SIZE = settings.CHUNK_SIZE_UO
    self.LOCAL_FOLDER = settings.LOCAL_FOLDER
    self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_UO

    # Separate queues for companies and for their branches.
    self.bulk_manager, self.branch_bulk_manager = (
        BulkCreateManager(),
        BulkCreateManager(),
    )

    # Caches of already stored objects, keyed by their name.
    app_label = "business_register"
    self.all_bylaw_dict = self.put_objects_to_dict("name", app_label, "Bylaw")
    self.all_predecessors_dict = self.put_objects_to_dict(
        "name", app_label, "Predecessor")

    # Per-run working state.
    self.all_company_founders = list()
    self.branch_to_parent = dict()
    self.all_companies_dict = dict()

    super().__init__()
def __init__(self):
    """Configure the converter from the ``*_RATU`` settings.

    The dataset API address is read from the ``Register`` row identified
    by ``settings.LOCATION_RATU_SOURCE_REGISTER_ID``. Stored location
    objects are loaded into ``all_*_dict`` caches, and the code-keyed
    models are loaded a second time into ``outdated_*_dict`` caches.
    """
    source_register = Register.objects.get(
        source_register_id=settings.LOCATION_RATU_SOURCE_REGISTER_ID,
    )
    self.API_ADDRESS_FOR_DATASET = source_register.source_api_address

    # Source file location and parsing parameters.
    self.RECORD_TAG = 'RECORD'
    self.CHUNK_SIZE = settings.CHUNK_SIZE_RATU
    self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_RATU
    self.LOCAL_FOLDER = settings.LOCAL_FOLDER

    self.bulk_manager = BulkCreateManager()

    load = self.put_objects_to_dict
    # Regions are the only objects cached by name; the rest use the code.
    self.all_regions_dict = load('name', 'location_register', 'RatuRegion')

    code_keyed_models = (
        ('districts', 'RatuDistrict'),
        ('cities', 'RatuCity'),
        ('citydistricts', 'RatuCityDistrict'),
        ('streets', 'RatuStreet'),
    )
    # Each dictionary is loaded by its own call, so the "all" and
    # "outdated" caches are distinct objects.
    for prefix in ('all', 'outdated'):
        for plural, model_name in code_keyed_models:
            setattr(
                self,
                f'{prefix}_{plural}_dict',
                load('code', 'location_register', model_name),
            )

    super().__init__()
class RfopConverter(Converter):
    """Converter that writes FOP records into the ``Rfop`` table.

    While the class body is executed, all stored ``State`` and ``Kved``
    objects are loaded into class-level dictionaries and a usage hint is
    printed.
    """

    LOCAL_FILE_NAME = "fop.xml"
    DATASET_ID = "1c7f3815-3259-45e0-bdf1-64dca07ddc10"
    CHUNK_SIZE = 200

    def rename_file(self, file):
        """Return the canonical local name for a source file.

        A name containing ``FOP`` (case-insensitive) maps to ``fop.xml``,
        otherwise one containing ``UO`` maps to ``uo.xml``; any other name
        is returned unchanged.
        """
        upper_name = file.upper()
        # 'FOP' is tested first because it takes precedence over 'UO'.
        if 'FOP' in upper_name:
            return 'fop.xml'
        if 'UO' in upper_name:
            return 'uo.xml'
        return file

    # list of models for clearing DB
    tables = [Rfop]

    # template of a record's data
    record = dict.fromkeys(('RECORD', 'FIO', 'ADDRESS', 'KVED', 'STAN'), '')

    # dictionaries of already stored objects (the values are whole model
    # instances), filled from the DB right below
    state_dict = {}
    kved_dict = {}

    bulk_manager = BulkCreateManager(CHUNK_SIZE)

    for state in State.objects.all():
        state_dict[state.name] = state
    for kved in Kved.objects.all():
        kved_dict[kved.code] = kved

    def save_to_db(self, record):
        """Store one record: resolve its state and kved, then queue the Rfop."""
        state = self.save_to_state_table(record)
        kved = self.get_kved_from_DB(record, 'FIO')
        self.save_to_rfop_table(record, state, kved)
        print('saved')

    def save_to_state_table(self, record):
        """Return the ``State`` for the record, creating and caching it if new."""
        state_name = record['STAN'] or State.EMPTY_FIELD
        try:
            return self.state_dict[state_name]
        except KeyError:
            new_state = State(name=state_name)
            new_state.save()
            self.state_dict[state_name] = new_state
            return new_state

    def save_to_rfop_table(self, record, state, kved):
        """Build an ``Rfop`` from the record and hand it to the bulk manager."""
        self.bulk_manager.add(
            Rfop(
                state=state,
                kved=kved,
                fullname=record['FIO'],
                address=record['ADDRESS'],
            )
        )

    print(
        'Rfop_class already imported. For start rewriting RFOP to the DB run > RfopConverter().process()\n',
        'For clear RFOP tables run > RfopConverter().clear_db()')
class UkrCompanyConverter(CompanyConverter):
    """Store records of the Ukrainian companies (UO) register in the DB.

    ``save_to_db`` and ``save_detail_company_to_db`` are the two record
    loaders. The ``add_*`` methods only queue related objects on the bulk
    managers and are not called from within this class at the moment.
    """

    def __init__(self):
        """Read the ``*_UO`` settings and prepare caches and bulk managers."""
        self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_UO
        self.LOCAL_FOLDER = settings.LOCAL_FOLDER
        self.CHUNK_SIZE = settings.CHUNK_SIZE_UO
        self.RECORD_TAG = 'RECORD'
        self.bulk_manager = BulkCreateManager()
        self.branch_bulk_manager = BulkCreateManager()
        self.all_bylaw_dict = self.put_objects_to_dict(
            "name", "business_register", "Bylaw")
        self.all_predecessors_dict = self.put_objects_to_dict(
            "name", "business_register", "Predecessor")
        self.all_companies_dict = {}
        self.branch_to_parent = {}
        self.all_company_founders = []
        super().__init__()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _assign_changed(instance, new_values, skip_empty=False):
        """Set every differing attribute on *instance*.

        :param new_values: mapping of attribute name to the new value
        :param skip_empty: when true, falsy new values never overwrite
        :return: list of the attribute names that were changed
        """
        changed = []
        for field, value in new_values.items():
            if skip_empty and not value:
                continue
            if getattr(instance, field) != value:
                setattr(instance, field, value)
                changed.append(field)
        return changed

    @staticmethod
    def _save_changed(instance, changed_fields):
        """Save only *changed_fields* (plus ``updated_at``) if there are any."""
        if changed_fields:
            instance.save(update_fields=changed_fields + ['updated_at'])

    @staticmethod
    def _pop_stored_by_name(stored_founders, name):
        """Remove and return the first stored founder named *name*, else None."""
        for index, stored in enumerate(stored_founders):
            if stored.name == name:
                return stored_founders.pop(index)
        return None

    def _queue_company_kveds(self, kveds_from_record, code, manager):
        """Queue a ``CompanyToKved`` on *manager* for every named activity."""
        for item in kveds_from_record:
            name_elements = item.xpath('NAME')
            if not name_elements:
                continue
            kved_name = name_elements[0].text
            if not kved_name:
                continue
            company_to_kved = CompanyToKved()
            company_to_kved.kved = self.get_kved_from_DB(kved_name)
            company_to_kved.primary_kved = item.xpath(
                'PRIMARY')[0].text == "так"
            company_to_kved.hash_code = code
            manager.add(company_to_kved)

    # ------------------------------------------------------------------
    # cached lookups
    # ------------------------------------------------------------------
    def save_or_get_bylaw(self, bylaw_from_record):
        """Return the cached ``Bylaw`` with this name, creating it if new."""
        if bylaw_from_record not in self.all_bylaw_dict:
            new_bylaw = Bylaw.objects.create(name=bylaw_from_record)
            self.all_bylaw_dict[bylaw_from_record] = new_bylaw
            return new_bylaw
        return self.all_bylaw_dict[bylaw_from_record]

    def save_or_get_predecessor(self, item):
        """Return the cached ``Predecessor`` for the element, creating it if new.

        The name is stored lower-cased, so the cache is looked up and
        filled with the lower-cased name as well. Keying by the raw name
        misses the entries loaded from the DB for any mixed-case name.
        """
        name = item.xpath('NAME')[0].text.lower()
        if name not in self.all_predecessors_dict:
            self.all_predecessors_dict[name] = Predecessor.objects.create(
                name=name, code=item.xpath('CODE')[0].text)
        return self.all_predecessors_dict[name]

    # ------------------------------------------------------------------
    # founders and beneficiaries
    # ------------------------------------------------------------------
    def extract_detail_founder_data(self, founder_info):
        """Split a detailed founder entry into its parts.

        :param founder_info: comma-separated text of one founder entry
        :return: tuple ``(name, edrpou, address, equity)``; every element
            except ``name`` may be ``None``
        """
        parts = [part.strip() for part in founder_info.split(',')]
        # the first element is the name
        name = parts[0]
        # the second element, when present, is usually the EDRPOU code
        edrpou = None
        if len(parts) > 1 and self.find_edrpou(parts[1]):
            edrpou = parts[1]
        # the code may be further on when the name itself contains commas
        if not edrpou:
            for part in parts:
                if self.find_edrpou(part):
                    edrpou = part
                    # everything before the code is the name
                    name = founder_info.split(part)[0]
                    logger.warning(f'Нестандартний запис: {founder_info}')
                    break
        equity = None
        element_with_equity = None
        # usually equity is at the end of the record
        for part in parts:
            if (part.startswith('розмір внеску до статутного фонду')
                    and part.endswith('грн.')):
                element_with_equity = part
                amounts = re.findall(r'\d+\.\d+', part)
                # an amount without a decimal part does not match
                if amounts:
                    equity = float(amounts[0])
                break
        # deleting all info except the address
        address = founder_info.replace(name, '')
        if edrpou:
            address = address.replace(edrpou, '')
        if element_with_equity:
            address = address.replace(element_with_equity, '')
        if address and len(address) < 15:
            address = None
        if address and len(address) > 200:
            logger.warning(
                f'Завелика адреса: {address} із запису: {founder_info}')
        return name, edrpou, address, equity

    def extract_founder_data(self, founder_info):
        """Split a founder entry into name, beneficiary flag, address, equity.

        The text is split on the first comma only, because the equity part
        contains a comma itself.

        :return: tuple ``(name, is_beneficiary, address, equity)``;
            ``address`` and ``equity`` are ``None`` when not present
        """
        parts = [part.strip() for part in founder_info.split(',', 1)]
        name = parts[0]
        is_beneficiary = name.startswith('КІНЦЕВИЙ БЕНЕФІЦІАРНИЙ ВЛАСНИК')
        if len(parts) < 2:
            # nothing follows the name
            return name, is_beneficiary, None, None
        second_part = parts[1]
        equity = None
        address = None
        if second_part.startswith('розмір частки'):
            amounts = re.findall(r'\d+\,\d+', second_part)
            # an amount without a decimal part does not match
            if amounts:
                equity = float(amounts[0].replace(',', '.'))
        else:
            address = second_part
        return name, is_beneficiary, address, equity

    def save_or_update_founders(self, founders_from_record, company):
        """Synchronise the stored founders of *company* with the record.

        Founders are matched by lower-cased name. A matched one is updated,
        an unmatched one is created with ``is_founder=True``, and every
        stored founder left unmatched is soft-deleted.
        """
        # NOTE(review): every Founder of the company is loaded here, so
        # rows matched only by save_or_update_beneficiaries look outdated
        # to this method (and vice versa) - confirm that this is intended.
        already_stored_founders = list(Founder.objects.filter(company=company))
        for item in founders_from_record:
            info = item.text
            # checking if field contains data
            if not info or info.endswith('ВІДСУТНІЙ'):
                continue
            # checking if there is additional data except name
            if ',' in info:
                name, is_beneficiary, address, equity = (
                    self.extract_founder_data(info))
            else:
                name, is_beneficiary, address, equity = info, False, None, None
            name = name.lower()

            stored_founder = self._pop_stored_by_name(
                already_stored_founders, name)
            if stored_founder is None:
                Founder.objects.create(company=company,
                                       info=info,
                                       name=name,
                                       address=address,
                                       equity=equity,
                                       is_beneficiary=is_beneficiary,
                                       is_founder=True)
                continue
            # empty values from the record never overwrite stored ones
            changed = self._assign_changed(
                stored_founder,
                {'info': info, 'address': address, 'equity': equity},
                skip_empty=True)
            changed += self._assign_changed(
                stored_founder, {'is_beneficiary': is_beneficiary})
            self._save_changed(stored_founder, changed)

        for outdated_founder in already_stored_founders:
            outdated_founder.soft_delete()

    def extract_beneficiary_data(self, beneficiary_info):
        """Split a beneficiary entry into ``(name, edrpou, address)``.

        The text after the first comma is the address. When its first
        comma-separated element is an EDRPOU code, that code is returned
        separately and removed from the address. ``edrpou`` and ``address``
        are ``None`` when nothing follows the name.
        """
        parts = [part.strip() for part in beneficiary_info.split(',', 1)]
        name = parts[0]
        if len(parts) < 2:
            return name, None, None
        rest = parts[1]
        next_word_after_name = rest.split(',', 1)[0]
        if self.find_edrpou(next_word_after_name):
            edrpou = next_word_after_name
            address = rest.replace(edrpou, '')
        else:
            edrpou = None
            address = rest
        return name, edrpou, address

    def save_or_update_beneficiaries(self, beneficiares_from_record, company):
        """Synchronise the stored beneficiaries of *company* with the record.

        Entries are matched to stored ``Founder`` rows by lower-cased name.
        A matched row is flagged as beneficiary and updated, an unmatched
        entry is created, and every stored row left unmatched is
        soft-deleted.
        """
        # NOTE(review): see the note in save_or_update_founders - rows that
        # are founders only are soft-deleted here as well; confirm.
        already_stored_founders = list(Founder.objects.filter(company=company))
        for item in beneficiares_from_record:
            info = item.text
            # skipping empty elements, as save_or_update_founders does
            if not info:
                continue
            name, edrpou, address = self.extract_beneficiary_data(info)
            name = name.lower()

            stored_founder = self._pop_stored_by_name(
                already_stored_founders, name)
            if stored_founder is None:
                Founder.objects.create(company=company,
                                       info=info,
                                       name=name,
                                       edrpou=edrpou,
                                       address=address,
                                       is_beneficiary=True)
                continue
            changed = []
            if not stored_founder.is_beneficiary:
                stored_founder.is_beneficiary = True
                changed.append('is_beneficiary')
            changed += self._assign_changed(
                stored_founder,
                {'edrpou': edrpou, 'address': address},
                skip_empty=True)
            self._save_changed(stored_founder, changed)

        for outdated_founder in already_stored_founders:
            outdated_founder.soft_delete()

    # ------------------------------------------------------------------
    # builders that queue related objects on the bulk managers
    # ------------------------------------------------------------------
    def branch_create(self, item, code):
        """Build (without saving) a ``Company`` for a branch element."""
        branch = Company()
        branch.name = item.xpath('NAME')[0].text
        branch.short_name = code
        branch.address = item.xpath('ADDRESS')[0].text
        create_date = item.xpath('CREATE_DATE')[0].text
        if create_date:
            branch.registration_date = format_date_to_yymmdd(
                create_date) or None
        branch.contact_info = item.xpath('CONTACTS')[0].text
        branch.authority = self.authority
        branch.bylaw = self.bylaw
        branch.company_type = self.company_type
        branch.status = self.status
        branch.hash_code = self.create_hash_code(branch.name, code)
        return branch

    def add_company_detail(self, founding_document_number, executive_power,
                           superior_management, managing_paper,
                           terminated_info, termination_cancel_info, vp_dates,
                           code):
        """Queue a ``CompanyDetail`` built from the given values."""
        company_detail = CompanyDetail()
        company_detail.founding_document_number = founding_document_number
        company_detail.executive_power = executive_power
        company_detail.superior_management = superior_management
        company_detail.managing_paper = managing_paper
        company_detail.terminated_info = terminated_info
        company_detail.termination_cancel_info = termination_cancel_info
        company_detail.vp_dates = vp_dates
        company_detail.hash_code = code
        self.bulk_manager.add(company_detail)

    def add_assignees(self, assignees_from_record, code):
        """Queue an ``Assignee`` for every element of the record."""
        for item in assignees_from_record:
            assignee = Assignee()
            assignee.name = item.xpath('NAME')[0].text.lower()
            assignee.edrpou = item.xpath('CODE')[0].text
            assignee.hash_code = code
            self.bulk_manager.add(assignee)

    def add_bancruptcy_readjustment(self, record, code):
        """Queue a ``BancruptcyReadjustment`` if the record has an OP_DATE."""
        bancruptcy_readjustment = BancruptcyReadjustment()
        op_date = record.xpath('BANKRUPTCY_READJUSTMENT_INFO/OP_DATE')
        if op_date:
            bancruptcy_readjustment.op_date = format_date_to_yymmdd(
                op_date[0].text) or None
            bancruptcy_readjustment.reason = record.xpath(
                'BANKRUPTCY_READJUSTMENT_INFO/REASON')[0].text.lower()
            bancruptcy_readjustment.sbj_state = record.xpath(
                'BANKRUPTCY_READJUSTMENT_INFO/SBJ_STATE')[0].text.lower()
            head_name = record.xpath(
                'BANKRUPTCY_READJUSTMENT_INFO/BANKRUPTCY_READJUSTMENT_HEAD_NAME'
            )[0].text
            if head_name:
                bancruptcy_readjustment.head_name = head_name
            bancruptcy_readjustment.hash_code = code
            self.bulk_manager.add(bancruptcy_readjustment)

    def add_company_to_kved(self, kveds_from_record, code):
        """Queue company-to-kved links on the main bulk manager."""
        self._queue_company_kveds(kveds_from_record, code, self.bulk_manager)

    def add_company_to_kved_branch(self, kveds_from_record, code):
        """Queue company-to-kved links on the branch bulk manager."""
        self._queue_company_kveds(kveds_from_record, code,
                                  self.branch_bulk_manager)

    def add_exchange_data(self, exchange_data, code):
        """Queue an ``ExchangeDataCompany`` for every item with an authority."""
        for item in exchange_data:
            authority_elements = item.xpath('AUTHORITY_NAME')
            if not authority_elements:
                continue
            exchange_answer = ExchangeDataCompany()
            exchange_answer.authority = self.save_or_get_authority(
                authority_elements[0].text)
            taxpayer_type = item.xpath('TAX_PAYER_TYPE')[0].text
            if taxpayer_type:
                exchange_answer.taxpayer_type = (
                    self.save_or_get_taxpayer_type(taxpayer_type))
            start_date = item.xpath('START_DATE')[0].text
            if start_date:
                exchange_answer.start_date = format_date_to_yymmdd(
                    start_date) or None
            exchange_answer.start_number = item.xpath('START_NUM')[0].text
            end_date = item.xpath('END_DATE')[0].text
            if end_date:
                exchange_answer.end_date = format_date_to_yymmdd(
                    end_date) or None
            exchange_answer.end_number = item.xpath('END_NUM')[0].text
            exchange_answer.hash_code = code
            self.bulk_manager.add(exchange_answer)

    def add_exchange_data_branch(self, exchange_data, name, code):
        """Queue an ``ExchangeDataCompany`` for every item of a branch.

        Unlike ``add_exchange_data``, items without an authority are
        queued too and an empty taxpayer type becomes ``Company.INVALID``.
        """
        for item in exchange_data:
            exchange_answer = ExchangeDataCompany()
            authority_elements = item.xpath('AUTHORITY_NAME')
            if authority_elements:
                exchange_answer.authority = self.save_or_get_authority(
                    authority_elements[0].text)
            tax_payer_type = item.xpath(
                'TAX_PAYER_TYPE')[0].text or Company.INVALID
            exchange_answer.taxpayer_type = self.save_or_get_taxpayer_type(
                tax_payer_type)
            start_date = item.xpath('START_DATE')[0].text
            if start_date:
                exchange_answer.start_date = format_date_to_yymmdd(
                    start_date) or None
            exchange_answer.start_number = item.xpath('START_NUM')[0].text
            end_date = item.xpath('END_DATE')[0].text
            if end_date:
                exchange_answer.end_date = format_date_to_yymmdd(
                    end_date) or None
            exchange_answer.end_number = item.xpath('END_NUM')[0].text
            exchange_answer.hash_code = self.create_hash_code(name, code)
            self.branch_bulk_manager.add(exchange_answer)

    def add_company_to_predecessors(self, predecessors_from_record, code):
        """Queue a ``CompanyToPredecessor`` for every named predecessor."""
        for item in predecessors_from_record:
            if item.xpath('NAME'):
                company_to_predecessor = CompanyToPredecessor()
                company_to_predecessor.predecessor = (
                    self.save_or_get_predecessor(item))
                company_to_predecessor.hash_code = code
                self.bulk_manager.add(company_to_predecessor)

    def add_signers(self, signers_from_record, code):
        """Queue a ``Signer`` for every element of the record."""
        for item in signers_from_record:
            signer = Signer()
            signer.name = item.text.lower()
            signer.hash_code = code
            self.bulk_manager.add(signer)

    def add_termination_started(self, record, code):
        """Queue a ``TerminationStarted`` if the record has an OP_DATE element."""
        op_date = record.xpath('TERMINATION_STARTED_INFO/OP_DATE')
        if not op_date:
            return
        termination_started = TerminationStarted()
        if op_date[0].text:
            termination_started.op_date = format_date_to_yymmdd(
                op_date[0].text) or None
        termination_started.reason = record.xpath(
            'TERMINATION_STARTED_INFO'
            '/REASON')[0].text.lower()
        termination_started.sbj_state = record.xpath(
            'TERMINATION_STARTED_INFO/SBJ_STATE')[0].text.lower()
        signer_name = record.xpath(
            'TERMINATION_STARTED_INFO/SIGNER_NAME')[0].text
        if signer_name:
            termination_started.signer_name = signer_name.lower()
        creditor_end_date = record.xpath(
            'TERMINATION_STARTED_INFO/CREDITOR_REQ_END_DATE')[0].text
        if creditor_end_date:
            termination_started.creditor_reg_end_date = format_date_to_yymmdd(
                creditor_end_date) or '01.01.1990'
        termination_started.hash_code = code
        self.bulk_manager.add(termination_started)

    def add_branches(self, record, edrpou):
        """Make sure the 'EMP' placeholder objects exist for every branch.

        TODO: refactor branches storing. Creating or updating the branch
        company, its kveds, exchange data and signer, and filling
        ``branch_to_parent``, is disabled for now.
        """
        for item in record.xpath('BRANCHES')[0]:
            code = item.xpath('CODE')[0].text or Company.INVALID
            self.save_or_get_authority('EMP')
            self.save_or_get_bylaw('EMP')
            self.save_or_get_company_type('EMP', 'uk')
            self.save_or_get_status('EMP')

    # ------------------------------------------------------------------
    # record loaders
    # ------------------------------------------------------------------
    def save_detail_company_to_db(self, records):
        """Create or update a ``Company`` (with founders) for every record.

        Records without a company name or an EDRPOU code are counted in
        ``self.report.invalid_data`` and skipped. After all records are
        processed the queued ``Founder`` objects are committed.

        TODO: company details, kveds, signers, branches, termination and
        bankruptcy info, predecessors, assignees and exchange data are not
        stored until the corresponding ``add_*`` calls are enabled again.
        """
        for record in records:
            # omitting records without company name or edrpou
            raw_name = record.xpath('NAME')[0].text
            edrpou = record.xpath('EDRPOU')[0].text
            if not raw_name or not edrpou:
                self.report.invalid_data += 1
                continue
            name = raw_name.lower()
            code = name + edrpou
            short_name = record.xpath('SHORT_NAME')[0].text
            if short_name:
                short_name = short_name.lower()
            company_type = record.xpath('OPF')[0].text
            if company_type:
                company_type = self.save_or_get_company_type(
                    company_type, 'uk')
            address = record.xpath('ADDRESS')[0].text
            status = self.save_or_get_status(record.xpath('STAN')[0].text)
            authorized_capital = record.xpath('AUTHORIZED_CAPITAL')[0].text
            if authorized_capital:
                # the source uses a decimal comma
                authorized_capital = float(
                    authorized_capital.replace(',', '.'))
            bylaw = self.save_or_get_bylaw(record.xpath('STATUTE')[0].text)
            registration_date = None
            registration_info = None
            registration = record.xpath('REGISTRATION')[0].text
            if registration:
                # the first word is the date, the rest is free text
                registration_date = format_date_to_yymmdd(
                    get_first_word(registration))
                registration_info = cut_first_word(registration)
            contact_info = record.xpath('CONTACTS')[0].text
            authority = self.save_or_get_authority(
                record.xpath('CURRENT_AUTHORITY')[0].text)

            # ToDo: resolve the problem of having records with the same
            # company name and edrpou that results in the same code
            company = Company.objects.filter(code=code).first()
            if not company:
                company = Company(name=name,
                                  short_name=short_name,
                                  company_type=company_type,
                                  edrpou=edrpou,
                                  authorized_capital=authorized_capital,
                                  address=address,
                                  status=status,
                                  bylaw=bylaw,
                                  registration_date=registration_date,
                                  registration_info=registration_info,
                                  contact_info=contact_info,
                                  authority=authority,
                                  code=code)
                company.save()
            else:
                changed = self._assign_changed(company, {
                    'name': name,
                    'short_name': short_name,
                    'company_type': company_type,
                    'authorized_capital': authorized_capital,
                    'address': address,
                    'status': status,
                    'bylaw': bylaw,
                })
                # the stored date is converted before it is compared
                if to_lower_string_if_exists(
                        company.registration_date) != registration_date:
                    company.registration_date = registration_date
                    changed.append('registration_date')
                changed += self._assign_changed(company, {
                    'registration_info': registration_info,
                    'contact_info': contact_info,
                    'authority': authority,
                })
                self._save_changed(company, changed)

            founders = record.xpath('FOUNDERS')[0]
            # len() counts the child elements of the FOUNDERS element
            if len(founders):
                self.save_or_update_founders(founders, company)

        if len(self.bulk_manager.queues['business_register.Founder']):
            self.bulk_manager.commit(Founder)
        self.bulk_manager.queues['business_register.Founder'] = []

    def save_or_update_kved(self, kved, company):
        """Link *kved* to *company* as its primary kved, creating the link if new."""
        current_fop_to_kved = CompanyToKved.objects.filter(company=company,
                                                           kved=kved).first()
        if not current_fop_to_kved:
            CompanyToKved.objects.create(company=company,
                                         kved=kved,
                                         primary_kved=True)
        elif not current_fop_to_kved.primary_kved:
            current_fop_to_kved.primary_kved = True
            current_fop_to_kved.save(
                update_fields=['primary_kved', 'updated_at'])

    def save_to_db(self, records):
        """Create or update a ``Company`` for every record.

        Records without a company name or an EDRPOU code are counted in
        ``self.report.invalid_data`` and skipped. Companies flagged
        ``from_antac_only`` are never matched. For every stored company
        the primary kved, the founders and the beneficiaries are
        synchronised as well.
        """
        country = AddressConverter().save_or_get_country('Ukraine')
        source = Company.UKRAINE_REGISTER
        for record in records:
            # omitting records without company name or edrpou
            raw_name = record.xpath('NAME')[0].text
            edrpou = record.xpath('EDRPOU')[0].text
            if not raw_name or not edrpou:
                self.report.invalid_data += 1
                continue
            name = raw_name.lower()
            short_name = record.xpath('SHORT_NAME')[0].text
            if short_name:
                short_name = short_name.lower()
            code = name + edrpou
            address = record.xpath('ADDRESS')[0].text
            status = self.save_or_get_status(record.xpath('STAN')[0].text)
            boss = record.xpath('BOSS')[0].text
            if boss:
                boss = boss.lower()

            # ToDo: resolve the problem of having records with the same
            # company name and edrpou
            # ToDo: use source after storing source in the server DB
            company = (Company.objects
                       .exclude(from_antac_only=True)
                       .filter(code=code)
                       .first())
            if not company:
                company = Company.objects.create(name=name,
                                                 short_name=short_name,
                                                 edrpou=edrpou,
                                                 address=address,
                                                 status=status,
                                                 boss=boss,
                                                 country=country,
                                                 code=code,
                                                 source=source)
            else:
                changed = self._assign_changed(company, {
                    'name': name,
                    'short_name': short_name,
                    'address': address,
                    'boss': boss,
                    'source': source,
                })
                # related objects are compared by id
                if company.status_id != status.id:
                    company.status = status
                    changed.append('status')
                if company.country_id != country.id:
                    company.country = country
                    changed.append('country')
                self._save_changed(company, changed)

            kved_data = record.xpath('KVED')[0].text
            if kved_data and ' ' in kved_data:
                kved = self.extract_kved(kved_data)
                self.save_or_update_kved(kved, company)

            # len() counts the child elements of the container element
            founders = record.xpath('FOUNDERS')[0]
            if len(founders):
                self.save_or_update_founders(founders, company)
            beneficiaries = record.xpath('BENEFICIARIES')[0]
            if len(beneficiaries):
                self.save_or_update_beneficiaries(beneficiaries, company)
class FopConverter(BusinessConverter):
    """Load FOP records ('ФОП' in the log messages) from parsed XML into the DB.

    Two entry points are provided:

    * ``save_to_db`` handles the short record format (FIO / ADDRESS / STAN /
      KVED) and writes every record straight away;
    * ``save_detailed_fop_to_db`` handles the detailed format and queues new
      ``Fop`` rows together with their ``FopToKved`` / ``ExchangeDataFop``
      children in ``self.bulk_manager``.

    Records are expected to expose ``.xpath()`` and ``.text`` the way lxml
    elements do.
    """

    # Fop attributes whose stored value is a date while the incoming value is
    # whatever format_date_to_yymmdd() returns; they are compared via str().
    _DATE_FIELDS = ('registration_date', 'termination_date')

    def __init__(self):
        self.API_ADDRESS_FOR_DATASET = Register.objects.get(
            source_register_id="1c7f3815-3259-45e0-bdf1-64dca07ddc10"
        ).source_api_address
        self.LOCAL_FOLDER = settings.LOCAL_FOLDER
        self.LOCAL_FILE_NAME = settings.LOCAL_FILE_NAME_FOP
        self.CHUNK_SIZE = settings.CHUNK_SIZE_FOP
        self.RECORD_TAG = 'RECORD'
        self.bulk_manager = BulkCreateManager()
        # Children of not-yet-committed Fop objects, keyed by Fop.code.
        self.new_fops_foptokveds = {}
        self.new_fops_exchange_data = {}
        super().__init__()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first_text(node, tag):
        """Return ``.text`` of the first ``tag`` match under ``node``.

        Returns None when the tag is absent (``.text`` itself may also be
        None for an empty element).
        """
        found = node.xpath(tag)
        if not found:
            return None
        return found[0].text

    @staticmethod
    def _split_dated_text(text):
        """Split ``"<date> <details>"`` into ``(date, details)``.

        Returns ``(None, None)`` for empty text. The date is the first word
        passed through ``format_date_to_yymmdd``; the details are the rest.
        """
        if not text:
            return None, None
        return format_date_to_yymmdd(get_first_word(text)), cut_first_word(text)

    @staticmethod
    def _date_changed(stored_date, new_date):
        """Tell whether a stored date must be replaced by ``new_date``.

        The previous inline check (``stored and str(stored) != new``) never
        fired when nothing was stored yet, so a FOP imported without e.g. a
        termination date could never receive one later. An empty stored value
        now counts as changed as soon as a non-empty new value arrives.
        """
        if not stored_date:
            return bool(new_date)
        return str(stored_date) != new_date

    def _parse_activity(self, activity):
        """Turn one activity node into ``(kved, is_primary)``.

        Returns None when CODE or NAME is missing or empty, so the caller can
        skip the node. A missing PRIMARY tag is treated as "not primary"
        instead of raising IndexError, consistent with the CODE/NAME handling.
        """
        kved_code = self._first_text(activity, 'CODE')
        if not kved_code:
            return None
        kved_name = self._first_text(activity, 'NAME')
        if not kved_name:
            return None
        kved = self.get_kved_from_DB(kved_code, kved_name)
        is_primary = self._first_text(activity, 'PRIMARY') == "так"
        return kved, is_primary

    def _read_identity(self, record, name_tag):
        """Read and validate the name/address pair that identifies a FOP.

        ``name_tag`` is the tag holding the full name ('NAME' or 'FIO').
        Returns ``(fullname, address, code)`` or None when the record has to
        be skipped. Only a missing name is counted in
        ``self.report.invalid_data``; an over-long name is logged and skipped
        without counting, as before.
        """
        fullname = record.xpath(name_tag)[0].text
        if not fullname:
            logger.warning(f'ФОП без прізвища: {record}')
            self.report.invalid_data += 1
            return None
        if len(fullname) > 100:
            logger.warning(f'ФОП із задовгим прізвищем: {record}')
            return None
        fullname = fullname.lower()
        address = record.xpath('ADDRESS')[0].text
        if not address:
            address = 'EMPTY'
        return fullname, address, fullname + address

    def _refresh_fop(self, fop, new_values):
        """Copy changed values onto a stored Fop and save only those fields.

        ``new_values`` maps Fop attribute names to freshly parsed values.
        """
        update_fields = []
        for field, new_value in new_values.items():
            stored_value = getattr(fop, field)
            if field in self._DATE_FIELDS:
                changed = self._date_changed(stored_value, new_value)
            else:
                changed = stored_value != new_value
            if changed:
                setattr(fop, field, new_value)
                update_fields.append(field)
        if update_fields:
            update_fields.append('updated_at')
            fop.save(update_fields=update_fields)

    def _attach_to_new_fops(self, pending_by_code):
        """Point queued children at their Fop and hand them to the bulk manager."""
        for fop in self.bulk_manager.queues['business_register.Fop']:
            for related in pending_by_code.get(fop.code, ()):
                related.fop = fop
                self.bulk_manager.add(related)

    def _flush_new_fops(self):
        """Commit queued Fop objects, then their kveds and exchange data.

        Fop objects are committed first; presumably commit() leaves them in
        the queue with primary keys assigned (TODO confirm against
        BulkCreateManager), because the children are attached to the very
        same instances before the queue is emptied by hand.
        """
        if len(self.bulk_manager.queues['business_register.Fop']):
            self.bulk_manager.commit(Fop)
        self._attach_to_new_fops(self.new_fops_foptokveds)
        self.new_fops_foptokveds = {}
        self._attach_to_new_fops(self.new_fops_exchange_data)
        self.new_fops_exchange_data = {}
        self.bulk_manager.queues['business_register.Fop'] = []
        if len(self.bulk_manager.queues['business_register.FopToKved']):
            self.bulk_manager.commit(FopToKved)
        if len(self.bulk_manager.queues['business_register.ExchangeDataFop']):
            self.bulk_manager.commit(ExchangeDataFop)
        self.bulk_manager.queues['business_register.FopToKved'] = []
        self.bulk_manager.queues['business_register.ExchangeDataFop'] = []

    # ------------------------------------------------------------------
    # kveds of a FOP
    # ------------------------------------------------------------------
    def add_fop_kveds_to_dict(self, fop_kveds_from_record, code):
        """Remember the kveds of a not-yet-saved FOP under its ``code``.

        Builds unsaved FopToKved objects (without ``fop``) for every usable
        activity node and stores the list in ``self.new_fops_foptokveds``.
        Nothing is stored when no activity is usable.
        """
        all_fop_foptokveds = []
        for activity in fop_kveds_from_record:
            parsed = self._parse_activity(activity)
            if parsed is None:
                continue
            kved, is_primary = parsed
            all_fop_foptokveds.append(
                FopToKved(kved=kved, primary_kved=is_primary))
        if all_fop_foptokveds:
            self.new_fops_foptokveds[code] = all_fop_foptokveds

    def update_fop_kveds(self, fop_kveds_from_record, fop):
        """Synchronise the stored kveds of ``fop`` with the record.

        Matching links (same kved code and name) get their ``primary_kved``
        flag refreshed, unknown ones are queued in the bulk manager, and
        stored links that no longer appear in the record are soft-deleted.
        """
        # select_related avoids one extra query per stored link when the
        # loop below reads stored_foptokved.kved.
        already_stored_foptokveds = list(
            FopToKved.objects.filter(fop=fop).select_related('kved'))
        for activity in fop_kveds_from_record:
            parsed = self._parse_activity(activity)
            if parsed is None:
                continue
            kved, is_primary = parsed
            for stored_foptokved in already_stored_foptokveds:
                if (stored_foptokved.kved.code == kved.code
                        and stored_foptokved.kved.name == kved.name):
                    if stored_foptokved.primary_kved != is_primary:
                        stored_foptokved.primary_kved = is_primary
                        stored_foptokved.save(
                            update_fields=['primary_kved', 'updated_at'])
                    # Whatever stays in the list afterwards is outdated.
                    already_stored_foptokveds.remove(stored_foptokved)
                    break
            else:
                # No stored link matched this activity.
                self.bulk_manager.add(
                    FopToKved(fop=fop, kved=kved, primary_kved=is_primary))
        for outdated_foptokved in already_stored_foptokveds:
            outdated_foptokved.soft_delete()

    # ------------------------------------------------------------------
    # exchange data of a FOP
    # ------------------------------------------------------------------
    def extract_exchange_data(self, answer):
        """Read one exchange-data node.

        Returns the tuple ``(authority, taxpayer_type, start_date,
        start_number, end_date, end_number)``; each item is None when its tag
        is missing or empty.
        """
        authority_name = self._first_text(answer, 'AUTHORITY_NAME')
        authority = None
        if authority_name:
            authority = self.save_or_get_authority(authority_name)
        taxpayer_name = self._first_text(answer, 'TAX_PAYER_TYPE')
        taxpayer_type = None
        if taxpayer_name:
            taxpayer_type = self.save_or_get_taxpayer_type(taxpayer_name)
        start_date_text = self._first_text(answer, 'START_DATE')
        start_date = None
        if start_date_text:
            start_date = format_date_to_yymmdd(start_date_text)
        # Both numbers are normalised the same way so that the equality
        # checks in update_fop_exchange_data treat them alike.
        start_number = self._first_text(answer, 'START_NUM') or None
        end_date_text = self._first_text(answer, 'END_DATE')
        end_date = None
        if end_date_text:
            end_date = format_date_to_yymmdd(end_date_text)
        end_number = self._first_text(answer, 'END_NUM') or None
        return authority, taxpayer_type, start_date, start_number, end_date, end_number

    def add_fop_exchange_data_to_dict(self, exchange_data, code):
        """Remember the exchange data of a not-yet-saved FOP under its ``code``.

        Completely empty nodes are skipped; nothing is stored when every node
        is empty.
        """
        all_fop_exchangedata = []
        for answer in exchange_data:
            values = self.extract_exchange_data(answer)
            if not any(values):
                continue
            (authority, taxpayer_type, start_date,
             start_number, end_date, end_number) = values
            all_fop_exchangedata.append(ExchangeDataFop(
                authority=authority,
                taxpayer_type=taxpayer_type,
                start_date=start_date,
                start_number=start_number,
                end_date=end_date,
                end_number=end_number))
        if all_fop_exchangedata:
            self.new_fops_exchange_data[code] = all_fop_exchangedata

    def update_fop_exchange_data(self, exchange_data, fop):
        """Queue exchange-data rows of ``fop`` that are not stored yet.

        A node counts as stored when authority, taxpayer type and both
        numbers match an existing row; dates are not compared.
        """
        already_stored_exchange_data = ExchangeDataFop.objects.filter(fop=fop)
        for answer in exchange_data:
            values = self.extract_exchange_data(answer)
            if not any(values):
                continue
            (authority, taxpayer_type, start_date,
             start_number, end_date, end_number) = values
            # ToDo: find way to check dates
            already_stored = any(
                stored.authority == authority
                and stored.taxpayer_type == taxpayer_type
                and stored.start_number == start_number
                and stored.end_number == end_number
                for stored in already_stored_exchange_data)
            if not already_stored:
                self.bulk_manager.add(ExchangeDataFop(
                    fop=fop,
                    authority=authority,
                    taxpayer_type=taxpayer_type,
                    start_date=start_date,
                    start_number=start_number,
                    end_date=end_date,
                    end_number=end_number))

    # ------------------------------------------------------------------
    # detailed record format
    # ------------------------------------------------------------------
    def save_detailed_fop_to_db(self, records):
        """Store a chunk of detailed FOP records.

        A FOP is identified by ``code`` (lower-cased full name + address).
        Unknown FOPs are queued for bulk creation together with their kveds
        and exchange data; known ones are updated in place. All queues are
        flushed once the chunk has been processed.
        """
        for record in records:
            identity = self._read_identity(record, 'NAME')
            if identity is None:
                continue
            fullname, address, code = identity
            status = self.save_or_get_status(record.xpath('STAN')[0].text)
            # REGISTRATION / TERMINATED_INFO hold the date first, details after.
            registration_date, registration_info = self._split_dated_text(
                record.xpath('REGISTRATION')[0].text)
            estate_manager = record.xpath('ESTATE_MANAGER')[0].text
            termination_date, terminated_info = self._split_dated_text(
                record.xpath('TERMINATED_INFO')[0].text)
            termination_cancel_info = record.xpath('TERMINATION_CANCEL_INFO')[0].text
            contact_info = record.xpath('CONTACTS')[0].text
            vp_dates = record.xpath('VP_DATES')[0].text
            authority = self.save_or_get_authority(
                record.xpath('CURRENT_AUTHORITY')[0].text)
            fop_kveds = record.xpath('ACTIVITY_KINDS')[0]
            exchange_data = record.xpath('EXCHANGE_DATA')[0]
            fop_values = {
                'status': status,
                'registration_date': registration_date,
                'registration_info': registration_info,
                'estate_manager': estate_manager,
                'termination_date': termination_date,
                'terminated_info': terminated_info,
                'termination_cancel_info': termination_cancel_info,
                'contact_info': contact_info,
                'vp_dates': vp_dates,
                'authority': authority,
            }
            fop = Fop.objects.filter(code=code).first()
            # len() is used on purpose below: the container nodes are XML
            # elements and only their number of children matters.
            if not fop:
                self.bulk_manager.add(Fop(
                    fullname=fullname, address=address, code=code, **fop_values))
                if len(fop_kveds):
                    self.add_fop_kveds_to_dict(fop_kveds, code)
                if len(exchange_data):
                    self.add_fop_exchange_data_to_dict(exchange_data, code)
            else:
                # TODO: make a decision: our algorithm when Fop changes fullname or address?
                self._refresh_fop(fop, fop_values)
                if len(fop_kveds):
                    self.update_fop_kveds(fop_kveds, fop)
                if len(exchange_data):
                    self.update_fop_exchange_data(exchange_data, fop)
        self._flush_new_fops()

    # ------------------------------------------------------------------
    # short record format
    # ------------------------------------------------------------------
    def save_or_update_kved(self, kved, fop):
        """Make sure ``fop`` is linked to ``kved`` as its primary kved."""
        current_fop_to_kved = FopToKved.objects.filter(fop=fop, kved=kved).first()
        if not current_fop_to_kved:
            FopToKved.objects.create(fop=fop, kved=kved, primary_kved=True)
        elif not current_fop_to_kved.primary_kved:
            current_fop_to_kved.primary_kved = True
            current_fop_to_kved.save(update_fields=['primary_kved', 'updated_at'])

    def save_to_db(self, records):
        """Store a chunk of short-format FOP records one by one.

        Creates the Fop when its ``code`` is unknown, otherwise refreshes the
        status; then links the kved from the KVED tag when that tag contains
        a space.
        """
        for record in records:
            identity = self._read_identity(record, 'FIO')
            if identity is None:
                continue
            fullname, address, code = identity
            status = self.save_or_get_status(record.xpath('STAN')[0].text)
            fop = Fop.objects.filter(code=code).first()
            if not fop:
                fop = Fop.objects.create(
                    fullname=fullname, address=address, status=status, code=code)
            elif fop.status != status:
                # TODO: make a decision: our algorithm when Fop changes fullname or address?
                fop.status = status
                fop.save(update_fields=['status', 'updated_at'])
            kved_data = record.xpath('KVED')[0].text
            if kved_data and ' ' in kved_data:
                kved = self.extract_kved(kved_data)
                self.save_or_update_kved(kved, fop)


print("For storing run FopConverter().process()")
class RuoConverter(Converter):
    """Converter that writes records into the Ruo and Founders tables.

    New Ruo objects are queued in ``bulk_manager``; their Founders objects
    are queued in ``bulk_submanager`` and linked to the Ruo objects when a
    chunk is complete (see ``save_to_ruo_table``).
    """

    CHUNK_SIZE = 300
    LOCAL_FILE_NAME = "uo.xml"
    DATASET_ID = "1c7f3815-3259-45e0-bdf1-64dca07ddc10"

    def rename_file(self, file):
        """Map a file name to 'uo.xml' / 'fop.xml' by the marker it contains.

        The check is case-insensitive; 'FOP' wins when both markers occur,
        and names without a marker are returned unchanged.
        """
        upper_name = file.upper()
        new_filename = file
        if 'UO' in upper_name:
            new_filename = 'uo.xml'
        if 'FOP' in upper_name:
            new_filename = 'fop.xml'
        return new_filename

    # Models whose tables are listed for clearing the DB.
    tables = [
        Founders,
        Ruo,
    ]

    # Template of the data held for one record.
    record = {
        'RECORD': '',
        'NAME': '',
        'SHORT_NAME': '',
        'EDRPOU': '',
        'ADDRESS': '',
        'BOSS': '',
        'KVED': '',
        'STAN': '',
        'FOUNDING_DOCUMENT_NUM': '',
        'FOUNDERS': '',
        'FOUNDER': []
    }

    # Caches of items already written to the DB; the values are whole model
    # objects.
    state_dict = {}
    kved_dict = {}

    # Position of the next Ruo object inside its bulk-manager queue.
    index = 0

    # Fill both caches from the DB while the class body is executed.
    for state in State.objects.all():
        state_dict[state.name] = state
    for kved in Kved.objects.all():
        kved_dict[kved.code] = kved

    # The sub-manager gets a chunk size of 100000 so that it is never reached.
    bulk_manager = BulkCreateManager(CHUNK_SIZE)
    bulk_submanager = BulkCreateManager(100000)

    def save_to_db(self, record):
        """Write one record: resolve its state and kved, then queue the Ruo."""
        state = self.save_to_state_table(record)
        kved = self.get_kved_from_DB(record, 'NAME')
        self.save_to_ruo_table(record, state, kved)
        print('saved')

    def save_to_state_table(self, record):
        """Return the State named by ``record['STAN']``, creating it if needed.

        An empty STAN falls back to ``State.EMPTY_FIELD``. Newly created
        states are saved at once and added to ``state_dict``.
        """
        state_name = record['STAN'] or State.EMPTY_FIELD
        if state_name in self.state_dict:
            return self.state_dict[state_name]
        state = State(name=state_name)
        state.save()
        self.state_dict[state_name] = state
        return state

    def save_to_ruo_table(self, record, state, kved):
        """Queue a Ruo built from ``record`` together with its founders.

        Returns the stored Ruo when an identical row already exists;
        otherwise returns None after queueing.
        """
        ruo_fields = {
            'name': record['NAME'],
            'short_name': record['SHORT_NAME'],
            'edrpou': record['EDRPOU'],
            'address': record['ADDRESS'],
            'boss': record['BOSS'],
        }
        matches = Ruo.objects.filter(state=state.id, kved=kved.id, **ruo_fields)
        if matches.exists():
            return matches.first()
        ruo = Ruo(state=state, kved=kved, **ruo_fields)
        # To make bulk_create() work for nested records, the regular id field
        # of a founders object temporarily holds the index of its ruo object
        # in the _create_queues['data_ocean.Ruo'] list. Once a portion of ruo
        # objects has been saved and has received database ids, each ruo is
        # assigned to founders.company according to that temporary id. The
        # temporary founders.id is then cleared so that the database assigns
        # the real id when founders are saved.
        self.bulk_manager.add(ruo)
        self.add_founders_to_queue(record, ruo)
        self.index += 1
        ruo_queue_size = len(self.bulk_manager._create_queues['data_ocean.Ruo'])
        if ruo_queue_size >= self.CHUNK_SIZE:
            for founders in self.bulk_submanager._create_queues['data_ocean.Founders']:
                founders.company = (
                    self.bulk_manager._create_queues['data_ocean.Ruo'][founders.id])
                founders.id = None
            self.bulk_submanager._commit(Founders)
            self.bulk_submanager._create_queues['data_ocean.Founders'] = []
            self.index = 0

    def add_founders_to_queue(self, record, ruo):
        """Queue one Founders object per entry of ``record['FOUNDER']``.

        Each object carries ``self.index`` as a temporary id.
        """
        for founder in record['FOUNDER']:
            self.bulk_submanager.add(
                Founders(id=self.index, company=ruo, founder=founder))


print(
    'Ruo already imported. For start rewriting RUO to the DB run > RuoConverter().process()\n',
    'For clear RUO tables run > RuoConverter().clear_db()')