def test(self):
    # Runs RunDedupe with the random forest model over the dumped dedupe
    # objects stored next to this test and checks the person id assigned to
    # one known section.

    # The returned logger is not used below; the call itself is kept as in
    # the original test.
    setup_logging(logger_name="test_real_dedupe")
    models.Section.objects.all().delete()

    test_folder = os.path.dirname(__file__)
    permalinks_folder = test_folder

    permalinks = TPermaLinksPerson(permalinks_folder)
    permalinks.open_db_read_only()
    permalinks.recreate_auto_increment_table()
    permalinks.close_db()

    dedupe_options = dict(
        permalinks_folder=permalinks_folder,
        input_dedupe_objects=os.path.join(test_folder, "dedupe_objects.dump"),
        model_file=os.path.join(
            test_folder, "../../../deduplicate/model/random_forest.pickle"),
        threshold=0.6,
        recreate_db=True,
        surname_bounds=',',
        write_to_db=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    section = models.Section.objects.get(id=757036)
    self.assertEqual(1406125, section.person_id)
def test(self):
    # Scenario: sections 1 and 2 point at person 99 when the permalinks db is
    # written (presumably recording those links - confirm against
    # TPermaLinksPerson.save_dataset). The links and the person are then
    # removed, dedupe is rerun in fake mode, and all three sections are
    # expected to end up on a single person with the old id 99.
    self.initialize()

    person_id = 99
    person = models.Person(id=person_id)
    person.save()

    linked_sections = [
        self.create_section(1, "Иванов Иван Иванович", person=person),
        self.create_section(2, "Иванов И. И.", person=person),
    ]
    self.create_section(3, "Петров И. И.")

    permalinks_folder = os.path.dirname(__file__)
    permalinks = TPermaLinksPerson(permalinks_folder)
    permalinks.create_db()
    permalinks.save_dataset(setup_logging())
    permalinks.recreate_auto_increment_table()
    permalinks.close_db()

    # Detach the sections and drop the person so that dedupe starts from scratch.
    for section in linked_sections:
        section.person = None
        section.save()
    person.delete()

    dedupe_options = dict(
        permalinks_folder=permalinks_folder,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        take_sections_with_empty_income=True,
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    self.assertEqual(models.Person.objects.count(), 1)
    for section_id in (1, 2, 3):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, person_id)
def test(self):
    # Loads the SQL fixture for person 5295, writes the permalinks db, runs
    # RunDedupe with the random forest model and checks the resulting
    # section -> person assignment for every section in the database.
    logger = setup_logging(logger_name="test_real_dedupe")
    test_folder = os.path.dirname(__file__)
    sql_script = os.path.join(test_folder, "disclosures.sql.person_id_5295.n")
    run_sql_script(logger, sql_script)

    permalinks_folder = test_folder
    db = TPermaLinksPerson(permalinks_folder)
    db.create_db()
    db.save_dataset(setup_logging())
    db.recreate_auto_increment_table()
    db.close_db()

    model_path = os.path.join(
        test_folder, "../../../deduplicate/model/random_forest.pickle")
    run_dedupe = RunDedupe(None, None)
    run_dedupe.handle(None,
                      permalinks_folder=permalinks_folder,
                      write_to_db=True,
                      surname_bounds=',',
                      model_file=model_path,
                      threshold=0.6)

    person_id = 5295
    self.assertEqual(models.Person.objects.count(), 3)
    person = models.Person.objects.get(id=person_id)
    self.assertIsNotNone(person)
    self.assertEqual(5295, person.declarator_person_id)

    # (section id, person id, section has a dedupe score), ascending by section id
    canon_sections = [
        (451721, 5295, True),
        (452066, 5295, True),
        (452420, 5295, True),
        (453686, 5295, False),
        (455039, 5295, False),
        (1801614, 5296, True),
        (5105303, 5295, True),
        (6437989, 5297, True),
        (6672563, 5297, True),
        (6674154, 5297, True),
        (6773981, 5297, True),
    ]

    # Order explicitly by id: the comparison below is order-sensitive and an
    # unordered queryset gives no guarantee about the row order.
    sections = [
        (s.id, s.person_id, s.dedupe_score is not None)
        for s in models.Section.objects.order_by("id")
    ]
    self.assertListEqual(canon_sections, sections)
def test(self):
    # Scenario: one person (id 2, declarator id 1111) is linked to section 1,
    # section 2 has no person. After writing the permalinks db and running
    # dedupe in fake mode, exactly one person must exist, it must keep its
    # declarator id, and both sections must point at it.
    self.initialize()

    person_id = 2
    declarator_person_id = 1111
    existing_person = models.Person(
        id=person_id,
        declarator_person_id=declarator_person_id,
        person_name="Иванов Иван Иванович",
    )
    existing_person.save()

    self.create_section(1, "Иванов Иван Иванович", existing_person)
    self.create_section(2, "Иванов И. И.")

    permalinks_folder = os.path.dirname(__file__)
    permalinks = TPermaLinksPerson(permalinks_folder)
    permalinks.create_db()
    permalinks.save_dataset(setup_logging())
    permalinks.recreate_auto_increment_table()
    permalinks.close_db()

    dedupe_options = dict(
        permalinks_folder=permalinks_folder,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        take_sections_with_empty_income=True,
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    self.assertEqual(models.Person.objects.count(), 1)
    reloaded_person = models.Person.objects.get(id=person_id)
    self.assertIsNotNone(reloaded_person)
    self.assertEqual(declarator_person_id, reloaded_person.declarator_person_id)

    for section_id in (1, 2):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, reloaded_person.id)
class Command(BaseCommand):
    """Management command that attaches sections to persons known in declarator.

    Sections are matched against "section passports" (keys built by
    build_section_passport from a declarator document id, a person name or
    surname, and the main declarant income). A passport maps either to a
    declarator person id or to the marker "AMBIGUOUS_KEY".
    """
    help = 'copy person id from declarator to disclosures'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self.options = None
        self.permalinks_db = None
        self.logger = None
        # Person objects created during this run, indexed two ways:
        # by declarator person id and by disclosures person id (Person.id).
        self.declarator_person_id_to_disclosures_person = dict()
        self.disclosures_person_id_to_disclosures_person = dict()

    def add_arguments(self, parser):
        """Register the command line options of the command."""
        parser.add_argument(
            '--read-person-from-json',
            dest='read_person_from_json',
            default=None,
            help='read person info from json for testing')
        parser.add_argument(
            '--permalinks-folder',
            dest='permalinks_folder',
            required=True)
        parser.add_argument(
            '--declarator-host',
            dest='declarator_host',
            required=False)
        parser.add_argument(
            '--person-name-prefix',
            dest='person_name_prefix',
            required=False)

    def open_permalinks_db(self):
        """Open the permalinks db from --permalinks-folder in read-only mode."""
        self.permalinks_db = TPermaLinksPerson(self.options['permalinks_folder'])
        self.permalinks_db.open_db_read_only()

    def build_passport_to_person_id_mapping_from_declarator(self):
        """Return the mapping from section passport to declarator person id.

        The mapping is read from the json file given by --read-person-from-json
        when that option is set, otherwise it is requested from the declarator
        host given by --declarator-host.
        """
        json_path = self.options.get('read_person_from_json')
        if json_path is not None:
            with open(json_path, "r", encoding="utf8") as inpf:
                return json.load(inpf)
        return get_all_section_from_declarator_with_person_id(
            self.options['declarator_host'])

    # we think that person ids in declarator db are stable
    def copy_human_merge(self, section, declarator_person_id):
        """Connect section to the person that has the given declarator person id.

        The person is taken from the per-run caches or created with the id
        that the permalinks db returns for this declarator person id. If that
        id already belongs to a person created for another declarator person
        id, the conflict is logged and the already created person is used.
        The section's dedupe_score is reset to None and the section is saved.
        """
        person = self.declarator_person_id_to_disclosures_person.get(
            declarator_person_id)
        if person is None:
            person_id = self.permalinks_db.get_person_id_by_declarator_id(
                declarator_person_id, section.id)
            if person_id in self.disclosures_person_id_to_disclosures_person:
                person = self.disclosures_person_id_to_disclosures_person.get(
                    person_id)
                if declarator_person_id != person.declarator_person_id:
                    self.logger.error(
                        "Person id={} has conflict declarator_person_id ({} != {}), use the first person id {}"
                        .format(person_id, declarator_person_id,
                                person.declarator_person_id,
                                person.declarator_person_id))
            else:
                person = models.Person(
                    id=person_id,
                    declarator_person_id=declarator_person_id,
                    person_name=section.person_name)
                person.save()
                self.declarator_person_id_to_disclosures_person[
                    declarator_person_id] = person
                # This cache is looked up by the disclosures person id above,
                # so it has to be filled with the same kind of key.
                self.disclosures_person_id_to_disclosures_person[
                    person_id] = person
        elif person.person_name is None or len(person.person_name) < len(
                section.person_name):
            # keep the longest person name seen so far
            person.person_name = section.person_name
            person.save()

        assert person.declarator_person_id is not None
        self.logger.debug(
            "connect section {} to person {}, declarator_person_id={}".format(
                section.id, person.id, person.declarator_person_id))
        section.person = person
        section.dedupe_score = None
        section.save()

    def process_section(self, section, section_passports):
        """Try to connect one section object to a declarator person.

        Passports are built for every declarator file reference of the
        section's source document, both with the full person name and, when
        TRussianFio resolves the name, with the family name only.

        Returns True if the section was connected to a person, False if no
        passport was found or all found passports are ambiguous.
        """
        main_income = 0
        for income in section.income_set.all():
            if income.relative == models.Relative.main_declarant_code:
                main_income = income.size

        found_results = list()
        for declaration_info in section.source_document.declarator_file_reference_set.all():
            key1 = build_section_passport(
                declaration_info.declarator_document_id,
                section.person_name,
                main_income)
            found_res1 = section_passports.get(key1)
            if found_res1 is not None:
                found_results.append(found_res1)

            fio = TRussianFio(section.person_name)
            if fio.is_resolved:
                key2 = build_section_passport(
                    declaration_info.declarator_document_id,
                    fio.family_name,
                    main_income)
                found_res2 = section_passports.get(key2)
                if found_res2 is not None:
                    found_results.append(found_res2)
            else:
                self.logger.error(
                    "section {} fio={} cannot find surname".format(
                        section.id, section.person_name))

        if not found_results:
            self.logger.debug(
                "section {} fio={} cannot be found in declarator".format(
                    section.id, section.person_name))
            return False

        # the first unambiguous passport wins
        for person_id in found_results:
            if person_id != "AMBIGUOUS_KEY":
                self.copy_human_merge(section, person_id)
                return True
        self.logger.debug("section {} fio={} is ambiguous".format(
            section.id, section.person_name))
        return False

    def copy_declarator_person_ids(self, section_passports):
        """Connect all sections that can be matched by passport to persons.

        Selects (section id, declarator document id, person name, main
        declarant income) rows with one SQL query, looks up the full-name and
        the surname passports for each row and calls copy_human_merge with
        the first unambiguous result.
        """
        # The only interpolated value is a constant of the models module.
        query = """
            select  s.id,
                    r.declarator_document_id,
                    s.person_name,
                    i.size
            from declarations_section s
            join declarations_income i on i.section_id = s.id
            join declarations_source_document d on s.source_document_id = d.id
            join declarations_declarator_file_reference r on r.source_document_id = d.id
            where i.relative = '{}'
        """.format(models.Relative.main_declarant_code)

        merge_count = 0
        with connection.cursor() as cursor:
            cursor.execute(query)
            for section_id, declarator_document_id, person_name, main_income in cursor:
                found_results = list()
                key1 = build_section_passport(
                    declarator_document_id, person_name, main_income)
                found_res1 = section_passports.get(key1)
                if found_res1 is not None:
                    found_results.append(found_res1)

                fio = TRussianFio(person_name)
                if fio.is_resolved:
                    key2 = build_section_passport(
                        declarator_document_id, fio.family_name, main_income)
                    found_res2 = section_passports.get(key2)
                    if found_res2 is not None:
                        found_results.append(found_res2)

                if not found_results:
                    continue

                for person_id in found_results:
                    if person_id != "AMBIGUOUS_KEY":
                        self.copy_human_merge(
                            models.Section.objects.get(id=section_id),
                            person_id)
                        merge_count += 1
                        break
                else:
                    # every found passport was marked as ambiguous
                    self.logger.debug(
                        "section {} fio={} is ambiguous".format(
                            section_id, person_name))

        self.logger.info(
            "set human person id to {} records".format(merge_count))

    def handle(self, *args, **options):
        """Entry point of the command; expects an empty Person table."""
        self.logger = setup_logging(logger_name="copy_person")
        self.options = options
        self.logger.debug("models.Person.objects.count()={}".format(
            models.Person.objects.count()))
        assert models.Person.objects.count() == 0

        self.open_permalinks_db()
        try:
            section_passports = self.build_passport_to_person_id_mapping_from_declarator()
            self.logger.info("merge by {} passports from declarator".format(
                len(section_passports)))
            self.copy_declarator_person_ids(section_passports)
        finally:
            # close the permalinks db even if the merge fails halfway
            self.permalinks_db.close_db()
        self.logger.info("all done")