def link_sections_to_a_new_person(self, sections, section_distances, person_id):
    """Link every section of a cluster to a single person record.

    Args:
        sections: sections to link; paired positionally with
            ``section_distances``.
        section_distances: one distance per section, forwarded to
            ``self.link_section_to_person``.
        person_id: id of the person to link to, or ``None`` to request a
            fresh id from ``self.permalinks_db``.

    The person row with ``person_id`` is loaded when it exists and is
    created (and saved) otherwise.
    """
    assert len(section_distances) == len(sections)
    if person_id is None:
        person_id = self.permalinks_db._get_new_id()
        self.logger.debug("create new person.id: {}".format(person_id))
    else:
        self.logger.debug("use old person.id: {}".format(person_id))

    # Keep only the lookup inside ``try`` so that the handler cannot hide
    # an unrelated failure from the code that follows it.
    try:
        person = models.Person.objects.get(id=person_id)
    except models.Person.DoesNotExist:
        # create new person record
        person = models.Person(id=person_id)
        person.save()
    else:
        # reuse person record from declarator
        if person.declarator_person_id is None:
            # The literals below are concatenated by the parser, so each
            # fragment must end with a space to keep the words apart.
            self.logger.error(
                "Warning! Reuse existing person_id = {} for different sections (cluster), it could happen "
                "if this person_id was used for different person created by copy_person_id.py "
                "but should not happen if declarator_person_id is None. ".format(person_id))

    for section, distance in zip(sections, section_distances):
        self.link_section_to_person(section, person, distance)
def test(self):
    """Dedupe must attach an unlinked section to the already existing person.

    Section 1 starts linked to person 2 and section 2 starts without a
    person; after ``RunDedupe`` both sections must reference person 2.
    """
    source_document = create_default_source_document()

    existing_person = models.Person(
        id=2,
        declarator_person_id=1111,
        person_name="Иванов Иван Иванович",
    )
    existing_person.save()

    linked_section = models.Section(
        id=1,
        source_document=source_document,
        person_name="Иванов Иван Иванович",
        person=existing_person,
    )
    linked_section.save()
    unlinked_section = models.Section(
        id=2,
        source_document=source_document,
        person_name="Иванов И. И.",
    )
    unlinked_section.save()
    existing_person.refresh_from_db()

    # Build a permalinks db whose next Person primary key is 3.
    db_path = os.path.join(os.path.dirname(__file__), "permalinks.dbm")
    permalinks = TPermaLinksDB(db_path)
    permalinks.create_db()
    permalinks.save_next_primary_key_value(models.Person, 3)
    permalinks.create_sql_sequences()
    permalinks.close()

    dedupe_options = dict(
        permanent_links_db=db_path,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    for section_id in (1, 2):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, existing_person.id)
def test(self):
    """Copying person ids must reuse the existing person row.

    A person without ``declarator_person_id`` is linked to the first test
    section; after ``run_copy_person_id`` there must still be exactly one
    person, with the same id and with ``self.declarator_person_id`` set.
    """
    self.create_test_db()
    folder = CopyPersonIdTestCaseBase.permalinks_folder
    person_id = 1

    existing_person = models.Person(id=person_id, person_name=self.fio)
    self.assertIsNone(existing_person.declarator_person_id)
    existing_person.save()

    section = models.Section.objects.get(id=self.section_id1)
    section.person = existing_person
    section.save()

    # Prepare the permalinks storage from the current database content.
    empty_db = TPermaLinksPerson(folder)
    empty_db.create_and_save_empty_db()
    storage_command = CreatePermalinksStorageCommand(None, None)
    storage_command.handle(None, directory=folder)
    permalinks_db = TPermaLinksPerson(folder)
    permalinks_db.open_db_read_only()
    permalinks_db.recreate_auto_increment_table()

    self.run_copy_person_id(False, False)

    self.assertEqual(models.Person.objects.count(), 1)
    section = models.Section.objects.get(id=self.section_id1)
    self.assertEqual(section.person.declarator_person_id,
                     self.declarator_person_id)
    self.assertEqual(section.person.id, person_id)
def test(self):
    """Dedupe must reuse the person id stored in the permalinks db.

    Person 99 is registered in the permalinks db for sections "1" and "2"
    (the person row itself is not saved); after ``RunDedupe`` both sections
    must reference person id 99.
    """
    source_document = create_default_source_document()
    for section_id, name in ((1, "Иванов Иван Иванович"), (2, "Иванов И. И.")):
        models.Section(
            id=section_id,
            source_document=source_document,
            person_name=name,
        ).save()

    db_path = os.path.join(os.path.dirname(__file__), "permalinks.dbm")
    permalinks = TPermaLinksDB(db_path)
    permalinks.create_db()

    remembered_person = models.Person(id=99)
    remembered_person.tmp_section_set = {str(section_id) for section_id in (1, 2)}
    permalinks.put_record_id(remembered_person)
    permalinks.save_next_primary_key_value(models.Person, 100)
    permalinks.create_sql_sequences()
    permalinks.close()

    dedupe_options = dict(
        permanent_links_db=db_path,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    for section_id in (1, 2):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, remembered_person.id)
def link_sections_to_a_new_person(self, section_ids):
    """Create a person for a cluster and link every section of it.

    Args:
        section_ids: iterable of ``(section_id, score)`` pairs; it is
            traversed twice, so it must be re-iterable.

    The person id is obtained from ``self.primary_keys_builder`` after the
    stringified section ids were attached to the person as
    ``tmp_section_set``.
    """
    new_person = models.Person()
    new_person.tmp_section_set = {
        str(section_id) for section_id, _ in section_ids
    }
    new_person.id = self.primary_keys_builder.get_record_id(new_person)
    new_person.save()

    for section_id, score in section_ids:
        self.link_section_to_person(
            models.Section.objects.get(id=section_id), new_person, score)
def copy_human_merge(self, section, declarator_person_id):
    """Create a person for ``declarator_person_id`` and link ``section`` to it.

    The person id comes from ``self.primary_keys_builder``; the section's
    ``dedupe_score`` is reset to ``None`` because the link is not produced
    by dedupe scoring here.
    """
    # we think that person ids in declarator db are stable
    merged_person = models.Person(declarator_person_id=declarator_person_id)
    merged_person.id = self.primary_keys_builder.get_record_id(merged_person)

    debug_message = "connect section {} to person {}, declarator_person_id={}".format(
        section.id, merged_person.id, declarator_person_id)
    self.logger.debug(debug_message)

    # Prefer the section's name when the person has none or a shorter one.
    current_name = merged_person.person_name
    take_section_name = (current_name is None
                         or len(current_name) < len(section.person_name))
    if take_section_name:
        merged_person.person_name = section.person_name
    merged_person.save()

    section.person = merged_person
    section.dedupe_score = None
    section.save()
def test_rating(self):
    """The rating command must rank three declarant incomes in descending order.

    Three sections (same office and year) with incomes 1, 2 and 3 are
    created; the expected rating rows are values 3, 2, 1 at places 1, 2, 3.
    """
    for model in (models.Person, models.Section, models.Source_Document):
        model.objects.all().delete()

    source_document = models.Source_Document(id=1)
    source_document.save()
    person_id = 99
    declarant = models.Person(id=person_id)
    declarant.save()

    for section_id, name in ((1, "i1"), (2, "i2"), (3, "i3")):
        models.Section(
            id=section_id,
            source_document=source_document,
            person_name=name,
            income_year=2019,
            office_id=1,
            person=declarant,
        ).save()

    for section_id, income_size in ((1, 1), (2, 2), (3, 3)):
        models.Income(
            section_id=section_id,
            size=income_size,
            relative=models.Relative.main_declarant_code,
        ).save()

    BuildRatingCommand(None, None).handle(None, min_members_count=3)

    self.assertEqual(models.Person_Rating_Items.objects.count(), 3)
    rating = list(models.Person_Rating_Items.objects.all())
    expected = ((3, 1), (2, 2), (1, 3))  # (rating_value, person_place)
    for index, (value, place) in enumerate(expected):
        item = rating[index]
        self.assertEqual(item.rating_value, value)
        self.assertEqual(item.person_place, place)
def test(self):
    """With ``separate_sections`` each section must end up with its own person.

    Both sections were linked to person 99 when the permalinks storage was
    built. After the person is removed and dedupe is rerun, section 1 must
    get id 99 back and section 2 must get the next id.
    """
    self.initialize()
    folder = os.path.dirname(__file__)
    person_id = 99
    old_person = models.Person(id=person_id)
    old_person.save()
    first = self.create_section(1, "Иванов Иван Иванович", person=old_person)
    second = self.create_section(2, "Иванов Иван Иванович", person=old_person)

    storage_command = CreatePermalinksStorageCommand(None, None)
    storage_command.handle(None, directory=folder)
    opened_db = TPermaLinksPerson(folder).open_db_read_only()
    opened_db.recreate_auto_increment_table()

    # Detach the sections and drop the person so dedupe starts from scratch.
    for section in (first, second):
        section.person = None
        section.save()
    old_person.delete()

    dedupe_options = dict(
        permalinks_folder=folder,
        write_to_db=True,
        fake_dedupe=True,
        separate_sections=True,
        surname_bounds=',',
        take_sections_with_empty_income=True,
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    self.assertEqual(2, models.Person.objects.count())

    # "person_id" is inherited by the minimal section_id, if there are no
    # other grounds
    restored = models.Section.objects.get(id=1)
    self.assertEqual(restored.person_id, person_id)
    other = models.Section.objects.get(id=2)
    self.assertEqual(other.person_id, person_id + 1)  # a new person_id
def read_dumped_objects(self, file_name):
    """Read dumped dedupe objects from a JSON-lines file and group them by FIO.

    Args:
        file_name: path to a text file holding one JSON object per line.

    Every parsed object is appended to
    ``self.cluster_by_minimal_fio[<fio with initials>]``. When the
    ``recreate_db`` option is set, a ``Section`` (also remembered in
    ``self.section_cache``) or a ``Person`` row is created for each object;
    in that mode the ``Section`` table must be empty beforehand.
    """
    # The option does not change while the file is read, so look it up once.
    recreate_db = self.options.get('recreate_db')
    if recreate_db:
        assert models.Section.objects.count() == 0

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(file_name, encoding="utf-8") as inp:
        for line in inp:
            obj = TDeduplicationObject().from_json(json.loads(line))
            if recreate_db:
                record_id = obj.record_id
                if record_id.source_table == TDeduplicationObject.SECTION:
                    assert len(obj.offices) == 1
                    section = models.Section(
                        id=record_id.id,
                        office_id=next(iter(obj.offices)))
                    self.section_cache[record_id.id] = section
                    section.save()
                else:
                    models.Person(id=record_id.id).save()
            self.cluster_by_minimal_fio[
                obj.fio.build_fio_with_initials()].append(obj)
def test(self):
    """Dedupe must restore the old person id from the permalinks storage.

    The permalinks dataset is saved while sections 1 and 2 are linked to
    person 99. After the person is deleted and dedupe is rerun, exactly one
    person must exist and sections 1, 2 and 3 must all reference id 99.
    """
    self.initialize()
    person_id = 99
    old_person = models.Person(id=person_id)
    old_person.save()
    first = self.create_section(1, "Иванов Иван Иванович", person=old_person)
    second = self.create_section(2, "Иванов И. И.", person=old_person)
    third = self.create_section(3, "Петров И. И.")

    folder = os.path.dirname(__file__)
    permalinks = TPermaLinksPerson(folder)
    permalinks.create_db()
    permalinks.save_dataset(setup_logging())
    permalinks.recreate_auto_increment_table()
    permalinks.close_db()

    # Detach the sections and drop the person so dedupe starts from scratch.
    for section in (first, second):
        section.person = None
        section.save()
    old_person.delete()

    dedupe_options = dict(
        permalinks_folder=folder,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        take_sections_with_empty_income=True,
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    self.assertEqual(models.Person.objects.count(), 1)
    for section_id in (1, 2, 3):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, person_id)
def create_records(self, records):
    """Wipe the test tables and repopulate them from ``records``.

    Args:
        records: dict that may contain the keys ``'source_documents'``,
            ``'persons'``, ``'sections'`` and ``'redirects'``; each value is
            a list of keyword-argument dicts for the matching model. Every
            section dict must contain ``'office_id'``.

    At least one ``Office`` row must already exist. An office referenced by
    a section but missing from the table is created with the name "aaa".
    """
    for model in (models.Section, models.Source_Document,
                  models.Person, models.PersonRedirect):
        model.objects.all().delete()

    # exists() asks the database for a single row instead of counting all.
    assert models.Office.objects.exists()

    for fields in records.get('source_documents', []):
        models.Source_Document(**fields).save()

    for fields in records.get('persons', []):
        models.Person(**fields).save()

    for fields in records.get('sections', []):
        office_id = fields['office_id']
        # exists() avoids loading the matching rows just to test presence.
        if not models.Office.objects.filter(id=office_id).exists():
            models.Office(id=office_id, name="aaa").save()
        models.Section(**fields).save()

    for fields in records.get('redirects', []):
        models.PersonRedirect(**fields).save()
def test(self):
    """Dedupe must keep a declarator person and attach the free section to it.

    Person 2 (declarator id 1111) owns section 1; section 2 has no person.
    After ``RunDedupe`` there must still be one person with an unchanged
    ``declarator_person_id``, referenced by both sections.
    """
    self.initialize()
    person_id = 2
    declarator_person_id = 1111
    declarator_person = models.Person(
        id=person_id,
        declarator_person_id=declarator_person_id,
        person_name="Иванов Иван Иванович",
    )
    declarator_person.save()
    self.create_section(1, "Иванов Иван Иванович", declarator_person)
    self.create_section(2, "Иванов И. И.")

    folder = os.path.dirname(__file__)
    permalinks = TPermaLinksPerson(folder)
    permalinks.create_db()
    permalinks.save_dataset(setup_logging())
    permalinks.recreate_auto_increment_table()
    permalinks.close_db()

    dedupe_options = dict(
        permalinks_folder=folder,
        write_to_db=True,
        fake_dedupe=True,
        surname_bounds=',',
        take_sections_with_empty_income=True,
        rebuild=True,
    )
    RunDedupe(None, None).handle(None, **dedupe_options)

    self.assertEqual(models.Person.objects.count(), 1)
    stored_person = models.Person.objects.get(id=person_id)
    self.assertIsNotNone(stored_person)
    self.assertEqual(declarator_person_id, stored_person.declarator_person_id)

    for section_id in (1, 2):
        section = models.Section.objects.get(id=section_id)
        self.assertEqual(section.person_id, stored_person.id)
def copy_human_merge(self, section, declarator_person_id):
    """Link ``section`` to the person matching ``declarator_person_id``.

    Lookup order:
      1. the cache keyed by declarator person id;
      2. the person id returned by ``self.permalinks_db`` looked up in the
         cache keyed by disclosures person id (a mismatch of declarator ids
         is logged as an error and the cached person is still used);
      3. otherwise a new ``Person`` is created, saved and put in both caches.

    A person found in the first cache gets the section's name when its own
    name is missing or shorter. The section's ``dedupe_score`` is reset to
    ``None`` and the section is saved.
    """
    person = self.declarator_person_id_to_disclosures_person.get(
        declarator_person_id)
    if person is None:
        person_id = self.permalinks_db.get_person_id_by_declarator_id(
            declarator_person_id, section.id)
        if person_id in self.disclosures_person_id_to_disclosures_person:
            person = self.disclosures_person_id_to_disclosures_person.get(
                person_id)
            if declarator_person_id != person.declarator_person_id:
                self.logger.error(
                    "Person id={} has conflict declarator_person_id ({} != {}), use the first person id {}"
                    .format(person_id, declarator_person_id,
                            person.declarator_person_id,
                            person.declarator_person_id))
        else:
            person = models.Person(
                id=person_id,
                declarator_person_id=declarator_person_id,
                person_name=section.person_name)
            person.save()
            self.declarator_person_id_to_disclosures_person[
                declarator_person_id] = person
            # This cache is queried by the disclosures person id above, so
            # it must be filled under that id, not the declarator id.
            self.disclosures_person_id_to_disclosures_person[
                person_id] = person
    elif person.person_name is None or len(person.person_name) < len(
            section.person_name):
        person.person_name = section.person_name
        person.save()

    assert person.declarator_person_id is not None
    self.logger.debug(
        "connect section {} to person {}, declarator_person_id={}".format(
            section.id, person.id, person.declarator_person_id))
    section.person = person
    section.dedupe_score = None
    section.save()
def test_corrected_person(self):
    """A corrected section and its replacement must be listed only once.

    ``SECTION_CORRECTIONS`` maps section 8048661 to 9798543; when both
    belong to the same person, ``sections_ordered_by_year`` must contain a
    single section.
    """
    for model in (models.Section, models.Source_Document, models.Person):
        model.objects.all().delete()

    source_document = models.Source_Document(id=1)
    source_document.save()
    assert SECTION_CORRECTIONS.get_corrected_section_id(8048661) == 9798543

    full_name = "Иванов Иван Ильич"
    models.Person(id=1, person_name=full_name).save()
    for section_id in (8048661, 9798543):
        models.Section(
            id=section_id,
            income_year=2016,
            person_name=full_name,
            source_document=source_document,
            office_id=1,
            person_id=1,
        ).save()

    owner = models.Person.objects.get(id=1)
    ordered_sections = owner.sections_ordered_by_year
    self.assertEqual(1, len(ordered_sections))