def to_dict(self):
    """Convert Minion model to an indexable presentation for ES."""
    mp_name = self.mp2convocation.mp.name
    minion_name = self.minion.name

    # Searchable name variants for both the MP and the aide.
    persons = parse_and_generate(mp_name, "Депутат") | parse_and_generate(
        minion_name, "Помічник")
    suggestions = autocomplete_suggestions(mp_name) | autocomplete_suggestions(
        minion_name)

    doc = model_to_dict(self, fields=["paid"])
    doc.update({
        "mp": self.mp2convocation.to_dict(),
        "_id": self.id,
        "id": self.minion.id,
        "name": minion_name,
        "companies": [self.mp2convocation.party],
        "persons": [p for p in persons if p],
        "names_autocomplete": list(suggestions),
    })
    return doc
def to_dict(self):
    """Convert the declaration into an indexable document for ES."""
    data = self.data
    general = data["general"]

    doc = {
        "_id": self.pk,
        "last_updated_from_dataset": self.last_updated_from_dataset,
        "first_updated_from_dataset": self.first_updated_from_dataset,
    }

    addresses = set()
    companies = {general["post"]["office"]}

    # The declarant themselves, indexed under their post.
    persons = generate_all_names(
        general["last_name"], general["name"], general["patronymic"],
        general["post"]["post"])
    names_autocomplete = autocomplete_suggestions(
        concat_name(general["last_name"], general["name"],
                    general["patronymic"]))

    # Family members from the first section (plus their workplaces).
    for member in general["family"]:
        persons |= generate_all_names(member["last_name"], member["name"],
                                      member["patronymic"],
                                      member["relation"])
        names_autocomplete |= autocomplete_suggestions(
            concat_name(member["last_name"], member["name"],
                        member["patronymic"]))
        for job in member["career"]:
            companies.add(job["workplace"])

    # Second section may mention persons absent from the first one.
    for member in general["family_conflicts"]:
        suggestions = autocomplete_suggestions(
            concat_name(member["last_name"], member["name"],
                        member["patronymic"]))
        if suggestions and not suggestions.issubset(names_autocomplete):
            persons |= generate_all_names(member["last_name"],
                                          member["name"],
                                          member["patronymic"],
                                          "Пов'язана особа")
            logger.warning(
                "Oh wow, second section has persons missing from first one {}"
                .format(suggestions))
            names_autocomplete |= suggestions

    doc.update(data)
    doc.update({
        "companies": list(filter(None, companies)),
        "addresses": list(filter(None, addresses)),
        "persons": list(filter(None, persons)),
        "names_autocomplete": list(filter(None, names_autocomplete)),
    })
    return doc
def handle(self, *args, **options):
    """Reindex all Declaration docs in ES: normalise apostrophes in names,
    rebuild the full-text source (with translated variants), suggest/sort
    fields and the person-name indices, then save each doc back.
    """
    translator = Translator()
    translator.fetch_full_dict_from_db()
    self.apply_migrations()
    all_decls = Declaration.search().query('match_all').scan()
    for decl in tqdm.tqdm(all_decls):
        # NOTE(review): the dict snapshot is taken BEFORE the apostrophe
        # normalisation below, so ft_src/index_card are built from the
        # original (un-normalised) values — confirm this is intended.
        decl_dct = decl.to_dict()
        decl.general.full_name = replace_apostrophes(decl.general.full_name)
        decl.general.name = replace_apostrophes(decl.general.name)
        decl.general.last_name = replace_apostrophes(decl.general.last_name)
        decl.general.patronymic = replace_apostrophes(decl.general.patronymic)
        # Name variants fed to the ES completion suggester.
        decl.general.full_name_suggest = {
            'input': [
                decl.general.full_name,
                ' '.join([decl.general.name,
                          decl.general.patronymic,
                          decl.general.last_name]),
                ' '.join([decl.general.name,
                          decl.general.last_name])
            ]
        }
        # Blank the old full-text blob in the snapshot, presumably so the
        # previous ft_src doesn't leak into the rebuilt one.
        decl_dct["ft_src"] = ""
        terms = filter_only_interesting(decl_dct)
        # Append a translated variant of every term; the comprehension is
        # fully evaluated from the original list before += extends it.
        terms += [translator.translate(x)["translation"] for x in terms]
        decl.ft_src = "\n".join(terms)
        decl.general.full_name_for_sorting = keyword_for_sorting(decl.general.full_name)
        decl.index_card = concat_fields(decl_dct, Declaration.INDEX_CARD_FIELDS)
        # Declarant plus family members; last tuple element is the relation.
        extracted_names = [(decl.general.last_name, decl.general.name,
                            decl.general.patronymic, None)]
        persons = set()
        names_autocomplete = set()
        for person in decl.general.family:
            l, f, p, _ = parse_fullname(person.family_name)
            extracted_names.append((l, f, p, person.relations))
        for name in extracted_names:
            persons |= generate_all_names(
                *name
            )
            # The relation component is dropped for autocomplete purposes.
            names_autocomplete |= autocomplete_suggestions(
                concat_name(*name[:-1])
            )
        decl.persons = list(filter(None, persons))
        decl.names_autocomplete = list(filter(None, names_autocomplete))
        decl.save()
def to_dict(self):
    """Build the ES document for a tax-debtor record."""
    rec = self.data
    doc = {
        "_id": self.pk,
        "last_updated_from_dataset": self.last_updated_from_dataset,
        "first_updated_from_dataset": self.first_updated_from_dataset,
    }

    addresses = set()
    if rec["TIN_S"]:
        # A tax number is present: NAME denotes an entity, PIB a person.
        companies = deal_with_mixed_lang(rec["NAME"])
        companies |= generate_edrpou_options(rec["TIN_S"])
        persons = parse_and_generate(rec["PIB"], "боржник")
    else:
        companies = set()
        persons = parse_and_generate(rec["NAME"], "боржник")

    companies |= deal_with_mixed_lang(rec["DPI"])
    persons |= parse_and_generate(rec["DPI_BOSS"], "керівник податкової")

    names_autocomplete = set(companies)
    for field in ("NAME", "PIB", "DPI_BOSS"):
        names_autocomplete |= autocomplete_suggestions(rec[field])

    doc.update(rec)
    doc.update(
        {
            "companies": list(filter(None, companies)),
            "addresses": list(filter(None, addresses)),
            "persons": list(filter(None, persons)),
            "names_autocomplete": list(filter(None, names_autocomplete)),
        }
    )
    return doc
def handle(self, *args, **options):
    """Reindex all Declaration docs in ES: normalise apostrophes in names,
    rebuild suggest/sort/full-text fields and the person-name indices,
    then save each doc back.
    """
    self.apply_migrations()
    all_decls = Declaration.search().query('match_all').scan()
    for decl in all_decls:
        # NOTE(review): the dict snapshot is taken BEFORE the apostrophe
        # normalisation below, so ft_src/index_card are built from the
        # original (un-normalised) values — confirm this is intended.
        decl_dct = decl.to_dict()
        sys.stdout.write('Processing decl for {}\n'.format(
            decl.general.full_name))
        sys.stdout.flush()
        decl.general.full_name = replace_apostrophes(
            decl.general.full_name)
        decl.general.name = replace_apostrophes(decl.general.name)
        decl.general.last_name = replace_apostrophes(
            decl.general.last_name)
        decl.general.patronymic = replace_apostrophes(
            decl.general.patronymic)
        # Name variants fed to the ES completion suggester.
        decl.general.full_name_suggest = {
            'input': [
                decl.general.full_name,
                ' '.join([
                    decl.general.name, decl.general.patronymic,
                    decl.general.last_name
                ]),
                ' '.join([decl.general.name, decl.general.last_name])
            ]
        }
        # Blank the old full-text blob in the snapshot, presumably so the
        # previous ft_src doesn't leak into the rebuilt one.
        decl_dct["ft_src"] = ""
        decl.ft_src = "\n".join(filter_only_interesting(decl_dct))
        decl.general.full_name_for_sorting = keyword_for_sorting(
            decl.general.full_name)
        decl.index_card = concat_fields(decl_dct,
                                        Declaration.INDEX_CARD_FIELDS)
        # Declarant plus family members; last tuple element is the relation.
        extracted_names = [(decl.general.last_name, decl.general.name,
                            decl.general.patronymic, None)]
        persons = set()
        names_autocomplete = set()
        for person in decl.general.family:
            l, f, p, _ = parse_fullname(person.family_name)
            extracted_names.append((l, f, p, person.relations))
        for name in extracted_names:
            persons |= generate_all_names(*name)
            # The relation component is dropped for autocomplete purposes.
            names_autocomplete |= autocomplete_suggestions(
                concat_name(*name[:-1]))
        decl.persons = list(filter(None, persons))
        decl.names_autocomplete = list(filter(None, names_autocomplete))
        decl.save()
def to_dict(self):
    """Build the ES document for a party-donation record."""
    rec = self.data
    doc = {"_id": self.pk}

    companies = {rec["party"]}
    companies |= generate_edrpou_options(rec["donator_code"])
    companies |= generate_edrpou_options(rec["party"])
    for optional_key in ("branch_code", "branch_name"):
        if rec.get(optional_key):
            companies |= generate_edrpou_options(rec[optional_key])

    addresses = {rec["donator_location"]}
    persons = {rec.get("candidate_name")}
    names_autocomplete = set()

    if rec["donator_code"]:
        # A donator code means the donator is a legal entity.
        companies.add(rec["donator_name"])
    else:
        persons |= parse_and_generate(rec["donator_name"], "Донор")
        names_autocomplete |= autocomplete_suggestions(rec["donator_name"])

    names_autocomplete |= companies

    raw_records = {
        rec.get("account_number"),
        rec.get("payment_subject"),
        rec["transaction_doc_number"],
    }

    doc.update(rec)
    doc.update(
        {
            "companies": list(filter(None, companies)),
            "addresses": list(filter(None, addresses)),
            "persons": list(filter(None, persons)),
            "names_autocomplete": list(filter(None, names_autocomplete)),
            "raw_records": list(filter(None, raw_records)),
            "type": self.get_type_display(),
            "period": self.period,
            "ultimate_recepient": self.ultimate_recepient,
        }
    )
    return doc
def to_dict(self):
    """Build the ES document for a procurement deal.

    Aggregates buyer/seller company names and codes, addresses, the
    buyer's representative (when present) and goods descriptions into
    the standard `companies`/`addresses`/`persons`/`names_autocomplete`/
    `raw_records` facets, with empty values filtered out.
    """
    dt = self.data
    purchase = dt["purchase"]
    buyer = purchase["buyer"]
    seller = dt["seller"]

    res = {
        "_id": self.pk,
        "details_url": "https://z.texty.org.ua/deal/{}".format(dt["id"]),
        "last_updated_from_dataset": self.last_updated_from_dataset,
        "first_updated_from_dataset": self.first_updated_from_dataset,
    }

    companies = {buyer["name"], buyer["name_en"], seller["name"]}
    companies |= generate_edrpou_options(buyer["code"])
    companies |= generate_edrpou_options(seller["code"])
    companies |= generate_edrpou_options(purchase["cost_dispatcher_code"])

    addresses = {
        seller["address"],
        seller["address_full"],
        buyer["address"],
        buyer["address_en"],
    }

    persons = set()
    if buyer["person"]:
        persons |= parse_and_generate(buyer["person"],
                                      "Представник замовника")

    raw_records = {purchase["goods_name"], purchase["goods_name_short"]}

    # BUGFIX: copy instead of aliasing. Previously
    # `names_autocomplete = companies` shared the same set object, so the
    # person-name suggestions added below leaked into the indexed
    # "companies" facet as well.
    names_autocomplete = set(companies)
    if buyer["person"]:
        names_autocomplete |= autocomplete_suggestions(buyer["person"])

    res.update(dt)
    res.update({
        "companies": list(filter(None, companies)),
        "addresses": list(filter(None, addresses)),
        "persons": list(filter(None, persons)),
        "names_autocomplete": list(filter(None, names_autocomplete)),
        "raw_records": list(filter(None, raw_records)),
    })
    return res
def to_dict(self):
    """Build the ES document for a corruptioner-registry record.

    Returns the raw payload plus the aggregated `companies`/`persons`/
    `names_autocomplete`/`raw_records` facets with empty values filtered
    out.
    """
    dt = self.data
    res = {
        "_id": self.pk,
        "last_updated_from_dataset": self.last_updated_from_dataset,
        "first_updated_from_dataset": self.first_updated_from_dataset,
    }

    # BUGFIX: CORR_WORK_PLACE is a single workplace name (a string, like
    # the other scalar fields of this payload); the previous
    # `set(dt["CORR_WORK_PLACE"])` exploded it into individual
    # characters. Keep a fallback for iterable values just in case.
    work_place = dt["CORR_WORK_PLACE"]
    companies = {work_place} if isinstance(work_place, str) else set(work_place)

    addresses = set()
    persons = generate_all_names(dt["CORRUPTIONER_LAST_NAME"],
                                 dt["CORRUPTIONER_FIRST_NAME"],
                                 dt["CORRUPTIONER_SURNAME"],
                                 "Корупціонер")

    names_autocomplete = autocomplete_suggestions(
        concat_name(dt["CORRUPTIONER_LAST_NAME"],
                    dt["CORRUPTIONER_FIRST_NAME"],
                    dt["CORRUPTIONER_SURNAME"]))

    # Free-text bits of the record: position, offense, sentence etc.
    raw_records = set([
        dt["CORR_WORK_POS"],
        dt["ACTIVITY_SPH_NAME"],
        dt["OFFENSE_NAME"],
        dt["PUNISHMENT"],
        dt["COURT_CASE_NUM"],
        dt["SENTENCE_NUMBER"],
        dt["CODEX_ARTICLES_LIST_CODEX_ART_NUMBER"],
        dt["CODEX_ARTICLES_LIST_CODEX_ART_NAME"],
    ])

    res.update(dt)
    res.update({
        "companies": list(filter(None, companies)),
        "addresses": list(filter(None, addresses)),
        "persons": list(filter(None, persons)),
        "names_autocomplete": list(filter(None, names_autocomplete)),
        "raw_records": list(filter(None, raw_records)),
    })
    return res
def to_dict(self):
    """Build the ES document for an MP-candidate record."""
    rec = self.data
    doc = {
        "_id": self.pk,
    }

    companies = {rec["party"]}
    persons = set() | parse_and_generate(rec["name"], "Кандидат в депутати")
    names_autocomplete = set() | autocomplete_suggestions(rec["name"])

    body_lower = rec["body"].lower()
    if "область" in body_lower or "київ" in body_lower:
        # Try to split off the election-body name after the region word.
        chunks = re.split(r"область", rec["body"], flags=re.I, maxsplit=1)
        if len(chunks) != 2:
            chunks = re.split(r"київ", rec["body"], flags=re.I, maxsplit=1)
        if len(chunks) == 2:
            companies.add(chunks[1])
            # Stored back into the payload so the derived fields are
            # indexed via doc.update(rec) below.
            rec["body_name"] = chunks[1]
            rec["body_region"] = rec["body"].replace(rec["body_name"],
                                                     "").strip()
        else:
            logger.warning("Cannot parse body name out of {}".format(
                rec["body"]))

    names_autocomplete |= companies

    doc.update(rec)
    doc.update({
        "companies": list(filter(None, companies)),
        "persons": list(filter(None, persons)),
        "names_autocomplete": list(filter(None, names_autocomplete)),
    })
    return doc
def to_dict(self):
    """Convert an EDR (company registry) report into an ES document.

    Returns None when the report lacks its title section — callers must
    be prepared to skip such records.
    """
    dt = self.data
    res = {
        "_id": self.pk,
        "report_id": dt["report"]["id"],
        "timestamp": dt_parse(dt["report"]["timestamp"]),
        "last_updated_from_dataset": self.last_updated_from_dataset,
        "first_updated_from_dataset": self.first_updated_from_dataset,
    }
    names_autocomplete = set()
    companies = set()
    persons = set()
    addresses = set()

    # Pre-built jmespath expressions pick the title and person sections
    # out of the raw report payload.
    title1 = self.title1_jmespath.search(dt)
    title2 = self.title2_jmespath.search(dt)
    report_title = self.title3_jmespath.search(dt)
    if not report_title:
        return None

    report_title = report_title[0]
    report_title["STD"] = dt_parse(report_title["STD"])
    report_title["FID"] = dt_parse(report_title["FID"])

    titles = title1 + title2
    if titles:
        # Only the first detailed title section is indexed.
        title = titles[0]
        address = ", ".join(
            filter(
                None,
                [
                    title.get("E_CONT"),
                    title.get("E_ADRES"),
                    title.get("E_POST"),
                    title.get("E_RAYON"),
                    title.get("E_STREET"),
                ],
            )
        )
        addresses.add(address)
        res["detailed_title"] = title
        companies |= deal_with_mixed_lang(title.get("E_NAME"))
        if title.get("FIO_PODP"):
            # Signatory name(s), indexed under their position (may be empty).
            for p in deal_with_mixed_lang(title["FIO_PODP"]):
                persons |= parse_and_generate(
                    p, title["POS_PODP"] or ""
                )
                names_autocomplete |= autocomplete_suggestions(p)

    res["report_title"] = report_title
    companies |= generate_edrpou_options(report_title.get("D_EDRPOU"))
    companies |= deal_with_mixed_lang(report_title.get("D_NAME"))

    associates = self.current_persons_jmespath.search(dt)
    dismissed_associates = self.fired_persons_jmespath.search(dt)
    res["associates"] = associates
    res["dismissed_associates"] = dismissed_associates

    for assoc in associates + dismissed_associates:
        # Normalise a possibly-missing passport date to None; parse when set.
        assoc["DAT_PASP"] = assoc.get("DAT_PASP")
        if assoc["DAT_PASP"]:
            assoc["DAT_PASP"] = dt_parse(assoc["DAT_PASP"])
        full_name = assoc.get("P_I_B", "") or ""
        if full_name.strip():
            parsed_name = ""  # NOTE(review): assigned but never used
            parsed_chunks = []
            # TODO: better word splitting
            for chunk in full_name.split():
                # TODO: better detection of latin
                chunk = try_to_fix_mixed_charset(chunk)
                # Stop collecting at the first latin/parenthesised/dashed
                # chunk; everything before it is treated as the name.
                if (
                    is_eng(chunk)
                    or chunk.startswith("(")
                    or chunk.endswith(")")
                    # NOTE(review): `chunk in "-"` is true only for "" or
                    # "-"; likely meant `chunk == "-"`, which
                    # startswith("-") already covers — confirm intent.
                    or chunk in "-"
                    or chunk.startswith("-")
                ):
                    break
                elif chunk:
                    parsed_chunks.append(chunk)

            # Looks like real person
            if len(parsed_chunks) in [2, 3]:
                persons |= parse_and_generate(
                    " ".join(parsed_chunks), assoc.get("POSADA", "") or ""
                )
                names_autocomplete |= autocomplete_suggestions(" ".join(parsed_chunks))
                persons |= parse_and_generate(
                    full_name, assoc.get("POSADA", "") or ""
                )
                names_autocomplete |= autocomplete_suggestions(full_name)
            else:
                # Not a 2-3 word name: treat the record as an organisation.
                companies.add(" ".join(parsed_chunks))
                companies |= deal_with_mixed_lang(full_name)

    names_autocomplete |= companies

    res.update(
        {
            "companies": list(filter(None, companies)),
            "addresses": list(filter(None, addresses)),
            "persons": list(filter(None, persons)),
            "names_autocomplete": list(filter(None, names_autocomplete)),
        }
    )
    return res
def to_dict(self):
    """Aggregate the company header, all of its records, related persons
    and snapshot flags into a single ES document.

    The company record with the highest revision becomes
    ``latest_record``; companies whose records carry no revisions at all
    (a warning is logged per such record) get None there instead of
    crashing.
    """
    addresses = set()
    persons = set()
    all_persons = set()
    names_autocomplete = set()
    companies = set()
    company_profiles = set()
    raw_records = set()

    companies.add(self.full_edrpou)
    companies.add(str(self.pk))

    latest_record = None
    latest_revision = 0

    for company_record in (
        self.records.all()
        .defer("company_hash", "location_parsing_quality")
        .nocache()
    ):
        addresses.add(company_record.location)
        addresses.add(company_record.parsed_location)
        addresses.add(company_record.validated_location)
        raw_records |= phone_variants(company_record.phone1)
        raw_records |= phone_variants(company_record.phone2)
        raw_records |= phone_variants(company_record.fax)
        raw_records.add(company_record.email)
        raw_records.add(company_record.form)
        company_profiles.add(company_record.company_profile)
        companies.add(company_record.name)
        companies.add(company_record.short_name)
        names_autocomplete.add(company_record.name)
        names_autocomplete.add(company_record.short_name)
        names_autocomplete.add(self.full_edrpou)
        names_autocomplete.add(str(self.pk))

        if company_record.revisions:
            # Hoisted: max() was previously computed twice per record.
            top_revision = max(company_record.revisions)
            if top_revision > latest_revision:
                latest_record = company_record
                latest_revision = top_revision
        else:
            logger.warning(
                "Cannot find revisions for the CompanyRecord {}".format(self.pk)
            )

    for person in (
        self.persons.all().defer("tokenized_record", "share", "revisions").nocache()
    ):
        for name in person.name:
            persons.add((name, person.get_person_type_display()))
        for addr in person.address:
            addresses.add(addr)
        for country in person.country:
            addresses.add(country)
        raw_records.add(person.raw_record)

    snapshot = self.snapshot_stats.order_by("-revision_id").first()
    flags = snapshot.to_dict() if snapshot else None

    # Expand raw (name, position) pairs into all searchable name variants.
    for name, position in persons:
        all_persons |= parse_and_generate(name, position)
        names_autocomplete |= autocomplete_suggestions(name)

    return {
        "full_edrpou": self.full_edrpou,
        "addresses": list(filter(None, addresses)),
        "raw_persons": list(filter(None, persons)),
        "persons": list(filter(None, all_persons)),
        "companies": list(filter(None, companies)),
        "company_profiles": list(filter(None, company_profiles)),
        # BUGFIX: latest_record can legitimately stay None (no records,
        # or none with revisions — the exact case the warning above is
        # logged for); previously this raised AttributeError.
        "latest_record": latest_record.to_dict() if latest_record else None,
        "raw_records": list(filter(None, raw_records)),
        "names_autocomplete": list(filter(None, names_autocomplete)),
        "internals": {"flags": flags},
    }