def handle(self, *args, **options):
    """Re-normalize and re-save every declaration found by a match_all scan.

    Calls ``self.apply_migrations()`` first. For each declaration the name
    fields are passed through ``replace_apostrophes`` and the derived fields
    (``full_name_suggest``, ``ft_src``, ``full_name_for_sorting``,
    ``index_card``) are rebuilt before ``save()``.
    """
    self.apply_migrations()

    for decl in Declaration.search().query('match_all').scan():
        general = decl.general

        sys.stdout.write('Processing decl for {}\n'.format(general.full_name))
        sys.stdout.flush()

        # Normalize apostrophes in the full name and in each of its parts.
        for field in ('full_name', 'name', 'last_name', 'patronymic'):
            setattr(general, field,
                    replace_apostrophes(getattr(general, field)))

        # Suggester inputs: stored full name, "name patronymic last_name",
        # and "name last_name".
        with_patronymic = ' '.join(
            [general.name, general.patronymic, general.last_name])
        without_patronymic = ' '.join([general.name, general.last_name])
        general.full_name_suggest = {
            'input': [general.full_name, with_patronymic, without_patronymic]
        }

        # NOTE(review): to_dict() here still carries whatever ft_src is
        # already stored on the document -- confirm filter_only_interesting
        # does not feed the old value back into the new one.
        interesting = filter_only_interesting(decl.to_dict())
        decl.ft_src = "\n".join(interesting)

        general.full_name_for_sorting = keyword_for_sorting(general.full_name)

        # A fresh to_dict() so the index card sees the fields set above.
        decl.index_card = concat_fields(
            decl.to_dict(), Declaration.INDEX_CARD_FIELDS)

        decl.save()
def handle(self, *args, **options):
    """Rebuild derived search fields for every declaration, with translations.

    Loads the translator dictionary, calls ``self.apply_migrations()``, then
    walks a match_all scan (wrapped in a tqdm progress bar). For each
    declaration it normalizes apostrophes in the name fields and rebuilds
    ``full_name_suggest``, ``ft_src`` (original terms followed by their
    translations), ``full_name_for_sorting``, ``index_card``, ``persons`` and
    ``names_autocomplete`` before saving.
    """
    translator = Translator()
    translator.fetch_full_dict_from_db()

    self.apply_migrations()

    for decl in tqdm.tqdm(Declaration.search().query('match_all').scan()):
        # NOTE(review): the snapshot is taken before the name fields are
        # normalized below, so ft_src and index_card are derived from the
        # pre-normalization values -- confirm this is intended.
        decl_dct = decl.to_dict()
        general = decl.general

        for field in ('full_name', 'name', 'last_name', 'patronymic'):
            setattr(general, field,
                    replace_apostrophes(getattr(general, field)))

        general.full_name_suggest = {
            'input': [
                general.full_name,
                ' '.join([general.name, general.patronymic,
                          general.last_name]),
                ' '.join([general.name, general.last_name]),
            ]
        }

        # Blank the stored full-text source in the snapshot before it is
        # passed to filter_only_interesting.
        decl_dct["ft_src"] = ""
        terms = filter_only_interesting(decl_dct)
        translated = [
            translator.translate(term)["translation"] for term in terms
        ]
        terms += translated
        decl.ft_src = "\n".join(terms)

        general.full_name_for_sorting = keyword_for_sorting(general.full_name)
        decl.index_card = concat_fields(
            decl_dct, Declaration.INDEX_CARD_FIELDS)

        # (last, first, patronymic, relation) tuples: the declarant first
        # (relation None), then every family member.
        extracted_names = [
            (general.last_name, general.name, general.patronymic, None)
        ]
        for relative in general.family:
            last, first, middle, _ = parse_fullname(relative.family_name)
            extracted_names.append((last, first, middle, relative.relations))

        persons = set()
        names_autocomplete = set()
        for entry in extracted_names:
            persons |= generate_all_names(*entry)
            # The autocomplete input is built from the name parts only.
            names_autocomplete |= autocomplete_suggestions(
                concat_name(*entry[:-1]))

        # Drop falsy entries before storing.
        decl.persons = [item for item in persons if item]
        decl.names_autocomplete = [
            item for item in names_autocomplete if item
        ]

        decl.save()
def handle(self, *args, **options):
    """Rebuild derived search fields for every declaration and save it.

    Calls ``self.apply_migrations()``, then for each declaration from a
    match_all scan: writes a progress line to stdout, normalizes apostrophes
    in the name fields, and rebuilds ``full_name_suggest``, ``ft_src``,
    ``full_name_for_sorting``, ``index_card``, ``persons`` and
    ``names_autocomplete``.
    """
    self.apply_migrations()

    for decl in Declaration.search().query('match_all').scan():
        # NOTE(review): the snapshot is taken before the name fields are
        # normalized below, so ft_src and index_card are derived from the
        # pre-normalization values -- confirm this is intended.
        decl_dct = decl.to_dict()
        general = decl.general

        sys.stdout.write('Processing decl for {}\n'.format(general.full_name))
        sys.stdout.flush()

        general.full_name = replace_apostrophes(general.full_name)
        general.name = replace_apostrophes(general.name)
        general.last_name = replace_apostrophes(general.last_name)
        general.patronymic = replace_apostrophes(general.patronymic)

        suggest_inputs = [
            general.full_name,
            ' '.join([general.name, general.patronymic, general.last_name]),
            ' '.join([general.name, general.last_name]),
        ]
        general.full_name_suggest = {'input': suggest_inputs}

        # Blank the stored full-text source in the snapshot before it is
        # passed to filter_only_interesting.
        decl_dct["ft_src"] = ""
        decl.ft_src = "\n".join(filter_only_interesting(decl_dct))

        general.full_name_for_sorting = keyword_for_sorting(general.full_name)
        decl.index_card = concat_fields(
            decl_dct, Declaration.INDEX_CARD_FIELDS)

        # The declarant goes first with relation None, family members follow.
        extracted_names = [
            (general.last_name, general.name, general.patronymic, None)
        ]
        for relative in general.family:
            last, first, middle, _ = parse_fullname(relative.family_name)
            extracted_names.append((last, first, middle, relative.relations))

        persons = set()
        names_autocomplete = set()
        for entry in extracted_names:
            persons |= generate_all_names(*entry)
            name_parts = entry[:-1]  # everything except the relation
            names_autocomplete |= autocomplete_suggestions(
                concat_name(*name_parts))

        decl.persons = [item for item in persons if item]
        decl.names_autocomplete = [
            item for item in names_autocomplete if item
        ]

        decl.save()
def handle(self, *args, **options):
    """Normalize apostrophes in every declaration's names and re-save it.

    For each declaration from a match_all scan the name fields go through
    ``replace_apostrophes``, then ``full_name_suggest`` and ``ft_src`` are
    rebuilt and the document is saved. A progress line is printed per item.
    """
    for decl in Declaration.search().query('match_all').scan():
        general = decl.general
        print('Processing decl for {}'.format(general.full_name))

        for field in ('full_name', 'name', 'last_name', 'patronymic'):
            setattr(general, field,
                    replace_apostrophes(getattr(general, field)))

        with_patronymic = ' '.join(
            [general.name, general.patronymic, general.last_name])
        without_patronymic = ' '.join([general.name, general.last_name])
        general.full_name_suggest = {
            'input': [general.full_name, with_patronymic, without_patronymic]
        }

        # NOTE(review): to_dict() here still carries whatever ft_src is
        # already stored on the document -- confirm filter_only_interesting
        # does not feed the old value back into the new one.
        decl.ft_src = "\n".join(filter_only_interesting(decl.to_dict()))

        decl.save()
def save_search(request):
    """Save the current search via ``do_save_search``.

    The query text (``q``) is apostrophe-normalized and stripped, and
    ``deepsearch`` is coerced to a bool. All remaining GET parameters except
    ``q``, ``deepsearch``, ``format``, ``page`` and ``sort`` are passed on
    url-encoded. AJAX requests get a plain ``'OK'`` response; other requests
    get whatever ``do_save_search`` returned.
    """
    get_params = request.GET
    query = replace_apostrophes(get_params.get("q", "")).strip()
    deepsearch = bool(get_params.get("deepsearch", ""))

    # Work on a copy, and drop the keys that are handled separately or are
    # presentation-only.
    extra = get_params.copy()
    for key in ("q", "deepsearch", "format", "page", "sort"):
        extra.pop(key, None)

    response = do_save_search(request, query, deepsearch, extra.urlencode())

    return HttpResponse('OK') if request.is_ajax() else response
def save_search(request):
    """Create a ``SearchTask`` for the current user from the GET parameters.

    The request is rejected with a warning message and a redirect when the
    query is shorter than 2 or longer than 150 characters, when the user has
    no e-mail address (redirects to ``edit_email`` with ``next`` set to the
    current path), or when an identical non-deleted task already exists.
    Otherwise the task is saved, ``first_run`` is called on it, a
    notification is attempted, and the user is redirected to ``search_list``.
    """

    def reject(text):
        # Flash a warning and send the user back to the task list.
        messages.warning(request, text)
        return redirect('search_list')

    query = replace_apostrophes(request.GET.get("q", "")).strip()
    deepsearch = bool(request.GET.get("deepsearch", ""))

    if len(query) < 2:
        return reject('Не вдалось створити завдання з пустим запитом.')

    if len(query) > 150:
        return reject('Не вдалось створити завдання з таким довгим запитом.')

    user = request.user
    if not user.email:
        messages.warning(
            request,
            'Не вдалось створити завдання без адреси електронної пошти. ' +
            'Спочатку введіть адресу.')
        # Come back to this same URL once the address has been entered.
        return redirect(
            reverse_qs('edit_email', qs={'next': request.get_full_path()}))

    # don't add twice
    duplicates = SearchTask.objects.filter(
        user=user, query=query, deepsearch=deepsearch, is_deleted=False)
    if duplicates.exists():
        return reject('Таке завдання вже існує.')

    task = SearchTask(user=user, query=query, deepsearch=deepsearch)
    task.save()
    first_run(task)

    # A failed notification is only reported as a warning; the task is
    # still reported as created.
    notified = send_newtask_notify(task)
    if not notified:
        messages.warning(
            request,
            'Не вдалось відправити лист на адресу %s' % task.user.email)

    messages.success(request, 'Завдання "%s" створено.' % task.query)
    return redirect('search_list')
def _parse_me(cls, base_fname):
    """Build a declaration document from ``<base_fname>.json`` and ``.html``.

    Returns ``NACPDeclaration(**resp).to_dict(True)``, or ``None`` (after
    printing a message) when either file is missing or cannot be parsed.

    Raises:
        BadHTMLData: the lowercased HTML contains one of
            ``cls.dangerous_chunks``.
        BadJSONData: the JSON has no ``"data"`` key, or ``"data"`` has no
            ``"step_0"`` section.
    """
    json_fname = "{}.json".format(base_fname)
    html_fname = "{}.html".format(base_fname)

    resp = {"intro": {}, "declaration": {}}
    intro = resp["intro"]
    declaration = resp["declaration"]

    try:
        with open(json_fname, "r") as json_fp:
            data = json.load(json_fp)

        with open(html_fname, "r") as html_fp:
            raw_html = html_fp.read()
            html = Selector(raw_html)
    except ValueError:
        print(
            "File {} or it's HTML counterpart cannot be parsed".format(json_fname))
        return None
    except FileNotFoundError:
        print(
            "File {} or it's HTML counterpart cannot be found".format(json_fname))
        return None

    id_ = data.get("id")
    created_date = data.get("created_date")

    lowered_html = raw_html.lower()
    if any(chunk in lowered_html for chunk in cls.dangerous_chunks):
        raise BadHTMLData(
            "Dangerous fragment found: {}, {}".format(id_, base_fname))

    try:
        data = data["data"]
    except KeyError:
        raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

    if "step_0" not in data:
        raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname))

    resp["_id"] = "nacp_{}".format(id_)
    resp["ft_src"] = "\n".join(cls.extract_textual_data(html))
    resp["nacp_orig"] = data

    declaration["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_)
    declaration["source"] = "NACP"
    declaration["basename"] = os.path.basename(base_fname)

    intro["corrected"] = id_ in cls.corrected
    intro["date"] = cls.parse_date(created_date)

    step_0 = data["step_0"]
    # NOTE(review): the nesting of the year checks under the regular
    # declaration branch was reconstructed from flattened source -- confirm.
    if "declarationType" in step_0 and "changesYear" not in step_0:
        intro["doc_type"] = cls.declaration_types[step_0["declarationType"]]

        if "declarationYearTo" in step_0:
            intro["declaration_year_to"] = cls.parse_date(
                step_0["declarationYearTo"])

        if "declarationYearFrom" in step_0:
            intro["declaration_year_from"] = cls.parse_date(
                step_0["declarationYearFrom"])
            intro["declaration_year"] = intro["declaration_year_from"].year

        if "declarationYear1" in step_0:
            intro["declaration_year"] = int(step_0["declarationYear1"])

        # Later keys override earlier ones, but only when non-empty.
        for year_key in ("declarationYear3", "declarationYear4"):
            if year_key in step_0 and step_0[year_key]:
                intro["declaration_year"] = int(step_0[year_key])
    else:
        # No declaration type, or an explicit changesYear: a changes form.
        intro["doc_type"] = "Форма змін"
        if "changesYear" in step_0:
            intro["declaration_year"] = int(step_0["changesYear"])

    step_1 = data["step_1"]
    resp["general"] = {
        "last_name": replace_apostrophes(title(step_1["lastname"])),
        "name": replace_apostrophes(title(step_1["firstname"])),
        "patronymic": replace_apostrophes(title(step_1["middlename"])),
        "full_name": replace_apostrophes("{} {} {}".format(
            title(step_1["lastname"]),
            title(step_1["firstname"]),
            title(step_1["middlename"]),
        )),
        "post": {
            "post": replace_apostrophes(step_1.get("workPost", "")),
            "post_type": replace_apostrophes(step_1.get("postType", "")),
            "office": replace_apostrophes(step_1.get("workPlace", "")),
            "actual_region": replace_apostrophes(
                cls.region_types.get(step_1.get("actual_region", ""), "")),
            "region": replace_apostrophes(
                cls.region_types.get(step_1.get("region", ""), "")),
        },
    }
    general = resp["general"]
    post = general["post"]

    if "step_2" in data:
        family = data["step_2"]
        if isinstance(family, dict):
            # Entries that are not dicts are skipped.
            general["family"] = [
                {
                    "family_name": replace_apostrophes("{} {} {}".format(
                        title(member.get("lastname", "")),
                        title(member.get("firstname", "")),
                        title(member.get("middlename", "")),
                    )),
                    "relations": member.get("subjectRelation", ""),
                }
                for member in family.values()
                if isinstance(member, dict)
            ]

    # get regions from estate list
    for step_key in ("step_3", "step_4"):
        section = data.get(step_key)
        if not (isinstance(section, dict) and section):
            continue

        estates = resp.setdefault("estate", [])
        for item in section.values():
            if isinstance(item, dict) and "region" in item:
                region = replace_apostrophes(
                    cls.region_types.get(item.get("region", ""), ""))
                if region:
                    estates.append({"region": region})

    if "estate" in resp:
        # Add regions decoded from the rendered HTML location table.
        located = html.css(
            "table:contains('Місцезнаходження') td:contains('Населений пункт') span::text"
        ).extract()
        for text in located:
            region = cls.decode_region(text)
            if region:
                resp["estate"].append({"region": region})

    general['full_name_suggest'] = [
        {'input': general['full_name'], 'weight': 5},
        {
            'input': ' '.join([
                general['name'], general['patronymic'], general['last_name']
            ]),
            'weight': 3,
        },
        {
            'input': ' '.join([general['name'], general['last_name']]),
            'weight': 3,
        },
    ]

    general['full_name_for_sorting'] = keyword_for_sorting(general['full_name'])

    # Fall back to the HTML when the JSON gave no region.
    html_fallbacks = (
        ("region",
         "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"),
        ("actual_region",
         "fieldset:contains('Місце фактичного проживання') .person-info:contains('Місто')::text"),
    )
    for field, selector in html_fallbacks:
        if post[field]:
            continue
        region_html = html.css(selector).extract()
        if len(region_html) > 1:
            post[field] = cls.decode_region(region_html[1])

    # If only one of the two regions is set, reuse it for the other one.
    registered, actual = post["region"], post["actual_region"]
    if registered and not actual:
        post["actual_region"] = registered
    elif actual and not registered:
        post["region"] = actual

    resp["index_card"] = concat_fields(resp, NACPDeclaration.INDEX_CARD_FIELDS)

    return NACPDeclaration(**resp).to_dict(True)
def _parse_me(cls, base_fname):
    """Build a declaration document from ``<base_fname>.json`` and ``.html``.

    Args:
        base_fname: path prefix shared by the JSON dump and its HTML
            counterpart (``.json`` / ``.html`` are appended).

    Returns:
        ``NACPDeclaration(**resp).to_dict(True)``, or ``None`` (after
        printing a message) when either file is missing or cannot be parsed.

    Raises:
        BadHTMLData: the lowercased HTML contains one of
            ``cls.dangerous_chunks``.
        BadJSONData: the JSON has no ``"data"`` key, or ``"data"`` has no
            ``"step_0"`` section.
        KeyError: ``step_1`` or one of its ``lastname`` / ``firstname`` /
            ``middlename`` keys is missing.
    """
    json_fname = "{}.json".format(base_fname)
    html_fname = "{}.html".format(base_fname)
    resp = {"intro": {}, "declaration": {}}

    try:
        with open(json_fname, "r") as fp:
            data = json.load(fp)

        with open(html_fname, "r") as fp:
            raw_html = fp.read()
            html = Selector(raw_html)
    except ValueError:
        print("File {} or it's HTML counterpart cannot be parsed".format(
            json_fname))
        return None
    except FileNotFoundError:
        print("File {} or it's HTML counterpart cannot be found".format(
            json_fname))
        return None

    id_ = data.get("id")
    created_date = data.get("created_date")

    raw_html_lowered = raw_html.lower()
    for chunk in cls.dangerous_chunks:
        if chunk in raw_html_lowered:
            raise BadHTMLData("Dangerous fragment found: {}, {}".format(
                id_, base_fname))

    try:
        data = data["data"]
    except KeyError:
        raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

    if "step_0" not in data:
        raise BadJSONData("Bad header format: {}, {}".format(
            id_, base_fname))

    resp["_id"] = "nacp_{}".format(id_)
    resp["nacp_src"] = "\n".join(cls.extract_textual_data(html))
    resp["nacp_orig"] = data

    resp["declaration"]["url"] = (
        "https://public.nazk.gov.ua/declaration/{}".format(id_))
    resp["declaration"]["source"] = "NACP"
    resp["declaration"]["basename"] = os.path.basename(base_fname)

    intro = resp["intro"]
    intro["corrected"] = id_ in cls.corrected
    intro["date"] = cls.parse_date(created_date)

    step_0 = data["step_0"]
    if "declarationType" not in step_0 or "changesYear" in step_0:
        # No declaration type, or an explicit changesYear: a changes form.
        intro["doc_type"] = "Форма змін"
        if "changesYear" in step_0:
            intro["declaration_year"] = int(step_0["changesYear"])
    else:
        intro["doc_type"] = cls.declaration_types[step_0["declarationType"]]

        if "declarationYearTo" in step_0:
            intro["declaration_year_to"] = cls.parse_date(
                step_0["declarationYearTo"])

        if "declarationYearFrom" in step_0:
            intro["declaration_year_from"] = cls.parse_date(
                step_0["declarationYearFrom"])
            intro["declaration_year"] = intro["declaration_year_from"].year

        if "declarationYear1" in step_0:
            intro["declaration_year"] = int(step_0["declarationYear1"])

        # These keys can be present but empty; int("") would raise, so only
        # a non-empty value overrides the year found so far.
        for year_key in ("declarationYear3", "declarationYear4"):
            if year_key in step_0 and step_0[year_key]:
                intro["declaration_year"] = int(step_0[year_key])

    step_1 = data["step_1"]
    resp["general"] = {
        "last_name": replace_apostrophes(title(step_1["lastname"])),
        "name": replace_apostrophes(title(step_1["firstname"])),
        "patronymic": replace_apostrophes(title(step_1["middlename"])),
        "full_name": replace_apostrophes("{} {} {}".format(
            title(step_1["lastname"]),
            title(step_1["firstname"]),
            title(step_1["middlename"]),
        )),
        "post": {
            "post": replace_apostrophes(step_1.get("workPost", "")),
            "office": replace_apostrophes(step_1.get("workPlace", "")),
            # The stored "region" is looked up from the "actual_region" code.
            "region": replace_apostrophes(
                cls.region_types.get(step_1.get("actual_region", ""), "")),
        },
    }
    general = resp["general"]
    post = general["post"]

    if "step_2" in data:
        family = data["step_2"]
        if isinstance(family, dict):
            general["family"] = []
            for member in family.values():
                # Entries that are not dicts are skipped.
                if not isinstance(member, dict):
                    continue

                general["family"].append({
                    "family_name": replace_apostrophes("{} {} {}".format(
                        title(member.get("lastname", "")),
                        title(member.get("firstname", "")),
                        title(member.get("middlename", "")),
                    )),
                    "relations": member.get("subjectRelation", ""),
                })

    general['full_name_suggest'] = [
        {'input': general['full_name'], 'weight': 5},
        {
            'input': ' '.join([
                general['name'], general['patronymic'], general['last_name']
            ]),
            'weight': 3,
        },
        {
            'input': ' '.join([general['name'], general['last_name']]),
            'weight': 3,
        },
    ]

    if not post["region"]:
        # Fall back to the registered-residence line in the HTML.
        region_html = html.css(
            "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"
        ).extract()

        if len(region_html) > 1:
            chunks = region_html[1].split("/")
            if len(chunks) > 1:
                # The region is taken from the second-to-last "/" segment.
                post["region"] = chunks[-2].strip()

            # NOTE(review): the nesting of this mapping step was
            # reconstructed from flattened source; it is applied only to the
            # HTML-derived value here -- confirm.
            region_key = post["region"].lower()
            if region_key in cls.region_mapping:
                post["region"] = cls.region_mapping[region_key]
            else:
                post["region"] = ""

    return NACPDeclaration(**resp).to_dict(True)