def handle(self, *args, **options):
    all_decls = NACPDeclaration.search().query('match_all')

    if options["to"] is not None:
        all_decls = all_decls[options["from"]:options["to"]].execute()
    elif options["from"]:
        all_decls = all_decls[options["from"]:].execute()
    else:
        all_decls = all_decls.scan()

    w = DictWriter(options["outfile"], fieldnames=["_id"] + options["field"])
    w.writeheader()

    for decl in all_decls:
        decl_dict = decl.to_dict()

        row = {
            field: self.fetch_field(decl_dict, field)
            for field in options["field"]
        }
        row["_id"] = decl.meta.id
        w.writerow(row)
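# A sketch of the add_arguments this command implies (assumed, not taken
# from the original file): handle() reads "outfile" as a writable file
# object plus "field", "from" and "to" options, so the argparse wiring
# on the same command class is roughly:
import argparse

def add_arguments(self, parser):
    parser.add_argument("outfile", type=argparse.FileType("w"))
    parser.add_argument("--field", nargs="+", default=[],
                        help="dotted paths of fields to export")
    parser.add_argument("--from", type=int, default=0)
    parser.add_argument("--to", type=int, default=None)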
def handle(self, *args, **options):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
            "exists", field=AGGREGATED_FIELD_NAME)

    if not options["export_all"]:
        to_export = to_export.query(
            "bool",
            must=[Q("term", intro__doc_type="Щорічна")],
            must_not=[Q("exists", field="corrected_declarations")])

    if options["filter_future_declarations"]:
        to_export = to_export.query(
            "range", intro__declaration_year={"lt": datetime.now().year})

    w = None
    with open(options["destination"], "w") as fp:
        for i, d in enumerate(to_export.scan()):
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            row['id'] = d.meta.id

            # Lazily create the writer once the first row defines the columns
            if not w:
                w = DictWriter(fp, fieldnames=row.keys())
                w.writeheader()

            w.writerow(row)

            if i % 10000 == 0 and i:
                self.stdout.write("{} declarations exported".format(i))
def get_raw_data(self, year, order_by, limit=10000):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
            "exists", field=AGGREGATED_FIELD_NAME)

    to_export = to_export.query(
        "bool",
        must=[
            Q("term", intro__doc_type="Щорічна"),
            Q("term", intro__declaration_year=year)
        ],
        must_not=[
            Q("exists", field="corrected_declarations"),
            Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # F*****g Melnytchuk
            Q("term", _id="nacp_c67549d0-abc0-48fe-b529-9185efe1a3ce"),  # F*****g idiots
            Q("term", _id="nacp_2e07bb01-5ca8-4188-97c6-6297f7a4d2ad"),  # F*****g idiots
            Q("term", _id="nacp_f1b25e4d-e691-48d6-99b1-758e94764b91"),  # F*****g Motsyor
            Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
        ]
    ).sort(
        {'{}.{}'.format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
    )[:limit]

    res = []
    for d in to_export.execute():
        row = d[AGGREGATED_FIELD_NAME].to_dict()

        # Drop obviously bogus values (over 10 billion)
        if row[order_by] > 10000000000:
            continue

        row["id"] = d._id
        res.append(row)

    return res
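# Hypothetical call (the aggregated field name "incomes.declarant" is only
# an example): top-100 annual declarations for 2018, sorted descending by
# the aggregated value, with corrected documents and outliers excluded.
rows = self.get_raw_data(2018, "incomes.declarant", limit=100)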
def load_declarations(new_ids, limit=LOAD_DECLS_LIMIT):
    fields = ['meta.id', 'general.*', 'intro.declaration_year']

    if len(new_ids) > limit:
        logger.error("load new_ids %d limit %d exceed", len(new_ids), limit)
        new_ids = new_ids[:limit]

    decl_list = NACPDeclaration.mget(
        new_ids, raise_on_error=False, missing='skip', _source=fields)
    if not decl_list:
        decl_list = []

    # Fall back to the older Declaration index for ids not found among
    # NACP declarations
    if len(decl_list) < len(new_ids):
        add_list = Declaration.mget(
            new_ids, raise_on_error=False, missing='skip', _source=fields)
        if add_list:
            decl_list.extend(add_list)

    if len(decl_list) < len(new_ids):
        logger.error("load new_ids %d docs not found",
                     len(new_ids) - len(decl_list))

    return decl_list
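# Hypothetical usage (the id is a placeholder, not a real document): mget
# skips missing ids, so the result may be shorter than the input list.
decls = load_declarations(["nacp_00000000-0000-0000-0000-000000000000"])
for decl in decls:
    print(decl.meta.id, decl.intro.declaration_year)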
def get_raw_data(self, year, order_by, limit=10000):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
            "exists", field=AGGREGATED_FIELD_NAME)

    to_export = to_export.query(
        "bool",
        must=[
            Q("term", intro__doc_type="Щорічна"),
            Q("term", intro__declaration_year=year)
        ],
        must_not=[
            Q("exists", field="corrected_declarations"),
            Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # F*****g Melnytchuk
            Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
        ]
    ).sort(
        # Build the sort key from AGGREGATED_FIELD_NAME rather than a
        # hardcoded "aggregated." prefix, to stay consistent with the rest
        # of the query
        {'{}.{}'.format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
    )[:limit]

    res = []
    for d in to_export.execute():
        row = d[AGGREGATED_FIELD_NAME].to_dict()
        row["id"] = d._id
        res.append(row)

    return res
def populate_declarant_id(apps, schema_editor):
    Declaration = apps.get_model('landings', 'Declaration')

    for d in Declaration.objects.filter(user_declarant_id__isnull=True):
        d.user_declarant_id = d.source["infocard"].get("user_declarant_id", None)

        # Fall back to the indexed NACP declaration when the infocard
        # carries no user_declarant_id
        if d.user_declarant_id is None:
            es_decl = NACPDeclaration.get(id=d.declaration_id)
            d.user_declarant_id = getattr(es_decl.intro, "user_declarant_id", None)

        d.save()
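# A minimal sketch of how this data migration could be wired up (assumed,
# not from the original file; the dependency name is illustrative):
# populate_declarant_id goes into RunPython, with noop as the reverse.
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("landings", "0001_initial"),  # assumed predecessor migration
    ]

    operations = [
        migrations.RunPython(populate_declarant_id, migrations.RunPython.noop),
    ]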
def handle(self, *args, **options):
    all_decls = (
        NACPDeclaration.search().query("match_all").source([
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1",
        ])
    )

    all_decls = all_decls.filter(
        "range",
        intro__date={
            "gte": date(options["year_since"], 1, 1),
            "lt": datetime.now().replace(
                hour=0, minute=0, second=0, microsecond=0),
        },
    )

    w = DictWriter(
        options["outfile"],
        fieldnames=[
            "id",
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1.postCategory",
            "nacp_orig.step_1.postType",
        ],
    )
    # Write the header row so the export is self-describing
    w.writeheader()

    for decl in tqdm(all_decls.scan(), total=all_decls.count()):
        w.writerow({
            "id": decl.meta.id,
            "declaration.url": decl.declaration.url,
            "intro.date": decl.intro.date.date(),
            "intro.doc_type": decl.intro.doc_type,
            "nacp_orig.step_1.postCategory": getattr(
                decl.nacp_orig.step_1, "postCategory", ""),
            "nacp_orig.step_1.postType": getattr(
                decl.nacp_orig.step_1, "postType", ""),
        })
def handle(self, *args, **options):
    try:
        base_dir = options['file_path']
        corrected_file = options['corrected_file']
    except KeyError:  # options is a dict, so a missing argument raises KeyError
        raise CommandError(
            'First argument must be a path to source files and the second '
            'one the file name of a CSV with corrected declarations')

    self.stdout.write("Gathering JSON documents from {}".format(base_dir))
    self.jsons = list(glob2.glob(os.path.join(base_dir, "**/*.json")))
    self.stdout.write("Gathered {} JSON documents".format(len(self.jsons)))

    corrected = set()
    with open(corrected_file, "r") as fp:
        r = DictReader(fp)
        for l in r:
            corrected.add(l["uuid"])

    DeclarationStaticObj.corrected = corrected

    NACPDeclaration.init()
    counter = 0
    my_tiny_pool = Pool(self.number_of_processes)

    if not options["update_all_docs"]:
        self.stdout.write("Obtaining uuids of already indexed documents")
        s = NACPDeclaration.search().source([])
        existing_guids = set(
            h.meta.id.replace("nacp_", "") for h in s.scan())
        self.stdout.write("{} uuids are currently in index".format(
            len(existing_guids)))

        incoming_files = dict(
            filter(None, my_tiny_pool.map(parse_guid_from_fname, self.jsons)))
        incoming_guids = set(incoming_files.keys())
        self.stdout.write("{} uuids are found in input folder".format(
            len(incoming_guids)))

        # Index only those documents that are not in the index yet
        self.jsons = [
            incoming_files[k] for k in incoming_guids - existing_guids
        ]
        self.stdout.write("{} uuids left after the filtering".format(
            len(self.jsons)))

    for ix in range(0, len(self.jsons), self.chunk_size):
        chunk = self.jsons[ix:ix + self.chunk_size]

        result = list(
            filter(None, my_tiny_pool.map(DeclarationStaticObj.parse, chunk)))

        counter += len(result)
        bulk(self.es, result)

        if ix:
            self.stdout.write(
                'Loaded {} items to persistence storage'.format(ix))

    self.stdout.write(
        'Finished loading {} items to persistence storage'.format(counter))
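# A sketch of what parse_guid_from_fname presumably does (the real helper
# lives elsewhere in the repo): extract the declaration uuid from the file
# name and return a (guid, path) pair, or None so that filter(None, ...)
# above can drop unparseable names.
import os
import re

UUID_RE = re.compile(
    r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}")

def parse_guid_from_fname(json_fname):
    m = UUID_RE.search(os.path.basename(json_fname))
    if m:
        return m.group(0), json_fname
    return None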
def _parse_me(cls, base_fname):
    json_fname = "{}.json".format(base_fname)
    html_fname = "{}.html".format(base_fname)
    resp = {
        "intro": {},
        "declaration": {}
    }

    try:
        with open(json_fname, "r") as fp:
            data = json.load(fp)

        with open(html_fname, "r") as fp:
            raw_html = fp.read()

        html = Selector(raw_html)
    except ValueError:
        print("File {} or its HTML counterpart cannot be parsed".format(json_fname))
        return None
    except FileNotFoundError:
        print("File {} or its HTML counterpart cannot be found".format(json_fname))
        return None

    id_ = data.get("id")
    created_date = data.get("created_date")

    raw_html_lowered = raw_html.lower()
    for chunk in cls.dangerous_chunks:
        if chunk in raw_html_lowered:
            raise BadHTMLData("Dangerous fragment found: {}, {}".format(
                id_, base_fname))

    try:
        data = data["data"]
    except KeyError:
        raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

    if "step_0" not in data:
        raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname))

    resp["_id"] = "nacp_{}".format(id_)
    resp["ft_src"] = "\n".join(cls.extract_textual_data(html))
    resp["nacp_orig"] = data
    resp["declaration"]["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_)
    resp["declaration"]["source"] = "NACP"
    resp["declaration"]["basename"] = os.path.basename(base_fname)

    resp["intro"]["corrected"] = id_ in cls.corrected
    resp["intro"]["date"] = cls.parse_date(created_date)

    if "declarationType" not in data["step_0"] or "changesYear" in data["step_0"]:
        resp["intro"]["doc_type"] = "Форма змін"

        if "changesYear" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["changesYear"])
    else:
        resp["intro"]["doc_type"] = cls.declaration_types[
            data["step_0"]["declarationType"]]

        if "declarationYearTo" in data["step_0"]:
            resp["intro"]["declaration_year_to"] = cls.parse_date(
                data["step_0"]["declarationYearTo"])

        if "declarationYearFrom" in data["step_0"]:
            resp["intro"]["declaration_year_from"] = cls.parse_date(
                data["step_0"]["declarationYearFrom"])
            resp["intro"]["declaration_year"] = resp["intro"]["declaration_year_from"].year

        if "declarationYear1" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear1"])

        if "declarationYear3" in data["step_0"] and data["step_0"]["declarationYear3"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear3"])

        if "declarationYear4" in data["step_0"] and data["step_0"]["declarationYear4"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear4"])

    resp["general"] = {
        "last_name": replace_apostrophes(title(data["step_1"]["lastname"])),
        "name": replace_apostrophes(title(data["step_1"]["firstname"])),
        "patronymic": replace_apostrophes(title(data["step_1"]["middlename"])),
        "full_name": replace_apostrophes("{} {} {}".format(
            title(data["step_1"]["lastname"]),
            title(data["step_1"]["firstname"]),
            title(data["step_1"]["middlename"]),
        )),
        "post": {
            "post": replace_apostrophes(data["step_1"].get("workPost", "")),
            "post_type": replace_apostrophes(data["step_1"].get("postType", "")),
            "office": replace_apostrophes(data["step_1"].get("workPlace", "")),
            "actual_region": replace_apostrophes(
                cls.region_types.get(data["step_1"].get("actual_region", ""), "")),
            "region": replace_apostrophes(
                cls.region_types.get(data["step_1"].get("region", ""), "")),
        }
    }

    if "step_2" in data:
        family = data["step_2"]

        if isinstance(family, dict):
            resp["general"]["family"] = []

            for member in family.values():
                if not isinstance(member, dict):
                    continue

                resp["general"]["family"].append({
                    "family_name": replace_apostrophes("{} {} {}".format(
                        title(member.get("lastname", "")),
                        title(member.get("firstname", "")),
                        title(member.get("middlename", "")),
                    )),
                    "relations": member.get("subjectRelation", "")
                })

    # get regions from estate list
    if "step_3" in data and isinstance(data["step_3"], dict) and data["step_3"]:
        if "estate" not in resp:
            resp["estate"] = []

        for estate in data["step_3"].values():
            if isinstance(estate, dict) and "region" in estate:
                region = replace_apostrophes(
                    cls.region_types.get(estate.get("region", ""), ""))
                if region:
                    resp["estate"].append({"region": region})

    if "step_4" in data and isinstance(data["step_4"], dict) and data["step_4"]:
        if "estate" not in resp:
            resp["estate"] = []

        for estate in data["step_4"].values():
            if isinstance(estate, dict) and "region" in estate:
                region = replace_apostrophes(
                    cls.region_types.get(estate.get("region", ""), ""))
                if region:
                    resp["estate"].append({"region": region})

    if "estate" in resp:
        estate_list = html.css(
            "table:contains('Місцезнаходження') td:contains('Населений пункт') span::text"
        ).extract()

        for estate in estate_list:
            region = cls.decode_region(estate)
            if region:
                resp["estate"].append({"region": region})

    resp['general']['full_name_suggest'] = [
        {
            'input': resp['general']['full_name'],
            'weight': 5
        },
        {
            'input': ' '.join([
                resp['general']['name'],
                resp['general']['patronymic'],
                resp['general']['last_name']
            ]),
            'weight': 3
        },
        {
            'input': ' '.join([
                resp['general']['name'],
                resp['general']['last_name']
            ]),
            'weight': 3
        }
    ]

    resp['general']['full_name_for_sorting'] = keyword_for_sorting(
        resp['general']['full_name'])

    if not resp["general"]["post"]["region"]:
        region_html = html.css(
            "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"
        ).extract()
        if len(region_html) > 1:
            resp["general"]["post"]["region"] = cls.decode_region(region_html[1])

    if not resp["general"]["post"]["actual_region"]:
        region_html = html.css(
            "fieldset:contains('Місце фактичного проживання') .person-info:contains('Місто')::text"
        ).extract()
        if len(region_html) > 1:
            resp["general"]["post"]["actual_region"] = cls.decode_region(region_html[1])

    # if only one region is set, use its value for the other one
    if not resp["general"]["post"]["actual_region"] and resp["general"]["post"]["region"]:
        resp["general"]["post"]["actual_region"] = resp["general"]["post"]["region"]
    elif not resp["general"]["post"]["region"] and resp["general"]["post"]["actual_region"]:
        resp["general"]["post"]["region"] = resp["general"]["post"]["actual_region"]

    resp["index_card"] = concat_fields(resp, NACPDeclaration.INDEX_CARD_FIELDS)

    return NACPDeclaration(**resp).to_dict(True)
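# How _parse_me is likely invoked (an assumed wrapper, not the original):
# the loader above maps DeclarationStaticObj.parse over file names and
# filters out falsy results, so parse presumably strips the extension,
# catches the Bad*Data exceptions raised here, and returns None on failure.
@classmethod
def parse(cls, fname):
    base_fname, _ = os.path.splitext(fname)
    try:
        return cls._parse_me(base_fname)
    except (BadJSONData, BadHTMLData) as e:
        print("Cannot parse {}: {}".format(fname, e))
        return None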
def _parse_me(cls, base_fname):
    json_fname = "{}.json".format(base_fname)
    html_fname = "{}.html".format(base_fname)
    resp = {"intro": {}, "declaration": {}}

    try:
        with open(json_fname, "r") as fp:
            data = json.load(fp)

        with open(html_fname, "r") as fp:
            raw_html = fp.read()

        html = Selector(raw_html)
    except ValueError:
        print("File {} or its HTML counterpart cannot be parsed".format(json_fname))
        return None
    except FileNotFoundError:
        print("File {} or its HTML counterpart cannot be found".format(json_fname))
        return None

    id_ = data.get("id")
    created_date = data.get("created_date")

    raw_html_lowered = raw_html.lower()
    for chunk in cls.dangerous_chunks:
        if chunk in raw_html_lowered:
            raise BadHTMLData("Dangerous fragment found: {}, {}".format(
                id_, base_fname))

    try:
        data = data["data"]
    except KeyError:
        raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

    if "step_0" not in data:
        raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname))

    resp["_id"] = "nacp_{}".format(id_)
    resp["nacp_src"] = "\n".join(cls.extract_textual_data(html))
    resp["nacp_orig"] = data
    resp["declaration"]["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_)
    resp["declaration"]["source"] = "NACP"
    resp["declaration"]["basename"] = os.path.basename(base_fname)

    resp["intro"]["corrected"] = id_ in cls.corrected
    resp["intro"]["date"] = cls.parse_date(created_date)

    if "declarationType" not in data["step_0"] or "changesYear" in data["step_0"]:
        resp["intro"]["doc_type"] = "Форма змін"

        if "changesYear" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["changesYear"])
    else:
        resp["intro"]["doc_type"] = cls.declaration_types[
            data["step_0"]["declarationType"]]

        if "declarationYearTo" in data["step_0"]:
            resp["intro"]["declaration_year_to"] = cls.parse_date(
                data["step_0"]["declarationYearTo"])

        if "declarationYearFrom" in data["step_0"]:
            resp["intro"]["declaration_year_from"] = cls.parse_date(
                data["step_0"]["declarationYearFrom"])
            resp["intro"]["declaration_year"] = resp["intro"]["declaration_year_from"].year

        if "declarationYear1" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear1"])

        if "declarationYear3" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear3"])

        if "declarationYear4" in data["step_0"]:
            resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear4"])

    resp["general"] = {
        "last_name": replace_apostrophes(title(data["step_1"]["lastname"])),
        "name": replace_apostrophes(title(data["step_1"]["firstname"])),
        "patronymic": replace_apostrophes(title(data["step_1"]["middlename"])),
        "full_name": replace_apostrophes("{} {} {}".format(
            title(data["step_1"]["lastname"]),
            title(data["step_1"]["firstname"]),
            title(data["step_1"]["middlename"]),
        )),
        "post": {
            "post": replace_apostrophes(data["step_1"].get("workPost", "")),
            "office": replace_apostrophes(data["step_1"].get("workPlace", "")),
            "region": replace_apostrophes(
                cls.region_types.get(data["step_1"].get("actual_region", ""), "")),
        }
    }

    if "step_2" in data:
        family = data["step_2"]

        if isinstance(family, dict):
            resp["general"]["family"] = []

            for member in family.values():
                if not isinstance(member, dict):
                    continue

                resp["general"]["family"].append({
                    "family_name": replace_apostrophes("{} {} {}".format(
                        title(member.get("lastname", "")),
                        title(member.get("firstname", "")),
                        title(member.get("middlename", "")),
                    )),
                    "relations": member.get("subjectRelation", "")
                })

    resp['general']['full_name_suggest'] = [
        {
            'input': resp['general']['full_name'],
            'weight': 5
        },
        {
            'input': ' '.join([
                resp['general']['name'],
                resp['general']['patronymic'],
                resp['general']['last_name']
            ]),
            'weight': 3
        },
        {
            'input': ' '.join([
                resp['general']['name'],
                resp['general']['last_name']
            ]),
            'weight': 3
        }
    ]

    if not resp["general"]["post"]["region"]:
        region_html = html.css(
            "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"
        ).extract()

        if len(region_html) > 1:
            chunks = region_html[1].split("/")
            if len(chunks) > 1:
                resp["general"]["post"]["region"] = chunks[-2].strip()

        # Normalize the extracted value through the region mapping;
        # blank it out when it cannot be recognized
        if resp["general"]["post"]["region"].lower() in cls.region_mapping:
            resp["general"]["post"]["region"] = cls.region_mapping[
                resp["general"]["post"]["region"].lower()]
        else:
            resp["general"]["post"]["region"] = ""

    return NACPDeclaration(**resp).to_dict(True)
def pull_declarations(self):
    def get_search_clause(kwd):
        if "область" not in kwd:
            return Q(
                "multi_match",
                query=kwd,
                operator="or",
                minimum_should_match=1,
                fields=[
                    "general.post.region",
                    "general.post.office",
                    "general.post.post",
                    "general.post.actual_region",
                ],
            )
        else:
            return Q(
                "multi_match",
                query=kwd,
                fields=[
                    "general.post.region",
                    "general.post.actual_region"
                ],
            )

    search_clauses = [
        get_search_clause(x)
        for x in filter(None, map(str.strip, self.body.keywords.split("\n")))
    ]

    q = "{} {}".format(self.name, self.extra_keywords)

    if search_clauses:
        for sc in search_clauses:
            first_pass = (
                NACPDeclaration.search().query(
                    "bool",
                    must=[
                        Q("match",
                          general__full_name={
                              "query": q,
                              "operator": "and"
                          })
                    ],
                    should=[sc],
                    minimum_should_match=1,
                )[:100].execute())

            if first_pass:
                break
    else:
        first_pass = (
            NACPDeclaration.search().query(
                "bool",
                must=[
                    Q("match",
                      general__full_name={
                          "query": q,
                          "operator": "and"
                      })
                ],
            )[:100].execute())

    Declaration.objects.create_declarations(self, first_pass)

    user_declarant_ids = set(
        filter(
            None,
            self.declarations.exclude(exclude=True).values_list(
                "user_declarant_id", flat=True),
        ))

    if user_declarant_ids:
        second_pass = NACPDeclaration.search().filter(
            "terms", **{"intro.user_declarant_id": list(user_declarant_ids)})
        second_pass = second_pass.execute()

    if not user_declarant_ids or not second_pass:
        obj_ids_to_find = set(
            chain(*self.declarations.exclude(exclude=True).values_list(
                "obj_ids", flat=True)))

        second_pass = NACPDeclaration.search().query(
            "bool",
            must=[
                Q("match",
                  general__full_name={
                      "query": q,
                      "operator": "or"
                  }),
                Q("match", obj_ids=" ".join(list(obj_ids_to_find)[:512])),
            ],
            should=[],
            minimum_should_match=0,
        )[:100]
        second_pass = second_pass.execute()

    Declaration.objects.create_declarations(self, second_pass)
def handle(self, *args, **options):
    corrected = NACPDeclaration.search().filter("term", intro__corrected=True)

    cntr = 0
    success_rate = 0

    for i, d in enumerate(corrected.scan()):
        must = [
            ConstantScore(
                query=Q(
                    "multi_match",
                    query=d.general.full_name,
                    operator="and",
                    fields=[
                        "general.last_name",
                        "general.name",
                        "general.patronymic",
                        "general.full_name",
                    ],
                ),
                boost=10)
        ]

        should = [
            ConstantScore(
                query=Q(
                    "match",
                    general__post__post={
                        "query": d.general.post.post,
                        "minimum_should_match": "50%"
                    },
                ),
                boost=2),
            ConstantScore(
                query=Q(
                    "match",
                    general__post__office={
                        "query": d.general.post.office,
                        "minimum_should_match": "50%"
                    },
                ),
                boost=2),
            ConstantScore(
                query=Q(
                    "match",
                    general__post__region={
                        "query": d.general.post.region.replace(" область", ""),
                        "minimum_should_match": "60%"
                    },
                ),
                boost=1)
        ]

        for fam in getattr(d.general, "family", []):
            should.append(
                ConstantScore(
                    query=Q(
                        "multi_match",
                        query=fam.family_name,
                        operator="and",
                        fields=["general.family.family_name"]),
                    boost=2))

        candidates = NACPDeclaration.search() \
            .query(
                FunctionScore(
                    query=Q("bool", must=must, should=should),
                    score_mode="sum"
                )
            ) \
            .filter("term", intro__declaration_year=d.intro.declaration_year) \
            .query(~Q('term', _id=d.meta.id)) \
            .filter("term", intro__corrected=False) \
            .query(
                ConstantScore(
                    query=Q("term", intro__doc_type=d.intro.doc_type),
                    boost=0
                )
            )

        if options["store_matches"]:
            candidates = candidates \
                .highlight_options(
                    order='score',
                    fragment_size=500,
                    number_of_fragments=100,
                    pre_tags=['||!'],
                    post_tags=["||"]) \
                .highlight(
                    "general.full_name",
                    "general.post.region",
                    "general.post.office",
                    "general.post.post",
                    "general.family.family_name")

        candidates = candidates.execute()

        success = self.store_example(
            d,
            candidates,
            debug=options["debug"],
            store_matches=options["store_matches"])

        if success:
            success_rate += 1

        cntr += 1
        if cntr and cntr % 5000 == 0:
            self.stdout.write("%s declarations processed, SR: %s%%" %
                              (cntr, success_rate / cntr * 100))

    # Guard against division by zero when no corrected declarations were found
    if cntr:
        self.stdout.write("%s declarations processed, SR: %s%%" %
                          (cntr, success_rate / cntr * 100))

    if options["store_matches"]:
        self.save_to_excel(options["store_matches"])
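# Minimal illustration of the additive scoring used above (the names and
# query strings here are made up for the example): each ConstantScore
# clause contributes a fixed boost when it matches, so the bool query's
# score behaves like a weighted vote over independent pieces of evidence.
from elasticsearch_dsl.query import Q, ConstantScore

name_clause = ConstantScore(
    query=Q("match", general__full_name="Тест Тестович"), boost=10)
region_clause = ConstantScore(
    query=Q("match", general__post__region="Київська"), boost=1)

# A hit matching both clauses scores ~11; the name alone scores ~10,
# which is why the full-name clause dominates the ranking.
evidence_query = Q("bool", must=[name_clause], should=[region_clause])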