def get_context_data(self, **kwargs):
    """Assemble the template context for one office page: document counts,
    rendered chart fragments, region/parent names and income comparison."""
    context = super().get_context_data(**kwargs)
    office_id = self.object.id
    self.office = RUSSIA.get_office(office_id)
    self.office_stats = RUSSIA.calc_data_current.office_stats.get_group_data(office_id)

    # Human-readable region name; empty when the office has no region.
    if self.office.region_id is None:
        region_name = ""
    else:
        region_name = RUSSIA.regions.get_region_by_id(self.office.region_id).name

    # (id, name) pairs for a handful of child offices.
    child_examples = [(child_id, RUSSIA.get_office(child_id).name)
                      for child_id in self.office_stats.child_office_examples]

    parent_office_name = ""
    if self.office.parent_id is not None:
        parent_office_name = RUSSIA.get_office(self.office.parent_id).name

    rubric_str = "unknown"
    if self.office.rubric_id is not None:
        rubric_str = get_russian_rubric_str(self.office.rubric_id)

    context.update({
        'source_document_count': self.office_stats.source_document_count,
        'region_name': region_name,
        'source_document_count_html': self.get_source_doc_html(),
        'child_offices_count': self.office_stats.child_offices_count,
        'section_count_html': self.section_count_html(),
        'section_count_by_years_html': self.section_count_by_years_html(),
        'median_income_by_years_html': self.median_income_by_years_html(),
        'child_office_examples': child_examples,
        'office_in_memory': self.office,
        'parent_office_name': parent_office_name,
        "rubric_str": rubric_str,
        "income_comparison": self.comparison_to_population(),
    })
    return context
def build_section_incomes(self):
    """Aggregate declarant incomes per office and per rubric.

    Reads all main-declarant incomes in [income_stat_start_year, last_year]
    below self.max_income from the database, filters out implausible values
    and returns a tuple (office_stats, rubric_stats), both TAllGroupIncomeStats.
    """
    # Fix: use driver placeholders instead of str.format so the values are
    # quoted/escaped by the database driver (the relative code is a string).
    query = """
        select o.id, s.income_year, i.size, s.person_id
        from declarations_section s
        join declarations_office o on s.office_id=o.id
        join declarations_income i on i.section_id=s.id
        join declarations_source_document d on s.source_document_id=d.id
        where s.income_year >= %s
             and s.income_year <= %s
             and i.size < %s
             and i.size > 50000
             and s.person_id is not null
             and d.median_income > 10000
             and i.relative = %s
        order by o.id
    """
    office_stats = TAllGroupIncomeStats()
    rubric_stats = TAllGroupIncomeStats()
    with connection.cursor() as cursor:
        cursor.execute(query, [self.income_stat_start_year, self.last_year,
                               self.max_income, models.Relative.main_declarant_code])
        # rows are ordered by office id, so groupby yields one group per office
        for office_id, office_items in groupby(cursor, itemgetter(0)):
            rubric_id = RUSSIA.get_office(office_id).rubric_id
            for _, year, income, person_id in office_items:
                # drop yearly incomes below 12 monthly minimum wages (MROT):
                # presumably part-year or erroneous records — TODO confirm
                if income / 12 < RUSSIA.get_mrot(year):
                    continue
                office_stats.add_income(office_id, person_id, year, income)
                rubric_stats.add_income(rubric_id, person_id, year, income)
    return office_stats, rubric_stats
def get_fsin_office_id(self, section_json, src_doc: TSourceDocument):
    """Resolve the regional FSIN office for a section from its department
    string, falling back to the document-level calculated office."""
    person_info = section_json.get('person', dict())
    department = person_info.get('department')
    # a very short department string cannot identify a region reliably
    if department is None or len(department) < 5:
        return src_doc.calculated_office_id
    region_id = self.regions.get_region_all_forms(
        department, TRussianRegions.Russia_as_s_whole_region_id)
    return RUSSIA.get_fsin_by_region(region_id)
def build_name_ngrams(self):
    """Index office names: bigram -> set of office ids and word stem ->
    set of office ids, plus a compact per-office squeeze record."""
    self.logger.info("build bigrams")
    bigram_index = defaultdict(set)
    stem_index = defaultdict(set)
    self.office_squeezes = dict()
    office: TOfficeInMemory
    for office in RUSSIA.iterate_offices():
        self.office_squeezes[office.office_id] = {
            'name': office.name,
            # a missing region is stored as 0
            'region': 0 if office.region_id is None else office.region_id,
            'parent_id': office.parent_id
        }
        for bigram in self.get_bigrams(office.name):
            bigram_index[bigram].add(office.office_id)
        stems = TOfficePredictIndex.get_word_stems(office.name, add_starter_and_enders=False)
        for stem in stems:
            stem_index[stem].add(office.office_id)
    self.office_name_bigrams = self.ngrams_from_default_dict(bigram_index)
    self.logger.info("bigrams count = {}".format(self.get_bigrams_count()))
    self.office_name_unigrams = self.ngrams_from_default_dict(stem_index, 3)
    self.logger.info("unigrams count = {}".format(self.get_unigrams_count()))
def build_regional_tax_offices(self):
    """Return a dict region_id -> office_id of the tax office in that region."""
    office: TOfficeInMemory
    tax_offices = {
        office.region_id: office.office_id
        for office in RUSSIA.iterate_offices()
        if office.rubric_id == TOfficeRubrics.Tax
    }
    assert len(tax_offices) > 0
    return tax_offices
def init_rubric(self):
    """Set the section's rubric from the in-memory office catalogue.

    Reading source_document_in_db.office.rubric_id directly does not work
    here (may be we should call source_document_in_db.refresh_from_db),
    hence the explicit RUSSIA lookup.
    """
    office_id = self.section.office.id
    self.section.rubric_id = RUSSIA.get_office(office_id).rubric_id
    # municipality sections with an education-related position are reclassified
    is_municipality = self.section.rubric_id == TOfficeRubrics.Municipality
    if is_municipality and TOfficeTableInMemory.convert_municipality_to_education(self.section.position):
        self.section.rubric_id = TOfficeRubrics.Education
def build_declarant_incomes(self, year, max_income=5000000) -> TAllRegionStatsForOneYear:
    """Build per-region declarant income statistics for one income year.

    :param year: income year to aggregate
    :param max_income: upper cutoff for a single yearly income (outlier filter)
    :return: TAllRegionStatsForOneYear with one TRegionYearStats per region
    :raises Exception: when Rosstat data for a region cannot be found or predicted
    """
    region_data = TAllRegionStatsForOneYear(
        year, file_name=self.options.get('output_json'))
    # office id 450 is excluded from the sample (named "minOborony" —
    # presumably the Ministry of Defence; confirm against the office table)
    minOboronyId = 450
    query = """
             select o.region_id, i.size
             from declarations_section s
             join declarations_office o on s.office_id=o.id
             join declarations_income i on i.section_id=s.id
             where s.income_year = {}
                 and i.size < {}
                 and i.size > 0
                 and i.relative='{}'
                 and o.id != {}
                 and o.region_id is not null
                 and o.region_id != {}
             order by o.region_id, i.size
        """.format(year, max_income, models.Relative.main_declarant_code,
                   minOboronyId, TRussianRegions.Russia_as_s_whole_region_id)
    regions = TRussianRegions()
    mrot = RUSSIA.get_mrot(year)  # monthly minimum wage for that year
    assert mrot is not None
    with connection.cursor() as cursor:
        cursor.execute(query)
        # rows are ordered by region_id, so groupby yields one group per region
        for region_id, items in groupby(cursor, itemgetter(0)):
            # keep only yearly incomes above 12 monthly minimum wages
            incomes = list(income for _, income in items if income / 12 > mrot)
            if region_id == TRussianRegions.Baikonur:
                continue
            region = regions.get_region_by_id(region_id)
            if region.joined_to is not None:
                # merged regions are accounted under the region they joined
                region = regions.get_region_by_id(region.joined_to)
            stat_info = region_data.ross_stat.get_data(region.id, year)
            if stat_info is None:
                raise Exception(
                    "cannot find stat_info for region.id={}, region.name={}"
                    .format(region.id, region.name))
            population = stat_info.population
            population_median_income = region_data.ross_stat.get_or_predict_median_salary(
                region.id, year)
            if population_median_income is None:
                raise Exception(
                    "cannot estimate population median_income for region.id={}, region.name={}"
                    .format(region.id, region.name))
            # NOTE(review): get_data(region.id, 2021) may return None like
            # the year-specific lookup above — confirm 2021 data always exists
            s = TRegionYearStats(
                region.id, region.name, incomes, population_median_income,
                population,
                region_data.ross_stat.get_data(region.id, 2021).er_election_2021)
            region_data.add_snapshot(s)
    region_data.calc_aux_params()
    return region_data
def build_offices_sitemap(self):
    """Write sitemap-office.xml listing offices with more than 10 source documents."""
    self.logger.info("build_offices_sitemaps")
    sitemap_path = os.path.join(
        os.path.dirname(__file__),
        "../../../disclosures/static/sitemap-office.xml")
    url_paths = list()
    for office in RUSSIA.iterate_offices():
        stats = RUSSIA.calc_data_current.office_stats.get_group_data(office.office_id)
        if stats is None:
            continue
        doc_cnt = stats.source_document_count
        if doc_cnt is not None and doc_cnt > 10:
            url_paths.append("office/{}".format(office.office_id))
    self.write_sitemap(url_paths, sitemap_path, priority=0.4)
    self.sitemaps.append(os.path.basename(sitemap_path))
def build_aux_office_params(self, office_data: TGroupStatDataList):
    """Fill auxiliary per-office statistics in office_data:
    per-year declarant counts, source document counts, child-office info,
    total section count and communicating web-site urls.

    Fix: removed the unused local ``params = defaultdict(dict)``.
    """
    # ignore self.income_stat_start_year
    query = """
            select o.id, min(s.income_year), count(s.id)
            from declarations_office o
            join declarations_section s on s.office_id = o.id
            where s.income_year >= 2009 and s.income_year <= {}
            group by o.id, s.income_year
    """.format(self.last_year)
    # NOTE(review): since the query also groups by s.income_year,
    # min(s.income_year) is simply the group's income_year.
    with connection.cursor() as cursor:
        self.logger.info("execute {}".format(query.replace("\n", " ")))
        cursor.execute(query)
        self.logger.info("read data")
        for office_id, income_year, section_count in cursor:
            ys = office_data.get_or_create_group_data(
                office_id).get_or_create_year_snapshot(income_year)
            ys.declarants_count = section_count

    query = """
            select o.id, count(distinct d.id)
            from declarations_office o
            join declarations_section s on s.office_id = o.id
            join declarations_source_document d on d.id = s.source_document_id
            group by o.id
    """
    with connection.cursor() as cursor:
        self.logger.info("execute {}".format(query.replace("\n", " ")))
        cursor.execute(query)
        for office_id, cnt in cursor:
            oi = office_data.get_or_create_group_data(office_id)
            oi.source_document_count = cnt

    offices = RUSSIA.offices_in_memory
    child_offices = offices.get_child_offices_dict()
    for office in RUSSIA.iterate_offices():
        office_id = office.office_id
        oi = office_data.get_or_create_group_data(office_id)
        # NOTE(review): child examples are filled only for offices that
        # themselves have a parent — confirm this gating is intended
        if office.parent_id is None:
            oi.child_office_examples = list()
        else:
            oi.child_office_examples = list(
                c.office_id for c in child_offices[office_id][:5])
        oi.child_offices_count = len(child_offices[office_id])
        oi.section_count = sum(s.declarants_count for s in oi.year_snapshots.values())
        oi.urls = list(x.url for x in office.office_web_sites if x.can_communicate())
def build_v2(self):
    """Median income-growth ratio over all consecutive-year income pairs.

    Returns (median_ratio, pair_count), or (None, None) when no pair qualifies.
    """
    ratios = list()
    for person_id, incomes in self.incomes_by_person.items():
        incomes.sort()
        # walk adjacent pairs of the year-sorted income list
        for first, second in zip(incomes, incomes[1:]):
            if first.year + 1 != second.year:
                continue
            cmp_result = RUSSIA.get_average_nominal_incomes([first, second])
            if cmp_result is not None:
                ratios.append(cmp_result.compare_to_all_people_income())
    if not ratios:
        return None, None
    return round(median(ratios), 2), len(ratios)
def distribute_offices_to_processes(self, process_count):
    """Spread offices over process buckets: all FSIN (Gulag rubric) offices
    go to bucket 0, the rest are distributed roughly evenly over all buckets."""
    assert process_count > 1
    for cnt, office_id in enumerate(self.office_to_source_documents.keys(), start=1):
        if RUSSIA.get_office(office_id).rubric_id == TOfficeRubrics.Gulag:
            # put all fsin offices to the first process
            bucket_id = 0
        elif len(self.office_buckets[0]) > cnt / process_count:
            # bucket 0 already holds more than its fair share,
            # so route this office to one of the other buckets
            bucket_id = cnt % (process_count - 1) + 1
        else:
            bucket_id = cnt % process_count
        self.office_buckets[bucket_id].append(office_id)
    for bucket_index in self.office_buckets.keys():
        self.logger.debug("bucket[{}] size = {}".format(
            bucket_index, len(self.office_buckets[bucket_index])))
def gen_documents(self):
    """Yield one Elasticsearch bulk-index action per office."""
    for office in RUSSIA.iterate_offices():
        stats = RUSSIA.calc_data_current.office_stats.get_group_data(office.office_id)
        # offices without stats are indexed with a zero document count
        doc_cnt = stats.source_document_count if stats is not None else 0
        yield {
            "_id": office.office_id,
            "_index": self.index_name,
            "_source": {
                'id': office.office_id,
                'name': office.name,
                'parent_id': office.parent_id,
                'source_document_count': doc_cnt,
                'rubric_id': office.rubric_id,
                'region_id': office.region_id
            }
        }
def import_office(self, office_id):
    """Import all source documents of one office into the database.

    Skips the whole office when a rubric filter is set in self.args and the
    office's rubric does not match.  Documents are imported in the order of
    their old source-document ids; documents unknown to the permalinks db
    are sorted last.
    """
    if self.args.get('rubric_id') is not None and RUSSIA.get_office(
            office_id).rubric_id != self.args.get('rubric_id'):
        return
    all_imported_human_jsons = set()
    # sort key for documents without an old id: larger than any real id
    max_doc_id = 2 ** 32
    ordered_documents = list()
    for sha256 in self.office_to_source_documents[office_id]:
        doc_id = self.permalinks_db_source_document.get_old_source_doc_id_by_sha256(
            sha256)
        if doc_id is None:
            doc_id = max_doc_id
        ordered_documents.append((doc_id, sha256))
    ordered_documents.sort()
    TImporter.logger.debug("import office {} document count = {}".format(
        office_id, len(ordered_documents)))
    for _, sha256 in ordered_documents:
        src_doc = self.dlrobot_human.get_document(sha256)
        assert src_doc.calculated_office_id == office_id
        smart_parser_json = self.get_smart_parser_json(
            all_imported_human_jsons, sha256, src_doc)
        # the document record is registered even when its json is unusable
        doc_file_in_db = self.register_document_in_database(
            sha256, src_doc)
        if smart_parser_json is None:
            self.logger.debug(
                "file {} has no valid smart parser json, skip it".format(
                    sha256))
        else:
            try:
                sections_count = self.import_one_smart_parser_json(
                    doc_file_in_db, smart_parser_json, src_doc)
                TImporter.logger.debug("import {} sections from {}".format(
                    sections_count, sha256))
            except TSmartParserSectionJson.SerializerException as exp:
                # a failed document is logged and skipped; the rest of the
                # office's documents are still imported
                TImporter.logger.error(
                    "Error! cannot import smart parser json for file {}: {} "
                    .format(sha256, exp))
def build_ml_office_indices(self):
    """Build bidirectional mappings between dense ML indices and office ids."""
    self.ml_office_id_2_office_id = dict()
    self.office_id_2_ml_office_id = dict()
    for ml_index, office_id in enumerate(RUSSIA.iterate_offices_ids()):
        self.ml_office_id_2_office_id[ml_index] = office_id
        self.office_id_2_ml_office_id[office_id] = ml_index
    self.logger.info("target office count = {}".format(len(self.office_id_2_ml_office_id)))
def import_one_smart_parser_json(self, source_document_in_db, input_json, src_doc: TSourceDocument):
    """Import all declarant sections of one smart-parser json into the db.

    Each section is saved in its own transaction; failures are logged and
    skipped.  Also updates the source document's year range, section count
    and median main-declarant income.  Returns the number of imported sections.
    """
    imported_section_years = list()
    section_index = 0
    TImporter.logger.debug("try to import {} declarants".format(
        len(input_json['persons'])))
    incomes = list()
    # FSIN (Gulag rubric) documents mix sections of several regional offices,
    # so the office is resolved per section below
    is_fsin = RUSSIA.get_office(
        src_doc.calculated_office_id).rubric_id == TOfficeRubrics.Gulag
    for raw_section in input_json['persons']:
        section_index += 1
        section_income_year = self.calc_income_year(
            input_json, src_doc, raw_section, section_index)
        if is_fsin:
            office_id = self.get_fsin_office_id(raw_section, src_doc)
        else:
            office_id = src_doc.calculated_office_id
        with transaction.atomic():
            try:
                prepared_section = TSmartParserSectionJson(
                    section_income_year, office_id, source_document_in_db)
                prepared_section.read_raw_json(raw_section)
                # sections with implausibly many vehicles are treated as parse noise
                if len(prepared_section.vehicles) > TImporter.max_vehicle_count:
                    TImporter.logger.debug(
                        "ignore section {} because it has too many vehicles ( > {})"
                        .format(prepared_section.section.person_name,
                                TImporter.max_vehicle_count))
                    continue
                passport1 = prepared_section.get_passport_components1(
                    ).get_main_section_passport()
                # register_section_passport returns False for duplicates
                if self.register_section_passport(passport1):
                    prepared_section.section.tmp_income_set = prepared_section.incomes
                    passport2 = prepared_section.get_passport_components2(
                        ).get_main_section_passport()
                    section_id, is_new = self.permalinks_db_section.get_section_id(
                        passport1, passport2)
                    if is_new:
                        TImporter.logger.debug(
                            "found a new section {}, set section.id to {}".format(
                                prepared_section.section.get_permalink_passport(),
                                section_id))
                    main_income = prepared_section.get_main_declarant_income_size()
                    if main_income is not None and main_income > 0:
                        incomes.append(main_income)
                    prepared_section.save_to_database(section_id)
                    imported_section_years.append(section_income_year)
            except (DatabaseError, TSmartParserSectionJson.SerializerException) as exp:
                TImporter.logger.error(
                    "Error! cannot import section N {}: {} ".format(
                        section_index, exp))
    if len(imported_section_years) > 0:
        source_document_in_db.min_income_year = min(imported_section_years)
        source_document_in_db.max_income_year = max(imported_section_years)
        source_document_in_db.section_count = len(imported_section_years)
    median_income = 0
    if len(incomes) > 0:
        median_income = median(incomes)
    # guard: value must fit the (presumably signed 32-bit) db column — confirm
    if median_income >= 2 ** 31:
        median_income = 0
    source_document_in_db.median_income = median_income
    source_document_in_db.save()
    return len(imported_section_years)
def comparison_to_population(self):
    """Compare this office's median yearly incomes with population incomes."""
    incomes = sorted(
        TYearIncome(year, snapshot.median_year_income)
        for year, snapshot in self.office_stats.year_snapshots.items())
    return RUSSIA.get_average_nominal_incomes(incomes)
def test_nominal_income(self):
    """Regression tests for RUSSIA.get_average_nominal_incomes: year-scope
    filtering, zero/low-income filtering and growth percentage values."""
    # out of year scope
    self.assertIsNone(
        RUSSIA.get_average_nominal_incomes(
            [TYearIncome(2008, 1), TYearIncome(2009, 2)]))

    # one year is not enough
    self.assertIsNone(
        RUSSIA.get_average_nominal_incomes([TYearIncome(2015, 1)]))

    # two years
    comp = RUSSIA.get_average_nominal_incomes(
        [TYearIncome(2015, 1000000), TYearIncome(2016, 2000000)])
    self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
    self.assertAlmostEqual(comp.population_income_growth, 2)
    self.assertEqual(comp.min_year, 2015)
    self.assertEqual(comp.max_year, 2016)

    # 3 years
    comp = RUSSIA.get_average_nominal_incomes([
        TYearIncome(2015, 1000000),
        TYearIncome(2016, 1500000),
        TYearIncome(2017, 2000000)
    ])
    self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
    self.assertAlmostEqual(comp.population_income_growth, 5, places=3)
    self.assertEqual(comp.min_year, 2015)
    self.assertEqual(comp.max_year, 2017)

    # 2040 year is ignored
    comp = RUSSIA.get_average_nominal_incomes([
        TYearIncome(2015, 1000000),
        TYearIncome(2016, 1500000),
        TYearIncome(2017, 2000000),
        TYearIncome(2040, 30000000)
    ])
    self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
    self.assertAlmostEqual(comp.population_income_growth, 5, places=3)
    self.assertEqual(comp.min_year, 2015)
    self.assertEqual(comp.max_year, 2017)

    # 1990 year is ignored
    comp = RUSSIA.get_average_nominal_incomes([
        TYearIncome(1990, 100000000),
        TYearIncome(2015, 1000000),
        TYearIncome(2016, 1500000),
        TYearIncome(2017, 2000000)
    ])
    self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
    self.assertAlmostEqual(comp.population_income_growth, 5, places=3)  # 4.5% growth
    self.assertEqual(comp.min_year, 2015)
    self.assertEqual(comp.max_year, 2017)

    # zero income is ignored
    comp = RUSSIA.get_average_nominal_incomes([
        TYearIncome(2015, 0),
        TYearIncome(2016, 1500000),
        TYearIncome(2017, 2000000)
    ])
    self.assertAlmostEqual(comp.declarant_income_growth, 33, places=3)  # 33% growth
    self.assertAlmostEqual(comp.population_income_growth, 3, places=3)  # 3% growth of 2017
    self.assertEqual(comp.min_year, 2016)
    self.assertEqual(comp.max_year, 2017)

    # incomes less than 12*MROT are ignored
    incomes = [
        TYearIncome(2012, 600),
        TYearIncome(2013, 189744),
        TYearIncome(2019, 407711)
    ]
    comp = RUSSIA.get_average_nominal_incomes(incomes)
    self.assertAlmostEqual(comp.declarant_income_growth, 114, places=3)
    self.assertAlmostEqual(comp.population_income_growth, 37, places=3)
    self.assertEqual(comp.min_year, 2013)
    self.assertEqual(comp.max_year, 2019)

    # real example 1: sparse years (2012, 2013, 2017, 2019)
    incomes = [
        TYearIncome(2012, 1693027),
        TYearIncome(2013, 2790949),
        TYearIncome(2017, 4993935),
        TYearIncome(2019, 6241840)
    ]
    comp = RUSSIA.get_average_nominal_incomes(incomes)
    self.assertAlmostEqual(comp.declarant_income_growth, 268, places=3)
    self.assertAlmostEqual(comp.population_income_growth, 52, places=3)
    self.assertEqual(comp.min_year, 2012)
    self.assertEqual(comp.max_year, 2019)

    # real example 2: nearly flat income over 8 consecutive years
    incomes = [
        TYearIncome(2012, 783050),
        TYearIncome(2013, 819684),
        TYearIncome(2014, 692259),
        TYearIncome(2015, 736241),
        TYearIncome(2016, 780312),
        TYearIncome(2017, 817646),
        TYearIncome(2018, 817078),
        TYearIncome(2019, 886266)
    ]
    comp = RUSSIA.get_average_nominal_incomes(incomes)
    self.assertAlmostEqual(comp.declarant_income_growth, 13, places=3)
    self.assertAlmostEqual(comp.population_income_growth, 52, places=3)
    self.assertEqual(comp.min_year, 2012)
    self.assertEqual(comp.max_year, 2019)

    # real example 3: big first-year jump, then slight decline
    incomes = [
        TYearIncome(2012, 297096),
        TYearIncome(2013, 856340),
        TYearIncome(2014, 820063),
        TYearIncome(2015, 730649),
        TYearIncome(2016, 706835)
    ]
    comp = RUSSIA.get_average_nominal_incomes(incomes)
    self.assertAlmostEqual(comp.declarant_income_growth, 137, places=3)
    self.assertAlmostEqual(comp.population_income_growth, 32, places=3)
    self.assertEqual(comp.min_year, 2012)
    self.assertEqual(comp.max_year, 2016)
def income_growth_yearly(self):
    """Yearly income series of this person compared with population incomes."""
    incomes = [
        TYearIncome(section.income_year, section.get_declarant_income_size())
        for section in self.sections_ordered_by_year
    ]
    return RUSSIA.get_average_nominal_incomes(incomes)