예제 #1
0
 def get_context_data(self, **kwargs):
     """Build the template context for the office page.

     Starts from the parent class context, looks up the office identified by
     ``self.object.id`` and its precomputed statistics in ``RUSSIA``, stores
     both on the instance (``self.office``, ``self.office_stats``) and adds
     the office-specific entries to the context.

     :param kwargs: passed through unchanged to the parent implementation.
     :return: the parent context updated with the office entries.
     """
     context = super().get_context_data(**kwargs)
     office_id = self.object.id
     self.office = RUSSIA.get_office(office_id)
     self.office_stats = RUSSIA.calc_data_current.office_stats.get_group_data(office_id)

     office = self.office
     stats = self.office_stats

     # Offices without a region are shown with an empty region name.
     if office.region_id is None:
         region_name = ""
     else:
         region_name = RUSSIA.regions.get_region_by_id(office.region_id).name

     # (office id, office name) pairs for the child offices listed in the stats.
     child_examples = [
         (child_id, RUSSIA.get_office(child_id).name)
         for child_id in stats.child_office_examples
     ]

     context.update({
         'source_document_count': stats.source_document_count,
         'region_name': region_name,
         'source_document_count_html': self.get_source_doc_html(),
         'child_offices_count': stats.child_offices_count,
         'section_count_html': self.section_count_html(),
         'section_count_by_years_html': self.section_count_by_years_html(),
         'median_income_by_years_html': self.median_income_by_years_html(),
         'child_office_examples': child_examples,
         'office_in_memory': office,
         'parent_office_name': RUSSIA.get_office(office.parent_id).name if office.parent_id is not None else "",
         "rubric_str": get_russian_rubric_str(office.rubric_id) if office.rubric_id is not None else "unknown",
         "income_comparison": self.comparison_to_population()
     })
     return context
    def build_section_incomes(self):
        """Collect main-declarant incomes grouped by office and by rubric.

        Runs one SQL query over sections, offices, incomes and source documents
        (filtered by year range, income bounds, a known person and the document
        median income), then feeds every row whose monthly income is not below
        ``RUSSIA.get_mrot(year)`` into two accumulators.

        :return: tuple ``(office_stats, rubric_stats)`` of TAllGroupIncomeStats.
        """
        sql = """
            select o.id, s.income_year, i.size, s.person_id 
            from declarations_section s 
            join declarations_office o on s.office_id=o.id  
            join declarations_income i on i.section_id=s.id
            join declarations_source_document d on s.source_document_id=d.id  
            where s.income_year >= {} and
                 s.income_year <= {} and  
                 i.size < {} and

                 i.size > 50000 and
                 s.person_id is not null and
                 d.median_income > 10000 and 
                 i.relative='{}'
            order by o.id
        """.format(self.income_stat_start_year, self.last_year,
                   self.max_income, models.Relative.main_declarant_code)
        by_office = TAllGroupIncomeStats()
        by_rubric = TAllGroupIncomeStats()
        with connection.cursor() as cursor:
            cursor.execute(sql)
            # Rows arrive ordered by office id, so groupby yields one group per office.
            for office_id, office_rows in groupby(cursor, key=itemgetter(0)):
                rubric_id = RUSSIA.get_office(office_id).rubric_id
                for _, year, income, person_id in office_rows:
                    # Keep only incomes whose monthly value reaches the MROT of that year.
                    if income / 12 >= RUSSIA.get_mrot(year):
                        by_office.add_income(office_id, person_id, year, income)
                        by_rubric.add_income(rubric_id, person_id, year, income)
        return by_office, by_rubric
예제 #3
0
 def get_fsin_office_id(self, section_json, src_doc: TSourceDocument):
     """Pick the FSIN office for one section from its ``person.department`` text.

     When the department is missing or shorter than 5 characters, the office
     already calculated for the source document is returned. Otherwise the
     region is resolved from the department string and the FSIN office of
     that region is returned.

     :param section_json: dict-like section; ``person.department`` is read.
     :param src_doc: source document providing ``calculated_office_id``.
     :return: an office id.
     """
     person = section_json.get('person', dict())
     department = person.get('department')
     if department is not None and len(department) >= 5:
         # NOTE(review): the second argument is presumably the fallback region
         # used when no region is recognised in the text -- confirm.
         region_id = self.regions.get_region_all_forms(
             department, TRussianRegions.Russia_as_s_whole_region_id)
         return RUSSIA.get_fsin_by_region(region_id)
     return src_doc.calculated_office_id
예제 #4
0
    def build_name_ngrams(self):
        """Index all offices by the bigrams and word stems of their names.

        Fills ``self.office_squeezes`` (office id -> name, region, parent id),
        ``self.office_name_bigrams`` and ``self.office_name_unigrams``.
        """
        self.logger.info("build bigrams")
        bigram_to_offices = defaultdict(set)
        stem_to_offices = defaultdict(set)
        self.office_squeezes = dict()
        office: TOfficeInMemory
        for office in RUSSIA.iterate_offices():
            office_id = office.office_id
            self.office_squeezes[office_id] = {
                'name': office.name,
                # offices without a region are stored with region 0
                'region': 0 if office.region_id is None else office.region_id,
                'parent_id': office.parent_id
            }
            for bigram in self.get_bigrams(office.name):
                bigram_to_offices[bigram].add(office_id)
            stems = TOfficePredictIndex.get_word_stems(office.name, add_starter_and_enders=False)
            for stem in stems:
                stem_to_offices[stem].add(office_id)

        self.office_name_bigrams = self.ngrams_from_default_dict(bigram_to_offices)
        self.logger.info("bigrams count = {}".format(self.get_bigrams_count()))

        self.office_name_unigrams = self.ngrams_from_default_dict(stem_to_offices, 3)
        self.logger.info("unigrams count = {}".format(self.get_unigrams_count()))
예제 #5
0
 def build_regional_tax_offices(self):
     """Map region id -> office id for every office with the Tax rubric.

     If several tax offices share a region, the one iterated last wins.

     :return: dict of region id to office id.
     :raises AssertionError: when no tax office is found.
     """
     tax_offices = {
         office.region_id: office.office_id
         for office in RUSSIA.iterate_offices()
         if office.rubric_id == TOfficeRubrics.Tax
     }
     assert len(tax_offices) > 0
     return tax_offices
예제 #6
0
    def init_rubric(self):
        """Set the section rubric from its office, with one municipality override."""
        # json_reader.section.rubric_id = source_document_in_db.office.rubric_id does not work
        # may be we should call source_document_in_db.refresh_from_db
        section = self.section
        section.rubric_id = RUSSIA.get_office(section.office.id).rubric_id

        if section.rubric_id != TOfficeRubrics.Municipality:
            return
        # A municipal section is moved to Education when its position says so.
        if TOfficeTableInMemory.convert_municipality_to_education(section.position):
            section.rubric_id = TOfficeRubrics.Education
예제 #7
0
    def build_declarant_incomes(self,
                                year,
                                max_income=5000000
                                ) -> TAllRegionStatsForOneYear:
        """Build per-region declarant income statistics for one year.

        Queries main-declarant incomes for ``year`` (positive, below
        ``max_income``, excluding office 450 and the "Russia as a whole"
        region), keeps the ones whose monthly value exceeds the MROT of that
        year and stores one TRegionYearStats snapshot per region.

        :param year: income year to process.
        :param max_income: exclusive upper bound for an income to be counted.
        :return: the filled TAllRegionStatsForOneYear.
        :raises Exception: when population statistics or a median salary
            estimate is missing for a region.
        """
        region_data = TAllRegionStatsForOneYear(
            year, file_name=self.options.get('output_json'))
        # NOTE(review): presumably the Ministry of Defence office id -- confirm.
        min_oborony_office_id = 450
        sql = """
            select o.region_id, i.size
            from declarations_section   s 
            join declarations_office o on s.office_id=o.id  
            join declarations_income i on i.section_id=s.id  
            where s.income_year = {} and  
                 i.size < {} and 
                 i.size > 0 and 
                 i.relative='{}' and
                 o.id != {} and
                 o.region_id is  not null and
                 o.region_id != {}
            order by o.region_id, i.size
        """.format(year, max_income, models.Relative.main_declarant_code,
                   min_oborony_office_id, TRussianRegions.Russia_as_s_whole_region_id)
        regions = TRussianRegions()
        mrot = RUSSIA.get_mrot(year)
        assert mrot is not None
        with connection.cursor() as cursor:
            cursor.execute(sql)
            # Rows are ordered by region id, so each group is one region.
            for region_id, region_rows in groupby(cursor, key=itemgetter(0)):
                incomes = [income for _, income in region_rows
                           if income / 12 > mrot]
                if region_id == TRussianRegions.Baikonur:
                    continue
                region = regions.get_region_by_id(region_id)
                # Regions joined to another region are reported under that region.
                if region.joined_to is not None:
                    region = regions.get_region_by_id(region.joined_to)

                stat_info = region_data.ross_stat.get_data(region.id, year)
                if stat_info is None:
                    raise Exception(
                        "cannot find stat_info for region.id={}, region.name={}"
                        .format(region.id, region.name))
                population = stat_info.population

                population_median_income = region_data.ross_stat.get_or_predict_median_salary(
                    region.id, year)
                if population_median_income is None:
                    raise Exception(
                        "cannot estimate population median_income for region.id={}, region.name={}"
                        .format(region.id, region.name))

                snapshot = TRegionYearStats(
                    region.id, region.name, incomes, population_median_income,
                    population,
                    # the election figure is always taken from the 2021 record
                    region_data.ross_stat.get_data(region.id,
                                                   2021).er_election_2021)
                region_data.add_snapshot(snapshot)

        region_data.calc_aux_params()
        return region_data
예제 #8
0
 def build_offices_sitemap(self):
     """Write the office sitemap and register its file name in ``self.sitemaps``.

     Only offices whose statistics report more than 10 source documents get
     an ``office/<id>`` entry.
     """
     self.logger.info("build_offices_sitemaps")
     sitemap_path = os.path.join(
         os.path.dirname(__file__),
         "../../../disclosures/static/sitemap-office.xml")
     url_paths = list()
     for office in RUSSIA.iterate_offices():
         stats = RUSSIA.calc_data_current.office_stats.get_group_data(
             office.office_id)
         if stats is None:
             continue
         doc_count = stats.source_document_count
         if doc_count is None or doc_count <= 10:
             continue
         url_paths.append("office/{}".format(office.office_id))
     self.write_sitemap(url_paths, sitemap_path, priority=0.4)
     self.sitemaps.append(os.path.basename(sitemap_path))
    def build_aux_office_params(self, office_data: TGroupStatDataList):
        """Fill auxiliary per-office figures in ``office_data``.

        Three passes are made:

        1. declarant (section) counts per office and income year are written
           to the year snapshots (``declarants_count``);
        2. the number of distinct source documents per office is written to
           ``source_document_count``;
        3. for every office known to ``RUSSIA``: child office examples and
           count, the total section count and the communicable web site urls.

        :param office_data: container that is updated in place.
        """
        # The lower year bound is fixed at 2009: self.income_stat_start_year
        # is deliberately ignored here.
        query = """
            select o.id, min(s.income_year), count(s.id) 
            from declarations_office o
            join declarations_section s on s.office_id = o.id
            where s.income_year >= 2009 and s.income_year <= {}
            group by o.id, s.income_year
        """.format(self.last_year)
        with connection.cursor() as cursor:
            self.logger.info("execute {}".format(query.replace("\n", " ")))
            cursor.execute(query)
            self.logger.info("read data")
            for office_id, income_year, section_count in cursor:
                ys = office_data.get_or_create_group_data(
                    office_id).get_or_create_year_snapshot(income_year)
                ys.declarants_count = section_count

        query = """
                    select o.id, count(distinct d.id) 
                    from declarations_office o
                    join declarations_section s on s.office_id = o.id
                    join declarations_source_document d on d.id = s.source_document_id
                    group by o.id
                """
        with connection.cursor() as cursor:
            self.logger.info("execute {}".format(query.replace("\n", " ")))
            cursor.execute(query)
            for office_id, cnt in cursor:
                oi = office_data.get_or_create_group_data(office_id)
                oi.source_document_count = cnt

        offices = RUSSIA.offices_in_memory
        child_offices = offices.get_child_offices_dict()
        for office in RUSSIA.iterate_offices():
            office_id = office.office_id
            oi = office_data.get_or_create_group_data(office_id)
            # NOTE(review): examples are listed only for offices that HAVE a
            # parent; top-level offices get an empty list. Confirm this is the
            # intended condition and not an inverted check.
            if office.parent_id is None:
                oi.child_office_examples = list()
            else:
                # at most five child offices are kept as examples
                oi.child_office_examples = list(
                    c.office_id for c in child_offices[office_id][:5])
            # NOTE(review): assumes child_offices has an entry for every office
            # id (e.g. a defaultdict) -- verify against get_child_offices_dict.
            oi.child_offices_count = len(child_offices[office_id])
            oi.section_count = sum(s.declarants_count
                                   for s in oi.year_snapshots.values())
            oi.urls = list(x.url for x in office.office_web_sites
                           if x.can_communicate())
 def build_v2(self):
     """Compute the median income-growth ratio over consecutive-year pairs.

     Every income list in ``self.incomes_by_person`` is sorted in place; each
     pair of neighbouring items whose years differ by exactly one is compared
     through ``RUSSIA.get_average_nominal_incomes`` and, when a comparison is
     produced, its ``compare_to_all_people_income()`` value is collected.

     :return: ``(median rounded to 2 digits, number of ratios)``, or
         ``(None, None)`` when no ratio could be collected.
     """
     growth_ratios = []
     for yearly_incomes in self.incomes_by_person.values():
         yearly_incomes.sort()  # in place, the stored list stays sorted
         for earlier, later in zip(yearly_incomes, yearly_incomes[1:]):
             if earlier.year + 1 != later.year:
                 continue
             comparison = RUSSIA.get_average_nominal_incomes([earlier, later])
             if comparison is not None:
                 growth_ratios.append(comparison.compare_to_all_people_income())
     if not growth_ratios:
         return None, None
     return round(median(growth_ratios), 2), len(growth_ratios)
예제 #11
0
 def distribute_offices_to_processes(self, process_count):
     """Spread the office ids over ``self.office_buckets``, one bucket per process.

     Offices with the Gulag rubric all go to bucket 0. Other offices are
     dealt round-robin, skipping bucket 0 while it already holds more than
     its share. The final bucket sizes are logged at debug level.

     :param process_count: number of buckets; must be greater than 1.
     """
     assert process_count > 1
     for seen, office_id in enumerate(self.office_to_source_documents.keys(), start=1):
         if RUSSIA.get_office(office_id).rubric_id == TOfficeRubrics.Gulag:
             # put all fsin offices to the first process
             bucket_id = 0
         elif len(self.office_buckets[0]) > seen / process_count:
             # bucket 0 is ahead of the others: use only buckets 1..process_count-1
             bucket_id = seen % (process_count - 1) + 1
         else:
             bucket_id = seen % process_count
         self.office_buckets[bucket_id].append(office_id)
     for bucket_id, bucket in self.office_buckets.items():
         self.logger.debug("bucket[{}] size = {}".format(
             bucket_id, len(bucket)))
예제 #12
0
    def gen_documents(self):
        """Yield one index document (``_id``, ``_index``, ``_source``) per office.

        The source document count comes from the precomputed office statistics
        and is 0 for offices that have no statistics record.
        """
        for office in RUSSIA.iterate_offices():
            stats = RUSSIA.calc_data_current.office_stats.get_group_data(
                office.office_id)
            doc_cnt = 0 if stats is None else stats.source_document_count

            source = {
                'id': office.office_id,
                'name': office.name,
                'parent_id': office.parent_id,
                'source_document_count': doc_cnt,
                'rubric_id': office.rubric_id,
                'region_id': office.region_id
            }
            yield {
                "_id": office.office_id,
                "_index": self.index_name,
                "_source": source
            }
예제 #13
0
    def import_office(self, office_id):
        """Import every source document of one office.

        The office is skipped when a ``rubric_id`` argument is given and
        differs from the office rubric. Documents are processed in the order
        of their old source document ids; documents without an old id go last.
        Each document is registered in the database, then its smart parser
        json (if any) is imported; serializer errors are logged, not raised.

        :param office_id: key in ``self.office_to_source_documents``.
        """
        wanted_rubric = self.args.get('rubric_id')
        if wanted_rubric is not None and RUSSIA.get_office(
                office_id).rubric_id != wanted_rubric:
            return

        all_imported_human_jsons = set()
        # sentinel that sorts documents without an old id after all the others
        max_doc_id = 2**32
        ordered_documents = list()
        for sha256 in self.office_to_source_documents[office_id]:
            old_doc_id = self.permalinks_db_source_document.get_old_source_doc_id_by_sha256(
                sha256)
            ordered_documents.append(
                (max_doc_id if old_doc_id is None else old_doc_id, sha256))
        ordered_documents.sort()
        TImporter.logger.debug("import office {} document count = {}".format(
            office_id, len(ordered_documents)))

        for _, sha256 in ordered_documents:
            src_doc = self.dlrobot_human.get_document(sha256)
            assert src_doc.calculated_office_id == office_id
            smart_parser_json = self.get_smart_parser_json(
                all_imported_human_jsons, sha256, src_doc)
            doc_file_in_db = self.register_document_in_database(
                sha256, src_doc)
            if smart_parser_json is None:
                self.logger.debug(
                    "file {} has no valid smart parser json, skip it".format(
                        sha256))
                continue
            try:
                sections_count = self.import_one_smart_parser_json(
                    doc_file_in_db, smart_parser_json, src_doc)
                TImporter.logger.debug("import {} sections from {}".format(
                    sections_count, sha256))
            except TSmartParserSectionJson.SerializerException as exp:
                TImporter.logger.error(
                    "Error! cannot import smart parser json for file {}: {} "
                    .format(sha256, exp))
예제 #14
0
 def build_ml_office_indices(self):
     """Build the two-way mapping between office ids and dense ML class indices.

     ``self.ml_office_id_2_office_id`` maps index -> office id and
     ``self.office_id_2_ml_office_id`` maps office id -> index, where the
     index is the position of the office id in ``RUSSIA.iterate_offices_ids()``.
     """
     # Materialise the ids once: both dictionaries are then built from the very
     # same sequence, so they stay mutual inverses even if the iteration order
     # is not repeatable, and the offices are traversed only once.
     office_ids = list(RUSSIA.iterate_offices_ids())
     self.ml_office_id_2_office_id = {
         ml_id: office_id for ml_id, office_id in enumerate(office_ids)}
     self.office_id_2_ml_office_id = {
         office_id: ml_id for ml_id, office_id in enumerate(office_ids)}
     self.logger.info("target office count = {}".format(len(self.office_id_2_ml_office_id)))
예제 #15
0
    def import_one_smart_parser_json(self, source_document_in_db, input_json,
                                     src_doc: TSourceDocument):
        """Import all declarant sections of one smart parser json.

        Each entry of ``input_json['persons']`` is handled in its own database
        transaction: the section is parsed, skipped when it has too many
        vehicles or when its passport is not accepted by
        ``register_section_passport``, and otherwise saved under the id given
        by the permalink database. Database and serializer errors are logged
        and do not stop the import of the remaining sections.

        After the loop the source document gets its min/max income year,
        section count and median main-declarant income (0 when unknown or not
        below 2**31) and is saved, provided at least one section was imported.

        :param source_document_in_db: database record of the document.
        :param input_json: parsed smart parser json with a ``persons`` list.
        :param src_doc: source document description (office id is read).
        :return: number of imported sections.
        """
        imported_years = list()
        TImporter.logger.debug("try to import {} declarants".format(
            len(input_json['persons'])))
        main_incomes = list()
        is_fsin = RUSSIA.get_office(
            src_doc.calculated_office_id).rubric_id == TOfficeRubrics.Gulag

        # section_index is 1-based, it is used for the income year and in logs
        for section_index, raw_section in enumerate(input_json['persons'], start=1):
            section_income_year = self.calc_income_year(
                input_json, src_doc, raw_section, section_index)
            # FSIN documents may assign each section to its own regional office
            office_id = (self.get_fsin_office_id(raw_section, src_doc)
                         if is_fsin else src_doc.calculated_office_id)
            with transaction.atomic():
                try:
                    prepared = TSmartParserSectionJson(
                        section_income_year, office_id, source_document_in_db)
                    prepared.read_raw_json(raw_section)

                    if len(prepared.vehicles) > TImporter.max_vehicle_count:
                        TImporter.logger.debug(
                            "ignore section {} because it has too many vehicles ( > {})"
                            .format(prepared.section.person_name,
                                    TImporter.max_vehicle_count))
                        continue

                    passport1 = prepared.get_passport_components1(
                    ).get_main_section_passport()
                    if not self.register_section_passport(passport1):
                        continue

                    prepared.section.tmp_income_set = prepared.incomes
                    passport2 = prepared.get_passport_components2(
                    ).get_main_section_passport()
                    section_id, is_new = self.permalinks_db_section.get_section_id(
                        passport1, passport2)
                    if is_new:
                        TImporter.logger.debug(
                            "found a new section {}, set section.id to {}".
                            format(prepared.section.get_permalink_passport(),
                                   section_id))

                    main_income = prepared.get_main_declarant_income_size()
                    if main_income is not None and main_income > 0:
                        main_incomes.append(main_income)
                    prepared.save_to_database(section_id)
                    imported_years.append(section_income_year)

                except (DatabaseError,
                        TSmartParserSectionJson.SerializerException) as exp:
                    TImporter.logger.error(
                        "Error! cannot import section N {}: {} ".format(
                            section_index, exp))

        imported_count = len(imported_years)
        if imported_count > 0:
            source_document_in_db.min_income_year = min(imported_years)
            source_document_in_db.max_income_year = max(imported_years)
            source_document_in_db.section_count = imported_count
            median_income = median(main_incomes) if len(main_incomes) > 0 else 0
            # values that reach 2**31 are stored as 0
            if median_income >= 2**31:
                median_income = 0
            source_document_in_db.median_income = median_income
            source_document_in_db.save()

        return imported_count
예제 #16
0
 def comparison_to_population(self):
     """Compare the office median incomes by year with the population incomes.

     One TYearIncome is built per year snapshot of ``self.office_stats`` from
     its ``median_year_income``; the sorted list is passed to
     ``RUSSIA.get_average_nominal_incomes`` and its result is returned.
     """
     yearly_medians = sorted(
         TYearIncome(year, snapshot.median_year_income)
         for year, snapshot in self.office_stats.year_snapshots.items()
     )
     return RUSSIA.get_average_nominal_incomes(yearly_medians)
예제 #17
0
    def test_nominal_income(self):
        """Check RUSSIA.get_average_nominal_incomes on synthetic and real series."""

        def compare(*year_income_pairs):
            # Wrap (year, income) pairs into TYearIncome objects, keeping their order.
            return RUSSIA.get_average_nominal_incomes(
                [TYearIncome(year, income) for year, income in year_income_pairs])

        def check_year_range(comp, min_year, max_year):
            self.assertEqual(comp.min_year, min_year)
            self.assertEqual(comp.max_year, max_year)

        # out of year scope
        self.assertIsNone(compare((2008, 1), (2009, 2)))

        # one year is not enough
        self.assertIsNone(compare((2015, 1)))

        # two years
        comp = compare((2015, 1000000), (2016, 2000000))
        self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
        self.assertAlmostEqual(comp.population_income_growth, 2)
        check_year_range(comp, 2015, 2016)

        # The same 2015-2017 series, alone and with one far-away year added:
        # the result must be identical in all three cases.
        core_series = [(2015, 1000000), (2016, 1500000), (2017, 2000000)]
        same_result_series = [
            # 3 years
            core_series,
            # 2040 year is ignored
            core_series + [(2040, 30000000)],
            # 1990 year is ignored
            [(1990, 100000000)] + core_series,
        ]
        for series in same_result_series:
            comp = compare(*series)
            self.assertEqual(comp.declarant_income_growth, 100)  # 100% growth
            self.assertAlmostEqual(comp.population_income_growth, 5, places=3)
            check_year_range(comp, 2015, 2017)

        # (series, declarant growth, population growth, min year, max year)
        approximate_cases = [
            # zero income is ignored
            ([(2015, 0), (2016, 1500000), (2017, 2000000)],
             33, 3, 2016, 2017),
            # incomes less than 12*MROT are ignored
            ([(2012, 600), (2013, 189744), (2019, 407711)],
             114, 37, 2013, 2019),
            # real example 1
            ([(2012, 1693027), (2013, 2790949), (2017, 4993935),
              (2019, 6241840)],
             268, 52, 2012, 2019),
            # real example 2
            ([(2012, 783050), (2013, 819684), (2014, 692259), (2015, 736241),
              (2016, 780312), (2017, 817646), (2018, 817078), (2019, 886266)],
             13, 52, 2012, 2019),
            # real example 3
            ([(2012, 297096), (2013, 856340), (2014, 820063), (2015, 730649),
              (2016, 706835)],
             137, 32, 2012, 2016),
        ]
        for series, declarant_growth, population_growth, min_year, max_year in approximate_cases:
            comp = compare(*series)
            self.assertAlmostEqual(comp.declarant_income_growth,
                                   declarant_growth, places=3)
            self.assertAlmostEqual(comp.population_income_growth,
                                   population_growth, places=3)
            check_year_range(comp, min_year, max_year)
예제 #18
0
 def income_growth_yearly(self):
     """Compare the declarant income of every section, in year order, with the population.

     Builds one TYearIncome per entry of ``self.sections_ordered_by_year``
     and returns the result of ``RUSSIA.get_average_nominal_incomes``.
     """
     yearly_incomes = [
         TYearIncome(section.income_year, section.get_declarant_income_size())
         for section in self.sections_ordered_by_year
     ]
     return RUSSIA.get_average_nominal_incomes(yearly_incomes)