def run(years_calage):
    '''
    Build the calibrated base for each requested calibration year.

    For every year in ``years_calage``, call ``run_all`` with the hard-coded
    list of data years, log the elapsed wall-clock time of that call, and
    print which data year (as selected by ``find_nearest_inferior``) was used.

    :param years_calage: iterable of calibration years, processed in order.
    :returns: None.
    '''
    import time

    year_data_list = [1995, 2000, 2005, 2011]
    for year_calage in years_calage:
        start = time.time()
        run_all(year_calage, year_data_list)
        log.info("Finished {}".format(time.time() - start))
        # Single-argument print call: parses as a print statement with a
        # parenthesised expression on Python 2 and as a function call on
        # Python 3, producing the same output in both.
        print("Base construite pour l'année {} à partir de l'enquête bdf {}".format(
            year_calage,
            find_nearest_inferior(year_data_list, year_calage),
            ))
def get_inflators_cn_to_cn(target_year):
    '''
    Compute the ageing inflators from the national accounts aggregates.

    The data year is the one ``find_nearest_inferior`` selects from
    ``data_years`` for ``target_year``. For every key of the data-year
    aggregates, the inflator is the target-year aggregate divided by the
    data-year aggregate.

    :param target_year: year towards which the aggregates are inflated.
    :returns: dict mapping each aggregate key to its target/data-year ratio.
    '''
    data_year = find_nearest_inferior(data_years, target_year)

    base_column = 'consoCN_COICOP_{}'.format(data_year)
    base_aggregates = get_cn_aggregates(data_year)[base_column].to_dict()

    target_column = 'consoCN_COICOP_{}'.format(target_year)
    target_aggregates = get_cn_aggregates(target_year)[target_column].to_dict()

    return {
        key: target_aggregates[key] / base_value
        for key, base_value in base_aggregates.items()
        }
def get_inflators(target_year):
    '''
    Compute the combined inflation ratio for each variable.

    Multiplies, key by key, the ratios returned by ``get_inflators_bdf_to_cn``
    for the data year (calibration of bdf on cn, per the original note) by the
    ratios returned by ``get_inflators_cn_to_cn`` for ``target_year`` (ageing).
    The data year is the one ``find_nearest_inferior`` selects from
    ``data_years``.

    :param target_year: year for which the combined ratios are wanted.
    :returns: dict mapping each key of the cn-to-cn inflators to the product
        of the two ratios.
    '''
    data_year = find_nearest_inferior(data_years, target_year)
    calage_ratios = get_inflators_bdf_to_cn(data_year)
    ageing_ratios = get_inflators_cn_to_cn(target_year)

    return {
        key: calage_ratios[key] * ageing_ratio
        for key, ageing_ratio in ageing_ratios.items()
        }
def run_all(year_calage = 2011, year_data_list = [1995, 2000, 2005, 2011]): temporary_store = TemporaryStore.create(file_name = "indirect_taxation_tmp") # Quelle base de données choisir pour le calage ? year_data = find_nearest_inferior(year_data_list, year_calage) # 4 étape parallèles d'homogénéisation des données sources : # Gestion des dépenses de consommation: build_depenses_homogenisees(year = year_data) build_imputation_loyers_proprietaires(year = year_data) depenses = temporary_store["depenses_bdf_{}".format(year_calage)] depenses.index = depenses.index.astype(ident_men_dtype) depenses_by_grosposte = temporary_store["depenses_by_grosposte_{}".format(year_calage)] depenses_by_grosposte.index = depenses_by_grosposte.index.astype(str) # Gestion des véhicules: build_homogeneisation_vehicules(year = year_data) if year_calage != 1995: vehicule = temporary_store['automobile_{}'.format(year_data)] vehicule.index = vehicule.index.astype(ident_men_dtype) else: vehicule = None # Gestion des variables socio démographiques: build_homogeneisation_caracteristiques_sociales(year = year_data) menage = temporary_store['donnes_socio_demog_{}'.format(year_data)] menage.index = menage.index.astype(ident_men_dtype) # Gestion des variables revenus: build_homogeneisation_revenus_menages(year = year_data) revenus = temporary_store["revenus_{}".format(year_calage)] revenus.index = revenus.index.astype(ident_men_dtype) temporary_store.close() # Concaténation des résultas de ces 4 étapes preprocessed_data_frame_by_name = dict( revenus = revenus, vehicule = vehicule, menage = menage, depenses = depenses, depenses_by_grosposte = depenses_by_grosposte ) for name, preprocessed_data_frame in preprocessed_data_frame_by_name.iteritems(): assert preprocessed_data_frame.index.name == 'ident_men', \ 'Index is labelled {} instead of ident_men in data frame {} for year {}'.format( preprocessed_data_frame.index.name, name, year_data) assert len(preprocessed_data_frame) != 0, 'Empty data frame 
{}'.format(name) assert preprocessed_data_frame.index.dtype == numpy.dtype('O'), "index for {} is {}".format( name, preprocessed_data_frame.index.dtype) data_frame = pandas.concat( preprocessed_data_frame_by_name.values(), axis = 1, ) if year_data == 2005: for vehicule_variable in ['veh_tot', 'veh_essence', 'veh_diesel', 'pourcentage_vehicule_essence']: data_frame.loc[data_frame[vehicule_variable].isnull(), vehicule_variable] = 0 for variable in ['age{}'.format(i) for i in range(3, 14)] + ['agecj', 'agfinetu', 'agfinetu_cj', 'nenfhors']: data_frame.loc[data_frame[variable].isnull(), variable] = 0 if year_data == 2011: for vehicule_variable in ['veh_tot', 'veh_essence', 'veh_diesel', 'pourcentage_vehicule_essence', 'rev_disp_loyerimput', 'rev_disponible', 'loyer_impute']: data_frame.loc[data_frame[vehicule_variable].isnull(), vehicule_variable] = 0 # 'ratio_loyer_impute', 'ratio_revenus' To be added data_frame.index.name = "ident_men" # TODO: Homogénéiser: soit faire en sorte que ident_men existe pour toutes les années # soit qu'elle soit en index pour toutes # On ne garde que les ménages métropolitaines if year_data == 2011: data_frame = data_frame.query('zeat != 0') try: data_frame.reset_index(inplace = True) except ValueError, e: log.info('ignoring reset_index because {}'.format(e))