def create_final(year): temporary_store = TemporaryStore.create(file_name = "erfs") log.info(u"création de final") foy_ind = temporary_store['foy_ind_{}'.format(year)] tot3 = temporary_store['tot3_{}'.format(year)] foy_ind.set_index(['idfoy', 'quifoy'], inplace = True) tot3.set_index(['idfoy', 'quifoy'], inplace = True) final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1) final.reset_index(inplace = True) foy_ind.reset_index(inplace = True) tot3.reset_index(inplace = True) # tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam']) final = final[final.idmen.notnull()] control(final, verbose=True) del tot3, foy_ind gc.collect() # final <- merge(final, sif, by = c('noindiv'), all.x = TRUE) log.info(" loading fip") sif = temporary_store['sif_{}'.format(year)] log.info("{}".format(sif.columns)) log.info(" update final using fip") final = final.merge(sif, on=["noindiv"], how="left") # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS control(final, debug=True) final['caseP'] = final.caseP.fillna(False) final['caseF'] = final.caseF.fillna(False) print_id(final) temporary_store['final_{}'.format(year)] = final log.info(u"final sauvegardé") del sif, final
def create_final(temporary_store=None, year=None): assert temporary_store is not None assert year is not None log.info(u"création de final") foy_ind = temporary_store['foy_ind_{}'.format(year)] tot3 = temporary_store['tot3_{}'.format(year)] log.info(u"Stats on tot3") print_id(tot3) log.info(u"Stats on foy_ind") print_id(foy_ind) foy_ind.set_index(['idfoy', 'quifoy'], inplace=True, verify_integrity=True) tot3.set_index(['idfoy', 'quifoy'], inplace=True, verify_integrity=True) # tot3 = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1, verify_integrity = True) # TODO improve this foy_ind.drop([u'alr', u'rsti', u'sali', u'choi'], axis=1, inplace=True) tot3 = tot3.join(foy_ind) tot3.reset_index(inplace=True) foy_ind.reset_index(inplace=True) # tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam']) control(tot3, verbose=True) final = tot3.loc[tot3.idmen.notnull(), :].copy() control(final, verbose=True) del tot3, foy_ind gc.collect() log.info(" loading fip") sif = temporary_store['sif_{}'.format(year)] log.info("Columns from sif dataframe: {}".format(sif.columns)) log.info(" update final using fip") final.set_index('noindiv', inplace=True, verify_integrity=True) # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS # On ne garde que les sif.noindiv qui correspondent à des idfoy == "vous" # Et on enlève les duplicates idfoys = final.loc[final.quifoy == 0, "idfoy"] sif = sif[sif.noindiv.isin(idfoys) & ~(sif.change.isin(['M', 'S', 'Z']))].copy() sif.drop_duplicates(subset=['noindiv'], inplace=True) sif.set_index('noindiv', inplace=True, verify_integrity=True) final = final.join(sif) final.reset_index(inplace=True) control(final, debug=True) final['caseP'] = final.caseP.fillna(False) final['caseF'] = final.caseF.fillna(False) print_id(final) temporary_store['final_{}'.format(year)] = final log.info(u"final sauvegardé") del sif, final
def create_final(temporary_store = None, year = None): assert temporary_store is not None assert year is not None log.info(u"création de final") foy_ind = temporary_store['foy_ind_{}'.format(year)] tot3 = temporary_store['tot3_{}'.format(year)] log.info(u"Stats on tot3") print_id(tot3) log.info(u"Stats on foy_ind") print_id(foy_ind) foy_ind.set_index(['idfoy', 'quifoy'], inplace = True, verify_integrity = True) tot3.set_index(['idfoy', 'quifoy'], inplace = True, verify_integrity = True) # tot3 = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1, verify_integrity = True) # TODO improve this foy_ind.drop([u'alr', u'rsti', u'sali', u'choi'], axis = 1, inplace = True) tot3 = tot3.join(foy_ind) tot3.reset_index(inplace = True) foy_ind.reset_index(inplace = True) # tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam']) control(tot3, verbose=True) final = tot3.loc[tot3.idmen.notnull(), :].copy() control(final, verbose=True) del tot3, foy_ind gc.collect() log.info(" loading fip") sif = temporary_store['sif_{}'.format(year)] log.info("Columns from sif dataframe: {}".format(sif.columns)) log.info(" update final using fip") final.set_index('noindiv', inplace = True, verify_integrity = True) # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS # On ne garde que les sif.noindiv qui correspondent à des idfoy == "vous" # Et on enlève les duplicates idfoys = final.loc[final.quifoy == 0, "idfoy"] sif = sif[sif.noindiv.isin(idfoys) & ~(sif.change.isin(['M', 'S', 'Z']))].copy() sif.drop_duplicates(subset = ['noindiv'], inplace = True) sif.set_index('noindiv', inplace = True, verify_integrity = True) final = final.join(sif) final.reset_index(inplace = True) control(final, debug=True) final['caseP'] = final.caseP.fillna(False) final['caseF'] = final.caseF.fillna(False) print_id(final) temporary_store['final_{}'.format(year)] = final log.info(u"final sauvegardé") del sif, final
def create_totals_second_pass(temporary_store=None, year=None): assert temporary_store is not None assert year is not None log.info(u" 5.1 : Elimination idfoy restant") # Voiture balai # On a plein d'idfoy vides, on fait 1 ménage = 1 foyer fiscal indivi = temporary_store['indivi_step_06_{}'.format(year)] idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].unique() indivi_without_idfoy = ~indivi.idfoy.isin(idfoyList) indivi.loc[indivi_without_idfoy, 'quifoy'] = "pac" indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'quifoy'] = "vous" indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'idfoy'] = ( indivi.loc[indivi_without_idfoy, "idmen"].astype('int') * 100 + 51) indivi.loc[indivi_without_idfoy & (indivi.quimen == 1) & (indivi.age >= 18), 'quifoy'] = "conj" del idfoyList print_id(indivi) # Sélectionne les variables à garder pour les steps suivants variables = [ "actrec", "age", "age_en_mois", "chpub", "encadr", "idfoy", "idmen", "nbsala", "noi", "noindiv", "prosa", "quelfic", "quifoy", "quimen", "statut", "titc", "txtppb", "wprm", "rc1rev", "maahe", "sali", "rsti", "choi", "alr", "wprm", ] assert set(variables).issubset(set(indivi.columns)), \ "Manquent les colonnes suivantes : {}".format(set(variables).difference(set(indivi.columns))) dropped_columns = [ variable for variable in indivi.columns if variable not in variables ] indivi.drop(dropped_columns, axis=1, inplace=True) # see http://stackoverflow.com/questions/11285613/selecting-columns indivi.reset_index(inplace=True) gc.collect() # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé # les infos provenant des déclarations) log.info(u"Etape 6 : Création des variables descriptives") log.info(u" 6.1 : Variable activité") log.info(u"Variables présentes; \n {}".format(indivi.columns)) indivi['activite'] = np.nan indivi.loc[indivi.actrec <= 3, 'activite'] = 0 indivi.loc[indivi.actrec == 4, 'activite'] = 1 indivi.loc[indivi.actrec == 5, 'activite'] = 2 indivi.loc[indivi.actrec == 7, 'activite'] = 3 indivi.loc[indivi.actrec == 8, 'activite'] = 4 indivi.loc[indivi.age <= 13, 'activite'] = 2 # ce sont en fait les actrec=9 log.info("Valeurs prises par la variable activité \n {}".format( indivi['activite'].value_counts(dropna=False))) # TODO: MBJ problem avec les actrec # TODO: FIX AND REMOVE indivi.loc[indivi.actrec.isnull(), 'activite'] = 5 indivi.loc[indivi.titc.isnull(), 'titc'] = 0 assert indivi.titc.notnull().all(), \ u"Problème avec les titc" # On a 420 NaN pour les varaibels statut, titc etc log.info(u" 6.2 : Variable statut") indivi.loc[indivi.statut.isnull(), 'statut'] = 0 indivi.statut = indivi.statut.astype('int') indivi.loc[indivi.statut == 11, 'statut'] = 1 indivi.loc[indivi.statut == 12, 'statut'] = 2 indivi.loc[indivi.statut == 13, 'statut'] = 3 indivi.loc[indivi.statut == 21, 'statut'] = 4 indivi.loc[indivi.statut == 22, 'statut'] = 5 indivi.loc[indivi.statut == 33, 'statut'] = 6 indivi.loc[indivi.statut == 34, 'statut'] = 7 indivi.loc[indivi.statut == 35, 'statut'] = 8 indivi.loc[indivi.statut == 43, 'statut'] = 9 indivi.loc[indivi.statut == 44, 'statut'] = 10 indivi.loc[indivi.statut == 45, 'statut'] = 11 assert indivi.statut.isin(range(12)).all(), u"statut value over range" log.info("Valeurs prises par la variable statut \n {}".format( indivi['statut'].value_counts(dropna=False))) log.info(u" 6.3 : variable txtppb") indivi.loc[indivi.txtppb.isnull(), 'txtppb'] = 0 assert indivi.txtppb.notnull().all() indivi.loc[indivi.nbsala.isnull(), 'nbsala'] = 0 indivi.nbsala = indivi.nbsala.astype('int') indivi.loc[indivi.nbsala == 99, 'nbsala'] = 10 assert indivi.nbsala.isin(range(11)).all() log.info("Valeurs prises par la variable txtppb \n {}".format( indivi['txtppb'].value_counts(dropna=False))) log.info(u" 6.4 : variable chpub et CSP") indivi.loc[indivi.chpub.isnull(), 'chpub'] = 0 indivi.chpub = indivi.chpub.astype('int') assert indivi.chpub.isin(range(11)).all() indivi['cadre'] = 0 indivi.loc[indivi.prosa.isnull(), 'prosa'] = 0 assert indivi.prosa.notnull().all() log.info("Valeurs prises par la variable encadr \n {}".format( indivi['encadr'].value_counts(dropna=False))) # encadr : 1=oui, 2=non indivi.loc[indivi.encadr.isnull(), 'encadr'] = 2 indivi.loc[indivi.encadr == 0, 'encadr'] = 2 assert indivi.encadr.notnull().all() assert indivi.encadr.isin([1, 2]).all() indivi.loc[indivi.prosa.isin([7, 8]), 'cadre'] = 1 indivi.loc[(indivi.prosa == 9) & (indivi.encadr == 1), 'cadre'] = 1 assert indivi.cadre.isin(range(2)).all() log.info( u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence" ) log.info(u"nb de doublons idfoy/quifoy {}".format( len(indivi[indivi.duplicated(subset=['idfoy', 'quifoy'])]))) log.info(u"On crée les n° de personnes à charge dans le foyer fiscal") assert indivi.idfoy.notnull().all() print_id(indivi) indivi['quifoy_bis'] = 2 indivi.loc[indivi.quifoy == 'vous', 'quifoy_bis'] = 0 indivi.loc[indivi.quifoy == 'conj', 'quifoy_bis'] = 1 indivi.loc[indivi.quifoy == 'pac', 'quifoy_bis'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi.quifoy_bis.copy() del indivi['quifoy_bis'] print_id(indivi) pac = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']].copy() print_id(pac) j = 2 while pac.duplicated(['quifoy', 'idfoy']).any(): pac.loc[pac.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(pac) indivi = indivi.merge(pac, on=['noindiv', 'idfoy'], how="left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del pac assert len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) == 0, \ u"Il y a {} doublons idfoy/quifoy".format( len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) ) print_id(indivi) log.info(u"Etape 8 : création des fichiers totaux") famille = temporary_store['famc_{}'.format(year)] log.info(u" 8.1 : création de tot2 & tot3") tot2 = indivi.merge(famille, on='noindiv', how='inner') # TODO: MBJ increase in number of menage/foyer when merging with family ... del famille control(tot2, debug=True, verbose=True) assert tot2.quifam.notnull().all() temporary_store['tot2_{}'.format(year)] = tot2 del indivi log.info(u" tot2 saved") tot2 = tot2[tot2.idmen.notnull()].copy() print_id(tot2) tot3 = tot2 # TODO: check where they come from log.info("Avant élimination des doublons noindiv: {}".format(len(tot3))) tot3 = tot3.drop_duplicates(subset='noindiv') log.info("Après élimination des doublons noindiv: {}".format(len(tot3))) # Block to remove any unwanted duplicated pair control(tot3, debug=True, verbose=True) tot3 = tot3.drop_duplicates(subset=['idfoy', 'quifoy']) log.info("Après élimination des doublons idfoy, quifoy: {}".format( len(tot3))) tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam']) log.info("Après élimination des doublons idfam, 'quifam: {}".format( len(tot3))) tot3 = tot3.drop_duplicates(subset=['idmen', 'quimen']) log.info("Après élimination des doublons idmen, quimen: {}".format( len(tot3))) tot3 = tot3.drop_duplicates(subset=['noindiv']) control(tot3) log.info(u" 8.2 : On ajoute les variables individualisables") allvars = temporary_store['ind_vars_to_remove_{}'.format(year)] vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] log.info("{}".format(len(tot3))) assert not (tot3.duplicated( subset=['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format( lg_dup) temporary_store['tot3_{}'.format(year)] = tot3 control(tot3) del tot2, allvars, tot3, vars2 gc.collect() log.info(u"tot3 sauvegardé")
def invalide(temporary_store=None, year=None): assert temporary_store is not None assert year is not None log.info(u"Entering 07_invalides: construction de la variable invalide") # # # Invalides # # #invalide = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac) # # loadTmp("final.Rdata") # # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")] # # invalides <- within(invalides,{ # # caseP <- ifelse(is.na(caseP),0,caseP) # # caseF <- ifelse(is.na(caseF),0,caseF) # # invalide <- FALSE}) # # # Les "vous" invalides # # table(invalides[,c("caseF","quifoy")],useNA="ifany") # # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"invalide"] <- TRUE # # log.info(u"Etape 1 : création de la df invalides") log.info(u" 1.1 : déclarants invalides") final = temporary_store['final_{}'.format(year)] if "invalide" in final: # on drop les colones inv et alt au cas ou on aurait déjà lancé le step07 final.drop(["invalide", "alt"], axis=1, inplace=True) invalides_vars = [ "caseF", "caseP", "idfoy", "idmen", "noindiv", "quifoy", ] aah_eec_variables = ["rc1rev", "maahe"] aah_eec = False if set(aah_eec_variables) < set(final.columns): aah_eec = True invalides_vars += aah_eec_variables assert set(invalides_vars) < set(final.columns), \ "Variables {} are missing".format(set(invalides_vars).difference(set(final.columns))) invalides = final.xs(invalides_vars, axis=1).copy() invalides['invalide'] = False invalides['alt'] = False for var in ["caseP", "caseF"]: assert invalides[var].notnull().all(), 'NaN values in {}'.format(var) # Les déclarants invalides invalides.loc[(invalides['caseP'] == 1) & (invalides['quifoy'] == 0), 'invalide'] = True log.info(u"Il y a {} invalides déclarants".format( invalides["invalide"].sum())) # Les personnes qui touchent l'aah dans l'enquête emploi if aah_eec: log.info(u"Inspecting rc1rev") log.info(invalides['rc1rev'].value_counts()) invalides.loc[invalides.maahe > 0, 'invalide'] = True invalides.loc[invalides.rc1rev == 4, 'invalide'] = True # TODO: vérifier le format. # TODO: invalides.rc1rev.astype("str") voir mai mahdi pour pendre en compte 14 24 etc log.info(u"Il y a {} invalides qui touchent des alloc".format( invalides["invalide"].sum())) print_id(invalides) # # # Les conjoints invalides # # # # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")] # # #men_inv_conj <- rename(men_inv_conj, c("caseF"="invalide")) # # #table(men_inv_conj[men_inv_conj$inv==1 ,c("invalide","quifoy")],useNA="ifany") # # # Il y a des caseF suir des conjoints cela vint des doubles d?clarations TODO: shoumd clean this # # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"] # # #load(indm) # # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")] # # #titi <- titi[order(titi$ident),] # # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")] # # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="invalide")) # # table(foy_inv_conj[ ,c("invalide","quifoy")],useNA="ifany") # # # On ne garde donc que les caseF des "vous" # # foy_inv_conj <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","invalide")] # # table(foy_inv_conj[ ,c("invalide")],useNA="ifany") # # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")] # # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE) # # table(invalides_conj$inv) # TODO en 2006 On en a 316 au lieu de 328 il doit y avoir de idfoy avec caseF qui n'ont pas de vous because double déclaration' # # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","invalide")] <- invalides_conj # # table(invalides[,c("invalide","quifoy")],useNA="ifany") # # rm(invalides_conj,foy_inv_conj) # On récupère les idfoy des foyers avec une caseF cochée log.info(' 1.2 : Les conjoints invalides') idfoy_inv_conj = final.idfoy[final.caseF].copy() inv_conj_condition = (invalides.idfoy.isin(idfoy_inv_conj) & (invalides.quifoy == 1)) invalides.loc[inv_conj_condition, "invalide"] = True log.info(u"Il y a {} invalides conjoints".format( len(invalides[inv_conj_condition]))) log.info(u" Il y a {} invalides déclarants et invalides conjoints".format( invalides["invalide"].sum())) # Enfants invalides et garde alternée # # # # loadTmp("pacIndiv.Rdata") # # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("invalide","noindiv")] # # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE) # # names(foy_inv_pac) # # table(foy_inv_pac[,c("typ","naia")],useNA="ifany") # # table(foy_inv_pac[,c("typ")],useNA="ifany") # # foy_inv_pac <- within(foy_inv_pac,{ # # invalide <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18)) # # alt <- (typ=="H") | (typ=="I") # # naia <- NULL # # typ <- NULL}) # # # # table(foy_inv_pac[ ,c("invalide")],useNA="ifany") # # table(foy_inv_pac[ ,c("alt")],useNA="ifany") # # invalides$alt <- 0 # # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0 # # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","invalide","alt")] <- foy_inv_pac log.info( u" 1.3 : enfants invalides et en garde alternée (variables inv et alt)" ) pacIndiv = temporary_store['pacIndiv_{}'.format(year)] # print pacIndiv.type_pac.value_counts() log.info(pacIndiv.type_pac.value_counts()) foy_inv_pac = invalides[['noindiv', 'invalide' ]][~(invalides.quifoy.isin([0, 1]))].copy() # pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]] log.info("{}".format(len(foy_inv_pac))) log.info("{}".format(pacIndiv.columns)) foy_inv_pac = foy_inv_pac.merge( pacIndiv[['noindiv', 'type_pac', 'naia']].copy(), on='noindiv', how='left', ) foy_inv_pac['invalide'] = (foy_inv_pac['type_pac'].isin(['G', 'R', 'I']) | ((foy_inv_pac.type_pac == "F") & ((year - foy_inv_pac.naia) > 18))) foy_inv_pac['alt'] = ((foy_inv_pac.type_pac == "H") | (foy_inv_pac.type_pac == "I")) del foy_inv_pac['naia'] del foy_inv_pac['type_pac'] foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False) log.info("{}".format(foy_inv_pac['invalide'].describe())) invalides.loc[:, 'alt'] = False invalides.loc[~(invalides.quifoy.isin([0, 1])), ["alt", "invalide"]] = foy_inv_pac[["alt", "invalide" ]].copy().values invalides = invalides[["noindiv", "invalide", 'alt']].copy() invalides['alt'].fillna(False, inplace=True) log.info(invalides.invalide.value_counts()) # invalides = invalides.drop_duplicates(['noindiv', 'invalide', 'alt'], take_last = True) del foy_inv_pac, pacIndiv # Initialisation des NA sur alt et inv # invalides[is.na(invalides$inv), "invalide"] <- 0 # table(invalides[,c("alt","invalide")],useNA="ifany") # # final <- merge(final, invalides[,c("noindiv","invalide","alt")], by="noindiv",all.x=TRUE) # table(final[, c("invalide","alt")],useNA="ifany") log.info('Etape 2 : Initialisation des NA sur alt et inv') assert invalides.invalide.notnull().all() & invalides.alt.notnull().all() final.set_index('noindiv', inplace=True, verify_integrity=True) invalides.set_index('noindiv', inplace=True, verify_integrity=True) final = final.join(invalides) final.reset_index(inplace=True) del invalides log.info("{}".format(final.invalide.value_counts())) control(final, debug=True) temporary_store['final_{}'.format(year)] = final log.info(u'final complétée et sauvegardée')
def create_totals(year = None): assert year is not None temporary_store = TemporaryStore.create(file_name = "erfs") replace = create_replace(year) # On part de la table individu de l'ERFS # on renomme les variables log.info(u"Creating Totals") log.info(u"Etape 1 : Chargement des données") erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory) data = erfs_survey_collection.get_survey('erfs_{}'.format(year)) indivim = temporary_store['indivim_{}'.format(year)] assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons" # Deals individuals with imputed income : some individuals are in 'erf individu table' but # not in the 'foyer' table. We need to create a foyer for them. selection = Series() for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]: varo = var[:-1] + "o" test = indivim[var] != indivim[varo] if len(selection) == 0: selection = test else: selection = (test) | (selection) indivi_i = indivim[selection].copy() indivi_i.rename( columns = { "ident": "idmen", "persfip": "quifoy", "zsali": "sali2", # Inclu les salaires non imposables des agents d'assurance "zchoi": "choi2", "zrsti": "rsti2", "zalri": "alr2" }, inplace = True, ) indivi_i.quifoy = where(indivi_i.quifoy.isnull(), "vous", indivi_i.quifoy) indivi_i.quelfic = "FIP_IMP" # We merge them with the other individuals indivim.rename( columns = dict( ident = "idmen", persfip = "quifoy", zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi = "choi2", zrsti = "rsti2", zalri = "alr2", ), inplace = True, ) if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))): raise Exception("Individual ") indivim.set_index("noindiv", inplace = True) indivi_i.set_index("noindiv", inplace = True) indivi = indivim del indivim indivi.update(indivi_i) indivi.reset_index(inplace = True) log.info("Etape 2 : isolation des FIP") fip_imp = indivi.quelfic == "FIP_IMP" indivi["idfoy"] = ( indivi.idmen.astype("int64") * 100 + (indivi.declar1.str[0:2]).convert_objects(convert_numeric=True) ) indivi.loc[fip_imp, "idfoy"] = np.nan # Certains FIP (ou du moins avec revenus imputés) ont un numéro de déclaration d'impôt ( pourquoi ?) fip_has_declar = (fip_imp) & (indivi.declar1.notnull()) indivi["idfoy"] = where( fip_has_declar, indivi.idmen * 100 + indivi.declar1.str[0:2].convert_objects(convert_numeric = True), indivi.idfoy) del fip_has_declar fip_no_declar = (fip_imp) & (indivi.declar1.isnull()) del fip_imp indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"]) indivi_fnd = indivi[["idfoy", "noindiv"]][fip_no_declar].copy() while any(indivi_fnd.duplicated(cols=["idfoy"])): indivi_fnd["idfoy"] = where( indivi_fnd.duplicated(cols=["idfoy"]), indivi_fnd["idfoy"] + 1, indivi_fnd["idfoy"] ) # assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"].values), "Duplicates remaining" assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons" indivi.idfoy[fip_no_declar] = indivi_fnd.idfoy.copy() del indivi_fnd, fip_no_declar log.info(u"Etape 3 : Récupération des EE_NRT") nrt = indivi.quelfic == "EE_NRT" indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.quifoy[nrt] = "vous" del nrt pref_or_cref = indivi.lpr.isin([1, 2]) adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref) indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.loc[adults, "quifoy"] = "vous" del adults # TODO: hack to avoid assert error log.info("{}".format(indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().value_counts())) assert indivi.idfoy[indivi.lpr.dropna().isin([1, 2])].all() log.info(u"Etape 4 : Rattachement des enfants aux déclarations") assert not(indivi.noindiv.duplicated().any()), "Some noindiv appear twice" lpr3_or_lpr4 = indivi['lpr'].isin([3, 4]) enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"])) assert indivi.noindiv[enf_ee].notnull().all(), " Some noindiv are not set, which will ruin next stage" assert not(indivi.noindiv[enf_ee].duplicated().any()), "Some noindiv appear twice" pere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] }) mere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] }) foyer = data.get_values(variables = ["noindiv", "zimpof"], table = replace["foyer"]) pere = pere.merge(foyer, how = "inner", on = "noindiv") mere = mere.merge(foyer, how = "inner", on = "noindiv") df = pere.merge(mere, how = "outer", on = "noindiv_enf", suffixes=('_p', '_m')) log.info(u" 4.1 : gestion des personnes dans 2 foyers") for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]: df[col] = df[col].fillna(0, inplace = True) # beacause groupby drop groups with NA in index df = df.groupby(by = ["noindiv_p", "noindiv_m", "noindiv_enf"]).sum() df.reset_index(inplace = True) df["which"] = "" df.which = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "") df.which = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "") both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull()) df.which = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere") df.which = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere") assert df.which.notnull().all(), "Some enf_ee individuals are not matched with any pere or mere" del lpr3_or_lpr4, pere, mere df.rename(columns = {"noindiv_enf": "noindiv"}, inplace = True) df['idfoy'] = where(df.which == "pere", df.noindiv_p, df.noindiv_m) df['idfoy'] = where(df.which == "mere", df.noindiv_m, df.noindiv_p) assert df["idfoy"].notnull().all() dropped = [col for col in df.columns if col not in ["idfoy", "noindiv"]] df.drop(dropped, axis = 1, inplace = True) assert not(df.duplicated().any()) df.set_index("noindiv", inplace = True, verify_integrity = True) indivi.set_index("noindiv", inplace = True, verify_integrity = True) ind_notnull = indivi["idfoy"].notnull().sum() ind_isnull = indivi["idfoy"].isnull().sum() indivi = indivi.combine_first(df) assert ind_notnull + ind_isnull == ( indivi["idfoy"].notnull().sum() + indivi["idfoy"].isnull().sum() ) indivi.reset_index(inplace = True) assert not(indivi.duplicated().any()) # MBJ: issue delt with when moving from R code to python # TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs # On suit guide méthodo erf 2003 page 135 # On supprime les conjoints FIP et les FIP de 25 ans et plus; # On conserve les enfants FIP de 19 à 24 ans; # On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un # ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète # EEC que l'on ne retrouvera pas dans les EEC suivantes. # log.info(u" 4.2 : On enlève les individus pour lesquels il manque le déclarant") fip = temporary_store['fipDat_{}'.format(year)] fip["declar"] = np.nan fip["agepf"] = np.nan fip.drop(["actrec", "year", "noidec"], axis = 1, inplace = True) fip.naia = fip.naia.astype("int32") fip.rename( columns = dict( ident = "idmen", persfip = "quifoy", zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi = "choi2", zrsti = "rsti2", zalri = "alr2"), inplace = True) is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25) # TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented # The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25 # fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"] # fip[is_fip_19_25 ,"lpr"] <- 1 # # indivi <- rbind.fill(indivi,fip[is_fip_19_25,]) indivi = concat([indivi, fip.loc[is_fip_19_25]]) del is_fip_19_25 indivi['age'] = year - indivi.naia - 1 indivi['age_en_mois'] = 12 * indivi.age + 12 - indivi.naim indivi["quimen"] = 0 indivi.quimen[indivi.lpr == 1] = 0 indivi.quimen[indivi.lpr == 2] = 1 indivi.quimen[indivi.lpr == 3] = 2 indivi.quimen[indivi.lpr == 4] = 3 indivi['not_pr_cpr'] = None # Create a new row indivi.not_pr_cpr[indivi.lpr <= 2] = False indivi.not_pr_cpr[indivi.lpr > 2] = True assert indivi.not_pr_cpr.isin([True, False]).all() log.info(u" 4.3 : Creating non pr=0 and cpr=1 idmen's") indivi.reset_index(inplace = True) test1 = indivi[['quimen', 'idmen']][indivi.not_pr_cpr].copy() test1['quimen'] = 2 j = 2 while any(test1.duplicated(['quimen', 'idmen'])): test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1 j += 1 print_id(indivi) indivi.update(test1) print_id(indivi) # indivi.set_index(['quimen']) #TODO: check relevance # TODO problème avec certains idfoy qui n'ont pas de vous log.info(u"Etape 5 : Gestion des idfoy qui n'ont pas de vous") all_ind = indivi.drop_duplicates('idfoy') with_ = indivi.loc[indivi.quifoy == 'vous', 'idfoy'] without = all_ind[~(all_ind.idfoy.isin(with_.values))] log.info(u"On cherche si le déclarant donné par la deuxième déclaration est bien un vous") # TODO: the following should be delt with at the import of the tables indivi.replace( to_replace = { 'declar2': {'NA': np.nan, '': np.nan} }, inplace = True ) has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull()) decl2_idfoy = ( indivi.loc[has_declar2, "idmen"].astype('int') * 100 + indivi.loc[has_declar2, "declar2"].str[0:2].astype('int') ) indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None) del all_ind, with_, without, has_declar2 log.info(u" 5.1 : Elimination idfoy restant") idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].drop_duplicates() indivi = indivi[indivi.idfoy.isin(idfoyList.values)] del idfoyList print_id(indivi) # Sélectionne les variables à garder pour les steps suivants myvars = [ "actrec", "age", "age_en_mois", "chpub", "encadr", "idfoy", "idmen", "nbsala", "noi", "noindiv", "prosa", "quelfic", "quifoy", "quimen", "statut", "titc", "txtppb", "wprm", "rc1rev", "maahe", ] assert len(set(myvars).difference(set(indivi.columns))) == 0, \ "Manquent les colonnes suivantes : {}".format(set(myvars).difference(set(indivi.columns))) indivi = indivi[myvars].copy() # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé # les infos provenant des déclarations) log.info(u"Etape 6 : Création des variables descriptives") log.info(u" 6.1 : variable activité") indivi['activite'] = None indivi['activite'][indivi.actrec <= 3] = 0 indivi['activite'][indivi.actrec == 4] = 1 indivi['activite'][indivi.actrec == 5] = 2 indivi['activite'][indivi.actrec == 7] = 3 indivi['activite'][indivi.actrec == 8] = 4 indivi['activite'][indivi.age <= 13] = 2 # ce sont en fait les actrec=9 log.info("{}".format(indivi['activite'].value_counts(dropna = False))) # TODO: MBJ problem avec les actrec # TODO: FIX AND REMOVE indivi.activite[indivi.actrec.isnull()] = 5 indivi.titc[indivi.titc.isnull()] = 0 assert indivi.titc.notnull().all(), u"Problème avec les titc" # On a 420 NaN pour les varaibels statut, titc etc log.info(u" 6.2 : variable statut") indivi.statut[indivi.statut.isnull()] = 0 indivi.statut = indivi.statut.astype('int') indivi.statut[indivi.statut == 11] = 1 indivi.statut[indivi.statut == 12] = 2 indivi.statut[indivi.statut == 13] = 3 indivi.statut[indivi.statut == 21] = 4 indivi.statut[indivi.statut == 22] = 5 indivi.statut[indivi.statut == 33] = 6 indivi.statut[indivi.statut == 34] = 7 indivi.statut[indivi.statut == 35] = 8 indivi.statut[indivi.statut == 43] = 9 indivi.statut[indivi.statut == 44] = 10 indivi.statut[indivi.statut == 45] = 11 assert indivi.statut.isin(range(12)).all(), u"statut value over range" log.info(u" 6.3 : variable txtppb") indivi.txtppb.fillna(0, inplace = True) assert indivi.txtppb.notnull().all() indivi.nbsala.fillna(0, inplace = True) indivi['nbsala'] = indivi.nbsala.astype('int') indivi.nbsala[indivi.nbsala == 99] = 10 assert indivi.nbsala.isin(range(11)).all() log.info(u" 6.4 : variable chpub et CSP") indivi.chpub.fillna(0, inplace = True) indivi.chpub = indivi.chpub.astype('int') indivi.chpub[indivi.chpub.isnull()] = 0 assert indivi.chpub.isin(range(11)).all() indivi['cadre'] = 0 indivi.prosa.fillna(0, inplace = True) assert indivi['prosa'].notnull().all() log.info("{}".format(indivi['encadr'].value_counts(dropna = False))) # encadr : 1=oui, 2=non indivi.encadr.fillna(2, inplace = True) indivi.encadr[indivi.encadr == 0] = 2 assert indivi.encadr.notnull().all() assert indivi.encadr.isin([1, 2]).all() indivi['cadre'][indivi.prosa.isin([7, 8])] = 1 indivi['cadre'][(indivi.prosa == 9) & (indivi.encadr == 1)] = 1 assert indivi['cadre'].isin(range(2)).all() log.info( u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence") log.info( u"nb de doublons idfam/quifam {}".format(len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]))) log.info(u"On crée les n° de personnes à charge") assert indivi['idfoy'].notnull().all() print_id(indivi) indivi['quifoy2'] = 2 indivi.quifoy2[indivi.quifoy == 'vous'] = 0 indivi.quifoy2[indivi.quifoy == 'conj'] = 1 indivi.quifoy2[indivi.quifoy == 'pac'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi.quifoy2 del indivi['quifoy2'] print_id(indivi) test2 = indivi[['quifoy', 'idfoy', 'noindiv']][indivi['quifoy'] == 2].copy() print_id(test2) j = 2 while test2.duplicated(['quifoy', 'idfoy']).any(): test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(test2) indivi = indivi.merge(test2, on = ['noindiv', 'idfoy'], how = "left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del test2, fip log.info( u"nb de doublons idfam/quifam' {}".format( len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) ) ) print_id(indivi) log.info(u"Etape 8 : création des fichiers totaux") famille = temporary_store['famc_{}'.format(year)] log.info(u" 8.1 : création de tot2 & tot3") tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner') # del famille # TODO: MBJ increase in number of menage/foyer when merging with family ... del famille control(tot2, debug = True, verbose = True) assert tot2.quifam.notnull().all() temporary_store['tot2_{}'.format(year)] = tot2 del indivi log.info(u" tot2 saved") tot2.merge(foyer, how = 'left') tot2 = tot2[tot2.idmen.notnull()].copy() print_id(tot2) tot3 = tot2 # TODO: check where they come from tot3 = tot3.drop_duplicates(subset = 'noindiv') log.info("{}".format(len(tot3))) # Block to remove any unwanted duplicated pair control(tot3, debug = True, verbose = True) tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy']) tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam']) tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen']) tot3 = tot3.drop_duplicates(subset = ['noindiv']) control(tot3) log.info(u" 8.2 : On ajoute les variables individualisables") allvars = temporary_store['ind_vars_to_remove_{}'.format(year)] vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] log.info("{}".format(len(tot3))) assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup) temporary_store['tot3_{}'.format(year)] = tot3 control(tot3) del tot2, allvars, tot3, vars2 log.info(u"tot3 sauvegardé") gc.collect()
def final(year=None, filename="test", check=True): assert year is not None temporary_store = TemporaryStore.create(file_name="erfs") ##***********************************************************************/ log.info(u'08_final: derniers réglages') ##***********************************************************************/ # # loadTmp("final.Rdata") # # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration # final$statmarit[is.na(final$statmarit)] <- 2 # table(final$statmarit, useNA='ifany') # import gc gc.collect() final = temporary_store['final_{}'.format(year)] log.info('check doublons'.format(len(final[final.duplicated(['noindiv' ])]))) final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit) # # # activite des fip # table(final[final$quelfic=="FIP","activite"],useNA="ifany") # summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] ) # # activite # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4 # # final_fip <- final[final$quelfic=="FIP",] # final_fip <- within(final_fip,{ # choi <- ifelse(is.na(choi),0,choi) # sali <- ifelse(is.na(sali),0,sali) # alr <- ifelse(is.na(alr),0,alr) # rsti <- ifelse(is.na(rsti),0,rsti) # activite <- 2 # TODO comment choisr la valeur par d?faut ? # activite <- ifelse(choi > 0,1,activite) # activite <- ifelse(sali > 0,0,activite) # activite <- ifelse(age >= 21, 2,activite) # ne peuvent être rattachés que les étudiants # }) # final[final$quelfic=="FIP",]<- final_fip # table(final_fip[,c("age","activite")]) # rm(final_fip) # # print_id(final) # saveTmp(final, file= "final.Rdata") # log.info(' gestion des FIP de final') final_fip = final[["choi", "sali", "alr", "rsti", "age"]][final.quelfic == "FIP"].copy() log.info( set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns))) for var in ["choi", "sali", "alr", "rsti"]: final_fip[var].fillna(0, inplace=True) assert final_fip[var].notnull().all( ), "Some NaN are remaining in column {}".format(var) final_fip["activite"] = 2 # TODO comment choisr la valeur par défaut ? final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite) final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite) final_fip.activite = where( final_fip.age > 21, 2, final_fip.activite) # ne peuvent être rattachés que les étudiants final.update(final_fip) temporary_store['final_{}'.format(year)] = final log.info("final has been updated with fip") menage_en_mois = temporary_store['menage_en_mois_{}'.format(year)] menage_en_mois.rename(columns=dict(ident="idmen", loym="loyer"), inplace=True) menage_en_mois["cstotpragr"] = np.floor(menage_en_mois["cstotpr"] / 10) # 2008 tau99 removed TODO: check ! and check incidence vars = [ "champm", "cstotpragr", "ddipl", "idmen", "loyer", "nbinde", "pol99", "reg", "so", "tau99", "tu99", "typmen15", "wprm", "zthabm", ] if year == 2008: vars.remove("tau99") famille_vars = [ "m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm' ] # if ("naf16pr" %in% names(menage_en_mois)) { # naf16pr <- factor(menage_en_mois$naf16pr) # levels(naf16pr) <- 0:16 # menage_en_mois$naf16pr <- as.character(naf16pr) # menage_en_mois[is.na(menage_en_mois$naf16pr), "naf16pr" ] <- "-1" # Sans objet # vars <- c(vars,"naf16pr") # } else if ("nafg17npr" %in% names(menage_en_mois)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menage_en_mois[ menage_en_mois$nafg17npr == "xx" & !is.na(menage_en_mois$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- factor(menage_en_mois$nafg17npr) # levels(nafg17npr) <- 0:17 # menage_en_mois$nafg17npr <- as.character(nafg17npr) # menage_en_mois[is.na(menage_en_mois$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # #TODO: TODO: pytohn translation needed # if "naf16pr" in menage_en_mois.columns: # naf16pr <- factor(menage_en_mois$naf16pr) # levels(naf16pr) <- 0:16 # menage_en_mois$naf16pr <- as.character(naf16pr) # menage_en_mois[is.na(menage_en_mois$naf16pr), "naf16pr" ] <- "-1" # Sans objet # vars <- c(vars,"naf16pr") # } else if ("nafg17npr" %in% names(menage_en_mois)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menage_en_mois[ menage_en_mois$nafg17npr == "xx" & !is.na(menage_en_mois$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- factor(menage_en_mois$nafg17npr) # levels(nafg17npr) <- 0:17 # menage_en_mois$nafg17npr <- as.character(nafg17npr) # menage_en_mois[is.na(menage_en_mois$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed all_vars = vars + famille_vars log.info("liste de toutes les variables : {}".format(all_vars)) log.info(menage_en_mois.info()) available_vars = list( set(all_vars).intersection(set(menage_en_mois.columns))) loyersMenages = menage_en_mois.xs(available_vars, axis=1) # # # Recodage de typmen15: modalités de 1:15 # table(loyersMenages$typmen15, useNA="ifany") # loyersMenages <- within(loyersMenages, { # typmen15[typmen15==10 ] <- 1 # typmen15[typmen15==11 ] <- 2 # typmen15[typmen15==21 ] <- 3 # typmen15[typmen15==22 ] <- 4 # typmen15[typmen15==23 ] <- 5 # typmen15[typmen15==31 ] <- 6 # typmen15[typmen15==32 ] <- 7 # typmen15[typmen15==33 ] <- 8 # typmen15[typmen15==41 ] <- 9 # typmen15[typmen15==42 ] <- 10 # typmen15[typmen15==43 ] <- 11 # typmen15[typmen15==44 ] <- 12 # typmen15[typmen15==51 ] <- 13 # typmen15[typmen15==52 ] <- 14 # typmen15[typmen15==53 ] <- 15 # }) # # # TODO: MBJ UNNECESSARY ? # # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3 # # Cependant on fait cela après avoir fait les traitement suivants # table(loyersMenages$ddipl, useNA="ifany") # # On convertit les ddipl en numeric # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl) # table(loyersMenages$ddipl, useNA="ifany") # # On met les non renseignés ie, NA et "" à sans diplome (modalité 7) # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7 # # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1 # log.info("{}".format(loyersMenages.info())) loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl) loyersMenages.ddipl = where(loyersMenages.ddipl > 1, loyersMenages.ddipl - 1, loyersMenages.ddipl) loyersMenages.ddipl.astype("int32") final['act5'] = NaN final.act5 = where(final.actrec == 1, 2, final.act5) # indépendants final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5) # salariés final.act5 = where(final.actrec == 4, 3, final.act5) # chômeur final.act5 = where(final.actrec == 7, 4, final.act5) # retraité final.act5 = where(final.actrec == 8, 5, final.act5) # autres inactifs final.act5 = where(final.actrec == 1, 2, final.act5) # indépendants final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5) # salariés final.act5 = where(final.actrec == 4, 3, final.act5) # chômeur final.act5 = where(final.actrec == 7, 4, final.act5) # retraité final.act5 = where(final.actrec == 8, 5, final.act5) # autres inactifs log.info("{}".format(final.act5.value_counts())) # assert final.act5.notnull().all(), 'there are NaN inside final.act5' # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs # final$taxe_habitation <- final$zthabm # rename zthabm to taxe_habitation # final$zthabm <- NULL # # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE) log.info(' création de final2') del final["wprm"] gc.collect() final.rename(columns=dict(zthabm="taxe_habitation"), inplace=True) # rename zthabm to taxe_habitation final2 = final.merge(loyersMenages, on="idmen", how="left") # TODO: Check log.info("{}".format(loyersMenages.head())) gc.collect() print_id(final2) # # TODO: merging with patrimoine log.info(' traitement des zones apl') import pkg_resources openfisca_france_data_location = pkg_resources.get_distribution( 'openfisca-france-data').location zone_apl_imputation_data_file_path = os.path.join( openfisca_france_data_location, 'openfisca_france_data', 'zone_apl_data', 'zone_apl', 'zone_apl_imputation_data.csv', ) apl_imp = read_csv(zone_apl_imputation_data_file_path) log.info("{}".format(apl_imp.head(10))) if year == 2008: zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1) else: zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1) for i in range(len(apl_imp["TU99"])): tu = apl_imp["TU99"][i] pol = apl_imp["POL99"][i] tau = apl_imp["TAU99"][i] reg = apl_imp["REG"][i] if year == 2008: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg) selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & ( apl_imp["REG"] == reg) else: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & ( final2["tau99"] == tau) & (final2["reg"] == reg) selection = ((apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg)) z = random.uniform(size=indices.sum()) log.info(len(z)) log.info(len(indices)) log.info(len(indices) / len(z)) probs = apl_imp[["proba_zone1", "proba_zone2"]][selection].copy() log.info(probs) log.info(probs['proba_zone1'].values) proba_zone_1 = probs['proba_zone1'].values[0] proba_zone_2 = probs['proba_zone2'].values[0] final2["zone_apl"] = 3 final2["zone_apl"][indices] = (1 + (z > proba_zone_1) + (z > (proba_zone_1 + proba_zone_2))) del indices, probs log.info(' performing cleaning on final2') log.info('{} sali nuls'.format(len(final2[final2['sali'].isnull()]))) log.info("{} individus d'âges nuls".format(len( final2[final2.age.isnull()]))) log.info("longueur de final2 avant purge : {}".format(len(final2))) # columns_w_nan = [] # for col in final2.columns: # if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all(): # columns_w_nan.append(col) # print columns_w_nan log.info('check doublons : {}'.format( len(final2[final2.duplicated(['noindiv'])]))) log.info("{}".format(final2.age.isnull().sum())) # print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string() #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons ! # final2 = final2.drop_duplicates(['noindiv']) final2 = final2[~(final2.age.isnull())] log.info(u"longueur de final2 après purge: {}".format(len(final2))) print_id(final2) # # # var <- names(foyer) # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si', 'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da') # #a2 <- setdiff(a1,names(foyer)) # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc') # # hsup feuille d'impot # # boursier pas dispo # # inactif etc : extraire cela des donn?es clca etc # # # tester activit? car 0 vaut actif # table(is.na(final2$activite),useNA="ifany") # # saveTmp(final2, file= "final2.Rdata") control(final2, debug=True) log.info(final2.age.isnull().sum()) final2 = final2.drop_duplicates(subset='noindiv') log.info(' Filter to manage the new 3-tables structures:') # On récupère les foyer, famille, ménages qui ont un chef : liste_men = unique(final2.loc[final2['quimen'] == 0, 'idmen'].values) liste_fam = unique(final2.loc[final2['quifam'] == 0, 'idfam'].values) liste_foy = unique(final2.loc[final2['quifoy'] == 0, 'idfoy'].values) #On ne conserve dans final2 que ces foyers là : log.info('final2 avant le filtrage {}'.format(len(final2))) final2 = final2.loc[final2.idmen.isin(liste_men), :] final2 = final2.loc[final2.idfam.isin(liste_fam), :] final2 = final2.loc[final2.idfoy.isin(liste_foy), :] log.info('final2 après le filtrage {}'.format(len(final2))) rectify_dtype(final2, verbose=False) # home = os.path.expanduser("~") # test_filename = os.path.join(home, filename + ".h5") # if os.path.exists(test_filename): # import warnings # import datetime # time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M') # renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5") # warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file) # test_filename = renamed_file data_frame = final2 if year == 2006: # Hack crade pur régler un problème rémanent data_frame = data_frame[data_frame.idfam != 602177906].copy() for id_variable in [ 'idfam', 'idfoy', 'idmen', 'noi', 'quifam', 'quifoy', 'quimen' ]: data_frame[id_variable] = data_frame[id_variable].astype('int') check = False if check: check_structure(data_frame) for entity_id in ['idmen', 'idfoy', 'idfam']: data_frame = id_formatter(data_frame, entity_id) set_variables_default_value(data_frame, year) return data_frame
def create_totals_second_pass(temporary_store = None, year = None): assert temporary_store is not None assert year is not None log.info(u" 5.1 : Elimination idfoy restant") # Voiture balai # On a plein d'idfoy vides, on fait 1 ménage = 1 foyer fiscal indivi = temporary_store['indivi_step_06_{}'.format(year)] idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].unique() indivi_without_idfoy = ~indivi.idfoy.isin(idfoyList) indivi.loc[indivi_without_idfoy, 'quifoy'] = "pac" indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'quifoy'] = "vous" indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'idfoy'] = ( indivi.loc[indivi_without_idfoy, "idmen"].astype('int') * 100 + 51 ) indivi.loc[indivi_without_idfoy & (indivi.quimen == 1) & (indivi.age >= 18), 'quifoy'] = "conj" del idfoyList print_id(indivi) # Sélectionne les variables à garder pour les steps suivants variables = [ "actrec", "age", "age_en_mois", "chpub", "encadr", "idfoy", "idmen", "nbsala", "noi", "noindiv", "prosa", "quelfic", "quifoy", "quimen", "statut", "titc", "txtppb", "wprm", "rc1rev", "maahe", "sali", "rsti", "choi", "alr", "wprm", ] assert set(variables).issubset(set(indivi.columns)), \ "Manquent les colonnes suivantes : {}".format(set(variables).difference(set(indivi.columns))) dropped_columns = [variable for variable in indivi.columns if variable not in variables] indivi.drop(dropped_columns, axis = 1, inplace = True) # see http://stackoverflow.com/questions/11285613/selecting-columns indivi.reset_index(inplace = True) gc.collect() # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé # les infos provenant des déclarations) log.info(u"Etape 6 : Création des variables descriptives") log.info(u" 6.1 : Variable activité") log.info(u"Variables présentes; \n {}".format(indivi.columns)) indivi['activite'] = np.nan indivi.loc[indivi.actrec <= 3, 'activite'] = 0 indivi.loc[indivi.actrec == 4, 'activite'] = 1 indivi.loc[indivi.actrec == 5, 'activite'] = 2 indivi.loc[indivi.actrec == 7, 'activite'] = 3 indivi.loc[indivi.actrec == 8, 'activite'] = 4 indivi.loc[indivi.age <= 13, 'activite'] = 2 # ce sont en fait les actrec=9 log.info("Valeurs prises par la variable activité \n {}".format(indivi['activite'].value_counts(dropna = False))) # TODO: MBJ problem avec les actrec # TODO: FIX AND REMOVE indivi.loc[indivi.actrec.isnull(), 'activite'] = 5 indivi.loc[indivi.titc.isnull(), 'titc'] = 0 assert indivi.titc.notnull().all(), \ u"Problème avec les titc" # On a 420 NaN pour les varaibels statut, titc etc log.info(u" 6.2 : Variable statut") indivi.loc[indivi.statut.isnull(), 'statut'] = 0 indivi.statut = indivi.statut.astype('int') indivi.loc[indivi.statut == 11, 'statut'] = 1 indivi.loc[indivi.statut == 12, 'statut'] = 2 indivi.loc[indivi.statut == 13, 'statut'] = 3 indivi.loc[indivi.statut == 21, 'statut'] = 4 indivi.loc[indivi.statut == 22, 'statut'] = 5 indivi.loc[indivi.statut == 33, 'statut'] = 6 indivi.loc[indivi.statut == 34, 'statut'] = 7 indivi.loc[indivi.statut == 35, 'statut'] = 8 indivi.loc[indivi.statut == 43, 'statut'] = 9 indivi.loc[indivi.statut == 44, 'statut'] = 10 indivi.loc[indivi.statut == 45, 'statut'] = 11 assert indivi.statut.isin(range(12)).all(), u"statut value over range" log.info("Valeurs prises par la variable statut \n {}".format( indivi['statut'].value_counts(dropna = False))) log.info(u" 6.3 : variable txtppb") indivi.loc[indivi.txtppb.isnull(), 'txtppb'] = 0 assert indivi.txtppb.notnull().all() indivi.loc[indivi.nbsala.isnull(), 'nbsala'] = 0 indivi.nbsala = indivi.nbsala.astype('int') indivi.loc[indivi.nbsala == 99, 'nbsala'] = 10 assert indivi.nbsala.isin(range(11)).all() log.info("Valeurs prises par la variable txtppb \n {}".format( indivi['txtppb'].value_counts(dropna = False))) log.info(u" 6.4 : variable chpub et CSP") indivi.loc[indivi.chpub.isnull(), 'chpub'] = 0 indivi.chpub = indivi.chpub.astype('int') assert indivi.chpub.isin(range(11)).all() indivi['cadre'] = 0 indivi.loc[indivi.prosa.isnull(), 'prosa'] = 0 assert indivi.prosa.notnull().all() log.info("Valeurs prises par la variable encadr \n {}".format(indivi['encadr'].value_counts(dropna = False))) # encadr : 1=oui, 2=non indivi.loc[indivi.encadr.isnull(), 'encadr'] = 2 indivi.loc[indivi.encadr == 0, 'encadr'] = 2 assert indivi.encadr.notnull().all() assert indivi.encadr.isin([1, 2]).all() indivi.loc[indivi.prosa.isin([7, 8]), 'cadre'] = 1 indivi.loc[(indivi.prosa == 9) & (indivi.encadr == 1), 'cadre'] = 1 assert indivi.cadre.isin(range(2)).all() log.info( u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence" ) log.info( u"nb de doublons idfoy/quifoy {}".format(len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])) ) log.info(u"On crée les n° de personnes à charge dans le foyer fiscal") assert indivi.idfoy.notnull().all() print_id(indivi) indivi['quifoy_bis'] = 2 indivi.loc[indivi.quifoy == 'vous', 'quifoy_bis'] = 0 indivi.loc[indivi.quifoy == 'conj', 'quifoy_bis'] = 1 indivi.loc[indivi.quifoy == 'pac', 'quifoy_bis'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi.quifoy_bis.copy() del indivi['quifoy_bis'] print_id(indivi) pac = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']].copy() print_id(pac) j = 2 while pac.duplicated(['quifoy', 'idfoy']).any(): pac.loc[pac.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(pac) indivi = indivi.merge(pac, on = ['noindiv', 'idfoy'], how = "left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del pac assert len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) == 0, \ u"Il y a {} doublons idfoy/quifoy".format( len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) ) print_id(indivi) log.info(u"Etape 8 : création des fichiers totaux") famille = temporary_store['famc_{}'.format(year)] log.info(u" 8.1 : création de tot2 & tot3") tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner') # TODO: MBJ increase in number of menage/foyer when merging with family ... del famille control(tot2, debug = True, verbose = True) assert tot2.quifam.notnull().all() temporary_store['tot2_{}'.format(year)] = tot2 del indivi log.info(u" tot2 saved") tot2 = tot2[tot2.idmen.notnull()].copy() print_id(tot2) tot3 = tot2 # TODO: check where they come from log.info("Avant élimination des doublons noindiv: {}".format(len(tot3))) tot3 = tot3.drop_duplicates(subset = 'noindiv') log.info("Après élimination des doublons noindiv: {}".format(len(tot3))) # Block to remove any unwanted duplicated pair control(tot3, debug = True, verbose = True) tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy']) log.info("Après élimination des doublons idfoy, quifoy: {}".format(len(tot3))) tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam']) log.info("Après élimination des doublons idfam, 'quifam: {}".format(len(tot3))) tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen']) log.info("Après élimination des doublons idmen, quimen: {}".format(len(tot3))) tot3 = tot3.drop_duplicates(subset = ['noindiv']) control(tot3) log.info(u" 8.2 : On ajoute les variables individualisables") allvars = temporary_store['ind_vars_to_remove_{}'.format(year)] vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] log.info("{}".format(len(tot3))) assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup) temporary_store['tot3_{}'.format(year)] = tot3 control(tot3) del tot2, allvars, tot3, vars2 gc.collect() log.info(u"tot3 sauvegardé")
def final(temporary_store = None, year = None, check = True): assert temporary_store is not None assert year is not None log.info(u'08_final: derniers réglages') # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration final = temporary_store['final_{}'.format(year)] if year == 2009: final = normalizes_roles_in_entity(final, 'foy') final = normalizes_roles_in_entity(final, 'men') if check: check_structure(final) final.statmarit.fillna(2, inplace = True) # activite des fip log.info(' gestion des FIP de final') final_fip = final.loc[final.quelfic.isin(["FIP", "FIP_IMP"]), ["choi", "sali", "alr", "rsti", "age"]].copy() log.info(set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns))) for var in ["choi", "sali", "alr", "rsti"]: final_fip[var].fillna(0, inplace = True) assert final_fip[var].notnull().all(), "Some NaN are remaining in column {}".format(var) final_fip["activite"] = 2 # TODO comment choisr la valeur par défaut ? final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite) final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite) final_fip.activite = where(final_fip.age > 21, 2, final_fip.activite) # ne peuvent être rattachés que les étudiants final.update(final_fip) temporary_store['final_{}'.format(year)] = final log.info("final has been updated with fip") menagem = temporary_store['menagem_{}'.format(year)] assert 'ident' in menagem.columns assert 'so' in menagem.columns menagem.rename( columns = dict(ident = "idmen", so = "statut_occupation"), inplace = True ) menagem["cstotpragr"] = np.floor(menagem["cstotpr"] / 10) # 2008 tau99 removed TODO: check ! and check incidence variables = [ "champm", "cstotpragr", "ddipl", "idmen", "nbinde", "pol99", "reg", "statut_occupation", "tau99", "tu99", "typmen15", "wprm", "zthabm", ] if year == 2008: variables.remove("tau99") famille_variables = ["m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm'] if 'loyer' in menagem.columns: variables.append('loyer') # if ("naf16pr" %in% names(menagem)) { # naf16pr <- factor(menagem$naf16pr) # levels(naf16pr) <- 0:16 # menagem$naf16pr <- as.character(naf16pr) # menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1" # Sans objet # variables <- c(variables,"naf16pr") # } else if ("nafg17npr" %in% names(menagem)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- factor(menagem$nafg17npr) # levels(nafg17npr) <- 0:17 # menagem$nafg17npr <- as.character(nafg17npr) # menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # # TODO: pytohn translation needed # if "naf16pr" in menagem.columns: # naf16pr <- factor(menagem$naf16pr) # levels(naf16pr) <- 0:16 # menagem$naf16pr <- as.character(naf16pr) # menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1" # Sans objet # variables <- c(variables,"naf16pr") # } else if ("nafg17npr" %in% names(menagem)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- factor(menagem$nafg17npr) # levels(nafg17npr) <- 0:17 # menagem$nafg17npr <- as.character(nafg17npr) # menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed all_variables = variables + famille_variables log.info("liste de toutes les variables : {}".format(all_variables)) log.info(menagem.info()) available_variables = list(set(all_variables).intersection(set(menagem.columns))) log.info("liste des variables à extraire de menagem: {}".format(available_variables)) loyersMenages = menagem[available_variables].copy() # # # Recodage de typmen15: modalités de 1:15 # table(loyersMenages$typmen15, useNA="ifany") # loyersMenages <- within(loyersMenages, { # typmen15[typmen15==10 ] <- 1 # typmen15[typmen15==11 ] <- 2 # typmen15[typmen15==21 ] <- 3 # typmen15[typmen15==22 ] <- 4 # typmen15[typmen15==23 ] <- 5 # typmen15[typmen15==31 ] <- 6 # typmen15[typmen15==32 ] <- 7 # typmen15[typmen15==33 ] <- 8 # typmen15[typmen15==41 ] <- 9 # typmen15[typmen15==42 ] <- 10 # typmen15[typmen15==43 ] <- 11 # typmen15[typmen15==44 ] <- 12 # typmen15[typmen15==51 ] <- 13 # typmen15[typmen15==52 ] <- 14 # typmen15[typmen15==53 ] <- 15 # }) # # # TODO: MBJ UNNECESSARY ? # # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3 # # Cependant on fait cela après avoir fait les traitement suivants # table(loyersMenages$ddipl, useNA="ifany") # # On convertit les ddipl en numeric # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl) # table(loyersMenages$ddipl, useNA="ifany") # # On met les non renseignés ie, NA et "" à sans diplome (modalité 7) # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7 # # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1 # log.info("loyersMenages \n {}".format(loyersMenages.info())) loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl) loyersMenages.ddipl = where(loyersMenages.ddipl > 1, loyersMenages.ddipl - 1, loyersMenages.ddipl) loyersMenages.ddipl = loyersMenages.ddipl.astype("int32") final['act5'] = NaN final.act5 = where(final.actrec == 1, 2, final.act5) # indépendants final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5) # salariés final.act5 = where(final.actrec == 4, 3, final.act5) # chômeur final.act5 = where(final.actrec == 7, 4, final.act5) # retraité final.act5 = where(final.actrec == 8, 5, final.act5) # autres inactifs log.info("Valeurs prises par act5: \n {}".format(final.act5.value_counts())) # assert final.act5.notnull().all(), 'there are NaN inside final.act5' # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs # final$taxe_habitation <- final$zthabm # rename zthabm to taxe_habitation # final$zthabm <- NULL # # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE) log.info(' création de final2') del final["wprm"] gc.collect() final.rename(columns = dict(zthabm = "taxe_habitation"), inplace = True) # rename zthabm to taxe_habitation final2 = final.merge(loyersMenages, on = "idmen", how = "left") # TODO: Check log.info("{}".format(loyersMenages.head())) gc.collect() print_id(final2) # # TODO: merging with patrimoine log.info(' traitement des zones apl') import pkg_resources openfisca_france_data_location = pkg_resources.get_distribution('openfisca-france-data').location zone_apl_imputation_data_file_path = os.path.join( openfisca_france_data_location, 'openfisca_france_data', 'zone_apl_data', 'zone_apl', 'zone_apl_imputation_data.csv', ) apl_imp = read_csv(zone_apl_imputation_data_file_path) log.info("{}".format(apl_imp.head(10))) if year == 2008: zone_apl = final2.xs(["tu99", "pol99", "reg"], axis = 1) else: zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis = 1) for i in range(len(apl_imp["TU99"])): tu = apl_imp["TU99"][i] pol = apl_imp["POL99"][i] tau = apl_imp["TAU99"][i] reg = apl_imp["REG"][i] if year == 2008: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg) selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg) else: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["tau99"] == tau) & (final2["reg"] == reg) selection = ( (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg) ) z = random.uniform(size=indices.sum()) log.info(len(z)) log.info(len(indices)) log.info(len(indices) / len(z)) probs = apl_imp[["proba_zone1", "proba_zone2"]][selection].copy() log.info(probs) log.info(probs['proba_zone1'].values) proba_zone_1 = probs['proba_zone1'].values[0] proba_zone_2 = probs['proba_zone2'].values[0] final2["zone_apl"] = 3 final2.zone_apl.loc[indices] = ( 1 + (z > proba_zone_1) + (z > (proba_zone_1 + proba_zone_2)) ) del indices, probs log.info(' performing cleaning on final2') log.info('{} sali nuls'.format(len(final2[final2['sali'].isnull()]))) log.info("{} individus d'âges nuls".format(len(final2[final2.age.isnull()]))) log.info("longueur de final2 avant purge : {}".format(len(final2))) # columns_w_nan = [] # for col in final2.columns: # if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all(): # columns_w_nan.append(col) # print columns_w_nan log.info('check doublons : {}'.format(len(final2[final2.duplicated(['noindiv'])]))) log.info("{}".format(final2.age.isnull().sum())) print_id(final2) # # var <- names(foyer) # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si', 'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da') # #a2 <- setdiff(a1,names(foyer)) # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc') # # hsup feuille d'impot # # boursier pas dispo # # inactif etc : extraire cela des donn?es clca etc # # # tester activit? car 0 vaut actif # table(is.na(final2$activite),useNA="ifany") # # saveTmp(final2, file= "final2.Rdata") control(final2, debug = True) temporary_store['final2'] = final2 log.info("Nombre de personne d'âge NaN: {} ".format(final2.age.isnull().sum())) final2 = final2.drop_duplicates(subset = 'noindiv') log.info(' Filter to manage the new 3-tables structures:') # On récupère les foyer, famille, ménages qui ont un chef : # On ne conserve dans final2 que ces foyers là : log.info('final2 avant le filtrage {}'.format(len(final2))) print_id(final2) liste_fam = final2.loc[final2['quifam'] == 0, 'idfam'].unique() log.info("Dropping {} famille".format((~final2.idfam.isin(liste_fam)).sum())) final2 = final2.loc[final2.idfam.isin(liste_fam)].copy() if check: check_structure(final2) temporary_store['final2bis'] = final2 liste_foy = final2.loc[final2['quifoy'] == 0, 'idfoy'].unique() log.info("Dropping {} foyers".format((~final2.idfoy.isin(liste_foy)).sum())) final2 = final2.loc[final2.idfoy.isin(liste_foy)].copy() liste_men = final2.loc[final2['quimen'] == 0, 'idmen'].unique() log.info(u"Dropping {} ménages".format((~final2.idmen.isin(liste_men)).sum())) final2 = final2.loc[final2.idmen.isin(liste_men)].copy() log.info('final2 après le filtrage {}'.format(len(final2))) print_id(final2) rectify_dtype(final2, verbose = False) # home = os.path.expanduser("~") # test_filename = os.path.join(home, filename + ".h5") # if os.path.exists(test_filename): # import warnings # import datetime # time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M') # renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5") # warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file) # test_filename = renamed_file data_frame = final2 temporary_store['final3'] = data_frame assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing' if year == 2006: # Hack crade pur régler un problème rémanent data_frame = data_frame[data_frame.idfam != 602177906].copy() for id_variable in ['idfam', 'idfoy', 'idmen', 'noi', 'quifam', 'quifoy', 'quimen']: data_frame[id_variable] = data_frame[id_variable].astype('int') gc.collect() if check: check_structure(data_frame) temporary_store['final4'] = data_frame assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing' gc.collect() for entity_id in ['idmen', 'idfoy', 'idfam']: log.info('Reformat ids: {}'.format(entity_id)) data_frame = id_formatter(data_frame, entity_id) assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing' log.info('Dealing with loyer') if 'loyer' in data_frame.columns: assert 'loyer' in data_frame.columns data_frame.loyer = data_frame.loyer * 12.0 log.info('Set variables to their default values') set_variables_default_value(data_frame, year) print_id(data_frame) temporary_store['input_{}'.format(year)] = data_frame return data_frame
def invalide(temporary_store = None, year = None): assert temporary_store is not None assert year is not None log.info(u"Entering 07_invalides: construction de la variable invalide") # # # Invalides # # #invalide = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac) # # loadTmp("final.Rdata") # # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")] # # invalides <- within(invalides,{ # # caseP <- ifelse(is.na(caseP),0,caseP) # # caseF <- ifelse(is.na(caseF),0,caseF) # # invalide <- FALSE}) # # # Les "vous" invalides # # table(invalides[,c("caseF","quifoy")],useNA="ifany") # # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"invalide"] <- TRUE # # log.info(u"Etape 1 : création de la df invalides") log.info(u" 1.1 : déclarants invalides") final = temporary_store['final_{}'.format(year)] if "invalide" in final: # on drop les colones inv et alt au cas ou on aurait déjà lancé le step07 final.drop(["invalide", "alt"], axis = 1, inplace = True) invalides_vars = [ "caseF", "caseP", "idfoy", "idmen", "noindiv", "quifoy", ] aah_eec_variables = ["rc1rev", "maahe"] aah_eec = False if set(aah_eec_variables) < set(final.columns): aah_eec = True invalides_vars += aah_eec_variables assert set(invalides_vars) < set(final.columns), \ "Variables {} are missing".format(set(invalides_vars).difference(set(final.columns))) invalides = final.xs(invalides_vars, axis = 1).copy() invalides['invalide'] = False invalides['alt'] = False for var in ["caseP", "caseF"]: assert invalides[var].notnull().all(), 'NaN values in {}'.format(var) # Les déclarants invalides invalides.loc[(invalides['caseP'] == 1) & (invalides['quifoy'] == 0), 'invalide'] = True log.info(u"Il y a {} invalides déclarants".format(invalides["invalide"].sum())) # Les personnes qui touchent l'aah dans l'enquête emploi if aah_eec: log.info(u"Inspecting rc1rev") log.info(invalides['rc1rev'].value_counts()) invalides.loc[invalides.maahe > 0, 'invalide'] = True invalides.loc[invalides.rc1rev == 4, 'invalide'] = True # TODO: vérifier le format. # TODO: invalides.rc1rev.astype("str") voir mai mahdi pour pendre en compte 14 24 etc log.info(u"Il y a {} invalides qui touchent des alloc".format(invalides["invalide"].sum())) print_id(invalides) # # # Les conjoints invalides # # # # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")] # # #men_inv_conj <- rename(men_inv_conj, c("caseF"="invalide")) # # #table(men_inv_conj[men_inv_conj$inv==1 ,c("invalide","quifoy")],useNA="ifany") # # # Il y a des caseF suir des conjoints cela vint des doubles d?clarations TODO: shoumd clean this # # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"] # # #load(indm) # # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")] # # #titi <- titi[order(titi$ident),] # # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")] # # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="invalide")) # # table(foy_inv_conj[ ,c("invalide","quifoy")],useNA="ifany") # # # On ne garde donc que les caseF des "vous" # # foy_inv_conj <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","invalide")] # # table(foy_inv_conj[ ,c("invalide")],useNA="ifany") # # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")] # # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE) # # table(invalides_conj$inv) # TODO en 2006 On en a 316 au lieu de 328 il doit y avoir de idfoy avec caseF qui n'ont pas de vous because double déclaration' # # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","invalide")] <- invalides_conj # # table(invalides[,c("invalide","quifoy")],useNA="ifany") # # rm(invalides_conj,foy_inv_conj) # On récupère les idfoy des foyers avec une caseF cochée log.info(' 1.2 : Les conjoints invalides') idfoy_inv_conj = final.idfoy[final.caseF].copy() inv_conj_condition = (invalides.idfoy.isin(idfoy_inv_conj) & (invalides.quifoy == 1)) invalides.loc[inv_conj_condition, "invalide"] = True log.info(u"Il y a {} invalides conjoints".format(len(invalides[inv_conj_condition]))) log.info(u" Il y a {} invalides déclarants et invalides conjoints".format(invalides["invalide"].sum())) # Enfants invalides et garde alternée # # # # loadTmp("pacIndiv.Rdata") # # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("invalide","noindiv")] # # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE) # # names(foy_inv_pac) # # table(foy_inv_pac[,c("typ","naia")],useNA="ifany") # # table(foy_inv_pac[,c("typ")],useNA="ifany") # # foy_inv_pac <- within(foy_inv_pac,{ # # invalide <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18)) # # alt <- (typ=="H") | (typ=="I") # # naia <- NULL # # typ <- NULL}) # # # # table(foy_inv_pac[ ,c("invalide")],useNA="ifany") # # table(foy_inv_pac[ ,c("alt")],useNA="ifany") # # invalides$alt <- 0 # # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0 # # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","invalide","alt")] <- foy_inv_pac log.info(u" 1.3 : enfants invalides et en garde alternée (variables inv et alt)") pacIndiv = temporary_store['pacIndiv_{}'.format(year)] # print pacIndiv.type_pac.value_counts() log.info(pacIndiv.type_pac.value_counts()) foy_inv_pac = invalides[['noindiv', 'invalide']][~(invalides.quifoy.isin([0, 1]))].copy() # pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]] log.info("{}".format(len(foy_inv_pac))) log.info("{}".format(pacIndiv.columns)) foy_inv_pac = foy_inv_pac.merge( pacIndiv[['noindiv', 'type_pac', 'naia']].copy(), on = 'noindiv', how = 'left', ) foy_inv_pac['invalide'] = ( foy_inv_pac['type_pac'].isin(['G', 'R', 'I']) | ( (foy_inv_pac.type_pac == "F") & ((year - foy_inv_pac.naia) > 18) ) ) foy_inv_pac['alt'] = ((foy_inv_pac.type_pac == "H") | (foy_inv_pac.type_pac == "I")) del foy_inv_pac['naia'] del foy_inv_pac['type_pac'] foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False) log.info("{}".format(foy_inv_pac['invalide'].describe())) invalides.loc[:, 'alt'] = False invalides.loc[~(invalides.quifoy.isin([0, 1])), ["alt", "invalide"]] = foy_inv_pac[["alt", "invalide"]].copy().values invalides = invalides[["noindiv", "invalide", 'alt']].copy() invalides['alt'].fillna(False, inplace = True) log.info(invalides.invalide.value_counts()) # invalides = invalides.drop_duplicates(['noindiv', 'invalide', 'alt'], take_last = True) del foy_inv_pac, pacIndiv # Initialisation des NA sur alt et inv # invalides[is.na(invalides$inv), "invalide"] <- 0 # table(invalides[,c("alt","invalide")],useNA="ifany") # # final <- merge(final, invalides[,c("noindiv","invalide","alt")], by="noindiv",all.x=TRUE) # table(final[, c("invalide","alt")],useNA="ifany") log.info('Etape 2 : Initialisation des NA sur alt et inv') assert invalides.invalide.notnull().all() & invalides.alt.notnull().all() final.set_index('noindiv', inplace = True, verify_integrity = True) invalides.set_index('noindiv', inplace = True, verify_integrity = True) final = final.join(invalides) final.reset_index(inplace = True) del invalides log.info("{}".format(final.invalide.value_counts())) control(final, debug = True) temporary_store['final_{}'.format(year)] = final log.info(u'final complétée et sauvegardée')