def create_final(year=None):
    """Build the ``final`` table for ``year`` and save it as a temporary table.

    The ``tot3`` and ``foy_ind`` temporary tables are placed side by side on
    the (``idfoy``, ``quifoy``) key, keeping the rows of ``tot3``.  Rows with a
    missing ``idmen`` are dropped, the ``sif`` table is left-merged on
    ``noindiv``, missing ``caseP``/``caseF`` values are replaced by ``False``
    and the result is saved under the name ``final``.

    Raises:
        Exception: if ``year`` is None.
    """
    if year is None:
        raise Exception("A year is needed")

    print('création de final')
    foyer_key = ['idfoy', 'quifoy']
    individualised = load_temp(name='foy_ind', year=year)
    totals = load_temp(name='tot3', year=year)

    individualised.set_index(foyer_key, inplace=True)
    totals.set_index(foyer_key, inplace=True)
    merged = concat([totals, individualised], join_axes=[totals.index], axis=1)
    merged.reset_index(inplace=True)
    # The two inputs also get their index reset in place before being released.
    individualised.reset_index(inplace=True)
    totals.reset_index(inplace=True)

    # Keep only the rows attached to a household.
    has_household = merged.idmen.notnull()
    merged = merged[has_household]
    control(merged, verbose=True)
    del totals, individualised, has_household
    gc.collect()

    # R reference: final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    print(" loading fip")
    declarations = load_temp(name='sif', year=year)
    print(declarations.columns)
    print(" update final using fip")
    merged = merged.merge(declarations, on=["noindiv"], how="left")
    # TODO: a method is needed to handle double tax returns
    print(merged.columns)
    control(merged, debug=True)

    for flag in ('caseP', 'caseF'):
        merged[flag] = merged[flag].fillna(False)
    print_id(merged)

    save_temp(merged, name='final', year=year)
    print('final sauvegardé')
    del declarations, merged
def create_final(year=None):
    """Build the ``final`` table for ``year`` and save it as a temporary table.

    The ``tot3`` and ``foy_ind`` temporary tables are placed side by side on
    the (``idfoy``, ``quifoy``) key, keeping the rows of ``tot3``.  Rows with a
    missing ``idmen`` are dropped, the ``sif`` table is left-merged on
    ``noindiv``, missing ``caseP``/``caseF`` values are replaced by ``False``
    and the result is saved under the name ``final``.

    Raises:
        Exception: if ``year`` is None.
    """
    if year is None:
        raise Exception("A year is needed")

    print('création de final')
    foyer_key = ['idfoy', 'quifoy']
    individualised = load_temp(name='foy_ind', year=year)
    totals = load_temp(name='tot3', year=year)

    individualised.set_index(foyer_key, inplace=True)
    totals.set_index(foyer_key, inplace=True)
    merged = concat([totals, individualised], join_axes=[totals.index], axis=1)
    merged.reset_index(inplace=True)
    # The two inputs also get their index reset in place before being released.
    individualised.reset_index(inplace=True)
    totals.reset_index(inplace=True)

    # Keep only the rows attached to a household.
    has_household = merged.idmen.notnull()
    merged = merged[has_household]
    control(merged, verbose=True)
    del totals, individualised, has_household
    gc.collect()

    # R reference: final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    print(" loading fip")
    declarations = load_temp(name='sif', year=year)
    print(declarations.columns)
    print(" update final using fip")
    merged = merged.merge(declarations, on=["noindiv"], how="left")
    # TODO: a method is needed to handle double tax returns
    print(merged.columns)
    control(merged, debug=True)

    for flag in ('caseP', 'caseF'):
        merged[flag] = merged[flag].fillna(False)
    print_id(merged)

    save_temp(merged, name='final', year=year)
    print('final sauvegardé')
    del declarations, merged
def create_totals(year=2006):
    """Build and save the ``tot2`` and ``tot3`` temporary tables for ``year``.

    Steps, mirroring the messages printed along the way:

    1. load ``indivim`` and overwrite the rows whose imputed income variables
       (``z...i``) differ from the original ones (``z...o``) with a copy
       flagged ``quelfic == "FIP_IMP"``;
    2. compute the tax-unit identifier ``idfoy`` and give a fresh one to
       imputed individuals that have no tax return number;
    3. make ``EE_NRT`` individuals and household heads/partners declarants;
    4. attach children to the tax unit of one of their parents, append the
       ``fipDat`` individuals aged 19 to 24, compute ``age``/``agem`` and the
       rank ``quimen`` inside the household;
    5. drop the tax units that have no declarant ("vous");
    6. recode the descriptive variables (``activite``, ``statut``, ``nbsala``,
       ``chpub``, ``cadre``...);
    7. number the dependants inside each tax unit;
    8. merge with the ``famc`` table, de-duplicate and save ``tot2``/``tot3``.

    Args:
        year: survey year, forwarded to every load/save helper.

    Raises:
        Exception: if the individuals with imputed income are not a strict
            subset of ``indivim``.
        AssertionError: if one of the consistency checks fails.
    """
    print("Creating Totals")
    print("Etape 1 : Chargement des données")

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)
    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"

    # Individuals with imputed income: some individuals are in the ERF
    # individual table but not in the 'foyer' table, a foyer has to be
    # created for them.  They are the rows where an imputed variable
    # ("z...i") differs from the original one ("z...o").
    selection = None
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        differs = indivim[var] != indivim[var[:-1] + "o"]
        selection = differs if selection is None else (differs | selection)

    renamed_columns = {
        "ident": "idmen",
        "persfip": "quifoy",
        "zsali": "sali2",  # includes the non-taxable wages of insurance agents
        "zchoi": "choi2",
        "zrsti": "rsti2",
        "zalri": "alr2",
    }
    indivi_i = indivim[selection]
    indivi_i.rename(columns=renamed_columns, inplace=True)
    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"

    # Merge them back with the other individuals.
    indivim.rename(columns=renamed_columns, inplace=True)
    if not set(indivim.noindiv) > set(indivi_i.noindiv):
        raise Exception("Individuals with imputed income are not a strict subset of indivim")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)
    indivi.reset_index(inplace=True)

    print('')
    print("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (indivi["idmen"].astype("int64") * 100 +
                       indivi["declar1"].str[0:2].convert_objects(convert_numeric=True))
    indivi.loc[fip_imp, "idfoy"] = nan

    # Some FIP (or at least individuals with imputed income) do have a tax
    # return number (why?).
    fip_has_declar = fip_imp & indivi.declar1.notnull()
    indivi["idfoy"] = where(
        fip_has_declar,
        indivi["idmen"] * 100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
        indivi["idfoy"])
    del fip_has_declar

    fip_no_declar = fip_imp & indivi.declar1.isnull()
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"])

    # Shift the duplicated identifiers by one until they are all distinct.
    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]]
    while indivi_fnd.duplicated(["idfoy"]).any():
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    # `.any()` also works when there is no such individual, whereas
    # `value_counts()[False]` raised a KeyError on an empty frame.
    assert not indivi_fnd["idfoy"].duplicated().any(), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print('')
    print('Etape 3 : Récupération des EE_NRT')
    nrt = indivi.quelfic == "EE_NRT"
    indivi["idfoy"] = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1, 2])
    adults = indivi.quelfic.isin(["EE", "EE_CAF"]) & pref_or_cref
    indivi["idfoy"] = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all()

    print('')
    print('Etape 4 : Rattachement des enfants aux déclarations')
    assert not indivi["noindiv"].duplicated().any(), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = lpr3_or_lpr4 & indivi.quelfic.isin(["EE", "EE_CAF"])
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not indivi.loc[enf_ee, "noindiv"].duplicated().any(), "Some noindiv appear twice"

    # One candidate row per child for the father and for the mother; only the
    # parents found in the 'foyer' table are kept (inner merge).
    pere = DataFrame({"noindiv_enf": indivi.noindiv.loc[enf_ee],
                      "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]})
    mere = DataFrame({"noindiv_enf": indivi.noindiv.loc[enf_ee],
                      "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]})
    foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer")
    pere = pere.merge(foyer, how="inner", on="noindiv")
    mere = mere.merge(foyer, how="inner", on="noindiv")
    del foyer
    df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m'))

    print(' 4.1 : gestion des personnes dans 2 foyers')
    # groupby drops the groups that have NA in their key, hence the fill.
    # The result of fillna is assigned directly: with inplace=True the method
    # returns None, which used to wipe the three key columns.
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(0)
    df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    # Choose the parent the child is attached to.  Every rule only touches the
    # rows it applies to, so that a later rule does not erase an earlier one.
    only_mother = df.zimpof_m.notnull() & df.zimpof_p.isnull()
    only_father = df.zimpof_p.notnull() & df.zimpof_m.isnull()
    both = df.zimpof_p.notnull() & df.zimpof_m.notnull()
    df["which"] = ""
    df["which"] = where(only_mother, "mere", df["which"])
    df["which"] = where(only_father, "pere", df["which"])
    df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", df["which"])
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", df["which"])
    assert df["which"].notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere, only_mother, only_father, both

    df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True)
    # Rows without a decision fall back on the father's identifier.
    df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)
    assert df["idfoy"].notnull().all()
    df = df[["noindiv", "idfoy"]]
    assert not df.duplicated().any()

    df.set_index("noindiv", inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)
    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    # Only the idfoy that are still missing are taken from df.
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())
    indivi.reset_index(inplace=True)
    assert not indivi.duplicated().any()

    # TODO: add the FIP children and create a household for the adult ones.
    # Following the ERF 2003 methodological guide, page 135:
    #  - FIP partners and FIP aged 25 or more are dropped;
    #  - FIP children aged 19 to 24 are kept;
    #  - FIP aged 18 or less are dropped, except those born in 2002 in a
    #    household in its 6th interview, because they were born after the EEC
    #    survey date and will not be found in the following EEC.
    print(' 4.2 : On enlève les individus pour lesquels il manque le déclarant')
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan
    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(columns=renamed_columns, inplace=True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)
    # TODO: for the time being they are kept in the household of their
    # declarant instead of getting a household of their own (idmen 61XXXX,
    # 62XXXX... in the R version).
    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25

    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    for lpr_code, quimen_code in [(1, 0), (2, 1), (3, 2), (4, 3)]:
        indivi.loc[indivi.lpr == lpr_code, "quimen"] = quimen_code

    indivi['not_pr_cpr'] = nan
    indivi.loc[indivi['lpr'] <= 2, 'not_pr_cpr'] = False
    indivi.loc[indivi['lpr'] > 2, 'not_pr_cpr'] = True

    print(" 4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace=True)
    test1 = indivi.loc[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']]
    test1['quimen'] = 2
    # Give increasing ranks to the members sharing the same household.
    j = 2
    while test1.duplicated(['quimen', 'idmen']).any():
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1

    print_id(indivi)
    indivi.update(test1)
    print_id(indivi)

    # TODO: some idfoy have no "vous"
    print('')
    print("Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    one_per_foyer = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy']
    without = one_per_foyer[~(one_per_foyer.idfoy.isin(with_.values))]

    print('On cherche si le déclarant donné par la deuxième déclaration est bien un vous')
    has_declar2 = indivi.idfoy.isin(without.idfoy.values) & indivi.declar2.notnull()
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 +
                   indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del one_per_foyer, with_, without, has_declar2

    print(' 5.1 : Elimination idfoy restant')
    idfoyList = indivi.loc[indivi['quifoy'] == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = ["noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm",
              "age", "agem", "quelfic", "actrec", "quimen",
              "nbsala", "titc", "statut", "txtppb", "chpub", "prosa", "encadr"]
    missing_vars = set(myvars).difference(set(indivi.columns))
    if missing_vars:
        print(missing_vars)
    assert not missing_vars
    indivi = indivi.loc[:, myvars]

    # TODO: the actrec of the FIP are not coded (to be done at the end, once
    # the information coming from the tax returns has been gathered).
    print('')
    print('Etape 6 : Création des variables descriptives')
    print(' 6.1 : variable activité')
    indivi['activite'] = None
    indivi.loc[indivi['actrec'] <= 3, 'activite'] = 0
    for actrec_code, activite_code in [(4, 1), (5, 2), (7, 3), (8, 4)]:
        indivi.loc[indivi['actrec'] == actrec_code, 'activite'] = activite_code
    indivi.loc[indivi['age'] <= 13, 'activite'] = 2  # these are in fact the actrec=9
    print(indivi['activite'].value_counts())
    # TODO: MBJ problem with the actrec

    indivi['titc'] = indivi['titc'].fillna(0)
    assert indivi['titc'].notnull().all(), "Problème avec les titc"

    print(' 6.2 : variable statut')
    indivi['statut'] = indivi['statut'].fillna(0)
    indivi['statut'] = indivi['statut'].astype('int')
    # Applied in this order: 11 is recoded before 45 is mapped onto 11.
    statut_recoding = [(11, 1), (12, 2), (13, 3), (21, 4), (22, 5), (33, 6),
                       (34, 7), (35, 8), (43, 9), (44, 10), (45, 11)]
    for old_code, new_code in statut_recoding:
        indivi.loc[indivi['statut'] == old_code, 'statut'] = new_code
    assert indivi['statut'].isin(range(12)).all(), "statut value over range"

    print(' 6.3 : variable txtppb')
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    # TODO: 418 FIP with a missing nbsala remain to be traced back.
    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi.loc[indivi['nbsala'] == 99, 'nbsala'] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print(' 6.4 : variable chpub et CSP')
    indivi['chpub'] = indivi['chpub'].fillna(0)
    indivi['chpub'] = indivi['chpub'].astype('int')
    print(indivi['chpub'].value_counts())
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'] = indivi['prosa'].fillna(0)
    assert indivi['prosa'].notnull().all()
    print(indivi['encadr'].value_counts())

    # encadr: 1 = yes, 2 = no
    indivi['encadr'] = indivi['encadr'].fillna(2)
    assert indivi['encadr'].notnull().all()
    indivi.loc[indivi['prosa'].isin([7, 8]), 'cadre'] = 1
    indivi.loc[(indivi['prosa'] == 9) & (indivi['encadr'] == 1), 'cadre'] = 1
    print("cadre")
    print(indivi['cadre'].value_counts())
    assert indivi['cadre'].isin(range(2)).all()

    print('')
    print("Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")
    print('%s %s' % ('nb de doublons idfam/quifam',
                     len(indivi[indivi.duplicated(['idfoy', 'quifoy'])])))

    print('On crée les n° de personnes à charge')
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    # Numeric role in the tax unit: 0 declarant, 1 partner, 2 (and more) dependant.
    indivi['quifoy2'] = 2
    for role, role_code in [('vous', 0), ('conj', 1), ('pac', 2)]:
        indivi.loc[indivi['quifoy'] == role, 'quifoy2'] = role_code
    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']]
    print_id(test2)
    # Number the dependants of a same tax unit 2, 3, 4...
    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left")
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print('%s %s' % ('nb de doublons idfam/quifam',
                     len(indivi[indivi.duplicated(['idfoy', 'quifoy'])])))
    print_id(indivi)

    print('')
    print('Etape 8 : création des fichiers totaux')
    famille = load_temp(name='famc', year=year)

    print(' 8.1 : création de tot2 & tot3')
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print(' tot2 saved')

    # NOTE(review): a `tot2.merge(foyer, how='left')` whose result was thrown
    # away used to stand here; it had no effect on tot2 and was removed.
    # Confirm that the income variables are not expected at this stage.
    tot2 = tot2[tot2.idmen.notnull()]
    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates('noindiv')
    print(len(tot3))

    # Block to remove any unwanted duplicated pair
    print(" check tot3")
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates('noindiv')
    control(tot3)

    print(' 8.2 : On ajoute les variables individualisables')
    allvars = load_temp(name='ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print(len(tot3))

    assert not tot3.duplicated(['noindiv']).any(), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % (lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print('tot3 sauvegardé')
    gc.collect()
def final(year=2006, filename="test", check=True):
    """Apply the last adjustments to the ``final`` table and store the survey.

    The function
    - loads the ``final`` temporary table, sets missing ``statmarit`` to 2 and
      recomputes ``activite`` for the rows whose ``quelfic`` is "FIP" (the
      updated table is saved back under the name ``final``);
    - builds the household table from ``menagem`` and recodes ``ddipl``;
    - derives ``act5`` from ``actrec`` and merges the household variables on
      ``idmen``;
    - draws ``zone_apl`` at random from the probabilities of the imputation
      CSV file;
    - drops rows without age, duplicated ``noindiv`` and the households,
      families and tax units that have no head;
    - writes the result in an HDF5 file under the key ``survey_<year>``.

    Args:
        year: survey year; also selects the household variables (``tau99`` is
            not used for 2008).
        filename: base name, without extension, of the HDF5 file written in
            ``DATA_SOURCES_DIR``.  If the file already exists the output goes
            to a time-stamped file name instead.
        check: when true, ``check_structure`` is run on the table before it is
            saved.
    """
    import gc
    from math import floor

    print('08_final: derniers réglages')
    gc.collect()
    final = load_temp("final", year=year)
    print('%s %s' % ('check doublons', len(final[final.duplicated(['noindiv'])])))
    # Individuals whose tax return was not found are treated as single.
    final["statmarit"] = where(final.statmarit.isnull(), 2, final.statmarit)

    # activite: 0 employed, 1 unemployed, 2 student/pupil, 3 retired,
    # 4 other inactive.
    print(' gestion des FIP de final')
    fip_vars = ["choi", "sali", "alr", "rsti"]
    final_fip = final.loc[final.quelfic == "FIP", fip_vars + ["age"]].copy()
    print(set(fip_vars).difference(set(final_fip.columns)))
    for var in fip_vars:
        final_fip[var] = final_fip[var].fillna(0)
        assert final_fip[var].notnull().all(), "some NaN are remaining in column %s" % (var)

    final_fip["activite"] = 2  # TODO: how should the default value be chosen?
    final_fip["activite"] = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip["activite"] = where(final_fip.sali > 0, 0, final_fip.activite)
    # Only students can be attached to a tax return.
    # NOTE(review): the R version this was ported from tested age >= 21.
    final_fip["activite"] = where(final_fip.age > 21, 2, final_fip.activite)

    final.update(final_fip)
    save_temp(final, name="final", year=year)
    print(' final has been updated with fip')

    menagem = load_temp(name="menagem", year=year)
    menagem.rename(columns=dict(ident="idmen", loym="loyer"), inplace=True)
    menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x / 10))

    # 2008: tau99 removed. TODO: check! and check incidence
    if year == 2008:
        menage_vars = ["loyer", "tu99", "pol99", "reg", "idmen", "so", "wprm",
                       "typmen15", "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"]
    else:
        menage_vars = ["loyer", "tu99", "pol99", "tau99", "reg", "idmen", "so", "wprm",
                       "typmen15", "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"]
    famille_vars = ["m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm']

    # TODO: the recoding of naf16pr / nafg17npr done in the R version has not
    # been translated to python.
    # TODO: in 2008 tau99 is not present, it should be provided by 02_loy;
    # is it really needed?
    all_vars = menage_vars + famille_vars
    print(all_vars)
    print(set(menagem.columns))
    available_vars = list(set(all_vars).intersection(set(menagem.columns)))
    loyersMenages = menagem[available_vars]

    # ddipl has no category 2: missing values become 7 (no diploma), then the
    # categories above 1 are shifted down by one.
    loyersMenages["ddipl"] = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl)
    loyersMenages["ddipl"] = where(loyersMenages.ddipl > 1,
                                   loyersMenages.ddipl - 1,
                                   loyersMenages.ddipl)
    # The conversion is assigned back, its result used to be discarded.
    loyersMenages["ddipl"] = loyersMenages["ddipl"].astype("int32")

    # Item assignment, so that act5 is a real column of the table.
    final["act5"] = NaN
    final["act5"] = where(final.actrec == 1, 2, final["act5"])  # self-employed
    final["act5"] = where(final.actrec.isin([2, 3]), 1, final["act5"])  # employees
    final["act5"] = where(final.actrec == 4, 3, final["act5"])  # unemployed
    final["act5"] = where(final.actrec == 7, 4, final["act5"])  # retired
    final["act5"] = where(final.actrec == 8, 5, final["act5"])  # other inactive
    print(final["act5"].value_counts())  # TODO: 29 retired?

    print(' création de final2')
    # wprm is taken from the household table in order to deal with the FIP.
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="tax_hab"), inplace=True)
    final2 = final.merge(loyersMenages, on="idmen", how="left")  # TODO: check
    print(loyersMenages.head())
    gc.collect()
    print_id(final2)
    # TODO: merging with patrimoine

    print(' traitement des zones apl')
    apl_imp = read_csv("../../zone_apl/zone_apl_imputation_data.csv")
    print(apl_imp.head(10))

    # Default zone, set once: inside the loop it erased on every iteration the
    # values drawn for the previous cells.
    final2["zone_apl"] = 3
    cells = zip(apl_imp["TU99"], apl_imp["POL99"], apl_imp["TAU99"], apl_imp["REG"])
    for tu, pol, tau, reg in cells:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg)
        if year != 2008:
            indices = indices & (final2["tau99"] == tau)
            selection = selection & (apl_imp["TAU99"] == tau)
        if not indices.any():
            # Nothing to draw for this cell (the ratio printed below would
            # also divide by zero).
            continue

        z = random.uniform(size=indices.sum())
        print(len(z))
        print(len(indices))
        print(len(indices) // len(z))
        probs = apl_imp.loc[selection, ["proba_zone1", "proba_zone2"]]
        print(probs)
        print(probs['proba_zone1'].values)
        proba_zone_1 = probs['proba_zone1'].values[0]
        proba_zone_2 = probs['proba_zone2'].values[0]
        # 1 below proba_zone_1, 2 below the sum of both probabilities, else 3.
        final2.loc[indices, "zone_apl"] = (1 + (z > proba_zone_1) +
                                           (z > (proba_zone_1 + proba_zone_2)))
        del indices, probs

    print(' performing cleaning on final2')
    print('%s %s' % ('nombre de sali nuls', len(final2[final2['sali'].isnull()])))
    print('%s %s' % ("nombre d'âges nuls", len(final2[final2.age.isnull()])))
    print('%s %s' % ("longueur de final2 avant purge", len(final2)))
    print('%s %s' % ('check doublons', len(final2[final2.duplicated(['noindiv'])])))
    print(final2.age.isnull().sum())
    # TODO: JS: some family heads and partners are duplicated, the source of
    # these duplicates has to be found!
    final2 = final2[~(final2.age.isnull())]
    print('%s %s' % ("longueur de final2 après purge", len(final2)))
    print_id(final2)

    control(final2, debug=True)
    print(final2.age.isnull().sum())
    final2 = final2.drop_duplicates('noindiv')

    print(' Filter to manage the new 3-tables structures:')
    # Households, families and tax units that do have a head...
    liste_men = unique(final2.loc[final2['quimen'] == 0, 'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam'] == 0, 'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy'] == 0, 'idfoy'].values)

    # ... are the only ones kept in final2.
    print('%s %s' % ('final2 avant le filtrage', len(final2)))
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    print('%s %s' % ('final2 après le filtrage', len(final2)))

    if check:
        check_structure(final2)

    from openfisca_france import DATA_SOURCES_DIR
    test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5")
    if os.path.exists(test_filename):
        import warnings
        import datetime
        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5")
        warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file)
        test_filename = renamed_file

    store = HDFStore(test_filename)
    try:
        store['survey_' + str(year)] = final2
    finally:
        # Close the file handle even if writing the table fails.
        store.close()
def final(year=2006, filename="test", check=True): ##***********************************************************************/ print('08_final: derniers réglages') ##***********************************************************************/ # # loadTmp("final.Rdata") # # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration # final$statmarit[is.na(final$statmarit)] <- 2 # table(final$statmarit, useNA='ifany') # import gc gc.collect() final = load_temp("final", year=year) print 'check doublons', len(final[final.duplicated(['noindiv'])]) final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit) # # # activite des fip # table(final[final$quelfic=="FIP","activite"],useNA="ifany") # summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] ) # # activite # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4 # # final_fip <- final[final$quelfic=="FIP",] # final_fip <- within(final_fip,{ # choi <- ifelse(is.na(choi),0,choi) # sali <- ifelse(is.na(sali),0,sali) # alr <- ifelse(is.na(alr),0,alr) # rsti <- ifelse(is.na(rsti),0,rsti) # activite <- 2 # TODO comment choisr la valeur par d?faut ? # activite <- ifelse(choi > 0,1,activite) # activite <- ifelse(sali > 0,0,activite) # activite <- ifelse(age >= 21, 2,activite) # ne peuvent être rattach?s que les ?tudiants # }) # final[final$quelfic=="FIP",]<- final_fip # table(final_fip[,c("age","activite")]) # rm(final_fip) # # print_id(final) # saveTmp(final, file= "final.Rdata") # print ' gestion des FIP de final' final_fip = final.loc[final.quelfic=="FIP", ["choi", "sali", "alr", "rsti","age"]] print set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns)) for var in ["choi", "sali", "alr", "rsti"]: final_fip[var].fillna(0, inplace=True) assert final_fip[var].notnull().all(), "some NaN are remaining in column %s" %(var) final_fip["activite"] = 2 # TODO comment choisr la valeur par défaut ? 
final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite) final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite) final_fip.activite = where(final_fip.age > 21, 2, final_fip.activite) # ne peuvent être rattach?s que les ?tudiants final.update(final_fip) save_temp(final, name="final", year=year) print ' final has been updated with fip' # loadTmp("final.Rdata") # load(menm) # menagem <- rename(menagem, c("ident"="idmen","loym"="loyer")) # menagem$cstotpragr <- floor(menagem$cstotpr/10) # from math import floor menagem = load_temp(name="menagem", year=year) menagem.rename(columns=dict(ident="idmen",loym="loyer"), inplace=True) menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x/10)) # # # 2008 tau99 removed TODO: check ! and check incidence # if (year == "2008") { # vars <- c("loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm") # } else { # vars <- c("loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm") # } # # famille_vars <- c("m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm') if year == 2008: vars = ["loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm"] else: vars = ["loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm"] famille_vars = ["m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm'] # if ("naf16pr" %in% names(menagem)) { # naf16pr <- factor(menagem$naf16pr) # levels(naf16pr) <- 0:16 # menagem$naf16pr <- as.character(naf16pr) # menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1" # Sans objet # vars <- c(vars,"naf16pr") # } else if ("nafg17npr" %in% names(menagem)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- 
factor(menagem$nafg17npr) # levels(nafg17npr) <- 0:17 # menagem$nafg17npr <- as.character(nafg17npr) # menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # #TODO: TODO: pytohn translation needed # if "naf16pr" in menagem.columns: # naf16pr <- factor(menagem$naf16pr) # levels(naf16pr) <- 0:16 # menagem$naf16pr <- as.character(naf16pr) # menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1" # Sans objet # vars <- c(vars,"naf16pr") # } else if ("nafg17npr" %in% names(menagem)) { # # TODO: pb in 2008 with xx # if (year == "2008"){ # menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00" # } # nafg17npr <- factor(menagem$nafg17npr) # levels(nafg17npr) <- 0:17 # menagem$nafg17npr <- as.character(nafg17npr) # menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1" # Sans objet # } # # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed # all_vars <- union(vars,famille_vars) # available_vars <- all_vars[union(vars,famille_vars) %in% names(menagem)] # loyersMenages <- menagem[,available_vars] # all_vars = vars + famille_vars print all_vars print set(menagem.columns) available_vars = list( set(all_vars).intersection(set(menagem.columns))) loyersMenages = menagem.xs(available_vars,axis=1) # # # Recodage de typmen15: modalités de 1:15 # table(loyersMenages$typmen15, useNA="ifany") # loyersMenages <- within(loyersMenages, { # typmen15[typmen15==10 ] <- 1 # typmen15[typmen15==11 ] <- 2 # typmen15[typmen15==21 ] <- 3 # typmen15[typmen15==22 ] <- 4 # typmen15[typmen15==23 ] <- 5 # typmen15[typmen15==31 ] <- 6 # typmen15[typmen15==32 ] <- 7 # typmen15[typmen15==33 ] <- 8 # typmen15[typmen15==41 ] <- 9 # typmen15[typmen15==42 ] <- 10 # typmen15[typmen15==43 ] <- 11 # typmen15[typmen15==44 ] <- 12 # typmen15[typmen15==51 ] <- 13 # typmen15[typmen15==52 ] <- 14 # typmen15[typmen15==53 ] <- 15 # }) # # # TODO: MBJ UNNECESSARY ? 
# # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3 # # Cependant on fait cela après avoir fait les traitement suivants # table(loyersMenages$ddipl, useNA="ifany") # # On convertit les ddipl en numeric # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl) # table(loyersMenages$ddipl, useNA="ifany") # # On met les non renseignés ie, NA et "" à sans diplome (modalité 7) # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7 # # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1 # loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl) loyersMenages.ddipl = where(loyersMenages.ddipl>1, loyersMenages.ddipl-1, loyersMenages.ddipl) loyersMenages.ddipl.astype("int32") # # table(final$actrec,useNA="ifany") # final$act5 <- NA # final <- within(final, { # act5[which(actrec==1) ] <- 2 # ind?pendants # act5[which(actrec==2) ] <- 1 # salari?s # act5[which(actrec==3) ] <- 1 # salari?s # act5[which(actrec==4) ] <- 3 # ch?meur # act5[which(actrec==7) ] <- 4 # retrait? # act5[which(actrec==8) ] <- 5 # autres inactifs # }) # table(final$act5,useNA="ifany") # final.act5 = NaN final.act5 = where(final.actrec==1, 2, final.act5) # indépendants final.act5 = where(final.actrec.isin([2,3]), 1, final.act5) # salariés final.act5 = where(final.actrec==4, 3, final.act5) # chômeur final.act5 = where(final.actrec==7, 4, final.act5) # retraité final.act5 = where(final.actrec==8, 5, final.act5) # autres inactifs print final.act5.value_counts() # TODO : 29 retraités ? 
# assert final.act5.notnull().all(), 'there are NaN inside final.act5' # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs # final$tax_hab <- final$zthabm # rename zthabm to tax_hab # final$zthabm <- NULL # # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE) print ' création de final2' del final["wprm"] gc.collect() final.rename(columns=dict(zthabm="tax_hab"), inplace=True) # rename zthabm to tax_hab final2 = final.merge(loyersMenages, on="idmen", how="left") # TODO: Check print loyersMenages.head() gc.collect() print_id(final2) # # # TODO: merging with patrimoine # rm(menagem,final) # # # table(final2$activite,useNA="ifany") # # table(final2$alt,useNA="ifany") # # saveTmp(final2, file= "final2.Rdata") # # loadTmp("final2.Rdata") # names(final2) # print_id(final2) # # # # set zone_apl using zone_apl_imputation_data # apl_imp <- read.csv("./zone_apl/zone_apl_imputation_data.csv") # # if (year == "2008") { # zone_apl <- final2[, c("tu99", "pol99", "reg")] # } else { # zone_apl <- final2[, c("tu99", "pol99", "tau99", "reg")] # } # # for (i in 1:length(apl_imp[,"TU99"])) { # tu <- apl_imp[i,"TU99"] # pol <- apl_imp[i,"POL99"] # tau <- apl_imp[i,"TAU99"] # reg <- apl_imp[i,"REG"] # # print(c(tu,pol,tau,reg)) # # if (year == "2008") { # indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["reg"] == reg) # selection <- (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["REG"] == reg) # } else { # indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["tau99"] == tau & final2["reg"] == reg) # selection <- (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["TAU99"] == tau & apl_imp["REG"] == reg) # } # z <- runif(sum(indices)) # probs <- apl_imp[selection , c("proba_zone1", "proba_zone2")] # # print(probs) # final2[indices,"zone_apl"] <- 1 + (z>probs[,'proba_zone1']) + (z>(probs[,'proba_zone1']+probs[,'proba_zone2'])) # rm(indices, probs) # } # print ' traitement des zones apl' apl_imp 
= read_csv("../../zone_apl/zone_apl_imputation_data.csv") print apl_imp.head(10) if year == 2008: zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1) else: zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1) for i in range(len(apl_imp["TU99"])): tu = apl_imp["TU99"][i] pol = apl_imp["POL99"][i] tau = apl_imp["TAU99"][i] reg = apl_imp["REG"][i] if year == 2008: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg) selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg) else: indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["tau99"] == tau) & (final2["reg"] == reg) selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg) z = random.uniform(size=indices.sum()) print len(z) print len(indices) print len(indices)/len(z) probs = apl_imp.loc[selection , ["proba_zone1", "proba_zone2"]] print probs print probs['proba_zone1'].values proba_zone_1 = probs['proba_zone1'].values[0] proba_zone_2 = probs['proba_zone2'].values[0] final2["zone_apl"] = 3 final2["zone_apl"][indices] = ( 1 + (z>proba_zone_1) + (z>(proba_zone_1 + proba_zone_2))) del indices, probs # control(final2, verbose=True, debug=True, verbose_length=15) print ' performing cleaning on final2' print 'nombre de sali nuls', len(final2[final2['sali'].isnull()]) print "nombre d'âges nuls", len(final2[final2.age.isnull()]) print "longueur de final2 avant purge", len(final2) # columns_w_nan = [] # for col in final2.columns: # if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all(): # columns_w_nan.append(col) # print columns_w_nan print 'check doublons', len(final2[final2.duplicated(['noindiv'])]) print final2.age.isnull().sum() # print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string() #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons ! 
# final2 = final2.drop_duplicates(['noindiv']) final2 = final2[~(final2.age.isnull())] print "longueur de final2 après purge", len(final2) print_id(final2) # # # var <- names(foyer) # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si', 'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da') # #a2 <- setdiff(a1,names(foyer)) # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc') # # hsup feuille d'impot # # boursier pas dispo # # inactif etc : extraire cela des donn?es clca etc # # # tester activit? 
car 0 vaut actif # table(is.na(final2$activite),useNA="ifany") # # saveTmp(final2, file= "final2.Rdata") control(final2, debug=True) print final2.age.isnull().sum() final2 = final2.drop_duplicates(cols='noindiv') print ' Filter to manage the new 3-tables structures:' # On récupère les foyer, famille, ménages qui ont un chef : liste_men = unique(final2.loc[final2['quimen']==0,'idmen'].values) liste_fam = unique(final2.loc[final2['quifam']==0,'idfam'].values) liste_foy = unique(final2.loc[final2['quifoy']==0,'idfoy'].values) #On ne conserve dans final2 que ces foyers là : print 'final2 avant le filtrage' ,len(final2) final2 = final2.loc[final2.idmen.isin(liste_men), :] final2 = final2.loc[final2.idfam.isin(liste_fam), :] final2 = final2.loc[final2.idfoy.isin(liste_foy), :] print 'final2 après le filtrage', len(final2) if check: check_structure(final2) from openfisca_france import DATA_SOURCES_DIR test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5") if os.path.exists(test_filename): import warnings import datetime time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M') renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5") warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file) test_filename = renamed_file store = HDFStore(test_filename) store['survey_'+ str(year)] = final2
def create_fip(year = 2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len))/5 print "il ya a au maximum %s 
pac par foyer" %nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns) fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1,nb_pac_max+1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)] fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration','naia','type_pac']) # TODO: check if useful fip.set_index(["declaration","pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG),'/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN fip.update(tyFG) print 'enfants F & G 
traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')] pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # 
We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1' , len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum() print 'nb de NaN', 
pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]] individec1 = individec1.rename(columns={'declar1':'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]] individec2.rename(columns={'declar2':'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip 
= concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf']<=15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi','ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100*fip['ident'] + fip['noidec'] fip['noindiv'] = 100*fip['ident'] + fip['noi'] fip['type_pac'] = 0 ; fip['key'] = 0 print fip.duplicated('noindiv').value_counts() save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip 
sauvegardé'
def foyer_all(year=2006): ## On ajoute les cases de la déclaration #foyer_all <- LoadIn(erfFoyFil) data = DataCollection(year=year) foyer_all = data.get_values(table="foyer") ## on ne garde que les cases de la déclaration ('fxzz') #vars <- names(foyer_all) #vars <- c("noindiv", vars[grep("^f[0-9]", vars)]) # vars = foyer_all.columns regex = re.compile("^f[0-9]") vars = [x for x in vars if regex.match(x)] #foyer <- foyer_all[vars] #rm(foyer_all) #gc() #noindiv <- list(foyer$noindiv) # foyer = foyer_all[vars + ["noindiv"]] del foyer_all gc.collect() # ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum') #print foyer.describe()["f1aj"].to_string() foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum) # #print foyer.describe()["f1aj"].to_string() #print foyer.describe()["noindiv"].to_string() # print_id(foyer) ## noindiv have been summed over original noindiv which are now in Group.1 #foyer$noindiv <- NULL #foyer <- rename(foyer, c(Group.1 = 'noindiv')) ## problème avec les dummies () # #saveTmp(foyer, file= "foyer_aggr.Rdata") # # ############################################################################# ## On récupère les variables individualisables #loadTmp("foyer_aggr.Rdata") # #individualisable <- function(table, var, vars, qui){ # print(var) # print(vars) # temp <- table[c('noindiv', vars)] # n = length(qui) # names(temp)[2:(n+1)] <- qui # temp$newvar <- NULL # temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy') # temp2 <- transform(temp2, quifoy = as.character(quifoy)) # temp2 <- transform(temp2, noindiv = as.character(noindiv)) # str(temp2) # rename(temp2, c(value = var)) #} var_dict = { 'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'], 'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'], 'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'], 'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'], 'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'], 
'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'], 'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'], 'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'], 'f1tv': ['f1tv', 'f1uv'], 'f1tw': ['f1tw', 'f1uw'], 'f1tx': ['f1tx', 'f1ux'], 'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'], 'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'], 'frag_exon': ['f5hn', 'f5in', 'f5jn'], 'frag_impo': ['f5ho', 'f5io', 'f5jo'], 'arag_exon': ['f5hb', 'f5ib', 'f5jb'], 'arag_impg': ['f5hc', 'f5ic', 'f5jc'], 'arag_defi': ['f5hf', 'f5if', 'f5jf'], 'nrag_exon': ['f5hh', 'f5ih', 'f5jh'], 'nrag_impg': ['f5hi', 'f5ii', 'f5ji'], 'nrag_defi': ['f5hl', 'f5il', 'f5jl'], 'nrag_ajag': ['f5hm', 'f5im', 'f5jm'], 'mbic_exon': ['f5kn', 'f5ln', 'f5mn'], 'abic_exon': ['f5kb', 'f5lb', 'f5mb'], 'nbic_exon': ['f5kh', 'f5lh', 'f5mh'], 'mbic_impv': ['f5ko', 'f5lo', 'f5mo'], 'mbic_imps': ['f5kp', 'f5lp', 'f5mp'], 'abic_impn': ['f5kc', 'f5lc', 'f5mc'], 'abic_imps': ['f5kd', 'f5ld', 'f5md'], 'nbic_impn': ['f5ki', 'f5li', 'f5mi'], 'nbic_imps': ['f5kj', 'f5lj', 'f5mj'], 'abic_defn': ['f5kf', 'f5lf', 'f5mf'], 'abic_defs': ['f5kg', 'f5lg', 'f5mg'], 'nbic_defn': ['f5kl', 'f5ll', 'f5ml'], 'nbic_defs': ['f5km', 'f5lm', 'f5mm'], 'nbic_apch': ['f5ks', 'f5ls', 'f5ms'], 'macc_exon': ['f5nn', 'f5on', 'f5pn'], 'aacc_exon': ['f5nb', 'f5ob', 'f5pb'], 'nacc_exon': ['f5nh', 'f5oh', 'f5ph'], 'macc_impv': ['f5no', 'f5oo', 'f5po'], 'macc_imps': ['f5np', 'f5op', 'f5pp'], 'aacc_impn': ['f5nc', 'f5oc', 'f5pc'], 'aacc_imps': ['f5nd', 'f5od', 'f5pd'], 'aacc_defn': ['f5nf', 'f5of', 'f5pf'], 'aacc_defs': ['f5ng', 'f5og', 'f5pg'], 'nacc_impn': ['f5ni', 'f5oi', 'f5pi'], 'nacc_imps': ['f5nj', 'f5oj', 'f5pj'], 'nacc_defn': ['f5nl', 'f5ol', 'f5pl'], 'nacc_defs': ['f5nm', 'f5om', 'f5pm'], 'mncn_impo': ['f5ku', 'f5lu', 'f5mu'], 'cncn_bene': ['f5sn', 'f5ns', 'f5os'], 'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check 'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'], 'abnc_exon': ['f5qb', 'f5rb', 'f5sb'], 'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'], 'mbnc_impo': ['f5hq', 'f5iq', 
'f5jq'], 'abnc_impo': ['f5qc', 'f5rc', 'f5sc'], 'abnc_defi': ['f5qe', 'f5re', 'f5se'], 'nbnc_impo': ['f5qi', 'f5ri', 'f5si'], 'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'], # 'ebic_impv' : ['f5ta','f5ua', 'f5va'], # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'], 'mbic_mvct': ['f5hu'], 'macc_mvct': ['f5iu'], 'mncn_mvct': ['f5ju'], 'mbnc_mvct': ['f5kz'], 'frag_pvct': ['f5hw', 'f5iw', 'f5jw'], 'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'], 'macc_pvct': ['f5nx', 'f5ox', 'f5px'], 'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'], 'mncn_pvct': ['f5ky', 'f5ly', 'f5my'], 'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'], 'macc_mvlt': ['f5nr', 'f5or', 'f5pr'], 'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'], 'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'], 'frag_pvce': ['f5hx', 'f5ix', 'f5jx'], 'arag_pvce': ['f5he', 'f5ie', 'f5je'], 'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'], 'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'], 'abic_pvce': ['f5ke', 'f5le', 'f5me'], 'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'], 'macc_pvce': ['f5nq', 'f5oq', 'f5pq'], 'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'], 'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'], 'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'], 'cncn_pvce': ['f5so', 'f5nt', 'f5ot'], 'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'], 'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'], 'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'], 'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'] } # (déménagement) uniquement en 2006 # #varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')), # list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')), # list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')), # ...... 
# list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')), # list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')), # list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')), # list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006 # vars_sets = [set(var_list) for var_list in var_dict.values()] eligible_vars = (set().union(*vars_sets)).intersection( set(list(foyer.columns))) print "From %i variables, we keep %i eligibles variables" % (len( set().union(*vars_sets)), len(eligible_vars)) qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3'] err = 0 err_vars = {} foy_ind = DataFrame() for individual_var, foyer_vars in var_dict.iteritems(): try: selection = foyer[foyer_vars + ["noindiv"]] except KeyError: # Testing if at least one variable of foyers_vars is in the eligible list presence = [x in eligible_vars for x in foyer_vars] var_present = any(presence) if not var_present: print individual_var + " is not present" continue else: # Shrink the list foyer_vars_cleaned = [ var for var, present in zip(foyer_vars, presence) if present is True ] selection = foyer[foyer_vars_cleaned + ["noindiv"]] # Reshape the dataframe selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True) selection.set_index("noindiv", inplace=True) selection.columns.name = "quifoy" selection = selection.stack() selection.name = individual_var selection = selection.reset_index( ) # A Series cannot see its index resetted to produce a DataFrame selection = selection.set_index(["quifoy", "noindiv"]) selection = selection[selection[individual_var] != 0] # print len(selection) if len(foy_ind) == 0: foy_ind = selection else: foy_ind = concat([foy_ind, selection], axis=1, join='outer') foy_ind.reset_index(inplace=True) print "foy_ind" print foy_ind.describe().to_string() #not_first <- FALSE #allvars = c() #for (v in varlist){ # vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present # if (length(vars) > 0) { # allvars <- c(allvars, vars) # qui <- c('vous', 'conj', 
'pac1', 'pac2', 'pac3') # n <- length(vars) # temp <- individualisable(foyer, v[[1]], vars, qui[1:n]) # if (not_first) { # print('merge') # foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE) # names(foy_ind) # } # else { # print('init') # foy_ind <- temp # not_first <- TRUE # } # } #} ind_vars_to_remove = Series(list(eligible_vars)) save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year) foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True) print_id(foy_ind) foy_ind['quifoy'][foy_ind['quifoy'] == 'vous'] = 0 foy_ind['quifoy'][foy_ind['quifoy'] == 'conj'] = 1 foy_ind['quifoy'][foy_ind['quifoy'] == 'pac1'] = 2 foy_ind['quifoy'][foy_ind['quifoy'] == 'pac2'] = 3 foy_ind['quifoy'][foy_ind['quifoy'] == 'pac3'] = 4 assert foy_ind['quifoy'].isin( range(5)).all(), 'présence de valeurs aberrantes dans quifoy' print 'saving foy_ind' print_id(foy_ind) save_temp(foy_ind, name="foy_ind", year=year) show_temp() return
def create_fip(year=2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5 print "il ya a au maximum %s 
pac par foyer" % nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns) fip.fillna( NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1, nb_pac_max + 1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)] fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * (i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration', 'naia', 'type_pac']) # TODO: check if useful fip.set_index(["declaration", "pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G' ])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG), '/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN 
fip.update(tyFG) print 'enfants F & G traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']] del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip'] == 'pac')] pac['naia'] = pac['naia'].astype( 'int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' % ( pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = 
fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1', len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated( ['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', 
pacInd.duplicated('noindiv').sum() print 'nb de NaN', pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype( 'float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")] individec1 = individec1.loc[:, [ "declar1", "noidec", "ident", "rga", "ztsai", "ztsao" ]] individec1 = individec1.rename(columns={'declar1': 'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")] individec2 = individec2.loc[:, [ "declar2", "noidec", "ident", "rga", "ztsai", "ztsao" ]] individec2.rename(columns={'declar2': 'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() 
fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip = concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype( 'float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <= 20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf'] <= 15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi', 'ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100 * fip['ident'] + fip['noidec'] fip['noindiv'] = 100 * fip['ident'] + fip['noi'] fip['type_pac'] = 0 fip['key'] = 0 print fip.duplicated('noindiv').value_counts() 
save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip sauvegardé'
def invalide(year = 2006):
    """
    Build the 'inv' and 'alt' columns and merge them into the 'final' table.

    NOTE: the function is currently disabled. It prints a message and
    returns immediately, so every statement after the first ``return`` is
    unreachable and is kept only as work in progress.

    The unreachable part reads the 'final' and 'pacIndiv' temporary tables
    through ``load_temp``, flags individuals in an ``inv`` column (declarants,
    spouses, then dependents), sets an ``alt`` column for dependents, merges
    both columns into 'final' on ``noindiv`` and saves 'final' again.

    Parameters
    ----------
    year : int
        Survey year passed to ``load_temp`` / ``save_temp``.
    """
    print 'Entering 07_invalides: construction de la variable invalide NOTFUNCTIONNAL NAOW'
    # Early exit: everything below is dead code until this return is removed.
    return

    # Original R code, kept as reference:
    # # # Invalides
    # # #inv = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac)
    # # loadTmp("final.Rdata")
    # # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")]
    # # invalides <- within(invalides,{
    # #   caseP <- ifelse(is.na(caseP),0,caseP)
    # #   caseF <- ifelse(is.na(caseF),0,caseF)
    # #   inv <- FALSE})
    # # # The disabled "vous"
    # # table(invalides[,c("caseF","quifoy")],useNA="ifany")
    # # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"inv"] <- TRUE
    # #

    print ''
    print 'Etape 1 : création de la df invalides'
    print ' 1.1 : déclarants invalides'
    final = load_temp(name="final", year=year)
    invalides = final.xs(["noindiv","idmen","caseP","caseF","idfoy","quifoy","maahe","rc1rev"], axis=1)

    print invalides['rc1rev'].value_counts()
    for var in ["caseP", "caseF"]:
        assert invalides[var].notnull().all(), 'présence de NaN dans %s' %(var)

    # Disabled declarants: caseP ticked and quifoy == 0
    invalides['inv'] = False
    invalides['inv'][(invalides['caseP']==1) & (invalides['quifoy']==0)] = True
    print invalides["inv"].sum(), " invalides déclarants"

    # People receiving the aah in the employment survey
    invalides['inv'][(invalides['maahe']>0)] = True
    invalides['inv'][(invalides['rc1rev']==4)] = True #TODO: check the format.
    print invalides["inv"].sum(), " invalides qui touchent des alloc"

    print_id(invalides)

    # Original R code, kept as reference:
    # # # Disabled spouses
    # #
    # # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")]
    # # #men_inv_conj <- rename(men_inv_conj, c("caseF"="inv"))
    # # #table(men_inv_conj[men_inv_conj$inv==1 ,c("inv","quifoy")],useNA="ifany")
    # # # There are caseF set on spouses; this comes from double declarations TODO: should clean this
    # # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"]
    # # #load(indm)
    # # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")]
    # # #titi <- titi[order(titi$ident),]
    # # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")]
    # # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="inv"))
    # # table(foy_inv_conj[ ,c("inv","quifoy")],useNA="ifany")
    # # # So only the caseF of the "vous" are kept
    # # foy_inv_conj <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","inv")]
    # # table(foy_inv_conj[ ,c("inv")],useNA="ifany")
    # # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")]
    # # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE)
    # # table(invalides_conj$inv) # TODO in 2006 we get 316 instead of 328: there must be idfoy with caseF that have no "vous" because of double declarations
    # # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","inv")] <- invalides_conj
    # # table(invalides[,c("inv","quifoy")],useNA="ifany")
    # # rm(invalides_conj,foy_inv_conj)

    # Retrieve the idfoy of the foyers with a ticked caseF
    print ' 1.2 : Les conjoints invalides'
    idfoy_inv_conj = final["idfoy"][final["caseF"]]
    inv_conj_condition = (invalides["idfoy"].isin(idfoy_inv_conj) & (invalides["quifoy"]==1))
    invalides["inv"][inv_conj_condition] = True

    print len(invalides[inv_conj_condition]), "invalides conjoints"
    print invalides["inv"].sum(), " invalides déclarants et invalides conjoints"

    # Original R code, kept as reference:
    # # # Disabled children and alternating custody
    # #
    # # loadTmp("pacIndiv.Rdata")
    # # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("inv","noindiv")]
    # # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE)
    # # names(foy_inv_pac)
    # # table(foy_inv_pac[,c("typ","naia")],useNA="ifany")
    # # table(foy_inv_pac[,c("typ")],useNA="ifany")
    # # foy_inv_pac <- within(foy_inv_pac,{
    # #   inv  <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18))
    # #   alt  <- (typ=="H") | (typ=="I")
    # #   naia <- NULL
    # #   typ  <- NULL})
    # #
    # # table(foy_inv_pac[ ,c("inv")],useNA="ifany")
    # # table(foy_inv_pac[ ,c("alt")],useNA="ifany")
    # # invalides$alt <- 0
    # # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0
    # # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","inv","alt")] <- foy_inv_pac

    print ' 1.3 : enfants invalides et garde alternée'
    pacIndiv = load_temp(name='pacIndiv', year=year)
    print pacIndiv.type_pac.value_counts()

    # Dependents only: quifoy is neither 0 nor 1
    foy_inv_pac = invalides.loc[~(invalides.quifoy.isin([0, 1])), ['noindiv', 'inv']]
    # pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]]
    print len(foy_inv_pac)

    print pacIndiv.columns
    foy_inv_pac = foy_inv_pac.merge(pacIndiv.loc[:, ['noindiv', 'type_pac', 'naia']],
                                    on='noindiv', how='left')
    # inv: type G, R or I, or type F with year - naia > 18
    foy_inv_pac['inv'] = (foy_inv_pac['type_pac'].isin(['G','R','I']) |
                          ((foy_inv_pac['type_pac']=="F") & ((year - foy_inv_pac['naia'])>18)))
    # alt: type H or I
    foy_inv_pac['alt'] = ((foy_inv_pac['type_pac']=="H") | (foy_inv_pac['type_pac']=="I"))
    foy_inv_pac['naia'] = None
    foy_inv_pac['type_pac'] = None
    foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False)

    print foy_inv_pac['inv'].describe()
    invalides['alt'] = 0
    # NOTE(review): 'alt' was already filled with False two statements above,
    # so this assignment looks redundant -- confirm before removing.
    foy_inv_pac['alt'][foy_inv_pac.alt.isnull()] = 0
    invalides = invalides.merge(foy_inv_pac, on=["noindiv","inv","alt"])
    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
    # NOTE(review): the commented lines below are leftovers of an unresolved
    # merge conflict; the statements on both sides merge `invalides` with
    # `foy_inv_pac`, so one of the two variants should probably go -- confirm.
    # =======
    # print foy_inv_pac.inv.value_counts() # TODO: JS : too few True in there
    # print foy_inv_pac.alt.value_counts()
    # #
    # #
    # print len(invalides), len(foy_inv_pac)
    # print invalides.inv.value_counts()
    # >>>>>>> 67cd9a43177cf3f6f72521cda59dae02485df1e3

    invalides = invalides.merge(foy_inv_pac, on='noindiv', how='left')
    invalides['inv'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])
    # NOTE(review): 'alt' is computed from the inv_x / inv_y columns exactly
    # like 'inv' above -- presumably alt_x / alt_y were intended; confirm.
    invalides['alt'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])

    invalides = invalides.loc[:, ["noindiv","idmen","caseP","caseF","idfoy","quifoy", "inv", 'alt']]
    invalides['alt'].fillna(False, inplace=True)

    print invalides.inv.value_counts()
    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
    del foy_inv_pac, pacIndiv

    # Original R code, kept as reference:
    # # # Initialisation of the NA on alt and inv
    # # invalides[is.na(invalides$inv), "inv"] <- 0
    # # table(invalides[,c("alt","inv")],useNA="ifany")
    # #
    # # final <- merge(final, invalides[,c("noindiv","inv","alt")], by="noindiv",all.x=TRUE)
    # # table(final[, c("inv","alt")],useNA="ifany")

    print ''
    print 'Etape 2 : Initialisation des NA sur alt et inv'
    assert invalides["inv"].notnull().all() & invalides.alt.notnull().all()
    # Left merge: every row of `final` is kept
    final = final.merge(invalides.loc[:, ['noindiv', 'inv', 'alt']], on='noindiv', how='left')
    del invalides

    print final.inv.value_counts()
    control(final, debug=True)

    save_temp(final, name='final', year=year)
    print 'final complétée et sauvegardée'
def create_totals(year=2006): print "Creating Totals" print "Etape 1 : Chargement des données" data = DataCollection(year=year) indivim = load_temp(name="indivim", year=year) assert indivim.duplicated(['noindiv' ]).any() == False, "Présence de doublons" # Deals individuals with imputed income : some individuals are in 'erf individu table' but # not in the 'foyer' table. We need to create a foyer for them. selection = Series() for var in [ "zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci" ]: varo = var[:-1] + "o" test = indivim[var] != indivim[varo] if len(selection) == 0: selection = test else: selection = (test) | (selection) indivi_i = indivim[selection] indivi_i.rename( columns={ "ident": "idmen", "persfip": "quifoy", "zsali": "sali2", # Inclu les salaires non imposables des agents d'assurance "zchoi": "choi2", "zrsti": "rsti2", "zalri": "alr2" }, inplace=True) indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"]) indivi_i["quelfic"] = "FIP_IMP" ## We merge them with the other individuals #indivim <- rename(indivim, c(ident = "idmen", # persfip = "quifoy", # zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance # zchoi = "choi2", # zrsti = "rsti2", # zalri = "alr2")) # #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i) #rm(indivim, indivi_i) #gc() #table(indivi$quelfic) # indivim.rename( columns=dict( ident="idmen", persfip="quifoy", zsali= "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi="choi2", zrsti="rsti2", zalri="alr2"), inplace=True) if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))): raise Exception("Individual ") indivim.set_index("noindiv", inplace=True) indivi_i.set_index("noindiv", inplace=True) indivi = indivim del indivim indivi.update(indivi_i) indivi.reset_index(inplace=True) print '' print "Etape 2 : isolation des FIP" fip_imp = indivi.quelfic == "FIP_IMP" indivi["idfoy"] = ( indivi["idmen"].astype("int64") 
* 100 + (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True)) indivi.loc[fip_imp, "idfoy"] = nan ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?) fip_has_declar = (fip_imp) & (indivi.declar1.notnull()) # indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100 # + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) ) indivi["idfoy"] = where( fip_has_declar, indivi["idmen"] * 100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True), indivi["idfoy"]) del fip_has_declar fip_no_declar = (fip_imp) & (indivi.declar1.isnull()) del fip_imp indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"]) indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]] while any(indivi_fnd.duplicated(cols=["idfoy"])): indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]), indivi_fnd["idfoy"] + 1, indivi_fnd["idfoy"]) assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len( indivi_fnd["idfoy"]), "Duplicates remaining" assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons" indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd del indivi_fnd, fip_no_declar print '' print 'Etape 3 : Récupération des EE_NRT' nrt = indivi.quelfic == "EE_NRT" indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.loc[nrt, "quifoy"] = "vous" del nrt pref_or_cref = indivi['lpr'].isin([1, 2]) adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref) indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.loc[adults, "quifoy"] = "vous" del adults assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all() print '' print 'Etape 4 : Rattachement des enfants aux déclarations' assert indivi["noindiv"].duplicated().any( ) == False, "Some noindiv appear twice" lpr3_or_lpr4 = indivi['lpr'].isin([3, 4]) enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", 
"EE_CAF"])) assert indivi.loc[enf_ee, "noindiv"].notnull().all( ), " Some noindiv are not set, which will ruin next stage" assert indivi.loc[ enf_ee, "noindiv"].duplicated().any() == False, "Some noindiv appear twice" pere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] }) mere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] }) foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer") pere = pere.merge(foyer, how="inner", on="noindiv") mere = mere.merge(foyer, how="inner", on="noindiv") # print "Some pere et mere are duplicated because people have two foyers" # print pere[pere.duplicated()] # print mere[mere.duplicated()] df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m')) # print len(pere) # print len(mere) # print len(df) # ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"] # print df.loc[df["noindiv_enf"].isin(ll)] # print df[df.duplicated()] print ' 4.1 : gestion des personnes dans 2 foyers' for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]: df[col] = df[col].fillna( 0, inplace=True) # beacause groupby drop groups with NA in index df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum() df.reset_index(inplace=True) df["which"] = "" df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "") df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "") both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull()) df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere") df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere") assert df["which"].notnull().all( ), "Some enf_ee individuals are not matched with any pere or mere" del lpr3_or_lpr4, pere, mere df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True) df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m) 
df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p) assert df["idfoy"].notnull().all() for col in df.columns: if col not in ["idfoy", "noindiv"]: del df[col] # assert indivi.loc[enf_ee,"idfoy"].notnull().all() assert df.duplicated().any() == False df.set_index("noindiv", inplace=True, verify_integrity=True) indivi.set_index("noindiv", inplace=True, verify_integrity=True) ind_notnull = indivi["idfoy"].notnull().sum() ind_isnull = indivi["idfoy"].isnull().sum() indivi = indivi.combine_first(df) assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() + indivi["idfoy"].isnull().sum()) indivi.reset_index(inplace=True) assert indivi.duplicated().any() == False # MBJ: issue delt with when moving from R code to python ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs ## On suit guide méthodo erf 2003 page 135 ## On supprime les conjoints FIP et les FIP de 25 ans et plus; ## On conserve les enfants FIP de 19 à 24 ans; ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète ## EEC que l'on ne retrouvera pas dans les EEC suivantes. 
# print ' 4.2 : On enlève les individus pour lesquels il manque le déclarant' fip = load_temp(name="fipDat", year=year) fip["declar"] = nan fip["agepf"] = nan fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True) fip.naia = fip.naia.astype("int32") fip.rename( columns=dict( ident="idmen", persfip="quifoy", zsali= "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi="choi2", zrsti="rsti2", zalri="alr2"), inplace=True) is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25) ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25 ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"] ##fip[is_fip_19_25 ,"lpr"] <- 1 # #indivi <- rbind.fill(indivi,fip[is_fip_19_25,]) indivi = concat([indivi, fip.loc[is_fip_19_25]]) del is_fip_19_25 indivi['age'] = year - indivi.naia - 1 indivi['agem'] = 12 * indivi.age + 12 - indivi.naim indivi["quimen"] = 0 indivi.quimen[indivi.lpr == 1] = 0 indivi.quimen[indivi.lpr == 2] = 1 indivi.quimen[indivi.lpr == 3] = 2 indivi.quimen[indivi.lpr == 4] = 3 indivi['not_pr_cpr'] = nan indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False indivi['not_pr_cpr'][indivi['lpr'] > 2] = True print " 4.3 : Creating non pr=0 and cpr=1 idmen's" indivi.reset_index(inplace=True) test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']] test1['quimen'] = 2 j = 2 while any(test1.duplicated(['quimen', 'idmen'])): test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1 j += 1 print_id(indivi) indivi.update(test1) print_id(indivi) # indivi.set_index(['quiment']) #TODO: check relevance # TODO problème avec certains idfoy qui n'ont pas de vous print '' print "Etape 5 : Gestion des idfoy qui n'ont pas de vous" all = indivi.drop_duplicates('idfoy') with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy'] 
without = all[~(all.idfoy.isin(with_.values))] print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous' has_declar2 = (indivi.idfoy.isin( without.idfoy.values)) & (indivi.declar2.notnull()) decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 + indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')) indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None) del all, with_, without, has_declar2 print ' 5.1 : Elimination idfoy restant' idfoyList = indivi.loc[indivi['quifoy'] == "vous", 'idfoy'].drop_duplicates() indivi = indivi[indivi.idfoy.isin(idfoyList.values)] del idfoyList print_id(indivi) myvars = [ "noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age", "agem", "quelfic", "actrec", "quimen", "nbsala", "titc", "statut", "txtppb", "chpub", "prosa", "encadr" ] if not (len(set(myvars).difference(set(indivi.columns))) == 0): print set(myvars).difference(set(indivi.columns)) assert len(set(myvars).difference(set(indivi.columns))) == 0 indivi = indivi.loc[:, myvars] ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé ## les infos provenant des déclarations) print '' print 'Etape 6 : Création des variables descriptives' print ' 6.1 : variable activité' indivi['activite'] = None indivi['activite'][indivi['actrec'] <= 3] = 0 indivi['activite'][indivi['actrec'] == 4] = 1 indivi['activite'][indivi['actrec'] == 5] = 2 indivi['activite'][indivi['actrec'] == 7] = 3 indivi['activite'][indivi['actrec'] == 8] = 4 indivi['activite'][indivi['age'] <= 13] = 2 # ce sont en fait les actrec=9 print indivi['activite'].value_counts() # TODO: MBJ problem avec les actrec indivi['titc'][indivi['titc'].isnull()] = 0 assert indivi['titc'].notnull().all(), Exception("Problème avec les titc") print ' 6.2 : variable statut' indivi['statut'][indivi['statut'].isnull()] = 0 indivi['statut'] = indivi['statut'].astype('int') indivi['statut'][indivi['statut'] == 11] = 1 
indivi['statut'][indivi['statut'] == 12] = 2 indivi['statut'][indivi['statut'] == 13] = 3 indivi['statut'][indivi['statut'] == 21] = 4 indivi['statut'][indivi['statut'] == 22] = 5 indivi['statut'][indivi['statut'] == 33] = 6 indivi['statut'][indivi['statut'] == 34] = 7 indivi['statut'][indivi['statut'] == 35] = 8 indivi['statut'][indivi['statut'] == 43] = 9 indivi['statut'][indivi['statut'] == 44] = 10 indivi['statut'][indivi['statut'] == 45] = 11 assert indivi['statut'].isin( range(12)).all(), Exception("statut value over range") #indivi$nbsala <- as.numeric(indivi$nbsala) #indivi <- within(indivi,{ # nbsala[is.na(nbsala) ] <- 0 # nbsala[nbsala==99 ] <- 10 # TODO 418 fip à retracer qui sont NA #}) print ' 6.3 : variable txtppb' indivi['txtppb'] = indivi['txtppb'].fillna(0) assert indivi['txtppb'].notnull().all() indivi['nbsala'] = indivi['nbsala'].fillna(0) indivi['nbsala'] = indivi['nbsala'].astype('int') indivi['nbsala'][indivi['nbsala'] == 99] = 10 assert indivi['nbsala'].isin(range(11)).all() print ' 6.4 : variable chpub et CSP' indivi['chpub'].fillna(0, inplace=True) indivi['chpub'] = indivi['chpub'].astype('int') indivi['chpub'][indivi['chpub'].isnull()] = 0 print indivi['chpub'].value_counts() assert indivi['chpub'].isin(range(11)).all() indivi['cadre'] = 0 indivi['prosa'][indivi['prosa'].isnull()] = 0 assert indivi['prosa'].notnull().all() print indivi['encadr'].value_counts() # encadr : 1=oui, 2=non indivi['encadr'].fillna(2, inplace=True) assert indivi['encadr'].notnull().all() indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1 indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1 print "cadre" print indivi['cadre'].value_counts() assert indivi['cadre'].isin(range(2)).all() print '' print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence" print 'nb de doublons idfam/quifam', len( indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print 'On crée les n° de personnes à charge' assert 
indivi['idfoy'].notnull().all() print_id(indivi) indivi['quifoy2'] = 2 indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0 indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1 indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi['quifoy2'] del indivi['quifoy2'] print_id(indivi) test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']] print_id(test2) j = 2 while test2.duplicated(['quifoy', 'idfoy']).any(): test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(test2) indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del test2, fip print 'nb de doublons idfam/quifam', len( indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print_id(indivi) ##################################################################################### ## On ajoute les idfam et quifam #load(famc) # #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE) #rm(famille) #print_id(tot2) # ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"] #tot2[is.na(tot2$quifam), "quifam"] <- 0 #print_id(tot2) #saveTmp(tot2, file = "tot2.Rdata") #rm(indivi,tot2) # ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents ## load foyer #loadTmp(file = "tot2.Rdata") #loadTmp(file= "foyer_aggr.Rdata") # #tot3 <- merge(tot2, foyer, all.x = TRUE) #print_id(tot3) # OK #saveTmp(tot3, file= "tot3.Rdata") #rm(tot3,tot2,foyer) # print '' print 'Etape 8 : création des fichiers totaux' famille = load_temp(name='famc', year=year) print ' 8.1 : création de tot2 & tot3' tot2 = indivi.merge(famille, on='noindiv', how='inner') # del famille # TODO: MBJ increase in number of menage/foyer when merging with 
family ... del famille control(tot2, debug=True, verbose=True) assert tot2['quifam'].notnull().all() save_temp(tot2, name='tot2', year=year) del indivi print ' tot2 saved' # #On combine les variables de revenu # foyer = load_temp(name='foy_ind', year=year) # print " INTERSERCT THE POOCHAY" # tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1 # print "pingas" # print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10] # print "pocchay" # print sorted(foyer["idfoy"].unique())[0:10] # print "final flash" # print 602062550.0 in foyer["idfoy"].values # print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique()))) # print tot2.quifoy.value_counts() #tot2.update(foyer) tot2.merge(foyer, how='left') tot2 = tot2[tot2.idmen.notnull()] # tot2['idfoy'] += 1 print_id(tot2) tot3 = tot2 # TODO: check where they come from tot3 = tot3.drop_duplicates(cols='noindiv') print len(tot3) #Block to remove any unwanted duplicated pair print " check tot3" control(tot3, debug=True, verbose=True) tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy']) tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam']) tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen']) tot3 = tot3.drop_duplicates(cols='noindiv') control(tot3) ## On ajoute les variables individualisables #loadTmp("foyer_individualise.Rdata") # foy_ind #loadTmp("tot3.Rdata") #loadTmp("allvars.Rdata") #loadTmp("sif.Rdata") # #vars2 <- setdiff(names(tot3), allvars) #tot3 <- tot3[,vars2] # #print_id(tot3) #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE) # print ' 8.2 : On ajoute les variables individualisables' allvars = load_temp(name='ind_vars_to_remove', year=year) vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] print len(tot3) assert not (tot3.duplicated( cols=['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % ( 
lg_dup) save_temp(tot3, name='tot3', year=year) control(tot3) del tot2, allvars, tot3, vars2 print 'tot3 sauvegardé' gc.collect()
# Labels given to the successive declaration boxes of one individualisable
# variable: declarant, spouse, then dependants.
_QUI = ['vous', 'conj', 'pac1', 'pac2', 'pac3']

# Integer code stored in the ``quifoy`` column for each label of ``_QUI``.
_QUIFOY_CODES = {'vous': 0, 'conj': 1, 'pac1': 2, 'pac2': 3, 'pac3': 4}

# Individual-level variable -> ordered list of declaration boxes.  The i-th
# box of a list is attributed to the i-th label of ``_QUI``.
_INDIVIDUALISABLE_VARS = {
    'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
    'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
    'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
    'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
    'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
    'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
    'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
    'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
    'f1tv': ['f1tv', 'f1uv'],
    'f1tw': ['f1tw', 'f1uw'],
    'f1tx': ['f1tx', 'f1ux'],
    'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
    'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
    'frag_exon': ['f5hn', 'f5in', 'f5jn'],
    'frag_impo': ['f5ho', 'f5io', 'f5jo'],
    'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
    'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
    'arag_defi': ['f5hf', 'f5if', 'f5jf'],
    'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
    'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
    'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
    'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
    'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
    'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
    'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
    'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
    'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
    'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
    'abic_imps': ['f5kd', 'f5ld', 'f5md'],
    'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
    'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
    'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
    'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
    'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
    'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
    'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
    'macc_exon': ['f5nn', 'f5on', 'f5pn'],
    'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
    'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
    'macc_impv': ['f5no', 'f5oo', 'f5po'],
    'macc_imps': ['f5np', 'f5op', 'f5pp'],
    'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
    'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
    'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
    'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
    'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
    'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
    'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
    'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
    'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
    'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
    'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
    'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
    'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
    'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
    'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
    'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
    'abnc_defi': ['f5qe', 'f5re', 'f5se'],
    'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
    'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
    # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
    # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
    'mbic_mvct': ['f5hu'],
    'macc_mvct': ['f5iu'],
    'mncn_mvct': ['f5ju'],
    'mbnc_mvct': ['f5kz'],
    'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
    'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
    'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
    'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
    'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
    'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
    'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
    'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
    'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
    'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
    'arag_pvce': ['f5he', 'f5ie', 'f5je'],
    # NOTE(review): 'f5lk' here and 'f5ik' in 'nbic_pvce' break the h/i/j and
    # k/l/m letter pattern followed by the other nrag_*/nbic_* entries; the
    # values are kept as found -- confirm against the declaration form.
    'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
    'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
    'abic_pvce': ['f5ke', 'f5le', 'f5me'],
    'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
    'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
    'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
    'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
    'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
    'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
    'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
    'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
    'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
    # (moving house) only in 2006
    'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],
}


def _individualise(foyer, individual_var, foyer_vars, eligible_vars):
    """Reshape the boxes of one variable into one row per (quifoy, noindiv).

    :param foyer: DataFrame holding a ``noindiv`` column and declaration boxes.
    :param individual_var: name given to the resulting value column.
    :param foyer_vars: ordered boxes of the variable; the i-th box is relabelled
        with the i-th entry of ``_QUI``.
    :param eligible_vars: set of boxes that are columns of ``foyer``.
    :returns: a DataFrame indexed by ``["quifoy", "noindiv"]`` with a single
        column ``individual_var`` restricted to the rows whose value is not 0,
        or ``None`` when none of ``foyer_vars`` is eligible.
    """
    present_vars = [box for box in foyer_vars if box in eligible_vars]
    if not present_vars:
        return None

    # The labels are zipped against the *full* box list so that a missing box
    # does not shift the labels of the boxes that follow it.
    labels = dict(zip(foyer_vars, _QUI))
    wide = foyer[present_vars + ["noindiv"]].rename(columns=labels)
    wide = wide.set_index("noindiv")
    wide.columns.name = "quifoy"

    stacked = wide.stack()
    stacked.name = individual_var
    # A Series has to go through reset_index() to become a DataFrame before
    # the (quifoy, noindiv) index can be set.
    tall = stacked.reset_index().set_index(["quifoy", "noindiv"])
    return tall[tall[individual_var] != 0]


def foyer_all(year=2006):
    """Build and save the individualised declaration variables (``foy_ind``).

    Steps, in order:

    1. read the ``foyer`` table through ``DataCollection(year=year)`` and keep
       ``noindiv`` plus the columns whose name matches ``^f[0-9]``;
    2. sum the boxes per ``noindiv`` (one individual may appear several times);
    3. for every entry of ``_INDIVIDUALISABLE_VARS``, turn its boxes into one
       value per (``quifoy``, ``noindiv``) and join all variables on that key;
    4. save the list of boxes found in the table under the temporary name
       ``ind_vars_to_remove``;
    5. rename ``noindiv`` to ``idfoy``, encode ``quifoy`` as an integer
       (see ``_QUIFOY_CODES``) and save the result under ``foy_ind``.

    :param year: year forwarded to ``DataCollection`` and ``save_temp``.
    :returns: ``None``; the results are written with ``save_temp``.
    :raises Exception: when none of the individualisable boxes is a column of
        the ``foyer`` table.
    :raises AssertionError: when ``quifoy`` holds a value outside 0..4.
    """
    data = DataCollection(year=year)
    declarations = data.get_values(table="foyer")

    # Only the declaration boxes ('fxzz') are kept, along with the identifier.
    box_pattern = re.compile("^f[0-9]")
    box_columns = [col for col in declarations.columns
                   if box_pattern.match(col)]
    foyer = declarations[box_columns + ["noindiv"]]
    del declarations
    gc.collect()

    # Aggregate the rows sharing the same noindiv.
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)

    known_vars = set()
    for boxes in _INDIVIDUALISABLE_VARS.values():
        known_vars.update(boxes)
    eligible_vars = known_vars.intersection(set(foyer.columns))
    print("From %i variables, we keep %i eligibles variables" % (
        len(known_vars), len(eligible_vars)))

    # Every reshaped variable is collected first and joined in one go: joining
    # inside the loop re-aligns the growing frame at each step and used to
    # restart from scratch whenever the accumulated frame had no row.
    frames = []
    for individual_var, foyer_vars in _INDIVIDUALISABLE_VARS.items():
        selection = _individualise(foyer, individual_var, foyer_vars,
                                   eligible_vars)
        if selection is None:
            print(individual_var + " is not present")
            continue
        frames.append(selection)

    if not frames:
        raise Exception(
            "No individualisable variable found in the foyer table")

    foy_ind = concat(frames, axis=1, join='outer')
    foy_ind.reset_index(inplace=True)
    print("foy_ind")
    print(foy_ind.describe().to_string())

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)

    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)

    # One vectorised mapping instead of chained ``df[col][mask] = value``
    # assignments, which may write to a temporary copy.  A label missing from
    # the mapping becomes NaN and is caught by the assertion below.
    foy_ind['quifoy'] = foy_ind['quifoy'].map(_QUIFOY_CODES)
    assert foy_ind['quifoy'].isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print('saving foy_ind')
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return