def build_erf_aggregates(variables = None, year = 2006, unit = 1e6): """ Fetch the relevant aggregates from erf data """ erf = DataCollection(year=year) if variables is not None and "wprm" not in variables: variables.append("wprm") print 'Fetching aggregates from erf %s data' %str(year) df = erf.get_of_values(variables=variables, table = "erf_menage") of2erf = get_of2erf() erf2of = get_erf2of() df.rename(columns = erf2of, inplace = True) wprm = df["wprm"] for col in df.columns: try: df[col] = df[col].astype(np.float64) except: pass df = df.mul(wprm, axis = 0) for col in list(set(df.columns) - set(['ident', 'wprm'])): try: df[col] = df[col].sum()/1e6 except: pass return df.ix[0:1] # Aggregate so we only need 1 row
def build_erf_aggregates(variables=None, year=2006, unit=1e6): """ Fetch the relevant aggregates from erf data """ erf = DataCollection(year=year) if variables is not None and "wprm" not in variables: variables.append("wprm") print 'Fetching aggregates from erf %s data' % str(year) df = erf.get_of_values(variables=variables, table="erf_menage") of2erf = get_of2erf() erf2of = get_erf2of() df.rename(columns=erf2of, inplace=True) wprm = df["wprm"] for col in df.columns: try: df[col] = df[col].astype(np.float64) except: pass df = df.mul(wprm, axis=0) for col in list(set(df.columns) - set(['ident', 'wprm'])): try: df[col] = df[col].sum() / 1e6 except: pass return df.ix[0:1] # Aggregate so we only need 1 row
def create_enfnn(year=2006): ''' ''' #load data = DataCollection(year=year) ### Enfant à naître (NN pour nouveaux nés) individual_vars = ['noi', 'noicon', 'noindiv', 'noiper', 'noimer', 'ident', 'naia', 'naim', 'lien', 'acteu','stc','contra','titc','mrec','forter','rstg','retrai','lpr','cohab','sexe', 'agepr','rga'] data = DataCollection(year=year) eeccmp1 = data.get_values(table="eec_cmp_1", variables=individual_vars) eeccmp2 = data.get_values(table="eec_cmp_2", variables=individual_vars) eeccmp3 = data.get_values(table="eec_cmp_3", variables=individual_vars) tmp = eeccmp1.merge(eeccmp2, how="outer") enfnn = tmp.merge(eeccmp3, how="outer") # optimisation des types? Controle de l'existence en passant # pourquoi pas des int quand c'est possible #TODO: minimal dtype for var in individual_vars: print var enfnn[var] = enfnn[var].astype('float') del eeccmp1, eeccmp2, eeccmp3, individual_vars # création de variables print enfnn.describe() enfnn['declar1'] = '' enfnn['noidec'] = 0 enfnn['ztsai'] = 0 enfnn['year'] = year enfnn['year'] = enfnn['year'].astype("float32") # -> integer ? enfnn['agepf'] = enfnn['year'] - enfnn['naia'] enfnn['agepf'][enfnn['naim'] >= 7] -= 1 enfnn['actrec'] = 9 enfnn['quelfic'] = 'ENF_NN' enfnn['persfip'] = "" #selection #enfnn <- enfnn[(enfnn$naia==enfnn$year & enfnn$naim>=10) | (enfnn$naia==enfnn$year+1 & enfnn$naim<=5),] enfnn = enfnn[((enfnn['naia'] == enfnn['year']) & (enfnn['naim'] >= 10)) | ((enfnn['naia'] == enfnn['year'] + 1) & (enfnn['naim'] <= 5))] #save save_temp(enfnn, name="enfnn", year=year) del enfnn print "enfnnm saved" gc.collect()
def create_totals(year=2006): print "Creating Totals" print "Etape 1 : Chargement des données" data = DataCollection(year=year) indivim = load_temp(name="indivim", year=year) assert indivim.duplicated(['noindiv']).any() == False, "Présence de doublons" # Deals individuals with imputed income : some individuals are in 'erf individu table' but # not in the 'foyer' table. We need to create a foyer for them. selection = Series() for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]: varo = var[:-1]+"o" test = indivim[var] != indivim[varo] if len(selection) == 0: selection = test else: selection = (test) | (selection) indivi_i = indivim[selection] indivi_i.rename(columns={"ident" : "idmen", "persfip":"quifoy", "zsali" : "sali2", # Inclu les salaires non imposables des agents d'assurance "zchoi" : "choi2", "zrsti" : "rsti2", "zalri" : "alr2"}, inplace=True) indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"]) indivi_i["quelfic"] = "FIP_IMP" ## We merge them with the other individuals #indivim <- rename(indivim, c(ident = "idmen", # persfip = "quifoy", # zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance # zchoi = "choi2", # zrsti = "rsti2", # zalri = "alr2")) # #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i) #rm(indivim, indivi_i) #gc() #table(indivi$quelfic) # indivim.rename( columns= dict(ident = "idmen", persfip = "quifoy", zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi = "choi2", zrsti = "rsti2", zalri = "alr2"), inplace=True) if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv)) ): raise Exception("Individual ") indivim.set_index("noindiv", inplace=True) indivi_i.set_index("noindiv", inplace=True) indivi = indivim del indivim indivi.update(indivi_i) indivi.reset_index( inplace=True) print '' print "Etape 2 : isolation des FIP" fip_imp = indivi.quelfic=="FIP_IMP" indivi["idfoy"] = 
(indivi["idmen"].astype("int64")*100 + (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True)) indivi.loc[fip_imp,"idfoy"] = nan ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?) fip_has_declar = (fip_imp) & (indivi.declar1.notnull()) # indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100 # + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) ) indivi["idfoy"] = where(fip_has_declar, indivi["idmen"]*100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True), indivi["idfoy"]) del fip_has_declar fip_no_declar = (fip_imp) & (indivi.declar1.isnull()) del fip_imp indivi["idfoy"] = where(fip_no_declar, indivi["idmen"]*100 + 50, indivi["idfoy"]) indivi_fnd = indivi.loc[fip_no_declar, ["idfoy","noindiv"]] while any(indivi_fnd.duplicated(cols=["idfoy"])): indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]), indivi_fnd["idfoy"] + 1, indivi_fnd["idfoy"]) assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"]), "Duplicates remaining" assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons" indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd del indivi_fnd, fip_no_declar print '' print 'Etape 3 : Récupération des EE_NRT' nrt = indivi.quelfic=="EE_NRT" indivi.idfoy = where(nrt, indivi.idmen*100 + indivi.noi, indivi.idfoy) indivi.loc[nrt,"quifoy"] = "vous" del nrt pref_or_cref = indivi['lpr'].isin([1,2]) adults = (indivi.quelfic.isin(["EE","EE_CAF"])) & (pref_or_cref) indivi.idfoy = where(adults, indivi.idmen*100 + indivi.noi, indivi.idfoy) indivi.loc[adults, "quifoy"] = "vous" del adults assert indivi.loc[indivi['lpr'].isin([1,2]),"idfoy"].notnull().all() print '' print 'Etape 4 : Rattachement des enfants aux déclarations' assert indivi["noindiv"].duplicated().any() == False, "Some noindiv appear twice" lpr3_or_lpr4 = indivi['lpr'].isin([3,4]) enf_ee = (lpr3_or_lpr4) & 
(indivi.quelfic.isin(["EE","EE_CAF"])) assert indivi.loc[enf_ee, "noindiv"].notnull().all(), " Some noindiv are not set, which will ruin next stage" assert indivi.loc[enf_ee, "noindiv"].duplicated().any() == False, "Some noindiv appear twice" pere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] }) mere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] }) foyer = data.get_values(variables=["noindiv","zimpof"], table="foyer" ) pere = pere.merge(foyer, how="inner", on="noindiv") mere = mere.merge(foyer, how="inner", on="noindiv") # print "Some pere et mere are duplicated because people have two foyers" # print pere[pere.duplicated()] # print mere[mere.duplicated()] df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m')) # print len(pere) # print len(mere) # print len(df) # ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"] # print df.loc[df["noindiv_enf"].isin(ll)] # print df[df.duplicated()] print ' 4.1 : gestion des personnes dans 2 foyers' for col in ["noindiv_p","noindiv_m","noindiv_enf"]: df[col] = df[col].fillna(0,inplace=True) # beacause groupby drop groups with NA in index df = df.groupby(by=["noindiv_p","noindiv_m","noindiv_enf"]).sum() df.reset_index(inplace=True) df["which"] = "" df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "") df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "") both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull()) df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere") df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere") assert df["which"].notnull().all(), "Some enf_ee individuals are not matched with any pere or mere" del lpr3_or_lpr4, pere, mere df.rename(columns={"noindiv_enf" : "noindiv"}, inplace=True) df["idfoy"] = where( df.which=="pere", df.noindiv_p, 
df.noindiv_m) df["idfoy"] = where( df.which=="mere", df.noindiv_m, df.noindiv_p) assert df["idfoy"].notnull().all() for col in df.columns: if col not in ["idfoy", "noindiv"]: del df[col] # assert indivi.loc[enf_ee,"idfoy"].notnull().all() assert df.duplicated().any() == False df.set_index("noindiv",inplace=True, verify_integrity=True) indivi.set_index("noindiv", inplace=True, verify_integrity=True) ind_notnull = indivi["idfoy"].notnull().sum() ind_isnull = indivi["idfoy"].isnull().sum() indivi = indivi.combine_first(df) assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() + indivi["idfoy"].isnull().sum()) indivi.reset_index(inplace=True) assert indivi.duplicated().any() == False # MBJ: issue delt with when moving from R code to python ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs ## On suit guide méthodo erf 2003 page 135 ## On supprime les conjoints FIP et les FIP de 25 ans et plus; ## On conserve les enfants FIP de 19 à 24 ans; ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète ## EEC que l'on ne retrouvera pas dans les EEC suivantes. 
# print ' 4.2 : On enlève les individus pour lesquels il manque le déclarant' fip = load_temp(name="fipDat", year=year) fip["declar"] = nan fip["agepf"] = nan fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True) fip.naia = fip.naia.astype("int32") fip.rename( columns=dict(ident="idmen", persfip="quifoy", zsali="sali2", # Inclu les salaires non imposables des agents d'assurance zchoi="choi2", zrsti="rsti2", zalri="alr2"), inplace=True) is_fip_19_25 = ((year-fip.naia-1)>=19) & ((year-fip.naia-1)<25) ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25 ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"] ##fip[is_fip_19_25 ,"lpr"] <- 1 # #indivi <- rbind.fill(indivi,fip[is_fip_19_25,]) indivi = concat([indivi, fip.loc[is_fip_19_25]]) del is_fip_19_25 indivi['age'] = year - indivi.naia - 1 indivi['agem'] = 12*indivi.age + 12-indivi.naim indivi["quimen"] = 0 indivi.quimen[indivi.lpr == 1] = 0 indivi.quimen[indivi.lpr == 2] = 1 indivi.quimen[indivi.lpr == 3] = 2 indivi.quimen[indivi.lpr == 4] = 3 indivi['not_pr_cpr'] = nan indivi['not_pr_cpr'][indivi['lpr']<=2] = False indivi['not_pr_cpr'][indivi['lpr']>2] = True print " 4.3 : Creating non pr=0 and cpr=1 idmen's" indivi.reset_index(inplace=True) test1 = indivi.ix[indivi['not_pr_cpr']==True,['quimen', 'idmen']] test1['quimen'] = 2 j=2 while any(test1.duplicated(['quimen', 'idmen'])): test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j+1 j += 1 print_id(indivi) indivi.update(test1) print_id(indivi) # indivi.set_index(['quiment']) #TODO: check relevance # TODO problème avec certains idfoy qui n'ont pas de vous print '' print "Etape 5 : Gestion des idfoy qui n'ont pas de vous" all = indivi.drop_duplicates('idfoy') with_ = indivi.loc[indivi['quifoy']=='vous', 'idfoy'] without = 
all[~(all.idfoy.isin(with_.values))] print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous' has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull()) decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int')*100 + indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')) indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None) del all,with_,without, has_declar2 print ' 5.1 : Elimination idfoy restant' idfoyList = indivi.loc[indivi['quifoy']=="vous", 'idfoy'].drop_duplicates() indivi = indivi[indivi.idfoy.isin(idfoyList.values)] del idfoyList print_id(indivi) myvars = ["noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age","agem","quelfic","actrec", "quimen", "nbsala","titc","statut","txtppb","chpub","prosa","encadr"] if not(len(set(myvars).difference(set(indivi.columns))) == 0): print set(myvars).difference(set(indivi.columns)) assert len(set(myvars).difference(set(indivi.columns))) == 0 indivi = indivi.loc[:, myvars] ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé ## les infos provenant des déclarations) print '' print 'Etape 6 : Création des variables descriptives' print ' 6.1 : variable activité' indivi['activite'] = None indivi['activite'][indivi['actrec']<=3] = 0 indivi['activite'][indivi['actrec']==4] = 1 indivi['activite'][indivi['actrec']==5] = 2 indivi['activite'][indivi['actrec']==7] = 3 indivi['activite'][indivi['actrec']==8] = 4 indivi['activite'][indivi['age']<=13] = 2 # ce sont en fait les actrec=9 print indivi['activite'].value_counts() # TODO: MBJ problem avec les actrec indivi['titc'][indivi['titc'].isnull()] = 0 assert indivi['titc'].notnull().all() , Exception("Problème avec les titc") print ' 6.2 : variable statut' indivi['statut'][indivi['statut'].isnull()] = 0 indivi['statut'] = indivi['statut'].astype('int') indivi['statut'][indivi['statut']==11] = 1 indivi['statut'][indivi['statut']==12] = 2 
indivi['statut'][indivi['statut']==13] = 3 indivi['statut'][indivi['statut']==21] = 4 indivi['statut'][indivi['statut']==22] = 5 indivi['statut'][indivi['statut']==33] = 6 indivi['statut'][indivi['statut']==34] = 7 indivi['statut'][indivi['statut']==35] = 8 indivi['statut'][indivi['statut']==43] = 9 indivi['statut'][indivi['statut']==44] = 10 indivi['statut'][indivi['statut']==45] = 11 assert indivi['statut'].isin(range(12)).all(), Exception("statut value over range") #indivi$nbsala <- as.numeric(indivi$nbsala) #indivi <- within(indivi,{ # nbsala[is.na(nbsala) ] <- 0 # nbsala[nbsala==99 ] <- 10 # TODO 418 fip à retracer qui sont NA #}) print ' 6.3 : variable txtppb' indivi['txtppb'] = indivi['txtppb'].fillna(0) assert indivi['txtppb'].notnull().all() indivi['nbsala'] = indivi['nbsala'].fillna(0) indivi['nbsala'] = indivi['nbsala'].astype('int') indivi['nbsala'][indivi['nbsala']==99] = 10 assert indivi['nbsala'].isin(range(11)).all() print ' 6.4 : variable chpub et CSP' indivi['chpub'].fillna(0, inplace=True) indivi['chpub'] = indivi['chpub'].astype('int') indivi['chpub'][indivi['chpub'].isnull()] = 0 print indivi['chpub'].value_counts() assert indivi['chpub'].isin(range(11)).all() indivi['cadre'] = 0 indivi['prosa'][indivi['prosa'].isnull()] = 0 assert indivi['prosa'].notnull().all() print indivi['encadr'].value_counts() # encadr : 1=oui, 2=non indivi['encadr'].fillna(2, inplace=True) assert indivi['encadr'].notnull().all() indivi['cadre'][indivi['prosa'].isin([7,8])] = 1 indivi['cadre'][(indivi['prosa']==9) & (indivi['encadr']==1)] = 1 print "cadre" print indivi['cadre'].value_counts() assert indivi['cadre'].isin(range(2)).all() print '' print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence" print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print 'On crée les n° de personnes à charge' assert indivi['idfoy'].notnull().all() print_id(indivi) indivi['quifoy2'] = 2 
indivi['quifoy2'][indivi['quifoy']=='vous'] = 0 indivi['quifoy2'][indivi['quifoy']=='conj'] = 1 indivi['quifoy2'][indivi['quifoy']=='pac'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi['quifoy2'] del indivi['quifoy2'] print_id(indivi) test2 = indivi.loc[indivi['quifoy']==2, ['quifoy', 'idfoy','noindiv']] print_id(test2) j=2 while test2.duplicated(['quifoy', 'idfoy']).any(): test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(test2) indivi = indivi.merge(test2, on=['noindiv','idfoy'], how="left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x']==2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del test2, fip print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print_id(indivi) ##################################################################################### ## On ajoute les idfam et quifam #load(famc) # #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE) #rm(famille) #print_id(tot2) # ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"] #tot2[is.na(tot2$quifam), "quifam"] <- 0 #print_id(tot2) #saveTmp(tot2, file = "tot2.Rdata") #rm(indivi,tot2) # ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents ## load foyer #loadTmp(file = "tot2.Rdata") #loadTmp(file= "foyer_aggr.Rdata") # #tot3 <- merge(tot2, foyer, all.x = TRUE) #print_id(tot3) # OK #saveTmp(tot3, file= "tot3.Rdata") #rm(tot3,tot2,foyer) # print '' print 'Etape 8 : création des fichiers totaux' famille = load_temp(name='famc', year=year) print ' 8.1 : création de tot2 & tot3' tot2 = indivi.merge(famille, on='noindiv', how='inner') # del famille # TODO: MBJ increase in number of menage/foyer when merging with family ... 
del famille control(tot2, debug=True, verbose=True) assert tot2['quifam'].notnull().all() save_temp(tot2, name='tot2', year=year) del indivi print ' tot2 saved' # #On combine les variables de revenu # foyer = load_temp(name='foy_ind', year=year) # print " INTERSERCT THE POOCHAY" # tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1 # print "pingas" # print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10] # print "pocchay" # print sorted(foyer["idfoy"].unique())[0:10] # print "final flash" # print 602062550.0 in foyer["idfoy"].values # print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique()))) # print tot2.quifoy.value_counts() #tot2.update(foyer) tot2.merge(foyer, how = 'left') tot2 = tot2[tot2.idmen.notnull()] # tot2['idfoy'] += 1 print_id(tot2) tot3 = tot2 # TODO: check where they come from tot3 = tot3.drop_duplicates(cols='noindiv') print len(tot3) #Block to remove any unwanted duplicated pair print " check tot3" control(tot3, debug=True, verbose=True) tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy']) tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam']) tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen']) tot3 = tot3.drop_duplicates(cols='noindiv') control(tot3) ## On ajoute les variables individualisables #loadTmp("foyer_individualise.Rdata") # foy_ind #loadTmp("tot3.Rdata") #loadTmp("allvars.Rdata") #loadTmp("sif.Rdata") # #vars2 <- setdiff(names(tot3), allvars) #tot3 <- tot3[,vars2] # #print_id(tot3) #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE) # print ' 8.2 : On ajoute les variables individualisables' allvars = load_temp(name = 'ind_vars_to_remove', year=year) vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] print len(tot3) assert not(tot3.duplicated(cols=['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" %(lg_dup) 
save_temp(tot3, name='tot3', year=year) control(tot3) del tot2, allvars, tot3, vars2 print 'tot3 sauvegardé' gc.collect()
def create_fip(year = 2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len))/5 print "il ya a au maximum %s 
pac par foyer" %nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns) fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1,nb_pac_max+1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)] fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration','naia','type_pac']) # TODO: check if useful fip.set_index(["declaration","pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG),'/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN fip.update(tyFG) print 'enfants F & G 
traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')] pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # 
We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1' , len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum() print 'nb de NaN', 
pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]] individec1 = individec1.rename(columns={'declar1':'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]] individec2.rename(columns={'declar2':'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip 
= concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf']<=15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi','ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100*fip['ident'] + fip['noidec'] fip['noindiv'] = 100*fip['ident'] + fip['noi'] fip['type_pac'] = 0 ; fip['key'] = 0 print fip.duplicated('noindiv').value_counts() save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip 
sauvegardé'
def sif(year=2006):
    """Extract and decode the SIF string of the ``foyer`` table.

    The ``sif`` column is a fixed-width character string whose positions
    encode (presumably) marital status, one-letter case flags (E, F, G,
    K, L, P, S, W, N, H, T), event flags X/Y/Z with their dates, and
    two-digit person counts.  Slice positions follow the 2006/2007
    layout; the offset ``d`` shifts the later positions for the 2008 and
    2009 layouts.  The decoded frame is stored via ``save_temp`` under
    the name ``sif``.

    Ported from an R script (LoadIn/substr calls); R substr is 1-indexed
    and inclusive, hence e.g. R ``substr(sif, 5, 5)`` == Python
    ``.str[4:5]``.
    """
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    # TODO: how to choose the year n-2 rfr for the TH exemption?
    #   mnrvka: TH income of year n-2
    #   mnrvkh: TH income (reference fiscal income)
    # Variables carrying the sif code and the raw gross income.
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    sif = data.get_values(variables=vars, table="foyer")
    sif['statmarit'] = 0
    print sif

    if year == 2009:
        # Two 2009 records have misaligned sif strings; realign them so the
        # fixed positions below decode correctly (fix ported from the R code).
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif

    print sif["rbg"].describe()
    # tsrvbg carries the sign of rbg ('+' or '-'); apply it to rbg.
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+') - (sif["tsrvbg"] == '-'))
    print sif["rbg"].describe()

    # Marital status letter at (1-indexed) position 5 of the sif string.
    sif["stamar"] = sif["sif"].str[4:5]
    # Letter -> integer code mapping; M/C/D/V/O -- TODO confirm the exact
    # French civil-status meaning of each letter.
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    # Presumably birth years of the declarant ("vous") and spouse ("conj");
    # kept as 4-char strings.
    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    # One-letter case flags at fixed positions (common to all years).
    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"
    d = 0  # offset applied to every later position; depends on the year's layout
    if year in [2006, 2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # NOTE: caseH is a 4-char field kept as a raw string, not converted
        # to a number (the R original applied as.numeric) -- TODO confirm.
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    if year in [2008]:
        d = -1  # case L disappeared in 2008, shifting everything one to the left
        sif["caseP"] = sif["sif"].str[20 + d:21 + d] == "P"
        sif["caseS"] = sif["sif"].str[21 + d:22 + d] == "S"
        sif["caseW"] = sif["sif"].str[22 + d:23 + d] == "W"
        sif["caseN"] = sif["sif"].str[23 + d:24 + d] == "N"
        sif["caseH"] = sif["sif"].str[24 + d:28 + d]
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"
    if year in [2009]:
        # Case L is back (same positions as 2006) ...
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # ... but caseH is gone (while L is present), so the net shift
        # relative to 2006 for the remaining fields is -4.
        d = -4
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"

    # X/Y/Z flags with their dates, kept as raw 8-char strings (ddmmyyyy
    # in the R original, which parsed them with as.Date('%d%m%Y')).
    sif["caseX"] = sif["sif"].str[33 + d:34 + d] == "X"
    sif["dateX"] = sif["sif"].str[34 + d:42 + d]
    sif["caseY"] = sif["sif"].str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif["sif"].str[43 + d:51 + d]
    # NOTE(review): caseZ spans two characters and dateZ starts one position
    # earlier than the X/Y pattern suggests; the R original itself flagged
    # this line with "ERROR 54+d" -- confirm the intended offsets.
    sif["caseZ"] = sif["sif"].str[51 + d:53 + d] == "Z"
    sif["dateZ"] = sif["sif"].str[52 + d:60 + d]
    sif["causeXYZ"] = sif["sif"].str[60 + d:61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    # nbptr is divided by 100 (presumably stored times 100 in the raw data).
    sif["nbptr"] = sif["nbptr"] / 100
    sif["rfr_n_2"] = sif["mnrvka"]  # reference fiscal income of year n-2

    # Two-digit person counts per category (F, G, R, J, N, H, I); kept as
    # raw strings here (the R original applied as.numeric).
    sif["nbF"] = sif["sif"].str[64 + d:66 + d]
    sif["nbG"] = sif["sif"].str[67 + d:69 + d]
    sif["nbR"] = sif["sif"].str[70 + d:72 + d]
    sif["nbJ"] = sif["sif"].str[73 + d:75 + d]
    sif["nbN"] = sif["sif"].str[76 + d:78 + d]
    sif["nbH"] = sif["sif"].str[79 + d:81 + d]
    sif["nbI"] = sif["sif"].str[82 + d:84 + d]
    if (year != 2009):
        # nbP does not exist in the 2009 layout.
        sif["nbP"] = sif["sif"].str[85 + d:87 + d]

    # The raw string and the intermediate status letter are no longer needed.
    del sif["sif"], sif["stamar"]

    print sif["statmarit"].value_counts()
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(
        sif["noindiv"].value_counts())
    # Sanity check: noindiv must already be unique (nothing to de-duplicate).
    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts(
    )) == len(sif_drop_duplicated["noindiv"]), Exception(
        "Number of distinct individuals after removing duplicates is not correct"
    )
    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
def foyer_all(year=2006):
    """Individualise the tax-declaration cells ('f<digit>...') of the foyer table.

    Keeps only the declaration-cell columns, sums over multiple
    declarations filed by the same individual, then reshapes each group
    of cells (one column per declarant role: vous/conj/pac1/pac2/pac3)
    into one row per (noindiv, quifoy) with ``quifoy`` coded 0..4.
    Stores the result via ``save_temp`` as ``foy_ind`` and the list of
    individualised source cells as ``ind_vars_to_remove``.
    """
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")
    # Keep only the declaration cells, i.e. columns named 'f' + digit.
    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]
    foyer = foyer_all[vars + ["noindiv"]]
    del foyer_all
    gc.collect()
    # Aggregate the declarations when an individual filed several of them.
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)
    # Mapping: individual-level variable -> declaration cells holding its
    # value for ['vous', 'conj', 'pac1', 'pac2', 'pac3'] respectively
    # (shorter lists individualise fewer roles).
    var_dict = {'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
                'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
                'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
                'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
                'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
                'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
                'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
                'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
                'f1tv': ['f1tv', 'f1uv'],
                'f1tw': ['f1tw', 'f1uw'],
                'f1tx': ['f1tx', 'f1ux'],
                'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
                'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
                'frag_exon': ['f5hn', 'f5in', 'f5jn'],
                'frag_impo': ['f5ho', 'f5io', 'f5jo'],
                'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
                'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
                'arag_defi': ['f5hf', 'f5if', 'f5jf'],
                'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
                'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
                'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
                'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
                'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
                'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
                'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
                'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
                'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
                'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
                'abic_imps': ['f5kd', 'f5ld', 'f5md'],
                'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
                'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
                'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
                'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
                'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
                'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
                'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
                'macc_exon': ['f5nn', 'f5on', 'f5pn'],
                'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
                'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
                'macc_impv': ['f5no', 'f5oo', 'f5po'],
                'macc_imps': ['f5np', 'f5op', 'f5pp'],
                'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
                'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
                'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
                'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
                'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
                'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
                'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
                'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
                'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
                'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
                'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
                'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
                'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
                'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
                'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
                'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
                'abnc_defi': ['f5qe', 'f5re', 'f5se'],
                'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
                'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
                # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
                # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
                'mbic_mvct': ['f5hu'],
                'macc_mvct': ['f5iu'],
                'mncn_mvct': ['f5ju'],
                'mbnc_mvct': ['f5kz'],
                'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
                'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
                'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
                'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
                'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
                'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
                'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
                'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
                'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
                'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
                'arag_pvce': ['f5he', 'f5ie', 'f5je'],
                'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
                'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
                'abic_pvce': ['f5ke', 'f5le', 'f5me'],
                'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
                'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
                'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
                'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
                'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
                'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
                'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
                'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
                'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
                'demenage' : ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']}  # (moving) exists only in 2006

    # Cells from the mapping that actually exist in this year's table.
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))
    print "From %i variables, we keep %i eligibles variables" % (
        len(set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0        # NOTE(review): unused
    err_vars = {}  # NOTE(review): unused
    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyer_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list to the cells actually present in the table.
                # The rename below still works: each kept cell maps to its
                # original positional role in `qui`.
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]
        # Reshape: one column per role -> long format indexed by
        # (quifoy, noindiv), keeping only non-zero cells.
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.reset_index()
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')
    foy_ind.reset_index(inplace=True)
    print "foy_ind"
    print foy_ind.describe().to_string()

    # Remember which source cells were individualised so they can be
    # removed from the foyer-level data further down the build chain.
    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv" : "idfoy"}, inplace=True)
    print_id(foy_ind)
    # Encode the declarant role as an integer (0=vous ... 4=pac3).
    # NOTE: relies on legacy-pandas chained assignment writing through.
    foy_ind['quifoy'][foy_ind['quifoy']=='vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy']=='conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy']=='pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy']=='pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy']=='pac3'] = 4
    assert foy_ind['quifoy'].isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year = year)
    show_temp()
    return
def test(year=2006, variables=['af']):
    """Debugging harness: compare OpenFisca ménage aggregates with ERF data.

    For each requested variable, collects its ancestors, children and
    options from the simulation's column graph, builds ménage-level
    aggregates on both the simulation and the ERF side, reports
    mismatching idmen, then for the most deviant households prints the
    variables each Prestation depends on.

    NOTE(review): ``variables=['af']`` is a mutable default argument;
    the list is rebound (never mutated) below, so this is latent rather
    than active, but a ``None`` sentinel would be safer.
    """
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    # of_aggregates = Aggregates()
    # of_aggregates.set_simulation(simulation)
    # of_aggregates.compute()
    # print of_aggregates.aggr_frame
    #
    # from openfisca_france.data.erf.aggregates import build_erf_aggregates
    # temp = (build_erf_aggregates(variables=variables, year= year))
    # print temp
    # return
    variable = "af"  # NOTE(review): hard-coded; the debugger ignores `variables`
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        # Recursively collect every column of `varlist` together with all
        # of its (transitive) parents.
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]] + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]] + get_all_ancestors(list(varlist[0]._parents)) + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x), variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    # NOTE(review): `children` is reset each iteration, so only the last
    # variable's direct children survive this loop -- confirm intended.
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        # Ménage-level aggregate of a single variable (force_sum=True).
        variables = [variable]
        return simulation.aggregated_by_entity(entity="men", variables=variables,
                                               all_output_vars=False, force_sum=True)[0]

    # One aggregate frame with a column per variable, joined on idmen.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]],
                                                  on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')  # NOTE(review): Windows-only console clear; no-op elsewhere
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo

    ''' Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom et parfois sont les variables
    utilisées pour créér l'of ): 1 - essayer le get_of2erf, ça doit marcher
    pour les variables principales ( au moins les aggrégats que l'on
    compare ) Si les variables ne sont pas directement dans la table, elles
    ont été calculées à partir d'autres variables de données erf/eec donc
    chercher dans : 2 - build_survey 3 - model/model.py qui dira
    éventuellement dans quel module de model/ chercher Le 'print todo' vous
    indique quelles variables chercher ( attention à ne pas inclure les
    enfants directs ) L'utilisation du Ctrl-H est profitable ! '''

    # Individual-level ERF/EEC data, merged on noindiv.
    fetch_eec = [ 'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia', 'noindiv' ]
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")
    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    # Rename ERF columns to their OpenFisca equivalents.
    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:
    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in set(options).intersection(
        set(simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen',
                                        'quimen_ind': 'quimen'},
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in (set(options).intersection(
            set(simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on='noindiv',
                                                        how='inner', sort=False)
        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)
    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()
    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    # NOTE(review): this elementwise comparison assumes both frames are
    # positionally aligned after reset_index -- confirm row order matches.
    for col in colcom:
        temp = set(
            erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp)))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on='idmen',
                     how='inner', suffixes=('_erf', '_of'))
    print 'Length of new dataframe is %s' % str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []   # Prestation names already handled by iter_on_parents
    options_met = []   # option keys already fetched
    for col in colcom:
        bigtemp = None
        # Keep only households where both sides are non-zero, then compute
        # the absolute relative difference on `col`.
        table = bigtable[and_(bigtable[col + '_erf'] != 0,
                              bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']
                      ) / table[col + '_erf']  # relative difference
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' % (
            col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' % (
            col, str(table[col].max()))
        print table[col].describe()
        try:
            # Weighted decile/percentile summary of the differences.
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1, 11), table['wprm_of'],
                              2, return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col], np.arange(1, 101), table['wprm_erf'],
                              2, return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            # NOTE(review): bare except -- deliberately best-effort here,
            # but it also hides genuine mwp errors.
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

        # Show the relevant information for the most deviant households
        # NOTE: DataFrame.sort(columns=...) is legacy pandas (< 0.20).
        table.sort(columns=col, ascending=False, inplace=True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {
                'table': table[[col, col + '_of', col + '_erf', 'idmen']][0:10],
                'options': None
            }
        bigtemp['table'][col + 'div'] = bigtemp['table'][
            col + '_of'] / bigtemp['table'][col + '_erf']
        print bigtemp['table'].to_string()

        '''
        bigtemp is the table which will get filled little by little by the
        relevant variables. Up to the last rows of code 'table' refers to a
        table of aggregated values, while 'options is a table of individual
        variables. The reason we call it in a dictionnary is also because we
        modify it inside the recursive function 'iter_on parents', and it
        causes an error in Python unless for certain types like dictionnary
        values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []
                # Prefer the simulated ('_of') column, fall back to the ERF
                # ('_erf') one, else the bare column name.
                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(
                        erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched

            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                # Recursively walk the parents of `varcol`, appending their
                # columns (top-10 deviant households) to bigtemp['table'] and
                # their individual-level options to bigtemp['options'].
                if (varcol._parents == set() and varcol._option == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []
                    # NOTE(review): the three branches below test and iterate
                    # over `par_fetched`, which is still empty here, so the
                    # first branch is always taken and nothing is appended --
                    # `par_to_fetch` is never consumed. This looks like a
                    # par_to_fetch/par_fetched mix-up; compare with the
                    # ch_to_fetch/ch_fetched logic above, which is coherent.
                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(
                            simu_aggr_tables.columns).union(erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                                elif var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                    if len(par_fetched) > 0:
                        #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                        pass
                    else:
                        #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                        pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp, bigtemp['table'],
                                                    how='inner')
                        #print temp.to_string(), "\n"

                    # Fetch the not-yet-seen options of this varcol for the
                    # ten most deviant households, preserving their order.
                    if varcol._option != {} and not set(
                            varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(
                            set(varcol._option.keys()) - set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0, 10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[
                            ['idmen', 'quimen', 'noindiv'] +
                            vars_to_fetch][simu_nonaggr_tables['idmen'].isin(
                                table['idmen'][0:10])]
                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1, 10):
                            temp_sorted = temp_sorted.append(
                                temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(
                                erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(
                                temp_sorted,
                                on=['noindiv', 'idmen', 'quimen'],
                                how='outer')
                        # temp_sorted.set_index(['idmen', 'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)

            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                      how='left',
                                                      on='idmen',
                                                      suffixes=('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            # NOTE(review): the sort key is hard-coded to 'af' rather than
            # the current `col` -- confirm intended.
            bigtemp['table'].sort(columns=['af', 'quimen'],
                                  ascending=[False, True],
                                  inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'],
                                                        sort=False).sum()
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
def preproc(self):
    """Prepare the four working tables the debugger compares.

    Builds and stores on ``self``:
      - ``erf_menage``: ERF menage-level (household) data, columns renamed
        to their OpenFisca equivalents, ``ident`` renamed to ``idmen``;
      - ``erf_eec_indivi``: ERF and EEC individual-level rows merged on ``noindiv``;
      - ``simu_aggr_tables``: simulation output aggregated by menage for
        ``self.variable`` plus all its dependency ancestors and children;
      - ``simu_nonaggr_tables``: individual-level simulation values for the
        "option" variables encountered in the dependency graph.
    Both menage tables are restricted to their common set of ``idmen``.
    All ``print`` output is diagnostic only.
    """
    erf_menage = self.erf_menage
    erf_eec_indivi = self.erf_eec_indivi
    simu_aggr_tables = self.simu_aggr_tables
    simu_nonaggr_tables = self.simu_nonaggr_tables

    def get_all_ancestors(varlist):
        # Recursively collect every column in varlist together with all of its
        # transitive _parents (duplicates possible; de-duplicated later with set()).
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]]
                        + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]]
                        + get_all_ancestors(list(varlist[0]._parents))
                        + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: self.simulation.output_table.column_by_name.get(x), [self.variable])
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in [self.variable]:
        children = set()
        varcol = self.simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        # Aggregate a single simulated variable at the menage level.
        # The local 'variables' deliberately shadows the outer list.
        variables = [variable]
        return self.simulation.aggregated_by_entity(entity="men", variables=variables,
                                                    all_output_vars=False, force_sum=True)[0]

    # One menage-level column per variable, outer-joined on idmen.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=self.simulation.datesim.year)
    os.system('cls')  # NOTE(review): Windows-only console clear; prints an error on other OSes
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom et parfois
    sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales
    ( au moins les aggrégats que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''
    fetch_eec = ['statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia', 'noindiv']
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    # Individual-level ERF and EEC rows joined on the individual id.
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'], table="erf_menage")
    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    # Rename ERF column names to their OpenFisca equivalents.
    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:
    # First from the output_table
    # We recreate the noindiv in output_table
    self.simulation.output_table.table['noindiv'] = 100 * self.simulation.output_table.table.idmen_ind + self.simulation.output_table.table.noi_ind
    self.simulation.output_table.table['noindiv'] = self.simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in set(options).intersection(set(self.simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (self.simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen', 'quimen_ind': 'quimen'}, inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in self.simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in (set(options).intersection(set(self.simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
        #print s2
        temp = self.simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on='noindiv', how='inner', sort=False)
        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)
    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()
    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        # NOTE(review): element-wise comparison relies on both frames being
        # row-aligned after reset_index — confirm ordering is identical.
        temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (col, str(len(temp)))
        del temp

    # Persist the prepared tables for the other debugger methods.
    self.erf_menage = erf_menage
    self.erf_eec_indivi = erf_eec_indivi
    self.simu_aggr_tables = simu_aggr_tables
    self.simu_nonaggr_tables = simu_nonaggr_tables
def test(year=2006, variables = ['af']):
    """Run a survey simulation on the test data and debug aggregates against ERF.

    Script-style entry point: computes the simulation, shows aggregates for
    'af' via the Debugger, then rebuilds the ERF/simulation comparison tables
    inline and prints, for each common variable, the households with the
    largest relative difference together with the variable's dependency tree.

    NOTE(review): 'variables' is a mutable default argument; it is rebound
    (not mutated) below, but callers should not rely on it staying ['af'].
    """
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

#    of_aggregates = Aggregates()
#    of_aggregates.set_simulation(simulation)
#    of_aggregates.compute()
#    print of_aggregates.aggr_frame
#
#    from openfisca_france.data.erf.aggregates import build_erf_aggregates
#    temp = (build_erf_aggregates(variables=variables, year= year))
#    print temp
#    return
    # NOTE(review): hard-coded to "af" — the 'variables' argument drives the
    # dependency walk below, but the Debugger always inspects 'af'.
    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        # Recursively collect each column in varlist plus all transitive _parents.
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]]
                        + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]]
                        + get_all_ancestors(list(varlist[0]._parents))
                        + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x), variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        # Aggregate one simulated variable at the menage level.
        # The local 'variables' deliberately shadows the outer list.
        variables = [variable]
        return simulation.aggregated_by_entity(entity="men", variables=variables,
                                               all_output_vars=False, force_sum=True)[0]

    # One menage-level column per variable, outer-joined on idmen.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')  # NOTE(review): Windows-only console clear
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom et parfois
    sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales
    ( au moins les aggrégats que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''
    fetch_eec = ['statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia', 'noindiv']
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    # Individual-level ERF and EEC rows joined on the individual id.
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'], table="erf_menage")
    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    # Rename ERF column names to their OpenFisca equivalents.
    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:
    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in set(options).intersection(set(simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={'idmen_ind': 'idmen', 'quimen_ind': 'quimen'}, inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in (set(options).intersection(set(simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on='noindiv', how='inner', sort=False)
        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)
    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()
    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        # NOTE(review): element-wise comparison relies on row alignment after
        # reset_index — confirm ordering is identical across both frames.
        temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (col, str(len(temp)))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on='idmen', how='inner', suffixes=('_erf', '_of'))
    print 'Length of new dataframe is %s' % str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []
    options_met = []
    for col in colcom:
        bigtemp = None
        # Keep rows where both sources are non-zero, then compute the absolute
        # relative difference |(erf - of) / erf| in place of column 'col'.
        table = bigtable[and_(bigtable[col + '_erf'] != 0, bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']) / table[col + '_erf']  # relative difference
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' % (col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' % (col, str(table[col].max()))
        print table[col].describe()
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1, 11), table['wprm_of'], 2, return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col], np.arange(1, 101), table['wprm_erf'], 2, return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            # Best-effort: weighted percentiles are diagnostic only.
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)
        #print table[col][0:10].to_string()

        # NOTE(review): bigtemp was just set to None, so this branch always runs.
        if bigtemp is None:
            bigtemp = {'table': table[[col, col + '_of', col + '_erf', 'idmen']][0:10],
                       'options': None}
        bigtemp['table'][col + 'div'] = bigtemp['table'][col + '_of'] / bigtemp['table'][col + '_erf']
        print bigtemp['table'].to_string()
        '''
        bigtemp is the table which will get filled little by little by the relevant variables.
        Up to the last rows of code 'table' refers to a table of aggregated values,
        while 'options is a table of individual variables.
        The reason we call it in a dictionnary is also because we modify it inside the
        recursive function 'iter_on parents', and it causes an error in Python unless
        for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []

                # Pick each child column from the simulation table when present,
                # otherwise from the erf table (suffixes come from the merge above).
                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" % (col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" % (col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched

            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                # Walk the dependency graph upwards, merging parent values and
                # individual-level options into bigtemp as we go.
                if (varcol._parents == set() and varcol._option == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []

                    # NOTE(review): the tests and loops below operate on
                    # 'par_fetched', which is empty at this point; 'par_to_fetch'
                    # was almost certainly intended, so the first branch is always
                    # taken and no parent column is ever collected — confirm.
                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                    if len(par_fetched) > 0:
                        #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                        pass
                    else:
                        #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                        pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp, bigtemp['table'], how='inner')
                        #print temp.to_string(), "\n"

                    # Fetch the individual-level options of this column for the
                    # ten households with the strongest differences.
                    if varcol._option != {} and not set(varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(set(varcol._option.keys()) - set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0, 10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[['idmen', 'quimen', 'noindiv'] + vars_to_fetch][simu_nonaggr_tables['idmen'].isin(table['idmen'][0:10])]

                        # Re-order the rows to follow the error ranking in 'liste'.
                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1, 10):
                            temp_sorted = temp_sorted.append(temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(temp_sorted, on=['noindiv', 'idmen', 'quimen'], how='outer')
#                        temp_sorted.set_index(['idmen', 'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)

            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                      how='left',
                                                      on='idmen',
                                                      suffixes=('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            bigtemp['table'].sort(columns=['af', 'quimen'], ascending=[False, True], inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'], sort=False).sum()
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
def create_imput_loyer(year): ''' Impute les loyers à partir de ??? ''' #Variables used for imputation df = DataCollection(year=year) print 'Démarrer 02_imput_loyer' menm_vars = [ "ztsam", "zperm", "zragm", "zricm", "zrncm", "zracm", "nb_uci", "wprm", "so", "nbpiec", "typmen5", "spr", "nbenfc", "agpr", "cstotpr", "nat28pr", "tu99", "aai1", 'ident', "pol99", "reg", "tau99" ] if year == 2008: # Tau99 not present menm_vars = menm_vars.pop('tau99') indm_vars = ["noi", 'ident', "lpr", "dip11"] LgtAdrVars = ["gzc2"] LgtMenVars = [ "sec1", "mrcho", "mrret", "mrsal", "mrtns", "mdiplo", "mtybd", "magtr", "mcs8", "maa1at", "qex", "muc1" ] if year == 2003: LgtMenVars.extend(["typse", "lmlm", "hnph2", "mnatior", "ident"]) LgtAdrVars.extend(["iaat", "tu99", "ident"]) if year < 2010 and year > 2005: LgtMenVars.extend(["mnatio", "idlog"]) LgtAdrVars.extend(["idlog"]) # pas de typse en 2006 LgtLgtVars = ["lmlm", "iaat", "tu99", "hnph2", "idlog"] # pas de typse en 2006 ## Travail sur la base ERF #Preparing ERF menages tables # print show_temp() # TODO : data.get_values erfmenm = load_temp(name="menagem", year=year) # erfmenm = df.get_values(table="erf_menage",variables=menm_vars) erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] + erfmenm['zragm'] + erfmenm['zricm'] + erfmenm['zrncm'] + erfmenm['zracm']) erfmenm['nvpr'] = erfmenm['revtot'].astype( np.float64) / erfmenm['nb_uci'].astype(np.float64) # On donne la valeur 0 aux nvpr négatifs tmp = np.zeros(erfmenm['nvpr'].shape, dtype=int) erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr']) for v in erfmenm['nvpr']: # On vérifie qu'il n'y a plus de nvpr négatifs assert v >= 0, Exception('Some nvpr are negatives') erfmenm['logt'] = erfmenm['so'] l = erfmenm.columns.tolist() # print l #Preparing ERF individuals table erfindm = load_temp(name="indivim", year=year) # erfindm = df.get_values(table = "eec_indivi", variables = indm_vars) # TODO: clean this later erfindm['dip11'] = 0 count_NA('dip11', erfindm) # erfindm['dip11'] = 99 erfindm = 
erfindm[['ident', 'dip11']][erfindm['lpr'] == 1] # erf <- merge(erfmenm, erfindm, by ="ident") print('merging erf menage and individu') erf = erfmenm.merge(erfindm, on='ident', how='inner') erf = erf.drop_duplicates('ident') # control(erf) La colonne existe mais est vide, # on a du confondre cette colonne avec dip11 ? dec, values = mark_weighted_percentiles(erf['nvpr'], arange(1, 11), erf['wprm'], 2, return_quantiles=True) values.sort() erf['deci'] = (1 + (erf['nvpr'] > values[1]) + (erf['nvpr'] > values[2]) + (erf['nvpr'] > values[3]) + (erf['nvpr'] > values[4]) + (erf['nvpr'] > values[5]) + (erf['nvpr'] > values[6]) + (erf['nvpr'] > values[7]) + (erf['nvpr'] > values[8]) + (erf['nvpr'] > values[9])) # Problème : tous les individus sont soit dans le premier, soit dans le dernier décile. WTF assert_variable_inrange('deci', [1, 11], erf) count_NA('deci', erf) del dec, values gc.collect() #TODO: faire le lien avec men_vars, il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci' erf = erf[[ 'ident', 'ztsam', 'zperm', 'zragm', 'zricm', 'zrncm', 'zracm', 'nb_uci', 'logt', 'nbpiec', 'typmen5', 'spr', 'nbenfc', 'agpr', 'cstotpr', 'nat28pr', 'tu99', 'aai1', 'wprm', 'nvpr', 'revtot', 'dip11', 'deci' ]][erf['so'].isin(range(3, 6))] erf.rename(columns={ 'nbpiec': 'hnph2', 'nat28pr': 'mnatio', 'aai1': 'iaat', 'dip11': 'mdiplo' }, inplace=True) # TODO: ne traite pas les types comme dans R teste-les pour voir comment pandas les gère count_NA('agpr', erf) erf['agpr'] = erf['agpr'].astype('int64') # TODO: moche, pourquoi créer deux variables quand une suffit ? 
erf['tmp'] = 3 erf['tmp'][erf['agpr'] < 65] = 2 erf['tmp'][erf['agpr'] < 40] = 1 erf['magtr'] = erf['tmp'] count_NA('magtr', erf) assert_variable_inrange('magtr', [1, 4], erf) count_NA('cstotpr', erf) erf['tmp'] = erf['cstotpr'].astype('float') / 10.0 erf['tmp'] = map(math.floor, erf['tmp']) erf['mcs8'] = erf['tmp'] erf['mcs8'][erf['mcs8'] == 0] = NaN # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers') count_NA('mcs8', erf) # TODO il reste 41 NA's 2003 erf['mtybd'] = NaN erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1 erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2 erf['mtybd'][erf['typmen5'] == 5] = 3 erf['mtybd'][erf['typmen5'] == 3] = 7 erf['mtybd'][erf['nbenfc'] == 1] = 4 erf['mtybd'][erf['nbenfc'] == 2] = 5 erf['mtybd'][erf['nbenfc'] >= 3] = 6 count_NA('mtybd', erf) # print erf['mtybd'].dtype.fields #assert_variable_inrange('mtybd', [1,7], erf) # bug, on trouve 7.0 qui fait assert # TODO : 3 logements ont 0 pièces !! erf['hnph2'][erf['hnph2'] < 1] = 1 erf['hnph2'][erf['hnph2'] >= 6] = 6 count_NA('hnph2', erf) assert_variable_inrange('hnph2', [1, 7], erf) # # TODO: il reste un NA 2003 # # il rest un NA en 2008 tmp = erf['mnatio'] tmp[erf['mnatio'] == 10] = 1 tmp[erf['mnatio'].isin([ 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 41, 42, 43, 44, 45, 46, 47, 48, 51, 52, 62, 60 ])] = 2 erf['mnatio'] = tmp count_NA('mnatio', erf) assert_variable_inrange('mnatio', [1, 3], erf) tmp = erf['iaat'] tmp[erf['mnatio'].isin([1, 2, 3])] = 1 tmp[erf['mnatio'] == 4] = 2 tmp[erf['mnatio'] == 5] = 3 tmp[erf['mnatio'] == 6] = 4 tmp[erf['mnatio'] == 7] = 5 tmp[erf['mnatio'] == 8] = 6 erf['iaat'] = tmp count_NA('iaat', erf) assert_variable_inrange('iaat', [1, 7], erf) # # Il reste un NA en 2003 # # reste un NA en 2008 # table(erf$iaat, useNA="ifany") # TODO: comparer logement et erf pour ?tre sur que cela colle tmp = erf['mdiplo'] tmp[erf['mdiplo'].isin([71, ""])] = 1 tmp[erf['mdiplo'].isin([70, 60, 
50])] = 2 tmp[erf['mdiplo'].isin([41, 42, 31, 33])] = 3 tmp[erf['mdiplo'].isin([10, 11, 30])] = 4 erf['mdiplo'] = tmp count_NA('mdiplo', erf) #assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade tmp = erf['tu99'] tmp[erf['tu99'] == 0] = 1 tmp[erf['tu99'].isin([1, 2, 3])] = 2 tmp[erf['tu99'].isin([4, 5, 6])] = 3 tmp[erf['tu99'] == 7] = 4 tmp[erf['tu99'] == 8] = 5 erf['tu99_recoded'] = tmp count_NA('tu99_recoded', erf) assert_variable_inrange('tu99_recoded', [1, 6], erf) # TODO : 0 ? Rajouetr 2003 ! tmp = erf['mcs8'] tmp[erf['mcs8'] == 1] = 1 tmp[erf['mcs8'] == 2] = 2 tmp[erf['mcs8'] == 3] = 3 tmp[erf['mcs8'].isin([4, 8])] = 4 tmp[erf['mcs8'].isin([5, 6, 7])] = 5 erf['mcs8'] = tmp count_NA('mcs8', erf) assert_variable_inrange('mcs8', [1, 6], erf) erf['wprm'] = erf['wprm'].astype('int64') count_NA('wprm', erf) del (erf['cstotpr'], erf['agpr'], erf['typmen5'], erf['nbenfc'], erf['spr'], erf['tmp'], erf['tu99']) gc.collect() erf = erf.dropna(subset=[ 'logt', 'magtr', 'mcs8', 'mtybd', 'hnph2', 'mnatio', 'iaat', 'mdiplo', 'tu99_recoded' ]) #On vérifie au final que l'on n'a pas de doublons d'individus assert erf['ident'].value_counts().max() == 1, Exception( 'Number of distinct individuals after removing duplicates is not correct' ) ## Travail sur la table logement # Table menage if year == 2003: year_lgt = 2003 if year > 2005 and year < 2010: year_lgt = 2006 print "preparing logement menage table" # Lgtmen = load_temp(name = "indivim",year = year) # Je rajoute une étape bidon Lgtmen = df.get_values(table="lgt_menage", variables=LgtMenVars) Lgtmen.rename(columns={'idlog': 'ident'}, inplace=True) count_NA('mrcho', Lgtmen) Lgtmen['mrcho'].fillna(0, inplace=True) Lgtmen['mrret'].fillna(0, inplace=True) Lgtmen['mrsal'].fillna(0, inplace=True) Lgtmen['mrtns'].fillna(0, inplace=True) count_NA('mrcho', Lgtmen) Lgtmen['revtot'] = Lgtmen['mrcho'] + Lgtmen['mrret'] + Lgtmen[ 'mrsal'] + Lgtmen['mrtns'] # Virer les revenus négatifs ? 
count_NA('revtot', Lgtmen) Lgtmen['nvpr'] = 10.0 * Lgtmen['revtot'] / Lgtmen['muc1'] count_NA('qex', Lgtmen) dec, values = mark_weighted_percentiles(Lgtmen['nvpr'], arange(1, 11), Lgtmen['qex'], 2, return_quantiles=True) values.sort() Lgtmen['deci'] = ( 1 + (Lgtmen['nvpr'] > values[1]) + (Lgtmen['nvpr'] > values[2]) + (Lgtmen['nvpr'] > values[3]) + (Lgtmen['nvpr'] > values[4]) + (Lgtmen['nvpr'] > values[5]) + (Lgtmen['nvpr'] > values[6]) + (Lgtmen['nvpr'] > values[7]) + (Lgtmen['nvpr'] > values[8]) + (Lgtmen['nvpr'] > values[9])) del dec, values print Lgtmen['deci'].describe() gc.collect() ##Table logement (pas en 2003 mais en 2006) # str(lgtmen) # if (year_lgt=="2006"){ # message("preparing logement logement table") # lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars) # lgtlgt <- upData(lgtlgt, rename=renameidlgt) # lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident") if year_lgt == 2006: print 'preparing logement logement table' lgtlgt = df.get_values(table="lgt_logt", variables=LgtLgtVars) lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True) Lgtmen = Lgtmen.merge(lgtlgt, left_on='ident', right_on='ident', how='inner') del lgtlgt data = Lgtmen[Lgtmen['sec1'].isin([21, 22, 23, 24, 30])] del Lgtmen gc.collect() if year_lgt == 2006: data.rename(columns={'mnatio': 'mnatior'}, inplace=True) data = (data[data['mnatior'].notnull()]) data = (data[data['sec1'].notnull()]) data['tmp'] = data['sec1'].astype(np.int64) data['tmp'][data['sec1'].isin([21, 22, 23])] = 3 data['tmp'][data['sec1'] == 24] = 4 data['tmp'][data['sec1'] == 30] = 5 data['logt'] = data['tmp'] count_NA('logt', data) data = (data[data['logt'].notnull()]) Lgtmen = data # ## Table adresse print "preparing logement adresse table" # lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars) # lgtadr <- upData(lgtadr, rename=renameidlgt) # Je rajoute une étae bidon Lgtadr = df.get_values(table="adresse", variables=LgtAdrVars) Lgtadr.rename(columns={'idlog': 'ident'}, inplace=True) print('Merging logement and menage tables') Logement 
= Lgtmen.merge(Lgtadr, on='ident', how='inner') # control(Logement) # Pas de idfoy, etc. dans la table logement ? Logement['hnph2'][Logement['hnph2'] >= 6] = 6 Logement['hnph2'][Logement['hnph2'] < 1] = 1 count_NA('hnph2', Logement) assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null" # Logement=(Logement[Logement['hnph2'].notnull()]) # Mis en comment car 0 NA pour hnph2 # On est dans la même étape within ici et par la suite ( cf code R ) # TODO : ici problème je transforme les 07 en 7 # car Python considère les 0n comme des nombres octaux ( < 08 ). # J'espère que ce n'est pas important. Logement['tmp'] = Logement['mnatior'] Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1 Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11])] = 2 Logement['mnatior'] = Logement['tmp'] count_NA('mnatior', Logement) assert_variable_inrange('mnatior', [1, 3], Logement) Logement['tmp'] = Logement['iaat'] Logement['tmp'][Logement['iaat'].isin([1, 2, 3, 4, 5])] = 1 Logement['tmp'][Logement['iaat'] == 6] = 2 Logement['tmp'][Logement['iaat'] == 7] = 3 Logement['tmp'][Logement['iaat'] == 8] = 4 Logement['tmp'][Logement['iaat'] == 9] = 5 Logement['tmp'][Logement['iaat'] == 10] = 6 # TODO question Clément : et le 9 et le 10 ? 
Logement['iaat'] = Logement['tmp'] count_NA('iaat', Logement) assert_variable_inrange('iaat', [1, 7], Logement) Logement['tmp'] = Logement['mdiplo'] Logement['tmp'][Logement['mdiplo'] == 1] = 1 Logement['tmp'][Logement['mdiplo'].isin([2, 3, 4])] = 2 Logement['tmp'][Logement['mdiplo'].isin([5, 6, 7, 8])] = 3 Logement['tmp'][Logement['mdiplo'] == 9] = 4 Logement['mdiplo'] = Logement['tmp'] count_NA('mdiplo', Logement) assert_variable_inrange('mdiplo', [1, 5], Logement) Logement['tmp'] = Logement['mtybd'] Logement['tmp'][Logement['mtybd'] == 110] = 1 Logement['tmp'][Logement['mtybd'] == 120] = 2 Logement['tmp'][Logement['mtybd'] == 200] = 3 Logement['tmp'][Logement['mtybd'].isin([311, 321, 401])] = 4 Logement['tmp'][Logement['mtybd'].isin([312, 322, 402])] = 5 Logement['tmp'][Logement['mtybd'].isin([313, 323, 403])] = 6 Logement['tmp'][Logement['mtybd'] == 400] = 7 Logement['mtybd'] = Logement['tmp'] count_NA('mtybd', Logement) assert_variable_inrange('mtybd', [1, 8], Logement) Logement['tmp'] = Logement['tu99'] count_NA('tu99', Logement) Logement['tmp'][Logement['tu99'] == 0] = 1 Logement['tmp'][Logement['tu99'].isin([1, 2, 3])] = 2 Logement['tmp'][Logement['tu99'].isin([4, 5, 6])] = 3 Logement['tmp'][Logement['tu99'] == 7] = 4 Logement['tmp'][Logement['tu99'] == 8] = 5 Logement['tu99_recoded'] = Logement['tmp'] count_NA('tu99_recoded', Logement) assert_variable_inrange('tu99_recoded', [1, 6], Logement) Logement['tmp'] = Logement['gzc2'] Logement['tmp'][Logement['gzc2'] == 1] = 1 Logement['tmp'][Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2 Logement['tmp'][Logement['gzc2'] == 7] = 3 Logement['gzc2'] = Logement['tmp'] count_NA('gzc2', Logement) assert_variable_inrange('gzc2', [1, 4], Logement) Logement['tmp'] = Logement['magtr'] Logement['tmp'][Logement['magtr'].isin([1, 2])] = 1 Logement['tmp'][Logement['magtr'].isin([3, 4])] = 2 Logement['tmp'][Logement['magtr'] == 5] = 3 Logement['magtr'] = Logement['tmp'] count_NA('magtr', Logement) assert_variable_inrange('magtr', 
[1, 4], Logement) Logement['tmp'] = Logement['mcs8'] Logement['tmp'][Logement['mcs8'] == 1] = 1 Logement['tmp'][Logement['mcs8'] == 2] = 2 Logement['tmp'][Logement['mcs8'] == 3] = 3 Logement['tmp'][Logement['mcs8'].isin([4, 8])] = 4 Logement['tmp'][Logement['mcs8'].isin([5, 6, 7])] = 5 Logement['mcs8'] = Logement['tmp'] count_NA('mcs8', Logement) assert_variable_inrange('mcs8', [1, 6], Logement) Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x)) Logement = (Logement[Logement['mdiplo'].notnull()]) Logement = (Logement[Logement['mtybd'].notnull()]) Logement = (Logement[Logement['magtr'].notnull()]) Logement = (Logement[Logement['mcs8'].notnull()]) Logement = (Logement[Logement['maa1at'].notnull()]) ## Imputation des loyers proprement dite # library(StatMatch) # loads StatMatch # # library(mice) use md.pattern to locate missing data # TODO : à supprimer ? # logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident)) # logt$wprm <- logement$qex # erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident)) print('Compute imputed rents') Logt = Logement[[ 'lmlm', 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci', 'ident' ]] Logt['wprm'] = Logement['qex'] erf = erf[[ 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci', 'wprm', 'ident' ]] # # debug # # derf <- describe(erf, weights=as.numeric(erf$wprm)) # # dlogt <- describe(logt, weights=logt$wprm) # # # # for (var in as.list(names(derf))){ # # print("erf") # # print(derf[[var]]) # # print("logt") # # print(dlogt[[var]]) # # print("================") # # } # TODO add md.pattern # erf1 <- na.omit(erf) # logt <- na.omit(logt) from pandas import DataFrame erf = erf.dropna( how='any' ) # Si j'ai bien compris ce que l'on fait en R : dropper les lignes avec des NA #erf1 = erf # A-t-on toujours besoin de changer le nom du coup ? 
Logt = Logt.dropna(how='any') # allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci") # classes <- c("magtr","tu99_recoded") # matchvars <- setdiff(allvars,classes) allvars = [ 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci' ] classes = ['magtr', 'tu99_recoded'] matchvars = list(set(allvars) - set(classes)) erf['mcs8'] = erf['mcs8'].astype(int) # out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower") # fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm") from rpy2.robjects.packages import importr import rpy2.robjects.pandas2ri import rpy2.robjects.vectors as vectors rpy2.robjects.pandas2ri.activate( ) # Permet à rpy2 de convertir les dataframes sm = importr( "StatMatch") #, lib_loc = "C:\Program Files\R\R-2.15.2\library") print 'TEST 2' out_nnd = sm.NND_hotdeck(data_rec=erf, data_don=Logt, match_vars=vectors.StrVector(matchvars), don_class=vectors.StrVector(classes), dist_fun="Gower") print 'TEST 3' fill_erf_nnd = sm.create_fused(data_rec=erf, data_don=Logt, mtc_ids=out_nnd[0], z_vars=vectors.StrVector(["lmlm"])) del allvars, matchvars, classes, out_nnd gc.collect() # fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym')) import pandas.rpy.common as com fill_erf_nnd = com.convert_robj(fill_erf_nnd) fill_erf_nnd = DataFrame(fill_erf_nnd) (fill_erf_nnd).rename(columns={'lmlm': 'loym'}, inplace=True) # loy_imput = fill.erf.nnd[c('ident','loym')] loy_imput = (fill_erf_nnd)[['ident', 'loym']] # load(menm) # menagem$loym <- NULL # menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE) # save(menagem,file=menm) # Mis en comment block, car à manipuler avec précaution je suppose ( ne souhaite pas faire de conneries ) erfmenm = load_temp(name="menagem", year=year) # del erfmenm['loym'] erfmenm = erfmenm.merge(loy_imput, on='ident', how='left') assert 'loym' in erfmenm.columns, 'No loym in 
erfmenm columns' save_temp(erfmenm, name="menagem", year=year)
def create_imput_loyer(year):
    '''
    Impute rents (loyers) for ERF households.

    Matches ERF households against the "Logement" housing survey via
    nearest-neighbour hot-deck matching (R StatMatch, through rpy2), then
    merges the imputed rent `loym` back into the temporary "menagem" table.

    Parameters
    ----------
    year : int
        Survey year (2003, or 2006-2009 which all map onto the 2006
        Logement survey).

    Side effects: reads/writes the temporary HDF tables via
    load_temp/save_temp, and calls into R via rpy2.
    '''
    # --- Variables used for imputation ---------------------------------
    df = DataCollection(year=year)
    print 'Démarrer 02_imput_loyer'
    menm_vars = ["ztsam", "zperm", "zragm", "zricm", "zrncm", "zracm", "nb_uci", "wprm",
                 "so", "nbpiec", "typmen5", "spr", "nbenfc", "agpr", "cstotpr",
                 "nat28pr", "tu99", "aai1", 'ident', "pol99", "reg", "tau99"]
    if year == 2008:  # tau99 not present in 2008
        # NOTE(review): list.pop takes an integer index, so pop('tau99') raises
        # TypeError, and even if it worked the assignment would replace the whole
        # list by the popped element. Likely intended: menm_vars.remove('tau99').
        # TODO confirm and fix.
        menm_vars = menm_vars.pop('tau99')
    indm_vars = ["noi", 'ident', "lpr", "dip11"]
    LgtAdrVars = ["gzc2"]
    LgtMenVars = ["sec1", "mrcho", "mrret", "mrsal", "mrtns", "mdiplo", "mtybd",
                  "magtr", "mcs8", "maa1at", "qex", "muc1"]
    if year == 2003:
        LgtMenVars.extend(["typse", "lmlm", "hnph2", "mnatior", "ident"])
        LgtAdrVars.extend(["iaat", "tu99", "ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio", "idlog"])
        LgtAdrVars.extend(["idlog"])  # no typse in 2006
        LgtLgtVars = ["lmlm", "iaat", "tu99", "hnph2", "idlog"]  # no typse in 2006

    # --- Work on the ERF base ------------------------------------------
    # Preparing ERF menages tables
#    print show_temp()
    # TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
#    erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    # Total household income = sum of the six income components
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] + erfmenm['zragm'] +
                         erfmenm['zricm'] + erfmenm['zrncm'] + erfmenm['zracm'])
    # Income per consumption unit
    erfmenm['nvpr'] = erfmenm['revtot'].astype(np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # Clamp negative nvpr values to 0
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype = int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']:  # check that no negative nvpr remains
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
#    print l

    # Preparing ERF individuals table
    erfindm = load_temp(name = "indivim",year=year)
#    erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)
    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
#    erfindm['dip11'] = 99
    # Keep only the reference person (lpr == 1) of each household
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]

    # erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on ='ident', how='inner')
    erf=erf.drop_duplicates('ident')
    # control(erf) -- the column exists but is empty; may have been confused with dip11?

    # Weighted income deciles of nvpr (weights = wprm)
    dec, values = mark_weighted_percentiles(erf['nvpr'], arange(1,11), erf['wprm'], 2, return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf['nvpr']>values[1]) + (erf['nvpr']>values[2]) + (erf['nvpr']>values[3]) +
                   (erf['nvpr']>values[4]) + (erf['nvpr']>values[5]) + (erf['nvpr']>values[6]) +
                   (erf['nvpr']>values[7]) + (erf['nvpr']>values[8]) + (erf['nvpr']>values[9]))
    # NOTE(review): original comment reports everyone lands in the first or
    # last decile -- worth re-checking mark_weighted_percentiles output.
    assert_variable_inrange('deci',[1,11], erf)
    count_NA('deci',erf)
    del dec, values
    gc.collect()

    # TODO: link with men_vars; "pol99","reg","tau99" are missing here and we
    # have in addition logt, 'nvpr','revtot','dip11','deci'
    # Keep only renters (so in 3..5) and the matching variables
    erf = erf[['ident','ztsam','zperm','zragm','zricm','zrncm','zracm', 'nb_uci', 'logt',
               'nbpiec','typmen5','spr','nbenfc','agpr','cstotpr', 'nat28pr','tu99','aai1','wprm',
               'nvpr','revtot','dip11','deci']][erf['so'].isin(range(3,6))]
    # Rename to the Logement-survey variable names
    erf.rename(columns = {'nbpiec':'hnph2','nat28pr':'mnatio','aai1':'iaat','dip11':'mdiplo'}, inplace = True)

    # TODO: types are not handled as in R; test them to see how pandas deals with them
    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: ugly -- why create two variables when one is enough?
    # magtr: age bracket of the reference person (1: <40, 2: 40-64, 3: 65+)
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr',erf)
    assert_variable_inrange('magtr',[1,4],erf)

    # mcs8: one-digit socio-professional category from cstotpr (0 -> NaN)
    count_NA('cstotpr',erf)
    erf['tmp'] = erf['cstotpr'].astype('float')/10.0
    erf['tmp']=map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
#    assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8',erf)
    # TODO: 41 NA's remain for 2003

    # mtybd: household type (single/couple, number of children)
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd',erf)
#    print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug: 7.0 values trip the assert

    # hnph2: number of rooms clipped to [1, 6]
    # TODO : 3 dwellings have 0 rooms!!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1,7], erf)
#    # TODO: one NA remains in 2003
#    # one NA remains in 2008

    # mnatio: nationality recoded to 1 (French) / 2 (other)
    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([11,12,13,14,15,21,22,23,24,25,26,27,28,29,31,32,41,42,43,44,45,46,47,48,51,52,62,60])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1,3], erf)

    # iaat: building-age bracket
    # NOTE(review): the conditions below test erf['mnatio'] (just recoded to
    # 1/2) rather than erf['iaat'] -- looks like a copy/paste slip; TODO confirm
    # against the original R script before changing.
    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1,2,3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1,7], erf)
#    # One NA remains in 2003
#    # one NA remains in 2008
#    table(erf$iaat, useNA="ifany")

    # mdiplo: diploma level recoded to 4 brackets
    # TODO: compare logement and erf to make sure the codes line up
    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71,""])] = 1
    tmp[erf['mdiplo'].isin([70,60,50])] = 2
    tmp[erf['mdiplo'].isin([41,42,31,33])] = 3
    tmp[erf['mdiplo'].isin([10,11,30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # a stray 99 is floating around

    # tu99_recoded: urban-unit size recoded to 5 brackets
    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1,2,3])] = 2
    tmp[erf['tu99'].isin([4,5,6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1,6], erf)
    # TODO : 0 ? Add 2003!

    # mcs8: collapse to 5 brackets (4+8 -> 4, 5/6/7 -> 5)
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4,8])] = 4
    tmp[erf['mcs8'].isin([5,6,7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1,6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    # Drop working columns and rows with missing matching variables
    del (erf['cstotpr'] ,erf['agpr'], erf['typmen5'], erf['nbenfc'],
         erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()
    erf = erf.dropna(subset=['logt','magtr','mcs8','mtybd','hnph2','mnatio','iaat','mdiplo','tu99_recoded'])
    # Final check: no duplicated households remain
    assert erf['ident'].value_counts().max() == 1, Exception('Number of distinct individuals after removing duplicates is not correct')

    # --- Work on the Logement survey -----------------------------------
    # Menage table
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006
    print "preparing logement menage table"
#    Lgtmen = load_temp(name = "indivim",year = year) # dummy step added
    Lgtmen = df.get_values(table = "lgt_menage", variables = LgtMenVars)
    Lgtmen.rename(columns = {'idlog':'ident'}, inplace = True)

    # Missing income components count as zero
    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace = True)
    Lgtmen['mrret'].fillna(0, inplace = True)
    Lgtmen['mrsal'].fillna(0, inplace = True)
    Lgtmen['mrtns'].fillna(0, inplace = True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho']+Lgtmen['mrret']+Lgtmen['mrsal']+Lgtmen['mrtns']
    # Drop negative incomes?
    count_NA('revtot', Lgtmen)
    # Income per consumption unit (muc1 presumably in tenths -- hence the 10.0;
    # TODO confirm against survey documentation)
    Lgtmen['nvpr']=10.0*Lgtmen['revtot']/Lgtmen['muc1']
    count_NA('qex', Lgtmen)

    # Weighted income deciles (weights = qex), mirroring the ERF side
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],arange(1,11), Lgtmen['qex'],2,return_quantiles=True)
    values.sort()
    Lgtmen['deci'] = (1+(Lgtmen['nvpr']>values[1])+(Lgtmen['nvpr']>values[2])+(Lgtmen['nvpr']>values[3])
                      +(Lgtmen['nvpr']>values[4])+(Lgtmen['nvpr']>values[5])+(Lgtmen['nvpr']>values[6])
                      +(Lgtmen['nvpr']>values[7])+(Lgtmen['nvpr']>values[8])+(Lgtmen['nvpr']>values[9]))
    del dec, values
    print Lgtmen['deci'].describe()
    gc.collect()

    ## Logement table (not in 2003, only in 2006)
    # str(lgtmen)
    # if (year_lgt=="2006"){
    #   message("preparing logement logement table")
    #   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
    #   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
    #   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")
    if year_lgt == 2006:
        print 'preparing logement logement table'
        lgtlgt = df.get_values(table = "lgt_logt", variables = LgtLgtVars)
        lgtlgt.rename(columns = {'idlog':'ident'}, inplace = True)
        Lgtmen = Lgtmen.merge(lgtlgt, left_on = 'ident', right_on = 'ident', how = 'inner')
        del lgtlgt

    # Keep only renters (sec1 in the rental-sector codes)
    data = Lgtmen[Lgtmen['sec1'].isin([21,22,23,24,30])]
    del Lgtmen
    gc.collect()
    if year_lgt == 2006:
        data.rename(columns = {'mnatio':'mnatior'}, inplace = True)
    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    # logt: sector recoded (21/22/23 -> 3, 24 -> 4, 30 -> 5)
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21,22,23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

#    ## Adresse table
    print "preparing logement adresse table"
    # lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
    # lgtadr <- upData(lgtadr, rename=renameidlgt)
    # dummy step added
    Lgtadr = df.get_values(table = "adresse", variables = LgtAdrVars)
    Lgtadr.rename(columns = {'idlog':'ident'}, inplace = True)
    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on = 'ident', how = 'inner')
    # control(Logement) # No idfoy etc. in the logement table?

    # hnph2 clipped to [1, 6], as on the ERF side
    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
#    Logement=(Logement[Logement['hnph2'].notnull()]) # commented out: 0 NA for hnph2

    # Same "within" step as in the R code from here on.
    # NOTE: codes like 07 were turned into 7 because Python treats 0n literals
    # as octal (< 08). Hopefully this does not matter.
    # mnatior: nationality recoded to 1/2
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1,3], Logement)

    # iaat: building-age recoded to 6 brackets
    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1,2,3,4,5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] == 10] = 6
    # TODO question Clément : what about 9 and 10?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1,7], Logement)

    # mdiplo: diploma recoded to 4 brackets
    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2,3,4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5,6,7,8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1,5], Logement)

    # mtybd: household type recoded to 7 brackets
    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311,321,401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312,322,402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313,323,403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1,8], Logement)

    # tu99_recoded: urban-unit size recoded to 5 brackets
    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1,2,3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4,5,6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1,6], Logement)

    # gzc2: zone recoded to 3 brackets
    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2,3,4,5,6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1,4], Logement)

    # magtr: age bracket recoded to 3 brackets
    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1,2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3,4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1,4], Logement)

    # mcs8: socio-professional category collapsed to 5 brackets
    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4,8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5,6,7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1,6], Logement)

    # log of the monthly rent
    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))
    # Drop rows missing any matching variable
    Logement=(Logement[Logement['mdiplo'].notnull()])
    Logement=(Logement[Logement['mtybd'].notnull()])
    Logement=(Logement[Logement['magtr'].notnull()])
    Logement=(Logement[Logement['mcs8'].notnull()])
    Logement=(Logement[Logement['maa1at'].notnull()])

    ## Rent imputation proper
    # library(StatMatch) # loads StatMatch
    # # library(mice) use md.pattern to locate missing data
    # TODO : remove?
    # logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
    # logt$wprm <- logement$qex
    # erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print ('Compute imputed rents')
    Logt = Logement[['lmlm','logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'ident']]
    Logt['wprm'] = Logement['qex']
    erf = erf[['logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'wprm' , 'ident']]
#    # debug
#    # derf <- describe(erf, weights=as.numeric(erf$wprm))
#    # dlogt <- describe(logt, weights=logt$wprm)
#    #
#    # for (var in as.list(names(derf))){
#    #   print("erf")
#    #   print(derf[[var]])
#    #   print("logt")
#    #   print(dlogt[[var]])
#    #   print("================")
#    # }
    # TODO add md.pattern
    # erf1 <- na.omit(erf)
    # logt <- na.omit(logt)
    from pandas import DataFrame
    erf = erf.dropna(how = 'any')  # as in R: drop rows with any NA
    #erf1 = erf # do we still need to rename then?
    Logt = Logt.dropna(how = 'any')

    # allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
    # classes <- c("magtr","tu99_recoded")
    # matchvars <- setdiff(allvars,classes)
    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars)-set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)

    # Nearest-neighbour hot-deck matching via R StatMatch (through rpy2):
    # out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
    # fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate()  # lets rpy2 convert the dataframes
    sm = importr("StatMatch")#, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print 'TEST 2'
    out_nnd = sm.NND_hotdeck(data_rec = erf, data_don = Logt,
                             match_vars = vectors.StrVector(matchvars),
                             don_class = vectors.StrVector(classes),
                             dist_fun = "Gower")
    print 'TEST 3'
    # Fuse donor rents (lmlm) onto the ERF recipients
    fill_erf_nnd = sm.create_fused(data_rec = erf, data_don = Logt,
                                   mtc_ids = out_nnd[0],
                                   z_vars = vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    # fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm':'loym'}, inplace = True)
    # loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident','loym']]

    # load(menm)
    # menagem$loym <- NULL
    # menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
    # save(menagem,file=menm)
    # Handle with care: merge the imputed rent back into "menagem" and save
    erfmenm = load_temp(name="menagem", year=year)
#    del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput,on='ident',how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name = "menagem", year=year)
def check_converted():
    '''
    Debug helper: dump and compare the converted survey files.

    Loads the 2006 input survey and the converted fam/foy/men/ind frames
    from the two HDF stores, prints a series of diagnostics about a
    hard-coded list of problematic idfoy values, then returns early.
    Everything after the bare `return` is an older column dump kept for
    manual debugging. No value is returned; output goes to stdout only.
    '''
    # Retrieving the input and output files for analysis :
    store = HDFStore(survey_test)
    input_df = store['survey_2006']
    output = HDFStore(survey3_test)
    df_fam = output['survey_2006/fam']
    df_foy = output['survey_2006/foy']
    df_men = output['survey_2006/men']
    df_ind = output['survey_2006/ind']
    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table = "erf_indivi")
    df2 = erf.get_of_values(table = "eec_indivi")

    # Dump one specific household from both individual tables
    print '\n'
    print df.loc[df.ident==6030189, :].to_string()
    print df2.loc[df2.ident==6030189, :].to_string()
    # Number of distinct idfoy overall vs. idfoy having a "vous" (quifoy == 0)
    print len(np.unique(input_df['idfoy'].values))
    print len(np.unique(input_df.loc[input_df['quifoy']==0,'idfoy'].values))

    # Hard-coded list of idfoy values under investigation
    liste = [601228002, 602671302, 602016402, 603069602, 601365902, 602679402,
             602680905, 603074902, 600848302, 602684902, 601508802, 601427302,
             601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
             603093102, 601261802, 601000002, 601789602, 601660602, 600350102,
             601927802, 601797902, 601667902, 601537502, 600227602, 602854502,
             602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
             601301302, 602220302, 602486102, 601376802, 601570902, 600654802,
             601443202, 603412402, 603412902, 601055502, 602893001, 601189902,
             601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
             601601802, 600946903, 600428502, 600953502, 601084802, 601350102,
             600829602, 600174402]
    # Households containing those idfoy, and their declaration-related columns
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste), 'idmen'].values)
    print liste_men
    print df.loc[df.ident.isin(liste_men), ['noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip', 'persfipd', 'quelfic']].head(30).to_string()
    print input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string()

#    print input_df.loc[input_df.idfoy==603018901,
#                       ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
#
#    print input_df.loc[input_df.idfam==603018902,
#                       ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    return

    # NOTE(review): everything below is unreachable (after the bare return
    # above); kept as a manual-debugging column dump.
#    df_foy['noindiv'] = df_foy['noi'] ; del df_foy['noi']
#    df_fam['noindiv'] = df_fam['noi'] ; del df_fam['noi']
#    df_men['noindiv'] = df_men['noi'] ; del df_men['noi']
#    print df_fam, df_foy, df_men
#    check_structure(store['survey_2006'])
#    control(input_df, verbose=True, verbose_columns=['noindiv'])
#    control(df_foy, verbose=True, verbose_columns=['noindiv'])
#    print input_df.duplicated('noindiv').sum(), len(input_df)
#    print df_foy.duplicated('noindiv').sum(), len(df_foy)
#    print df_fam.duplicated('noindiv').sum(), len(df_fam)
#    print df_men.duplicated('noindiv').sum(), len(df_men)
#    print df_ind.head(10).to_string()
    print ' FAM'
    print sorted(df_fam.columns)
    print ' FOY'
    print sorted(df_foy.columns)
    print ' MEN'
    print sorted(df_men.columns)
    print ' IND'
    print sorted(df_ind.columns)
#    print df_fam.columns
    print ' INPUT'
    print sorted(input_df.columns)
def create_totals(year=2006): print "Creating Totals" print "Etape 1 : Chargement des données" data = DataCollection(year=year) indivim = load_temp(name="indivim", year=year) assert indivim.duplicated(['noindiv' ]).any() == False, "Présence de doublons" # Deals individuals with imputed income : some individuals are in 'erf individu table' but # not in the 'foyer' table. We need to create a foyer for them. selection = Series() for var in [ "zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci" ]: varo = var[:-1] + "o" test = indivim[var] != indivim[varo] if len(selection) == 0: selection = test else: selection = (test) | (selection) indivi_i = indivim[selection] indivi_i.rename( columns={ "ident": "idmen", "persfip": "quifoy", "zsali": "sali2", # Inclu les salaires non imposables des agents d'assurance "zchoi": "choi2", "zrsti": "rsti2", "zalri": "alr2" }, inplace=True) indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"]) indivi_i["quelfic"] = "FIP_IMP" ## We merge them with the other individuals #indivim <- rename(indivim, c(ident = "idmen", # persfip = "quifoy", # zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance # zchoi = "choi2", # zrsti = "rsti2", # zalri = "alr2")) # #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i) #rm(indivim, indivi_i) #gc() #table(indivi$quelfic) # indivim.rename( columns=dict( ident="idmen", persfip="quifoy", zsali= "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi="choi2", zrsti="rsti2", zalri="alr2"), inplace=True) if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))): raise Exception("Individual ") indivim.set_index("noindiv", inplace=True) indivi_i.set_index("noindiv", inplace=True) indivi = indivim del indivim indivi.update(indivi_i) indivi.reset_index(inplace=True) print '' print "Etape 2 : isolation des FIP" fip_imp = indivi.quelfic == "FIP_IMP" indivi["idfoy"] = ( indivi["idmen"].astype("int64") 
* 100 + (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True)) indivi.loc[fip_imp, "idfoy"] = nan ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?) fip_has_declar = (fip_imp) & (indivi.declar1.notnull()) # indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100 # + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) ) indivi["idfoy"] = where( fip_has_declar, indivi["idmen"] * 100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True), indivi["idfoy"]) del fip_has_declar fip_no_declar = (fip_imp) & (indivi.declar1.isnull()) del fip_imp indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"]) indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]] while any(indivi_fnd.duplicated(cols=["idfoy"])): indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]), indivi_fnd["idfoy"] + 1, indivi_fnd["idfoy"]) assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len( indivi_fnd["idfoy"]), "Duplicates remaining" assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons" indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd del indivi_fnd, fip_no_declar print '' print 'Etape 3 : Récupération des EE_NRT' nrt = indivi.quelfic == "EE_NRT" indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.loc[nrt, "quifoy"] = "vous" del nrt pref_or_cref = indivi['lpr'].isin([1, 2]) adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref) indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy) indivi.loc[adults, "quifoy"] = "vous" del adults assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all() print '' print 'Etape 4 : Rattachement des enfants aux déclarations' assert indivi["noindiv"].duplicated().any( ) == False, "Some noindiv appear twice" lpr3_or_lpr4 = indivi['lpr'].isin([3, 4]) enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", 
"EE_CAF"])) assert indivi.loc[enf_ee, "noindiv"].notnull().all( ), " Some noindiv are not set, which will ruin next stage" assert indivi.loc[ enf_ee, "noindiv"].duplicated().any() == False, "Some noindiv appear twice" pere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] }) mere = DataFrame({ "noindiv_enf": indivi.noindiv.loc[enf_ee], "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] }) foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer") pere = pere.merge(foyer, how="inner", on="noindiv") mere = mere.merge(foyer, how="inner", on="noindiv") # print "Some pere et mere are duplicated because people have two foyers" # print pere[pere.duplicated()] # print mere[mere.duplicated()] df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m')) # print len(pere) # print len(mere) # print len(df) # ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"] # print df.loc[df["noindiv_enf"].isin(ll)] # print df[df.duplicated()] print ' 4.1 : gestion des personnes dans 2 foyers' for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]: df[col] = df[col].fillna( 0, inplace=True) # beacause groupby drop groups with NA in index df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum() df.reset_index(inplace=True) df["which"] = "" df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "") df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "") both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull()) df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere") df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere") assert df["which"].notnull().all( ), "Some enf_ee individuals are not matched with any pere or mere" del lpr3_or_lpr4, pere, mere df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True) df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m) 
df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p) assert df["idfoy"].notnull().all() for col in df.columns: if col not in ["idfoy", "noindiv"]: del df[col] # assert indivi.loc[enf_ee,"idfoy"].notnull().all() assert df.duplicated().any() == False df.set_index("noindiv", inplace=True, verify_integrity=True) indivi.set_index("noindiv", inplace=True, verify_integrity=True) ind_notnull = indivi["idfoy"].notnull().sum() ind_isnull = indivi["idfoy"].isnull().sum() indivi = indivi.combine_first(df) assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() + indivi["idfoy"].isnull().sum()) indivi.reset_index(inplace=True) assert indivi.duplicated().any() == False # MBJ: issue delt with when moving from R code to python ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs ## On suit guide méthodo erf 2003 page 135 ## On supprime les conjoints FIP et les FIP de 25 ans et plus; ## On conserve les enfants FIP de 19 à 24 ans; ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète ## EEC que l'on ne retrouvera pas dans les EEC suivantes. 
# print ' 4.2 : On enlève les individus pour lesquels il manque le déclarant' fip = load_temp(name="fipDat", year=year) fip["declar"] = nan fip["agepf"] = nan fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True) fip.naia = fip.naia.astype("int32") fip.rename( columns=dict( ident="idmen", persfip="quifoy", zsali= "sali2", # Inclu les salaires non imposables des agents d'assurance zchoi="choi2", zrsti="rsti2", zalri="alr2"), inplace=True) is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25) ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25 ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"] ##fip[is_fip_19_25 ,"lpr"] <- 1 # #indivi <- rbind.fill(indivi,fip[is_fip_19_25,]) indivi = concat([indivi, fip.loc[is_fip_19_25]]) del is_fip_19_25 indivi['age'] = year - indivi.naia - 1 indivi['agem'] = 12 * indivi.age + 12 - indivi.naim indivi["quimen"] = 0 indivi.quimen[indivi.lpr == 1] = 0 indivi.quimen[indivi.lpr == 2] = 1 indivi.quimen[indivi.lpr == 3] = 2 indivi.quimen[indivi.lpr == 4] = 3 indivi['not_pr_cpr'] = nan indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False indivi['not_pr_cpr'][indivi['lpr'] > 2] = True print " 4.3 : Creating non pr=0 and cpr=1 idmen's" indivi.reset_index(inplace=True) test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']] test1['quimen'] = 2 j = 2 while any(test1.duplicated(['quimen', 'idmen'])): test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1 j += 1 print_id(indivi) indivi.update(test1) print_id(indivi) # indivi.set_index(['quiment']) #TODO: check relevance # TODO problème avec certains idfoy qui n'ont pas de vous print '' print "Etape 5 : Gestion des idfoy qui n'ont pas de vous" all = indivi.drop_duplicates('idfoy') with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy'] 
without = all[~(all.idfoy.isin(with_.values))] print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous' has_declar2 = (indivi.idfoy.isin( without.idfoy.values)) & (indivi.declar2.notnull()) decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 + indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')) indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None) del all, with_, without, has_declar2 print ' 5.1 : Elimination idfoy restant' idfoyList = indivi.loc[indivi['quifoy'] == "vous", 'idfoy'].drop_duplicates() indivi = indivi[indivi.idfoy.isin(idfoyList.values)] del idfoyList print_id(indivi) myvars = [ "noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age", "agem", "quelfic", "actrec", "quimen", "nbsala", "titc", "statut", "txtppb", "chpub", "prosa", "encadr" ] if not (len(set(myvars).difference(set(indivi.columns))) == 0): print set(myvars).difference(set(indivi.columns)) assert len(set(myvars).difference(set(indivi.columns))) == 0 indivi = indivi.loc[:, myvars] ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé ## les infos provenant des déclarations) print '' print 'Etape 6 : Création des variables descriptives' print ' 6.1 : variable activité' indivi['activite'] = None indivi['activite'][indivi['actrec'] <= 3] = 0 indivi['activite'][indivi['actrec'] == 4] = 1 indivi['activite'][indivi['actrec'] == 5] = 2 indivi['activite'][indivi['actrec'] == 7] = 3 indivi['activite'][indivi['actrec'] == 8] = 4 indivi['activite'][indivi['age'] <= 13] = 2 # ce sont en fait les actrec=9 print indivi['activite'].value_counts() # TODO: MBJ problem avec les actrec indivi['titc'][indivi['titc'].isnull()] = 0 assert indivi['titc'].notnull().all(), Exception("Problème avec les titc") print ' 6.2 : variable statut' indivi['statut'][indivi['statut'].isnull()] = 0 indivi['statut'] = indivi['statut'].astype('int') indivi['statut'][indivi['statut'] == 11] = 1 
indivi['statut'][indivi['statut'] == 12] = 2 indivi['statut'][indivi['statut'] == 13] = 3 indivi['statut'][indivi['statut'] == 21] = 4 indivi['statut'][indivi['statut'] == 22] = 5 indivi['statut'][indivi['statut'] == 33] = 6 indivi['statut'][indivi['statut'] == 34] = 7 indivi['statut'][indivi['statut'] == 35] = 8 indivi['statut'][indivi['statut'] == 43] = 9 indivi['statut'][indivi['statut'] == 44] = 10 indivi['statut'][indivi['statut'] == 45] = 11 assert indivi['statut'].isin( range(12)).all(), Exception("statut value over range") #indivi$nbsala <- as.numeric(indivi$nbsala) #indivi <- within(indivi,{ # nbsala[is.na(nbsala) ] <- 0 # nbsala[nbsala==99 ] <- 10 # TODO 418 fip à retracer qui sont NA #}) print ' 6.3 : variable txtppb' indivi['txtppb'] = indivi['txtppb'].fillna(0) assert indivi['txtppb'].notnull().all() indivi['nbsala'] = indivi['nbsala'].fillna(0) indivi['nbsala'] = indivi['nbsala'].astype('int') indivi['nbsala'][indivi['nbsala'] == 99] = 10 assert indivi['nbsala'].isin(range(11)).all() print ' 6.4 : variable chpub et CSP' indivi['chpub'].fillna(0, inplace=True) indivi['chpub'] = indivi['chpub'].astype('int') indivi['chpub'][indivi['chpub'].isnull()] = 0 print indivi['chpub'].value_counts() assert indivi['chpub'].isin(range(11)).all() indivi['cadre'] = 0 indivi['prosa'][indivi['prosa'].isnull()] = 0 assert indivi['prosa'].notnull().all() print indivi['encadr'].value_counts() # encadr : 1=oui, 2=non indivi['encadr'].fillna(2, inplace=True) assert indivi['encadr'].notnull().all() indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1 indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1 print "cadre" print indivi['cadre'].value_counts() assert indivi['cadre'].isin(range(2)).all() print '' print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence" print 'nb de doublons idfam/quifam', len( indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print 'On crée les n° de personnes à charge' assert 
indivi['idfoy'].notnull().all() print_id(indivi) indivi['quifoy2'] = 2 indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0 indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1 indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2 del indivi['quifoy'] indivi['quifoy'] = indivi['quifoy2'] del indivi['quifoy2'] print_id(indivi) test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']] print_id(test2) j = 2 while test2.duplicated(['quifoy', 'idfoy']).any(): test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j j += 1 print_id(test2) indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left") indivi['quifoy'] = indivi['quifoy_x'] indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x']) del indivi['quifoy_x'], indivi['quifoy_y'] print_id(indivi) del test2, fip print 'nb de doublons idfam/quifam', len( indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]) print_id(indivi) ##################################################################################### ## On ajoute les idfam et quifam #load(famc) # #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE) #rm(famille) #print_id(tot2) # ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"] #tot2[is.na(tot2$quifam), "quifam"] <- 0 #print_id(tot2) #saveTmp(tot2, file = "tot2.Rdata") #rm(indivi,tot2) # ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents ## load foyer #loadTmp(file = "tot2.Rdata") #loadTmp(file= "foyer_aggr.Rdata") # #tot3 <- merge(tot2, foyer, all.x = TRUE) #print_id(tot3) # OK #saveTmp(tot3, file= "tot3.Rdata") #rm(tot3,tot2,foyer) # print '' print 'Etape 8 : création des fichiers totaux' famille = load_temp(name='famc', year=year) print ' 8.1 : création de tot2 & tot3' tot2 = indivi.merge(famille, on='noindiv', how='inner') # del famille # TODO: MBJ increase in number of menage/foyer when merging with 
family ... del famille control(tot2, debug=True, verbose=True) assert tot2['quifam'].notnull().all() save_temp(tot2, name='tot2', year=year) del indivi print ' tot2 saved' # #On combine les variables de revenu # foyer = load_temp(name='foy_ind', year=year) # print " INTERSERCT THE POOCHAY" # tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1 # print "pingas" # print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10] # print "pocchay" # print sorted(foyer["idfoy"].unique())[0:10] # print "final flash" # print 602062550.0 in foyer["idfoy"].values # print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique()))) # print tot2.quifoy.value_counts() #tot2.update(foyer) tot2.merge(foyer, how='left') tot2 = tot2[tot2.idmen.notnull()] # tot2['idfoy'] += 1 print_id(tot2) tot3 = tot2 # TODO: check where they come from tot3 = tot3.drop_duplicates(cols='noindiv') print len(tot3) #Block to remove any unwanted duplicated pair print " check tot3" control(tot3, debug=True, verbose=True) tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy']) tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam']) tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen']) tot3 = tot3.drop_duplicates(cols='noindiv') control(tot3) ## On ajoute les variables individualisables #loadTmp("foyer_individualise.Rdata") # foy_ind #loadTmp("tot3.Rdata") #loadTmp("allvars.Rdata") #loadTmp("sif.Rdata") # #vars2 <- setdiff(names(tot3), allvars) #tot3 <- tot3[,vars2] # #print_id(tot3) #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE) # print ' 8.2 : On ajoute les variables individualisables' allvars = load_temp(name='ind_vars_to_remove', year=year) vars2 = set(tot3.columns).difference(set(allvars)) tot3 = tot3[list(vars2)] print len(tot3) assert not (tot3.duplicated( cols=['noindiv']).any()), "doublon dans tot3['noindiv']" lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])]) assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % ( 
lg_dup) save_temp(tot3, name='tot3', year=year) control(tot3) del tot2, allvars, tot3, vars2 print 'tot3 sauvegardé' gc.collect()
def sif(year=2006):
    """Extract and decode the fixed-width 'sif' string of the ERF foyer table.

    The sif field encodes the tax declaration: marital-status letter,
    checkbox cases (E, F, G, K, ...), event cases X/Y/Z with their date
    strings, and the numbers of dependants (nbF..nbP).  Character positions
    shift across survey years; the offset `d` absorbs the shift.  The
    decoded table is stored via save_temp(name='sif', year=year).
    """
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    # TODO: how should the rfr n-2 used for the TH exemption be chosen?
    # mnrvka: TH income n-2 ; mnrvkh: TH income (revenu fiscal de référence)
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    sif = data.get_values(variables=vars, table="foyer")
    sif['statmarit'] = 0
    print sif

    if year == 2009:
        # Two sif entries are misaligned in the 2009 data; realign them by
        # hand (same fix as the original R code: shift around position 60,
        # resp. insert a blank at position 59).
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif

    print sif["rbg"].describe()
    # rbg carries its sign in tsrvbg ('+' or '-'); apply it.
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+') - (sif["tsrvbg"] == '-'))
    print sif["rbg"].describe()

    # stamar: 5th character of sif (R: substr(sif, 5, 5)), the marital
    # status letter, mapped to the integer codes 1..5.
    sif["stamar"] = sif["sif"].str[4:5]
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    # Birth years of the declarant and of the spouse
    # (R: substr(sif, 6, 9) and substr(sif, 11, 14)).
    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    # Checkbox cases at fixed positions (1-indexed in the original R substr).
    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"

    d = 0  # year-dependent position offset for the following fields
    if year in [2006, 2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    if year in [2008]:
        d = - 1  # case L disappears in 2008, shifting everything left by one
        sif["caseP"] = sif["sif"].str[20+d:21+d] == "P"
        sif["caseS"] = sif["sif"].str[21+d:22+d] == "S"
        sif["caseW"] = sif["sif"].str[22+d:23+d] == "W"
        sif["caseN"] = sif["sif"].str[23+d:24+d] == "N"
        sif["caseH"] = sif["sif"].str[24+d:28+d]
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"
    if year in [2009]:
        # case L is back in 2009 (same layout as 2006 up to caseN)...
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # ...but caseH is dropped (while case L returned), hence an offset
        # of -4 with respect to the 2006 layout for the remaining fields.
        d = -4
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"

    # Event cases X/Y/Z and their raw date strings.
    sif["caseX"] = sif["sif"].str[33+d:34+d] == "X"
    sif["dateX"] = sif["sif"].str[34+d:42+d]
    sif["caseY"] = sif["sif"].str[42+d:43+d] == "Y"
    sif["dateY"] = sif["sif"].str[43+d:51+d]
    sif["caseZ"] = sif["sif"].str[51+d:53+d] == "Z"
    # NOTE(review): the original R code flags this start offset as an error
    # ("ERROR 54+d") — confirm whether 52+d is the intended position.
    sif["dateZ"] = sif["sif"].str[52+d:60+d]
    sif["causeXYZ"] = sif["sif"].str[60+d:61+d]
    # TODO: convert dateX/dateY/dateZ to proper pandas dates

    sif["nbptr"] = sif["nbptr"] / 100  # number of tax shares, stored times 100
    sif["rfr_n_2"] = sif["mnrvka"]     # revenu fiscal de référence, year n-2

    # Numbers of dependants by category (two-digit fields).
    sif["nbF"] = sif["sif"].str[64+d:66+d]
    sif["nbG"] = sif["sif"].str[67+d:69+d]
    sif["nbR"] = sif["sif"].str[70+d:72+d]
    sif["nbJ"] = sif["sif"].str[73+d:75+d]
    sif["nbN"] = sif["sif"].str[76+d:78+d]
    sif["nbH"] = sif["sif"].str[79+d:81+d]
    sif["nbI"] = sif["sif"].str[82+d:84+d]
    if (year != 2009):
        sif["nbP"] = sif["sif"].str[85+d:87+d]

    # Drop the raw working columns before saving.
    del sif["sif"], sif["stamar"]

    print sif["statmarit"].value_counts()
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(sif["noindiv"].value_counts())
    # noindiv is expected to be unique already: dropping duplicates must be
    # a no-op.
    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts()) == len(sif_drop_duplicated["noindiv"]), Exception("Number of distinct individuals after removing duplicates is not correct")

    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
def create_fip(year=2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5 print "il ya a au maximum %s 
pac par foyer" % nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns) fip.fillna( NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1, nb_pac_max + 1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)] fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * (i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration', 'naia', 'type_pac']) # TODO: check if useful fip.set_index(["declaration", "pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G' ])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG), '/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN 
fip.update(tyFG) print 'enfants F & G traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']] del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip'] == 'pac')] pac['naia'] = pac['naia'].astype( 'int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' % ( pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = 
fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1', len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated( ['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', 
pacInd.duplicated('noindiv').sum() print 'nb de NaN', pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype( 'float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")] individec1 = individec1.loc[:, [ "declar1", "noidec", "ident", "rga", "ztsai", "ztsao" ]] individec1 = individec1.rename(columns={'declar1': 'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")] individec2 = individec2.loc[:, [ "declar2", "noidec", "ident", "rga", "ztsai", "ztsao" ]] individec2.rename(columns={'declar2': 'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() 
fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip = concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype( 'float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <= 20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf'] <= 15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi', 'ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100 * fip['ident'] + fip['noidec'] fip['noindiv'] = 100 * fip['ident'] + fip['noi'] fip['type_pac'] = 0 fip['key'] = 0 print fip.duplicated('noindiv').value_counts() 
save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip sauvegardé'
def create_indivim(year=2006): ''' ''' # load data = DataCollection(year=year) erfmen = data.get_values(table="erf_menage") eecmen = data.get_values(table="eec_menage") print sorted(eecmen.columns) erfind = data.get_values(table="erf_indivi") eecind = data.get_values(table="eec_indivi") print eecind.columns print erfind.columns # travail sur la cohérence entre les bases noappar_m = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))] print 'describe noappar_m' print noappar_m.describe() noappar_i = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))] noappar_i = noappar_i.drop_duplicates(cols='ident', take_last = True) #TODO: vérifier qu'il n'y a théoriquement pas de doublon dif = set(noappar_i.ident).symmetric_difference(noappar_m.ident) int = set(noappar_i.ident) & set(noappar_m.ident) print "dif, int --------------------------------" print dif, int del noappar_i, noappar_m, dif, int gc.collect() #fusion enquete emploi et source fiscale menagem = erfmen.merge(eecmen) indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how="inner") # optimisation des types? Controle de l'existence en passant #TODO: minimal dtype var_list = (['acteu', 'stc', 'contra', 'titc', 'forter', 'mrec', 'rstg', 'retrai', 'lien', 'noicon', 'noiper', 'noimer', 'naia', 'cohab', 'agepr', 'statut', 'txtppb', 'encadr', 'prosa']) for var in var_list: try: indivim[var] = indivim[var].astype("float32") except: print "%s is missing" %var # création de variables ## actrec indivim['actrec'] = 0 #TODO: pas de 6 ?!! 
filter1 = (indivim['acteu'] == 1) & (indivim['stc'].isin([1,3])) indivim['actrec'][filter1] = 1 filter2 = (indivim['acteu'] == 1) & (((indivim['stc'] == 2) & (indivim['contra'] == 1)) | (indivim['titc'] == 2)) indivim['actrec'][filter2] = 2 indivim['actrec'][indivim['acteu'] == 1] = 3 filter4 = (indivim['acteu'] == 2) | ((indivim['acteu'] == 3) & (indivim['mrec'] == 1)) indivim['actrec'][filter4] = 4 filter5 = (indivim['acteu'] == 3) & ((indivim['forter'] == 2) | (indivim['rstg'] == 1)) indivim['actrec'][filter5] = 5 filter7 = (indivim['acteu'] == 3) & ((indivim['retrai'] == 1) | (indivim['retrai'] == 2)) indivim['actrec'][filter7] = 7 indivim['actrec'][indivim['acteu'] == 3] = 8 indivim['actrec'][indivim['acteu'].isnull()] = 9 print indivim['actrec'].value_counts() # tu99 if year == 2009: #erfind['tu99'] = None #eecind['tu99'] = float(eecind['tu99']) erfind['tu99'] = NaN ## locataire menagem["locataire"] = menagem["so"].isin([3,4,5]) menagem["locataire"] = menagem["locataire"].astype("int32") ## ?? c'est bizarre d'avoir besoin du diplome de la personne de référence, ## ce serait mieux de faire le merge quand on a besoin seulement ## laissons à la table individuel ce qui doit l'être # NOTE pas de ddipl en year=2006 visiblement # transfert = indivim.ix[indivim['lpr'] == 1, ['ident', 'ddipl']] # print transfert # #TODO: Forget not to uncomment 'dat ## #menagem <- merge(erfmen,eecmen) ## #menagem <- merge(menagem,transfert) # menagem = menagem.merge(transfert) # correction def _manually_remove_errors(): ''' This method is here because some oddities can make it through the controls throughout the procedure It is here to remove all these individual errors that compromise the process. 
GL,HF ''' if year==2006: indivim.lien[indivim.noindiv==603018905] = 2 indivim.noimer[indivim.noindiv==603018905] = 1 print indivim[indivim.noindiv==603018905].to_string() _manually_remove_errors() # save save_temp(menagem, name="menagem", year=year) del erfmen, eecmen, menagem #, transfert print 'menagem saved' gc.collect() save_temp(indivim, name="indivim", year=year) del erfind, eecind print 'indivim saved' gc.collect()
def foyer_all(year=2006):
    '''
    Build the individual-level table of tax-declaration box values ("foy_ind").

    Reads the raw "foyer" table for `year`, keeps only the declaration boxes
    (columns matching ``f<digit>...``), sums duplicated declarations per
    `noindiv`, then melts each group of boxes listed in `var_dict` into one
    individual-level variable tagged with a `quifoy` position
    (vous/conj/pac1/pac2/pac3, recoded 0-4 at the end).

    Side effects: saves "foy_ind" and "ind_vars_to_remove" via `save_temp`;
    prints progress to stdout.  Returns None.
    (Ported from R code, kept in the `#...<-...` comments for provenance.)
    '''
    ## Add the tax-declaration boxes
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")
    ## keep only the declaration boxes ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #
    vars = foyer_all.columns
    # Box columns are named f + digit + two letters; match on the "f<digit>" prefix.
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]
    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #
    foyer = foyer_all[vars + ["noindiv"]]
    del foyer_all
    gc.collect()
    #
    ## Aggregate declarations when one individual filed several of them
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #
    print_id(foyer)
    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problem with the dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #############################################################################
    ## Retrieve the individualizable variables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}
    # Mapping: individual-level OF variable -> list of declaration boxes, in
    # person order (vous, conj, pac1, pac2, pac3 — shorter lists cover fewer persons).
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']
        }  # (house-moving boxes) only in 2006
    #
    #varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
    #               list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
    #               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
    #               ......
    #               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
    #               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
    #               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
    #               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er')))  # (demenagement) only in 2006
    #
    # Eligible = every box listed in var_dict that actually exists for this year.
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))
    print "From %i variables, we keep %i eligibles variables" % (len(
        set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0          # NOTE(review): err/err_vars are never updated below
    err_vars = {}
    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list to the boxes that do exist this year; the
                # rename below still uses the full zip so each box keeps its
                # original person slot (vous/conj/pac1/...).
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]
        # Reshape the dataframe: wide (one column per person slot) -> long
        # (one row per noindiv x quifoy), dropping zero values.
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        # print len(selection)
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            # Outer join on (quifoy, noindiv) accumulates one column per variable.
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')
    foy_ind.reset_index(inplace=True)
    print "foy_ind"
    print foy_ind.describe().to_string()
    #not_first <- FALSE
    #allvars = c()
    #for (v in varlist){
    #  vars = intersect(v[[2]],names(foyer))  # to deal with variabes that are not present
    #  if (length(vars) > 0) {
    #    allvars <- c(allvars, vars)
    #    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
    #    n <- length(vars)
    #    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
    #    if (not_first) {
    #      print('merge')
    #      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
    #      names(foy_ind)
    #    }
    #    else {
    #      print('init')
    #      foy_ind <- temp
    #      not_first <- TRUE
    #    }
    #  }
    #}
    # Remember which raw boxes were consumed so they can be dropped elsewhere.
    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)
    # Recode quifoy labels to the numeric convention: vous=0, conj=1, pac1-3=2-4.
    foy_ind['quifoy'][foy_ind['quifoy'] == 'vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy'] == 'conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac3'] = 4
    assert foy_ind['quifoy'].isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return
def preproc(self):
    '''
    Prepare the four comparison tables used by the checking procedure.

    Collects `self.variable` plus all its ancestor and child variables (and
    the "options" they declare) from the simulation, aggregates them by
    menage into `simu_aggr_tables`, fetches the matching ERF/EEC survey data
    (`erf_menage`, `erf_eec_indivi`), extracts the option columns at the
    individual level into `simu_nonaggr_tables`, restricts both menage tables
    to their common `idmen`, and prints, per common column, how many
    households disagree.  The four tables are stored back on `self`.
    '''
    # Local aliases; they are rebound below and written back to self at the end.
    erf_menage = self.erf_menage
    erf_eec_indivi = self.erf_eec_indivi
    simu_aggr_tables = self.simu_aggr_tables
    simu_nonaggr_tables = self.simu_nonaggr_tables

    def get_all_ancestors(varlist):
        # Recursively walk the _parents links of each column object,
        # returning the columns together with all their ancestors.
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]] + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]] +
                        get_all_ancestors(list(varlist[0]._parents)) +
                        get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(
        lambda x: self.simulation.output_table.column_by_name.get(x),
        [self.variable])
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in [self.variable]:
        children = set()
        varcol = self.simulation.output_table.column_by_name.get(var)
        children = children.union(
            set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        # Menage-level aggregate (forced sum) of a single simulated variable.
        variables = [variable]
        return self.simulation.aggregated_by_entity(entity="men",
                                                    variables=variables,
                                                    all_output_vars=False,
                                                    force_sum=True)[0]

    # Build the aggregated table one variable at a time, merging on idmen.
    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(
            get_var(var)[['idmen', var]], on='idmen', how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=self.simulation.datesim.year)
    os.system('cls')  # NOTE(review): Windows-only console clear; no-op elsewhere
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    # (The French text below is an in-code how-to note for maintainers: how to
    # locate ERF/EEC equivalents of OF variables — try get_of2erf first, then
    # build_survey, then model/model.py.  Kept verbatim: it is a bare string
    # statement, not a comment.)
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom et parfois sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales
    ( au moins les aggrégats que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''
    fetch_eec = [
        'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia',
        'noindiv'
    ]
    fetch_erf = [
        'zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic'
    ]
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    # Individual-level ERF + EEC data, inner-joined on noindiv.
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")
    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"
    # Rename ERF columns to their OF equivalents.
    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:
    # First from the output_table
    # We recreate the noindiv in output_table
    self.simulation.output_table.table[
        'noindiv'] = 100 * self.simulation.output_table.table.idmen_ind + self.simulation.output_table.table.noi_ind
    self.simulation.output_table.table[
        'noindiv'] = self.simulation.output_table.table['noindiv'].astype(
            np.int64)
    s1 = [
        var for var in set(options).intersection(
            set(self.simulation.output_table.table.columns))
    ] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (self.simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={
        'idmen_ind': 'idmen',
        'quimen_ind': 'quimen'
    },
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in self.simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [
            var for var in (set(options).intersection(
                set(self.simulation.input_table.table.columns)) - set(s1))
        ] + ['noindiv']
        #print s2
        temp = self.simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                        on='noindiv',
                                                        how='inner',
                                                        sort=False)
        del s2, temp
    del s1
    gc.collect()
    # Keep only the option columns plus the identifiers.
    simu_nonaggr_tables = simu_nonaggr_tables[
        list(set(options)) + ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)
    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"
    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(
    ), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(
    ), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(
        set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()
    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    # NOTE(review): this element-wise comparison assumes both frames are
    # row-aligned after reset_index — confirm ordering before trusting counts.
    for col in colcom:
        temp = set(
            erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp)))
        del temp

    # Persist the prepared tables on the instance.
    self.erf_menage = erf_menage
    self.erf_eec_indivi = erf_eec_indivi
    self.simu_aggr_tables = simu_aggr_tables
    self.simu_nonaggr_tables = simu_nonaggr_tables
def check_converted():
    '''
    Ad-hoc diagnostic for the survey conversion.

    Prints raw ERF/EEC rows and converted input rows around a hard-coded list
    of suspicious `idfoy` values so they can be inspected by eye.  Reads the
    module-level test HDF stores (`survey_test`, `survey3_test`); produces
    console output only and returns None.
    '''
    #Retrieving the input and output files for analysis :
    store = HDFStore(survey_test)
    input_df = store['survey_2006']
    output = HDFStore(survey3_test)
    df_fam = output['survey_2006/fam']
    df_foy = output['survey_2006/foy']
    df_men = output['survey_2006/men']
    df_ind = output['survey_2006/ind']

    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table="erf_indivi")
    df2 = erf.get_of_values(table="eec_indivi")
    print '\n'
    # Inspect one particular household in both raw tables.
    print df.loc[df.ident == 6030189, :].to_string()
    print df2.loc[df2.ident == 6030189, :].to_string()

    # Number of distinct foyers vs number of foyer heads (quifoy == 0).
    print len(np.unique(input_df['idfoy'].values))
    print len(np.unique(input_df.loc[input_df['quifoy'] == 0, 'idfoy'].values))
    # Hard-coded list of suspicious idfoy values to inspect.
    liste = [
        601228002, 602671302, 602016402, 603069602, 601365902, 602679402,
        602680905, 603074902, 600848302, 602684902, 601508802, 601427302,
        601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
        603093102, 601261802, 601000002, 601789602, 601660602, 600350102,
        601927802, 601797902, 601667902, 601537502, 600227602, 602854502,
        602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
        601301302, 602220302, 602486102, 601376802, 601570902, 600654802,
        601443202, 603412402, 603412902, 601055502, 602893001, 601189902,
        601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
        601601802, 600946903, 600428502, 600953502, 601084802, 601350102,
        600829602, 600174402
    ]
    # Households containing those foyers, then their raw declaration fields.
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste),
                                       'idmen'].values)
    print liste_men
    print df.loc[df.ident.isin(liste_men), [
        'noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip',
        'persfipd', 'quelfic'
    ]].head(30).to_string()
    print input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string()
    #    print input_df.loc[input_df.idfoy==603018901,
    #                       ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    #
    #    print input_df.loc[input_df.idfam==603018902,
    #                       ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    return
    # NOTE(review): everything below is unreachable dead code (it follows the
    # early return above, and references function locals so it cannot be
    # module-level) — consider deleting or moving it before the return.
    #    df_foy['noindiv'] = df_foy['noi'] ; del df_foy['noi']
    #    df_fam['noindiv'] = df_fam['noi'] ; del df_fam['noi']
    #    df_men['noindiv'] = df_men['noi'] ; del df_men['noi']
    #    print df_fam, df_foy, df_men
    #    check_structure(store['survey_2006'])
    #    control(input_df, verbose=True, verbose_columns=['noindiv'])
    #    control(df_foy, verbose=True, verbose_columns=['noindiv'])
    #    print input_df.duplicated('noindiv').sum(), len(input_df)
    #    print df_foy.duplicated('noindiv').sum(), len(df_foy)
    #    print df_fam.duplicated('noindiv').sum(), len(df_fam)
    #    print df_men.duplicated('noindiv').sum(), len(df_men)
    #    print df_ind.head(10).to_string()
    print ' FAM'
    print sorted(df_fam.columns)
    print ' FOY'
    print sorted(df_foy.columns)
    print ' MEN'
    print sorted(df_men.columns)
    print ' IND'
    print sorted(df_ind.columns)
    #    print df_fam.columns
    print ' INPUT'
    print sorted(input_df.columns)