Exemplo n.º 1
0
def create_final(year=None):
    """Build the ``final`` table for one year and store it with ``save_temp``.

    The temporary tables ``tot3`` and ``foy_ind`` are aligned on the
    ``(idfoy, quifoy)`` pair, rows without an ``idmen`` are dropped, the
    ``sif`` table is left-merged on ``noindiv`` and missing ``caseP`` /
    ``caseF`` values are replaced by ``False``.  The result is saved under
    the name ``'final'``; nothing is returned.

    Args:
        year: Year forwarded to ``load_temp`` / ``save_temp``.  Mandatory
            despite the ``None`` default.

    Raises:
        ValueError: If ``year`` is ``None``.  ``ValueError`` derives from
            ``Exception``, so callers catching ``Exception`` still work.
    """
    if year is None:
        raise ValueError("A year is needed")

    # Single-argument print(...) gives the same output on Python 2 and is
    # also valid Python 3.
    print('création de final')
    foy_ind = load_temp(name='foy_ind', year=year)
    tot3 = load_temp(name='tot3', year=year)

    # Column-wise concatenation restricted to the rows of tot3.
    # NOTE(review): `join_axes` is only accepted by older pandas releases.
    foy_ind.set_index(['idfoy', 'quifoy'], inplace=True)
    tot3.set_index(['idfoy', 'quifoy'], inplace=True)
    final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1)
    final.reset_index(inplace=True)
    # Undo the in-place re-indexing on the loaded frames before dropping
    # them (load_temp may hand out shared objects -- TODO confirm).
    foy_ind.reset_index(inplace=True)
    tot3.reset_index(inplace=True)

    final = final[final.idmen.notnull()]

    control(final, verbose=True)
    # Release the two inputs before loading the next table.
    del tot3, foy_ind
    gc.collect()

    # Equivalent of the R reference:
    #   final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    # The messages below say "fip" but the table actually loaded is `sif`.
    print("    loading fip")
    sif = load_temp(name='sif', year=year)

    print(sif.columns)
    print("    update final using fip")
    final = final.merge(sif, on=["noindiv"], how="left")
    # TODO: a method is needed to handle double tax declarations.

    print(final.columns)
    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    save_temp(final, name='final', year=year)
    print('final sauvegardé')
def create_final(year=None):
    """Build the ``final`` table for one year and store it with ``save_temp``.

    The temporary tables ``tot3`` and ``foy_ind`` are aligned on the
    ``(idfoy, quifoy)`` pair, rows without an ``idmen`` are dropped, the
    ``sif`` table is left-merged on ``noindiv`` and missing ``caseP`` /
    ``caseF`` values are replaced by ``False``.  The result is saved under
    the name ``'final'``; nothing is returned.

    Args:
        year: Year forwarded to ``load_temp`` / ``save_temp``.  Mandatory
            despite the ``None`` default.

    Raises:
        ValueError: If ``year`` is ``None``.  ``ValueError`` derives from
            ``Exception``, so callers catching ``Exception`` still work.
    """
    if year is None:
        raise ValueError("A year is needed")

    # Single-argument print(...) gives the same output on Python 2 and is
    # also valid Python 3.
    print('création de final')
    foy_ind = load_temp(name='foy_ind', year=year)
    tot3 = load_temp(name='tot3', year=year)

    # Column-wise concatenation restricted to the rows of tot3.
    # NOTE(review): `join_axes` is only accepted by older pandas releases.
    foy_ind.set_index(['idfoy', 'quifoy'], inplace=True)
    tot3.set_index(['idfoy', 'quifoy'], inplace=True)
    final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1)
    final.reset_index(inplace=True)
    # Undo the in-place re-indexing on the loaded frames before dropping
    # them (load_temp may hand out shared objects -- TODO confirm).
    foy_ind.reset_index(inplace=True)
    tot3.reset_index(inplace=True)

    final = final[final.idmen.notnull()]

    control(final, verbose=True)
    # Release the two inputs before loading the next table.
    del tot3, foy_ind
    gc.collect()

    # Equivalent of the R reference:
    #   final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    # The messages below say "fip" but the table actually loaded is `sif`.
    print("    loading fip")
    sif = load_temp(name='sif', year=year)

    print(sif.columns)
    print("    update final using fip")
    final = final.merge(sif, on=["noindiv"], how="left")
    # TODO: a method is needed to handle double tax declarations.

    print(final.columns)
    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    save_temp(final, name='final', year=year)
    print('final sauvegardé')
Exemplo n.º 3
0
def create_totals(year=2006):
    """Build and save the individual-level tables ``tot2`` and ``tot3``.

    Stages (the printed progress messages use the same numbering):

    1. Load ``indivim`` and overwrite the rows whose imputed income columns
       (``z...i``) differ from the original ones (``z...o``) with a copy
       flagged ``quelfic == "FIP_IMP"``.
    2. Compute ``idfoy`` from ``idmen`` and ``declar1``; FIP_IMP rows
       without ``declar1`` get ``idmen*100 + 50`` (bumped until unique).
    3. ``EE_NRT`` rows, and ``EE`` / ``EE_CAF`` rows with ``lpr`` in
       {1, 2}, become their own ``"vous"`` with ``idfoy = idmen*100 + noi``.
    4. ``EE`` / ``EE_CAF`` rows with ``lpr`` in {3, 4} are attached to the
       father's or mother's ``noindiv`` found in the ``foyer`` table; rows
       of ``fipDat`` aged 19 to 24 are appended; ``quimen`` is derived.
    5. Rows whose ``idfoy`` has no ``"vous"`` are re-pointed through
       ``declar2`` when possible, otherwise dropped.
    6. Recode ``activite``, ``titc``, ``statut``, ``txtppb``, ``nbsala``,
       ``chpub``, ``prosa``, ``encadr`` and derive ``cadre``.
    7. Turn ``quifoy`` into integers (0 vous, 1 conj, 2+ dependants).
    8. Merge with ``famc``, save ``tot2``, de-duplicate, drop the columns
       listed in ``ind_vars_to_remove`` and save ``tot3``.

    Changes compared with the previous revision:

    * ``df[col] = df[col].fillna(0, inplace=True)`` stored ``None`` in the
      grouping columns (in-place methods return ``None``); the non in-place
      form is used instead.
    * The four successive ``where`` calls building ``which`` each discarded
      the previous result; they are now cumulative and the following
      assert really detects unmatched rows.
    * A ``tot2.merge(foyer, ...)`` whose result was thrown away is removed.
    * The uniqueness assert on ``indivi_fnd`` no longer raises ``KeyError``
      when the selection is empty.

    Args:
        year: Year forwarded to ``DataCollection``, ``load_temp`` and
            ``save_temp``; also used to compute ages.

    Returns:
        None.  ``tot2`` and ``tot3`` are written with ``save_temp``.

    Raises:
        Exception: If the ``noindiv`` of the imputed rows do not form a
            strict subset of those of ``indivim``.
        AssertionError: When one of the consistency checks fails.
    """
    # The same renaming is applied to indivim, to its imputed subset and to
    # the fip table.
    income_renames = {
        "ident": "idmen",
        "persfip": "quifoy",
        "zsali": "sali2",  # includes non-taxable wages of insurance agents
        "zchoi": "choi2",
        "zrsti": "rsti2",
        "zalri": "alr2",
    }

    # Single-argument print(...) gives the same output on Python 2 and is
    # also valid Python 3.
    print("Creating Totals")
    print("Etape 1 : Chargement des données")

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"

    # Individuals with imputed income: some are in the individual table but
    # not in the 'foyer' table, so a foyer has to be created for them.
    # A row is selected when any imputed column differs from its original.
    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection]
    indivi_i.rename(columns=income_renames, inplace=True)

    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"

    # Merge them back with the other individuals (R reference: rbind of the
    # non-imputed rows with indivi_i).
    indivim.rename(columns=income_renames, inplace=True)

    # NOTE(review): `>` is a *strict* superset test, so this also raises
    # when every individual was selected above -- confirm this is intended.
    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace=True)

    print('')
    print("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (indivi["idmen"].astype("int64")*100 +
                       (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp, "idfoy"] = nan

    # Some FIP (or at least rows with imputed income) do carry a tax
    # declaration number (reason unknown).
    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())
    indivi["idfoy"] = where(fip_has_declar,
                            indivi["idmen"]*100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
                            indivi["idfoy"])
    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar,
                            indivi["idmen"]*100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]]

    # Bump colliding identifiers by one until they are all distinct.
    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    # `.any()` instead of `value_counts()[False]`: the latter raises
    # KeyError when the selection is empty.
    assert not indivi_fnd["idfoy"].duplicated().any(), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print('')
    print('Etape 3 : Récupération des EE_NRT')

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all()

    print('')
    print('Etape 4 : Rattachement des enfants aux déclarations')

    assert not indivi["noindiv"].duplicated().any(), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not indivi.loc[enf_ee, "noindiv"].duplicated().any(), "Some noindiv appear twice"

    # Candidate declarants: the father's / mother's noindiv rebuilt from
    # the household id and the parent's number within the household.
    pere = DataFrame({"noindiv_enf": indivi.noindiv.loc[enf_ee],
                      "noindiv": 100*indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]})
    mere = DataFrame({"noindiv_enf": indivi.noindiv.loc[enf_ee],
                      "noindiv": 100*indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]})

    foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer")
    pere = pere.merge(foyer, how="inner", on="noindiv")
    mere = mere.merge(foyer, how="inner", on="noindiv")

    # Some pere / mere rows are duplicated because people have two foyers.
    df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m'))

    print('    4.1 : gestion des personnes dans 2 foyers')
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        # groupby drops groups with NA in the key, hence the 0 placeholder.
        # fillna(..., inplace=True) returns None, so it must not be
        # assigned back to the column.
        df[col] = df[col].fillna(0)
    df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    # Pick the declarant: the only parent found in `foyer`, or, when both
    # are found, the one with the larger zimpof (ties go to the mother).
    # Each step starts from the previous result so that no case is lost.
    has_p = df.zimpof_p.notnull()
    has_m = df.zimpof_m.notnull()
    both = has_p & has_m
    df["which"] = where(has_m & ~has_p, "mere", "")
    df["which"] = where(has_p & ~has_m, "pere", df["which"])
    df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", df["which"])
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", df["which"])

    # "" (never null) marks the rows no rule matched.
    assert (df["which"] != "").all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere, has_p, has_m, both

    df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True)
    df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)

    assert df["idfoy"].notnull().all()

    # Keep only the two columns needed for the update below.
    for col in list(df.columns):
        if col not in ["idfoy", "noindiv"]:
            del df[col]

    assert not df.duplicated().any()

    df.set_index("noindiv", inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    # Fill the idfoy that are still missing with the parent's declaration.
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert not indivi.duplicated().any()

    # MBJ: issue dealt with when moving from the R code to Python.
    # TODO: add the FIP children and create a household for the adults.
    # Following the ERF 2003 methodology guide, page 135:
    #  - FIP spouses and FIP aged 25 or more are removed;
    #  - FIP children aged 19 to 24 are kept;
    #  - FIP aged 18 or less are removed, except those born in 2002 in a
    #    household in its 6th interview, since they were born after the
    #    survey date and will not be found in later surveys.
    print('    4.2 : On enlève les individus pour lesquels il manque le déclarant')
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan

    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(columns=income_renames, inplace=True)

    is_fip_19_25 = ((year-fip.naia-1) >= 19) & ((year-fip.naia-1) < 25)

    # TODO: for the time being they stay in their "vous" household.  The R
    # code planned idmen of the form 61XXXX, 62XXXX for children over 18
    # and under 25 (idmen are of the form 60XXXX).
    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12*indivi.age + 12-indivi.naim

    indivi["quimen"] = 0
    for lpr_value, quimen_value in ((1, 0), (2, 1), (3, 2), (4, 3)):
        indivi.quimen[indivi.lpr == lpr_value] = quimen_value
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False
    indivi['not_pr_cpr'][indivi['lpr'] > 2] = True

    print("    4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']]
    test1['quimen'] = 2

    # Give the members of a household that are neither pr nor cpr distinct
    # quimen values 2, 3, 4, ...
    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j+1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    # TODO: some idfoy have no "vous".
    print('')
    print("Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    one_per_foy = indivi.drop_duplicates('idfoy')  # was named `all`
    with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy']
    without = one_per_foy[~(one_per_foy.idfoy.isin(with_.values))]

    print('On cherche si le déclarant donné par la deuxième déclaration est bien un vous')
    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int')*100 +
                   indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del one_per_foy, with_, without, has_declar2

    print('    5.1 : Elimination idfoy restant')
    idfoyList = indivi.loc[indivi['quifoy'] == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = ["noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm",
              "age", "agem", "quelfic", "actrec", "quimen",
              "nbsala", "titc", "statut", "txtppb", "chpub", "prosa", "encadr"]

    # Show the missing columns before failing.
    missing = set(myvars).difference(set(indivi.columns))
    if missing:
        print(missing)
    assert len(missing) == 0

    indivi = indivi.loc[:, myvars]

    # TODO: the actrec of the FIP are not coded (to be done at the end,
    # once the information from the declarations has been gathered).
    print('')
    print('Etape 6 : Création des variables descriptives')
    print('    6.1 : variable activité')
    indivi['activite'] = None
    indivi['activite'][indivi['actrec'] <= 3] = 0
    for actrec_value, activite_value in ((4, 1), (5, 2), (7, 3), (8, 4)):
        indivi['activite'][indivi['actrec'] == actrec_value] = activite_value
    indivi['activite'][indivi['age'] <= 13] = 2  # these are in fact the actrec=9
    print(indivi['activite'].value_counts())
    # TODO: MBJ problem with the actrec

    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all(), "Problème avec les titc"

    print('    6.2 : variable statut')
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    # Applied one after the other, in this order (45 -> 11 comes last, so
    # it is not re-mapped by 11 -> 1).
    statut_recoding = ((11, 1), (12, 2), (13, 3), (21, 4), (22, 5), (33, 6),
                       (34, 7), (35, 8), (43, 9), (44, 10), (45, 11))
    for old_code, new_code in statut_recoding:
        indivi['statut'][indivi['statut'] == old_code] = new_code
    assert indivi['statut'].isin(range(12)).all(), "statut value over range"

    print('    6.3 : variable txtppb')
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    # R reference: NA -> 0 and 99 -> 10 (TODO: 418 FIP with NA to trace).
    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala'] == 99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print('    6.4 : variable chpub et CSP')
    indivi['chpub'] = indivi['chpub'].fillna(0)
    indivi['chpub'] = indivi['chpub'].astype('int')
    print(indivi['chpub'].value_counts())
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print(indivi['encadr'].value_counts())

    # encadr : 1=yes, 2=no
    indivi['encadr'] = indivi['encadr'].fillna(2)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1
    indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1
    print("cadre")
    print(indivi['cadre'].value_counts())
    assert indivi['cadre'].isin(range(2)).all()

    print('')
    print("Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")

    print('nb de doublons idfam/quifam %d'
          % len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]))

    print('On crée les n° de personnes à charge')
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0
    indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1
    indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']]
    print_id(test2)

    # Number the dependants of a same idfoy 2, 3, 4, ...
    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left")
    # Dependants take their renumbered value, the others keep theirs.
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print('nb de doublons idfam/quifam %d'
          % len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])]))
    print_id(indivi)

    # Add idfam and quifam.  The R reference used a left merge and turned
    # FIP children that left their family into one-person families; here an
    # inner merge is used instead.
    print('')
    print('Etape 8 : création des fichiers totaux')
    famille = load_temp(name='famc', year=year)

    print('    8.1 : création de tot2 & tot3')
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    # TODO: MBJ increase in number of menage/foyer when merging with family
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print('    tot2 saved')

    # A `tot2.merge(foyer, how='left')` used to stand here.  Its result was
    # discarded and `foyer` was the stage-4 table, so it had no effect on
    # tot2.  Merging the income variables (foy_ind) is still to be done.
    tot2 = tot2[tot2.idmen.notnull()]

    print_id(tot2)

    # TODO: check where the duplicates come from
    tot3 = tot2.drop_duplicates(cols='noindiv')
    print(len(tot3))

    # Remove any unwanted duplicated pair.
    print("    check tot3")
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

    # Drop the columns listed in ind_vars_to_remove (R reference:
    # vars2 <- setdiff(names(tot3), allvars)).
    print('    8.2 : On ajoute les variables individualisables')

    allvars = load_temp(name='ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print(len(tot3))

    assert not(tot3.duplicated(cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % (lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    # Drop the big frames before forcing a collection.
    del tot2, allvars, tot3, vars2
    print('tot3 sauvegardé')
    gc.collect()
Exemplo n.º 4
0
def final(year=2006, filename="test", check=True):

    ##***********************************************************************/
    print('08_final: derniers réglages')
    ##***********************************************************************/
    #
    # loadTmp("final.Rdata")
    # # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
    # final$statmarit[is.na(final$statmarit)] <- 2
    # table(final$statmarit, useNA='ifany')
    #
    import gc
    gc.collect()
    final = load_temp("final", year=year)
    print 'check doublons', len(final[final.duplicated(['noindiv'])])
    final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit)
    #

    # # activite des fip
    # table(final[final$quelfic=="FIP","activite"],useNA="ifany")
    # summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] )
    # # activite      # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4
    #
    # final_fip <- final[final$quelfic=="FIP",]
    # final_fip <- within(final_fip,{
    #   choi <- ifelse(is.na(choi),0,choi)
    #   sali <- ifelse(is.na(sali),0,sali)
    #   alr <- ifelse(is.na(alr),0,alr)
    #   rsti <- ifelse(is.na(rsti),0,rsti)
    #   activite <- 2 # TODO comment choisr la valeur par d?faut ?
    #   activite <- ifelse(choi > 0,1,activite)
    #   activite <- ifelse(sali > 0,0,activite)
    #   activite <- ifelse(age  >= 21, 2,activite) # ne peuvent être rattach?s que les ?tudiants
    # })
    # final[final$quelfic=="FIP",]<- final_fip
    # table(final_fip[,c("age","activite")])
    # rm(final_fip)
    #
    # print_id(final)
    # saveTmp(final, file= "final.Rdata")
    #
    print '    gestion des FIP de final'
    final_fip = final.loc[final.quelfic == "FIP",
                          ["choi", "sali", "alr", "rsti", "age"]]

    print set(["choi", "sali", "alr",
               "rsti"]).difference(set(final_fip.columns))
    for var in ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace=True)
        assert final_fip[var].notnull().all(
        ), "some NaN are remaining in column %s" % (var)

    final_fip["activite"] = 2  # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(
        final_fip.age > 21, 2,
        final_fip.activite)  # ne peuvent être rattach?s que les ?tudiants

    final.update(final_fip)
    save_temp(final, name="final", year=year)
    print '    final has been updated with fip'

    # loadTmp("final.Rdata")
    # load(menm)
    # menagem <- rename(menagem, c("ident"="idmen","loym"="loyer"))
    # menagem$cstotpragr <- floor(menagem$cstotpr/10)
    #
    from math import floor

    menagem = load_temp(name="menagem", year=year)
    menagem.rename(columns=dict(ident="idmen", loym="loyer"), inplace=True)
    menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x / 10))
    #
    # # 2008 tau99 removed TODO: check ! and check incidence
    # if (year == "2008") {
    #  vars <- c("loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
    # } else {
    #   vars <- c("loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
    # }
    #
    # famille_vars <- c("m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm')

    if year == 2008:
        vars = [
            "loyer", "tu99", "pol99", "reg", "idmen", "so", "wprm", "typmen15",
            "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"
        ]
    else:
        vars = [
            "loyer", "tu99", "pol99", "tau99", "reg", "idmen", "so", "wprm",
            "typmen15", "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"
        ]
    famille_vars = [
        "m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm'
    ]

    # if ("naf16pr" %in% names(menagem)) {
    #   naf16pr <- factor(menagem$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menagem$naf16pr <- as.character(naf16pr)
    #   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menagem)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menagem$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menagem$nafg17npr <- as.character(nafg17npr)
    #   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }
    #

    #TODO: TODO: pytohn translation needed
    #    if "naf16pr" in menagem.columns:
    #        naf16pr <- factor(menagem$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menagem$naf16pr <- as.character(naf16pr)
    #   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menagem)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menagem$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menagem$nafg17npr <- as.character(nafg17npr)
    #   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }

    # # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
    # all_vars <- union(vars,famille_vars)
    # available_vars <- all_vars[union(vars,famille_vars) %in% names(menagem)]
    # loyersMenages <- menagem[,available_vars]
    #
    all_vars = vars + famille_vars

    print all_vars
    print set(menagem.columns)
    available_vars = list(set(all_vars).intersection(set(menagem.columns)))

    loyersMenages = menagem.xs(available_vars, axis=1)

    #
    # # Recodage de typmen15: modalités de 1:15
    # table(loyersMenages$typmen15, useNA="ifany")
    # loyersMenages <- within(loyersMenages, {
    #   typmen15[typmen15==10 ] <- 1
    #   typmen15[typmen15==11 ] <- 2
    #   typmen15[typmen15==21 ] <- 3
    #   typmen15[typmen15==22 ] <- 4
    #   typmen15[typmen15==23 ] <- 5
    #   typmen15[typmen15==31 ] <- 6
    #   typmen15[typmen15==32 ] <- 7
    #   typmen15[typmen15==33 ] <- 8
    #   typmen15[typmen15==41 ] <- 9
    #   typmen15[typmen15==42 ] <- 10
    #   typmen15[typmen15==43 ] <- 11
    #   typmen15[typmen15==44 ] <- 12
    #   typmen15[typmen15==51 ] <- 13
    #   typmen15[typmen15==52 ] <- 14
    #   typmen15[typmen15==53 ] <- 15
    # })
    #
    #
    # TODO: MBJ UNNECESSARY ?

    #
    # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
    # # Cependant on fait cela après avoir fait les traitement suivants
    # table(loyersMenages$ddipl, useNA="ifany")
    # # On convertit les ddipl en numeric
    # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
    # table(loyersMenages$ddipl, useNA="ifany")
    # #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
    # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
    #
    # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
    #

    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7,
                                loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl > 1,
                                loyersMenages.ddipl - 1, loyersMenages.ddipl)
    loyersMenages.ddipl.astype("int32")
    #
    # table(final$actrec,useNA="ifany")
    # final$act5 <- NA
    # final <- within(final, {
    #   act5[which(actrec==1) ] <- 2 # ind?pendants
    #   act5[which(actrec==2) ] <- 1 # salari?s
    #   act5[which(actrec==3) ] <- 1 # salari?s
    #   act5[which(actrec==4) ] <- 3 # ch?meur
    #   act5[which(actrec==7) ] <- 4 # retrait?
    #   act5[which(actrec==8) ] <- 5 # autres inactifs
    # })
    # table(final$act5,useNA="ifany")
    #

    final.act5 = NaN

    final.act5 = where(final.actrec == 1, 2, final.act5)  # indépendants
    final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec == 4, 3, final.act5)  # chômeur
    final.act5 = where(final.actrec == 7, 4, final.act5)  # retraité
    final.act5 = where(final.actrec == 8, 5, final.act5)  # autres inactifs
    print final.act5.value_counts()  # TODO : 29 retraités ?

    #     assert final.act5.notnull().all(), 'there are NaN inside final.act5'

    # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
    # final$tax_hab <- final$zthabm # rename zthabm to tax_hab
    # final$zthabm <- NULL
    #
    # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    print '    création de final2'
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="tax_hab"),
                 inplace=True)  # rename zthabm to tax_hab
    final2 = final.merge(loyersMenages, on="idmen", how="left")  # TODO: Check
    print loyersMenages.head()
    gc.collect()
    print_id(final2)

    #
    # # TODO: merging with patrimoine
    # rm(menagem,final)
    #
    # # table(final2$activite,useNA="ifany")
    # # table(final2$alt,useNA="ifany")
    #
    # saveTmp(final2, file= "final2.Rdata")
    #
    # loadTmp("final2.Rdata")
    # names(final2)
    # print_id(final2)
    #
    #
    # # set zone_apl using zone_apl_imputation_data
    # apl_imp <- read.csv("./zone_apl/zone_apl_imputation_data.csv")
    #
    # if (year == "2008") {
    #   zone_apl <- final2[, c("tu99", "pol99", "reg")]
    # } else {
    #   zone_apl <- final2[, c("tu99", "pol99", "tau99", "reg")]
    # }
    #
    # for (i in 1:length(apl_imp[,"TU99"])) {
    #   tu <- apl_imp[i,"TU99"]
    #   pol <- apl_imp[i,"POL99"]
    #   tau <- apl_imp[i,"TAU99"]
    #   reg <- apl_imp[i,"REG"]
    #   #  print(c(tu,pol,tau,reg))
    #
    #   if (year == "2008") {
    #     indices <- (final2["tu99"] == tu & final2["pol99"] == pol  & final2["reg"] == reg)
    #     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["REG"] == reg)
    #   } else {
    #     indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["tau99"] == tau & final2["reg"] == reg)
    #     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["TAU99"] == tau & apl_imp["REG"] == reg)
    #   }
    #   z <- runif(sum(indices))
    #   probs <- apl_imp[selection , c("proba_zone1", "proba_zone2")]
    #   #  print(probs)
    #   final2[indices,"zone_apl"] <- 1 + (z>probs[,'proba_zone1']) + (z>(probs[,'proba_zone1']+probs[,'proba_zone2']))
    #   rm(indices, probs)
    # }
    #

    print '    traitement des zones apl'
    apl_imp = read_csv("../../zone_apl/zone_apl_imputation_data.csv")

    print apl_imp.head(10)
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"]
                                            == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (
            apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (
            final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (
            apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg)

    z = random.uniform(size=indices.sum())
    print len(z)
    print len(indices)

    print len(indices) / len(z)
    probs = apl_imp.loc[selection, ["proba_zone1", "proba_zone2"]]
    print probs
    print probs['proba_zone1'].values

    proba_zone_1 = probs['proba_zone1'].values[0]
    proba_zone_2 = probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2["zone_apl"][indices] = (1 + (z > proba_zone_1) +
                                   (z > (proba_zone_1 + proba_zone_2)))
    del indices, probs

    #     control(final2, verbose=True, debug=True, verbose_length=15)

    print '    performing cleaning on final2'
    print 'nombre de sali nuls', len(final2[final2['sali'].isnull()])
    print "nombre d'âges nuls", len(final2[final2.age.isnull()])
    print "longueur de final2 avant purge", len(final2)
    #     columns_w_nan = []
    #     for col in final2.columns:
    #         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
    #             columns_w_nan.append(col)
    #     print columns_w_nan
    print 'check doublons', len(final2[final2.duplicated(['noindiv'])])
    print final2.age.isnull().sum()

    #     print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string()
    #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons !
    #     final2 = final2.drop_duplicates(['noindiv'])

    final2 = final2[~(final2.age.isnull())]
    print "longueur de final2 après purge", len(final2)
    print_id(final2)

    #
    # # var <- names(foyer)
    # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
    # #a2 <- setdiff(a1,names(foyer))
    # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
    # # hsup feuille d'impot
    # # boursier pas dispo
    # # inactif etc : extraire cela des donn?es clca etc
    #
    # # tester activit? car 0 vaut actif
    # table(is.na(final2$activite),useNA="ifany")
    #
    # saveTmp(final2, file= "final2.Rdata")

    control(final2, debug=True)
    print final2.age.isnull().sum()
    final2 = final2.drop_duplicates(cols='noindiv')

    print '    Filter to manage the new 3-tables structures:'
    # On récupère les foyer, famille, ménages qui ont un chef :
    liste_men = unique(final2.loc[final2['quimen'] == 0, 'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam'] == 0, 'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy'] == 0, 'idfoy'].values)

    #On ne conserve dans final2 que ces foyers là :
    print 'final2 avant le filtrage', len(final2)
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    print 'final2 après le filtrage', len(final2)

    if check:
        check_structure(final2)

    from openfisca_france import DATA_SOURCES_DIR
    test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5")
    if os.path.exists(test_filename):
        import warnings
        import datetime
        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        renamed_file = os.path.join(DATA_SOURCES_DIR,
                                    filename + "_" + time_stamp + ".h5")
        warnings.warn(
            "A file with the same name already exists \n Renaming current output and saving to "
            + renamed_file)
        test_filename = renamed_file

    store = HDFStore(test_filename)
    store['survey_' + str(year)] = final2
Exemplo n.º 5
0
def final(year=2006, filename="test", check=True):

##***********************************************************************/
    print('08_final: derniers réglages')
##***********************************************************************/
#
# loadTmp("final.Rdata")
# # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
# final$statmarit[is.na(final$statmarit)] <- 2
# table(final$statmarit, useNA='ifany')
#
    import gc
    gc.collect()
    final = load_temp("final", year=year)
    print 'check doublons', len(final[final.duplicated(['noindiv'])])
    final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit)
#

# # activite des fip
# table(final[final$quelfic=="FIP","activite"],useNA="ifany")
# summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] )
# # activite      # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4
#
# final_fip <- final[final$quelfic=="FIP",]
# final_fip <- within(final_fip,{
#   choi <- ifelse(is.na(choi),0,choi)
#   sali <- ifelse(is.na(sali),0,sali)
#   alr <- ifelse(is.na(alr),0,alr)
#   rsti <- ifelse(is.na(rsti),0,rsti)
#   activite <- 2 # TODO comment choisr la valeur par d?faut ?
#   activite <- ifelse(choi > 0,1,activite)
#   activite <- ifelse(sali > 0,0,activite)
#   activite <- ifelse(age  >= 21, 2,activite) # ne peuvent être rattach?s que les ?tudiants
# })
# final[final$quelfic=="FIP",]<- final_fip
# table(final_fip[,c("age","activite")])
# rm(final_fip)
#
# print_id(final)
# saveTmp(final, file= "final.Rdata")
#
    print '    gestion des FIP de final'
    final_fip = final.loc[final.quelfic=="FIP", ["choi", "sali", "alr", "rsti","age"]]

    print set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns))
    for var in  ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace=True)
        assert final_fip[var].notnull().all(), "some NaN are remaining in column %s" %(var)


    final_fip["activite"] = 2 # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(final_fip.age > 21, 2, final_fip.activite)  # ne peuvent être rattach?s que les ?tudiants

    final.update(final_fip)
    save_temp(final, name="final", year=year)
    print '    final has been updated with fip'

# loadTmp("final.Rdata")
# load(menm)
# menagem <- rename(menagem, c("ident"="idmen","loym"="loyer"))
# menagem$cstotpragr <- floor(menagem$cstotpr/10)
#
    from math import floor

    menagem = load_temp(name="menagem", year=year)
    menagem.rename(columns=dict(ident="idmen",loym="loyer"), inplace=True)
    menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x/10))
#
# # 2008 tau99 removed TODO: check ! and check incidence
# if (year == "2008") {
#  vars <- c("loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
# } else {
#   vars <- c("loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
# }
#
# famille_vars <- c("m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm')

    if year == 2008:
        vars = ["loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15",
                 "nbinde","ddipl","cstotpragr","champm","zthabm"]
    else:
        vars = ["loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm",
                "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm"]
    famille_vars = ["m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm']


# if ("naf16pr" %in% names(menagem)) {
#   naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   vars <- c(vars,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }
#


#TODO: TODO: pytohn translation needed
#    if "naf16pr" in menagem.columns:
#        naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   vars <- c(vars,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }



# # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
# all_vars <- union(vars,famille_vars)
# available_vars <- all_vars[union(vars,famille_vars) %in% names(menagem)]
# loyersMenages <- menagem[,available_vars]
#
    all_vars = vars + famille_vars

    print all_vars
    print  set(menagem.columns)
    available_vars = list( set(all_vars).intersection(set(menagem.columns)))

    loyersMenages = menagem.xs(available_vars,axis=1)


#
# # Recodage de typmen15: modalités de 1:15
# table(loyersMenages$typmen15, useNA="ifany")
# loyersMenages <- within(loyersMenages, {
#   typmen15[typmen15==10 ] <- 1
#   typmen15[typmen15==11 ] <- 2
#   typmen15[typmen15==21 ] <- 3
#   typmen15[typmen15==22 ] <- 4
#   typmen15[typmen15==23 ] <- 5
#   typmen15[typmen15==31 ] <- 6
#   typmen15[typmen15==32 ] <- 7
#   typmen15[typmen15==33 ] <- 8
#   typmen15[typmen15==41 ] <- 9
#   typmen15[typmen15==42 ] <- 10
#   typmen15[typmen15==43 ] <- 11
#   typmen15[typmen15==44 ] <- 12
#   typmen15[typmen15==51 ] <- 13
#   typmen15[typmen15==52 ] <- 14
#   typmen15[typmen15==53 ] <- 15
# })
#
#
# TODO: MBJ UNNECESSARY ?

#
# # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
# # Cependant on fait cela après avoir fait les traitement suivants
# table(loyersMenages$ddipl, useNA="ifany")
# # On convertit les ddipl en numeric
# loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
# table(loyersMenages$ddipl, useNA="ifany")
# #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
# loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
#
# loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
#


    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl>1,
                                loyersMenages.ddipl-1,
                                loyersMenages.ddipl)
    loyersMenages.ddipl.astype("int32")
#
# table(final$actrec,useNA="ifany")
# final$act5 <- NA
# final <- within(final, {
#   act5[which(actrec==1) ] <- 2 # ind?pendants
#   act5[which(actrec==2) ] <- 1 # salari?s
#   act5[which(actrec==3) ] <- 1 # salari?s
#   act5[which(actrec==4) ] <- 3 # ch?meur
#   act5[which(actrec==7) ] <- 4 # retrait?
#   act5[which(actrec==8) ] <- 5 # autres inactifs
# })
# table(final$act5,useNA="ifany")
#


    final.act5 = NaN

    final.act5 = where(final.actrec==1, 2, final.act5) # indépendants
    final.act5 = where(final.actrec.isin([2,3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec==4, 3, final.act5) # chômeur
    final.act5 = where(final.actrec==7, 4, final.act5) # retraité
    final.act5 = where(final.actrec==8, 5, final.act5) # autres inactifs
    print final.act5.value_counts() # TODO : 29 retraités ?

#     assert final.act5.notnull().all(), 'there are NaN inside final.act5'

# final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
# final$tax_hab <- final$zthabm # rename zthabm to tax_hab
# final$zthabm <- NULL
#
# final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    print '    création de final2'
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="tax_hab"), inplace=True) # rename zthabm to tax_hab
    final2 = final.merge(loyersMenages, on="idmen", how="left") # TODO: Check
    print loyersMenages.head()
    gc.collect()
    print_id(final2)

#
# # TODO: merging with patrimoine
# rm(menagem,final)
#
# # table(final2$activite,useNA="ifany")
# # table(final2$alt,useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")
#
# loadTmp("final2.Rdata")
# names(final2)
# print_id(final2)
#
#
# # set zone_apl using zone_apl_imputation_data
# apl_imp <- read.csv("./zone_apl/zone_apl_imputation_data.csv")
#
# if (year == "2008") {
#   zone_apl <- final2[, c("tu99", "pol99", "reg")]
# } else {
#   zone_apl <- final2[, c("tu99", "pol99", "tau99", "reg")]
# }
#
# for (i in 1:length(apl_imp[,"TU99"])) {
#   tu <- apl_imp[i,"TU99"]
#   pol <- apl_imp[i,"POL99"]
#   tau <- apl_imp[i,"TAU99"]
#   reg <- apl_imp[i,"REG"]
#   #  print(c(tu,pol,tau,reg))
#
#   if (year == "2008") {
#     indices <- (final2["tu99"] == tu & final2["pol99"] == pol  & final2["reg"] == reg)
#     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["REG"] == reg)
#   } else {
#     indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["tau99"] == tau & final2["reg"] == reg)
#     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["TAU99"] == tau & apl_imp["REG"] == reg)
#   }
#   z <- runif(sum(indices))
#   probs <- apl_imp[selection , c("proba_zone1", "proba_zone2")]
#   #  print(probs)
#   final2[indices,"zone_apl"] <- 1 + (z>probs[,'proba_zone1']) + (z>(probs[,'proba_zone1']+probs[,'proba_zone2']))
#   rm(indices, probs)
# }
#

    print '    traitement des zones apl'
    apl_imp = read_csv("../../zone_apl/zone_apl_imputation_data.csv")

    print apl_imp.head(10)
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg)

    z = random.uniform(size=indices.sum())
    print len(z)
    print len(indices)

    print len(indices)/len(z)
    probs = apl_imp.loc[selection , ["proba_zone1", "proba_zone2"]]
    print probs
    print probs['proba_zone1'].values

    proba_zone_1 =  probs['proba_zone1'].values[0]
    proba_zone_2 =  probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2["zone_apl"][indices] = ( 1 + (z>proba_zone_1) +
                                       (z>(proba_zone_1 + proba_zone_2)))
    del indices, probs

#     control(final2, verbose=True, debug=True, verbose_length=15)

    print '    performing cleaning on final2'
    print 'nombre de sali nuls', len(final2[final2['sali'].isnull()])
    print "nombre d'âges nuls", len(final2[final2.age.isnull()])
    print "longueur de final2 avant purge", len(final2)
#     columns_w_nan = []
#     for col in final2.columns:
#         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
#             columns_w_nan.append(col)
#     print columns_w_nan
    print 'check doublons', len(final2[final2.duplicated(['noindiv'])])
    print final2.age.isnull().sum()

#     print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string()
    #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons !
#     final2 = final2.drop_duplicates(['noindiv'])

    final2 = final2[~(final2.age.isnull())]
    print "longueur de final2 après purge", len(final2)
    print_id(final2)

#
# # var <- names(foyer)
# #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
# #a2 <- setdiff(a1,names(foyer))
# #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
# # hsup feuille d'impot
# # boursier pas dispo
# # inactif etc : extraire cela des donn?es clca etc
#
# # tester activit? car 0 vaut actif
# table(is.na(final2$activite),useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")



    control(final2, debug=True)
    print final2.age.isnull().sum()
    final2 = final2.drop_duplicates(cols='noindiv')

    print '    Filter to manage the new 3-tables structures:'
    # On récupère les foyer, famille, ménages qui ont un chef :
    liste_men = unique(final2.loc[final2['quimen']==0,'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam']==0,'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy']==0,'idfoy'].values)

    #On ne conserve dans final2 que ces foyers là :
    print 'final2 avant le filtrage' ,len(final2)
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    print 'final2 après le filtrage', len(final2)

    if check:
        check_structure(final2)

    from openfisca_france import DATA_SOURCES_DIR
    test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5")
    if os.path.exists(test_filename):
        import warnings
        import datetime
        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5")
        warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file)
        test_filename = renamed_file


    store = HDFStore(test_filename)
    store['survey_'+ str(year)] = final2
Exemplo n.º 6
0
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: année de naissance des PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'longueur pacInd1' , len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
                print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
Exemplo n.º 7
0
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}

    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        #               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        #               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']
    }  # (déménagement) uniquement en 2006

    #
    #varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
    #                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
    #               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
    # ......
    #               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
    #               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
    #               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
    #               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
    #
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables" % (len(
        set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        #        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()

    #not_first <- FALSE
    #allvars = c()
    #for (v in varlist){
    #  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
    #  if (length(vars) > 0) {
    #    allvars <-  c(allvars, vars)
    #    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
    #    n <- length(vars)
    #    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
    #    if (not_first) {
    #      print('merge')
    #      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
    #      names(foy_ind)
    #    }
    #    else   {
    #      print('init')
    #      foy_ind <- temp
    #      not_first <- TRUE
    #    }
    #  }
    #}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy'] == 'vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy'] == 'conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac3'] = 4

    assert foy_ind['quifoy'].isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return
Exemplo n.º 8
0
def create_fip(year=2006):  # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
    # # anaisenf: année de naissance des PAC
    # erfFoyVar <- c('anaisenf','declar')
    # foyer <- LoadIn(erfFoyFil)
    # foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
    #    control(foyer, verbose=True, verbose_length=10, debug=True)

    # #***********************************************************************************************************
    # # print "Step 1 : on recupere les personnes à charge des foyers"
    # #**********************************************************************************************************
    # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
    #
    #
    # # On récupère toutes les pac des foyers
    # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
    # fip <-data.frame(declar = foyer$declar)
    # for (i in c(1:L)){
    #   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
    #   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
    # }
    # fip <- fip[!is.na(fip$typ.1),]
    # fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
    # fip <- fip[!is.na(fip$naia),]
    # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
    # fip$N <- row(fip)[,1]
    # str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    print "il ya a au maximum %s pac par foyer" % nb_pac_max

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns,
                                     names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns)
    fip.fillna(
        NaN, inplace=True)  # inutile a cause de la ligne précédente, to remove
    for i in range(1, nb_pac_max + 1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)]
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * (i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

    #     print fip.describe()
    #     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') &
              (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
    #    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'
                                  ])]  #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG), '/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]
    del indivifip['to_keep'], fip, tyFG, tyHI

    #    control(indivifip, debug=True)

    # #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
    # #************************************************************************************************************/

    indivi = load_temp(name="indivim",
                       year=year)  #TODO: USE THIS INSTEAD OF PREVIOUS LINES

    # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
    # pac$key1 <- paste(pac$naia,pac$declar1)
    # pac$key2 <- paste(pac$naia,pac$declar2)
    # indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip'] == 'pac')]

    pac['naia'] = pac['naia'].astype(
        'int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'],
                           indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' % (
        pac.naia.dtype, indivifip.naia.dtype)

    # fip <- indivifip[!indivifip$key %in% pac$key1,]
    # fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]

    print "    2.1 new fip created"
    # We build a dataframe to link the pac to their type and noindiv
    # table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

    # pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
    #                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
    # pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
    #                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip,
                              left_on='key1',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd1', len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip,
                              left_on='key2',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

    # table(duplicated(pacInd1))
    # table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

    # pacInd1 <-rename(pacInd1,c("key1" = "key"))
    # pacInd2 <-rename(pacInd2,c("key2" = "key"))
    # pacInd <- rbind(pacInd1,pacInd2)
    # rm(pacInd1,pacInd2)

    #     pacInd1.rename(columns={'key1':'key'}, inplace=True)
    #     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
            print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

    # table(duplicated(pacInd[,c("noindiv","typ")]))
    # table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(
        ['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
    #     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

    # # We keep the fip in the menage of their parents because it is used in to
    # # build the famille. We should build an individual ident for the fip that are
    # # older than 18 since they are not in their parents' menage according to the eec

    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype(
        'float16')  # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec1 = individec1.loc[:, [
        "declar1", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

    # # TODO: On ne s'occupe pas des declar2 pour l'instant
    # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec2 = individec2.loc[:, [
        "declar2", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    # #fip <- rbind(fip1,fip2)
    # fip <- fip1
    # table(fip$typ)

    fip = concat([fip1, fip2])
    #     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype(
        'float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3,
                       4)  # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf'] <= 15, 9, 5)

    ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
def invalide(year = 2006):
    """
    Intended to add the 'inv' (invalid) and 'alt' (alternating custody)
    columns to the 'final' temporary table.

    NOTE: the function is currently disabled. It prints a notice and returns
    immediately; everything after the early ``return`` is unreachable and is
    kept as unfinished work in progress (port of an R script, whose code is
    quoted in the ``# #`` comments).

    :param year: year passed to load_temp / save_temp by the disabled code
    """

    print 'Entering 07_invalides: construction de la variable invalide NOTFUNCTIONNAL NAOW'

    # Early exit: the code below is never executed.
    return
# # # Invalid people
# # #inv = caseP (vous), caseF (conj) or case G, caseI, or caseR (pac)

# # loadTmp("final.Rdata")
# # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")]
# # invalides <- within(invalides,{
# #   caseP <- ifelse(is.na(caseP),0,caseP)
# #   caseF <- ifelse(is.na(caseF),0,caseF)
# #   inv <- FALSE})
# # # Invalid "vous" (declarants)
# # table(invalides[,c("caseF","quifoy")],useNA="ifany")
# # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"inv"] <- TRUE
# #
    print ''
    print 'Etape 1 : création de la df invalides'
    print '    1.1 : déclarants invalides'
    final = load_temp(name="final", year=year)
    # Column subset of 'final' used to compute the flags
    invalides = final.xs(["noindiv","idmen","caseP","caseF","idfoy","quifoy","maahe","rc1rev"], axis=1)

    print invalides['rc1rev'].value_counts()

    # caseP / caseF must not contain missing values
    for var in ["caseP", "caseF"]:
        assert invalides[var].notnull().all(), 'présence de NaN dans %s' %(var)

    # Invalid declarants: caseP set on a quifoy == 0 row
    # NOTE(review): the assignments below use chained indexing
    # (df[col][mask] = value) -- confirm they modify `invalides` in place
    # with the pandas version in use.
    invalides['inv'] = False
    invalides['inv'][(invalides['caseP']==1) & (invalides['quifoy']==0)] = True
    print invalides["inv"].sum(), " invalides déclarants"

    # People receiving the AAH allowance according to the employment survey
    invalides['inv'][(invalides['maahe']>0)] = True
    invalides['inv'][(invalides['rc1rev']==4)] = True #TODO: check the format.
    print invalides["inv"].sum(), " invalides qui touchent des alloc"

    print_id(invalides)


# # # Invalid spouses
# #
# # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")]
# # #men_inv_conj <- rename(men_inv_conj, c("caseF"="inv"))
# # #table(men_inv_conj[men_inv_conj$inv==1 ,c("inv","quifoy")],useNA="ifany")
# # # There are caseF flags on spouses; this comes from double tax returns. TODO: should clean this
# # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"]
# # #load(indm)
# # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")]
# # #titi <- titi[order(titi$ident),]
# # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")]
# # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="inv"))
# # table(foy_inv_conj[ ,c("inv","quifoy")],useNA="ifany")
# # # So we only keep the caseF of the "vous"

# # foy_inv_conj   <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","inv")]
# # table(foy_inv_conj[ ,c("inv")],useNA="ifany")
# # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")]
# # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE)
# # table(invalides_conj$inv) # TODO: in 2006 we get 316 instead of 328; there must be idfoy with caseF that have no "vous" because of double tax returns
# # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","inv")] <- invalides_conj
# # table(invalides[,c("inv","quifoy")],useNA="ifany")
# # rm(invalides_conj,foy_inv_conj)

    # Retrieve the idfoy of the foyers with a ticked caseF
    print '    1.2 : Les conjoints invalides'
    # final["caseF"] is used as a boolean mask here -- presumably a boolean
    # column; verify against the table written by create_final.
    idfoy_inv_conj = final["idfoy"][final["caseF"]]
    inv_conj_condition = (invalides["idfoy"].isin(idfoy_inv_conj)  & (invalides["quifoy"]==1))
    invalides["inv"][inv_conj_condition] = True

    print len(invalides[inv_conj_condition]), "invalides conjoints"
    print invalides["inv"].sum(), " invalides déclarants et invalides conjoints"

# # # Invalid children and alternating custody
# #
# # loadTmp("pacIndiv.Rdata")
# # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("inv","noindiv")]
# # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE)
# # names(foy_inv_pac)
# # table(foy_inv_pac[,c("typ","naia")],useNA="ifany")
# # table(foy_inv_pac[,c("typ")],useNA="ifany")
# # foy_inv_pac <- within(foy_inv_pac,{
# #   inv  <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18))
# #   alt  <- (typ=="H") | (typ=="I")
# #   naia <- NULL
# #   typ  <- NULL})
# #
# # table(foy_inv_pac[ ,c("inv")],useNA="ifany")
# # table(foy_inv_pac[ ,c("alt")],useNA="ifany")
# # invalides$alt <- 0
# # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0
# # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","inv","alt")] <- foy_inv_pac


    print '    1.3 : enfants invalides et garde alternée'

    pacIndiv = load_temp(name='pacIndiv', year=year)
    print pacIndiv.type_pac.value_counts()

    # Rows that are neither declarant (0) nor spouse (1)
    foy_inv_pac = invalides.loc[~(invalides.quifoy.isin([0, 1])), ['noindiv', 'inv']]
#     pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]]
    print len(foy_inv_pac)

    print pacIndiv.columns
    foy_inv_pac = foy_inv_pac.merge(pacIndiv.loc[:, ['noindiv', 'type_pac', 'naia']],
                                    on='noindiv', how='left')
    # Invalid: type G, R or I, or type F older than 18 in `year`
    foy_inv_pac['inv'] = (foy_inv_pac['type_pac'].isin(['G','R','I']) |
                             ((foy_inv_pac['type_pac']=="F") & ((year - foy_inv_pac['naia'])>18)))

    # Alternating custody: type H or I
    foy_inv_pac['alt'] = ((foy_inv_pac['type_pac']=="H") | (foy_inv_pac['type_pac']=="I"))
    # NOTE(review): the R code removes naia/typ (<- NULL); here the columns
    # are kept and filled with None -- confirm this is intended.
    foy_inv_pac['naia'] = None
    foy_inv_pac['type_pac'] = None
    foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False)


    print foy_inv_pac['inv'].describe()
    invalides['alt'] = 0
    foy_inv_pac['alt'][foy_inv_pac.alt.isnull()] = 0
    # NOTE(review): this merge uses the default `how` and joins on
    # noindiv/inv/alt, and is followed further down by a second merge on
    # noindiv only; the two look like both sides of the merge conflict whose
    # markers are commented out below -- confirm which one should remain.
    invalides = invalides.merge(foy_inv_pac, on=["noindiv","inv","alt"])

    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
# Leftover of an unresolved merge conflict (markers and alternative side
# kept commented out):
# =======
#     print foy_inv_pac.inv.value_counts() # TODO: JS: too few True values in there
#     print foy_inv_pac.alt.value_counts() #
#
#
#     print  len(invalides), len(foy_inv_pac)
#     print invalides.inv.value_counts()
# >>>>>>> 67cd9a43177cf3f6f72521cda59dae02485df1e3

    invalides = invalides.merge(foy_inv_pac, on='noindiv', how='left')
    invalides['inv'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])
    # NOTE(review): 'alt' is computed from the inv_x / inv_y columns, exactly
    # like 'inv' on the previous line; this looks like a copy-paste slip
    # (alt_x / alt_y expected) -- confirm before enabling the function.
    invalides['alt'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])

    invalides = invalides.loc[:, ["noindiv","idmen","caseP","caseF","idfoy","quifoy", "inv", 'alt']]
    invalides['alt'].fillna(False, inplace=True)

    print invalides.inv.value_counts()
    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
    del foy_inv_pac, pacIndiv

# # # Initialising the NAs on alt and inv
# # invalides[is.na(invalides$inv), "inv"] <- 0
# # table(invalides[,c("alt","inv")],useNA="ifany")
# #
# # final <- merge(final, invalides[,c("noindiv","inv","alt")], by="noindiv",all.x=TRUE)
# # table(final[, c("inv","alt")],useNA="ifany")

    print ''
    print 'Etape 2 : Initialisation des NA sur alt et inv'
    assert invalides["inv"].notnull().all() & invalides.alt.notnull().all()
    # Attach the two flags to 'final' and save it back under the same name
    final = final.merge(invalides.loc[:, ['noindiv', 'inv', 'alt']], on='noindiv', how='left')
    del invalides

    print final.inv.value_counts()
    control(final, debug=True)

    save_temp(final, name='final', year=year)
    print 'final complétée et sauvegardée'
Exemplo n.º 10
0
def create_totals(year=2006):

    print "Creating Totals"
    print "Etape 1 : Chargement des données"

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert indivim.duplicated(['noindiv'
                               ]).any() == False, "Présence de doublons"

    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.

    selection = Series()
    for var in [
            "zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici",
            "zrnci"
    ]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection]
    indivi_i.rename(
        columns={
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali":
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
        },
        inplace=True)

    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous",
                               indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"

    ## We merge them with the other individuals
    #indivim <- rename(indivim, c(ident = "idmen",
    #                             persfip = "quifoy",
    #                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
    #                             zchoi = "choi2",
    #                             zrsti = "rsti2",
    #                             zalri = "alr2"))
    #
    #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i)
    #rm(indivim, indivi_i)
    #gc()
    #table(indivi$quelfic)
    #

    indivim.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace=True)

    print ''
    print "Etape 2 : isolation des FIP"
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi["idmen"].astype("int64") * 100 +
        (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp, "idfoy"] = nan

    ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?)

    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    #    indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100
    #                                        + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) )
    indivi["idfoy"] = where(
        fip_has_declar, indivi["idmen"] * 100 +
        indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
        indivi["idfoy"])

    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]]

    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(
        indivi_fnd["idfoy"]), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print ''
    print 'Etape 3 : Récupération des EE_NRT'

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all()

    print ''
    print 'Etape 4 : Rattachement des enfants aux déclarations'

    assert indivi["noindiv"].duplicated().any(
    ) == False, "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(
    ), " Some noindiv are not set, which will ruin next stage"
    assert indivi.loc[
        enf_ee,
        "noindiv"].duplicated().any() == False, "Some noindiv appear twice"

    pere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
    })
    mere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
    })

    foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer")
    pere = pere.merge(foyer, how="inner", on="noindiv")
    mere = mere.merge(foyer, how="inner", on="noindiv")

    #     print "Some pere et mere are duplicated because people have two foyers"
    #     print pere[pere.duplicated()]
    #     print mere[mere.duplicated()]

    df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m'))

    #     print len(pere)
    #     print len(mere)
    #     print len(df)
    #     ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"]
    #     print df.loc[df["noindiv_enf"].isin(ll)]
    #     print df[df.duplicated()]

    print '    4.1 : gestion des personnes dans 2 foyers'
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(
            0, inplace=True)  # beacause groupby drop groups with NA in index
    df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    df["which"] = ""
    df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()),
                        "mere", "")
    df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()),
                        "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df["which"].notnull().all(
    ), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True)
    df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    for col in df.columns:
        if col not in ["idfoy", "noindiv"]:
            del df[col]


#     assert indivi.loc[enf_ee,"idfoy"].notnull().all()
    assert df.duplicated().any() == False

    df.set_index("noindiv", inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert indivi.duplicated().any() == False

    # MBJ: issue delt with when moving from R code to python
    ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
    ## On suit guide méthodo erf 2003 page 135
    ## On supprime les conjoints FIP et les FIP de 25 ans et plus;
    ## On conserve les enfants FIP de 19 à 24 ans;
    ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
    ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
    ## EEC que l'on ne retrouvera pas dans les EEC suivantes.
    #
    print '    4.2 : On enlève les individus pour lesquels il manque le déclarant'
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan

    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)

    ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    ##fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    #indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False
    indivi['not_pr_cpr'][indivi['lpr'] > 2] = True

    print "    4.3 : Creating non pr=0 and cpr=1 idmen's"
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']]
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    #     indivi.set_index(['quiment']) #TODO: check relevance
    #     TODO problème avec certains idfoy qui n'ont pas de vous
    print ''
    print "Etape 5 : Gestion des idfoy qui n'ont pas de vous"
    all = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy']
    without = all[~(all.idfoy.isin(with_.values))]

    print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous'
    has_declar2 = (indivi.idfoy.isin(
        without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 +
                   indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values),
                                             decl2_idfoy, None)
    del all, with_, without, has_declar2

    print '    5.1 : Elimination idfoy restant'
    idfoyList = indivi.loc[indivi['quifoy'] == "vous",
                           'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = [
        "noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age", "agem",
        "quelfic", "actrec", "quimen", "nbsala", "titc", "statut", "txtppb",
        "chpub", "prosa", "encadr"
    ]

    if not (len(set(myvars).difference(set(indivi.columns))) == 0):
        print set(myvars).difference(set(indivi.columns))

    assert len(set(myvars).difference(set(indivi.columns))) == 0

    indivi = indivi.loc[:, myvars]

    ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    ## les infos provenant des déclarations)

    print ''
    print 'Etape 6 : Création des variables descriptives'
    print '    6.1 : variable activité'
    indivi['activite'] = None
    indivi['activite'][indivi['actrec'] <= 3] = 0
    indivi['activite'][indivi['actrec'] == 4] = 1
    indivi['activite'][indivi['actrec'] == 5] = 2
    indivi['activite'][indivi['actrec'] == 7] = 3
    indivi['activite'][indivi['actrec'] == 8] = 4
    indivi['activite'][indivi['age'] <= 13] = 2  # ce sont en fait les actrec=9
    print indivi['activite'].value_counts()
    # TODO: MBJ problem avec les actrec

    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all(), Exception("Problème avec les titc")

    print '    6.2 : variable statut'
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    indivi['statut'][indivi['statut'] == 11] = 1
    indivi['statut'][indivi['statut'] == 12] = 2
    indivi['statut'][indivi['statut'] == 13] = 3
    indivi['statut'][indivi['statut'] == 21] = 4
    indivi['statut'][indivi['statut'] == 22] = 5
    indivi['statut'][indivi['statut'] == 33] = 6
    indivi['statut'][indivi['statut'] == 34] = 7
    indivi['statut'][indivi['statut'] == 35] = 8
    indivi['statut'][indivi['statut'] == 43] = 9
    indivi['statut'][indivi['statut'] == 44] = 10
    indivi['statut'][indivi['statut'] == 45] = 11
    assert indivi['statut'].isin(
        range(12)).all(), Exception("statut value over range")

    #indivi$nbsala <- as.numeric(indivi$nbsala)
    #indivi <- within(indivi,{
    #  nbsala[is.na(nbsala) ]    <- 0
    #  nbsala[nbsala==99 ] <- 10  # TODO  418 fip à retracer qui sont NA
    #})

    print '    6.3 : variable txtppb'
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala'] == 99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print '    6.4 : variable chpub et CSP'
    indivi['chpub'].fillna(0, inplace=True)
    indivi['chpub'] = indivi['chpub'].astype('int')
    indivi['chpub'][indivi['chpub'].isnull()] = 0
    print indivi['chpub'].value_counts()
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print indivi['encadr'].value_counts()

    # encadr : 1=oui, 2=non
    indivi['encadr'].fillna(2, inplace=True)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1
    indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1
    print "cadre"
    print indivi['cadre'].value_counts()
    assert indivi['cadre'].isin(range(2)).all()

    print ''
    print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"

    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])

    print 'On crée les n° de personnes à charge'
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0
    indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1
    indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']]
    print_id(test2)

    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'],
                             indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])
    print_id(indivi)

    #####################################################################################
    ## On ajoute les idfam et quifam
    #load(famc)
    #
    #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE)
    #rm(famille)
    #print_id(tot2)
    #
    ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls
    #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"]
    #tot2[is.na(tot2$quifam), "quifam"] <- 0
    #print_id(tot2)
    #saveTmp(tot2, file = "tot2.Rdata")
    #rm(indivi,tot2)
    #
    ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents
    ## load foyer
    #loadTmp(file = "tot2.Rdata")
    #loadTmp(file= "foyer_aggr.Rdata")
    #
    #tot3 <- merge(tot2, foyer, all.x = TRUE)
    #print_id(tot3) # OK
    #saveTmp(tot3, file= "tot3.Rdata")
    #rm(tot3,tot2,foyer)
    #
    print ''
    print 'Etape 8 : création des fichiers totaux'
    famille = load_temp(name='famc', year=year)

    print '    8.1 : création de tot2 & tot3'
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    #     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print '    tot2 saved'

    #     #On combine les variables de revenu
    #     foyer = load_temp(name='foy_ind', year=year)
    #     print " INTERSERCT THE POOCHAY"
    #     tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1
    #     print "pingas"
    #     print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10]
    #     print "pocchay"
    #     print sorted(foyer["idfoy"].unique())[0:10]
    #     print "final flash"
    #     print 602062550.0 in foyer["idfoy"].values
    #     print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique())))
    #     print tot2.quifoy.value_counts()
    #tot2.update(foyer)
    tot2.merge(foyer, how='left')

    tot2 = tot2[tot2.idmen.notnull()]
    #     tot2['idfoy'] += 1

    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(cols='noindiv')
    print len(tot3)

    #Block to remove any unwanted duplicated pair
    print "    check tot3"
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

    ## On ajoute les variables individualisables
    #loadTmp("foyer_individualise.Rdata") # foy_ind
    #loadTmp("tot3.Rdata")
    #loadTmp("allvars.Rdata")
    #loadTmp("sif.Rdata")
    #
    #vars2 <- setdiff(names(tot3),  allvars)
    #tot3 <- tot3[,vars2]
    #
    #print_id(tot3)
    #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE)
    #
    print '    8.2 : On ajoute les variables individualisables'

    allvars = load_temp(name='ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print len(tot3)

    assert not (tot3.duplicated(
        cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % (
        lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print 'tot3 sauvegardé'
    gc.collect()
Exemplo n.º 11
0
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer" )

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}


    var_dict = {'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
                'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
               'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
               'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
               'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
               'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
               'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
               'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
               'f1tv': ['f1tv', 'f1uv'],
               'f1tw': ['f1tw', 'f1uw'],
               'f1tx': ['f1tx', 'f1ux'],
               'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
               'ppe_du_ns':  ['f5nv', 'f5ov', 'f5pv'],
               'frag_exon': ['f5hn', 'f5in', 'f5jn'],
               'frag_impo': ['f5ho', 'f5io', 'f5jo'],
               'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
               'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
               'arag_defi': ['f5hf', 'f5if', 'f5jf'],
               'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
               'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
               'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
               'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
               'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
               'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
               'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
               'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
               'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
               'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
               'abic_imps': ['f5kd', 'f5ld', 'f5md'],
               'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
               'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
               'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
               'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
               'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
               'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
               'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
               'macc_exon': ['f5nn', 'f5on', 'f5pn'],
               'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
               'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
               'macc_impv': ['f5no', 'f5oo', 'f5po'],
               'macc_imps': ['f5np', 'f5op', 'f5pp'],
               'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
               'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
               'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
               'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
               'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
               'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
               'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
               'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
               'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
               'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
               'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check
               'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
               'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
               'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
               'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
               'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
               'abnc_defi': ['f5qe', 'f5re', 'f5se'],
               'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
               'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
#               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
#               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
               'mbic_mvct': ['f5hu'],
               'macc_mvct': ['f5iu'],
               'mncn_mvct': ['f5ju'],
               'mbnc_mvct': ['f5kz'],
               'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
               'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
               'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
               'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
               'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
               'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
               'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
               'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
               'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
               'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
               'arag_pvce': ['f5he', 'f5ie', 'f5je'],
               'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
               'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
               'abic_pvce': ['f5ke', 'f5le', 'f5me'],
               'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
               'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
               'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
               'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
               'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
               'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
               'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
               'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
               'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
               'demenage' : ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']}  # (déménagement) uniquement en 2006


#
#varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
#                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
#               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
# ......
#               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
#               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
#               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
#               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
#
    vars_sets = [ set(var_list) for var_list in var_dict.values() ]
    eligible_vars = (set().union(*vars_sets)).intersection( set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables"   %( len(set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [ x  in eligible_vars for x in foyer_vars ]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var,present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index() # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] !=0]
#        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()


#not_first <- FALSE
#allvars = c()
#for (v in varlist){
#  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
#  if (length(vars) > 0) {
#    allvars <-  c(allvars, vars)
#    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
#    n <- length(vars)
#    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
#    if (not_first) {
#      print('merge')
#      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
#      names(foy_ind)
#    }
#    else   {
#      print('init')
#      foy_ind <- temp
#      not_first <- TRUE
#    }
#  }
#}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv" : "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy']=='vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy']=='conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy']=='pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy']=='pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy']=='pac3'] = 4

    assert foy_ind['quifoy'].isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year = year)
    show_temp()
    return