def create_enfnn(year=2006):
    '''
    '''
    #load
    data = DataCollection(year=year)
    ### Enfant à naître (NN pour nouveaux nés)
    individual_vars = ['noi', 'noicon', 'noindiv', 'noiper', 'noimer', 'ident', 'naia', 'naim', 'lien',
               'acteu','stc','contra','titc','mrec','forter','rstg','retrai','lpr','cohab','sexe',
               'agepr','rga']
    data = DataCollection(year=year)
    eeccmp1 = data.get_values(table="eec_cmp_1", variables=individual_vars)
    eeccmp2 = data.get_values(table="eec_cmp_2", variables=individual_vars)
    eeccmp3 = data.get_values(table="eec_cmp_3", variables=individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how="outer")
    enfnn = tmp.merge(eeccmp3, how="outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    #TODO: minimal dtype
    for var in individual_vars:
        print var
        enfnn[var] = enfnn[var].astype('float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars

    # création de variables
    print enfnn.describe()
    enfnn['declar1'] = ''
    enfnn['noidec'] = 0
    enfnn['ztsai'] = 0
    enfnn['year'] = year
    enfnn['year'] = enfnn['year'].astype("float32") # -> integer ?
    enfnn['agepf'] = enfnn['year'] - enfnn['naia']
    enfnn['agepf'][enfnn['naim'] >= 7] -= 1
    enfnn['actrec'] = 9
    enfnn['quelfic'] = 'ENF_NN'
    enfnn['persfip'] = ""

    #selection
    #enfnn <- enfnn[(enfnn$naia==enfnn$year & enfnn$naim>=10) | (enfnn$naia==enfnn$year+1 & enfnn$naim<=5),]
    enfnn = enfnn[((enfnn['naia'] == enfnn['year']) & (enfnn['naim'] >= 10)) |
                      ((enfnn['naia'] == enfnn['year'] + 1) & (enfnn['naim'] <= 5))]
    #save
    save_temp(enfnn, name="enfnn", year=year)
    del enfnn
    print "enfnnm saved"
    gc.collect()
Пример #2
0
def create_final(year=None):
    if year is None:
        raise Exception("A year is needed")
    print 'création de final'
    foy_ind = load_temp(name = 'foy_ind', year=year)
    tot3 = load_temp(name='tot3', year=year)


    foy_ind.set_index(['idfoy', 'quifoy'], inplace=True)
    tot3.set_index(['idfoy', 'quifoy'], inplace=True)
    final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1)
    final.reset_index(inplace=True)
    foy_ind.reset_index(inplace=True)
    tot3.reset_index(inplace=True)

#     tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    final = final[final.idmen.notnull()]

    control(final, verbose=True)
    del tot3, foy_ind
    gc.collect()

#final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    print "    loading fip"
    sif = load_temp(name = 'sif', year=year)

    print sif.columns
    print "    update final using fip"
    final = final.merge(sif, on=["noindiv"], how="left")
    #TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS


    print final.columns
    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    save_temp(final, name='final', year=year)
    print 'final sauvegardé'
    del sif, final
def create_final(year=None):
    if year is None:
        raise Exception("A year is needed")
    print 'création de final'
    foy_ind = load_temp(name='foy_ind', year=year)
    tot3 = load_temp(name='tot3', year=year)

    foy_ind.set_index(['idfoy', 'quifoy'], inplace=True)
    tot3.set_index(['idfoy', 'quifoy'], inplace=True)
    final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1)
    final.reset_index(inplace=True)
    foy_ind.reset_index(inplace=True)
    tot3.reset_index(inplace=True)

    #     tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    final = final[final.idmen.notnull()]

    control(final, verbose=True)
    del tot3, foy_ind
    gc.collect()

    #final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    print "    loading fip"
    sif = load_temp(name='sif', year=year)

    print sif.columns
    print "    update final using fip"
    final = final.merge(sif, on=["noindiv"], how="left")
    #TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS

    print final.columns
    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    save_temp(final, name='final', year=year)
    print 'final sauvegardé'
    del sif, final
def famille(year=2006):
### On suit la méthode décrite dans le Guide ERF_2002_rétropolée page 135
#
    #TODO: appeler un fichier de paramètres de législation
    if year == 2006:
        smic = 1254
    elif year == 2007:
        smic = 1280
    elif year == 2008:
        smic = 1308
    elif year==2009:
        smic = 1337
    else:
        print("smic non défini")

## TODO check if we can remove acteu forter etc since dealt with in 01_pre_proc
#
#indivi <- LoadIn(indm,indVar)

#    indivi = erf_indivi.merge(eec_indivi)
#
    print 'Etape 1 : préparation de base'
    print '    1.1 : récupération de indivi'
    indivi = load_temp(name="indivim", year=year)

    indivi['year'] = year
    indivi["noidec"] =   indivi["declar1"].apply(lambda x: str(x)[0:2])
    indivi["agepf"] = where(indivi['naim'] < 7, indivi['year'] - indivi['naia'] ,
                            indivi['year'] - indivi['naia'] - 1)

    indivi = indivi[ ~((indivi['lien']==6) & (indivi['agepf']<16) & ("quelfic"=="EE"))]


    print '    1.2 : récupération des enfants à naître'
    indVar = ['noi','noicon','noindiv','noiper','noimer','ident','declar1','naia','naim','lien','quelfic','acteu','stc','contra','titc','mrec',
            'forter','rstg','retrai','lpr','cohab','ztsai','sexe','persfip','agepr','rga','actrec',
            "agepf","noidec","year"]

    enfnn = load_temp(name='enfnn', year=year)
    enfnn = enfnn[indVar] #NOTE: la moitié des colonnes est remplie avec des NaN

    enfnn = enfnn.drop_duplicates('noindiv')
    print 'nb enfants à naitre', len(enfnn.index)
    print 'On enlève les enfants à naitre qui ne sont pas les enfants de la personne de référence'
    enfnn = enfnn[enfnn['lpr']==3]
    enfnn = enfnn[~(enfnn.noindiv.isin(indivi.noindiv.values))]
    print len(enfnn.index)

# # # PB with vars "agepf"  "noidec" "year"  NOTE: quels problèmes ? JS
# # base <- rbind(indivi,enfnn)
# # setdiff(names(indivi),names(enfnn))
# #

    print '    1.3 : création de base'
    base = concat([indivi, enfnn])
    print 'length of base', len(base.index)

    base['noindiv'] = 100*base['ident'] + base['noi']
    base['m15'] = (base['agepf']<16)
    base['p16m20'] = ((base['agepf']>=16) & (base['agepf']<=20))
    base['p21'] = (base['agepf']>=21)
    base['ztsai'] = where(base['ztsai'] is None, 0, base['ztsai'])
    base['smic55'] = (base['ztsai'] >= smic*12*0.55) ##55% du smic mensuel brut
    base['famille'] = 0
    base['kid'] = False
    print base.smic55.describe()

    def control_04(dataframe):
        print 'longueur de la dataframe après opération =', len(dataframe.index)
        dup = dataframe.duplicated(cols='noindiv')
        print 'contrôle des doublons =>', any(dup==True) #dup.describe()
        print 'contrôle des colonnes ->', len(dataframe.columns)
        print 'nombre de familles différentes', len(set(famille.noifam.values))
        print 'contrôle noifam is null:', len(dataframe[dataframe['noifam'].isnull()])
        if len(dataframe.index) > len(base.index): raise Exception('too many rows compared to base')

# # message('Etape 1: On cherche les enfants ayant père et/ou mère')
# # pr <- subset(base,lpr==1,c('ident','noi'))
# # pr$noifam <- 100*pr$ident + pr$noi
# # pr <- pr[c('ident','noifam')]
# #
# # nof01 <- subset(base,(lpr %in% c(1,2) )|(lpr==3 & m15) | (lpr==3 & (p16m20 & !smic55) ))
# # nof01 <- merge(pr,nof01,by ='ident')
# # nof01 <- within(nof01,{
# #   famille <- 10
# #   kid <-(lpr==3 & m15) | (lpr==3 & (p16m20 & !smic55 ) )
# #   })
# # famille <- nof01

    print ''
    print 'Etape 2 : On cherche les enfants ayant père et/ou mère'

    pr = base[base['lpr']==1].loc[:, ['ident', 'noi']]
    pr['noifam'] = 100*pr['ident'] + pr['noi']
    pr = pr.loc[:, ['ident', 'noifam']]
    print 'length pr', len(pr.index)

    nof01 = base[(base.lpr.isin([1,2])) | ((base['lpr'] == 3) & (base['m15'])) |
                 ((base['lpr'] == 3) & (base['p16m20']) & (~base['smic55']))]
    print 'longueur de nof01 avant merge', len(nof01.index)
    nof01 = nof01.merge(pr, on='ident', how='outer')
    nof01['famille'] = 10
    nof01['kid'] = ((nof01['lpr']==3) & (nof01['m15'])) | ((nof01['lpr']==3) & (nof01['p16m20']) & ~(nof01['smic55']))
    famille = nof01
    print nof01['kid'].value_counts()
    print nof01.lpr.value_counts()

    del nof01
    control_04(famille)


    print '    2.1 : identification des couples'
    # l'ID est le noi de l'homme
    hcouple = subset_base(base,famille)
    hcouple = hcouple[(hcouple['cohab']==1) & (hcouple['lpr']>=3) & (hcouple['sexe']==1)]
    hcouple['noifam'] = 100*hcouple['ident'] + hcouple['noi']
    hcouple['famille'] = 21
    print 'longueur hcouple', len(hcouple.index)

# # message('Etape 2b')
# # fcouple<- base[!base$noindiv %in% famille$noindiv,]
# # fcouple <- subset(fcouple,(cohab==1) & (lpr>=3) & (sexe==2))
# # fcouple <- within(fcouple,{
# #     noifam <- 100*ident + noicon ## l'identifiant est le conjoint du ménage  */
# #     famille <- 22 })
# #
# # famcom<- merge(fcouple['noifam'],hcouple['noifam'])
# # fcouple <- merge(famcom,fcouple)
# #
# # famille <- rbind(famille,hcouple,fcouple)

    print '    2.2 : attributing the noifam to the wives'
    fcouple = base[~(base.noindiv.isin(famille.noindiv.values))]
    fcouple = fcouple[(fcouple['cohab']==1) & (fcouple['lpr']>=3) & (fcouple['sexe']==2)]
    fcouple['noifam'] = 100*fcouple['ident'] + fcouple['noi']
    fcouple['famille'] = 22
    print 'longueur fcouple', len(fcouple.index)

    famcom = fcouple.merge(hcouple, on='noifam', how='outer')
    print 'longueur fancom après fusion', len(famcom.index)
    fcouple = fcouple.merge(famcom) #NOTE : faire un inner merge sinon présence de doublons
    print 'longueur fcouple après fusion', len(fcouple.index)

    famille = concat([famille, hcouple, fcouple], join='inner')
    control_04(famille)

    print ''
    print 'Etape 3: Récupération des personnes seules'
    print '    3.1 : personnes seules de catégorie 1'
    seul1 = base[~(base.noindiv.isin(famille.noindiv.values))]
    seul1 = seul1[(seul1.lpr.isin([3,4])) & ((seul1['p16m20'] & seul1['smic55'])|seul1['p21']) & (seul1['cohab']==1) &
                  (seul1['sexe']==2)]
    if len(seul1.index)>0:
        seul1['noifam'] = 100*seul1['ident'] + seul1['noi']
        seul1['famille'] = 31
        famille = concat([famille, seul1])
    print len(seul1.index)
    control_04(famille)

# # message('  3.2 personnes seules 2')
# # seul2 <- base[(!base$noindiv %in% famille$noindiv),]
# # seul2 <- subset(seul2,(lpr %in% c(3,4)) & p16m20 & smic55 & (cohab!=1))
# # seul2 <- within(seul2,{noifam <- 100*ident+noi
# #                      famille <- 32})
# # famille <- rbind(famille,seul2)
    print '    3.1 personnes seules de catégorie 2'
    seul2 = base[~(base.noindiv.isin(famille.noindiv.values))]
    seul2 = seul2[(seul2.lpr.isin([3,4])) & seul2['p16m20'] & seul2['smic55'] & (seul2['cohab'] != 1)]
    seul2['noifam'] = 100*seul2['ident'] + seul2['noi']
    seul2['famille'] = 32
    famille = concat([famille, seul2])
    control_04(famille)

# # message(' 3.3 personnes seules 3')
# # seul3 <- base[(!base$noindiv %in% famille$noindiv),]
# # seul3 <- subset(seul3,(lpr %in% c(3,4)) & p21 & cohab!=1)
# #     ## TODO CHECK erreur dans le guide méthodologique ERF 2002 lpr 3,4 au lieu de 3 seulement */
# # seul3 <- within(seul3,{noifam=100*ident+noi
# #                          famille = 33})
# # famille <- rbind(famille,seul3)

    print '    3.3 personnes seules de catégorie 3'
    seul3 = subset_base(base,famille)
    seul3 = seul3[(seul3.lpr.isin([3,4])) & seul3['p21'] & (seul3['cohab'] != 1)]
    seul3['noifam'] = 100*seul3['ident'] + seul3['noi']
    seul3['famille'] = 33
    famille = concat([famille, seul3])
    control_04(famille)

# # message(' 3.4 personnes seules 4')
# # seul4 <- base[(!base$noindiv %in% famille$noindiv),]
# # seul4 <- subset(seul4,(lpr==4) & p16m20 & !smic55 & noimer==0 & noiper==0 & persfip=="vous")
# #
# # if (nrow(seul4) >0 ) {  # 2006, 2009 pas de personne seule (sans enfant fip)
# #   seul4 <- within(seul4,{noifam = 100*ident + noi
# #                          famille = 34})
# # }
# #
# # famille <- rbind(famille,seul4)

    print '    3.4 : personnes seules de catégorie 4'
    seul4 = subset_base(base,famille)
    seul4 = seul4[(seul4['lpr']==4) & seul4['p16m20'] & ~(seul4['smic55']) & (seul4['noimer']==0) &
                  (seul4['persfip']=='vous')]

    if len(seul4.index)>0:
        seul4['noifam'] = 100*seul4['ident'] + seul4['noi']
        seul4['famille'] = 34
        famille = concat([famille, seul4])
    control_04(famille)

# # message('Etape 4')
# # message(' 4.1 enfant avec mère')
# # avec_mere <- base[(!base$noindiv %in% famille$noindiv),]
# # avec_mere <- subset(avec_mere,((lpr=4) & ( (p16m20=1) | (m15=1))) & noimer!=0)
# #
# # avec_mere <- within(avec_mere,{noifam=100*ident + noimer
# #              famille=41
# #              kid=TRUE})
# #
# # ## on récupère les mères */
# # mereid <- upData(avec_mere['noifam'], rename = c(noifam = 'noindiv'));
# # mereid <- unique(mereid)
# #
# # mere <- merge(mereid,base)
# # mere <- within(mere,{noifam=100*ident + noi
# #                      famille=42})
# # # TODO il y a deux mères qui ne sont pas dans les individus (problème des conjoints fip ? MBJ ne comprends pas) :
# # dim(mereid)
# # dim(mere)
# # # TODO on préfère  donc enlever leurs enfants
# # avec_mere <- avec_mere[avec_mere$noifam %in% mere$noifam,]
# #
    print ''
    print 'Etape 4 : traitement des enfants'
    print '    4.1 : enfant avec mère'
    avec_mere = subset_base(base,famille)
    avec_mere =  avec_mere[((avec_mere['lpr']==4) & ((avec_mere['p16m20']==1) | (avec_mere['m15']==1)) &
                           (avec_mere['noimer'] != 0))]
    avec_mere['noifam'] = 100*avec_mere['ident'] + avec_mere['noimer']
    avec_mere['famille'] = 41
    avec_mere['kid'] = True

    #On récupère les mères des enfants
    mereid = DataFrame(avec_mere['noifam'])
    mereid.columns = ['noindiv']
    mereid = mereid.drop_duplicates()

    mere = mereid.merge(base)
    mere['noifam'] = 100*mere['ident'] + mere['noi']
    mere['famille'] = 42 #H2G2 nous voilà
    avec_mere = avec_mere[avec_mere.noifam.isin(mere.noifam.values)]
    print 'contrôle df mère'
    control_04(mere)

# # conj_mere <- merge(conj_mereid,base)
# # conj_mere$famille <- 43
# #
# # famille <- famille[(!famille$noindiv %in% mere$noindiv),]
# #
# # ## on récupère les conjoints des mères */
# # conj_mereid <- mere[mere$noicon!=0,c('ident','noicon','noifam')]
# #
# # conj_mereid$noindiv = 100*conj_mereid$ident + conj_mereid$noicon
# # conj_mereid <- conj_mereid[c('noindiv','noifam')]
# #
# # conj_mere <- merge(conj_mereid,base)
# # conj_mere$famille <- 43
# #
# # famille <- famille[(!famille$noindiv %in% conj_mere$noindiv),]
# # famille <- rbind(famille,avec_mere,mere,conj_mere)
# #

    famille = famille[~(famille.noindiv.isin(mere.noindiv.values))]
    control_04(famille)

    #on retrouve les conjoints des mères
    conj_mereid = mere[mere['noicon']!=0].loc[:, ['ident', 'noicon', 'noifam']]
    conj_mereid['noindiv'] = 100*conj_mereid['ident'] + conj_mereid['noicon']
    conj_mereid = conj_mereid.loc[:, ['noindiv', 'noifam']]
    conj_mereid = conj_mereid.merge(base)
    control_04(conj_mereid)

    conj_mere = conj_mereid.merge(base)
    conj_mere['famille'] = 43

    famille = famille[~(famille.noindiv.isin(conj_mere.noindiv.values))]
    famille = concat([famille, avec_mere, mere, conj_mere])
    control_04(famille)
    del avec_mere, mere, conj_mere, mereid, conj_mereid

# # message(' 4.2 enfants avec père')
# # avec_pere <- base[(!base$noindiv %in% famille$noindiv),]
# # avec_pere <- subset(avec_pere,((lpr=4) & ( (p16m20=1) | (m15=1))) & noiper!=0)
# # avec_pere <- within(avec_pere,{noifam=100*ident + noiper
# #              famille=44
# #              kid=TRUE})
# #
# # ## on récupère les pères  pour leur attribuer une famille propre */
# # pereid <- upData(avec_pere['noifam'], rename = c(noifam = 'noindiv'));
# # pereid <- unique(pereid)
# # pere <- merge(pereid,base)
# # pere <- within(pere,{noifam=100*ident + noi
# #                        famille=45})
# #
# # famille <- famille[(!famille$noindiv %in% pere$noindiv),]
# #
# # ## on récupère les conjoints des pères */
# # conj_pereid <- pere[pere$noicon!=0,c('ident','noicon','noifam')]
# # conj_pereid$noindiv = 100*conj_pereid$ident + conj_pereid$noicon
# # conj_pereid <- conj_pereid[c('noindiv','noifam')]
# #
# # conj_pere <- merge(conj_pereid,base)
# # if (nrow(conj_pere) >0) conj_pere$famille <- 46
# # # 2006: erreur pas de conjoint de père ?
# #
# # famille <- famille[(!famille$noindiv %in% conj_pere$noindiv),]
# # famille <- rbind(famille,avec_pere,pere,conj_pere)

    print '    4.2 : enfants avec père'
    avec_pere = subset_base(base,famille)
    avec_pere = avec_pere[(avec_pere['lpr']==4) &
                          ((avec_pere['p16m20']==1) | (avec_pere['m15']==1)) &
                          (avec_pere['noiper'].notnull())]
    avec_pere['noifam'] = 100*avec_pere['ident'] + avec_pere['noiper']
    avec_pere['famille'] = 44
    avec_pere['kid'] = True
    print 'presence of NaN in avec_pere ?', avec_pere['noifam'].isnull().any()


    pereid = DataFrame(avec_pere['noifam']); pereid.columns = ['noindiv']
    pereid = pereid.drop_duplicates()
    pere = base.merge(pereid, on='noindiv', how='inner')

    pere['noifam'] = 100*pere['ident'] + pere['noi']
    pere['famille'] = 45
    famille = famille[~(famille.noindiv.isin(pere.noindiv.values))]

    #On récupère les conjoints des pères
    conj_pereid = pere.loc[array(pere['noicon']!=0), ['ident','noicon','noifam']]
    conj_pereid['noindiv'] = 100*conj_pereid['ident'] + conj_pereid['noicon']
    conj_pereid = conj_pereid.loc[:, ['noindiv','noifam']]

    conj_pere = base.merge(conj_pereid, on=['noindiv'] ,how='inner')
    control_04(conj_pere)
    if len(conj_pere.index)>0 : conj_pere['famille'] = 46

    famille = famille[~(famille.noindiv.isin(conj_pere.noindiv.values))]
    famille = concat([famille, avec_pere, pere, conj_pere])
    print 'contrôle de famille après ajout des pères'
    control_04(famille)
    del avec_pere,pere,pereid,conj_pere,conj_pereid

# # ##* 42. enfants avec déclarant */
# # avec_dec <- base[(!base$noindiv %in% famille$noindiv),]
# # avec_dec <- subset(avec_dec,(persfip=="pac") & (lpr=4) &  ( (p16m20&!smic55) | (m15=1 )))
# # avec_dec <- within(avec_dec,{noifam = 100*ident + noidec
# #             famille=47
# #             kid=TRUE})
# #
# # ## on récupère les déclarants pour leur attribuer une famille propre */
# # decid <- upData(avec_dec['noifam'], rename = c(noifam = 'noindiv'));
# # decid <- unique(decid)
# #
# # dec <- merge(decid,base)
# # dec <- within(dec,{noifam=100*ident + noi
# #                    famille=48})
# #
# # famille <- famille[(!famille$noindiv %in% dec$noindiv),]
# # famille <- rbind(famille,avec_dec,dec)

    print '    4.3 : enfants avec déclarant'
    avec_dec = subset_base(base,famille)
    avec_dec = avec_dec[(avec_dec['persfip']=="pac") & (avec_dec['lpr']==4) &
                    ( (avec_dec['p16m20'] & ~(avec_dec['smic55'])) | (avec_dec['m15']==1 ))]
    avec_dec['noifam'] = 100*avec_dec['ident'] + avec_dec['noidec'].astype('float')
    avec_dec['famille'] = 47
    avec_dec['kid'] = True
    control_04(avec_dec)

    #on récupère les déclarants pour leur attribuer une famille propre
    decid = DataFrame(avec_dec['noifam']) ; decid.columns = ['noindiv']
    decid = decid.drop_duplicates()
    dec = base.merge(decid, how='inner')
    dec['noifam'] = 100*dec['ident'] + dec['noi']
    dec['famille'] = 48

    famille = famille[~(famille.noindiv.isin(dec.noindiv.values))]
    famille = concat([famille, avec_dec, dec])
    del dec, decid, avec_dec
    control_04(famille)

# # ## famille etape 5 : enfants fip */
# # message('Etape 5 : enfants fip')
# # # On rajoute les enfants fip
# # # (on le fait ici pour que cela n'interfère pas avec les recherches précédentes)
# # fip <- LoadIn(fipDat)
# #
# # indVar = c('noi','noicon','noindiv','noiper','noimer','ident','declar1','naia','naim','lien','quelfic','acteu','stc','contra','titc','mrec',
# #             'forter','rstg','retrai','lpr','cohab','ztsai','sexe','persfip','agepr','rga')
# #
# # fip <- fip[c(indVar,'actrec','agepf','noidec','year')]
# #
# # table(duplicated(fip$noindiv))
# #
# # ## Variables auxilaires présentes dans base qu'il faut rajouter aux fip'
# # ## WARNING les noindiv des fip sont construits sur les ident des déclarants
# # ## pas d'orvelap possible avec les autres noindiv car on a des noi =99, 98, 97 ,...'
# # names(fip)
# #
# # fip <- within(fip,{
# #   m15 <- (agepf<16)
# #   p16m20 <- ((agepf>=16) & (agepf<=20))
# #   p21 <- (agepf>=21)
# #   ztsai[is.na(ztsai)] <- 0
# #   smic55 <- (ztsai >= smic*12*0.55)   ## 55% du smic mensuel brut */
# #   famille <- 0
# #   kid <- FALSE
# # })

    print ''
    print 'Etape 5 : récupération des enfants fip-----------'
    print '    5.1 : création de la df fip'
    fip = load_temp(name='fipDat', year=year)
    indVar_fip = ['noi','noicon','noindiv','noiper','noimer','ident','declar1','naia','naim','lien','quelfic','acteu','stc','contra','titc','mrec',
            'forter','rstg','retrai','lpr','cohab','ztsai','sexe','persfip','agepr','rga','actrec','agepf','noidec','year']
    fip = fip.loc[:, indVar_fip]

    # Variables auxilaires présentes dans base qu'il faut rajouter aux fip'
    # WARNING les noindiv des fip sont construits sur les ident des déclarants
    # pas d'orvelap possible avec les autres noindiv car on a des noi =99, 98, 97 ,...'
    fip['m15'] = (fip['agepf']<16)
    fip['p16m20'] = ((fip['agepf']>=16) & (fip['agepf']<=20))
    fip['p21'] = (fip['agepf']>=21)
#     fip['ztsai'][fip['ztsai'] is None] = 0 #there are alrdy zeros
    fip['smic55'] = (fip['ztsai'] >= smic*12*0.55)
    fip['famille'] = 0
    fip['kid'] = False
    print fip['ztsai'].isnull().describe()

# # base <- rbind(base,fip)
# # table(base$quelfic)

# # enfant_fip <- base[(!base$noindiv %in% famille$noindiv),]
# # enfant_fip <- subset(enfant_fip, (quelfic=="FIP") & (( (agepf %in% c(19,20)) & !smic55 ) | (naia==year & rga=='6')) )  # TODO check year ou year-1 !
# # enfant_fip <- within(enfant_fip,{
# #                      noifam=100*ident+noidec
# #                      famille=50
# #                      kid=TRUE})
# # #                     ident=NA}) # TODO : je ne sais pas quoi mettre un NA fausse les manips suivantes
# # famille <- rbind(famille,enfant_fip)
# #
# # # TODO: En 2006 on peut faire ce qui suit car tous les parents fip sont déjà dans une famille
# # parent_fip <- famille[famille$noindiv %in% enfant_fip$noifam,]
# # any(enfant_fip$noifam %in% parent_fip$noindiv)
# # parent_fip <- within(parent_fip,{
# #                      noifam <- noindiv
# #                      famille <- 51
# #                      kid <- FALSE})
# # famille[famille$noindiv %in% enfant_fip$noifam,] <- parent_fip
# # # TODO quid du conjoint ?

    print "    5.2 : extension de base avec les fip"
    print fip[['noindiv', 'noidec', 'ztsai']].describe()
    base_ = concat([base, fip])
    print len(base.index)

    enfant_fip = subset_base(base_, famille)
    print enfant_fip.ix[enfant_fip['quelfic']=="FIP","agepf"].describe()

    enfant_fip = enfant_fip[(enfant_fip['quelfic']=="FIP") &
                            ((enfant_fip.agepf.isin([19,20]) & ~(enfant_fip['smic55'])) |
                            ((enfant_fip['naia']==enfant_fip['year']-1) & (enfant_fip['rga'].astype('int')==6)))]

    enfant_fip['noifam'] = 100*enfant_fip['ident'] + enfant_fip['noidec']
    enfant_fip['famille'] = 50
    enfant_fip['kid'] = True
    enfant_fip['ident'] = None
    control_04(enfant_fip)

    famille = concat([famille, enfant_fip])
    base = concat([base, enfant_fip])
    parent_fip = famille[famille.noindiv.isin(enfant_fip.noifam.values)]

    if any(enfant_fip.noifam.isin(parent_fip.noindiv.values)): print "Doublons entre enfant_fip et parent fip !"
    parent_fip['noifam'] = parent_fip['noindiv']
    parent_fip['famille'] = 51
    parent_fip['kid'] = False
    print 'contrôle de parent_fip'
    control_04(parent_fip)

    print 'famille defore merge and clearing'
    control_04(famille)

    famille = famille.merge(parent_fip, how='outer'); famille['famille'] = famille['famille'].astype('int')
    famille = famille.drop_duplicates(cols='noindiv', take_last=True)

    print 'famille after merge and clearing'
    print set(famille.famille.values)
    control_04(famille)
    print famille.loc[famille.noindiv.isin(enfant_fip.noifam), 'famille'].describe()
    del enfant_fip, fip, parent_fip

# # message('Etape 6 : non attribué')
# # non_attribue1 <- base[(!base$noindiv %in% famille$noindiv),]
# # non_attribue1 <- subset(non_attribue1,
# #                         (quelfic!="FIP") & (m15 | (p16m20&(lien %in% c(1,2,3,4) & agepr>=35)))
# #                         )
# # # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
# # non_attribue1 <- merge(pr,non_attribue1)
# # non_attribue1 <- within(non_attribue1,{
# #   famille <- ifelse(m15,61,62)
# #     kid <- TRUE })
# #
# # rm(pr)
# # famille <- rbind(famille,non_attribue1)
# # dup <- duplicated(famille$noindiv)
# # table(dup)
# # rm(non_attribue1)
# # table(famille$famille, useNA="ifany")
# #
# # non_attribue2 <- base[(!base$noindiv %in% famille$noindiv) & (base$quelfic!="FIP"),]
# # non_attribue2 <- within(non_attribue2,{
# #   noifam <- 100*ident+noi # l'identifiant est celui du jeune */
# #     kid<-FALSE
# #     famille<-63})
# #
# # famille <- rbind(famille,non_attribue2)
    print ''
    print 'Etape 6 : gestion des non attribués'
    print '    6.1 : non attribués type 1'
    non_attribue1 = subset_base(base,famille)
    non_attribue1 = non_attribue1[~(non_attribue1['quelfic'] != 'FIP') & (non_attribue1['m15'] |
                                    (non_attribue1['p16m20'] & (non_attribue1.lien.isin(range(1,5))) &
                                     (non_attribue1['agepr']>=35)))]
    # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
    non_attribue1 = pr.merge(non_attribue1)
    control_04(non_attribue1)
    non_attribue1['famille'] = where(non_attribue1['m15'], 61, 62)
    non_attribue1['kid'] = True

    famille = concat([famille, non_attribue1])
    control_04(famille)
    del pr, non_attribue1

    print '    6.2 : non attribué type 2'
    non_attribue2 = base[(~(base.noindiv.isin(famille.noindiv.values)) & (base['quelfic']!="FIP"))]
    non_attribue2['noifam'] = 100*non_attribue2['ident'] + non_attribue2['noi']
    non_attribue2['noifam'] = non_attribue2['noifam'].astype('int')
    non_attribue2['kid'] = False
    non_attribue2['famille'] = 63

    famille = concat([famille, non_attribue2], join='inner')
    control_04(famille)
    del non_attribue2

# # ## Sauvegarde de la table famille */
# #
# # # TODO nettoyer les champs qui ne servent plus à rien
# #
    print ''
    print 'Etape 7 : Sauvegarde de la table famille'
    print '    7.1 : Mise en forme finale'
    famille['idec'] = famille['declar1'].str[3:11]
    print famille['declar1'].notnull().describe()
    famille['idec'].apply(lambda x: str(x)+'-')
    famille['idec'] += famille['declar1'].str[0:2]
    famille['chef'] = (famille['noifam'] == famille['ident']*100+famille['noi'])

    famille.reset_index(inplace=True)
    print famille['idec'].isnull().describe()

    control_04(famille)

    print '    7.2 : création de la colonne rang'
    famille['rang'] = famille['kid'].astype('int')

    while any(famille[(famille['rang']!=0)].duplicated(cols=['rang', 'noifam'])):
        famille["rang"][famille['rang']!=0] = where(
                                    famille[famille['rang']!=0].duplicated(cols=["rang", 'noifam']),
                                    famille["rang"][famille['rang']!=0] + 1,
                                    famille["rang"][famille['rang']!=0])
        print "nb de rangs différents", len(set(famille.rang.values))

    print '    7.3 : création de la colonne quifam et troncature'

    print 'value_counts chef',famille['chef'].value_counts()
    print 'value_counts kid', famille['kid'].value_counts()

    famille['quifam'] = -1
    print 'controle initial', len(famille[famille['quifam']==-1])
    famille['quifam'] = where(famille['chef'], 0, famille['quifam'])
    famille['quifam'] = where(famille['kid'], 1 + famille['rang'], famille['quifam'])
    famille['quifam'] = where(~(famille['chef']) & ~(famille['kid']), 1, famille['quifam'])

    famille['noifam'] = famille['noifam'].astype('int')
    print famille['quifam'].value_counts()

    famille_check = famille
    famille = famille.loc[:, ['noindiv', 'quifam', 'noifam']]
    famille.columns = ['noindiv', 'quifam', 'idfam']

    print 'Vérifications sur famille'
    assert len(famille_check.loc[famille_check['chef'], :]) == len(set(famille.idfam.values)), 'the number of family chiefs is different from the number of families'
    assert not(any(famille.duplicated(cols=['idfam', 'quifam']))), 'there are duplicates of quifam inside a family'
    assert famille['quifam'].notnull().all(), 'there are missing values in quifam'
    assert famille['idfam'].notnull().all(), 'there are missing values in idfam'
#    control(famille, debug=True, verbose=True, verbose_columns=['idfam', 'quifam'])

    print '    Sauvegarde de famille'
    save_temp(famille, name="famc", year=year)
    del famille_check, indivi, enfnn
Пример #5
0
def create_totals(year=2006):

    print "Creating Totals"
    print "Etape 1 : Chargement des données"

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert indivim.duplicated(['noindiv']).any() == False, "Présence de doublons"


    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.


    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1]+"o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)


    indivi_i = indivim[selection]
    indivi_i.rename(columns={"ident" : "idmen",
                     "persfip":"quifoy",
                     "zsali" : "sali2", # Inclu les salaires non imposables des agents d'assurance
                     "zchoi" : "choi2",
                     "zrsti" : "rsti2",
                     "zalri" : "alr2"}, inplace=True)


    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"



## We merge them with the other individuals
#indivim <- rename(indivim, c(ident = "idmen",
#                             persfip = "quifoy",
#                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
#                             zchoi = "choi2",
#                             zrsti = "rsti2",
#                             zalri = "alr2"))
#
#indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i)
#rm(indivim, indivi_i)
#gc()
#table(indivi$quelfic)
#

    indivim.rename( columns= dict(ident = "idmen",
                             persfip = "quifoy",
                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
                             zchoi = "choi2",
                             zrsti = "rsti2",
                             zalri = "alr2"), inplace=True)

    if not (set(list(indivim.noindiv)) >  set(list(indivi_i.noindiv)) ):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index( inplace=True)

    print ''
    print "Etape 2 : isolation des FIP"
    fip_imp = indivi.quelfic=="FIP_IMP"
    indivi["idfoy"] = (indivi["idmen"].astype("int64")*100 +
                       (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp,"idfoy"] = nan

## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?)


    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

#    indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100
#                                        + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) )
    indivi["idfoy"] = where(fip_has_declar,
                            indivi["idmen"]*100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
                            indivi["idfoy"])

    del fip_has_declar


    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar,
                            indivi["idmen"]*100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy","noindiv"]]


    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"]), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"


    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print ''
    print 'Etape 3 : Récupération des EE_NRT'

    nrt = indivi.quelfic=="EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt,"quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1,2])
    adults = (indivi.quelfic.isin(["EE","EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1,2]),"idfoy"].notnull().all()

    print ''
    print 'Etape 4 : Rattachement des enfants aux déclarations'

    assert indivi["noindiv"].duplicated().any() == False, "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3,4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE","EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert indivi.loc[enf_ee, "noindiv"].duplicated().any() == False, "Some noindiv appear twice"

    pere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] })
    mere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] })

    foyer = data.get_values(variables=["noindiv","zimpof"], table="foyer" )
    pere  = pere.merge(foyer, how="inner", on="noindiv")
    mere  = mere.merge(foyer, how="inner", on="noindiv")

#     print "Some pere et mere are duplicated because people have two foyers"
#     print pere[pere.duplicated()]
#     print mere[mere.duplicated()]

    df = pere.merge(mere, how="outer", on="noindiv_enf",  suffixes=('_p', '_m'))

#     print len(pere)
#     print len(mere)
#     print len(df)
#     ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"]
#     print df.loc[df["noindiv_enf"].isin(ll)]
#     print df[df.duplicated()]



    print '    4.1 : gestion des personnes dans 2 foyers'
    for col in ["noindiv_p","noindiv_m","noindiv_enf"]:
        df[col] = df[col].fillna(0,inplace=True) # beacause groupby drop groups with NA in index
    df = df.groupby(by=["noindiv_p","noindiv_m","noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    df["which"] = ""
    df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "")
    df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df["which"] = where(both & (df.zimpof_p  > df.zimpof_m), "pere", "mere")
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df["which"].notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns={"noindiv_enf" : "noindiv"}, inplace=True)
    df["idfoy"] = where( df.which=="pere", df.noindiv_p, df.noindiv_m)
    df["idfoy"] = where( df.which=="mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    for col in df.columns:
        if col not in ["idfoy", "noindiv"]:
            del df[col]

#     assert indivi.loc[enf_ee,"idfoy"].notnull().all()
    assert df.duplicated().any() == False

    df.set_index("noindiv",inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert indivi.duplicated().any() == False



# MBJ: issue delt with when moving from R code to python
## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
## On suit guide méthodo erf 2003 page 135
## On supprime les conjoints FIP et les FIP de 25 ans et plus;
## On conserve les enfants FIP de 19 à 24 ans;
## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
## EEC que l'on ne retrouvera pas dans les EEC suivantes.
#
    print '    4.2 : On enlève les individus pour lesquels il manque le déclarant'
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan


    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename( columns=dict(ident="idmen",
                     persfip="quifoy",
                     zsali="sali2", # Inclu les salaires non imposables des agents d'assurance
                     zchoi="choi2",
                     zrsti="rsti2",
                     zalri="alr2"),
                inplace=True)

    is_fip_19_25 = ((year-fip.naia-1)>=19) & ((year-fip.naia-1)<25)

## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
##fip[is_fip_19_25 ,"lpr"]  <- 1
#
#indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12*indivi.age  + 12-indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr']<=2] = False
    indivi['not_pr_cpr'][indivi['lpr']>2] = True


    print "    4.3 : Creating non pr=0 and cpr=1 idmen's"
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr']==True,['quimen', 'idmen']]
    test1['quimen'] = 2

    j=2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j+1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

#     indivi.set_index(['quiment']) #TODO: check relevance
#     TODO problème avec certains idfoy qui n'ont pas de vous
    print ''
    print "Etape 5 : Gestion des idfoy qui n'ont pas de vous"
    all = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy']=='vous', 'idfoy']
    without = all[~(all.idfoy.isin(with_.values))]

    print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous'
    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int')*100 +
                    indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all,with_,without, has_declar2

    print '    5.1 : Elimination idfoy restant'
    idfoyList = indivi.loc[indivi['quifoy']=="vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = ["noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm",
                            "age","agem","quelfic","actrec", "quimen",
                            "nbsala","titc","statut","txtppb","chpub","prosa","encadr"]

    if not(len(set(myvars).difference(set(indivi.columns))) == 0):
        print set(myvars).difference(set(indivi.columns))

    assert len(set(myvars).difference(set(indivi.columns))) == 0

    indivi = indivi.loc[:, myvars]

## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
## les infos provenant des déclarations)

    print ''
    print 'Etape 6 : Création des variables descriptives'
    print '    6.1 : variable activité'
    indivi['activite'] = None
    indivi['activite'][indivi['actrec']<=3] = 0
    indivi['activite'][indivi['actrec']==4] = 1
    indivi['activite'][indivi['actrec']==5] = 2
    indivi['activite'][indivi['actrec']==7] = 3
    indivi['activite'][indivi['actrec']==8] = 4
    indivi['activite'][indivi['age']<=13] = 2 # ce sont en fait les actrec=9
    print indivi['activite'].value_counts()
    # TODO: MBJ problem avec les actrec


    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all() , Exception("Problème avec les titc")


    print '    6.2 : variable statut'
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    indivi['statut'][indivi['statut']==11] = 1
    indivi['statut'][indivi['statut']==12] = 2
    indivi['statut'][indivi['statut']==13] = 3
    indivi['statut'][indivi['statut']==21] = 4
    indivi['statut'][indivi['statut']==22] = 5
    indivi['statut'][indivi['statut']==33] = 6
    indivi['statut'][indivi['statut']==34] = 7
    indivi['statut'][indivi['statut']==35] = 8
    indivi['statut'][indivi['statut']==43] = 9
    indivi['statut'][indivi['statut']==44] = 10
    indivi['statut'][indivi['statut']==45] = 11
    assert indivi['statut'].isin(range(12)).all(), Exception("statut value over range")


#indivi$nbsala <- as.numeric(indivi$nbsala)
#indivi <- within(indivi,{
#  nbsala[is.na(nbsala) ]    <- 0
#  nbsala[nbsala==99 ] <- 10  # TODO  418 fip à retracer qui sont NA
#})

    print '    6.3 : variable txtppb'
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala']==99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print '    6.4 : variable chpub et CSP'
    indivi['chpub'].fillna(0, inplace=True)
    indivi['chpub'] = indivi['chpub'].astype('int')
    indivi['chpub'][indivi['chpub'].isnull()] = 0
    print indivi['chpub'].value_counts()
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print indivi['encadr'].value_counts()

    # encadr : 1=oui, 2=non
    indivi['encadr'].fillna(2, inplace=True)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7,8])] = 1
    indivi['cadre'][(indivi['prosa']==9) & (indivi['encadr']==1)] = 1
    print "cadre"
    print indivi['cadre'].value_counts()
    assert indivi['cadre'].isin(range(2)).all()

    print ''
    print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"

    print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])

    print 'On crée les n° de personnes à charge'
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy']=='vous'] = 0
    indivi['quifoy2'][indivi['quifoy']=='conj'] = 1
    indivi['quifoy2'][indivi['quifoy']=='pac'] = 2


    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy']==2, ['quifoy', 'idfoy','noindiv']]
    print_id(test2)

    j=2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv','idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x']==2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])
    print_id(indivi)

#####################################################################################
## On ajoute les idfam et quifam
#load(famc)
#
#tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE)
#rm(famille)
#print_id(tot2)
#
### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls
#tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"]
#tot2[is.na(tot2$quifam), "quifam"] <- 0
#print_id(tot2)
#saveTmp(tot2, file = "tot2.Rdata")
#rm(indivi,tot2)
#
## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents
## load foyer
#loadTmp(file = "tot2.Rdata")
#loadTmp(file= "foyer_aggr.Rdata")
#
#tot3 <- merge(tot2, foyer, all.x = TRUE)
#print_id(tot3) # OK
#saveTmp(tot3, file= "tot3.Rdata")
#rm(tot3,tot2,foyer)
#
    print ''
    print 'Etape 8 : création des fichiers totaux'
    famille = load_temp(name='famc', year=year)



    print '    8.1 : création de tot2 & tot3'
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
#     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille


    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print '    tot2 saved'

#     #On combine les variables de revenu
#     foyer = load_temp(name='foy_ind', year=year)
#     print " INTERSERCT THE POOCHAY"
#     tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1
#     print "pingas"
#     print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10]
#     print "pocchay"
#     print sorted(foyer["idfoy"].unique())[0:10]
#     print "final flash"
#     print 602062550.0 in foyer["idfoy"].values
#     print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique())))
#     print tot2.quifoy.value_counts()
    #tot2.update(foyer)
    tot2.merge(foyer, how = 'left')

    tot2 = tot2[tot2.idmen.notnull()]
#     tot2['idfoy'] += 1

    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(cols='noindiv')
    print len(tot3)

    #Block to remove any unwanted duplicated pair
    print "    check tot3"
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

## On ajoute les variables individualisables
#loadTmp("foyer_individualise.Rdata") # foy_ind
#loadTmp("tot3.Rdata")
#loadTmp("allvars.Rdata")
#loadTmp("sif.Rdata")
#
#vars2 <- setdiff(names(tot3),  allvars)
#tot3 <- tot3[,vars2]
#
#print_id(tot3)
#final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE)
#
    print '    8.2 : On ajoute les variables individualisables'

    allvars = load_temp(name = 'ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print len(tot3)


    assert not(tot3.duplicated(cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" %(lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print 'tot3 sauvegardé'
    gc.collect()
Пример #6
0
def final(year=2006, filename="test", check=True):

    ##***********************************************************************/
    print('08_final: derniers réglages')
    ##***********************************************************************/
    #
    # loadTmp("final.Rdata")
    # # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
    # final$statmarit[is.na(final$statmarit)] <- 2
    # table(final$statmarit, useNA='ifany')
    #
    import gc
    gc.collect()
    final = load_temp("final", year=year)
    print 'check doublons', len(final[final.duplicated(['noindiv'])])
    final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit)
    #

    # # activite des fip
    # table(final[final$quelfic=="FIP","activite"],useNA="ifany")
    # summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] )
    # # activite      # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4
    #
    # final_fip <- final[final$quelfic=="FIP",]
    # final_fip <- within(final_fip,{
    #   choi <- ifelse(is.na(choi),0,choi)
    #   sali <- ifelse(is.na(sali),0,sali)
    #   alr <- ifelse(is.na(alr),0,alr)
    #   rsti <- ifelse(is.na(rsti),0,rsti)
    #   activite <- 2 # TODO comment choisr la valeur par d?faut ?
    #   activite <- ifelse(choi > 0,1,activite)
    #   activite <- ifelse(sali > 0,0,activite)
    #   activite <- ifelse(age  >= 21, 2,activite) # ne peuvent être rattach?s que les ?tudiants
    # })
    # final[final$quelfic=="FIP",]<- final_fip
    # table(final_fip[,c("age","activite")])
    # rm(final_fip)
    #
    # print_id(final)
    # saveTmp(final, file= "final.Rdata")
    #
    print '    gestion des FIP de final'
    final_fip = final.loc[final.quelfic == "FIP",
                          ["choi", "sali", "alr", "rsti", "age"]]

    print set(["choi", "sali", "alr",
               "rsti"]).difference(set(final_fip.columns))
    for var in ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace=True)
        assert final_fip[var].notnull().all(
        ), "some NaN are remaining in column %s" % (var)

    final_fip["activite"] = 2  # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(
        final_fip.age > 21, 2,
        final_fip.activite)  # ne peuvent être rattach?s que les ?tudiants

    final.update(final_fip)
    save_temp(final, name="final", year=year)
    print '    final has been updated with fip'

    # loadTmp("final.Rdata")
    # load(menm)
    # menagem <- rename(menagem, c("ident"="idmen","loym"="loyer"))
    # menagem$cstotpragr <- floor(menagem$cstotpr/10)
    #
    from math import floor

    menagem = load_temp(name="menagem", year=year)
    menagem.rename(columns=dict(ident="idmen", loym="loyer"), inplace=True)
    menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x / 10))
    #
    # # 2008 tau99 removed TODO: check ! and check incidence
    # if (year == "2008") {
    #  vars <- c("loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
    # } else {
    #   vars <- c("loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
    # }
    #
    # famille_vars <- c("m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm')

    if year == 2008:
        vars = [
            "loyer", "tu99", "pol99", "reg", "idmen", "so", "wprm", "typmen15",
            "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"
        ]
    else:
        vars = [
            "loyer", "tu99", "pol99", "tau99", "reg", "idmen", "so", "wprm",
            "typmen15", "nbinde", "ddipl", "cstotpragr", "champm", "zthabm"
        ]
    famille_vars = [
        "m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm'
    ]

    # if ("naf16pr" %in% names(menagem)) {
    #   naf16pr <- factor(menagem$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menagem$naf16pr <- as.character(naf16pr)
    #   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menagem)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menagem$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menagem$nafg17npr <- as.character(nafg17npr)
    #   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }
    #

    #TODO: TODO: pytohn translation needed
    #    if "naf16pr" in menagem.columns:
    #        naf16pr <- factor(menagem$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menagem$naf16pr <- as.character(naf16pr)
    #   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menagem)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menagem$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menagem$nafg17npr <- as.character(nafg17npr)
    #   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }

    # # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
    # all_vars <- union(vars,famille_vars)
    # available_vars <- all_vars[union(vars,famille_vars) %in% names(menagem)]
    # loyersMenages <- menagem[,available_vars]
    #
    all_vars = vars + famille_vars

    print all_vars
    print set(menagem.columns)
    available_vars = list(set(all_vars).intersection(set(menagem.columns)))

    loyersMenages = menagem.xs(available_vars, axis=1)

    #
    # # Recodage de typmen15: modalités de 1:15
    # table(loyersMenages$typmen15, useNA="ifany")
    # loyersMenages <- within(loyersMenages, {
    #   typmen15[typmen15==10 ] <- 1
    #   typmen15[typmen15==11 ] <- 2
    #   typmen15[typmen15==21 ] <- 3
    #   typmen15[typmen15==22 ] <- 4
    #   typmen15[typmen15==23 ] <- 5
    #   typmen15[typmen15==31 ] <- 6
    #   typmen15[typmen15==32 ] <- 7
    #   typmen15[typmen15==33 ] <- 8
    #   typmen15[typmen15==41 ] <- 9
    #   typmen15[typmen15==42 ] <- 10
    #   typmen15[typmen15==43 ] <- 11
    #   typmen15[typmen15==44 ] <- 12
    #   typmen15[typmen15==51 ] <- 13
    #   typmen15[typmen15==52 ] <- 14
    #   typmen15[typmen15==53 ] <- 15
    # })
    #
    #
    # TODO: MBJ UNNECESSARY ?

    #
    # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
    # # Cependant on fait cela après avoir fait les traitement suivants
    # table(loyersMenages$ddipl, useNA="ifany")
    # # On convertit les ddipl en numeric
    # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
    # table(loyersMenages$ddipl, useNA="ifany")
    # #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
    # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
    #
    # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
    #

    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7,
                                loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl > 1,
                                loyersMenages.ddipl - 1, loyersMenages.ddipl)
    loyersMenages.ddipl.astype("int32")
    #
    # table(final$actrec,useNA="ifany")
    # final$act5 <- NA
    # final <- within(final, {
    #   act5[which(actrec==1) ] <- 2 # ind?pendants
    #   act5[which(actrec==2) ] <- 1 # salari?s
    #   act5[which(actrec==3) ] <- 1 # salari?s
    #   act5[which(actrec==4) ] <- 3 # ch?meur
    #   act5[which(actrec==7) ] <- 4 # retrait?
    #   act5[which(actrec==8) ] <- 5 # autres inactifs
    # })
    # table(final$act5,useNA="ifany")
    #

    final.act5 = NaN

    final.act5 = where(final.actrec == 1, 2, final.act5)  # indépendants
    final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec == 4, 3, final.act5)  # chômeur
    final.act5 = where(final.actrec == 7, 4, final.act5)  # retraité
    final.act5 = where(final.actrec == 8, 5, final.act5)  # autres inactifs
    print final.act5.value_counts()  # TODO : 29 retraités ?

    #     assert final.act5.notnull().all(), 'there are NaN inside final.act5'

    # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
    # final$tax_hab <- final$zthabm # rename zthabm to tax_hab
    # final$zthabm <- NULL
    #
    # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    print '    création de final2'
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="tax_hab"),
                 inplace=True)  # rename zthabm to tax_hab
    final2 = final.merge(loyersMenages, on="idmen", how="left")  # TODO: Check
    print loyersMenages.head()
    gc.collect()
    print_id(final2)

    #
    # # TODO: merging with patrimoine
    # rm(menagem,final)
    #
    # # table(final2$activite,useNA="ifany")
    # # table(final2$alt,useNA="ifany")
    #
    # saveTmp(final2, file= "final2.Rdata")
    #
    # loadTmp("final2.Rdata")
    # names(final2)
    # print_id(final2)
    #
    #
    # # set zone_apl using zone_apl_imputation_data
    # apl_imp <- read.csv("./zone_apl/zone_apl_imputation_data.csv")
    #
    # if (year == "2008") {
    #   zone_apl <- final2[, c("tu99", "pol99", "reg")]
    # } else {
    #   zone_apl <- final2[, c("tu99", "pol99", "tau99", "reg")]
    # }
    #
    # for (i in 1:length(apl_imp[,"TU99"])) {
    #   tu <- apl_imp[i,"TU99"]
    #   pol <- apl_imp[i,"POL99"]
    #   tau <- apl_imp[i,"TAU99"]
    #   reg <- apl_imp[i,"REG"]
    #   #  print(c(tu,pol,tau,reg))
    #
    #   if (year == "2008") {
    #     indices <- (final2["tu99"] == tu & final2["pol99"] == pol  & final2["reg"] == reg)
    #     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["REG"] == reg)
    #   } else {
    #     indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["tau99"] == tau & final2["reg"] == reg)
    #     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["TAU99"] == tau & apl_imp["REG"] == reg)
    #   }
    #   z <- runif(sum(indices))
    #   probs <- apl_imp[selection , c("proba_zone1", "proba_zone2")]
    #   #  print(probs)
    #   final2[indices,"zone_apl"] <- 1 + (z>probs[,'proba_zone1']) + (z>(probs[,'proba_zone1']+probs[,'proba_zone2']))
    #   rm(indices, probs)
    # }
    #

    print '    traitement des zones apl'
    apl_imp = read_csv("../../zone_apl/zone_apl_imputation_data.csv")

    print apl_imp.head(10)
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"]
                                            == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (
            apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (
            final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (
            apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg)

    z = random.uniform(size=indices.sum())
    print len(z)
    print len(indices)

    print len(indices) / len(z)
    probs = apl_imp.loc[selection, ["proba_zone1", "proba_zone2"]]
    print probs
    print probs['proba_zone1'].values

    proba_zone_1 = probs['proba_zone1'].values[0]
    proba_zone_2 = probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2["zone_apl"][indices] = (1 + (z > proba_zone_1) +
                                   (z > (proba_zone_1 + proba_zone_2)))
    del indices, probs

    #     control(final2, verbose=True, debug=True, verbose_length=15)

    print '    performing cleaning on final2'
    print 'nombre de sali nuls', len(final2[final2['sali'].isnull()])
    print "nombre d'âges nuls", len(final2[final2.age.isnull()])
    print "longueur de final2 avant purge", len(final2)
    #     columns_w_nan = []
    #     for col in final2.columns:
    #         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
    #             columns_w_nan.append(col)
    #     print columns_w_nan
    print 'check doublons', len(final2[final2.duplicated(['noindiv'])])
    print final2.age.isnull().sum()

    #     print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string()
    #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons !
    #     final2 = final2.drop_duplicates(['noindiv'])

    final2 = final2[~(final2.age.isnull())]
    print "longueur de final2 après purge", len(final2)
    print_id(final2)

    #
    # # var <- names(foyer)
    # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
    # #a2 <- setdiff(a1,names(foyer))
    # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
    # # hsup feuille d'impot
    # # boursier pas dispo
    # # inactif etc : extraire cela des donn?es clca etc
    #
    # # tester activit? car 0 vaut actif
    # table(is.na(final2$activite),useNA="ifany")
    #
    # saveTmp(final2, file= "final2.Rdata")

    control(final2, debug=True)
    print final2.age.isnull().sum()
    final2 = final2.drop_duplicates(cols='noindiv')

    print '    Filter to manage the new 3-tables structures:'
    # On récupère les foyer, famille, ménages qui ont un chef :
    liste_men = unique(final2.loc[final2['quimen'] == 0, 'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam'] == 0, 'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy'] == 0, 'idfoy'].values)

    #On ne conserve dans final2 que ces foyers là :
    print 'final2 avant le filtrage', len(final2)
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    print 'final2 après le filtrage', len(final2)

    if check:
        check_structure(final2)

    from openfisca_france import DATA_SOURCES_DIR
    test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5")
    if os.path.exists(test_filename):
        import warnings
        import datetime
        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        renamed_file = os.path.join(DATA_SOURCES_DIR,
                                    filename + "_" + time_stamp + ".h5")
        warnings.warn(
            "A file with the same name already exists \n Renaming current output and saving to "
            + renamed_file)
        test_filename = renamed_file

    store = HDFStore(test_filename)
    store['survey_' + str(year)] = final2
Пример #7
0
def final(year=2006, filename="test", check=True):

##***********************************************************************/
    print('08_final: derniers réglages')
##***********************************************************************/
#
# loadTmp("final.Rdata")
# # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
# final$statmarit[is.na(final$statmarit)] <- 2
# table(final$statmarit, useNA='ifany')
#
    import gc
    gc.collect()
    final = load_temp("final", year=year)
    print 'check doublons', len(final[final.duplicated(['noindiv'])])
    final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit)
#

# # activite des fip
# table(final[final$quelfic=="FIP","activite"],useNA="ifany")
# summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] )
# # activite      # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4
#
# final_fip <- final[final$quelfic=="FIP",]
# final_fip <- within(final_fip,{
#   choi <- ifelse(is.na(choi),0,choi)
#   sali <- ifelse(is.na(sali),0,sali)
#   alr <- ifelse(is.na(alr),0,alr)
#   rsti <- ifelse(is.na(rsti),0,rsti)
#   activite <- 2 # TODO comment choisr la valeur par d?faut ?
#   activite <- ifelse(choi > 0,1,activite)
#   activite <- ifelse(sali > 0,0,activite)
#   activite <- ifelse(age  >= 21, 2,activite) # ne peuvent être rattach?s que les ?tudiants
# })
# final[final$quelfic=="FIP",]<- final_fip
# table(final_fip[,c("age","activite")])
# rm(final_fip)
#
# print_id(final)
# saveTmp(final, file= "final.Rdata")
#
    print '    gestion des FIP de final'
    final_fip = final.loc[final.quelfic=="FIP", ["choi", "sali", "alr", "rsti","age"]]

    print set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns))
    for var in  ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace=True)
        assert final_fip[var].notnull().all(), "some NaN are remaining in column %s" %(var)


    final_fip["activite"] = 2 # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(final_fip.age > 21, 2, final_fip.activite)  # ne peuvent être rattach?s que les ?tudiants

    final.update(final_fip)
    save_temp(final, name="final", year=year)
    print '    final has been updated with fip'

# loadTmp("final.Rdata")
# load(menm)
# menagem <- rename(menagem, c("ident"="idmen","loym"="loyer"))
# menagem$cstotpragr <- floor(menagem$cstotpr/10)
#
    from math import floor

    menagem = load_temp(name="menagem", year=year)
    menagem.rename(columns=dict(ident="idmen",loym="loyer"), inplace=True)
    menagem["cstotpragr"] = menagem["cstotpr"].apply(lambda x: floor(x/10))
#
# # 2008 tau99 removed TODO: check ! and check incidence
# if (year == "2008") {
#  vars <- c("loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
# } else {
#   vars <- c("loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm", "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm")
# }
#
# famille_vars <- c("m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm')

    if year == 2008:
        vars = ["loyer", "tu99", "pol99", "reg","idmen", "so", "wprm", "typmen15",
                 "nbinde","ddipl","cstotpragr","champm","zthabm"]
    else:
        vars = ["loyer", "tu99", "pol99", "tau99", "reg","idmen", "so", "wprm",
                "typmen15", "nbinde","ddipl","cstotpragr","champm","zthabm"]
    famille_vars = ["m_afeamam", "m_agedm","m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm']


# if ("naf16pr" %in% names(menagem)) {
#   naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   vars <- c(vars,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }
#


#TODO: TODO: pytohn translation needed
#    if "naf16pr" in menagem.columns:
#        naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   vars <- c(vars,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }



# # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
# all_vars <- union(vars,famille_vars)
# available_vars <- all_vars[union(vars,famille_vars) %in% names(menagem)]
# loyersMenages <- menagem[,available_vars]
#
    all_vars = vars + famille_vars

    print all_vars
    print  set(menagem.columns)
    available_vars = list( set(all_vars).intersection(set(menagem.columns)))

    loyersMenages = menagem.xs(available_vars,axis=1)


#
# # Recodage de typmen15: modalités de 1:15
# table(loyersMenages$typmen15, useNA="ifany")
# loyersMenages <- within(loyersMenages, {
#   typmen15[typmen15==10 ] <- 1
#   typmen15[typmen15==11 ] <- 2
#   typmen15[typmen15==21 ] <- 3
#   typmen15[typmen15==22 ] <- 4
#   typmen15[typmen15==23 ] <- 5
#   typmen15[typmen15==31 ] <- 6
#   typmen15[typmen15==32 ] <- 7
#   typmen15[typmen15==33 ] <- 8
#   typmen15[typmen15==41 ] <- 9
#   typmen15[typmen15==42 ] <- 10
#   typmen15[typmen15==43 ] <- 11
#   typmen15[typmen15==44 ] <- 12
#   typmen15[typmen15==51 ] <- 13
#   typmen15[typmen15==52 ] <- 14
#   typmen15[typmen15==53 ] <- 15
# })
#
#
# TODO: MBJ UNNECESSARY ?

#
# # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
# # Cependant on fait cela après avoir fait les traitement suivants
# table(loyersMenages$ddipl, useNA="ifany")
# # On convertit les ddipl en numeric
# loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
# table(loyersMenages$ddipl, useNA="ifany")
# #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
# loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
#
# loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
#


    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl>1,
                                loyersMenages.ddipl-1,
                                loyersMenages.ddipl)
    loyersMenages.ddipl.astype("int32")
#
# table(final$actrec,useNA="ifany")
# final$act5 <- NA
# final <- within(final, {
#   act5[which(actrec==1) ] <- 2 # ind?pendants
#   act5[which(actrec==2) ] <- 1 # salari?s
#   act5[which(actrec==3) ] <- 1 # salari?s
#   act5[which(actrec==4) ] <- 3 # ch?meur
#   act5[which(actrec==7) ] <- 4 # retrait?
#   act5[which(actrec==8) ] <- 5 # autres inactifs
# })
# table(final$act5,useNA="ifany")
#


    final.act5 = NaN

    final.act5 = where(final.actrec==1, 2, final.act5) # indépendants
    final.act5 = where(final.actrec.isin([2,3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec==4, 3, final.act5) # chômeur
    final.act5 = where(final.actrec==7, 4, final.act5) # retraité
    final.act5 = where(final.actrec==8, 5, final.act5) # autres inactifs
    print final.act5.value_counts() # TODO : 29 retraités ?

#     assert final.act5.notnull().all(), 'there are NaN inside final.act5'

# final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
# final$tax_hab <- final$zthabm # rename zthabm to tax_hab
# final$zthabm <- NULL
#
# final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    print '    création de final2'
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="tax_hab"), inplace=True) # rename zthabm to tax_hab
    final2 = final.merge(loyersMenages, on="idmen", how="left") # TODO: Check
    print loyersMenages.head()
    gc.collect()
    print_id(final2)

#
# # TODO: merging with patrimoine
# rm(menagem,final)
#
# # table(final2$activite,useNA="ifany")
# # table(final2$alt,useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")
#
# loadTmp("final2.Rdata")
# names(final2)
# print_id(final2)
#
#
# # set zone_apl using zone_apl_imputation_data
# apl_imp <- read.csv("./zone_apl/zone_apl_imputation_data.csv")
#
# if (year == "2008") {
#   zone_apl <- final2[, c("tu99", "pol99", "reg")]
# } else {
#   zone_apl <- final2[, c("tu99", "pol99", "tau99", "reg")]
# }
#
# for (i in 1:length(apl_imp[,"TU99"])) {
#   tu <- apl_imp[i,"TU99"]
#   pol <- apl_imp[i,"POL99"]
#   tau <- apl_imp[i,"TAU99"]
#   reg <- apl_imp[i,"REG"]
#   #  print(c(tu,pol,tau,reg))
#
#   if (year == "2008") {
#     indices <- (final2["tu99"] == tu & final2["pol99"] == pol  & final2["reg"] == reg)
#     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["REG"] == reg)
#   } else {
#     indices <- (final2["tu99"] == tu & final2["pol99"] == pol & final2["tau99"] == tau & final2["reg"] == reg)
#     selection <-  (apl_imp["TU99"] == tu & apl_imp["POL99"] == pol & apl_imp["TAU99"] == tau & apl_imp["REG"] == reg)
#   }
#   z <- runif(sum(indices))
#   probs <- apl_imp[selection , c("proba_zone1", "proba_zone2")]
#   #  print(probs)
#   final2[indices,"zone_apl"] <- 1 + (z>probs[,'proba_zone1']) + (z>(probs[,'proba_zone1']+probs[,'proba_zone2']))
#   rm(indices, probs)
# }
#

    print '    traitement des zones apl'
    apl_imp = read_csv("../../zone_apl/zone_apl_imputation_data.csv")

    print apl_imp.head(10)
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg)

    z = random.uniform(size=indices.sum())
    print len(z)
    print len(indices)

    print len(indices)/len(z)
    probs = apl_imp.loc[selection , ["proba_zone1", "proba_zone2"]]
    print probs
    print probs['proba_zone1'].values

    proba_zone_1 =  probs['proba_zone1'].values[0]
    proba_zone_2 =  probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2["zone_apl"][indices] = ( 1 + (z>proba_zone_1) +
                                       (z>(proba_zone_1 + proba_zone_2)))
    del indices, probs

#     control(final2, verbose=True, debug=True, verbose_length=15)

    print '    performing cleaning on final2'
    print 'nombre de sali nuls', len(final2[final2['sali'].isnull()])
    print "nombre d'âges nuls", len(final2[final2.age.isnull()])
    print "longueur de final2 avant purge", len(final2)
#     columns_w_nan = []
#     for col in final2.columns:
#         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
#             columns_w_nan.append(col)
#     print columns_w_nan
    print 'check doublons', len(final2[final2.duplicated(['noindiv'])])
    print final2.age.isnull().sum()

#     print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string()
    #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons !
#     final2 = final2.drop_duplicates(['noindiv'])

    final2 = final2[~(final2.age.isnull())]
    print "longueur de final2 après purge", len(final2)
    print_id(final2)

#
# # var <- names(foyer)
# #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
# #a2 <- setdiff(a1,names(foyer))
# #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
# # hsup feuille d'impot
# # boursier pas dispo
# # inactif etc : extraire cela des donn?es clca etc
#
# # tester activit? car 0 vaut actif
# table(is.na(final2$activite),useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")



    control(final2, debug=True)
    print final2.age.isnull().sum()
    final2 = final2.drop_duplicates(cols='noindiv')

    print '    Filter to manage the new 3-tables structures:'
    # On récupère les foyer, famille, ménages qui ont un chef :
    liste_men = unique(final2.loc[final2['quimen']==0,'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam']==0,'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy']==0,'idfoy'].values)

    #On ne conserve dans final2 que ces foyers là :
    print 'final2 avant le filtrage' ,len(final2)
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    print 'final2 après le filtrage', len(final2)

    if check:
        check_structure(final2)

    from openfisca_france import DATA_SOURCES_DIR
    test_filename = os.path.join(DATA_SOURCES_DIR, filename + ".h5")
    if os.path.exists(test_filename):
        import warnings
        import datetime
        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5")
        warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file)
        test_filename = renamed_file


    store = HDFStore(test_filename)
    store['survey_'+ str(year)] = final2
Пример #8
0
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: année de naissance des PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'longueur pacInd1' , len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
                print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
Пример #9
0
def sif(year=2006):
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    ## TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    ## mnrvka  Revenu TH n-2
    ## mnrvkh  revenu TH (revenu fiscal de référence)
    #
    ## On récupère les variables du code sif
    #vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    #sif <- LoadIn(erfFoyFil, vars)
    sif = data.get_values(variables=vars, table="foyer" )
    #sif$statmarit <- 0
    sif['statmarit'] = 0

    print sif

    ## for (index in 60:80){
    ##    print(index)
    ##    print(table(substr(sif$sif,index,index)))
    ## }


    #
    #
    ## Pb with some 2 sifs that are misaligned
    ## index = 61
    ## sif[substr(sif$sif,index,index)=="F", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]+1]
    #
    ## index = 62
    ## sif[substr(sif$sif,index,index)=="G", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="G", "noindiv"]]
    ## sif$sif[1]
    #
    #if (year==2009){
    #  # problem with one entry in 2009
    #  length  <- nchar(sif$sif[1])
    #  old_sif <- sif$sif[sif$noindiv == 901803201]
    #  new_sif <- paste(substr(old_sif,1,59), substr(old_sif,61,length),"0", sep="")
    #  sif$sif[sif$noindiv == 901803201] <- new_sif
    #  old_sif <- sif$sif[sif$noindiv == 900872201]
    #  new_sif <- paste(substr(old_sif,1,58), " ", substr(old_sif,59,length), sep="")
    #  sif$sif[sif$noindiv == 900872201] <- new_sif
    #  rm(old_sif,new_sif)
    #}
    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif
    #
    #
    ## for (index in 60:80){
    ##     print(index)
    ##     print(table(substr(sif$sif,index,index)))
    ## }
    #
    #
    #
    #sif <- within(sif,{
    ##  rbg = rbg*((tsrvbg =='+')-(tsrvbg =='-'))
    print sif["rbg"].describe()
    sif["rbg"] = sif["rbg"]*( (sif["tsrvbg"]=='+')-(sif["tsrvbg"] =='-'))
    print sif["rbg"].describe()

    #  stamar <- substr(sif,5,5)

    sif["stamar"] = sif["sif"].str[4:5]

    # Converting marital status

    #  statmarit[stamar =="M"] <- 1
    #  statmarit[stamar =="C"] <- 2
    #  statmarit[stamar =="D"] <- 3
    #  statmarit[stamar =="V"] <- 4
    #  statmarit[stamar =="O"] <- 5

    statmarit_dict = {"M": 1, "C" : 2, "D" : 3, "V" : 4, "O": 5}
    for key , val in statmarit_dict.iteritems():
        sif["statmarit"][ sif.stamar == key] = val

    #  birthvous <- as.numeric(substr(sif,6,9))
    #  birthconj <- as.numeric(substr(sif,11,14))
    #


    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    #  caseE <- as.numeric(substr(sif,16,16)=='E')
    #  caseF <- as.numeric(substr(sif,17,17)=='F')
    #  caseG <- as.numeric(substr(sif,18,18)=='G')
    #  caseK <- as.numeric(substr(sif,19,19)=='K')

    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"


    d = 0
    #
    #  if (year %in% c(2006,2007)){
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    caseH <- as.numeric(substr(sif,25,28))
    #    caseT <- as.numeric(substr(sif,29,29) == 'T')
    #  }

    if year in [2006,2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    #
    #  if (year == 2008){
    #    d = - 1 # fin de la case L
    #    caseP <- as.numeric(substr(sif,21+d,21+d)=='P')
    #    caseS <- as.numeric(substr(sif,22+d,22+d)=='S')
    #    caseW <- as.numeric(substr(sif,23+d,23+d)=='W')
    #    caseN <- as.numeric(substr(sif,24+d,24+d)=='N')
    #    caseH <- as.numeric(substr(sif,25+d,28+d))
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }
    #
    if year in [2008]:
        d = - 1 # fin de la case L
        sif["caseP"] = sif["sif"].str[20+d:21+d] == "P"
        sif["caseS"] = sif["sif"].str[21+d:22+d] == "S"
        sif["caseW"] = sif["sif"].str[22+d:23+d] == "W"
        sif["caseN"] = sif["sif"].str[23+d:24+d] == "N"
        sif["caseH"] = sif["sif"].str[24+d:28+d]
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"

    #  if (year == 2009){
    #    # retour de la case L par rapport à 2008 (donc on retrouve 2006)
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    # caseH en moins par rapport à 2008 (mais case en L en plus)
    #    # donc décalage par rapport à 2006
    #    d = -4
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }

    if year in [2009]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"


    #
    #  caseX <- as.numeric(substr(sif,34+d,34+d)=='X')
    #  dateX <- as.Date(substr(sif,35+d,42+d),'%d%m%Y')
    #  caseY <- as.numeric(substr(sif,43+d,43+d)== 'Y')
    #  dateY <- as.Date(substr(sif,44+d,51+d),'%d%m%Y')
    #  caseZ <- as.numeric(substr(sif,52+d,53+d)== 'Z')
    #  dateZ <- as.Date(substr(sif,53+d,60+d),'%d%m%Y')  # ERROR 54+d
    #  causeXYZ <- substr(sif,61+d,61+d)
    #

    sif["caseX"] = sif["sif"].str[33+d:34+d] == "X"
    sif["dateX"] = sif["sif"].str[34+d:42+d]
    sif["caseY"] = sif["sif"].str[42+d:43+d] == "Y"
    sif["dateY"] = sif["sif"].str[43+d:51+d]
    sif["caseZ"] = sif["sif"].str[51+d:53+d] == "Z"
    sif["dateZ"] = sif["sif"].str[52+d:60+d]
    sif["causeXYZ"] = sif["sif"].str[60+d:61+d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()


    #  nbptr <- nbptr/100
    #  rfr_n_2 <- mnrvka

    sif["nbptr"] =  sif["nbptr"]/100
    sif["rfr_n_2"] = sif["mnrvka"]

    #  nbF <- as.numeric(substr(sif,65+d,66+d))
    #  nbG <- as.numeric(substr(sif,68+d,69+d))
    #  nbR <- as.numeric(substr(sif,71+d,72+d))
    #  nbJ <- as.numeric(substr(sif,74+d,75+d))
    #  nbN <- as.numeric(substr(sif,77+d,78+d))
    #  nbH <- as.numeric(substr(sif,80+d,81+d))
    #  nbI <- as.numeric(substr(sif,83+d,84+d))

    sif["nbF"] = sif["sif"].str[64+d:66+d]
    sif["nbG"] = sif["sif"].str[67+d:69+d]
    sif["nbR"] = sif["sif"].str[70+d:72+d]
    sif["nbJ"] = sif["sif"].str[73+d:75+d]
    sif["nbN"] = sif["sif"].str[76+d:78+d]
    sif["nbH"] = sif["sif"].str[79+d:81+d]
    sif["nbI"] = sif["sif"].str[82+d:84+d]

    #  if (year != 2009){
    #  nbP <- as.numeric(substr(sif,86+d,87+d))
    #  }
    #})

    if (year != 2009):
        sif["nbP"] = sif["sif"].str[85+d:87+d]

    #sif$sif <- NULL
    #sif$stamar <- NULL
    #

    del sif["sif"], sif["stamar"]


    #table(sif$statmarit)
    print sif["statmarit"].value_counts()

    #print(length(table(sif$noindiv)))
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(sif["noindiv"].value_counts())

    #dup <- duplicated(sif$noindiv)
    #table(dup)
    #sif <- sif[!dup,]
    #print(length(table(sif$noindiv)))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts()) == len(sif_drop_duplicated["noindiv"]), Exception("Number of distinct individuals after removing duplicates is not correct")

    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
Пример #10
0
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}

    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        #               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        #               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']
    }  # (déménagement) uniquement en 2006

    #
    #varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
    #                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
    #               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
    # ......
    #               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
    #               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
    #               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
    #               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
    #
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables" % (len(
        set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        #        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()

    #not_first <- FALSE
    #allvars = c()
    #for (v in varlist){
    #  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
    #  if (length(vars) > 0) {
    #    allvars <-  c(allvars, vars)
    #    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
    #    n <- length(vars)
    #    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
    #    if (not_first) {
    #      print('merge')
    #      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
    #      names(foy_ind)
    #    }
    #    else   {
    #      print('init')
    #      foy_ind <- temp
    #      not_first <- TRUE
    #    }
    #  }
    #}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy'] == 'vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy'] == 'conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac3'] = 4

    assert foy_ind['quifoy'].isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return
Пример #11
0
def create_fip(year=2006):  # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
    # # anaisenf: année de naissance des PAC
    # erfFoyVar <- c('anaisenf','declar')
    # foyer <- LoadIn(erfFoyFil)
    # foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
    #    control(foyer, verbose=True, verbose_length=10, debug=True)

    # #***********************************************************************************************************
    # # print "Step 1 : on recupere les personnes à charge des foyers"
    # #**********************************************************************************************************
    # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
    #
    #
    # # On récupère toutes les pac des foyers
    # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
    # fip <-data.frame(declar = foyer$declar)
    # for (i in c(1:L)){
    #   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
    #   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
    # }
    # fip <- fip[!is.na(fip$typ.1),]
    # fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
    # fip <- fip[!is.na(fip$naia),]
    # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
    # fip$N <- row(fip)[,1]
    # str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    print "il ya a au maximum %s pac par foyer" % nb_pac_max

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns,
                                     names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns)
    fip.fillna(
        NaN, inplace=True)  # inutile a cause de la ligne précédente, to remove
    for i in range(1, nb_pac_max + 1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)]
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * (i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

    #     print fip.describe()
    #     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') &
              (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
    #    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'
                                  ])]  #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG), '/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]
    del indivifip['to_keep'], fip, tyFG, tyHI

    #    control(indivifip, debug=True)

    # #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
    # #************************************************************************************************************/

    indivi = load_temp(name="indivim",
                       year=year)  #TODO: USE THIS INSTEAD OF PREVIOUS LINES

    # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
    # pac$key1 <- paste(pac$naia,pac$declar1)
    # pac$key2 <- paste(pac$naia,pac$declar2)
    # indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip'] == 'pac')]

    pac['naia'] = pac['naia'].astype(
        'int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'],
                           indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' % (
        pac.naia.dtype, indivifip.naia.dtype)

    # fip <- indivifip[!indivifip$key %in% pac$key1,]
    # fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]

    print "    2.1 new fip created"
    # We build a dataframe to link the pac to their type and noindiv
    # table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

    # pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
    #                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
    # pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
    #                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip,
                              left_on='key1',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd1', len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip,
                              left_on='key2',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

    # table(duplicated(pacInd1))
    # table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

    # pacInd1 <-rename(pacInd1,c("key1" = "key"))
    # pacInd2 <-rename(pacInd2,c("key2" = "key"))
    # pacInd <- rbind(pacInd1,pacInd2)
    # rm(pacInd1,pacInd2)

    #     pacInd1.rename(columns={'key1':'key'}, inplace=True)
    #     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
            print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

    # table(duplicated(pacInd[,c("noindiv","typ")]))
    # table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(
        ['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
    #     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

    # # We keep the fip in the menage of their parents because it is used in to
    # # build the famille. We should build an individual ident for the fip that are
    # # older than 18 since they are not in their parents' menage according to the eec

    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype(
        'float16')  # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec1 = individec1.loc[:, [
        "declar1", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

    # # TODO: On ne s'occupe pas des declar2 pour l'instant
    # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec2 = individec2.loc[:, [
        "declar2", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    # #fip <- rbind(fip1,fip2)
    # fip <- fip1
    # table(fip$typ)

    fip = concat([fip1, fip2])
    #     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype(
        'float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3,
                       4)  # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf'] <= 15, 9, 5)

    ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
def create_imput_loyer(year):
    '''
    Impute les loyers à partir de ???
    '''

    #Variables used for imputation
    df = DataCollection(year=year)
    print 'Démarrer 02_imput_loyer'

    menm_vars = [
        "ztsam", "zperm", "zragm", "zricm", "zrncm", "zracm", "nb_uci", "wprm",
        "so", "nbpiec", "typmen5", "spr", "nbenfc", "agpr", "cstotpr",
        "nat28pr", "tu99", "aai1", 'ident', "pol99", "reg", "tau99"
    ]
    if year == 2008:  # Tau99 not present
        menm_vars = menm_vars.pop('tau99')

    indm_vars = ["noi", 'ident', "lpr", "dip11"]
    LgtAdrVars = ["gzc2"]
    LgtMenVars = [
        "sec1", "mrcho", "mrret", "mrsal", "mrtns", "mdiplo", "mtybd", "magtr",
        "mcs8", "maa1at", "qex", "muc1"
    ]

    if year == 2003:
        LgtMenVars.extend(["typse", "lmlm", "hnph2", "mnatior", "ident"])
        LgtAdrVars.extend(["iaat", "tu99", "ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio", "idlog"])
        LgtAdrVars.extend(["idlog"])  # pas de typse en 2006
        LgtLgtVars = ["lmlm", "iaat", "tu99", "hnph2",
                      "idlog"]  # pas de typse en 2006

    ## Travail sur la base ERF
    #Preparing ERF menages tables


#     print show_temp()
# TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
    #     erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] +
                         erfmenm['zragm'] + erfmenm['zricm'] +
                         erfmenm['zrncm'] + erfmenm['zracm'])
    erfmenm['nvpr'] = erfmenm['revtot'].astype(
        np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # On donne la valeur 0 aux nvpr négatifs
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype=int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']:  # On vérifie qu'il n'y a plus de nvpr négatifs
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
    #     print l
    #Preparing ERF individuals table
    erfindm = load_temp(name="indivim", year=year)
    #     erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
    #     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]
    # erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on='ident', how='inner')
    erf = erf.drop_duplicates('ident')

    # control(erf) La colonne existe mais est vide,
    # on a du confondre cette colonne avec dip11 ?

    dec, values = mark_weighted_percentiles(erf['nvpr'],
                                            arange(1, 11),
                                            erf['wprm'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf['nvpr'] > values[1]) + (erf['nvpr'] > values[2]) +
                   (erf['nvpr'] > values[3]) + (erf['nvpr'] > values[4]) +
                   (erf['nvpr'] > values[5]) + (erf['nvpr'] > values[6]) +
                   (erf['nvpr'] > values[7]) + (erf['nvpr'] > values[8]) +
                   (erf['nvpr'] > values[9]))
    # Problème : tous les individus sont soit dans le premier, soit dans le dernier décile. WTF
    assert_variable_inrange('deci', [1, 11], erf)
    count_NA('deci', erf)
    del dec, values
    gc.collect()

    #TODO: faire le lien avec men_vars, il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[[
        'ident', 'ztsam', 'zperm', 'zragm', 'zricm', 'zrncm', 'zracm',
        'nb_uci', 'logt', 'nbpiec', 'typmen5', 'spr', 'nbenfc', 'agpr',
        'cstotpr', 'nat28pr', 'tu99', 'aai1', 'wprm', 'nvpr', 'revtot',
        'dip11', 'deci'
    ]][erf['so'].isin(range(3, 6))]

    erf.rename(columns={
        'nbpiec': 'hnph2',
        'nat28pr': 'mnatio',
        'aai1': 'iaat',
        'dip11': 'mdiplo'
    },
               inplace=True)

    # TODO: ne traite pas les types comme dans R teste-les pour voir comment pandas les gère

    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr', erf)
    assert_variable_inrange('magtr', [1, 4], erf)

    count_NA('cstotpr', erf)
    erf['tmp'] = erf['cstotpr'].astype('float') / 10.0
    erf['tmp'] = map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
    # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8', erf)

    # TODO il reste 41 NA's 2003
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd', erf)

    #     print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug, on trouve 7.0 qui fait assert

    # TODO : 3 logements ont 0 pièces !!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1, 7], erf)

    # # TODO: il reste un NA 2003
    # #       il rest un NA en 2008

    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([
        11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 41, 42,
        43, 44, 45, 46, 47, 48, 51, 52, 62, 60
    ])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1, 3], erf)

    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1, 2, 3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1, 7], erf)

    # # Il reste un NA en 2003
    # #    reste un NA en 2008
    # table(erf$iaat, useNA="ifany")
    # TODO: comparer logement et erf pour ?tre sur que cela colle

    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71, ""])] = 1
    tmp[erf['mdiplo'].isin([70, 60, 50])] = 2
    tmp[erf['mdiplo'].isin([41, 42, 31, 33])] = 3
    tmp[erf['mdiplo'].isin([10, 11, 30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade

    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1, 2, 3])] = 2
    tmp[erf['tu99'].isin([4, 5, 6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1, 6], erf)

    # TODO : 0 ? Rajouetr 2003 !
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4, 8])] = 4
    tmp[erf['mcs8'].isin([5, 6, 7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1, 6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    del (erf['cstotpr'], erf['agpr'], erf['typmen5'], erf['nbenfc'],
         erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()

    erf = erf.dropna(subset=[
        'logt', 'magtr', 'mcs8', 'mtybd', 'hnph2', 'mnatio', 'iaat', 'mdiplo',
        'tu99_recoded'
    ])
    #On vérifie au final que l'on n'a pas de doublons d'individus
    assert erf['ident'].value_counts().max() == 1, Exception(
        'Number of distinct individuals after removing duplicates is not correct'
    )

    ## Travail sur la table logement

    # Table menage
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006

    print "preparing logement menage table"

    #     Lgtmen = load_temp(name = "indivim",year = year) # Je rajoute une étape bidon
    Lgtmen = df.get_values(table="lgt_menage", variables=LgtMenVars)
    Lgtmen.rename(columns={'idlog': 'ident'}, inplace=True)

    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace=True)
    Lgtmen['mrret'].fillna(0, inplace=True)
    Lgtmen['mrsal'].fillna(0, inplace=True)
    Lgtmen['mrtns'].fillna(0, inplace=True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho'] + Lgtmen['mrret'] + Lgtmen[
        'mrsal'] + Lgtmen['mrtns']  # Virer les revenus négatifs ?
    count_NA('revtot', Lgtmen)
    Lgtmen['nvpr'] = 10.0 * Lgtmen['revtot'] / Lgtmen['muc1']

    count_NA('qex', Lgtmen)
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],
                                            arange(1, 11),
                                            Lgtmen['qex'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    Lgtmen['deci'] = (
        1 + (Lgtmen['nvpr'] > values[1]) + (Lgtmen['nvpr'] > values[2]) +
        (Lgtmen['nvpr'] > values[3]) + (Lgtmen['nvpr'] > values[4]) +
        (Lgtmen['nvpr'] > values[5]) + (Lgtmen['nvpr'] > values[6]) +
        (Lgtmen['nvpr'] > values[7]) + (Lgtmen['nvpr'] > values[8]) +
        (Lgtmen['nvpr'] > values[9]))
    del dec, values
    print Lgtmen['deci'].describe()
    gc.collect()

    ##Table logement (pas en 2003 mais en 2006)
    # str(lgtmen)
    # if (year_lgt=="2006"){
    #   message("preparing logement logement table")
    #   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
    #   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
    #   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")

    if year_lgt == 2006:
        print 'preparing logement logement table'
        lgtlgt = df.get_values(table="lgt_logt", variables=LgtLgtVars)
        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        Lgtmen = Lgtmen.merge(lgtlgt,
                              left_on='ident',
                              right_on='ident',
                              how='inner')
        del lgtlgt

    data = Lgtmen[Lgtmen['sec1'].isin([21, 22, 23, 24, 30])]
    del Lgtmen
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21, 22, 23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

    # ## Table adresse
    print "preparing logement adresse table"
    # lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
    # lgtadr <- upData(lgtadr, rename=renameidlgt)
    # Je rajoute une étae bidon
    Lgtadr = df.get_values(table="adresse", variables=LgtAdrVars)
    Lgtadr.rename(columns={'idlog': 'ident'}, inplace=True)

    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on='ident', how='inner')
    #     control(Logement) # Pas de idfoy, etc. dans la table logement ?

    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
    #     Logement=(Logement[Logement['hnph2'].notnull()]) # Mis en comment car 0 NA pour hnph2

    # On est dans la même étape within ici et par la suite ( cf code R )
    # TODO : ici problème je transforme les 07 en 7
    # car Python considère les 0n comme des nombres octaux ( < 08 ).
    # J'espère que ce n'est pas important.
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10,
                                              11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1, 3], Logement)

    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1, 2, 3, 4, 5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] ==
                    10] = 6  # TODO question Clément : et le 9 et le 10 ?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1, 7], Logement)

    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2, 3, 4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5, 6, 7, 8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1, 5], Logement)

    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311, 321, 401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312, 322, 402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313, 323, 403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1, 8], Logement)

    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1, 2, 3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4, 5, 6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1, 6], Logement)

    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1, 4], Logement)

    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1, 2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3, 4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1, 4], Logement)

    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4, 8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5, 6, 7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1, 6], Logement)

    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))

    Logement = (Logement[Logement['mdiplo'].notnull()])
    Logement = (Logement[Logement['mtybd'].notnull()])
    Logement = (Logement[Logement['magtr'].notnull()])
    Logement = (Logement[Logement['mcs8'].notnull()])
    Logement = (Logement[Logement['maa1at'].notnull()])

    ## Imputation des loyers proprement dite

    # library(StatMatch) # loads StatMatch
    # # library(mice) use md.pattern to locate missing data
    # TODO : à supprimer ?

    # logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
    # logt$wprm <- logement$qex
    # erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print('Compute imputed rents')
    Logt = Logement[[
        'lmlm', 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
        'magtr', 'mcs8', 'deci', 'ident'
    ]]
    Logt['wprm'] = Logement['qex']
    erf = erf[[
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci', 'wprm', 'ident'
    ]]

    # # debug
    # # derf  <- describe(erf, weights=as.numeric(erf$wprm))
    # # dlogt <- describe(logt, weights=logt$wprm)
    # #
    # # for (var in as.list(names(derf))){
    # #   print("erf")
    # #   print(derf[[var]])
    # #   print("logt")
    # #   print(dlogt[[var]])
    # #   print("================")
    # # }

    # TODO add md.pattern

    # erf1 <- na.omit(erf)
    # logt <- na.omit(logt)
    from pandas import DataFrame
    erf = erf.dropna(
        how='any'
    )  # Si j'ai bien compris ce que l'on fait en R : dropper les lignes avec des NA
    #erf1 = erf # A-t-on toujours besoin de changer le nom du coup ?
    Logt = Logt.dropna(how='any')

    # allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
    # classes <- c("magtr","tu99_recoded")
    # matchvars <- setdiff(allvars,classes)
    allvars = [
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci'
    ]
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)
    # out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
    # fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate(
    )  # Permet à rpy2 de convertir les dataframes
    sm = importr(
        "StatMatch")  #, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print 'TEST 2'
    out_nnd = sm.NND_hotdeck(data_rec=erf,
                             data_don=Logt,
                             match_vars=vectors.StrVector(matchvars),
                             don_class=vectors.StrVector(classes),
                             dist_fun="Gower")
    print 'TEST 3'
    fill_erf_nnd = sm.create_fused(data_rec=erf,
                                   data_don=Logt,
                                   mtc_ids=out_nnd[0],
                                   z_vars=vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    # fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm': 'loym'}, inplace=True)

    # loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident', 'loym']]

    # load(menm)
    # menagem$loym <- NULL
    # menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
    # save(menagem,file=menm)
    #     Mis en comment block, car à manipuler avec précaution je suppose ( ne souhaite pas faire de conneries )

    erfmenm = load_temp(name="menagem", year=year)
    #     del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name="menagem", year=year)
Пример #13
0
def create_imput_loyer(year):
    '''
    Impute les loyers à partir de ???
    '''

    #Variables used for imputation
    df = DataCollection(year=year)
    print 'Démarrer 02_imput_loyer'

    menm_vars = ["ztsam","zperm","zragm","zricm","zrncm","zracm","nb_uci","wprm",
             "so","nbpiec","typmen5","spr","nbenfc","agpr","cstotpr","nat28pr","tu99","aai1",'ident',"pol99","reg","tau99"]
    if year == 2008: # Tau99 not present
        menm_vars = menm_vars.pop('tau99')


    indm_vars = ["noi",'ident',"lpr","dip11"]
    LgtAdrVars = ["gzc2"]
    LgtMenVars = ["sec1","mrcho","mrret","mrsal","mrtns","mdiplo","mtybd","magtr","mcs8","maa1at","qex","muc1"]

    if year == 2003:
        LgtMenVars.extend(["typse","lmlm","hnph2","mnatior","ident"])
        LgtAdrVars.extend(["iaat","tu99","ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio","idlog"])
        LgtAdrVars.extend(["idlog"]) # pas de typse en 2006
        LgtLgtVars=["lmlm","iaat","tu99","hnph2","idlog"] # pas de typse en 2006

    ## Travail sur la base ERF
    #Preparing ERF menages tables
#     print show_temp()
    # TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
#     erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] + erfmenm['zragm'] +
                         erfmenm['zricm'] + erfmenm['zrncm'] + erfmenm['zracm'])
    erfmenm['nvpr'] = erfmenm['revtot'].astype(np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # On donne la valeur 0 aux nvpr négatifs
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype = int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']: # On vérifie qu'il n'y a plus de nvpr négatifs
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
#     print l
    #Preparing ERF individuals table
    erfindm = load_temp(name = "indivim",year=year)
#     erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
#     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]
# erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on ='ident', how='inner')
    erf=erf.drop_duplicates('ident')

    # control(erf) La colonne existe mais est vide,
    # on a du confondre cette colonne avec dip11 ?

    dec, values = mark_weighted_percentiles(erf['nvpr'], arange(1,11), erf['wprm'], 2, return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf['nvpr']>values[1]) + (erf['nvpr']>values[2]) + (erf['nvpr']>values[3])
                   + (erf['nvpr']>values[4]) + (erf['nvpr']>values[5]) + (erf['nvpr']>values[6])
                   + (erf['nvpr']>values[7]) + (erf['nvpr']>values[8]) + (erf['nvpr']>values[9]))
    # Problème : tous les individus sont soit dans le premier, soit dans le dernier décile. WTF
    assert_variable_inrange('deci',[1,11], erf)
    count_NA('deci',erf)
    del dec, values
    gc.collect()

    #TODO: faire le lien avec men_vars, il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[['ident','ztsam','zperm','zragm','zricm','zrncm','zracm',
                 'nb_uci', 'logt' ,'nbpiec','typmen5','spr','nbenfc','agpr','cstotpr',
                 'nat28pr','tu99','aai1','wprm', 'nvpr','revtot','dip11','deci']][erf['so'].isin(range(3,6))]

    erf.rename(columns = {'nbpiec':'hnph2','nat28pr':'mnatio','aai1':'iaat','dip11':'mdiplo'}, inplace = True)

    # TODO: ne traite pas les types comme dans R teste-les pour voir comment pandas les gère

    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr',erf)
    assert_variable_inrange('magtr',[1,4],erf)

    count_NA('cstotpr',erf)
    erf['tmp'] = erf['cstotpr'].astype('float')/10.0
    erf['tmp']=map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
    # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8',erf)

    # TODO il reste 41 NA's 2003
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd',erf)

#     print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug, on trouve 7.0 qui fait assert

    # TODO : 3 logements ont 0 pièces !!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1,7], erf)

# # TODO: il reste un NA 2003
# #       il rest un NA en 2008

    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([11,12,13,14,15,21,22,23,24,25,26,27,28,29,31,32,41,42,43,44,45,46,47,48,51,52,62,60])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1,3], erf)

    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1,2,3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1,7], erf)

# # Il reste un NA en 2003
# #    reste un NA en 2008
# table(erf$iaat, useNA="ifany")
    # TODO: comparer logement et erf pour ?tre sur que cela colle

    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71,""])] = 1
    tmp[erf['mdiplo'].isin([70,60,50])] = 2
    tmp[erf['mdiplo'].isin([41,42,31,33])] = 3
    tmp[erf['mdiplo'].isin([10,11,30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade

    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1,2,3])] = 2
    tmp[erf['tu99'].isin([4,5,6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1,6], erf)

    # TODO : 0 ? Rajouetr 2003 !
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4,8])] = 4
    tmp[erf['mcs8'].isin([5,6,7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1,6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    del (erf['cstotpr'] ,erf['agpr'], erf['typmen5'],
    erf['nbenfc'], erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()


    erf = erf.dropna(subset=['logt','magtr','mcs8','mtybd','hnph2','mnatio','iaat','mdiplo','tu99_recoded'])
    #On vérifie au final que l'on n'a pas de doublons d'individus
    assert erf['ident'].value_counts().max() == 1, Exception('Number of distinct individuals after removing duplicates is not correct')


    ## Travail sur la table logement

    # Table menage
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006


    print "preparing logement menage table"

#     Lgtmen = load_temp(name = "indivim",year = year) # Je rajoute une étape bidon
    Lgtmen = df.get_values(table = "lgt_menage", variables = LgtMenVars)
    Lgtmen.rename(columns = {'idlog':'ident'}, inplace = True)

    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace = True)
    Lgtmen['mrret'].fillna(0, inplace = True)
    Lgtmen['mrsal'].fillna(0, inplace = True)
    Lgtmen['mrtns'].fillna(0, inplace = True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho']+Lgtmen['mrret']+Lgtmen['mrsal']+Lgtmen['mrtns'] # Virer les revenus négatifs ?
    count_NA('revtot', Lgtmen)
    Lgtmen['nvpr']=10.0*Lgtmen['revtot']/Lgtmen['muc1']

    count_NA('qex', Lgtmen)
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],arange(1,11), Lgtmen['qex'],2,return_quantiles=True)
    values.sort()
    Lgtmen['deci'] = (1+(Lgtmen['nvpr']>values[1])+(Lgtmen['nvpr']>values[2])+(Lgtmen['nvpr']>values[3])
                      +(Lgtmen['nvpr']>values[4])+(Lgtmen['nvpr']>values[5])+(Lgtmen['nvpr']>values[6])
                      +(Lgtmen['nvpr']>values[7])+(Lgtmen['nvpr']>values[8])+(Lgtmen['nvpr']>values[9]))
    del dec, values
    print Lgtmen['deci'].describe()
    gc.collect()

    ##Table logement (pas en 2003 mais en 2006)
# str(lgtmen)
# if (year_lgt=="2006"){
#   message("preparing logement logement table")
#   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
#   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
#   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")

    if year_lgt == 2006:
        print 'preparing logement logement table'
        lgtlgt = df.get_values(table = "lgt_logt", variables = LgtLgtVars)
        lgtlgt.rename(columns = {'idlog':'ident'}, inplace = True)
        Lgtmen = Lgtmen.merge(lgtlgt, left_on = 'ident', right_on = 'ident', how = 'inner')
        del lgtlgt

    data = Lgtmen[Lgtmen['sec1'].isin([21,22,23,24,30])]
    del Lgtmen
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns = {'mnatio':'mnatior'}, inplace = True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21,22,23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

# ## Table adresse
    print "preparing logement adresse table"
# lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
# lgtadr <- upData(lgtadr, rename=renameidlgt)
    # Je rajoute une étae bidon
    Lgtadr = df.get_values(table = "adresse", variables = LgtAdrVars)
    Lgtadr.rename(columns = {'idlog':'ident'}, inplace = True)

    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on = 'ident', how = 'inner')
#     control(Logement) # Pas de idfoy, etc. dans la table logement ?

    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
#     Logement=(Logement[Logement['hnph2'].notnull()]) # Mis en comment car 0 NA pour hnph2

    # On est dans la même étape within ici et par la suite ( cf code R )
    # TODO : ici problème je transforme les 07 en 7
    # car Python considère les 0n comme des nombres octaux ( < 08 ).
    # J'espère que ce n'est pas important.
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1,3], Logement)

    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1,2,3,4,5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] == 10] = 6 # TODO question Clément : et le 9 et le 10 ?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1,7], Logement)

    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2,3,4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5,6,7,8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1,5], Logement)

    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311,321,401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312,322,402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313,323,403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1,8], Logement)

    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1,2,3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4,5,6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1,6], Logement)

    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2,3,4,5,6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1,4], Logement)

    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1,2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3,4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1,4], Logement)

    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4,8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5,6,7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1,6], Logement)

    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))

    Logement=(Logement[Logement['mdiplo'].notnull()])
    Logement=(Logement[Logement['mtybd'].notnull()])
    Logement=(Logement[Logement['magtr'].notnull()])
    Logement=(Logement[Logement['mcs8'].notnull()])
    Logement=(Logement[Logement['maa1at'].notnull()])

    ## Imputation des loyers proprement dite

# library(StatMatch) # loads StatMatch
# # library(mice) use md.pattern to locate missing data
    # TODO : à supprimer ?

# logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
# logt$wprm <- logement$qex
# erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print ('Compute imputed rents')
    Logt = Logement[['lmlm','logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'ident']]
    Logt['wprm'] = Logement['qex']
    erf = erf[['logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'wprm' , 'ident']]

# # debug
# # derf  <- describe(erf, weights=as.numeric(erf$wprm))
# # dlogt <- describe(logt, weights=logt$wprm)
# #
# # for (var in as.list(names(derf))){
# #   print("erf")
# #   print(derf[[var]])
# #   print("logt")
# #   print(dlogt[[var]])
# #   print("================")
# # }


    # TODO add md.pattern

# erf1 <- na.omit(erf)
# logt <- na.omit(logt)
    from pandas import DataFrame
    erf = erf.dropna(how = 'any') # Si j'ai bien compris ce que l'on fait en R : dropper les lignes avec des NA
    #erf1 = erf # A-t-on toujours besoin de changer le nom du coup ?
    Logt = Logt.dropna(how = 'any')

# allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
# classes <- c("magtr","tu99_recoded")
# matchvars <- setdiff(allvars,classes)
    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars)-set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)
# out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
# fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate() # Permet à rpy2 de convertir les dataframes
    sm = importr("StatMatch")#, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print 'TEST 2'
    out_nnd = sm.NND_hotdeck(data_rec = erf,
                              data_don = Logt,
                               match_vars = vectors.StrVector(matchvars),
                                don_class = vectors.StrVector(classes),
                                 dist_fun = "Gower")
    print 'TEST 3'
    fill_erf_nnd = sm.create_fused(data_rec = erf,
                                    data_don = Logt,
                                     mtc_ids = out_nnd[0],
                                      z_vars = vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

# fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm':'loym'}, inplace = True)

# loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident','loym']]

# load(menm)
# menagem$loym <- NULL
# menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
# save(menagem,file=menm)
#     Mis en comment block, car à manipuler avec précaution je suppose ( ne souhaite pas faire de conneries )

    erfmenm = load_temp(name="menagem", year=year)
#     del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput,on='ident',how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name = "menagem", year=year)
def invalide(year = 2006):

    print 'Entering 07_invalides: construction de la variable invalide NOTFUNCTIONNAL NAOW'

    return
# # # Invalides
# # #inv = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac)

# # loadTmp("final.Rdata")
# # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")]
# # invalides <- within(invalides,{
# #   caseP <- ifelse(is.na(caseP),0,caseP)
# #   caseF <- ifelse(is.na(caseF),0,caseF)
# #   inv <- FALSE})
# # # Les "vous" invalides
# # table(invalides[,c("caseF","quifoy")],useNA="ifany")
# # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"inv"] <- TRUE
# #
    print ''
    print 'Etape 1 : création de la df invalides'
    print '    1.1 : déclarants invalides'
    final = load_temp(name="final", year=year)
    invalides = final.xs(["noindiv","idmen","caseP","caseF","idfoy","quifoy","maahe","rc1rev"], axis=1)

    print invalides['rc1rev'].value_counts()

    for var in ["caseP", "caseF"]:
        assert invalides[var].notnull().all(), 'présence de NaN dans %s' %(var)

    # Les déclarants invalides
    invalides['inv'] = False
    invalides['inv'][(invalides['caseP']==1) & (invalides['quifoy']==0)] = True
    print invalides["inv"].sum(), " invalides déclarants"

    #Les personnes qui touchent l'aah dans l'enquête emploi
    invalides['inv'][(invalides['maahe']>0)] = True
    invalides['inv'][(invalides['rc1rev']==4)] = True #TODO: vérifier le format.
    print invalides["inv"].sum(), " invalides qui touchent des alloc"

    print_id(invalides)


# # # Les conjoints invalides
# #
# # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")]
# # #men_inv_conj <- rename(men_inv_conj, c("caseF"="inv"))
# # #table(men_inv_conj[men_inv_conj$inv==1 ,c("inv","quifoy")],useNA="ifany")
# # # Il y a des caseF suir des conjoints cela vint des doubles d?clarations TODO: shoumd clean this
# # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"]
# # #load(indm)
# # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")]
# # #titi <- titi[order(titi$ident),]
# # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")]
# # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="inv"))
# # table(foy_inv_conj[ ,c("inv","quifoy")],useNA="ifany")
# # # On ne garde donc que les caseF des "vous"

# # foy_inv_conj   <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","inv")]
# # table(foy_inv_conj[ ,c("inv")],useNA="ifany")
# # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")]
# # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE)
# # table(invalides_conj$inv) # TODO en 2006 On en a 316 au lieu de 328 il doit y avoir de idfoy avec caseF qui n'ont pas de vous because double déclaration'
# # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","inv")] <- invalides_conj
# # table(invalides[,c("inv","quifoy")],useNA="ifany")
# # rm(invalides_conj,foy_inv_conj)

    # On récupère les idfoy des foyers avec une caseF cochée
    print '    1.2 : Les conjoints invalides'
    idfoy_inv_conj = final["idfoy"][final["caseF"]]
    inv_conj_condition = (invalides["idfoy"].isin(idfoy_inv_conj)  & (invalides["quifoy"]==1))
    invalides["inv"][inv_conj_condition] = True

    print len(invalides[inv_conj_condition]), "invalides conjoints"
    print invalides["inv"].sum(), " invalides déclarants et invalides conjoints"

# # # Enfants invalides et garde alternée
# #
# # loadTmp("pacIndiv.Rdata")
# # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("inv","noindiv")]
# # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE)
# # names(foy_inv_pac)
# # table(foy_inv_pac[,c("typ","naia")],useNA="ifany")
# # table(foy_inv_pac[,c("typ")],useNA="ifany")
# # foy_inv_pac <- within(foy_inv_pac,{
# #   inv  <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18))
# #   alt  <- (typ=="H") | (typ=="I")
# #   naia <- NULL
# #   typ  <- NULL})
# #
# # table(foy_inv_pac[ ,c("inv")],useNA="ifany")
# # table(foy_inv_pac[ ,c("alt")],useNA="ifany")
# # invalides$alt <- 0
# # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0
# # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","inv","alt")] <- foy_inv_pac


    print '    1.3 : enfants invalides et garde alternée'

    pacIndiv = load_temp(name='pacIndiv', year=year)
    print pacIndiv.type_pac.value_counts()

    foy_inv_pac = invalides.loc[~(invalides.quifoy.isin([0, 1])), ['noindiv', 'inv']]
#     pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]]
    print len(foy_inv_pac)

    print pacIndiv.columns
    foy_inv_pac = foy_inv_pac.merge(pacIndiv.loc[:, ['noindiv', 'type_pac', 'naia']],
                                    on='noindiv', how='left')
    foy_inv_pac['inv'] = (foy_inv_pac['type_pac'].isin(['G','R','I']) |
                             ((foy_inv_pac['type_pac']=="F") & ((year - foy_inv_pac['naia'])>18)))

    foy_inv_pac['alt'] = ((foy_inv_pac['type_pac']=="H") | (foy_inv_pac['type_pac']=="I"))
    foy_inv_pac['naia'] = None
    foy_inv_pac['type_pac'] = None
    foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False)


    print foy_inv_pac['inv'].describe()
    invalides['alt'] = 0
    foy_inv_pac['alt'][foy_inv_pac.alt.isnull()] = 0
    invalides = invalides.merge(foy_inv_pac, on=["noindiv","inv","alt"])

    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
# =======
#     print foy_inv_pac.inv.value_counts() # TODO: JS : trop peu de True là-dedans
#     print foy_inv_pac.alt.value_counts() #
#
#
#     print  len(invalides), len(foy_inv_pac)
#     print invalides.inv.value_counts()
# >>>>>>> 67cd9a43177cf3f6f72521cda59dae02485df1e3

    invalides = invalides.merge(foy_inv_pac, on='noindiv', how='left')
    invalides['inv'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])
    invalides['alt'] = where(invalides['inv_y']==True, invalides['inv_y'], invalides['inv_x'])

    invalides = invalides.loc[:, ["noindiv","idmen","caseP","caseF","idfoy","quifoy", "inv", 'alt']]
    invalides['alt'].fillna(False, inplace=True)

    print invalides.inv.value_counts()
    invalides = invalides.drop_duplicates(['noindiv', 'inv', 'alt'], take_last=True)
    del foy_inv_pac, pacIndiv

# # # Initialisation des NA sur alt et inv
# # invalides[is.na(invalides$inv), "inv"] <- 0
# # table(invalides[,c("alt","inv")],useNA="ifany")
# #
# # final <- merge(final, invalides[,c("noindiv","inv","alt")], by="noindiv",all.x=TRUE)
# # table(final[, c("inv","alt")],useNA="ifany")

    print ''
    print 'Etape 2 : Initialisation des NA sur alt et inv'
    assert invalides["inv"].notnull().all() & invalides.alt.notnull().all()
    final = final.merge(invalides.loc[:, ['noindiv', 'inv', 'alt']], on='noindiv', how='left')
    del invalides

    print final.inv.value_counts()
    control(final, debug=True)

    save_temp(final, name='final', year=year)
    print 'final complétée et sauvegardée'
Пример #15
0
def create_totals(year=2006):

    print "Creating Totals"
    print "Etape 1 : Chargement des données"

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert indivim.duplicated(['noindiv'
                               ]).any() == False, "Présence de doublons"

    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.

    selection = Series()
    for var in [
            "zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici",
            "zrnci"
    ]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection]
    indivi_i.rename(
        columns={
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali":
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
        },
        inplace=True)

    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous",
                               indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"

    ## We merge them with the other individuals
    #indivim <- rename(indivim, c(ident = "idmen",
    #                             persfip = "quifoy",
    #                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
    #                             zchoi = "choi2",
    #                             zrsti = "rsti2",
    #                             zalri = "alr2"))
    #
    #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i)
    #rm(indivim, indivi_i)
    #gc()
    #table(indivi$quelfic)
    #

    indivim.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace=True)

    print ''
    print "Etape 2 : isolation des FIP"
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi["idmen"].astype("int64") * 100 +
        (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp, "idfoy"] = nan

    ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?)

    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    #    indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100
    #                                        + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) )
    indivi["idfoy"] = where(
        fip_has_declar, indivi["idmen"] * 100 +
        indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
        indivi["idfoy"])

    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]]

    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(
        indivi_fnd["idfoy"]), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print ''
    print 'Etape 3 : Récupération des EE_NRT'

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all()

    print ''
    print 'Etape 4 : Rattachement des enfants aux déclarations'

    assert indivi["noindiv"].duplicated().any(
    ) == False, "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(
    ), " Some noindiv are not set, which will ruin next stage"
    assert indivi.loc[
        enf_ee,
        "noindiv"].duplicated().any() == False, "Some noindiv appear twice"

    pere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
    })
    mere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
    })

    foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer")
    pere = pere.merge(foyer, how="inner", on="noindiv")
    mere = mere.merge(foyer, how="inner", on="noindiv")

    #     print "Some pere et mere are duplicated because people have two foyers"
    #     print pere[pere.duplicated()]
    #     print mere[mere.duplicated()]

    df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m'))

    #     print len(pere)
    #     print len(mere)
    #     print len(df)
    #     ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"]
    #     print df.loc[df["noindiv_enf"].isin(ll)]
    #     print df[df.duplicated()]

    print '    4.1 : gestion des personnes dans 2 foyers'
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(
            0, inplace=True)  # beacause groupby drop groups with NA in index
    df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    df["which"] = ""
    df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()),
                        "mere", "")
    df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()),
                        "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df["which"].notnull().all(
    ), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True)
    df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    for col in df.columns:
        if col not in ["idfoy", "noindiv"]:
            del df[col]


#     assert indivi.loc[enf_ee,"idfoy"].notnull().all()
    assert df.duplicated().any() == False

    df.set_index("noindiv", inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert indivi.duplicated().any() == False

    # MBJ: issue delt with when moving from R code to python
    ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
    ## On suit guide méthodo erf 2003 page 135
    ## On supprime les conjoints FIP et les FIP de 25 ans et plus;
    ## On conserve les enfants FIP de 19 à 24 ans;
    ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
    ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
    ## EEC que l'on ne retrouvera pas dans les EEC suivantes.
    #
    print '    4.2 : On enlève les individus pour lesquels il manque le déclarant'
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan

    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)

    ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    ##fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    #indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False
    indivi['not_pr_cpr'][indivi['lpr'] > 2] = True

    print "    4.3 : Creating non pr=0 and cpr=1 idmen's"
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']]
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    #     indivi.set_index(['quiment']) #TODO: check relevance
    #     TODO problème avec certains idfoy qui n'ont pas de vous
    print ''
    print "Etape 5 : Gestion des idfoy qui n'ont pas de vous"
    all = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy']
    without = all[~(all.idfoy.isin(with_.values))]

    print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous'
    has_declar2 = (indivi.idfoy.isin(
        without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 +
                   indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values),
                                             decl2_idfoy, None)
    del all, with_, without, has_declar2

    print '    5.1 : Elimination idfoy restant'
    idfoyList = indivi.loc[indivi['quifoy'] == "vous",
                           'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = [
        "noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age", "agem",
        "quelfic", "actrec", "quimen", "nbsala", "titc", "statut", "txtppb",
        "chpub", "prosa", "encadr"
    ]

    if not (len(set(myvars).difference(set(indivi.columns))) == 0):
        print set(myvars).difference(set(indivi.columns))

    assert len(set(myvars).difference(set(indivi.columns))) == 0

    indivi = indivi.loc[:, myvars]

    ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    ## les infos provenant des déclarations)

    print ''
    print 'Etape 6 : Création des variables descriptives'
    print '    6.1 : variable activité'
    indivi['activite'] = None
    indivi['activite'][indivi['actrec'] <= 3] = 0
    indivi['activite'][indivi['actrec'] == 4] = 1
    indivi['activite'][indivi['actrec'] == 5] = 2
    indivi['activite'][indivi['actrec'] == 7] = 3
    indivi['activite'][indivi['actrec'] == 8] = 4
    indivi['activite'][indivi['age'] <= 13] = 2  # ce sont en fait les actrec=9
    print indivi['activite'].value_counts()
    # TODO: MBJ problem avec les actrec

    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all(), Exception("Problème avec les titc")

    print '    6.2 : variable statut'
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    indivi['statut'][indivi['statut'] == 11] = 1
    indivi['statut'][indivi['statut'] == 12] = 2
    indivi['statut'][indivi['statut'] == 13] = 3
    indivi['statut'][indivi['statut'] == 21] = 4
    indivi['statut'][indivi['statut'] == 22] = 5
    indivi['statut'][indivi['statut'] == 33] = 6
    indivi['statut'][indivi['statut'] == 34] = 7
    indivi['statut'][indivi['statut'] == 35] = 8
    indivi['statut'][indivi['statut'] == 43] = 9
    indivi['statut'][indivi['statut'] == 44] = 10
    indivi['statut'][indivi['statut'] == 45] = 11
    assert indivi['statut'].isin(
        range(12)).all(), Exception("statut value over range")

    #indivi$nbsala <- as.numeric(indivi$nbsala)
    #indivi <- within(indivi,{
    #  nbsala[is.na(nbsala) ]    <- 0
    #  nbsala[nbsala==99 ] <- 10  # TODO  418 fip à retracer qui sont NA
    #})

    print '    6.3 : variable txtppb'
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala'] == 99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print '    6.4 : variable chpub et CSP'
    indivi['chpub'].fillna(0, inplace=True)
    indivi['chpub'] = indivi['chpub'].astype('int')
    indivi['chpub'][indivi['chpub'].isnull()] = 0
    print indivi['chpub'].value_counts()
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print indivi['encadr'].value_counts()

    # encadr : 1=oui, 2=non
    indivi['encadr'].fillna(2, inplace=True)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1
    indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1
    print "cadre"
    print indivi['cadre'].value_counts()
    assert indivi['cadre'].isin(range(2)).all()

    print ''
    print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"

    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])

    print 'On crée les n° de personnes à charge'
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0
    indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1
    indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']]
    print_id(test2)

    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'],
                             indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])
    print_id(indivi)

    #####################################################################################
    ## On ajoute les idfam et quifam
    #load(famc)
    #
    #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE)
    #rm(famille)
    #print_id(tot2)
    #
    ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls
    #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"]
    #tot2[is.na(tot2$quifam), "quifam"] <- 0
    #print_id(tot2)
    #saveTmp(tot2, file = "tot2.Rdata")
    #rm(indivi,tot2)
    #
    ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents
    ## load foyer
    #loadTmp(file = "tot2.Rdata")
    #loadTmp(file= "foyer_aggr.Rdata")
    #
    #tot3 <- merge(tot2, foyer, all.x = TRUE)
    #print_id(tot3) # OK
    #saveTmp(tot3, file= "tot3.Rdata")
    #rm(tot3,tot2,foyer)
    #
    print ''
    print 'Etape 8 : création des fichiers totaux'
    famille = load_temp(name='famc', year=year)

    print '    8.1 : création de tot2 & tot3'
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    #     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print '    tot2 saved'

    #     #On combine les variables de revenu
    #     foyer = load_temp(name='foy_ind', year=year)
    #     print " INTERSERCT THE POOCHAY"
    #     tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1
    #     print "pingas"
    #     print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10]
    #     print "pocchay"
    #     print sorted(foyer["idfoy"].unique())[0:10]
    #     print "final flash"
    #     print 602062550.0 in foyer["idfoy"].values
    #     print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique())))
    #     print tot2.quifoy.value_counts()
    #tot2.update(foyer)
    tot2.merge(foyer, how='left')

    tot2 = tot2[tot2.idmen.notnull()]
    #     tot2['idfoy'] += 1

    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(cols='noindiv')
    print len(tot3)

    #Block to remove any unwanted duplicated pair
    print "    check tot3"
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

    ## On ajoute les variables individualisables
    #loadTmp("foyer_individualise.Rdata") # foy_ind
    #loadTmp("tot3.Rdata")
    #loadTmp("allvars.Rdata")
    #loadTmp("sif.Rdata")
    #
    #vars2 <- setdiff(names(tot3),  allvars)
    #tot3 <- tot3[,vars2]
    #
    #print_id(tot3)
    #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE)
    #
    print '    8.2 : On ajoute les variables individualisables'

    allvars = load_temp(name='ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print len(tot3)

    assert not (tot3.duplicated(
        cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % (
        lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print 'tot3 sauvegardé'
    gc.collect()
Пример #16
0
def sif(year=2006):
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    ## TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    ## mnrvka  Revenu TH n-2
    ## mnrvkh  revenu TH (revenu fiscal de référence)
    #
    ## On récupère les variables du code sif
    #vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    #sif <- LoadIn(erfFoyFil, vars)
    sif = data.get_values(variables=vars, table="foyer")
    #sif$statmarit <- 0
    sif['statmarit'] = 0

    print sif

    ## for (index in 60:80){
    ##    print(index)
    ##    print(table(substr(sif$sif,index,index)))
    ## }

    #
    #
    ## Pb with some 2 sifs that are misaligned
    ## index = 61
    ## sif[substr(sif$sif,index,index)=="F", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]+1]
    #
    ## index = 62
    ## sif[substr(sif$sif,index,index)=="G", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="G", "noindiv"]]
    ## sif$sif[1]
    #
    #if (year==2009){
    #  # problem with one entry in 2009
    #  length  <- nchar(sif$sif[1])
    #  old_sif <- sif$sif[sif$noindiv == 901803201]
    #  new_sif <- paste(substr(old_sif,1,59), substr(old_sif,61,length),"0", sep="")
    #  sif$sif[sif$noindiv == 901803201] <- new_sif
    #  old_sif <- sif$sif[sif$noindiv == 900872201]
    #  new_sif <- paste(substr(old_sif,1,58), " ", substr(old_sif,59,length), sep="")
    #  sif$sif[sif$noindiv == 900872201] <- new_sif
    #  rm(old_sif,new_sif)
    #}
    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif
    #
    #
    ## for (index in 60:80){
    ##     print(index)
    ##     print(table(substr(sif$sif,index,index)))
    ## }
    #
    #
    #
    #sif <- within(sif,{
    ##  rbg = rbg*((tsrvbg =='+')-(tsrvbg =='-'))
    print sif["rbg"].describe()
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+') - (sif["tsrvbg"] == '-'))
    print sif["rbg"].describe()

    #  stamar <- substr(sif,5,5)

    sif["stamar"] = sif["sif"].str[4:5]

    # Converting marital status

    #  statmarit[stamar =="M"] <- 1
    #  statmarit[stamar =="C"] <- 2
    #  statmarit[stamar =="D"] <- 3
    #  statmarit[stamar =="V"] <- 4
    #  statmarit[stamar =="O"] <- 5

    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    #  birthvous <- as.numeric(substr(sif,6,9))
    #  birthconj <- as.numeric(substr(sif,11,14))
    #

    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    #  caseE <- as.numeric(substr(sif,16,16)=='E')
    #  caseF <- as.numeric(substr(sif,17,17)=='F')
    #  caseG <- as.numeric(substr(sif,18,18)=='G')
    #  caseK <- as.numeric(substr(sif,19,19)=='K')

    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"

    d = 0
    #
    #  if (year %in% c(2006,2007)){
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    caseH <- as.numeric(substr(sif,25,28))
    #    caseT <- as.numeric(substr(sif,29,29) == 'T')
    #  }

    if year in [2006, 2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    #
    #  if (year == 2008){
    #    d = - 1 # fin de la case L
    #    caseP <- as.numeric(substr(sif,21+d,21+d)=='P')
    #    caseS <- as.numeric(substr(sif,22+d,22+d)=='S')
    #    caseW <- as.numeric(substr(sif,23+d,23+d)=='W')
    #    caseN <- as.numeric(substr(sif,24+d,24+d)=='N')
    #    caseH <- as.numeric(substr(sif,25+d,28+d))
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }
    #
    if year in [2008]:
        d = -1  # fin de la case L
        sif["caseP"] = sif["sif"].str[20 + d:21 + d] == "P"
        sif["caseS"] = sif["sif"].str[21 + d:22 + d] == "S"
        sif["caseW"] = sif["sif"].str[22 + d:23 + d] == "W"
        sif["caseN"] = sif["sif"].str[23 + d:24 + d] == "N"
        sif["caseH"] = sif["sif"].str[24 + d:28 + d]
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"

    #  if (year == 2009){
    #    # retour de la case L par rapport à 2008 (donc on retrouve 2006)
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    # caseH en moins par rapport à 2008 (mais case en L en plus)
    #    # donc décalage par rapport à 2006
    #    d = -4
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }

    if year in [2009]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"

    #
    #  caseX <- as.numeric(substr(sif,34+d,34+d)=='X')
    #  dateX <- as.Date(substr(sif,35+d,42+d),'%d%m%Y')
    #  caseY <- as.numeric(substr(sif,43+d,43+d)== 'Y')
    #  dateY <- as.Date(substr(sif,44+d,51+d),'%d%m%Y')
    #  caseZ <- as.numeric(substr(sif,52+d,53+d)== 'Z')
    #  dateZ <- as.Date(substr(sif,53+d,60+d),'%d%m%Y')  # ERROR 54+d
    #  causeXYZ <- substr(sif,61+d,61+d)
    #

    sif["caseX"] = sif["sif"].str[33 + d:34 + d] == "X"
    sif["dateX"] = sif["sif"].str[34 + d:42 + d]
    sif["caseY"] = sif["sif"].str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif["sif"].str[43 + d:51 + d]
    sif["caseZ"] = sif["sif"].str[51 + d:53 + d] == "Z"
    sif["dateZ"] = sif["sif"].str[52 + d:60 + d]
    sif["causeXYZ"] = sif["sif"].str[60 + d:61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    #  nbptr <- nbptr/100
    #  rfr_n_2 <- mnrvka

    sif["nbptr"] = sif["nbptr"] / 100
    sif["rfr_n_2"] = sif["mnrvka"]

    #  nbF <- as.numeric(substr(sif,65+d,66+d))
    #  nbG <- as.numeric(substr(sif,68+d,69+d))
    #  nbR <- as.numeric(substr(sif,71+d,72+d))
    #  nbJ <- as.numeric(substr(sif,74+d,75+d))
    #  nbN <- as.numeric(substr(sif,77+d,78+d))
    #  nbH <- as.numeric(substr(sif,80+d,81+d))
    #  nbI <- as.numeric(substr(sif,83+d,84+d))

    sif["nbF"] = sif["sif"].str[64 + d:66 + d]
    sif["nbG"] = sif["sif"].str[67 + d:69 + d]
    sif["nbR"] = sif["sif"].str[70 + d:72 + d]
    sif["nbJ"] = sif["sif"].str[73 + d:75 + d]
    sif["nbN"] = sif["sif"].str[76 + d:78 + d]
    sif["nbH"] = sif["sif"].str[79 + d:81 + d]
    sif["nbI"] = sif["sif"].str[82 + d:84 + d]

    #  if (year != 2009){
    #  nbP <- as.numeric(substr(sif,86+d,87+d))
    #  }
    #})

    if (year != 2009):
        sif["nbP"] = sif["sif"].str[85 + d:87 + d]

    #sif$sif <- NULL
    #sif$stamar <- NULL
    #

    del sif["sif"], sif["stamar"]

    #table(sif$statmarit)
    print sif["statmarit"].value_counts()

    #print(length(table(sif$noindiv)))
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(
        sif["noindiv"].value_counts())

    #dup <- duplicated(sif$noindiv)
    #table(dup)
    #sif <- sif[!dup,]
    #print(length(table(sif$noindiv)))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts(
    )) == len(sif_drop_duplicated["noindiv"]), Exception(
        "Number of distinct individuals after removing duplicates is not correct"
    )

    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
def create_indivim(year=2006):
    '''
    '''
    # load
    data = DataCollection(year=year)
    erfmen = data.get_values(table="erf_menage")
    eecmen = data.get_values(table="eec_menage")
    print sorted(eecmen.columns)

    erfind = data.get_values(table="erf_indivi")
    eecind = data.get_values(table="eec_indivi")
    print eecind.columns
    print erfind.columns

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))]
    print 'describe noappar_m'
    print noappar_m.describe()

    noappar_i = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))]
    noappar_i = noappar_i.drop_duplicates(cols='ident', take_last = True)
    #TODO: vérifier qu'il n'y a théoriquement pas de doublon

    dif = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    int = set(noappar_i.ident) & set(noappar_m.ident)
    print "dif, int --------------------------------"
    print dif, int
    del noappar_i, noappar_m, dif, int
    gc.collect()

    #fusion enquete emploi et source fiscale
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how="inner")

    # optimisation des types? Controle de l'existence en passant
    #TODO: minimal dtype
    var_list = (['acteu', 'stc', 'contra', 'titc', 'forter', 'mrec', 'rstg', 'retrai', 'lien', 'noicon',
                 'noiper', 'noimer', 'naia', 'cohab', 'agepr', 'statut', 'txtppb', 'encadr', 'prosa'])
    for var in var_list:
        try:
            indivim[var] = indivim[var].astype("float32")
        except:
            print "%s is missing" %var

    # création de variables
    ## actrec
    indivim['actrec'] = 0
    #TODO: pas de 6 ?!!
    filter1 = (indivim['acteu'] == 1) & (indivim['stc'].isin([1,3]))
    indivim['actrec'][filter1] = 1
    filter2 = (indivim['acteu'] == 1) & (((indivim['stc'] == 2) & (indivim['contra'] == 1)) | (indivim['titc'] == 2))
    indivim['actrec'][filter2] = 2
    indivim['actrec'][indivim['acteu'] == 1] =  3
    filter4 = (indivim['acteu'] == 2) | ((indivim['acteu'] == 3) & (indivim['mrec'] == 1))
    indivim['actrec'][filter4] = 4
    filter5 = (indivim['acteu'] == 3) & ((indivim['forter'] == 2) | (indivim['rstg'] == 1))
    indivim['actrec'][filter5] = 5
    filter7 = (indivim['acteu'] == 3) & ((indivim['retrai'] == 1) | (indivim['retrai'] == 2))
    indivim['actrec'][filter7] = 7
    indivim['actrec'][indivim['acteu'] == 3] =  8
    indivim['actrec'][indivim['acteu'].isnull()] =  9
    print indivim['actrec'].value_counts()
    # tu99
    if year == 2009:
        #erfind['tu99'] = None
        #eecind['tu99'] = float(eecind['tu99'])
        erfind['tu99'] = NaN

    ## locataire
    menagem["locataire"] = menagem["so"].isin([3,4,5])
    menagem["locataire"] = menagem["locataire"].astype("int32")
    ## ?? c'est bizarre d'avoir besoin du diplome de la personne de référence,
    ## ce serait mieux de faire le merge quand on a besoin seulement
    ## laissons à la table individuel ce qui doit l'être

#    NOTE pas de ddipl en year=2006 visiblement
#    transfert = indivim.ix[indivim['lpr'] == 1, ['ident', 'ddipl']]
#    print transfert
#    #TODO: Forget not to uncomment 'dat
##     #menagem <- merge(erfmen,eecmen)
##     #menagem <- merge(menagem,transfert)
#    menagem  = menagem.merge(transfert)

    # correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        GL,HF
        '''

        if year==2006:
            indivim.lien[indivim.noindiv==603018905] = 2
            indivim.noimer[indivim.noindiv==603018905] = 1
            print indivim[indivim.noindiv==603018905].to_string()

    _manually_remove_errors()

    # save
    save_temp(menagem, name="menagem", year=year)
    del erfmen, eecmen, menagem #, transfert
    print 'menagem saved'
    gc.collect()
    save_temp(indivim, name="indivim", year=year)
    del erfind, eecind
    print 'indivim saved'
    gc.collect()
Пример #18
0
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer" )

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}


    var_dict = {'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
                'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
               'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
               'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
               'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
               'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
               'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
               'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
               'f1tv': ['f1tv', 'f1uv'],
               'f1tw': ['f1tw', 'f1uw'],
               'f1tx': ['f1tx', 'f1ux'],
               'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
               'ppe_du_ns':  ['f5nv', 'f5ov', 'f5pv'],
               'frag_exon': ['f5hn', 'f5in', 'f5jn'],
               'frag_impo': ['f5ho', 'f5io', 'f5jo'],
               'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
               'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
               'arag_defi': ['f5hf', 'f5if', 'f5jf'],
               'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
               'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
               'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
               'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
               'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
               'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
               'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
               'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
               'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
               'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
               'abic_imps': ['f5kd', 'f5ld', 'f5md'],
               'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
               'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
               'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
               'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
               'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
               'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
               'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
               'macc_exon': ['f5nn', 'f5on', 'f5pn'],
               'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
               'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
               'macc_impv': ['f5no', 'f5oo', 'f5po'],
               'macc_imps': ['f5np', 'f5op', 'f5pp'],
               'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
               'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
               'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
               'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
               'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
               'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
               'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
               'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
               'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
               'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
               'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check
               'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
               'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
               'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
               'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
               'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
               'abnc_defi': ['f5qe', 'f5re', 'f5se'],
               'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
               'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
#               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
#               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
               'mbic_mvct': ['f5hu'],
               'macc_mvct': ['f5iu'],
               'mncn_mvct': ['f5ju'],
               'mbnc_mvct': ['f5kz'],
               'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
               'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
               'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
               'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
               'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
               'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
               'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
               'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
               'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
               'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
               'arag_pvce': ['f5he', 'f5ie', 'f5je'],
               'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
               'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
               'abic_pvce': ['f5ke', 'f5le', 'f5me'],
               'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
               'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
               'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
               'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
               'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
               'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
               'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
               'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
               'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
               'demenage' : ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']}  # (déménagement) uniquement en 2006


#
#varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
#                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
#               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
# ......
#               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
#               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
#               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
#               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
#
    vars_sets = [ set(var_list) for var_list in var_dict.values() ]
    eligible_vars = (set().union(*vars_sets)).intersection( set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables"   %( len(set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [ x  in eligible_vars for x in foyer_vars ]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var,present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index() # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] !=0]
#        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()


#not_first <- FALSE
#allvars = c()
#for (v in varlist){
#  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
#  if (length(vars) > 0) {
#    allvars <-  c(allvars, vars)
#    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
#    n <- length(vars)
#    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
#    if (not_first) {
#      print('merge')
#      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
#      names(foy_ind)
#    }
#    else   {
#      print('init')
#      foy_ind <- temp
#      not_first <- TRUE
#    }
#  }
#}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv" : "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy']=='vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy']=='conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy']=='pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy']=='pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy']=='pac3'] = 4

    assert foy_ind['quifoy'].isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year = year)
    show_temp()
    return