def create_fip(temporary_store = None, year = None):
    assert temporary_store is not None
    assert year is not None
    # fip : fichier d'imposition des personnes
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #       puis on retire les autres (à la fois F et G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    # TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis: clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
"""
Created on Mon Sep 28 11:25:40 2015

@author: malkaguillot
"""
import pandas

from openfisca_france_data import default_config_files_directory as config_files_directory
from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \
    import year_specific_by_generic_data_frame_name
from openfisca_france_data.temporary import get_store
from openfisca_survey_manager.survey_collections import SurveyCollection

# En entrée : tables individus, foyer et sif de ERFS (testé sur 2009)
year = 2009
year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
foyer_all = survey.get_values(table = year_specific_by_generic["foyer"])
erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])

temporary_store = get_store(file_name = 'erfs')
sif = temporary_store['sif']

ind = erfind[['ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd']]
small_sif = sif[['noindiv', 'declar', 'causeXYZ']]
foyer = foyer_all[['ident', 'noindiv', 'declar', 'sif', '_1aj', '_1bj', '_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'
                   ]]
foyer = foyer.drop(['_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'], axis=1)
foyer_sif = pandas.merge(foyer, small_sif, on = ['declar', 'noindiv'])
def create_enfants_a_naitre(temporary_store=None, year=None):
    '''
    '''
    assert temporary_store is not None
    assert year is not None

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Enfant à naître (NN pour nouveaux nés)
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
    ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    eeccmp1 = survey.get_values(table=year_specific_by_generic["eec_cmp_1"],
                                variables=individual_vars)
    eeccmp2 = survey.get_values(table=year_specific_by_generic["eec_cmp_2"],
                                variables=individual_vars)
    eeccmp3 = survey.get_values(table=year_specific_by_generic["eec_cmp_3"],
                                variables=individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how="outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how="outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp
    gc.collect()

    # création de variables
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype(
        "float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # selection
    enfants_a_naitre = enfants_a_naitre[(
        (enfants_a_naitre.naia == enfants_a_naitre.year) &
        (enfants_a_naitre.naim >= 10)) | (
            (enfants_a_naitre.naia == enfants_a_naitre.year + 1) &
            (enfants_a_naitre.naim <= 5))].copy()

    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
def create_indivim_menagem(temporary_store=None, year=None):
    """
    Création des tables ménages et individus concaténée (merged)
    """
    # Prepare the some useful merged tables

    assert temporary_store is not None
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table=year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table=year_specific_by_generic["eec_indivi"])

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()

    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset='ident', take_last=True)
    # TODO: vérifier qu'il n'y a théoriquement pas de doublon

    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(
        len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # fusion enquete emploi et source fiscale
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")

    # optimisation des types? Controle de l'existence en passant
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
    ])

    for var in var_list:
        assert numpy.issubdtype(
            indivim[var].dtype, numpy.integer
        ), "Variable {} dtype is {} and should be an integer".format(
            var, indivim[var].dtype)

    ########################
    # création de variables#
    ########################

#  print indivim


#   actrec : activité recodée comme preconisé par l'INSEE p84 du guide utilisateur
    indivim["actrec"] = numpy.nan
    # Attention : Q: pas de 6 ?!! A : Non pas de 6, la variable recodée de l'INSEE (voit p84 du guide methodo), ici \
    # la même nomenclature à été adopée
    # 3: contrat a durée déterminée
    indivim.actrec.loc[indivim.acteu == 1] = 3
    # 8 : femme (homme) au foyer, autre inactif
    indivim.actrec.loc[indivim.acteu == 3] = 8
    # 1 : actif occupé non salarié
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin(
        [1, 3]))  # actifs occupés non salariés à son compte ou pour un
    indivim.actrec.loc[filter1] = 1  # membre de sa famille
    # 2 : salarié pour une durée non limitée
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) &
                                       (indivim.contra == 1)) |
                                      (indivim.titc == 2))
    indivim.actrec.loc[filter2] = 2
    # 4 : au chomage
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) &
                                      (indivim.mrec == 1))
    indivim.actrec.loc[filter4] = 4
    # 5 : élève étudiant , stagiaire non rémunéré
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) |
                                      (indivim.rstg == 1))
    indivim.actrec.loc[filter5] = 5
    # 7 : retraité, préretraité, retiré des affaires unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) |
                                      (indivim.retrai == 2))
    indivim.actrec.loc[filter7] = 7
    # 9 : probablement enfants de - de 16 ans TODO: check that fact in database and questionnaire
    indivim.actrec.loc[indivim.acteu == 0] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(
        1, 10)).all(), 'actrec values are outside the interval [1, 9]'

    #   TODO : compare the result with results provided by Insee
    #   tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Locataire
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")

    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(
                indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def foyer_all(temporary_store=None, year=None):
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    # On ajoute les cases de la déclaration
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table=year_specific_by_generic["foyer"])
    # on ne garde que les cases de la déclaration ('_xzz') ou ^_[0-9][a-z]{2}")
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # rename variable to fxzz ou ^f[0-9][a-z]{2}")
    renamed_variables = ["f{}".format(x[1:]) for x in variables]

    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)

    # On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)

    # On récupère les variables individualisables
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr',
                     'f1er'],  # (déménagement) uniquement en 2006
    }
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(
        year=year, sections_cerfa=[6, 7, 8])
    var_dict.update(cases_f6_f7_f8)
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))

    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(set().union(*vars_sets)),
        len(eligible_vars),
    ))

    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    #    err = 0
    #    err_vars = {}

    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)

    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4

    assert foy_ind.quifoy.isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind

    return
def sif(temporary_store=None, year=None):
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info("05_foyer: extraction des données foyer")
    # TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    # mnrvka  Revenu TH n-2
    # mnrvkh  revenu TH (revenu fiscal de référence)
    #
    # On récupère les variables du code sif
    sif = erfs_survey.get_values(variables=[
        "noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"
    ],
                                 table=year_specific_by_generic["foyer"])

    sif['statmarit'] = 0

    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.sif.loc[sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.sif.loc[sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif

    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) -
                               (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.statmarit.loc[sif.stamar == key] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"

    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"

    if year in [2008]:
        d = -1  # fin de la case L
        sif["caseP"] = sif.sif.str[20 + d:21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d:22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d:23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d:24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d:28 + d]
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"

    if year in [2009]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d:34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d:42 + d]
    sif["caseY"] = sif.sif.str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d:51 + d]
    sif["caseZ"] = sif.sif.str[51 + d:52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d:60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d:61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values

    sif["nbF"] = sif.sif.str[64 + d:66 + d]
    sif["nbG"] = sif.sif.str[67 + d:69 + d]
    sif["nbR"] = sif.sif.str[70 + d:72 + d]
    sif["nbJ"] = sif.sif.str[73 + d:75 + d]
    sif["nbN"] = sif.sif.str[76 + d:78 + d]
    sif["nbH"] = sif.sif.str[79 + d:81 + d]
    sif["nbI"] = sif.sif.str[82 + d:84 + d]

    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d:87 + d]

    del sif["stamar"]

    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    x = sif.loc[sif.duplicated_noindiv, ['noindiv', 'declar']]
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv,
                                                        'declar'].str[27:28]

    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(
        len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(
        len(sif.noindiv.value_counts())))

    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def foyer_all(temporary_store = None, year = None):
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    # On ajoute les cases de la déclaration
    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table = year_specific_by_generic["foyer"])
    # on ne garde que les cases de la déclaration ('_xzz') ou ^_[0-9][a-z]{2}")
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # rename variable to fxzz ou ^f[0-9][a-z]{2}")
    renamed_variables = ["f{}".format(x[1:]) for x in variables]

    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns = dict(zip(variables, renamed_variables)), inplace = True)

    # On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    foyer = foyer.groupby("noindiv", as_index = False).aggregate(numpy.sum)
    print_id(foyer)

    # On récupère les variables individualisables
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (déménagement) uniquement en 2006
        }
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(year = year, sections_cerfa = [6, 7, 8])
    var_dict.update(cases_f6_f7_f8)
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))

    log.info(
        u"From {} variables, we keep {} eligibles variables".format(
            len(set().union(*vars_sets)),
            len(eligible_vars),
            )
        )

    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    #    err = 0
    #    err_vars = {}

    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe
        selection.rename(columns = dict(zip(foyer_vars, qui)), inplace = True)
        selection.set_index("noindiv", inplace = True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis = 1, join = 'outer')

    foy_ind.reset_index(inplace = True)

    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns = {"noindiv": "idfoy"}, inplace = True)

    print_id(foy_ind)

    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4

    assert foy_ind.quifoy .isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind

    return
def sif(temporary_store = None, year = None):
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info("05_foyer: extraction des données foyer")
    # TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    # mnrvka  Revenu TH n-2
    # mnrvkh  revenu TH (revenu fiscal de référence)
    #
    # On récupère les variables du code sif
    sif = erfs_survey.get_values(
        variables = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"],
        table = year_specific_by_generic["foyer"]
        )

    sif['statmarit'] = 0

    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.sif.loc[sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.sif.loc[sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif

    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.statmarit.loc[sif.stamar == key] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"

    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"

    if year in [2008]:
        d = - 1  # fin de la case L
        sif["caseP"] = sif.sif.str[20 + d: 21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d: 22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d: 23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d: 24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d: 28 + d]
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"

    if year in [2009]:
        sif["caseL"] = sif.sif.str[19: 20] == "L"
        sif["caseP"] = sif.sif.str[20: 21] == "P"
        sif["caseS"] = sif.sif.str[21: 22] == "S"
        sif["caseW"] = sif.sif.str[22: 23] == "W"
        sif["caseN"] = sif.sif.str[23: 24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d: 34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d: 42 + d]
    sif["caseY"] = sif.sif.str[42 + d: 43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d: 51 + d]
    sif["caseZ"] = sif.sif.str[51 + d: 52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d: 60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d: 61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values

    sif["nbF"] = sif.sif.str[64 + d: 66 + d]
    sif["nbG"] = sif.sif.str[67 + d: 69 + d]
    sif["nbR"] = sif.sif.str[70 + d: 72 + d]
    sif["nbJ"] = sif.sif.str[73 + d: 75 + d]
    sif["nbN"] = sif.sif.str[76 + d: 78 + d]
    sif["nbH"] = sif.sif.str[79 + d: 81 + d]
    sif["nbI"] = sif.sif.str[82 + d: 84 + d]

    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d: 87 + d]

    del sif["stamar"]

    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    x = sif.loc[sif.duplicated_noindiv, ['noindiv', 'declar']]
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]

    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(len(sif.noindiv.value_counts())))

    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
示例#9
0
    def build_erf_data_frames(self):
        # TODO: remove this
        self.columns_to_fetch = ['af']
        variables = self.columns_to_fetch
        erf_survey_collection = SurveyCollection.load(
            collection="erfs", config_files_directory=config_files_directory)
        erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
        year_specific_by_generic = year_specific_by_generic_data_frame_name(
            year)
        generic_by_year_specific = dict(
            zip(year_specific_by_generic.values(),
                year_specific_by_generic.keys()))

        erf_variables = list(
            set(variables + ["ident", "wprm", "quelfic", "noi"]))
        of2erf = get_of2erf()
        for index, variable in enumerate(erf_variables):
            if variable in of2erf:
                erf_variables[index] = of2erf[variable]
        data_frame_by_table = dict(eec_indivi=None,
                                   erf_indivi=None,
                                   erf_menage=None)
        erf_variables_by_generic_table = dict(eec_indivi=[],
                                              erf_indivi=[],
                                              erf_menage=[])

        year_specific_tables_by_erf_variable = dict([
            (erf_variable,
             set(erf_survey.find_tables(variable=erf_variable)).intersection(
                 set([
                     year_specific_by_generic[key]
                     for key in erf_variables_by_generic_table.keys()
                 ]))) for erf_variable in erf_variables
        ])
        for variable, year_specific_tables in year_specific_tables_by_erf_variable.iteritems(
        ):
            if len(year_specific_tables) < 1:
                log.info(
                    "No tables are present for variable {}".format(variable))
                continue
            else:
                log.info(
                    "Variable {} is present in multiple tables : {}".format(
                        variable, year_specific_tables))
                for table in year_specific_tables:
                    log.info("Variable {} is retrieved from table {}".format(
                        variable, table))
                    erf_variables_by_generic_table[
                        generic_by_year_specific[table]].append(variable)

        erf2of = get_erf2of()

        for table, erf_variables in erf_variables_by_generic_table.iteritems():
            if erf_variables:
                data_frame_by_table[table] = erf_survey.get_values(
                    variables=erf_variables,
                    table=year_specific_by_generic[table])
                data_frame_by_table[table].rename(columns=erf2of, inplace=True)
                data_frame_by_table[table].rename(columns={'ident': 'idmen'},
                                                  inplace=True)

        assert not data_frame_by_table["erf_menage"].duplicated().any(
        ), "Duplicated idmen in erf_menage"
        self.erf_data_frame_by_entity_key_plural = dict(
            menages=data_frame_by_table["erf_menage"],
            individus=data_frame_by_table["erf_indivi"].merge(
                data_frame_by_table["eec_indivi"]))
示例#10
0
def create_fip(temporary_store=None, year=None):
    assert temporary_store is not None
    assert year is not None
    # fip : fichier d'imposition des personnes
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table=year_specific_by_generic["foyer"],
                              variables=erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace=True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(
        nb_pac_max
    ) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(
        nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns,
                                     names=['pac_number', 'variable'])
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max),
                    columns=columns)

    for i in range(1, nb_pac_max +
                   1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    fip.drop(['level_0'], axis=1, inplace=True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') &
              (fip.naia != '')].copy()
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis=1, inplace=True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(
        u"    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    )
    type_FG = fip[fip.type_pac.isin(
        ['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset=['declaration', 'naia'],
                                              take_last=True)
    type_FG['is_twin'] = type_FG.duplicated(
        subset=['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #       puis on retire les autres (à la fois F et G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset=['declaration', 'naia'],
                                              take_last=True)
    type_HI['is_twin'] = type_HI.duplicated(
        subset=['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) |
                          (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(
        u"{} F, G, H or I non redundant pac kept over {} potential candidates".
        format(fip['to_keep'].sum(), len(fip)))
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(
    ), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values,
                           indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip,
                              left_on='key1',
                              right_on='key',
                              how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip,
                              left_on='key2',
                              right_on='key',
                              how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(
                u"Warning : no link between pac and noindiv for both pacInd1&2"
            )
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(
        pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(
        pacInd.type_pac.value_counts(dropna=False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(
        pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(
        pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype(
        'int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values))
                        & (indivi.persfip == "vous")]
    individec1 = individec1[[
        "declar1", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]].copy()
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) &
        (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype(
        'float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    # TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis: clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(subset=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(
        fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def create_indivim_menagem(temporary_store = None, year = None):
    """
    Création des tables ménages et individus concaténée (merged)
    """
    # Prepare the some useful merged tables

    assert temporary_store is not None
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table = year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table = year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table = year_specific_by_generic["eec_indivi"])

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()

    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset = 'ident', take_last = True)
    # TODO: vérifier qu'il n'y a théoriquement pas de doublon

    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # fusion enquete emploi et source fiscale
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how = "inner")

    # optimisation des types? Controle de l'existence en passant
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
        ])

    for var in var_list:
        assert numpy.issubdtype(indivim[var].dtype , numpy.integer), "Variable {} dtype is {} and should be an integer".format(
            var, indivim[var].dtype)

    ########################
    # création de variables#
    ########################


  #  print indivim
#   actrec : activité recodée comme preconisé par l'INSEE p84 du guide utilisateur
    indivim["actrec"] = numpy.nan
    # Attention : Q: pas de 6 ?!! A : Non pas de 6, la variable recodée de l'INSEE (voit p84 du guide methodo), ici \
    # la même nomenclature à été adopée
    # 3: contrat a durée déterminée
    indivim.actrec.loc[indivim.acteu == 1] = 3
    # 8 : femme (homme) au foyer, autre inactif
    indivim.actrec.loc[indivim.acteu == 3] = 8
    # 1 : actif occupé non salarié
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin([1, 3]))  # actifs occupés non salariés à son compte ou pour un
    indivim.actrec.loc[filter1] = 1                              # membre de sa famille
    # 2 : salarié pour une durée non limitée
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) | (indivim.titc == 2))
    indivim.actrec.loc[filter2] = 2
    # 4 : au chomage
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) & (indivim.mrec == 1))
    indivim.actrec.loc[filter4] = 4
    # 5 : élève étudiant , stagiaire non rémunéré
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1))
    indivim.actrec.loc[filter5] = 5
    # 7 : retraité, préretraité, retiré des affaires unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2))
    indivim.actrec.loc[filter7] = 7
    # 9 : probablement enfants de - de 16 ans TODO: check that fact in database and questionnaire
    indivim.actrec.loc[indivim.acteu == 0] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(1, 10)).all(), 'actrec values are outside the interval [1, 9]'

#   TODO : compare the result with results provided by Insee
#   tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Locataire
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")

    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def create_enfants_a_naitre(temporary_store = None, year = None):
    '''
    '''
    assert temporary_store is not None
    assert year is not None

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Enfant à naître (NN pour nouveaux nés)
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    eeccmp1 = survey.get_values(table = year_specific_by_generic["eec_cmp_1"], variables = individual_vars)
    eeccmp2 = survey.get_values(table = year_specific_by_generic["eec_cmp_2"], variables = individual_vars)
    eeccmp3 = survey.get_values(table = year_specific_by_generic["eec_cmp_3"], variables = individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how = "outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how = "outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp
    gc.collect()

    # création de variables
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7,'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # selection
    enfants_a_naitre = enfants_a_naitre[
        (
            (enfants_a_naitre.naia == enfants_a_naitre.year) & (enfants_a_naitre.naim >= 10)
            ) | (
                (enfants_a_naitre.naia == enfants_a_naitre.year + 1) & (enfants_a_naitre.naim <= 5)
                )
        ].copy()

    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
示例#13
0
"""
Created on Mon Sep 28 11:25:40 2015

@author: malkaguillot
"""
import pandas

from openfisca_france_data import default_config_files_directory as config_files_directory
from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \
    import year_specific_by_generic_data_frame_name
from openfisca_france_data.temporary import get_store
from openfisca_survey_manager.survey_collections import SurveyCollection

# En entrée : tables individus, foyer et sif de ERFS (testé sur 2009)
year = 2009
year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

erfs_survey_collection = SurveyCollection.load(
    collection='erfs', config_files_directory=config_files_directory)
survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
foyer_all = survey.get_values(table=year_specific_by_generic["foyer"])
erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])

temporary_store = get_store(file_name='erfs')
sif = temporary_store['sif']

ind = erfind[[
    'ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd'
]]
small_sif = sif[['noindiv', 'declar', 'causeXYZ']]
foyer = foyer_all[[