def build_erf_aggregates(variables=None, year=2006, unit=1e6):
    """
    Fetch the relevant aggregates from erf data.

    The requested variables are read from the ``erf_menage`` table of the
    ``erfs_<year>`` survey. Every column is multiplied by the ``wprm``
    column, then each column other than ``ident`` and ``wprm`` is replaced
    by its weighted total divided by ``unit``.

    :param variables: names of the variables to aggregate. They are
        translated through ``get_of2erf()`` only when *all* of them are keys
        of that mapping. ``"wprm"`` is added when missing. The caller's
        list is never modified. ``None`` is forwarded unchanged to
        ``get_values``.
    :param year: survey year, used to build the ``erfs_<year>`` survey name.
    :param unit: divisor applied to every weighted total (default ``1e6``).
    :return: the leading rows of the aggregated data frame, with columns
        renamed through ``get_erf2of()``.
    """
    erfs_survey_collection = SurveyCollection.load(collection="erfs")
    erfs_survey = erfs_survey_collection.surveys["erfs_{}".format(year)]

    of2erf = get_of2erf()
    erf2of = get_erf2of()

    if variables is not None:
        # Work on a copy so that appending "wprm" below never leaks into the
        # list owned by the caller.
        variables = list(variables)
        if set(variables) <= set(of2erf.keys()):
            variables = [of2erf[variable] for variable in variables]
        if "wprm" not in variables:
            variables.append("wprm")
    log.info("Fetching aggregates from erfs {} data".format(year))

    df = erfs_survey.get_values(variables=variables, table="erf_menage")

    df.rename(columns=erf2of, inplace=True)
    wprm = df["wprm"]
    for col in df.columns:
        # Best effort: columns that cannot be cast to float keep their dtype.
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            log.debug("Column {} could not be converted to float64".format(col))
    df = df.mul(wprm, axis=0)
    for col in set(df.columns) - set(['ident', 'wprm']):
        # Best effort: columns that cannot be summed are left as they are.
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            log.debug("Column {} could not be aggregated".format(col))

    # Every aggregated column now holds the same value on all rows, so only
    # the leading rows are returned.
    # NOTE(review): with an integer index ``.ix[0:1]`` slices by label and is
    # end-inclusive, so this presumably returns two rows rather than one;
    # ``.ix`` has also been removed from recent pandas -- confirm with callers
    # before switching to ``.iloc``.
    return df.ix[0:1]  # Aggregate so we only need 1 row
    def build_erf_data_frames(self):
        """
        Load the erfs data frames and store them by entity on ``self``.

        The variables listed in ``self.columns_to_fetch`` (plus ``ident``,
        ``wprm``, ``quelfic`` and ``noi``) are translated through
        ``get_of2erf()`` where a translation exists, looked up in the
        year-specific counterparts of the ``eec_indivi``, ``erf_indivi`` and
        ``erf_menage`` tables, and fetched table by table. Columns are then
        renamed through ``get_erf2of()`` and ``ident`` becomes ``idmen``.

        The result is written to ``self.erf_data_frame_by_entity_key_plural``
        with the keys ``menages`` and ``individus``; nothing is returned.

        NOTE(review): ``year`` and ``config_files_directory`` are not bound
        inside this method and are resolved from an enclosing scope --
        confirm where they are expected to come from.
        """
        # TODO: remove this
        # Overrides whatever the caller configured in ``columns_to_fetch``.
        self.columns_to_fetch = ['af']
        variables = self.columns_to_fetch
        erf_survey_collection = SurveyCollection.load(
            collection = "erfs", config_files_directory = config_files_directory)
        erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
        year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
        # Reverse mapping: year-specific table name -> generic table name.
        generic_by_year_specific = {
            year_specific: generic
            for generic, year_specific in year_specific_by_generic.items()
            }

        of2erf = get_of2erf()
        erf_variables = [
            of2erf[variable] if variable in of2erf else variable
            for variable in set(variables + ["ident", "wprm", "quelfic", "noi"])
            ]
        data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
        erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])

        # Loop-invariant: the year-specific names of the tables of interest.
        wanted_year_specific_tables = set(
            year_specific_by_generic[key] for key in erf_variables_by_generic_table
            )
        # Keyed by variable, so a name produced twice by the translation is
        # only looked up and registered once.
        year_specific_tables_by_erf_variable = {
            erf_variable: set(
                erf_survey.find_tables(variable = erf_variable)
                ).intersection(wanted_year_specific_tables)
            for erf_variable in erf_variables
            }
        for variable, year_specific_tables in year_specific_tables_by_erf_variable.items():
            if not year_specific_tables:
                log.info("No tables are present for variable {}".format(variable))
                continue
            if len(year_specific_tables) > 1:
                # Only report "multiple tables" when that is actually the case.
                log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables))
            for table in year_specific_tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)

        erf2of = get_erf2of()

        for table, table_variables in erf_variables_by_generic_table.items():
            if not table_variables:
                continue
            data_frame = erf_survey.get_values(
                variables = table_variables, table = year_specific_by_generic[table]
                )
            data_frame.rename(columns = erf2of, inplace = True)
            data_frame.rename(columns = {'ident': 'idmen'}, inplace = True)
            data_frame_by_table[table] = data_frame

        # NOTE(review): ``duplicated()`` without a subset flags fully identical
        # rows, not just repeated ``idmen`` values -- confirm this is the
        # intended check.
        assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
        self.erf_data_frame_by_entity_key_plural = dict(
            menages = data_frame_by_table["erf_menage"],
            individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"])
            )
    def build_erf_data_frames(self):
        """
        Load the erfs household and individual data frames onto ``self``.

        The variables listed in ``self.columns_to_fetch`` (plus ``ident``,
        ``wprm``, ``quelfic`` and ``noi``) are translated through
        ``get_of2erf()`` where a translation exists, looked up in the
        ``eec_indivi``, ``erf_indivi`` and ``erf_menage`` tables, and fetched
        table by table. Columns are then renamed through ``get_erf2of()`` and
        ``ident`` becomes ``idmen``.

        Sets ``self.erf_menages_data_frame`` (the ``erf_menage`` frame) and
        ``self.erf_eec_individus_data_frame`` (``erf_indivi`` merged with
        ``eec_indivi``); nothing is returned.

        NOTE(review): ``year`` is not bound inside this method and is
        resolved from an enclosing scope -- confirm where it should come from.
        """
        variables = self.columns_to_fetch
        erf_survey_collection = SurveyCollection.load(collection="erfs")
        erf_survey = erf_survey_collection.surveys["erfs_{}".format(year)]

        of2erf = get_of2erf()
        erf_variables = [
            of2erf[variable] if variable in of2erf else variable
            for variable in set(variables + ["ident", "wprm", "quelfic", "noi"])
        ]
        data_frame_by_table = dict(eec_indivi=None,
                                   erf_indivi=None,
                                   erf_menage=None)
        erf_variables_by_table = dict(eec_indivi=[],
                                      erf_indivi=[],
                                      erf_menage=[])

        # Loop-invariant: the only tables this method reads from.
        wanted_tables = set(erf_variables_by_table.keys())
        # Keyed by variable, so a name produced twice by the translation is
        # only looked up and registered once.
        table_by_erf_variable = {
            erf_variable:
            set(erf_survey.find_tables(variable=erf_variable)).intersection(
                wanted_tables)
            for erf_variable in erf_variables
        }
        for variable, tables in table_by_erf_variable.items():
            if not tables:
                log.info(
                    "No tables are present for variable {}".format(variable))
                continue
            if len(tables) > 1:
                # Only report "multiple tables" when that is actually the case.
                log.info(
                    "Variable {} is present in multiple tables : {}".format(
                        variable, tables))
            for table in tables:
                log.info("Variable {} is retrieved from table {}".format(
                    variable, table))
                erf_variables_by_table[table].append(variable)

        erf2of = get_erf2of()
        for table, table_variables in erf_variables_by_table.items():
            if not table_variables:
                continue
            data_frame = erf_survey.get_values(variables=table_variables,
                                               table=table)
            data_frame.rename(columns=erf2of, inplace=True)
            data_frame.rename(columns={'ident': 'idmen'}, inplace=True)
            data_frame_by_table[table] = data_frame

        # NOTE(review): ``duplicated()`` without a subset flags fully identical
        # rows, not just repeated ``idmen`` values -- confirm this is the
        # intended check.
        assert not data_frame_by_table["erf_menage"].duplicated().any(
        ), "Duplicated idmen in erf_menage"
        self.erf_menages_data_frame = data_frame_by_table["erf_menage"]
        self.erf_eec_individus_data_frame = data_frame_by_table[
            "erf_indivi"].merge(data_frame_by_table["eec_indivi"], )
    def build_erf_data_frames(self):
        """
        Load the erfs household and individual data frames onto ``self``.

        The variables listed in ``self.columns_to_fetch`` (plus ``ident``,
        ``wprm``, ``quelfic`` and ``noi``) are translated through
        ``get_of2erf()`` where a translation exists, looked up in the
        ``eec_indivi``, ``erf_indivi`` and ``erf_menage`` tables, and fetched
        table by table. Columns are then renamed through ``get_erf2of()`` and
        ``ident`` becomes ``idmen``.

        Sets ``self.erf_menages_data_frame`` (the ``erf_menage`` frame) and
        ``self.erf_eec_individus_data_frame`` (``erf_indivi`` merged with
        ``eec_indivi``); nothing is returned.

        NOTE(review): ``year`` is not bound inside this method and is
        resolved from an enclosing scope -- confirm where it should come from.
        """
        variables = self.columns_to_fetch
        erf_survey_collection = SurveyCollection.load(collection = "erfs")
        erf_survey = erf_survey_collection.surveys["erfs_{}".format(year)]

        of2erf = get_of2erf()
        erf_variables = [
            of2erf[variable] if variable in of2erf else variable
            for variable in set(variables + ["ident", "wprm", "quelfic", "noi"])
            ]
        data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
        erf_variables_by_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])

        # Loop-invariant: the only tables this method reads from.
        wanted_tables = set(erf_variables_by_table.keys())
        # Keyed by variable, so a name produced twice by the translation is
        # only looked up and registered once.
        table_by_erf_variable = {
            erf_variable: set(
                erf_survey.find_tables(variable = erf_variable)
                ).intersection(wanted_tables)
            for erf_variable in erf_variables
            }
        for variable, tables in table_by_erf_variable.items():
            if not tables:
                log.info("No tables are present for variable {}".format(variable))
                continue
            if len(tables) > 1:
                # Only report "multiple tables" when that is actually the case.
                log.info("Variable {} is present in multiple tables : {}".format(variable, tables))
            for table in tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_table[table].append(variable)

        erf2of = get_erf2of()
        for table, table_variables in erf_variables_by_table.items():
            if not table_variables:
                continue
            data_frame = erf_survey.get_values(variables = table_variables, table = table)
            data_frame.rename(columns = erf2of, inplace = True)
            data_frame.rename(columns = {'ident': 'idmen'}, inplace = True)
            data_frame_by_table[table] = data_frame

        # NOTE(review): ``duplicated()`` without a subset flags fully identical
        # rows, not just repeated ``idmen`` values -- confirm this is the
        # intended check.
        assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
        self.erf_menages_data_frame = data_frame_by_table["erf_menage"]
        self.erf_eec_individus_data_frame = data_frame_by_table["erf_indivi"].merge(
            data_frame_by_table["eec_indivi"],
            )
def build_erf_aggregates(variables = None, year = 2006, unit = 1e6):
    """
    Fetch the relevant aggregates from erf data.

    The requested variables are read from the ``erf_menage`` table of the
    ``erfs_<year>`` survey. Every column is multiplied by the ``wprm``
    column, then each column other than ``ident`` and ``wprm`` is replaced
    by its weighted total divided by ``unit``.

    :param variables: names of the variables to aggregate. They are
        translated through ``get_of2erf()`` only when *all* of them are keys
        of that mapping. ``"wprm"`` is added when missing. The caller's
        list is never modified. ``None`` is forwarded unchanged to
        ``get_values``.
    :param year: survey year, used to build the ``erfs_<year>`` survey name.
    :param unit: divisor applied to every weighted total (default ``1e6``).
    :return: the leading rows of the aggregated data frame, with columns
        renamed through ``get_erf2of()``.
    """
    erfs_survey_collection = SurveyCollection.load(collection = "erfs")
    erfs_survey = erfs_survey_collection.surveys["erfs_{}".format(year)]

    of2erf = get_of2erf()
    erf2of = get_erf2of()

    if variables is not None:
        # Work on a copy so that appending "wprm" below never leaks into the
        # list owned by the caller.
        variables = list(variables)
        if set(variables) <= set(of2erf.keys()):
            variables = [of2erf[variable] for variable in variables]
        if "wprm" not in variables:
            variables.append("wprm")
    log.info("Fetching aggregates from erfs {} data".format(year))

    df = erfs_survey.get_values(variables = variables, table = "erf_menage")

    df.rename(columns = erf2of, inplace = True)
    wprm = df["wprm"]
    for col in df.columns:
        # Best effort: columns that cannot be cast to float keep their dtype.
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            log.debug("Column {} could not be converted to float64".format(col))
    df = df.mul(wprm, axis = 0)
    for col in set(df.columns) - set(['ident', 'wprm']):
        # Best effort: columns that cannot be summed are left as they are.
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            log.debug("Column {} could not be aggregated".format(col))

    # Every aggregated column now holds the same value on all rows, so only
    # the leading rows are returned.
    # NOTE(review): with an integer index ``.ix[0:1]`` slices by label and is
    # end-inclusive, so this presumably returns two rows rather than one;
    # ``.ix`` has also been removed from recent pandas -- confirm with callers
    # before switching to ``.iloc``.
    return df.ix[0:1] # Aggregate so we only need 1 row