def build_erf_aggregates(variables=None, year=2006, unit=1e6):
    """
    Fetch the relevant aggregates from erf data

    The "erfs_<year>" survey of the "erfs" collection is queried on its
    "erf_menage" table.  Every fetched column is cast to float64 when the
    cast succeeds, multiplied row-wise by the "wprm" column, and then every
    column except 'ident' and 'wprm' is replaced by its total divided by
    `unit`.

    :param variables: names of the variables to aggregate.  When all of them
        are keys of the of2erf mapping they are translated through it before
        fetching.  "wprm" is always added to the request.  ``None`` is passed
        through unchanged to ``get_values`` (presumably meaning "all
        variables" -- TODO confirm against the survey API).
    :param year: year used to build the survey name "erfs_<year>".
    :param unit: divisor applied to the weighted totals (1e6 by default).
    :return: the first row of the aggregated data frame; aggregated columns
        hold the same value on every row, so one row is enough.
    """
    erfs_survey_collection = SurveyCollection.load(collection="erfs")
    erfs_survey = erfs_survey_collection.surveys["erfs_{}".format(year)]
    of2erf = get_of2erf()
    erf2of = get_erf2of()

    if variables is not None:
        # Work on a copy so the caller's list is never mutated.
        variables = list(variables)
        if set(variables) <= set(of2erf):
            variables = [of2erf[variable] for variable in variables]
        if "wprm" not in variables:
            variables.append("wprm")

    log.info("Fetching aggregates from erfs {} data".format(year))
    df = erfs_survey.get_values(variables=variables, table="erf_menage")
    df.rename(columns=erf2of, inplace=True)
    wprm = df["wprm"]

    # Best-effort numeric conversion: columns that cannot be cast are kept
    # as they are.
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            log.debug("Column {} could not be cast to float64".format(col))

    # NOTE(review): this also scales the 'ident' and 'wprm' columns
    # themselves; they are simply left out of the aggregation below.
    df = df.mul(wprm, axis=0)
    for col in set(df.columns) - {'ident', 'wprm'}:
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            log.debug("Column {} could not be aggregated".format(col))

    # Aggregate so we only need 1 row.  A positional slice is used because
    # `.ix` is gone from pandas and was label-based on integer indexes.
    return df.iloc[0:1]
def build_erf_data_frames(self):
    """
    Load the erfs data frames holding the variables listed in
    self.columns_to_fetch and store them on the instance.

    The requested variables (plus "ident", "wprm", "quelfic" and "noi") are
    translated through the of2erf mapping, located in the year-specific
    counterparts of the generic tables eec_indivi, erf_indivi and erf_menage,
    then fetched table by table.  Fetched columns are renamed through the
    erf2of mapping and 'ident' becomes 'idmen'.

    The result is stored in self.erf_data_frame_by_entity_key_plural with two
    keys: "menages" (the erf_menage frame) and "individus" (erf_indivi merged
    with eec_indivi).

    NOTE(review): `year` and `config_files_directory` are not parameters;
    they are resolved from the enclosing scope -- confirm they are defined.

    :raises AssertionError: when the erf_menage frame has duplicated rows.
    """
    # TODO: remove this
    self.columns_to_fetch = ['af']
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(
        collection = "erfs", config_files_directory = config_files_directory)
    erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    generic_by_year_specific = {
        year_specific: generic
        for generic, year_specific in year_specific_by_generic.items()
        }

    erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
    of2erf = get_of2erf()
    # Translate the names that have an erf counterpart, keep the others.
    erf_variables = [
        of2erf[variable] if variable in of2erf else variable
        for variable in erf_variables
        ]

    data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
    erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])

    # Year-specific names of the tables of interest; the same for every
    # variable, so built once.
    candidate_tables = set(
        year_specific_by_generic[generic] for generic in erf_variables_by_generic_table
        )
    year_specific_tables_by_erf_variable = {
        erf_variable: set(erf_survey.find_tables(variable = erf_variable)) & candidate_tables
        for erf_variable in erf_variables
        }

    for variable, year_specific_tables in year_specific_tables_by_erf_variable.items():
        if not year_specific_tables:
            log.info("No tables are present for variable {}".format(variable))
            continue
        if len(year_specific_tables) > 1:
            # Only report multiplicity when there really is more than one table.
            log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables))
        # The variable is requested from every table that contains it.
        for table in year_specific_tables:
            log.info("Variable {} is retrieved from table {}".format(variable, table))
            erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)

    erf2of = get_erf2of()
    for table, table_variables in erf_variables_by_generic_table.items():
        if not table_variables:
            # Nothing to fetch: the entry stays None.
            continue
        data_frame = erf_survey.get_values(
            variables = table_variables, table = year_specific_by_generic[table]
            )
        data_frame.rename(columns = erf2of, inplace = True)
        data_frame.rename(columns = {'ident': 'idmen'}, inplace = True)
        data_frame_by_table[table] = data_frame

    # NOTE(review): this checks fully duplicated rows, not duplicated idmen
    # values as the message suggests -- confirm which one is intended.
    assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
    self.erf_data_frame_by_entity_key_plural = dict(
        menages = data_frame_by_table["erf_menage"],
        individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"])
        )
def build_erf_data_frames(self):
    """
    Load the erfs data frames holding the variables listed in
    self.columns_to_fetch and store them on the instance.

    The requested variables (plus "ident", "wprm", "quelfic" and "noi") are
    translated through the of2erf mapping, located in the tables eec_indivi,
    erf_indivi and erf_menage of the "erfs_<year>" survey, then fetched table
    by table.  Fetched columns are renamed through the erf2of mapping and
    'ident' becomes 'idmen'.

    Sets self.erf_menages_data_frame (the erf_menage frame) and
    self.erf_eec_individus_data_frame (erf_indivi merged with eec_indivi).

    NOTE(review): `year` is not a parameter; it is resolved from the
    enclosing scope -- confirm it is defined.

    :raises AssertionError: when the erf_menage frame has duplicated rows.
    """
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(collection="erfs")
    erf_survey = erf_survey_collection.surveys["erfs_{}".format(year)]

    erf_variables = list(
        set(variables + ["ident", "wprm", "quelfic", "noi"]))
    of2erf = get_of2erf()
    # Translate the names that have an erf counterpart, keep the others.
    erf_variables = [
        of2erf[variable] if variable in of2erf else variable
        for variable in erf_variables
    ]

    data_frame_by_table = dict(eec_indivi=None, erf_indivi=None,
                               erf_menage=None)
    erf_variables_by_table = dict(eec_indivi=[], erf_indivi=[],
                                  erf_menage=[])

    # Tables of interest; the same for every variable, so built once.
    candidate_tables = set(erf_variables_by_table)
    table_by_erf_variable = {
        erf_variable:
            set(erf_survey.find_tables(variable=erf_variable))
            & candidate_tables
        for erf_variable in erf_variables
    }

    for variable, tables in table_by_erf_variable.items():
        if not tables:
            log.info(
                "No tables are present for variable {}".format(variable))
            continue
        if len(tables) > 1:
            # Only report multiplicity when there really is more than one
            # table.
            log.info(
                "Variable {} is present in multiple tables : {}".format(
                    variable, tables))
        # The variable is requested from every table that contains it.
        for table in tables:
            log.info("Variable {} is retrieved from table {}".format(
                variable, table))
            erf_variables_by_table[table].append(variable)

    erf2of = get_erf2of()
    for table, table_variables in erf_variables_by_table.items():
        if not table_variables:
            # Nothing to fetch: the entry stays None.
            continue
        data_frame = erf_survey.get_values(
            variables=table_variables, table=table)
        data_frame.rename(columns=erf2of, inplace=True)
        data_frame.rename(columns={'ident': 'idmen'}, inplace=True)
        data_frame_by_table[table] = data_frame

    # NOTE(review): this checks fully duplicated rows, not duplicated idmen
    # values as the message suggests -- confirm which one is intended.
    assert not data_frame_by_table["erf_menage"].duplicated().any(
    ), "Duplicated idmen in erf_menage"
    self.erf_menages_data_frame = data_frame_by_table["erf_menage"]
    self.erf_eec_individus_data_frame = data_frame_by_table[
        "erf_indivi"].merge(data_frame_by_table["eec_indivi"])
def build_erf_data_frames(self):
    """
    Build the household and individual erfs data frames for the variables
    listed in self.columns_to_fetch.

    "ident", "wprm", "quelfic" and "noi" are always added to the request.
    Names found in the of2erf mapping are translated, each variable is looked
    up in the eec_indivi, erf_indivi and erf_menage tables of the
    "erfs_<year>" survey, and every table containing at least one requested
    variable is fetched.  Columns are renamed back through the erf2of mapping
    and 'ident' is renamed 'idmen'.

    Results are stored in self.erf_menages_data_frame (erf_menage) and
    self.erf_eec_individus_data_frame (erf_indivi merged with eec_indivi).

    NOTE(review): `year` is not a parameter; it is resolved from the
    enclosing scope -- confirm it is defined.

    :raises AssertionError: when the erf_menage frame has duplicated rows.
    """
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(collection = "erfs")
    erf_survey = erf_survey_collection.surveys["erfs_{}".format(year)]

    requested = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
    of2erf = get_of2erf()
    # Use the erf name when one exists, otherwise keep the name as given.
    requested = [of2erf[name] if name in of2erf else name for name in requested]

    data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
    erf_variables_by_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])

    # Tables of interest; identical for every variable, so built once.
    known_tables = set(erf_variables_by_table)
    table_by_erf_variable = {
        name: set(erf_survey.find_tables(variable = name)) & known_tables
        for name in requested
        }

    for variable, tables in table_by_erf_variable.items():
        if not tables:
            log.info("No tables are present for variable {}".format(variable))
            continue
        if len(tables) > 1:
            # Report multiplicity only when more than one table holds the variable.
            log.info("Variable {} is present in multiple tables : {}".format(variable, tables))
        # The variable is requested from every table that contains it.
        for table in tables:
            log.info("Variable {} is retrieved from table {}".format(variable, table))
            erf_variables_by_table[table].append(variable)

    erf2of = get_erf2of()
    for table, table_variables in erf_variables_by_table.items():
        if not table_variables:
            # No requested variable lives in this table: leave the entry as None.
            continue
        frame = erf_survey.get_values(variables = table_variables, table = table)
        frame.rename(columns = erf2of, inplace = True)
        frame.rename(columns = {'ident': 'idmen'}, inplace = True)
        data_frame_by_table[table] = frame

    # NOTE(review): this checks fully duplicated rows, not duplicated idmen
    # values as the message suggests -- confirm which one is intended.
    assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
    self.erf_menages_data_frame = data_frame_by_table["erf_menage"]
    self.erf_eec_individus_data_frame = data_frame_by_table["erf_indivi"].merge(
        data_frame_by_table["eec_indivi"],
        )
def build_erf_aggregates(variables = None, year = 2006, unit = 1e6):
    """
    Fetch the relevant aggregates from erf data

    Reads the "erf_menage" table of the "erfs_<year>" survey (collection
    "erfs"), casts each column to float64 when possible, weights every column
    by "wprm" and replaces every column other than 'ident' and 'wprm' by its
    weighted total expressed in `unit`.

    :param variables: names of the variables to aggregate.  If every name is
        a key of the of2erf mapping, the names are translated through it.
        "wprm" is appended when missing.  ``None`` is handed to
        ``get_values`` untouched (presumably "fetch everything" -- TODO
        confirm against the survey API).
    :param year: year of the survey, used to build the name "erfs_<year>".
    :param unit: divisor of the weighted totals; defaults to 1e6.
    :return: a one-row data frame; aggregated columns are constant over rows,
        so the first row carries all the information.
    """
    erfs_survey_collection = SurveyCollection.load(collection = "erfs")
    erfs_survey = erfs_survey_collection.surveys["erfs_{}".format(year)]
    of2erf = get_of2erf()
    erf2of = get_erf2of()

    if variables is not None:
        # Copy first: the list received from the caller must stay untouched.
        variables = list(variables)
        if set(variables) <= set(of2erf):
            variables = [of2erf[variable] for variable in variables]
        if "wprm" not in variables:
            variables.append("wprm")

    log.info("Fetching aggregates from erfs {} data".format(year))
    df = erfs_survey.get_values(variables = variables, table = "erf_menage")
    df.rename(columns = erf2of, inplace = True)
    wprm = df["wprm"]

    # Columns that cannot be converted are deliberately left as they are.
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except (TypeError, ValueError):
            log.debug("Column {} could not be cast to float64".format(col))

    # NOTE(review): 'ident' and 'wprm' get multiplied by the weights as well;
    # they are only excluded from the summation below.
    df = df.mul(wprm, axis = 0)
    for col in set(df.columns) - {'ident', 'wprm'}:
        try:
            df[col] = df[col].sum() / unit
        except (TypeError, ValueError):
            log.debug("Column {} could not be aggregated".format(col))

    # Aggregate so we only need 1 row.  `.iloc` replaces the removed `.ix`
    # and guarantees a positional, single-row slice.
    return df.iloc[0:1]