revenu_personne.columns = header revenu_personne.rename(columns={'IRIS': 'CODGEO'}, inplace=True) # to get real values revenu_personne = revenu_personne[6:] # creating new feature : sum of all feature features = [ x for x in header if x not in ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010'] ] # special list for this file # No need to sum features here (% and quantile) features.append('CODGEO') print "il y a %d iris différentes pour le revenu par personne et %d features" % ( len(revenu_personne.CODGEO.unique()), len(features) - 1) revenu_personne['LIBCOM'] = revenu_personne['LIBCOM'].str.replace(u' - ', u'-') data = fillna_with_other_table(data, revenu_personne, 'CODGEO') compare_geo(data, revenu_personne) data = pd.merge(data, revenu_personne[features], on='CODGEO', how='outer') ## Revenu par unité de consomation revenu_uc = pd.read_excel( 'data/RFDU2011IRI.xls', sheetname=1) #using int cause name of sheetname have some "é" # creating header from file header = revenu_uc.loc[5].tolist() revenu_uc.columns = header revenu_uc.rename(columns={'IRIS': 'CODGEO'}, inplace=True) # to get real values revenu_uc = revenu_uc[6:] # creating new feature : sum of all feature features = [
## Income per person (revenu par personne)
# The sheet is selected by position (1) because its name contains "é".
# `sheet_name` replaces the deprecated `sheetname` keyword (pandas >= 0.21).
revenu_personne = pd.read_excel('data/RFDP2011IRI.xls', sheet_name=1)
# The row labelled 5 carries the real column names: promote it to the header.
header = revenu_personne.loc[5].tolist()
revenu_personne.columns = header
# Expose the IRIS code under 'CODGEO', the key used to join with `data`.
revenu_personne.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
# Drop the first six rows so that only the real values remain.
revenu_personne = revenu_personne[6:]
# Feature columns: every header entry except the identifier/label columns
# listed below (list specific to this file). The values are percentages and
# quantiles, so no summed feature is derived here.
features = [
    x for x in header
    if x not in ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010']
]
features.append('CODGEO')  # merge key; excluded from the count printed below
# Single-argument call form: behaves the same as a statement on Python 2 and
# as a function call on Python 3.
print("il y a %d iris différentes pour le revenu par personne et %d features" % (
    len(revenu_personne.CODGEO.unique()), len(features) - 1))
# Normalise ' - ' to '-' in the commune labels.
revenu_personne['LIBCOM'] = revenu_personne['LIBCOM'].str.replace(u' - ', u'-')
data = fillna_with_other_table(data, revenu_personne, 'CODGEO')
compare_geo(data, revenu_personne)
data = pd.merge(data, revenu_personne[features], on='CODGEO', how='outer')

## Income per consumption unit (revenu par unité de consommation)
# Same layout as the per-person file: sheet chosen by position, header taken
# from the row labelled 5, values starting at row 6.
revenu_uc = pd.read_excel('data/RFDU2011IRI.xls', sheet_name=1)
header = revenu_uc.loc[5].tolist()
revenu_uc.columns = header
revenu_uc.rename(columns={'IRIS': 'CODGEO'}, inplace=True)
revenu_uc = revenu_uc[6:]
# Feature columns, using the same file-specific exclusion list as above.
features = [
    x for x in header
    if x not in ['IRIS', 'LIBIRIS', 'COM', 'LIBCOM', 'REG', 'DEP', 'ARR', 'CV', 'ZE2010']
]
# No need to sum features here (% and quantile)
] # Sum NB_F101 to NB_F118 sport['nb_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and len(x) == 7]]\ .applymap(lambda x: float(x)).sum(axis=1) # Sum NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU sport['nb_airjeu_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and x[-10:] == 'NB_AIREJEU']]\ .applymap(lambda x: float(x)).sum(axis=1) [features.append(i) for i in ['nb_sport', 'IRIS', 'nb_airjeu_sport']] print("il y a %d iris différentes pour le sport et %d features" % (len(sport.IRIS.unique()), len(features) - 1)) compare_geo(data, sport) data = pd.merge(data, sport[features], on='IRIS', how='outer') # Adding new IRIS, filling geo information with new files data = fillna_with_other_table(data, sport, 'IRIS', columns=['COM', 'DEP', 'REG']) _check_data(data, "Sport 16") del sport ## Enseignement 1er degré enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra-2016.xls', sheet_name='IRIS') # creating header from file header = enseignement_1.loc[4].tolist() enseignement_1.columns = header # to get real values enseignement_1 = enseignement_1[5:] # creating new feature : sum all features non aggregated features = [ x for x in header if x not in
sport = sport[5:] # creating new feature : sum all features non aggregated features = [x for x in header if x not in ['IRIS','LIB_IRIS','COM','LIB_COM','REG','REG2016','DEP']] # Sum NB_F101 to NB_F118 sport['nb_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and len(x) == 7]]\ .applymap(lambda x: float(x)).sum(axis=1) # Sum NB_F101_NB_AIREJEU to NB_F118_NB_AIREJEU sport['nb_airjeu_sport'] = sport[[x for x in sport.columns if x[:5] == 'NB_F1' and x[-10:] == 'NB_AIREJEU']]\ .applymap(lambda x: float(x)).sum(axis=1) [features.append(i) for i in ['nb_sport', 'IRIS', 'nb_airjeu_sport']] print("il y a %d iris différentes pour le sport et %d features" % (len(sport.IRIS.unique()), len(features) - 1)) compare_geo(data, sport) data = pd.merge(data, sport[features], on='IRIS', how='outer') # Adding new IRIS, filling geo information with new files data = fillna_with_other_table(data, sport, 'IRIS', columns=['COM', 'DEP', 'REG']) _check_data(data, "Sport 16") del sport ## Enseignement 1er degré enseignement_1 = pd.read_excel('data/equip-serv-ens-1er-degre-infra-2016.xls', sheetname='IRIS') # creating header from file header = enseignement_1.loc[4].tolist() enseignement_1.columns = header # to get real values enseignement_1 = enseignement_1[5:] # creating new feature : sum all features non aggregated features = [x for x in header if x not in ['IRIS','LIB_IRIS','COM','LIB_COM','REG','REG2016','DEP']] # Sum NB_C101 to NB_C105 enseignement_1['nb_enseignement_1'] = enseignement_1[[x for x in enseignement_1.columns if x[:2] == 'C1' and len(x) == 4]]\