def tee_file_parser(excelfile_name):
    """Parse the two TEE sheets of an Excel file into a single DataFrame.

    The sheets at index 1 and 2 (as passed to ``pandas.read_excel``) are read
    without a header row and labelled with fixed column names. Rows coming
    from sheet 1 get ``ressources = False``, rows from sheet 2 get
    ``ressources = True``. Both frames are tagged with the metadata returned
    by ``file_infos`` before being stacked.

    Parameters
    ----------
    excelfile_name : str
        Path of the Excel file to parse.

    Returns
    -------
    pandas.DataFrame
        Sheet 1 rows followed by sheet 2 rows, re-indexed from 0.

    Raises
    ------
    AssertionError
        If ``excelfile_name`` does not exist on disk.
    """
    # NOTE(review): a second definition of tee_file_parser follows this one in
    # the file and rebinds the name; one of the two should be removed.
    infos = file_infos(excelfile_name)

    col_names_actifs = [
        'code', 'description', 'S1', 'S11', 'S12', 'S13', 'S14', 'S15',
        'impots', 'S2', 'biens_services', 'total',
    ]
    # Same value columns in reverse order, with code/description at the end.
    col_names_passifs = [
        'total', 'biens_services', 'S2', 'impots', 'S15', 'S14', 'S13', 'S12',
        'S11', 'S1', 'code', 'description',
    ]

    # Options shared by both sheets; cells holding '0' are read as missing.
    read_options = dict(
        header=None,
        skiprows=0,  # previously 4
        skip_footer=0,  # previously 2 but unnecessary since cleaner
        index_col=None,
        parse_cols="A:end",
        na_values=['0'],
    )

    # Use the module logger (as file_parser does) instead of a bare print.
    log.info('Parsing {}'.format(excelfile_name))
    assert os.path.exists(excelfile_name), (
        'Cannot find file {}. Use cn_downloader to load and unzip the raw CN files'.format(excelfile_name)
    )

    df_ea = pandas.read_excel(excelfile_name, sheetname=1, names=col_names_actifs, **read_options)
    df_rp = pandas.read_excel(excelfile_name, sheetname=2, names=col_names_passifs, **read_options)
    df_ea['ressources'] = False
    df_rp['ressources'] = True

    year = int(infos['year'])
    for df in (df_ea, df_rp):
        df['year'] = year
        df['version'] = infos['version']
        df['source'] = infos['source']
        df['link'] = infos['link']
        df['file_title'] = infos['title']
        df['file_name'] = infos['filename']

    return df_ea.append(df_rp, ignore_index=True)
def tee_file_parser(excelfile_name):
    """Read both TEE sheets of an Excel file and stack them in one DataFrame.

    ``pandas.read_excel`` is called for sheet index 1 and sheet index 2, with
    no header row and explicit column names. A boolean ``ressources`` column
    is ``False`` for rows of sheet 1 and ``True`` for rows of sheet 2. The
    metadata returned by ``file_infos`` (year, version, source, link, title,
    filename) is copied onto every row.

    Parameters
    ----------
    excelfile_name : str
        Path of the Excel file to parse.

    Returns
    -------
    pandas.DataFrame
        Rows of sheet 1 followed by rows of sheet 2, with a fresh 0-based index.

    Raises
    ------
    AssertionError
        If ``excelfile_name`` does not exist on disk.
    """
    # NOTE(review): this rebinds the tee_file_parser defined just above in the
    # file; the earlier definition is dead code and should be removed.
    infos = file_infos(excelfile_name)

    col_names_actifs = [
        'code', 'description', 'S1', 'S11', 'S12', 'S13', 'S14', 'S15',
        'impots', 'S2', 'biens_services', 'total',
    ]
    # Value columns mirrored relative to the list above; code/description last.
    col_names_passifs = [
        'total', 'biens_services', 'S2', 'impots', 'S15', 'S14', 'S13', 'S12',
        'S11', 'S1', 'code', 'description',
    ]

    # Keyword arguments common to both read_excel calls.
    shared_read_kwargs = {
        'header': None,
        'skiprows': 0,  # previously 4
        'skip_footer': 0,  # previously 2 but unnecessary since cleaner
        'index_col': None,
        'parse_cols': "A:end",
        'na_values': ['0'],  # cells holding '0' are read as missing
    }

    # Logged through the module logger, like file_parser, rather than printed.
    log.info('Parsing {}'.format(excelfile_name))
    assert os.path.exists(excelfile_name), (
        'Cannot find file {}. Use cn_downloader to load and unzip the raw CN files'.format(excelfile_name)
    )

    df_ea = pandas.read_excel(
        excelfile_name, sheetname=1, names=col_names_actifs, **shared_read_kwargs)
    df_rp = pandas.read_excel(
        excelfile_name, sheetname=2, names=col_names_passifs, **shared_read_kwargs)
    df_ea['ressources'] = False
    df_rp['ressources'] = True

    year = int(infos['year'])
    for df in (df_ea, df_rp):
        df['year'] = year
        df['version'] = infos['version']
        df['source'] = infos['source']
        df['link'] = infos['link']
        df['file_title'] = infos['title']
        df['file_name'] = infos['filename']

    result = df_ea.append(df_rp, ignore_index=True)
    return result
def _ressources_flags(descriptions, ressources_marker, emplois_marker):
    """Return one bool per description: True from a ressources marker until the next emplois marker."""
    flags = []
    in_ressources = False
    for description in descriptions:
        if description == ressources_marker:
            in_ressources = True
        elif description == emplois_marker:
            in_ressources = False
        flags.append(in_ressources)
    return flags


def file_parser(excelfile_name):
    """Parse a non-TEE Excel file into a DataFrame with metadata columns.

    The first two columns are renamed ``code`` and ``description``; ``code``
    is cast to ``str`` and ``description`` is lower-cased. A boolean
    ``ressources`` column is derived from marker rows in ``description``:
    it switches to True on a "ressources" marker row and back to False on an
    "emplois" marker row (file ``t_7601`` uses its own pair of markers), and
    keeps its value in between. For file ``t_1115`` every ``code`` is replaced
    by ``u'no code'``. Finally the metadata returned by ``file_infos`` is
    copied onto every row.

    Parameters
    ----------
    excelfile_name : str
        Path of the Excel file to parse.

    Returns
    -------
    pandas.DataFrame
        The parsed sheet with the extra columns ``ressources``, ``source``,
        ``version``, ``file_title``, ``file_name``, ``link`` and
        ``institution``.

    Raises
    ------
    ValueError
        If ``file_infos`` does not flag the file as non-TEE
        (``tee_flag != 0``); read options are only defined for that case.
    """
    log.info('Parsing {}'.format(excelfile_name))
    infos = file_infos(excelfile_name)

    # Read options exist only for non-TEE files; fail with a clear message
    # rather than on an unbound local or a silent None further down.
    if infos['tee_flag'] != 0:
        raise ValueError(
            '{} is not flagged as a non-TEE file (tee_flag = {})'.format(
                excelfile_name, infos['tee_flag']))

    df = pandas.read_excel(
        excelfile_name,
        header=1,
        skiprows=0,
        skip_footer=0,
        index_col=None,
        parse_cols=" A:end",
        na_values=['0'],  # cells holding '0' are read as missing
    )

    # Rename the first two columns on a copy of the labels instead of
    # mutating the Index's underlying array in place.
    new_columns = list(df.columns)
    new_columns[0] = 'code'
    new_columns[1] = 'description'
    df.columns = new_columns
    df['code'] = df['code'].astype('str')
    df['description'] = df['description'].str.lower()

    # Marker rows that switch the ressources flag on / off.
    if infos['filename'] == 't_7601':
        ressources_marker = u"à destination du reste du monde"
        emplois_marker = u"en provenance du reste du monde"
    else:
        ressources_marker = "ressources"
        emplois_marker = "emplois"

    # Single pass over the column instead of per-row .ix lookups and writes.
    flags = _ressources_flags(df['description'], ressources_marker, emplois_marker)
    df['ressources'] = pandas.Series(flags, index=df.index, dtype=bool)

    if infos['filename'] == 't_1115':
        df['code'] = u'no code'

    df['source'] = infos['source']
    df['version'] = infos['version']
    df['file_title'] = infos['title']
    df['file_name'] = infos['filename']
    df['link'] = infos['link']
    df['institution'] = infos['agent']

    return df
def non_tee_df_by_filename_generator(folder_year):
    """Parse every non-TEE ``.xls`` file of a yearly folder.

    The folder ``comptes_annee_<folder_year>`` under ``cn_directory`` is
    scanned for ``*.xls`` files. Files for which ``file_infos`` returns
    ``False`` and files whose ``tee_flag`` is 1 are skipped. Each remaining
    file goes through ``file_parser``, ``df_tidy`` (with the file's version
    as an int) and ``df_cleaner``.

    Parameters
    ----------
    folder_year
        Value interpolated into the folder name ``comptes_annee_{}``.

    Returns
    -------
    dict
        Maps ``infos['filename']`` to the cleaned DataFrame of that file.

    Raises
    ------
    AssertionError
        If the yearly folder does not exist.
    """
    directory_path = os.path.join(cn_directory, 'comptes_annee_{}'.format(folder_year))
    assert os.path.exists(directory_path), '{} does not exist. Use cn_downloader to create it'.format(
        directory_path)

    parsed_by_filename = {}
    for excel_path in glob.glob(os.path.join(directory_path, '*.xls')):
        assert os.path.exists(excel_path)
        infos = file_infos(excel_path)
        # Skip files file_infos rejects, and TEE files (tee_flag == 1).
        if infos is False or infos['tee_flag'] == 1:
            continue
        parsed = file_parser(excel_path)
        tidied = df_tidy(parsed, int(infos['version']))
        parsed_by_filename[infos['filename']] = df_cleaner(tidied)

    return parsed_by_filename