def tee_file_parser(excelfile_name):
    """Parse the two TEE sheets of an Excel workbook into one DataFrame.

    The sheet at index 1 is read with the ``col_names_actifs`` column layout
    and marked ``ressources = False``; the sheet at index 2 is read with the
    mirrored ``col_names_passifs`` layout and marked ``ressources = True``.
    Both frames receive the metadata returned by ``file_infos`` (year,
    version, source, link, file_title, file_name) and are then stacked,
    index-1 rows first, under a fresh integer index.

    :param excelfile_name: path of the workbook to parse
    :returns: the concatenated DataFrame
    :raises AssertionError: if ``excelfile_name`` does not exist
    """
    print(excelfile_name)

    # Check for the file before anything else uses the path, so a missing
    # workbook always produces the actionable message below. The exception
    # type stays AssertionError for existing callers, but it is raised
    # explicitly so the check is not stripped when running with ``-O``.
    if not os.path.exists(excelfile_name):
        raise AssertionError(
            'Cannot find file {}. Use cn_downloader to load and unzip the raw CN files'.format(excelfile_name))

    infos = file_infos(excelfile_name)

    col_names_actifs = ['code', 'description', 'S1', 'S11', 'S12', 'S13', 'S14',
        'S15', 'impots', 'S2', 'biens_services', 'total']
    # Same columns as above, in reverse order.
    col_names_passifs = ['total', 'biens_services', 'S2', 'impots', 'S15', 'S14', 'S13',
        'S12', 'S11', 'S1', 'code', 'description']

    # Options shared by both sheets; only the sheet index and names differ.
    read_options = dict(
        header = None,
        skiprows = 0,  # previously 4
        skip_footer = 0,  # previously 2 but unnecessary since cleaner
        index_col = None,
        parse_cols = "A:end",
        na_values = ['0'],  # cells holding '0' are read as missing values
    )
    df_ea = pandas.read_excel(excelfile_name, sheetname = 1, names = col_names_actifs, **read_options)
    df_rp = pandas.read_excel(excelfile_name, sheetname = 2, names = col_names_passifs, **read_options)
    df_ea['ressources'] = False
    df_rp['ressources'] = True

    year = int(infos['year'])
    for df in [df_ea, df_rp]:
        df['year'] = year
        df['version'] = infos['version']
        df['source'] = infos['source']
        df['link'] = infos['link']
        df['file_title'] = infos['title']
        df['file_name'] = infos['filename']

    result = df_ea.append(df_rp, ignore_index = True)
    return result
def tee_file_parser(excelfile_name):
    """Parse the two TEE sheets of an Excel workbook into one DataFrame.

    The sheet at index 1 is read with the ``col_names_actifs`` column layout
    and marked ``ressources=False``; the sheet at index 2 is read with the
    mirrored ``col_names_passifs`` layout and marked ``ressources=True``.
    Both frames receive the metadata returned by ``file_infos`` (year,
    version, source, link, file_title, file_name) and are then stacked,
    index-1 rows first, under a fresh integer index.

    :param excelfile_name: path of the workbook to parse
    :returns: the concatenated DataFrame
    :raises AssertionError: if ``excelfile_name`` does not exist
    """
    print(excelfile_name)

    # Check for the file before anything else uses the path, so a missing
    # workbook always produces the actionable message below. The exception
    # type stays AssertionError for existing callers, but it is raised
    # explicitly so the check is not stripped when running with ``-O``.
    if not os.path.exists(excelfile_name):
        raise AssertionError(
            'Cannot find file {}. Use cn_downloader to load and unzip the raw CN files'.format(
                excelfile_name))

    infos = file_infos(excelfile_name)

    col_names_actifs = [
        'code', 'description', 'S1', 'S11', 'S12', 'S13', 'S14', 'S15',
        'impots', 'S2', 'biens_services', 'total'
    ]
    # Same columns as above, in reverse order.
    col_names_passifs = [
        'total', 'biens_services', 'S2', 'impots', 'S15', 'S14', 'S13', 'S12',
        'S11', 'S1', 'code', 'description'
    ]

    # Options shared by both sheets; only the sheet index and names differ.
    read_options = dict(
        header=None,
        skiprows=0,  # previously 4
        skip_footer=0,  # previously 2 but unnecessary since cleaner
        index_col=None,
        parse_cols="A:end",
        na_values=['0'],  # cells holding '0' are read as missing values
    )
    df_ea = pandas.read_excel(excelfile_name,
                              sheetname=1,
                              names=col_names_actifs,
                              **read_options)
    df_rp = pandas.read_excel(excelfile_name,
                              sheetname=2,
                              names=col_names_passifs,
                              **read_options)
    df_ea['ressources'] = False
    df_rp['ressources'] = True

    year = int(infos['year'])
    for df in [df_ea, df_rp]:
        df['year'] = year
        df['version'] = infos['version']
        df['source'] = infos['source']
        df['link'] = infos['link']
        df['file_title'] = infos['title']
        df['file_name'] = infos['filename']

    result = df_ea.append(df_rp, ignore_index=True)
    return result
# Example #3
# 0
def _ressources_flags(descriptions, start_marker, stop_marker):
    """Return one boolean per description, switched on and off by marker rows.

    The flag starts False, becomes True on a row equal to ``start_marker``
    and False again on a row equal to ``stop_marker``; the marker row itself
    already carries the new state.
    """
    flags = []
    in_ressources = False
    for description in descriptions:
        if description == start_marker:
            in_ressources = True
        elif description == stop_marker:
            in_ressources = False
        flags.append(in_ressources)
    return flags


def file_parser(excelfile_name):
    """Parse a non-TEE Excel workbook into a DataFrame with a ``ressources`` flag.

    The first two columns are renamed ``code`` and ``description``; codes are
    cast to strings and descriptions lower-cased. Each row is then flagged as
    ``ressources`` depending on the last marker row seen above it (see
    ``_ressources_flags``), and the metadata returned by ``file_infos``
    (source, version, file_title, file_name, link, institution) is attached
    as constant columns.

    :param excelfile_name: path of the workbook to parse
    :returns: the parsed DataFrame
    :raises ValueError: if ``file_infos`` reports a ``tee_flag`` other than 0
    """
    log.info('Parsing {}'.format(excelfile_name))
    infos = file_infos(excelfile_name)
    if infos['tee_flag'] != 0:
        # The read options below were only ever defined for tee_flag == 0;
        # any other value used to crash with an UnboundLocalError on the
        # read_excel call. Fail with an explicit message instead.
        raise ValueError(
            'file_parser only handles files with tee_flag == 0, got tee_flag = {} for {}'.format(
                infos['tee_flag'], excelfile_name))

    df = pandas.read_excel(excelfile_name, header = 1, skiprows = 0, skip_footer = 0,
       index_col = None, parse_cols = " A:end", na_values = ['0'])

    # Rename the first two columns. Work on a list copy so the frame's
    # existing column index is never modified in place.
    new_columns = list(df.columns)
    new_columns[0] = 'code'
    new_columns[1] = 'description'
    df.columns = new_columns
    df['code'] = df['code'].astype('str')
    df['description'] = df['description'].str.lower()

    # File t_7601 uses its own pair of marker rows; every other file uses
    # the "ressources" / "emplois" rows.
    if infos['filename'] == 't_7601':
        start_marker = u"à destination du reste du monde"
        stop_marker = u"en provenance du reste du monde"
    else:
        start_marker = "ressources"
        stop_marker = "emplois"
    flags = _ressources_flags(df['description'], start_marker, stop_marker)
    # Explicit index and dtype keep the column boolean and row-aligned.
    df['ressources'] = pandas.Series(flags, index = df.index, dtype = bool)

    if infos['filename'] == 't_1115':
        # Every code of this file is replaced by a placeholder.
        df['code'] = u'no code'

    df['source'] = infos['source']
    df['version'] = infos['version']
    df['file_title'] = infos['title']
    df['file_name'] = infos['filename']
    df['link'] = infos['link']
    df['institution'] = infos['agent']
    return df
# Example #4
# 0
def non_tee_df_by_filename_generator(folder_year):
    """Parse every non-TEE ``.xls`` file of one yearly folder.

    The folder is ``comptes_annee_<folder_year>`` under ``cn_directory``.
    Files for which ``file_infos`` returns ``False`` and files flagged as
    TEE (``tee_flag == 1``) are skipped. Every remaining file goes through
    ``file_parser``, ``df_tidy`` (with the file's integer version) and
    ``df_cleaner``.

    :param folder_year: year used to build the folder name
    :returns: dict mapping ``infos['filename']`` to the cleaned DataFrame
    :raises AssertionError: if the yearly folder does not exist
    """
    year_folder = os.path.join(cn_directory, 'comptes_annee_{}'.format(folder_year))
    assert os.path.exists(year_folder), '{} does not exist. Use cn_downloader to create it'.format(
        year_folder)

    parsed_by_name = {}
    for xls_path in glob.glob(os.path.join(year_folder, '*.xls')):
        assert os.path.exists(xls_path)
        infos = file_infos(xls_path)
        # Skip files file_infos rejects (False) and TEE files (flag 1).
        if infos is False or infos['tee_flag'] == 1:
            continue

        parsed_by_name[infos['filename']] = df_cleaner(
            df_tidy(file_parser(xls_path), int(infos['version'])))
    return parsed_by_name