def test_preprocess_with_standard_df():
    """Check ``preprocessor.process`` on a small, well-formed input frame.

    The input has a combined ``unit,geo\\time`` key column and two year
    columns whose labels carry a trailing space.  The test asserts the
    resulting column set, the numeric value reported for Argentina and the
    flag reported for Czechia.
    """
    raw = pd.DataFrame({
        'unit,geo\\time': ['Euro,AR', 'Euro,BR', 'Euro,CZ'],
        '2010 ': ['1,000.1', '2', '3c'],
        '2011 ': ['4', '5', '6'],
    })
    processed = preprocessor.process(raw)

    expected_columns = {'GEO', 'flags', 'unit', 'value', 'year'}
    assert set(processed.columns) == expected_columns

    # First Argentina row: '1,000.1' is expected to come out as 1000.1.
    argentina = processed.loc[processed['GEO'] == 'Argentina']
    assert argentina['value'].iloc[0] == 1000.1

    # First Czechia row: the '3c' cell is expected to yield this flag.
    czechia = processed.loc[processed['GEO'] == 'Czechia']
    assert czechia['flags'].iloc[0] == 'conditional'
def process():
    """Build the interim population table from the raw ``tps00001`` file.

    Reads the tab-separated ``data/raw/tps00001.tsv.gz`` under the project
    root, passes it through ``preprocessor.process``, keeps the ``year``,
    ``GEO`` and ``value`` columns, renames ``value`` to ``population`` and
    writes ``data/interim/population.csv`` without the index column.
    """
    # The project root is two levels above the directory holding this file.
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'tps00001.tsv.gz'
    target = root / 'data' / 'interim' / 'population.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))
    table = table[['year', 'GEO', 'value']].rename(
        columns={'value': 'population'})

    table.to_csv(target, index=False)
# Example no. 3
# 0
def process():
    """Build the interim ``hrst`` table from the raw ``tsc00025`` file.

    Reads the tab-separated ``data/raw/tsc00025.tsv.gz`` under the project
    root, passes it through ``preprocessor.process``, keeps the ``year``,
    ``GEO``, ``value`` and ``sex`` columns, averages ``value`` per
    (``GEO``, ``year``) pair, renames the result to ``hrst`` and writes
    ``data/interim/hrst.csv`` without the index column.
    """
    # The project root is two levels above the directory holding this file.
    project_dir = Path(__file__).resolve().parents[2]
    data_raw_dir = os.path.join(project_dir, 'data', 'raw')
    data_interim_dir = os.path.join(project_dir, 'data', 'interim')

    file_path = os.path.join(data_raw_dir, 'tsc00025.tsv.gz')
    df = pd.read_csv(file_path, delimiter='\t')
    df = preprocessor.process(df)
    # 'sex' stays in the selection so a missing column still fails loudly.
    df = df[['year', 'GEO', 'value', 'sex']]
    # Aggregate only 'value'.  Calling mean() on the whole grouped frame also
    # tries to average 'sex'; if that column holds string codes (presumably
    # it does -- confirm against preprocessor output) pandas >= 2.0 raises
    # TypeError instead of silently dropping it.
    df = df.groupby(['GEO', 'year'])['value'].mean().reset_index()
    df = df.rename(columns={'value': 'hrst'})

    df.to_csv(os.path.join(data_interim_dir, 'hrst.csv'), index=False)
# Example no. 4
# 0
def process():
    """Build the interim R&D expenditure table from the raw ``t2020_20`` file.

    Reads the tab-separated ``data/raw/t2020_20.tsv.gz`` under the project
    root, drops the ``'TARGET '`` column, passes the rest through
    ``preprocessor.process``, keeps the ``year``, ``GEO`` and ``value``
    columns, renames ``value`` to ``rd_expenditure`` and writes
    ``data/interim/rd_expenditure.csv`` without the index column.
    """
    # The project root is two levels above the directory holding this file.
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 't2020_20.tsv.gz'
    target = root / 'data' / 'interim' / 'rd_expenditure.csv'

    raw = pd.read_csv(source, delimiter='\t')
    # The label is matched exactly, including its trailing space.
    raw = raw.drop(columns=['TARGET '])

    table = preprocessor.process(raw)
    table = table[['year', 'GEO', 'value']].rename(
        columns={'value': 'rd_expenditure'})

    table.to_csv(target, index=False)
def process():
    """Build the interim fertility-rate table from the raw ``demo_frate`` file.

    Reads the tab-separated ``data/raw/demo_frate.tsv.gz`` under the project
    root, passes it through ``preprocessor.process``, keeps only rows whose
    ``age`` equals ``'TOTAL'``, keeps the ``year``, ``GEO`` and ``value``
    columns, renames ``value`` to ``fertility_rate`` and writes
    ``data/interim/frate.csv`` without the index column.
    """
    # The project root is two levels above the directory holding this file.
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'demo_frate.tsv.gz'
    target = root / 'data' / 'interim' / 'frate.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))

    # Row filter and column selection in a single .loc lookup.
    is_total = table['age'] == 'TOTAL'
    totals = table.loc[is_total, ['year', 'GEO', 'value']]
    totals = totals.rename(columns={'value': 'fertility_rate'})

    totals.to_csv(target, index=False)
# Example no. 6
# 0
def process():
    """Build the interim education table from the raw ``trng_lfs_02`` file.

    Reads the tab-separated ``data/raw/trng_lfs_02.tsv.gz`` under the project
    root, passes it through ``preprocessor.process``, keeps only rows whose
    ``age`` equals ``'Y18-24'``, averages ``value`` per (``GEO``, ``year``)
    pair, renames the result to ``education`` and writes
    ``data/interim/education.csv`` without the index column.
    """
    # The project root is two levels above the directory holding this file.
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'trng_lfs_02.tsv.gz'
    target = root / 'data' / 'interim' / 'education.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))

    # Row filter and column selection in a single .loc lookup.
    in_age_band = table['age'] == 'Y18-24'
    subset = table.loc[in_age_band, ['year', 'GEO', 'value', 'sex']]

    # Only 'value' is averaged; 'sex' does not reach the output.
    averaged = subset.groupby(['GEO', 'year'])['value'].mean().reset_index()
    averaged = averaged.rename(columns={'value': 'education'})

    averaged.to_csv(target, index=False)
def test_preprocess_with_empty_df():
    """Processing a completely empty DataFrame must raise ``IndexError``."""
    empty_frame = pd.DataFrame()
    with pytest.raises(IndexError):
        preprocessor.process(empty_frame)