def test_preprocess_with_standard_df():
    """Check ``preprocessor.process`` on a small, well-formed input frame.

    The input has one combined key column and two year columns whose cells
    include a thousands-separated number and a value with a trailing letter.
    The result must expose exactly the columns ``GEO``, ``flags``, ``unit``,
    ``value`` and ``year``; the first ``Argentina`` row must carry the value
    ``1000.1`` and the first ``Czechia`` row the flag ``conditional``.
    """
    raw = pd.DataFrame({
        'unit,geo\\time': ['Euro,AR', 'Euro,BR', 'Euro,CZ'],
        '2010 ': ['1,000.1', '2', '3c'],
        '2011 ': ['4', '5', '6'],
    })

    tidy = preprocessor.process(raw)

    expected_columns = {'GEO', 'flags', 'unit', 'value', 'year'}
    assert set(tidy.columns) == expected_columns

    first_argentina = tidy[tidy['GEO'] == 'Argentina'].iloc[0]
    assert first_argentina['value'] == 1000.1

    first_czechia = tidy[tidy['GEO'] == 'Czechia'].iloc[0]
    assert first_czechia['flags'] == 'conditional'
def process():
    """Build ``data/interim/population.csv`` from the raw ``tps00001`` table.

    The tab-delimited archive ``data/raw/tps00001.tsv.gz`` is loaded, passed
    through ``preprocessor.process``, narrowed to the ``year``, ``GEO`` and
    ``value`` columns, and written without the index after ``value`` has been
    renamed to ``population``.  Both paths are resolved against
    ``Path(__file__).resolve().parents[2]``.
    """
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'tps00001.tsv.gz'
    target = root / 'data' / 'interim' / 'population.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))

    population = table[['year', 'GEO', 'value']].rename(
        columns={'value': 'population'},
    )
    population.to_csv(target, index=False)
def process():
    """Build ``data/interim/hrst.csv`` from the raw ``tsc00025`` table.

    The tab-delimited archive ``data/raw/tsc00025.tsv.gz`` is loaded, passed
    through ``preprocessor.process`` and narrowed to the ``year``, ``GEO``,
    ``value`` and ``sex`` columns.  ``value`` is then averaged per
    (``GEO``, ``year``) pair, renamed to ``hrst`` and written without the
    index.  Both paths are resolved against
    ``Path(__file__).resolve().parents[2]``.
    """
    project_dir = Path(__file__).resolve().parents[2]
    file_path = project_dir / 'data' / 'raw' / 'tsc00025.tsv.gz'
    out_path = project_dir / 'data' / 'interim' / 'hrst.csv'

    df = pd.read_csv(file_path, delimiter='\t')
    df = preprocessor.process(df)
    df = df[['year', 'GEO', 'value', 'sex']]

    # Average only ``value``.  ``sex`` is presumably a non-numeric label
    # column (verify against the preprocessor output); calling ``mean()`` on
    # the whole group raises TypeError on pandas >= 2.0 instead of silently
    # dropping such columns as older releases did.
    df = df.groupby(['GEO', 'year'])['value'].mean().reset_index()

    df = df.rename(columns={'value': 'hrst'})
    df.to_csv(out_path, index=False)
def process():
    """Build ``data/interim/rd_expenditure.csv`` from the raw ``t2020_20`` table.

    The tab-delimited archive ``data/raw/t2020_20.tsv.gz`` is loaded and its
    ``'TARGET '`` column (note the trailing space) is removed before the frame
    is handed to ``preprocessor.process``.  The result is narrowed to the
    ``year``, ``GEO`` and ``value`` columns, ``value`` is renamed to
    ``rd_expenditure`` and the frame is written without the index.  Both paths
    are resolved against ``Path(__file__).resolve().parents[2]``.
    """
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 't2020_20.tsv.gz'
    target = root / 'data' / 'interim' / 'rd_expenditure.csv'

    raw = pd.read_csv(source, delimiter='\t')
    raw = raw.drop(columns=['TARGET '])

    table = preprocessor.process(raw)

    expenditure = table[['year', 'GEO', 'value']].rename(
        columns={'value': 'rd_expenditure'},
    )
    expenditure.to_csv(target, index=False)
def process():
    """Build ``data/interim/frate.csv`` from the raw ``demo_frate`` table.

    The tab-delimited archive ``data/raw/demo_frate.tsv.gz`` is loaded and
    passed through ``preprocessor.process``.  Only rows whose ``age`` equals
    ``'TOTAL'`` are kept, the frame is narrowed to the ``year``, ``GEO`` and
    ``value`` columns, ``value`` is renamed to ``fertility_rate`` and the
    result is written without the index.  Both paths are resolved against
    ``Path(__file__).resolve().parents[2]``.
    """
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'demo_frate.tsv.gz'
    target = root / 'data' / 'interim' / 'frate.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))

    all_ages = table[table['age'] == 'TOTAL']
    fertility = all_ages[['year', 'GEO', 'value']].rename(
        columns={'value': 'fertility_rate'},
    )
    fertility.to_csv(target, index=False)
def process():
    """Build ``data/interim/education.csv`` from the raw ``trng_lfs_02`` table.

    The tab-delimited archive ``data/raw/trng_lfs_02.tsv.gz`` is loaded and
    passed through ``preprocessor.process``.  Only rows whose ``age`` equals
    ``'Y18-24'`` are kept and the frame is narrowed to the ``year``, ``GEO``,
    ``value`` and ``sex`` columns.  ``value`` is then averaged per
    (``GEO``, ``year``) pair, renamed to ``education`` and written without the
    index.  Both paths are resolved against
    ``Path(__file__).resolve().parents[2]``.
    """
    root = Path(__file__).resolve().parents[2]
    source = root / 'data' / 'raw' / 'trng_lfs_02.tsv.gz'
    target = root / 'data' / 'interim' / 'education.csv'

    table = preprocessor.process(pd.read_csv(source, delimiter='\t'))

    young_adults = table[table['age'] == 'Y18-24']
    selected = young_adults[['year', 'GEO', 'value', 'sex']]

    # One averaged ``value`` per (GEO, year); the group keys become columns again.
    averaged = selected.groupby(['GEO', 'year'])['value'].mean().reset_index()

    education = averaged.rename(columns={'value': 'education'})
    education.to_csv(target, index=False)
def test_preprocess_with_empty_df():
    """``preprocessor.process`` must raise ``IndexError`` for an empty frame."""
    empty_frame = pd.DataFrame()

    with pytest.raises(IndexError):
        preprocessor.process(empty_frame)