def test_building_two_operators_with_execution(self):
    # given
    dag = DAG(dag_id='test_dag_file', start_date=datetime.now())
    input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
    output_parquet_unit = DataOutputFileUnit('data/X_parsed.parquet',
                                             pandas_write_function_name='to_parquet')
    task_1 = DataOperator(operation_function=drop_na_dataframe,
                          params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                          input_unit=input_csv_unit,
                          output_unit=output_parquet_unit,
                          dag=dag,
                          task_id='data_operator_csv_to_parquet')

    input_parquet_unit = DataInputFileUnit('data/X_parsed.parquet',
                                           pandas_read_function_name='read_parquet')
    output_csv_unit = DataOutputFileUnit('data/X_parsed_2.csv', index=False)
    task_2 = DataOperator(operation_function=drop_na_dataframe,
                          params={'columns': ['ANNEETRAVAUXPRECONISESDIAG']},
                          input_unit=input_parquet_unit,
                          output_unit=output_csv_unit,
                          dag=dag,
                          task_id='data_operator_parquet_to_csv')
    task_2.set_upstream(task_1)

    # when
    execute_dag(dag, verbose=True)

    # then
    df = pd.read_csv('data/X_parsed_2.csv')
    self.assertEqual((7241, 27), df.shape)
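# `execute_dag` is a test helper from the project, not an Airflow built-in.
# A minimal, hypothetical sketch of what it might do, assuming Airflow 1.x:
# run every task of the DAG once, in dependency order, outside the scheduler.
def execute_dag(dag, verbose=False):
    for task in dag.topological_sort():
        if verbose:
            print('Executing task %s' % task.task_id)
        task.execute(None)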
def test_execute_data_operator_csv_read_and_write(self):
    # given
    dag = DAG(dag_id='test', start_date=datetime.now())
    input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
    output_csv_unit = DataOutputFileUnit('data/X_parsed.csv', index=False)
    task = DataOperator(operation_function=drop_na_dataframe,
                        params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                        input_unit=input_csv_unit,
                        output_unit=output_csv_unit,
                        dag=dag,
                        task_id='data_operator_csv')

    # when
    task.execute(None)

    # then
    df_transformed = pd.read_csv('data/X_parsed.csv')
    self.assertEqual((10245, 27), df_transformed.shape)
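# `drop_na_dataframe` is the project's operation function; its body is not
# shown in this section. A plausible sketch consistent with its usage here:
# drop the rows where any of the given columns is missing.
def drop_na_dataframe(dataframe, columns):
    return dataframe.dropna(subset=columns)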
def test_execute_data_operator_csv_read_and_plasma_write(self):
    # given
    plasma_connector = PlasmaConnector(socket_name)
    dag = DAG(dag_id='test', start_date=datetime.now())
    input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
    output_plasma_unit = DataOutputPlasmaUnit(plasma_connector, object_id)
    task = DataOperator(operation_function=drop_na_dataframe,
                        params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                        input_unit=input_csv_unit,
                        output_unit=output_plasma_unit,
                        dag=dag,
                        task_id='data_operator_csv_to_plasma')
    task_instance = TaskInstance(task=task, execution_date=datetime.now())

    # when
    task.execute(task_instance.get_template_context())

    # then
    other_plasma_connector = PlasmaConnector(socket_name)
    df_transformed = other_plasma_connector.get_dataframe(object_id)
    self.assertEqual((10245, 27), df_transformed.shape)
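# All three tests rely on the same contract: a DataOperator reads a dataframe
# from its input_unit, applies operation_function with params, and writes the
# result through its output_unit when one is set (some terminal tasks, such as
# the model-learning task further below, have none). A hypothetical sketch of
# that contract; the unit method names (`read_data`/`write_data`) are
# assumptions, not the library's documented API.
from airflow.models import BaseOperator

class DataOperator(BaseOperator):
    def __init__(self, operation_function, input_unit, output_unit=None,
                 params=None, *args, **kwargs):
        super(DataOperator, self).__init__(*args, **kwargs)
        self.operation_function = operation_function
        self.input_unit = input_unit
        self.output_unit = output_unit
        self.params = params or {}

    def execute(self, context):
        dataframe = self.input_unit.read_data()
        dataframe = self.operation_function(dataframe, **self.params)
        if self.output_unit is not None:
            self.output_unit.write_data(dataframe)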
def feature_engineering_sub_dag(parent_dag_name, child_dag_name, model_path,
                                input_file, output_file, temp_files,
                                start_date, schedule_interval, mode='train'):
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              schedule_interval=schedule_interval,
              start_date=start_date)

    task_fillna = DataOperator(
        operation_function=fillna_columns,
        input_unit=DataInputFileUnit(input_file,
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(temp_files[0],
                                       pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Fill_NA_values',
        params={
            'simple_features': ['NOTEDIAGNOSTIC', 'PRIORITEDERENOUVELLEMENT',
                                'FREQUENTATIONCIBLE', 'RAISONDEPLANTATION',
                                'SOUS_CATEGORIE', 'STADEDEDEVELOPPEMENT',
                                'STADEDEVELOPPEMENTDIAG', 'TRAITEMENTCHENILLES',
                                'TRAVAUXPRECONISESDIAG', 'TROTTOIR',
                                'VARIETE', 'VIGUEUR', 'CODE_PARENT'],
            'model_path': model_path,
            'mode': mode
        })

    task_cat_to_num = DataOperator(
        operation_function=category_to_numerical_features,
        input_unit=DataInputFileUnit(temp_files[0],
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(output_file,
                                       pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Categorical_features_to_numeric',
        params={
            'features': ['GENRE_BOTA', 'ESPECE', 'FREQUENTATIONCIBLE',
                         'RAISONDEPLANTATION', 'SOUS_CATEGORIE',
                         'STADEDEDEVELOPPEMENT', 'STADEDEVELOPPEMENTDIAG',
                         'TRAITEMENTCHENILLES', 'TRAVAUXPRECONISESDIAG',
                         'TROTTOIR', 'VARIETE', 'VIGUEUR', 'CODE_PARENT'],
            'model_path': model_path,
            'mode': mode
        })

    task_fillna.set_downstream(task_cat_to_num)

    return dag
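# The sub-DAG above switches between CSV and Parquet simply by naming a
# different pandas function. A hypothetical sketch of how the file units can
# implement that dispatch with getattr; only the constructor signatures and
# the `output_path` attribute are taken from the code in this section, the
# internals of the real library may differ.
import pandas as pd

class DataInputFileUnit(object):
    def __init__(self, input_path, pandas_read_function_name='read_csv', **kwargs):
        self.input_path = input_path
        # e.g. pd.read_csv or pd.read_parquet, resolved by name
        self.read_function = getattr(pd, pandas_read_function_name)
        self.kwargs = kwargs

    def read_data(self):
        return self.read_function(self.input_path, **self.kwargs)


class DataOutputFileUnit(object):
    def __init__(self, output_path, pandas_write_function_name='to_csv', **kwargs):
        self.output_path = output_path
        self.pandas_write_function_name = pandas_write_function_name
        self.kwargs = kwargs

    def write_data(self, dataframe):
        # write functions are DataFrame methods, so resolve on the instance
        getattr(dataframe, self.pandas_write_function_name)(self.output_path,
                                                            **self.kwargs)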
output_parquet_concat_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_raw.parquet',
                                                pandas_write_function_name='to_parquet')
task_concat_train_files = DataOperator(operation_function=concat_train_test,
                                       input_unit=input_parquet_files_unit,
                                       output_unit=output_parquet_concat_unit,
                                       dag=dag,
                                       task_id='Concat_train_test_data_source_files')

""" Resampling time data """
input_raw_data_unit = DataInputFileUnit(output_parquet_concat_unit.output_path,
                                        pandas_read_function_name='read_parquet')
output_cleaned_data_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_clean.parquet',
                                              pandas_write_function_name='to_parquet')
task_fill_missing_values = DataOperator(operation_function=resample_fillna,
                                        input_unit=input_raw_data_unit,
                                        output_unit=output_cleaned_data_unit,
                                        dag=dag,
                                        task_id='Resample_and_fill_NA_values')
task_concat_train_files.set_downstream(task_fill_missing_values)

""" Simple feature engineering """
task_feature_engineering_for_train = SubDagOperator(
    subdag=feature_engineering_sub_dag(dag.dag_id,
                                       'Feature_engineering_for_train',
                                       model_path=project_path + 'models/',
                                       input_file=project_path + 'datasets/temp/X_train_raw.parquet',
                                       output_file=project_path + 'datasets/temp/X_train_final.parquet',
                                       temp_files=temp_files[0:10],
                                       start_date=dag.start_date,
                                       schedule_interval=dag.schedule_interval),
    task_id='Feature_engineering_for_train',
    dag=dag,
)
task_concat_train_files.set_downstream(task_feature_engineering_for_train)

input_parquet_raw_file_unit = DataInputFileUnit(project_path + 'datasets/temp/X_train_final.parquet',
                                                pandas_read_function_name='read_parquet')
task_model_learning = DataOperator(operation_function=fit_write_model,
                                   params={'columns_selection': features_selection,
                                           'column_target': feature_target,
                                           'write_path': project_path + 'models/ensemble.model'},
                                   input_unit=input_parquet_raw_file_unit,
                                   dag=dag,
                                   task_id='Model_learning')
task_feature_engineering_for_train.set_downstream(task_model_learning)

input_csv_files_unit = DataInputMultiFileUnit([project_path + 'datasets/input/X_tree_egc_t2.csv',
                                               project_path + 'datasets/input/X_geoloc_egc_t2.csv',
                                               project_path + 'datasets/input/Y_tree_egc_t2.csv'],
                                              sep=';')
output_parquet_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_test_raw.parquet',
                                         pandas_write_function_name='to_parquet')
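# `fit_write_model` is a terminal operation function: the Model_learning task
# above declares no output_unit, so the function persists the model itself.
# A hypothetical sketch, assuming a scikit-learn estimator and joblib
# persistence; the estimator choice and file format are assumptions.
import joblib
from sklearn.ensemble import RandomForestClassifier

def fit_write_model(dataframe, columns_selection, column_target, write_path):
    model = RandomForestClassifier(n_estimators=100)
    model.fit(dataframe[columns_selection], dataframe[column_target])
    joblib.dump(model, write_path)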