Example #1
    def test_building_two_operators_with_execution(self):
        # given
        dag = DAG(dag_id='test_dag_file', start_date=datetime.now())

        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_parquet_unit = DataOutputFileUnit('data/X_parsed.parquet', pandas_write_function_name='to_parquet')
        task_1 = DataOperator(operation_function=drop_na_dataframe,
                              params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                              input_unit=input_csv_unit,
                              output_unit=output_parquet_unit,
                              dag=dag, task_id='data_operator_csv_to_parquet')

        input_parquet_unit = DataInputFileUnit('data/X_parsed.parquet', pandas_read_function_name='read_parquet')
        output_csv_unit = DataOutputFileUnit('data/X_parsed_2.csv', index=False)
        task_2 = DataOperator(operation_function=drop_na_dataframe,
                              params={'columns': ['ANNEETRAVAUXPRECONISESDIAG']},
                              input_unit=input_parquet_unit,
                              output_unit=output_csv_unit,
                              dag=dag, task_id='data_operator_parquet_to_csv')

        task_2.set_upstream(task_1)

        # when
        execute_dag(dag, verbose=True)

        # then
        df = pd.read_csv('data/X_parsed_2.csv')
        self.assertEqual((7241, 27), df.shape)
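
Every test in this listing hands drop_na_dataframe to DataOperator as the operation_function, but its body is not shown here. A minimal sketch consistent with the calls above (the operator presumably invokes it as operation_function(dataframe, **params), so the signature below is an assumption):

import pandas as pd


def drop_na_dataframe(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame:
    # Assumed behaviour: drop the rows holding NA in the listed columns.
    # The shape assertions (10245 rows after one column filter, 7241 after
    # a second) are consistent with a plain dropna on a column subset.
    return dataframe.dropna(subset=columns)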
Example #2
    def test_execute_data_operator_csv_read_and_write(self):
        # given
        dag = DAG(dag_id='test', start_date=datetime.now())
        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_csv_unit = DataOutputFileUnit('data/X_parsed.csv', index=False)

        task = DataOperator(operation_function=drop_na_dataframe,
                            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                            input_unit=input_csv_unit,
                            output_unit=output_csv_unit,
                            dag=dag,
                            task_id='data_operator_csv')

        # when
        task.execute(None)

        # then
        df_transformed = pd.read_csv('data/X_parsed.csv')
        self.assertEqual((10245, 27), df_transformed.shape)
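
Note that the test never calls pandas between the two units: DataOperator.execute is expected to read through input_unit, apply operation_function with params, and write through output_unit. A hedged sketch of that contract (the unit method names are assumptions, not the project's verified API):

class DataOperatorSketch:
    """Illustrative only: the read -> transform -> write flow the tests imply."""

    def __init__(self, operation_function, input_unit, output_unit=None,
                 params=None, **kwargs):
        self.operation_function = operation_function
        self.input_unit = input_unit
        self.output_unit = output_unit  # Model_learning in example #6 passes none
        self.params = params or {}

    def execute(self, context):
        df = self.input_unit.read_data_frame()           # e.g. pd.read_csv(path, sep=';')
        df = self.operation_function(df, **self.params)  # e.g. drop_na_dataframe(df, columns=[...])
        if self.output_unit is not None:
            self.output_unit.write_data_frame(df)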
Example #3
    def test_execute_data_operator_csv_read_and_plasma_write(self):
        # given
        plasma_connector = PlasmaConnector(socket_name)

        dag = DAG(dag_id='test', start_date=datetime.now())
        input_csv_unit = DataInputFileUnit('data/X.csv', sep=';')
        output_plasma_unit = DataOutputPlasmaUnit(plasma_connector, object_id)

        task = DataOperator(operation_function=drop_na_dataframe,
                            params={'columns': ['ANNEEREALISATIONDIAGNOSTIC']},
                            input_unit=input_csv_unit,
                            output_unit=output_plasma_unit,
                            dag=dag,
                            task_id='data_operator_csv_to_parquet')

        task_instance = TaskInstance(task=task, execution_date=datetime.now())

        # when
        task.execute(task_instance.get_template_context())

        # then
        other_plasma_connector = PlasmaConnector(socket_name)
        df_transformed = other_plasma_connector.get_dataframe(object_id)
        self.assertEqual((10245, 27), df_transformed.shape)
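
DataOutputPlasmaUnit stores the transformed frame in a Plasma shared-memory store, which is why a second PlasmaConnector on the same socket can read it back. A sketch of what the connector might wrap, assuming pyarrow's plasma module (deprecated and removed in recent pyarrow releases; put_dataframe is a hypothetical name, get_dataframe matches the test):

import pyarrow.plasma as plasma


class PlasmaConnectorSketch:
    """Illustrative only: share a DataFrame between processes via Plasma."""

    def __init__(self, socket_name):
        # socket_name is the socket of a running plasma_store, e.g. '/tmp/plasma'
        self.client = plasma.connect(socket_name)

    def put_dataframe(self, df, object_id):
        # put() serializes the object into shared memory under object_id
        self.client.put(df, object_id=object_id)

    def get_dataframe(self, object_id):
        return self.client.get(object_id)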
Example #4
def feature_engineering_sub_dag(parent_dag_name,
                                child_dag_name,
                                model_path,
                                input_file,
                                output_file,
                                temp_files,
                                start_date,
                                schedule_interval,
                                mode='train'):

    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              schedule_interval=schedule_interval,
              start_date=start_date)

    task_fillna = DataOperator(
        operation_function=fillna_columns,
        input_unit=DataInputFileUnit(input_file,
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(
            temp_files[0], pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Fill_NA_values',
        params={
            'simple_features': [
                'NOTEDIAGNOSTIC', 'PRIORITEDERENOUVELLEMENT',
                'FREQUENTATIONCIBLE', 'RAISONDEPLANTATION', 'SOUS_CATEGORIE',
                'STADEDEDEVELOPPEMENT', 'STADEDEVELOPPEMENTDIAG',
                'TRAITEMENTCHENILLES', 'TRAVAUXPRECONISESDIAG', 'TROTTOIR',
                'VARIETE', 'VIGUEUR', 'CODE_PARENT'
            ],
            'model_path': model_path,
            'mode': mode
        })

    task_cat_to_num = DataOperator(
        operation_function=category_to_numerical_features,
        input_unit=DataInputFileUnit(temp_files[0],
                                     pandas_read_function_name='read_parquet'),
        output_unit=DataOutputFileUnit(
            output_file, pandas_write_function_name='to_parquet'),
        dag=dag,
        task_id='Categorical_features_to_numeric',
        params={
            'features': [
                'GENRE_BOTA', 'ESPECE', 'FREQUENTATIONCIBLE',
                'RAISONDEPLANTATION', 'SOUS_CATEGORIE', 'STADEDEDEVELOPPEMENT',
                'STADEDEVELOPPEMENTDIAG', 'TRAITEMENTCHENILLES',
                'TRAVAUXPRECONISESDIAG', 'TROTTOIR', 'VARIETE', 'VIGUEUR',
                'CODE_PARENT'
            ],
            'model_path': model_path,
            'mode': mode
        })

    task_fillna.set_downstream(task_cat_to_num)

    return dag
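
Both tasks forward model_path and mode to their operation functions, which suggests the encoders are fitted and persisted when mode='train' and reloaded otherwise, so the same sub-DAG can serve the test set. A hedged sketch of that pattern (the file name and persistence via joblib are assumptions):

import os

import joblib
import pandas as pd


def category_to_numerical_sketch(dataframe, features, model_path, mode='train'):
    # Illustrative only: fit value->code mappings in 'train' mode, reuse them after.
    mapping_file = os.path.join(model_path, 'category_mappings.joblib')
    if mode == 'train':
        mappings = {f: {v: i for i, v in enumerate(dataframe[f].dropna().unique())}
                    for f in features}
        joblib.dump(mappings, mapping_file)  # persisted under model_path for reuse
    else:
        mappings = joblib.load(mapping_file)
    for f in features:
        dataframe[f] = dataframe[f].map(mappings[f])
    return dataframe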
Example #5
output_parquet_concat_unit = DataOutputFileUnit(
    project_path + 'datasets/temp/X_raw.parquet',
    pandas_write_function_name='to_parquet')

task_concat_train_files = DataOperator(
    operation_function=concat_train_test,
    input_unit=input_parquet_files_unit,
    output_unit=output_parquet_concat_unit,
    dag=dag,
    task_id='Concat_train_test_data_source_files')
"""
Resampling time data
"""
input_raw_data_unit = DataInputFileUnit(
    output_parquet_concat_unit.output_path,
    pandas_read_function_name='read_parquet')

output_cleaned_data_unit = DataOutputFileUnit(
    project_path + 'datasets/temp/X_clean.parquet',
    pandas_write_function_name='to_parquet')

task_fill_missing_values = DataOperator(operation_function=resample_fillna,
                                        input_unit=input_raw_data_unit,
                                        output_unit=output_cleaned_data_unit,
                                        dag=dag,
                                        task_id='Resample_and_fill_NA_values')

task_concat_train_files.set_downstream(task_fill_missing_values)
"""
Simple feature engineering 
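
The file units take the pandas reader or writer by name (pandas_read_function_name='read_parquet', pandas_write_function_name='to_parquet'), and the chaining above reuses output_parquet_concat_unit.output_path rather than repeating the literal path. A minimal sketch consistent with that interface (everything beyond output_path and the constructor keywords is an assumption):

import pandas as pd


class DataInputFileUnitSketch:
    """Illustrative only: dispatch to a pandas reader chosen by name."""

    def __init__(self, input_path, pandas_read_function_name='read_csv', **read_kwargs):
        self.input_path = input_path
        self.read = getattr(pd, pandas_read_function_name)  # e.g. pd.read_parquet
        self.read_kwargs = read_kwargs                      # e.g. sep=';'

    def read_data_frame(self):
        return self.read(self.input_path, **self.read_kwargs)


class DataOutputFileUnitSketch:
    """Illustrative only: dispatch to a DataFrame writer chosen by name."""

    def __init__(self, output_path, pandas_write_function_name='to_csv', **write_kwargs):
        self.output_path = output_path  # exposed so downstream input units can chain on it
        self.write_function_name = pandas_write_function_name
        self.write_kwargs = write_kwargs                    # e.g. index=False

    def write_data_frame(self, df):
        getattr(df, self.write_function_name)(self.output_path, **self.write_kwargs)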
Example #6
task_feature_engineering_for_train = SubDagOperator(
    subdag=feature_engineering_sub_dag(dag.dag_id, 'Feature_engineering_for_train',
                                       model_path=project_path + 'models/',
                                       input_file=project_path + 'datasets/temp/X_train_raw.parquet',
                                       output_file=project_path + 'datasets/temp/X_train_final.parquet',
                                       temp_files=temp_files[0:10],
                                       start_date=dag.start_date,
                                       schedule_interval=dag.schedule_interval),
    task_id='Feature_engineering_for_train',
    dag=dag,
)

task_concat_train_files.set_downstream(task_feature_engineering_for_train)

input_parquet_raw_file_unit = DataInputFileUnit(project_path + 'datasets/temp/X_train_final.parquet',
                                                pandas_read_function_name='read_parquet')
task_model_learning = DataOperator(operation_function=fit_write_model,
                                   params={'columns_selection': features_selection,
                                           'column_target': feature_target,
                                           'write_path': project_path + 'models/ensemble.model'
                                           },
                                   input_unit=input_parquet_raw_file_unit,
                                   dag=dag, task_id='Model_learning')

task_feature_engineering_for_train.set_downstream(task_model_learning)

input_csv_files_unit = DataInputMultiFileUnit([project_path + 'datasets/input/X_tree_egc_t2.csv',
                                               project_path + 'datasets/input/X_geoloc_egc_t2.csv',
                                               project_path + 'datasets/input/Y_tree_egc_t2.csv'], sep=';')
output_parquet_unit = DataOutputFileUnit(project_path + 'datasets/temp/X_test_raw.parquet',
                                         pandas_write_function_name='to_parquet')
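
DataInputMultiFileUnit receives three CSV paths and one shared sep, and concat_train_test in example #5 consumes its output, so the unit presumably yields one DataFrame per path. A hedged sketch (the method name mirrors the single-file sketch above and is an assumption):

import pandas as pd


class DataInputMultiFileUnitSketch:
    """Illustrative only: read several files and pass a list of DataFrames on."""

    def __init__(self, input_paths, pandas_read_function_name='read_csv', **read_kwargs):
        self.input_paths = input_paths
        self.read = getattr(pd, pandas_read_function_name)
        self.read_kwargs = read_kwargs  # e.g. sep=';' shared by all three files

    def read_data_frame(self):
        return [self.read(path, **self.read_kwargs) for path in self.input_paths]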