示例#1
0
def test_generate_pipeline_code():
    """Check that generate_pipeline_code() renders a fixed pipeline tree into the expected source text."""
    # The tree is assembled from its two CombineDFs branches for readability.
    boosting_branch = ['GradientBoostingClassifier', 'input_matrix', 38.0, 0.87, 0.5]
    naive_bayes_branch = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    combined_inputs = ['CombineDFs', boosting_branch, naive_bayes_branch]
    pipeline = ['KNeighborsClassifier', combined_inputs, 18, 33]

    expected_code = """make_pipeline(
    make_union(
        make_union(VotingClassifier(estimators=[('branch',
            GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500)
        )]), FunctionTransformer(lambda X: X)),
        make_union(VotingClassifier(estimators=[('branch',
            make_pipeline(
                ZeroCount(),
                GaussianNB()
            )
        )]), FunctionTransformer(lambda X: X))
    ),
    KNeighborsClassifier(n_neighbors=5, weights="distance")
)"""

    generated_code = generate_pipeline_code(pipeline)
    assert expected_code == generated_code
示例#2
0
def test_generate_pipeline_code():
    """Verify the source text generate_pipeline_code() produces for one hand-written pipeline tree."""
    # Root operator first, then its input sub-tree, then its two trailing arguments.
    pipeline = [
        'KNeighborsClassifier',
        [
            'CombineDFs',
            ['GradientBoostingClassifier', 'input_matrix', 38.0, 0.87, 0.5],
            ['GaussianNB', ['ZeroCount', 'input_matrix']],
        ],
        18,
        33,
    ]

    expected_code = """make_pipeline(
    make_union(
        make_union(VotingClassifier(estimators=[('branch',
            GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, min_weight_fraction_leaf=0.5, n_estimators=500)
        )]), FunctionTransformer(lambda X: X)),
        make_union(VotingClassifier(estimators=[('branch',
            make_pipeline(
                ZeroCount(),
                GaussianNB()
            )
        )]), FunctionTransformer(lambda X: X))
    ),
    KNeighborsClassifier(n_neighbors=5, weights="distance")
)"""

    actual_code = generate_pipeline_code(pipeline)
    assert expected_code == actual_code
示例#3
0
def test_generate_pipeline_code_2():
    """Check generate_pipeline_code() on a pipeline tree that nests one CombineDFs inside another."""
    # NOTE(review): tpot_obj is not created in this function; it is presumably
    # a module-level fixture -- confirm it exists where this test lives.
    boosting_branch = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5
    ]
    scaled_zero_count = ['ZeroCount', ['MaxAbsScaler', 'input_matrix']]
    inner_union = ['CombineDFs', ['MinMaxScaler', 'input_matrix'], scaled_zero_count]
    outer_union = ['CombineDFs', boosting_branch, inner_union]
    pipeline = ['KNeighborsClassifier', outer_union, 18, 'uniform', 2]

    expected_code = """make_pipeline(
    make_union(
        StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)),
        make_union(
            MinMaxScaler(),
            make_pipeline(
                MaxAbsScaler(),
                ZeroCount()
            )
        )
    ),
    KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2)
)"""

    generated_code = generate_pipeline_code(pipeline, tpot_obj.operators)
    assert expected_code == generated_code
示例#4
0
文件: tests.py 项目: val922/tpot
def test_generate_pipeline_code():
    """Check that generate_pipeline_code() renders a fixed pipeline tree into the expected source text."""
    tpot_obj = TPOTClassifier()

    # Two branches feed CombineDFs, whose output is the classifier's input.
    boosting_branch = [
        'GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5
    ]
    naive_bayes_branch = ['GaussianNB', ['ZeroCount', 'input_matrix']]
    pipeline = [
        'KNeighborsClassifier',
        ['CombineDFs', boosting_branch, naive_bayes_branch],
        18,
        'uniform',
        2,
    ]

    expected_code = """make_pipeline(
    make_union(
        make_union(VotingClassifier([('branch',
            GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)
        )]), FunctionTransformer(copy)),
        make_union(VotingClassifier([('branch',
            make_pipeline(
                ZeroCount(),
                GaussianNB()
            )
        )]), FunctionTransformer(copy))
    ),
    KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2)
)"""

    generated_code = generate_pipeline_code(pipeline, tpot_obj.operators)
    assert expected_code == generated_code
示例#5
0
def test_generate_pipeline_code():
    """Verify the source text generate_pipeline_code() produces for one hand-written pipeline tree."""
    tpot_obj = TPOTClassifier()
    operators = tpot_obj.operators

    # Root operator first, then its input sub-tree, then its trailing arguments.
    pipeline = [
        'KNeighborsClassifier',
        [
            'CombineDFs',
            ['GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5],
            ['GaussianNB', ['ZeroCount', 'input_matrix']],
        ],
        18,
        'uniform',
        2,
    ]

    expected_code = """make_pipeline(
    make_union(
        make_union(VotingClassifier([('branch',
            GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)
        )]), FunctionTransformer(copy)),
        make_union(VotingClassifier([('branch',
            make_pipeline(
                ZeroCount(),
                GaussianNB()
            )
        )]), FunctionTransformer(copy))
    ),
    KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2)
)"""

    actual_code = generate_pipeline_code(pipeline, operators)
    assert expected_code == actual_code
示例#6
0
def test_generate_pipeline_code_2():
    """Verify generate_pipeline_code() output for a tree containing two nested CombineDFs nodes."""
    # NOTE(review): tpot_obj is not created in this function; it is presumably
    # a module-level fixture -- confirm it exists where this test lives.
    min_max_branch = ['MinMaxScaler', 'input_matrix']
    zero_count_branch = ['ZeroCount', ['MaxAbsScaler', 'input_matrix']]
    pipeline = [
        'KNeighborsClassifier',
        [
            'CombineDFs',
            ['GradientBoostingClassifier', 'input_matrix', 38.0, 5, 5, 5, 0.05, 0.5],
            ['CombineDFs', min_max_branch, zero_count_branch],
        ],
        18,
        'uniform',
        2,
    ]

    expected_code = """make_pipeline(
    make_union(
        StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=38.0, max_depth=5, max_features=5, min_samples_leaf=5, min_samples_split=0.05, n_estimators=0.5)),
        make_union(
            MinMaxScaler(),
            make_pipeline(
                MaxAbsScaler(),
                ZeroCount()
            )
        )
    ),
    KNeighborsClassifier(n_neighbors=18, p="uniform", weights=2)
)"""

    actual_code = generate_pipeline_code(pipeline, tpot_obj.operators)
    assert expected_code == actual_code
    def _save_periodic_pipeline(self, gen):
        """Export Pareto-front pipelines that have not been exported yet.

        For every pipeline on ``self._pareto_front`` whose generated code is
        not already in ``self._exported_pipeline_text``, the pipeline is
        refitted on ``self.features``/``self.target``, scored on the test
        split, written as a ``.py`` file into
        ``self.periodic_checkpoint_folder`` and recorded in ``self.log[gen]``.

        This is best effort: any exception is caught and reported through
        the progress bar instead of being raised.

        Parameters
        ----------
        gen
            Generation identifier; used in the exported file name and as the
            key into ``self.log``.
        """
        try:
            #self._create_periodic_checkpoint_folder()
            for pipeline, pipeline_scores in zip(self._pareto_front.items,
                                                 reversed(self._pareto_front.keys)):
                sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(pipeline, self._pset), self.operators)

                # Skip pipelines that were exported before. Checking first
                # avoids generating the export text and refitting the model
                # only to throw both away.
                if sklearn_pipeline_str in self._exported_pipeline_text:
                    self._update_pbar(pbar_num=0, pbar_msg='Periodic pipeline was not saved, probably saved before...')
                    continue

                idx = self._pareto_front.items.index(pipeline)
                pareto_front_pipeline_score = pipeline_scores.wvalues[1]
                to_write = export_pipeline(pipeline, self.operators, self._pset,
                                           self._imputed,
                                           pareto_front_pipeline_score,
                                           self.random_state)

                # Fit the pipeline again and get the test score. The MAE is
                # negated before it is stored.
                sklearn_pipeline = self._toolbox.compile(expr=pipeline)
                sklearn_pipeline.fit(self.features, self.target)
                ypredict = sklearn_pipeline.predict(self.features_test)
                mae = - mean_absolute_error(self.target_test, ypredict)

                filename = os.path.join(self.periodic_checkpoint_folder,
                                        'pipeline_gen_{}_idx_{}_{}.py'.format(gen, idx, datetime.now().strftime('%Y.%m.%d_%H-%M-%S')))
                self._update_pbar(pbar_num=0, pbar_msg='Saving periodic pipeline from pareto front to {}'.format(filename))
                with open(filename, 'w') as output_file:
                    output_file.write(to_write)
                self._exported_pipeline_text.append(sklearn_pipeline_str)

                # Record the pipeline and its scores in the in-memory log;
                # nothing is persisted here.
                # NOTE(review): the entry for ``gen`` is reset for every saved
                # pipeline, so only the last one saved in a generation is
                # kept -- confirm this is intended.
                self.log[gen] = {}
                self.log[gen]['pipeline_name'] = sklearn_pipeline_str
                self.log[gen]['pipeline_score'] = pareto_front_pipeline_score
                self.log[gen]['pipeline_test_mae'] = mae
                self.log[gen]['pipeline_sklearn_obj'] = self._compile_to_sklearn(pipeline)
                # The tree can be used to derive the pipeline complexity.
                self.log[gen]['pipeline_tree'] = expr_to_tree(pipeline,
                        self._pset)

        except Exception as e:
            self._update_pbar(pbar_num=0, pbar_msg='Failed saving periodic pipeline,   exception:\n{}'.format(str(e)[:250]))
示例#8
0
# Column order of the long-format hyperparameter table written to CSV.
_PIPELINE_CSV_COLUMNS = [
    'PIPELINE', 'ALGO_NAME', 'STACK_FLG', 'PP_FLAG', 'SCORE',
    'HYPER_NAME', 'HYPER_TYPE', 'HYPER_VALUE'
]


def _preprocessing_step_prefixes(operators_context):
    """Return the lower-cased names of operators that count as preprocessing.

    An operator qualifies when the string form of its context entry mentions
    one of the preprocessing-related modules. The stacking estimator is
    excluded because it is reported separately through the stack flag.
    """
    module_markers = (
        'sklearn.preprocessing',
        'sklearn.decomposition',
        'tpot.builtins',
        'sklearn.cluster',
        'sklearn.feature_selection',
    )
    prefixes = []
    for name, obj in operators_context.items():
        lowered = name.lower()
        if lowered == 'stackingestimator':
            continue
        described = str(obj)
        if any(marker in described for marker in module_markers):
            prefixes.append(lowered)
    # str.startswith() accepts a tuple of alternatives.
    return tuple(prefixes)


def _describe_step(step_name, estimator, pp_prefixes):
    """Return ``(algo_name, stack_flag, pp_flag, params)`` for one pipeline step."""
    params = estimator.get_params()
    if 'stackingestimator' in step_name:
        # Report the wrapped estimator, not the stacking wrapper itself.
        stack_flag = 'Y'
        wrapped = params['estimator']
        algo_name = str(wrapped).split('(')[0].lower() + '_stack'
        params = wrapped.get_params()
    else:
        stack_flag = 'N'
        algo_name = step_name

    if not step_name.startswith(pp_prefixes):
        return algo_name, stack_flag, 'N', params

    params = estimator.get_params()
    if step_name in ('selectfrommodel', 'rfe'):
        # These selectors wrap another estimator; report its hyperparameters.
        algo_name = step_name
        params = params['estimator'].get_params()
    else:
        algo_name = str(estimator).split('(')[0].lower()
    return algo_name, stack_flag, 'Y', params


def _hyperparameter_frame(pipeline_number, algo_name, stack_flag, pp_flag,
                          score, params):
    """Build a frame with one row per hyperparameter of a single step."""
    names = list(params)
    values = [params[name] for name in names]
    # 'C' marks bool/str values, 'N' everything else.
    kinds = ['C' if isinstance(value, (bool, str)) else 'N' for value in values]
    count = len(names)
    return pd.DataFrame({
        'PIPELINE': [int(pipeline_number)] * count,
        'ALGO_NAME': [algo_name] * count,
        'STACK_FLG': [stack_flag] * count,
        'PP_FLAG': [pp_flag] * count,
        'SCORE': [score] * count,
        'HYPER_NAME': names,
        'HYPER_TYPE': kinds,
        'HYPER_VALUE': values,
    })


def createsklearnPipeline(pipeline_optimizer, pipes, csv_path='pipeline.csv'):
    """Write the hyperparameters of every evaluated pipeline to a CSV file.

    Each entry of ``pipeline_optimizer.evaluated_individuals_`` is compiled
    into a scikit-learn pipeline, and every step except feature unions
    contributes one CSV row per hyperparameter. Rows are appended without a
    header when ``csv_path`` already exists; otherwise the file is created
    with a header row.

    Parameters
    ----------
    pipeline_optimizer
        Object exposing ``operators_context``, ``evaluated_individuals_``,
        ``_pset`` and ``_toolbox`` (presumably a fitted TPOT estimator --
        confirm against the caller).
    pipes
        Offset for pipeline numbering; the first pipeline is ``1 + pipes``.
    csv_path
        Destination CSV file.

    Returns
    -------
    None
    """
    pp_prefixes = _preprocessing_step_prefixes(
        pipeline_optimizer.operators_context)
    pipeline_number = 1 + pipes
    frames = []

    evaluated = sorted(pipeline_optimizer.evaluated_individuals_.items(),
                       key=lambda item: item[0])
    for pipeline_string, attrib in evaluated:
        # Convert the pipeline string to a scikit-learn pipeline object.
        deap_pipeline = creator.Individual.from_string(
            pipeline_string, pipeline_optimizer._pset)
        sklearn_pipeline = pipeline_optimizer._toolbox.compile(
            expr=deap_pipeline)

        # The score is stored by magnitude, whatever its sign.
        cv_score = abs(attrib.get('internal_cv_score'))

        for step_name, estimator in sklearn_pipeline.steps:
            # Ignore feature unions, including suffixed names such as
            # 'featureunion-1' that the pipeline helper generates.
            if step_name.startswith('featureunion'):
                continue
            algo_name, stack_flag, pp_flag, params = _describe_step(
                step_name, estimator, pp_prefixes)
            if params:
                frames.append(_hyperparameter_frame(
                    pipeline_number, algo_name, stack_flag, pp_flag,
                    cv_score, params))
        pipeline_number += 1

    # Concatenate once at the end; DataFrame.append was removed in pandas 2.0.
    if frames:
        master = pd.concat(frames)
    else:
        # Keep the header so later header-less appends stay aligned.
        master = pd.DataFrame(columns=_PIPELINE_CSV_COLUMNS)

    # NOTE: outlier filtering by score (for regressors) is deliberately not
    # applied here.
    if path.exists(csv_path):
        master.to_csv(csv_path, mode='a', header=False, index=False)
    else:
        master.to_csv(csv_path, index=False)