Exemplo n.º 1
0
def classificar():
    """Train a TPOT classifier on the spreadsheet ``data/data.xlsx``.

    Loads the workbook, label-encodes its first column as the target, maps
    the two-level ``DEST AUTOPECAS`` column to 0/1 and fills the remaining
    blanks with -1. TPOT is then fitted on a 70/30 split of the ``LABELS``
    columns; the hold-out score is printed and the best pipeline is
    exported to ``tpot_classif_pipeline.py``.
    """
    frame = pd.read_excel('data/data.xlsx')

    # The first column holds the multi-level class; turn it into integers.
    target = LabelEncoder().fit_transform(frame.iloc[:, 0])
    print(target)

    # Two-level flag column: 'N' -> 0, 'S' -> 1 (any other value raises KeyError).
    flag_codes = {'N': 0, 'S': 1}
    frame['DEST AUTOPECAS'] = list(map(flag_codes.__getitem__, frame['DEST AUTOPECAS']))

    # Whatever is still missing gets the default value -1.
    frame.fillna(-1, inplace=True)

    features = frame[LABELS].values
    X_train, X_test, y_train, y_test = train_test_split(
        features, np.array(target), test_size=0.3)

    optimizer = TPOTClassifier(generations=5, population_size=50, verbosity=3)
    optimizer.fit(X_train, y_train)
    print(optimizer.score(X_test, y_test))
    optimizer.export('tpot_classif_pipeline.py')
Exemplo n.º 2
0
    def run_AutoTpot(self):
        """Run a TPOT search, persist the result, then time and visualize predictions.

        Fits a one-generation ``TPOTClassifier`` with the 'TPOT sparse'
        configuration on ``self.train`` / ``self.y_train``, exports the best
        pipeline as a script under ``self.args.save_dir``, dumps the fitted
        optimizer under ``self.args.checkpoint_dir``, prints the average
        elapsed time per row of ``self.test`` and passes the rounded
        predictions to ``self.visualize``.
        """
        searcher = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
        searcher.fit(self.train, self.y_train)

        # Write the best pipeline out as a standalone scikit-learn script.
        export_path = os.path.join(self.args.save_dir, 'tpot-sportswear.py')
        searcher.export(export_path)

        print('The best pipeline discovered through auto-tpot is {}'.format(searcher.fitted_pipeline_))

        print('Saving the best model discovered through TPOT.')
        checkpoint_path = os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle')
        joblib.dump(searcher, checkpoint_path)

        # The timed region covers label prediction, probability prediction
        # and the rounding of the labels to integers.
        started = timeit.default_timer()
        raw_labels = searcher.predict(self.test)
        searcher.predict_proba(self.test)  # result is not used afterwards
        predictions = list(map(round, raw_labels))
        finished = timeit.default_timer()

        seconds_per_row = (finished - started) / self.test.shape[0]
        print('Time per prediction : {}'.format(seconds_per_row))

        self.visualize(predictions, searcher)
Exemplo n.º 3
0
def cli(erv_data):
    """Fit TPOT on ERV expression data and write a confusion-matrix report.

    Args:
        erv_data: Anything ``pandas.read_csv`` accepts. The CSV must have a
            ``class`` column. Column index 1 is used as the target and the
            columns from index 2 onward as features.
            NOTE(review): this assumes ``class`` is the column at index 1 -
            confirm against the data file.

    Side effects:
        Exports the best pipeline to ``tpot_exported_pipeline.py``, prints
        the hold-out accuracy and saves a ``ConfusionMatrix`` HTML report
        named ``report``.
    """
    # import the ERV expression data as a Pandas dataframe
    df = pd.read_csv(erv_data)
    categories = df['class'].astype("category")
    # integer code -> original class name, used to label the report
    class_codes = dict(enumerate(categories.cat.categories))
    df["class"] = categories.cat.codes

    # create the test and training data
    X_train, X_test, y_train, y_test = train_test_split(df.values[:, 2:],
                                                        df.values[:, 1],
                                                        train_size=0.75,
                                                        test_size=0.25)

    # convert them all to floats
    X_train, X_test = X_train.astype(float), X_test.astype(float)
    y_train, y_test = y_train.astype(float), y_test.astype(float)

    # search for a pipeline and export the winner
    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')

    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")

    # Predict the whole hold-out set in one call instead of row by row.
    predicted = pipeline_optimizer.predict(X_test)
    cm = ConfusionMatrix([class_codes[y] for y in y_test],
                         [class_codes[y] for y in predicted])
    cm.save_html("report")
Exemplo n.º 4
0
def main():
    """Fit TPOT on the training CSV augmented with precomputed t-SNE features.

    Reads ``data/train_data.csv`` and ``data/valid_data.csv`` (last column is
    the target), appends the arrays stored in ``data/tsne_2d_5p.npz`` as
    extra feature columns, runs the TPOT search, prints the score on the
    validation data and exports the best pipeline to ``tpot_pipeline.py``.
    """
    train_frame = pd.read_csv('data/train_data.csv')
    valid_frame = pd.read_csv('data/valid_data.csv')

    # Everything but the last column is a feature; the last one is the target.
    columns = train_frame.columns
    feature_cols = list(columns[:-1])
    target_col = columns[-1]

    X_train, y_train = train_frame[feature_cols].values, train_frame[target_col].values
    X_valid, y_valid = valid_frame[feature_cols].values, valid_frame[target_col].values

    tsne_data = np.load('data/tsne_2d_5p.npz')

    # Column-wise concatenation of the raw features and the t-SNE arrays.
    X_train_concat = np.concatenate([X_train, tsne_data['X_train']], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_data['X_valid']], axis=1)

    # NOTE(review): `num_cv_folds` and scoring='log_loss' look like spellings
    # from an older TPOT release - confirm against the installed version.
    search_settings = dict(max_time_mins=60 * 24,
                           population_size=100,
                           scoring='log_loss',
                           num_cv_folds=3,
                           verbosity=2,
                           random_state=67)
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train_concat, y_train)
    print(optimizer.score(X_valid_concat, y_valid))
    optimizer.export('tpot_pipeline.py')
Exemplo n.º 5
0
def run_tpot(zeros, ones):
    """Search for a classifier on PCA-reduced data and write a metrics report.

    Args:
        zeros: Passed to ``make_all_data`` together with ``ones`` to build
            the feature matrix and label vector.
        ones: See ``zeros``.

    Side effects:
        Exports the best pipeline to ``tpot_ecog_pipeline.py`` and writes a
        scikit-learn classification report for the 10% hold-out split to
        ``tpot_metrics.txt``.
    """
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data,
                                                        y,
                                                        test_size=.1)

    # Fit the projection on the training rows only and reuse it for the test
    # rows; re-fitting on the test set would project both sets onto
    # different components and make the hold-out evaluation meaningless.
    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        max_eval_time_mins=30,
        memory='auto')

    tpot.fit(X_train, y_train)
    # Export before evaluating so the pipeline is saved even if scoring fails.
    tpot.export('tpot_ecog_pipeline.py')

    results = tpot.predict(X_test)
    report = sklearn.metrics.classification_report(y_test, results)
    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(report)
Exemplo n.º 6
0
class TPot(Model):
    """``Model`` implementation that runs a TPOT search and exports the result."""

    def __init__(self):
        print("Starting t pot!")

    def fit(self, X, y, title=None):
        """Run the TPOT search on ``X`` / ``y`` and export the best pipeline.

        Args:
            X: Feature matrix; per the original author this is the complete
                dataset, not a training split.
            y: Target vector matching ``X``.
            title: Tag inserted into the exported file name via ``str()``;
                the default ``None`` yields ``..._None_.py``.

        Side effects:
            Stores the fitted optimizer on ``self.pipeline_optimizer`` and
            writes ``./automl/tpot_exported_pipeline_<title>_.py``.
        """
        self.pipeline_optimizer = TPOTClassifier(
            generations=5,
            cv=5,
            random_state=42,
            verbosity=3,
            n_jobs=8,
            max_eval_time_mins=1,
            scoring='f1',
            subsample=0.5
        )
        self.pipeline_optimizer.fit(X, y)

        # exist_ok avoids the race between a separate exists() check and the
        # directory creation.
        os.makedirs("./automl", exist_ok=True)

        self.pipeline_optimizer.export('./automl/tpot_exported_pipeline_' + str(title) + '_.py')

    def predict(self, X):
        """Placeholder: prediction is not implemented and returns ``None``."""
        return None
def main(**kwargs) -> None:
    """Featurize the labelled data and run a TPOT hyper-parameter search.

    Keyword arguments whose names match a ``TPOTClassifier`` constructor
    parameter are forwarded to TPOT; all remaining ones go to ``Featurizer``.

    Side effects:
        Reads every file under ``LABELS_PATH``. When
        ``periodic_checkpoint_folder`` is given, the best pipeline is
        exported there as ``best_pipeline.py``. When ``verbosity`` is
        greater than 0, the training shape and the train/test scores are
        printed.
    """
    # `inspect.getargspec` was removed in Python 3.11; `inspect.signature`
    # is the supported replacement. Resolve the parameter set once.
    tpot_params = inspect.signature(TPOTClassifier).parameters
    tpot_kwargs = {key: kwargs.pop(key) for key in list(kwargs) if key in tpot_params}

    # loads all data into memory.
    paths = [os.path.join(LABELS_PATH, fname) for fname in os.listdir(LABELS_PATH)]
    X_raw, y = load_data(paths)
    # Assign the filled columns back: an in-place fillna on a column obtained
    # by attribute access is not guaranteed to update the parent frame.
    X_raw['title'] = X_raw['title'].fillna('')
    X_raw['channel_title'] = X_raw['channel_title'].fillna('')

    # splits data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, y,
        random_state=SEED, train_size=TRAIN_SIZE, test_size=1-TRAIN_SIZE, shuffle=True)

    # KLUDGE: preprocesses text deterministically (i.e. NOT part of the TPOT
    # hyperparameter optimization pipeline).
    featurizer = Featurizer(**kwargs)
    featurizer.fit(X_train)
    X_train = featurizer.transform(X_train)

    verbose = tpot_kwargs.get('verbosity', 0) > 0
    if verbose:
        print(f'Beginning hyper-parameter search with training data shape: {X_train.shape}.')

    tpot = TPOTClassifier(**tpot_kwargs)
    tpot.fit(X_train, y_train)

    if 'periodic_checkpoint_folder' in tpot_kwargs:
        tpot.export(os.path.join(tpot_kwargs['periodic_checkpoint_folder'], 'best_pipeline.py'))

    if verbose:
        X_test = featurizer.transform(X_test)
        # Built-in round() works for plain floats as well as NumPy scalars.
        print(f'Train set score: {round(tpot.score(X_train, y_train), 4)}')
        print(f'Test set score: {round(tpot.score(X_test, y_test), 4)}')
    return None
Exemplo n.º 8
0
def tpot_generation(X_train, y_train, X_test, y_test):
    """Run a TPOT search, print the hold-out score and export the best pipeline.

    The search uses 10 generations with a population of 20 on 4 workers; the
    winning pipeline is written to ``tpot_quiniela_pipeline.py``.
    """
    search_settings = dict(generations=10,
                           population_size=20,
                           verbosity=2,
                           n_jobs=4)
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    holdout_score = optimizer.score(X_test, y_test)
    print(holdout_score)

    optimizer.export('tpot_quiniela_pipeline.py')
Exemplo n.º 9
0
def tpotClassifier(train_data, target_value):
    """Fit a default ``TPOTClassifier`` on a 75/25 split and export the best pipeline.

    Args:
        train_data: Table holding both the features and the target column.
            NOTE(review): assumed to be a pandas DataFrame, since it is
            indexed by column name - confirm with the callers.
        target_value: Name of the target column in ``train_data``.

    Returns:
        Tuple ``(classifier, score)``: the fitted optimizer and its score on
        the held-out 25%.

    Side effects:
        Writes the best pipeline to ``my_pipeline.py``.
    """
    # The target column must not be part of the features; otherwise every
    # candidate pipeline can read the answer straight from its input.
    features = train_data.drop(columns=[target_value])
    target = train_data[target_value]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.75, test_size=0.25)

    classifier = TPOTClassifier()
    classifier.fit(X_train, y_train)
    score: float = classifier.score(X_test, y_test)
    classifier.export('my_pipeline.py')
    return classifier, score
Exemplo n.º 10
0
def clfWithTpot(X, y):
    """Run a 10-generation TPOT search on a 90/10 split of ``X`` / ``y``.

    Prints the hold-out score and the confusion matrix of the hold-out
    predictions, and exports the best pipeline to ``exported_pipeline.py``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

    # TPOT is fed NumPy arrays; convert each split once up front.
    train_features, train_labels = np.array(X_train), np.array(y_train)
    test_features, test_labels = np.array(X_test), np.array(y_test)

    search = TPOTClassifier(generations=10, verbosity=2)
    search.fit(train_features, train_labels)
    print(search.score(test_features, test_labels))
    search.export('exported_pipeline.py')

    predictions = search.predict(test_features)
    print(confusion_matrix(y_test, predictions))
Exemplo n.º 11
0
def main():
    """
    Uses tpot (Tree-based Pipeline Optimization Tool), an Automated Machine
    Learning tool, to find and output the best machine learning model for
    the Titanic dataset read from ``../data/titanic.csv``.

    See https://github.com/EpistasisLab/tpot

    Prints the number of levels of each categorical column and the score on
    the 25% validation split, and exports the best pipeline to automodel.py.
    """
    titanic = pd.read_csv('../data/titanic.csv')
    titanic.rename(columns={'Survived': 'class'}, inplace=True)

    for category in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
        print("Number of levels in category '{0}': \b {1:2.2f} ".format(
            category, titanic[category].unique().size))

    # Encode the two low-cardinality categorical columns as integers.
    titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
    titanic['Embarked'] = titanic['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Missing values (including unmapped categories) become the sentinel -999.
    titanic = titanic.fillna(-999)

    # One indicator column per distinct cabin value.
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    CabinTrans = mlb.fit_transform([{str(val)}
                                    for val in titanic['Cabin'].values])

    # Drop identifier/text columns and the target from the feature table.
    titanic_new = titanic.drop(
        ['PassengerId', 'Name', 'Ticket', 'Cabin', 'class'], axis=1)

    # Create numpy arrays
    titanic_new = np.hstack((titanic_new.values, CabinTrans))
    titanic_class = titanic['class'].values

    # Stratified split of the row labels into training and validation parts.
    # https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
    # https://stackoverflow.com/questions/55525195/do-i-have-to-do-one-hot-encoding-separately-for-train-and-test-dataset
    training_indices, validation_indices = train_test_split(
        titanic.index, stratify=titanic_class, train_size=0.75, test_size=0.25)

    # Train model
    tpot = TPOTClassifier(verbosity=2,
                          max_time_mins=2,
                          max_eval_time_mins=0.04,
                          population_size=40)
    tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

    # Report the validation score instead of computing and discarding it.
    print(tpot.score(titanic_new[validation_indices],
                     titanic.loc[validation_indices, 'class'].values))

    # Export
    tpot.export('automodel.py')
Exemplo n.º 12
0
def test_export():
    """Check that exporting from an unfitted TPOTClassifier raises ValueError."""
    unfitted = TPOTClassifier()

    try:
        unfitted.export("test_export.py")
    except ValueError:
        # Expected: there is no optimized pipeline to export yet.
        pass
    else:
        assert False  # export() must not succeed without a fitted pipeline
Exemplo n.º 13
0
def test_export():
    """Check that exporting from an unfitted TPOTClassifier raises ValueError."""
    unfitted = TPOTClassifier()

    try:
        unfitted.export("test_export.py")
    except ValueError:
        # Expected: there is no optimized pipeline to export yet.
        pass
    else:
        assert False  # export() must not succeed without a fitted pipeline
Exemplo n.º 14
0
def tpot_train(project,
               X,
               y,
               export_file,
               prediction_type,
               train_size=0.75,
               max_time_mins=1,
               max_eval_time_mins=0.04,
               population_size=40,
               scoring_func=None,
               n_jobs=1):
    """Run a TPOT classification or regression search and export the best pipeline.

    Args:
        project: Not used by this function; kept for interface compatibility.
        X: Feature matrix.
        y: Target vector.
        export_file: Path the best pipeline is exported to.
        prediction_type: ``"classification"`` selects ``TPOTClassifier``;
            any other value selects ``TPOTRegressor`` (with ``warm_start``).
        train_size: Fraction (or count) of rows used for training.
        max_time_mins: Overall time budget passed to TPOT.
        max_eval_time_mins: Per-pipeline evaluation budget passed to TPOT.
        population_size: TPOT population size.
        scoring_func: Value passed as TPOT's ``scoring`` argument.
        n_jobs: Number of parallel workers passed to TPOT.

    Returns:
        The fitted TPOT optimizer.
    """
    print(
        "==========train / test split for training size {}".format(train_size))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size)
    print(X_train.shape, y_train.shape)

    print("==========Start training the model...")
    print("==========max_time_mins: {}".format(max_time_mins))
    print("==========max_eval_time_mins: {}".format(max_eval_time_mins))
    print("==========population_size: {}".format(population_size))
    print("==========n_jobs: {}".format(n_jobs))

    # Settings shared by both prediction types.
    search_kwargs = dict(verbosity=2,
                         max_time_mins=max_time_mins,
                         max_eval_time_mins=max_eval_time_mins,
                         population_size=population_size,
                         scoring=scoring_func,
                         n_jobs=n_jobs)

    # prediction type:
    # - regression
    # - classification
    if prediction_type == "classification":
        tpot = TPOTClassifier(**search_kwargs)
    else:
        tpot = TPOTRegressor(warm_start=True, **search_kwargs)

    tpot.fit(X_train, y_train)

    # Scoring is best-effort: a failure is reported but must not prevent the
    # export. Catch Exception only, so Ctrl-C and SystemExit still propagate.
    try:
        holdout_score = tpot.score(X_test, y_test)
        print("==========holdout set score is {}".format(holdout_score))
    except Exception as exc:
        print("==========Unexpected error when score holdout set: {!r}".format(exc))

    print("==========export tpot to {}".format(export_file))
    tpot.export(export_file)

    return tpot
Exemplo n.º 15
0
def do_tpot(generations=5, population_size=10,X='',y=''):
    """Run a TPOT search with 3-fold CV on an 80/20 split of ``X`` / ``y``.

    Prints the hold-out score, exports the best pipeline to
    ``tpot_pipeline.py`` and returns the fitted optimizer.
    """
    split = train_test_split(X, y, train_size=0.80, test_size=0.20)
    X_train, X_test, y_train, y_test = split

    optimizer = TPOTClassifier(generations=generations,
                               population_size=population_size,
                               verbosity=2,
                               cv=3)
    optimizer.fit(X_train, y_train)

    holdout_score = optimizer.score(X_test, y_test)
    print(holdout_score)

    optimizer.export('tpot_pipeline.py')
    return optimizer
Exemplo n.º 16
0
 def tpot_classifiers(self, X_train, y_train, X_test, y_test, save_path):
     """Run a TPOT search scored by balanced accuracy.

     The best pipeline is exported to ``save_path`` with ``.py`` appended,
     and the score on the test data is printed afterwards.
     """
     print('Training using Tpot')
     search_settings = dict(generations=10,
                            population_size=25,
                            cv=3,
                            random_state=0,
                            verbosity=2,
                            scoring='balanced_accuracy')
     optimizer = TPOTClassifier(**search_settings)
     optimizer.fit(X_train, y_train)

     export_path = save_path + '.py'
     optimizer.export(export_path)

     print(optimizer.score(X_test, y_test))
Exemplo n.º 17
0
def T_Pot(X_train, X_test, y_train, y_test):
    """Run a TPOT search with early stopping and export the best pipeline.

    Prints the score on the test data and writes the winning pipeline to
    ``pipelineOutput.py``.
    """
    search_settings = dict(generations=5,
                           population_size=50,
                           cv=5,
                           random_state=42,
                           verbosity=2,
                           early_stop=3,
                           n_jobs=-1)
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    test_score = optimizer.score(X_test, y_test)
    print(test_score)

    optimizer.export('pipelineOutput.py')
Exemplo n.º 18
0
def run_main():
    """Run a 20-generation TPOT search on the preprocessed ``Survived`` data.

    Reads ``./preprocessed_data.csv``, holds out 30% of the rows (fixed
    seed), prints the hold-out score and exports the best pipeline to
    ``tpot_exported_pipeline.py``.
    """
    df_train = pd.read_csv('./preprocessed_data.csv')
    df_test = pd.read_csv('./preprocessed_test_data.csv')  # loaded but not used below

    # `columns=` replaces the positional axis argument, which pandas 2.0
    # no longer accepts.
    X = np.array(df_train.drop(columns=['Survived']))
    y = np.array(df_train['Survived'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    tpot = TPOTClassifier(generations=20, verbosity=2)
    tpot.fit(X_train, y_train)

    # Report the hold-out score instead of discarding it
    # (the original author recorded 0.824626865672 for one run).
    print(tpot.score(X_test, y_test))
    tpot.export('tpot_exported_pipeline.py')
Exemplo n.º 19
0
def tune(X_train, X_test, y_train, y_test):
    """Time a 10-generation TPOT search and report its test accuracy.

    Prints the wall-clock duration of the fit and the score on the test
    data, then saves the best pipeline to ``tpot_pipeline.py``.
    """
    started = time.time()
    optimizer = TPOTClassifier(generations=10, verbosity=2)
    optimizer.fit(X_train, y_train)
    elapsed = time.time() - started

    print('TPOT classifier finished in {} seconds'.format(elapsed))
    print('Best pipeline test accuracy: {:.3f}'.format(optimizer.score(X_test, y_test)))

    # Persist the winning pipeline as a Python script.
    optimizer.export('tpot_pipeline.py')
Exemplo n.º 20
0
def tpot_search(X_train, X_test, y_train, y_test, target_column):
    """Run a 'TPOT sparse' search and export the best pipeline per target.

    ``X_test`` and ``y_test`` are accepted but not used. The pipeline is
    written to ``output/tpot_exported_pipeline_<target_column>.py``.
    """
    search_settings = dict(generations=30,
                           population_size=30,
                           cv=5,
                           random_state=42,
                           verbosity=2,
                           config_dict='TPOT sparse')
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    export_path = 'output/tpot_exported_pipeline_' + target_column + '.py'
    optimizer.export(export_path)
Exemplo n.º 21
0
def tpot(X_train, y_train, X_test=None, y_test=None,
         export_file='../results/models/tpot/exported_pipeline.py', n_jobs=1):
    """Run an F1-scored TPOT search and export the best pipeline.

    Args:
        X_train: Training features; must expose ``.columns`` and ``.drop``.
        y_train: Training target.
        X_test: Optional hold-out features of the same kind as ``X_train``.
        y_test: Optional hold-out target.
        export_file: Path the best pipeline is exported to.
        n_jobs: Number of parallel workers passed to TPOT.

    Side effects:
        Writes ``export_file``. When both ``X_test`` and ``y_test`` are
        given, prints the hold-out score.
    """
    id_columns = ['node', 'target']

    def _without_ids(frame):
        # Drop the two identifier columns only when both are present.
        # (`'node' and 'target' in cols` only ever tested 'target'.)
        if set(id_columns).issubset(frame.columns):
            return frame.drop(columns=id_columns)
        return frame

    X_train = _without_ids(X_train)
    if X_test is not None:
        X_test = _without_ids(X_test)

    # Honor the caller's n_jobs instead of a hard-coded worker count.
    optimizer = TPOTClassifier(generations=5, population_size=40, cv=3,
                               verbosity=2, scoring='f1', n_jobs=n_jobs)

    optimizer.fit(X_train, y_train)
    optimizer.export(export_file)

    # The hold-out set is optional; only score when it was supplied.
    if X_test is not None and y_test is not None:
        print(optimizer.score(X_test, y_test))
Exemplo n.º 22
0
def main():
    """Train and export a TPOT pipeline for the ``fpd`` target.

    Reads ``data/hl_test_clean.csv``, splits it by ``book_date`` into a
    training window (2017-04-01 through 2017-07-20) and a test window
    (after 2017-07-20 through 2017-08-31), runs the TPOT search on a fixed
    list of features, exports the best pipeline to
    ``tpot_exported_pipeline.py`` and passes everything to ``getReport``.
    """
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    book_date = df['book_date']

    # The training window keeps the 2017-07-20 boundary; the test window
    # starts strictly after it so that no row ends up in both sets.
    trainSet = df[(book_date >= '2017-04-01')
                  & (book_date <= '2017-07-20')].reset_index(drop=True)
    testSet = df[(book_date > '2017-07-20')
                 & (book_date <= '2017-08-31')].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = XGBClassifier(learning_rate=0.01,
                        max_depth=7,
                        min_child_weight=15,
                        n_estimators=100,
                        nthread=1,
                        subsample=0.6500000000000001)
    # NOTE(review): the executor is constructed but its result is not used
    # below; the feature list is hard-coded instead.
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = [
        'hl_call_domesitc_cnt_2m', 'hl_contact_early_morning_cnt_5m',
        'hl_phone_silent_frequentcy', 'hl_contact_night_pct',
        'hl_transactions_total_amt_5m', 'hl_region_call_cnt_max_uniq_num_cnt',
        'hl_region_call_out_cnt_max_avg_call_in_time',
        'hl_contact_morning_cnt_5m',
        'hl_region_call_in_time_max_avg_call_in_time',
        'hl_transactions_total_amt_2m', 'hl_contact_night_cnt_5m',
        'hl_phone_num_used_time_months',
        'hl_region_call_cnt_max_avg_callin_time',
        'hl_region_call_in_time_max_uniq_num_cnt',
        'hl_region_call_in_cnt_max_avg_call_out_time',
        'hl_transactions_min_5m',
        'hl_region_call_out_time_max_avg_call_out_time'
    ]

    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()

    # AutoML stage: search for a pipeline with TPOT.
    pipeline_optimizer = TPOTClassifier(generations=5,
                                        population_size=20,
                                        cv=4,
                                        random_state=42,
                                        verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test,
              y_test)
Exemplo n.º 23
0
def TPOT_Classifier():
    """Run a time-boxed TPOT search on data taken from the enclosing scope.

    ``x_train``, ``y_train``, ``x_test`` and ``y_test`` are not parameters;
    they are resolved from outside the function. Exports the best pipeline
    to ``tpot_assignment_pipeline.py``, prints the test score, the test
    labels and the predictions, and returns the score.
    """
    optimizer = TPOTClassifier(verbosity=2, max_time_mins=390, population_size=40)
    optimizer.fit(x_train, y_train)
    optimizer.export('tpot_assignment_pipeline.py')

    predicted = optimizer.predict(x_test)
    score = optimizer.score(x_test, y_test)

    for item in (score, y_test, predicted):
        print(item)
    return score
Exemplo n.º 24
0
def find_best_model(X_train, X_test, y_train, y_test):
    """Run a long 'TPOT sparse' search, print the test score and export the result.

    The best pipeline is written to ``outputs/tpot_exported_pipeline.py``.
    """
    search_settings = dict(generations=100,
                           population_size=50,
                           cv=5,
                           random_state=42,
                           verbosity=2,
                           config_dict='TPOT sparse')
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    test_score = optimizer.score(X_test, y_test)
    print(test_score)

    optimizer.export('outputs/tpot_exported_pipeline.py')
Exemplo n.º 25
0
def tpot_train(cat, X, y):
    """Run a ROC-AUC-scored TPOT search on an 80/20 split of ``X`` / ``y``.

    Prints the hold-out score and exports the best pipeline to
    ``<cat>-pipeline.py``.
    """
    split = train_test_split(X, y, train_size=0.8, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    search_settings = dict(generations=15,
                           population_size=20,
                           verbosity=5,
                           n_jobs=-1,
                           scoring='roc_auc')
    optimizer = TPOTClassifier(**search_settings)
    optimizer.fit(X_train, y_train)

    print(optimizer.score(X_test, y_test))

    export_path = cat + '-pipeline.py'
    optimizer.export(export_path)
    def tpot_select_model(x_train, y_train, x_test, y_test):
        """Run a TPOT search, print the test score and export the best pipeline.

        The script for the winning pipeline is written to
        ``tpot_exported_pipeline.py``.
        """
        from tpot import TPOTClassifier

        search_settings = dict(generations=10,
                               population_size=50,
                               verbosity=2,
                               n_jobs=-1)
        optimizer = TPOTClassifier(**search_settings)
        optimizer.fit(x_train, y_train)

        # Performance on the held-out data.
        test_score = optimizer.score(x_test, y_test)
        print(test_score)

        optimizer.export('tpot_exported_pipeline.py')
Exemplo n.º 27
0
def classification():
    """Run a seeded TPOT search on the scikit-learn digits dataset.

    Uses a fixed 75/25 split, prints the test score and exports the best
    pipeline to ``tpot_digits_pipeline.py``.
    """
    digits = load_digits()
    split = train_test_split(digits.data,
                             digits.target,
                             train_size=0.75,
                             test_size=0.25,
                             random_state=42)
    X_train, X_test, y_train, y_test = split

    optimizer = TPOTClassifier(generations=5,
                               population_size=50,
                               verbosity=2,
                               random_state=42)
    optimizer.fit(X_train, y_train)

    test_score = optimizer.score(X_test, y_test)
    print(test_score)
    optimizer.export('tpot_digits_pipeline.py')
Exemplo n.º 28
0
def Classifier(x, y):
    """Fit a 10-minute TPOT search on ``x`` / ``y`` and return the test score.

    ``x_test`` and ``y_test`` are not parameters; they are resolved from
    outside the function. The best pipeline is exported to
    ``tpot_pipeline.py``.
    """
    optimizer = TPOTClassifier(verbosity=2, max_time_mins=10, population_size=50)
    optimizer.fit(x, y)
    optimizer.export('tpot_pipeline.py')

    # Predictions are computed but not used afterwards.
    optimizer.predict(x_test)
    return optimizer.score(x_test, y_test)
Exemplo n.º 29
0
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None, n_jobs=1):
    """Run a time-boxed TPOT search and collect accuracy and weighted F1.

    Args:
        X: Feature matrix.
        y: Target vector.
        target_ft: Not used by this function; kept for interface compatibility.
        time_budget: Search budget, divided by 60 to obtain TPOT's
            ``max_time_mins``.
            NOTE(review): presumably seconds - confirm with the callers.
        include_preprocessors: When truthy, TPOT may build arbitrary
            pipelines; otherwise the search is restricted to the
            ``'Classifier'`` template.
        n_jobs: Number of parallel workers passed to TPOT.

    Returns:
        Tuple ``(metrics_text, row)``: ``metrics_text`` is the ``str()`` of
        a two-item list of metric descriptions; ``row`` is an 8-item list
        with the F1 score at index 4, the accuracy at index 5 and the result
        of ``export()`` at index 7.
    """
    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

    # True division keeps sub-minute budgets: `time_budget // 60` floored the
    # default of 30 to 0, which is not a usable time limit.
    search_kwargs = dict(max_time_mins=time_budget / 60,
                         generations=None,
                         use_dask=False,
                         n_jobs=n_jobs)
    if not include_preprocessors:
        search_kwargs['template'] = 'Classifier'
    pipeline_optimizer = TPOTClassifier(**search_kwargs)

    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')

    metrs = ["Accuracy score - " + str(acc), "F1 score - " + str(f1_s)]
    res = ["", "", "", "", f1_s, acc, "", pipeline_optimizer.export()]

    return str(metrs), res
Exemplo n.º 30
0
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a default TPOT search and write timing and submission files.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Features to predict.
        id_test: Identifiers written to the ``id`` column of the submission.
        name_dataset: Prefix for the two output files.

    Side effects:
        Exports the best pipeline to ``tpot_pipeline_dont_overfit.py``,
        writes the value returned by ``timer`` to ``<name_dataset>_tpot`` and
        the predictions to ``<name_dataset>_tpot_submission.csv``.
    """
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    # `elapsed` rather than `time`, so the stdlib module name is not shadowed.
    elapsed = timer(start_time)
    preds = tp.predict(X_test)

    # The context manager closes the file even if the write fails.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv',
                      index=False)
Exemplo n.º 31
0
def tpot_optimization_clf(count, train_path, test_path, verbose=False):
    """
    Search algorithms and hyper-parameters with TPOT over a restricted
    configuration (XGBoost and random forest classifiers).

    Prints the score on the test samples and exports the best pipeline to
    ``tpot_pipeline_clf.py``.

    :param count: int, number of samples to be generated.
    :param train_path: string, path to the dataset used for training.
    :param test_path: string, path to the dataset used for testing.
    :param verbose: bool, whether progress messages should be printed.
    """

    def announce(message):
        # Progress output is opt-in.
        if verbose:
            print(message)

    announce("Get train samples. ")
    X_train, Y_train = Sampler.generate_samples(dataset=train_path,
                                                count=count)
    announce("Get test samples. ")
    X_test, Y_test = Sampler.generate_samples(dataset=test_path, count=count)

    # NOTE(review): "reg:linear" is a regression objective inside a
    # classifier search space - confirm this is intended.
    xgboost_space = {
        'max_depth': [2, 3, 4, 5],
        "learning_rate": [0.02, 0.05, 0.1, 0.15, 0.2],
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'objective': ["reg:linear", "multi:softmax", "multi:softprob"],
        'booster': ["gbtree", "gblinear", "dart"],
        'n_jobs': [-1]
    }
    forest_space = {
        'n_estimators': [10, 20, 30, 40, 50, 100, 500],
        'criterion': ["gini", "entropy"],
        'max_features': ["auto", "sqrt", "log2"],
        'max_depth': [2, 3, 4, 5],
        'n_jobs': [-1]
    }
    tpot_config = {
        'xgboost.XGBClassifier': xgboost_space,
        'sklearn.ensemble.RandomForestClassifier': forest_space,
    }

    announce("Start TPOT optimization. ")

    optimizer = TPOTClassifier(generations=10,
                               population_size=30,
                               verbosity=2,
                               config_dict=tpot_config)
    optimizer.fit(np.array(X_train), np.array(Y_train))

    # The test arrays are explicitly cast to float64 before scoring.
    test_features = np.array(X_test, dtype=np.float64)
    test_labels = np.array(Y_test, dtype=np.float64)
    print(optimizer.score(test_features, test_labels))

    optimizer.export('tpot_pipeline_clf.py')
Exemplo n.º 32
0
def test_export():
    """Check export(): RuntimeError before a pipeline exists, a file afterwards."""
    export_target = "test_export.py"
    optimizer = TPOTClassifier()

    # Without an optimized pipeline, export must refuse to run.
    assert_raises(RuntimeError, optimizer.export, export_target)

    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5), ZeroCount(input_matrix))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    # Install a hand-built individual as the optimized pipeline, then export it.
    optimizer._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, optimizer._pset)
    optimizer.export(export_target)

    assert path.isfile(export_target)
    remove(export_target)  # clean up exported file
Exemplo n.º 33
0
def generate_model(generations, train_X, train_y):
    """Run a TPOT search and export the best pipeline.

    Args:
        generations: Number of TPOT generations; also embedded in the
            exported file name.
        train_X: Training features.
        train_y: Training target.

    Side effects:
        Writes the best pipeline to ``tpot_model<generations>.py``.
    """
    tpot_generator = TPOTClassifier(generations=generations, verbosity=2)
    tpot_generator.fit(train_X, train_y)
    # str() is required: concatenating an integer generation count to a
    # string raises TypeError.
    tpot_generator.export('tpot_model' + str(generations) + '.py')
Exemplo n.º 34
0
# Attach the national benchmark series as an extra column.
# Per the original author, all values are percentage changes since the start
# of the data (1975-01-01).
HPI = HPI.join(benchmark['United States'])
HPI.dropna(inplace=True)

# Period-over-period change, with infinities treated as missing values.
housing_pct = HPI.pct_change().replace([np.inf, -np.inf], np.nan)

# The next period's US value, aligned with the current row.
housing_pct['US_HPI_future'] = housing_pct['United States'].shift(-1)
housing_pct.dropna(inplace=True)

def create_labels(cur_hpi, fut_hpi):
    """Return 1 when the future value is strictly greater than the current one, else 0."""
    return 1 if fut_hpi > cur_hpi else 0

# Label each row: 1 when the next period's US value exceeds the current one.
housing_pct['label'] = list(map(create_labels, housing_pct['United States'], housing_pct['US_HPI_future']))

# Features are every column except the label and the look-ahead column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts.
X = np.array(housing_pct.drop(columns=['label', 'US_HPI_future']))
y = np.array(housing_pct['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Search for a pipeline, report the hold-out score and export the winner.
tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('HPI_tpot_pipeline.py')
Exemplo n.º 35
0
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the scikit-learn digits dataset and hold out 25% of it.
digits = load_digits()

X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
)

# Search for a pipeline, report the hold-out score and export the winner.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
#features = data
#tpot_data=pd.DataFrame({'class':label},columns=['class'])
#training_features, testing_features, training_classes, testing_classes = \
#    train_test_split(features, tpot_data['class'], random_state=42)
# Stack the per-chunk arrays into one feature matrix and one label vector.
data = np.concatenate(samples)
label = np.concatenate(label)
# Build the row index from the concatenated labels. Computing len(label) in
# the same tuple assignment used the length of the un-concatenated list, so
# the index covered only as many rows as there were chunks.
idx_row = np.arange(len(label))
print('shuffle')
for ii in range(100):
    shuffle(idx_row)
data, label = data[idx_row, :], label[idx_row]
X_train, X_test, y_train, y_test = train_test_split(data, label, train_size=0.80)
print('model selection')
# NOTE(review): `num_cv_folds` looks like the spelling from an older TPOT
# release - confirm against the installed version.
tpot = TPOTClassifier(generations=10, population_size=25,
                      verbosity=2, random_state=373849, num_cv_folds=5, scoring='roc_auc')
tpot.fit(X_train, y_train)
# Report the hold-out score instead of discarding it.
print(tpot.score(X_test, y_test))
tpot.export('%s%s_tpot_exported_pipeline.py' % (folder, type_))
print('finished model selection')
"""
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X)
    ),
    SelectFwe(alpha=0.05, score_func=f_classif),
# Copy every one-hot origin column into the sample frame.
for origin_column in list(origin_dummies):
    sample_df[origin_column] = origin_dummies[origin_column]

# Hold out 30% of the rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(sample_df, labels, train_size=0.7)

le = preprocessing.LabelEncoder()  # created but not used in the visible code

# Search for a pipeline, report the hold-out score and export the winner.
tpot = TPOTClassifier(generations=7, population_size=15, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_cars_pipeline.py')

#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
#Best pipeline: GradientBoostingClassifier(RobustScaler(input_matrix), GradientBoostingClassifier__learning_rate=1.0, GradientBoostingClassifier__max_depth=5, GradientBoostingClassifier__max_features=0.25, GradientBoostingClassifier__min_samples_leaf=DEFAULT, GradientBoostingClassifier__min_samples_split=17, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.7)
# 0.770491803279

#tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
#Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=DEFAULT, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=7, ExtraTreesClassifier__n_estimators=DEFAULT)
#0.762295081967

# Without MPG
#tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
#Best pipeline: ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=DEFAULT, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.45, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=6, ExtraTreesClassifier__n_estimators=DEFAULT)
#0.754098360656

# All features set