Example #1
class MachineLearning:
    def __init__(self, worker):

        self.worker = worker
        self.ldb = LotteryDatabase()

        # variables
        self.x = None
        self.y = None

        self.x_train = None
        self.x_validation = None
        self.y_train = None
        self.y_validation = None

        self.N_FOLDS = 5
        self.MAX_EVALS = 20
        self.RANDOM_STATE = 42
        self.training_size = 15
        self.n_increment = 10
        self.curr_game = CONFIG['games']['mini_lotto']

        # features
        self.table_headers = []
        self.table_name = self.worker.table_name
        self.features = self.curr_game['features']

        for i in range(self.worker.window.list_model.count()):
            feature_name = self.worker.window.list_model.item(i).text()
            feature_len = self.features[feature_name]['length'] + 1
            feature_header = self.features[feature_name]['header']
            self.table_headers += [
                feature_header + str(n) + ' INTEGER'
                for n in range(1, feature_len)
            ]

    def generate_df_pieces(self, connection, chunk_size, offset, ids):
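        """Recreate `tempView` over the current table, then yield its rows
        (DRAFT_ID <= `ids`) as DataFrame chunks of at most `chunk_size`
        rows, starting at `offset`."""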

        last_row = self.ldb.get_table_length(self.table_name)
        chunks = int(math.ceil(last_row / chunk_size))
        n_chunk = 1

        self.ldb.delete_view('tempView')
        self.ldb.create_view(
            'tempView',
            ",".join(['DRAFT_ID'] + self.table_headers + ['LABEL']),
            self.table_name)

        while True:
            self.worker.signal_status.emit(
                str.format('Collecting data from database... {} of {}',
                           n_chunk, chunks))
            sql_ct = "SELECT * FROM tempView WHERE DRAFT_ID <= %d limit %d offset %d" % (
                ids, chunk_size, offset)
            df_piece = pd.read_sql_query(sql_ct, connection)

            if not df_piece.shape[0]:
                break
            yield df_piece

            if df_piece.shape[0] < chunk_size:
                break

            offset += chunk_size
            n_chunk += 1

        self.worker.signal_status.emit('')

    def embedded_learning(self, input_array, limit=0, draft_id=0):
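        """Score every candidate draw against historical statistics (most and
        least frequent numbers, recent pairs and triples, La Jolla sets),
        write the scored rows to a CSV, then append a model prediction
        column."""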

        original_len = self.ldb.get_table_length('INPUT_' +
                                                 self.curr_game['database'])

        dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 100000, 0,
                                    original_len - limit))
        array = dataset.values

        self.x = array[:, 1:len(self.table_headers) + 1]
        self.y = array[:, len(self.table_headers) + 1]

        tb._SYMBOLIC_SCOPE.value = True
        model = self.choose_model(keras=True)

        self.worker.table_name = (
            'PREDICT_' + self.worker.window.combo_predict_model.currentText())

        convert = ConvertModel(self.worker,
                               list(map(int, input_array.split(" "))), limit)
        convert.create_prediction_model(input_array)

        self.table_name = (
            'PREDICT_' + self.worker.window.combo_predict_model.currentText())

        output_dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 10000, 0, 0))
        output_array = output_dataset.values
        output_x = output_array[:, 1:len(self.table_headers) + 1]

        original_len = self.ldb.get_table_length(self.worker.table_name) + 1

        now = datetime.datetime.now()
        file_name_r = str.format(
            '{} {}', self.worker.window.combo_predict_model.currentText(),
            now.strftime("%Y-%m-%d %H %M %S")) + model['info']

        export_columns = ['DRAFT_NR', 'FIRST', 'SECOND', 'THIRD', 'FOURTH', 'FIFTH', 'ODD_EVEN', 'LOW_HIGH', 'LA_JOLLA',
                          'SCORE_ALL', 'SCORE_TOP', 'SCORE_LESS', 'SCORE_2', 'SCORE_3', 'LABEL'] + \
                         ['OUTPUT ' + str(n) for n in range(1, self.training_size+1)]

        with open('archived/' + file_name_r + '.csv', 'a',
                  newline='') as csv_file:

            writer = csv.writer(csv_file)
            writer.writerow(export_columns)

            score_all, score_2, score_3, score_top, score_less = 0, 0, 0, 0, 0

            pairs_two = convert.get_latest_pairs(2)
            pairs_three = convert.get_latest_pairs(3)

            latest_numbers = convert.get_latest_top()
            top_numbers = nlargest(20, latest_numbers, key=latest_numbers.get)
            less_numbers = nsmallest(20,
                                     latest_numbers,
                                     key=latest_numbers.get)

            head = ','.join(['LA_JOLLA_' + str(n) for n in range(1, 6)])

            # self.ldb.db_delete_view('LA_JOLLA_VIEW')
            # self.ldb.db_create_view('LA_JOLLA_VIEW', head, 'LA JOLLA')
            # self.ldb.db_execute('SELECT * FROM LA_JOLLA_VIEW')

            # with the three calls above commented out, fetchall() returns the
            # results of whatever query the cursor last executed
            la_jolla_db = self.ldb.c.fetchall()

            for o in range(1, original_len):
                fetch_one = list(self.ldb.fetchone(self.table_name, o))

                originals = fetch_one[2:self.curr_game['length'] + 2]
                label_column = [fetch_one[-1]]

                output_list = [
                    n + 1 for n in range(0, len(originals))
                    if originals[n] == 1
                ]

                odd_count = len(
                    list(filter(lambda x: (x % 2 != 0), output_list)))
                even_count = len(
                    list(filter(lambda x: (x % 2 == 0), output_list)))

                if even_count > odd_count:
                    odd_even_check = 1
                else:
                    odd_even_check = 0

                high_low = sum(x > 21 for x in output_list)

                decrease = 0
                for top in top_numbers:
                    if int(top) in output_list:
                        score_all += (1 - decrease)
                        score_top += (1 - decrease)
                    decrease += 0.05

                decrease = 0
                for top in less_numbers:
                    if int(top) in output_list:
                        # score_all += (1 - decrease)
                        score_less += (1 - decrease)
                    decrease += 0.05

                output_counter = Counter(combinations(output_list, 2))

                decrease = 0
                for pair in pairs_two:
                    if pair in output_counter:
                        score_all += (1 - decrease)
                        score_2 += (1 - decrease)
                    decrease += 0.01

                output_counter = Counter(combinations(output_list, 3))

                decrease = 0
                for pair in pairs_three:
                    if pair in output_counter:
                        score_all += (1 - decrease)
                        score_3 += (1 - decrease)
                    decrease += 0.01

                # fetchall() yields tuples, so compare a tuple, not a list
                if tuple(output_list) in la_jolla_db:
                    la_jolla = 1
                else:
                    la_jolla = 0

                output_list = ([draft_id] + output_list +
                               [odd_even_check, high_low, la_jolla, score_all,
                                score_top, score_less, score_2, score_3] +
                               label_column)

                writer.writerow(output_list)

                score_all, score_2, score_3, score_top, score_less = 0, 0, 0, 0, 0

            self.worker.signal_status.emit('')

        if self.worker.window.check_keras.isChecked():

            model['model'].fit(self.x,
                               self.y,
                               epochs=100,
                               batch_size=212,
                               verbose=2)

            prediction = model['model'].predict(output_x)

            combined_set = list(map(str, prediction))

            with open('archived/' + file_name_r + '.csv',
                      'r') as read_csv_file:

                csv_input = csv.reader(read_csv_file)
                next(csv_input)

                now = datetime.datetime.now()
                file_name_w = str.format(
                    '{} {}',
                    self.worker.window.combo_predict_model.currentText(),
                    now.strftime("%Y-%m-%d %H %M %S")) + model['info']

                with open('archived/' + file_name_w + '.csv', 'w',
                          newline='') as csv_file:
                    writer = csv.writer(csv_file)
                    writer.writerow(export_columns)

                    for row, o in zip(csv_input, combined_set):
                        writer.writerow(row + [o])

            os.remove('archived/' + file_name_r + '.csv')
            file_name_r = file_name_w

        elif self.worker.window.combo_predict_ml.currentText() in (
                'LogisticRegression', 'SGDClassifier'):

            model['model'].fit(self.x, self.y)

            prediction = model['model'].predict(output_x)

            combined_set = list(map(str, prediction))

            with open('archived/' + file_name_r + '.csv',
                      'r') as read_csv_file:

                csv_input = csv.reader(read_csv_file)
                next(csv_input)

                now = datetime.datetime.now()
                file_name_w = str.format(
                    '{} {}',
                    self.worker.window.combo_predict_model.currentText(),
                    now.strftime("%Y-%m-%d %H %M %S")) + model['info']

                with open('archived/' + file_name_w + '.csv', 'w',
                          newline='') as csv_file:
                    writer = csv.writer(csv_file)
                    writer.writerow(export_columns)

                    for row, o in zip(csv_input, combined_set):
                        writer.writerow(row + [o])

            os.remove('archived/' + file_name_r + '.csv')
            file_name_r = file_name_w

        else:

            for t in range(self.training_size):
                self.worker.signal_status.emit(
                    str.format('Training in process... {} of {}', t + 1,
                               self.training_size))
                model['model'].n_estimators += self.n_increment
                model['model'].fit(self.x, self.y)

                prediction = model['model'].predict(output_x)

                combined_set = list(map(str, prediction))

                with open('archived/' + file_name_r + '.csv',
                          'r') as read_csv_file:

                    csv_input = csv.reader(read_csv_file)
                    next(csv_input)

                    now = datetime.datetime.now()
                    file_name_w = str.format(
                        '{} {}',
                        self.worker.window.combo_predict_model.currentText(),
                        now.strftime("%Y-%m-%d %H %M %S")) + model['info']

                    with open('archived/' + file_name_w + '.csv',
                              'w',
                              newline='') as csv_file:

                        writer = csv.writer(csv_file)
                        writer.writerow(export_columns)

                        for row, o in zip(csv_input, combined_set):

                            writer.writerow(row + [o])

                os.remove('archived/' + file_name_r + '.csv')
                file_name_r = file_name_w

        self.worker.signal_status.emit('')

        msg = ''

        # msg = "Algorithm: RandomForestClassifier" + '\n' + \
        #       "Number of estimators: {}".format(forest.n_estimators) + '\n' + \
        #       "Accuracy on training set: {:.3f}".format(forest.score(x_train, y_train)) + '\n' + \
        #       "Accuracy on test set: {:.3f}".format(forest.score(x_validation, y_validation))

        return msg

    def choose_model(self, params=None, fresh=False, keras=False):
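        """Return a dict holding a 'model' built from `params`, a fresh
        default, or tuned presets, plus an 'info' suffix used in export
        file names."""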

        model, info = None, None

        if keras:

            model = Sequential()
            model.add(
                Dense(units=220,
                      kernel_initializer='uniform',
                      input_dim=int(self.x.shape[1])))
            model.add(Activation('relu'))
            model.add(Dropout(0.27208339620963506))

            model.add(Dense(units=205, kernel_initializer='glorot_uniform'))
            model.add(Activation('relu'))
            model.add(Dropout(0.29152160619480066))

            model.add(Dense(1))
            model.add(Activation('sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='rmsprop')

            # label the export file; without this, `info` stays None and
            # embedded_learning would fail when concatenating it (' Keras '
            # is an assumed label, matching the style of the branches below)
            info = ' Keras ' + self.worker.window.combo_db.currentText()

        elif self.worker.window.list_ml.currentItem().text() == 'RandomForestClassifier':

            if params is not None:
                model = RandomForestClassifier(**params)
            elif fresh:
                model = RandomForestClassifier()
            else:
                model = RandomForestClassifier(warm_start=True,
                                               n_estimators=1,
                                               n_jobs=-1,
                                               random_state=self.RANDOM_STATE)

            # model = RandomForestClassifier(warm_start=True, n_estimators=1, n_jobs=-1, random_state=self.RANDOM_STATE)
            info = ' RFC ' + self.worker.window.combo_db.currentText()

        elif self.worker.window.list_ml.currentItem().text() == 'RandomForestRegressor':

            if params is not None:
                model = RandomForestRegressor(**params)
            elif fresh:
                model = RandomForestRegressor()
            else:
                model = RandomForestRegressor(warm_start=True,
                                              n_estimators=1,
                                              n_jobs=-1,
                                              random_state=self.RANDOM_STATE)

            info = ' RFR ' + self.worker.window.combo_db.currentText()

        elif self.worker.window.list_ml.currentItem().text() == 'LogisticRegression':

            if params is not None:
                model = linear_model.LogisticRegression(**params)
            elif fresh:
                model = linear_model.LogisticRegression()
            else:
                model = linear_model.LogisticRegression(C=50,
                                                        solver='liblinear')
            info = ' LR ' + self.worker.window.combo_db.currentText()

        elif self.worker.window.list_ml.currentItem().text() == 'SGDClassifier':

            if params is not None:
                model = linear_model.SGDClassifier(**params)
            elif fresh:
                model = linear_model.SGDClassifier()
            else:
                model = linear_model.SGDClassifier(class_weight='balanced',
                                                   loss='hinge',
                                                   max_iter=2426,
                                                   tol=1.6246894453989777e-05,
                                                   warm_start=True)
            # model = linear_model.SGDClassifier(class_weight='balanced', loss='log', max_iter=2330, tol=7.289319599768096e-05)
            # model = linear_model.SGDClassifier(max_iter=1486, tol=4.663673194605843e-05, loss='log', fit_intercept=False)
            # model = linear_model.SGDClassifier(max_iter=840, tol=15.8197115265907305e-05, class_weight='balanced', loss='modified_huber')
            info = ' SGD ' + self.worker.window.combo_db.currentText()

        return {'model': model, 'info': info}

    def choose_space(self, keras=False):
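        """Return the hyperopt search space for the Keras model or for the
        currently selected scikit-learn model."""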

        space = {}

        if keras:

            space = {
                'choice': hp.choice('num_layers', [
                    {'layers': 'two'},
                    {'layers': 'three',
                     'units3': hp.uniform('units3', 64, 1024),
                     'dropout3': hp.uniform('dropout3', .25, .75)},
                ]),
                'units1': hp.uniform('units1', 64, 1024),
                'units2': hp.uniform('units2', 64, 1024),
                'dropout1': hp.uniform('dropout1', .25, .75),
                'dropout2': hp.uniform('dropout2', .25, .75),
                'batch_size': hp.uniform('batch_size', 28, 128),
                'nb_epochs': 100,
                'optimizer': hp.choice('optimizer',
                                       ['adadelta', 'adam', 'rmsprop']),
                'activation': 'relu'
            }

        elif self.worker.window.list_ml.currentItem().text() == 'RandomForestClassifier':

            space = {
                'n_estimators': hp.choice('n_estimators', range(100, 1500)),
                'class_weight': hp.choice(
                    'class_weight', ['balanced', 'balanced_subsample', None]),
                'max_features': hp.choice('max_features',
                                          ['auto', 'sqrt', 'log2']),
                'bootstrap': hp.choice('bootstrap', [True, False]),
                'max_depth': hp.choice('max_depth', [None, 1, 3]),
                'criterion': hp.choice('criterion', ['gini', 'entropy'])
            }

        elif self.worker.window.list_ml.currentItem().text() == 'RandomForestRegressor':

            space = {
                'n_estimators': hp.choice('n_estimators', range(10, 150)),
                'warm_start': hp.choice('warm_start', [True, False]),
                # 'class_weight' removed: RandomForestRegressor does not
                # accept it, so sampling it would raise a TypeError
                'max_features': hp.choice('max_features', ['auto', 'sqrt']),
                'bootstrap': hp.choice('bootstrap', [True, False]),
                'max_depth': hp.choice('max_depth', [None, 1, 2, 3]),
                'min_samples_split': hp.choice('min_samples_split', [2, 3]),
                'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2])
            }

        elif self.worker.window.list_ml.currentItem().text() == 'LogisticRegression':

            space = {
                'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'sag']),
                'warm_start': hp.choice('warm_start', [True, False]),
                'class_weight': hp.choice('class_weight', ['balanced', None]),
                'tol': hp.uniform('tol', 0.00001, 0.0001),
                'C': hp.uniform('C', 1.0, 50.0),
                'fit_intercept': hp.choice('fit_intercept', [True, False]),
                'max_iter': hp.choice('max_iter', range(100, 3000))
            }

        elif self.worker.window.list_ml.currentItem().text() == 'SGDClassifier':

            space = {
                'class_weight': hp.choice('class_weight', [None, 'balanced']),
                'warm_start': hp.choice('warm_start', [True, False]),
                'fit_intercept': hp.choice('fit_intercept', [True, False]),
                'tol': hp.uniform('tol', 0.00001, 0.0001),
                'loss': hp.choice(
                    'loss',
                    ['hinge', 'log', 'squared_hinge', 'modified_huber']),
                'max_iter': hp.choice('max_iter', range(500, 3000))
            }

        return space

    def sklearn_model_train(self):
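        """Tune the selected scikit-learn model with hyperopt, then report
        validation metrics and a confusion matrix for the best parameters."""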

        print(self.worker.window.list_ml.currentItem().text())

        dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 100000, offset=0, ids=5000))
        array = dataset.values

        # skip the leading DRAFT_ID column; LABEL is the last column of the
        # view (same slicing as embedded_learning)
        self.x = array[:, 1:len(self.table_headers) + 1]
        self.y = array[:, len(self.table_headers) + 1]

        x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
            self.x, self.y, test_size=0.2, random_state=self.RANDOM_STATE)

        space = self.choose_space()

        bayes_trials = Trials()

        best = fmin(fn=self.sklearn_objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=self.MAX_EVALS,
                    trials=bayes_trials)

        print(best)

        for bt in bayes_trials:
            print(bt['result']['loss'])
            print(bt['result']['params'])

        model = self.choose_model(space_eval(space, best))
        model['model'].fit(x_train, y_train)
        y_pred = model['model'].predict(x_validation)

        msg = 'Accuracy Score : ' + str(accuracy_score(y_validation, y_pred)) + '\n' + \
              'Precision Score : ' + str(precision_score(y_validation, y_pred)) + '\n' + \
              'Recall Score : ' + str(recall_score(y_validation, y_pred)) + '\n' + \
              'F1 Score : ' + str(f1_score(y_validation, y_pred)) + '\n' + \
              'ROC_AUC Score:' + str(roc_auc_score(y_validation, y_pred))

        # 'Confusion Matrix : \n' + str(confusion_matrix(y_validation, y_pred))

        plt.figure()
        self.plot_confusion_matrix(confusion_matrix(y_validation, y_pred),
                                   classes=[1, 0])

        self.worker.signal_status.emit('')

        self.worker.signal_infobox.emit("Completed", msg)
        plt.show()

    def sklearn_objective(self, params, n_folds=5):
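        """Hyperopt objective: cross-validate the sampled parameters and
        return one minus the best fold's f1_macro score as the loss."""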

        # build the model from the sampled parameters; a fresh default model
        # would ignore what hyperopt suggests
        clf = self.choose_model(params=params)

        rus = RandomUnderSampler()

        # pipeline = make_pipeline(rus, clf)

        scores = model_selection.cross_val_score(clf['model'],
                                                 self.x,
                                                 self.y,
                                                 cv=n_folds,
                                                 scoring='f1_macro')

        # note: this takes the best fold; mean(scores) is the more common summary
        best_score = max(scores)

        loss = 1 - best_score

        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    def keras_model_train(self):

        dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 100000, offset=0, ids=5000))
        array = dataset.values

        # skip the leading DRAFT_ID column; LABEL is the last column of the
        # view (same slicing as embedded_learning)
        self.x = array[:, 1:len(self.table_headers) + 1]
        self.y = array[:, len(self.table_headers) + 1]

        self.x_train, self.x_validation, self.y_train, self.y_validation = model_selection.train_test_split(
            self.x, self.y, test_size=0.2, random_state=self.RANDOM_STATE)

        space = self.choose_space(keras=True)

        tb._SYMBOLIC_SCOPE.value = True

        bayes_trials = Trials()

        best = fmin(fn=self.keras_objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=self.MAX_EVALS,
                    trials=bayes_trials)

        print(best)

        for bt in bayes_trials:
            print(bt['result']['loss'])
            print(bt['result']['params'])

    def keras_objective(self, params):

        model = Sequential()
        model.add(
            Dense(units=int(params['units1']),
                  input_dim=self.x_train.shape[1]))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout1']))

        model.add(Dense(units=int(params['units2']),
                        kernel_initializer='glorot_uniform'))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))

        if params['choice']['layers'] == 'three':
            model.add(
                Dense(units=int(params['choice']['units3']),
                      kernel_initializer='glorot_uniform'))
            model.add(Activation(params['activation']))
            model.add(Dropout(params['choice']['dropout3']))

        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=params['optimizer'])

        # hp.uniform samples floats, so cast units/batch size to int above
        model.fit(self.x_train,
                  self.y_train,
                  epochs=params['nb_epochs'],
                  batch_size=int(params['batch_size']),
                  verbose=0)

        pred_auc = model.predict(self.x_validation,
                                 batch_size=128,
                                 verbose=0)
        acc = roc_auc_score(self.y_validation, pred_auc)
        # print('AUC:', acc)
        # sys.stdout.flush()

        # hyperopt requires a 'status' key in the returned dict
        return {'loss': -acc, 'params': params, 'status': STATUS_OK}

    @staticmethod
    def plot_confusion_matrix(cm, classes, normalize=False, cmap=plt.cm.Blues):

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()

    def validate_estimators(self, x, y):

        x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
            x, y, test_size=0.3, random_state=0)
        n_estimators = []
        train_results = []
        test_results = []
        rf = RandomForestRegressor(warm_start=True, n_estimators=0, n_jobs=-1)
        # rf = RandomForestClassifier(warm_start=True, n_estimators=0, n_jobs=-1)

        for t in range(self.training_size):

            rf.n_estimators += 3
            # rf.n_iter += 2
            # n_estimators += [rf.n_iter]
            n_estimators += [rf.n_estimators]

            self.worker.signal_status.emit(
                'Validating estimators: {} of {}. Current estimator: {}'.
                format(t + 1, self.training_size, rf.n_estimators))

            rf.fit(x_train, y_train)

            train_pred = rf.predict(x_train)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_train, train_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)
            train_results.append(roc_auc)

            y_pred = rf.predict(x_validation)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_validation, y_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)
            test_results.append(roc_auc)

        line1, = plt.plot(n_estimators, train_results, 'b', label="Train AUC")
        line2, = plt.plot(n_estimators, test_results, 'r', label="Test AUC")

        plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
        plt.ylabel('AUC score')
        plt.xlabel('n_estimators')
        plt.show()

    def validate_max_depth(self, x, y):

        x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
            x, y, test_size=0.3, random_state=0)

        max_depths = np.linspace(1, 32, 32, endpoint=True)

        train_results = []
        test_results = []

        for i, max_depth in enumerate(max_depths, 1):

            rf = RandomForestClassifier(warm_start=True,
                                        n_estimators=10,
                                        max_depth=max_depth,
                                        n_jobs=-1)

            self.worker.signal_status.emit(
                'Validating max depth: {} of {}.'.format(i, len(max_depths)))

            rf.fit(x_train, y_train)

            train_pred = rf.predict(x_train)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_train, train_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)

            train_results.append(roc_auc)

            y_pred = rf.predict(x_validation)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_validation, y_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)

            test_results.append(roc_auc)

        line1, = plt.plot(max_depths, train_results, 'b', label="Train AUC")
        line2, = plt.plot(max_depths, test_results, 'r', label="Test AUC")

        plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
        plt.ylabel('AUC score')
        plt.xlabel('Tree depth')
        plt.show()

    def validate_min_sample_split(self, x, y):

        x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
            x, y, test_size=0.3, random_state=0)

        # as floats, min_samples_split values must be fractions in (0, 1];
        # values above 1.0 raise a ValueError in scikit-learn
        min_samples_splits = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

        train_results = []
        test_results = []

        for i, min_samples_split in enumerate(min_samples_splits, 1):

            rf = RandomForestClassifier(warm_start=True,
                                        n_estimators=10,
                                        min_samples_split=min_samples_split,
                                        n_jobs=-1)

            self.worker.signal_status.emit(
                'Validating min sample split: {} of {}.'.format(
                    i, len(min_samples_splits)))

            rf.fit(x_train, y_train)

            train_pred = rf.predict(x_train)
            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_train, train_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)

            train_results.append(roc_auc)

            y_pred = rf.predict(x_validation)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_validation, y_pred)
            roc_auc = auc(false_positive_rate, true_positive_rate)
            test_results.append(roc_auc)

        line1, = plt.plot(min_samples_splits,
                          train_results,
                          'b',
                          label="Train AUC")
        line2, = plt.plot(min_samples_splits,
                          test_results,
                          'r',
                          label="Test AUC")

        plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
        plt.ylabel('AUC score')
        plt.xlabel('min samples split')
        plt.show()

    def validate_min_sample_leaf(self, x, y):

        x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
            x, y, test_size=0.3, random_state=0)

        min_samples_leafs = [1, 2, 3, 4, 5]

        train_results = []
        test_results = []

        for min_samples_leaf in min_samples_leafs:

            rf = RandomForestClassifier(warm_start=True,
                                        n_estimators=10,
                                        min_samples_leaf=min_samples_leaf,
                                        n_jobs=-1)

            self.worker.signal_status.emit(
                'Validating min sample leaf: {} of {}.'.format(
                    min_samples_leaf, len(min_samples_leafs)))

            rf.fit(x_train, y_train)
            train_pred = rf.predict(x_train)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_train, train_pred)

            roc_auc = auc(false_positive_rate, true_positive_rate)

            train_results.append(roc_auc)
            y_pred = rf.predict(x_validation)

            false_positive_rate, true_positive_rate, thresholds = roc_curve(
                y_validation, y_pred)

            roc_auc = auc(false_positive_rate, true_positive_rate)

            test_results.append(roc_auc)

        line1, = plt.plot(min_samples_leafs,
                          train_results,
                          'b',
                          label="Train AUC")
        line2, = plt.plot(min_samples_leafs,
                          test_results,
                          'r',
                          label="Test AUC")

        plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
        plt.ylabel('AUC score')
        plt.xlabel('min samples leaf')
        plt.show()

    def validate_max_features(self):
        pass

    def random_forest_predict(self):

        output_headers = ",".join(['ID INTEGER PRIMARY KEY'] +
                                  ['OUTPUT_LABEL INTEGER'])

        self.ldb.delete_table('OUTPUT_prediction')
        self.ldb.create_table('OUTPUT_prediction', output_headers)

        ids = 1

        dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 1000, offset=0, ids=ids))

        array = dataset.values

        x = array[:, :len(self.table_headers)]

        filename = 'random_forest.sav'
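The chunked read implemented by generate_df_pieces is the load-bearing pattern in this class: instead of pulling the whole table into memory, it pages through a view with LIMIT/OFFSET and lets pd.concat stitch the pieces together. A minimal, self-contained sketch of the same pattern follows (the table name and database file are hypothetical, chosen only for illustration):

import sqlite3

import pandas as pd


def read_in_chunks(connection, table, chunk_size):
    # Page through `table` with LIMIT/OFFSET, yielding DataFrames so the
    # full table never has to fit in memory at once.
    offset = 0
    while True:
        df = pd.read_sql_query(
            'SELECT * FROM %s LIMIT %d OFFSET %d' %
            (table, chunk_size, offset), connection)
        if not df.shape[0]:
            break
        yield df
        if df.shape[0] < chunk_size:
            break
        offset += chunk_size


conn = sqlite3.connect('lottery.db')  # hypothetical database file
full_df = pd.concat(read_in_chunks(conn, 'MODEL_ml', 100000))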
Example #2
File: test.py Project: MrDominikku/MYOPM
class TestTF:
    def __init__(self):

        self.ldb = LotteryDatabase(CONFIG['database'])

        self.x = None
        self.y = None

        self.x_train = None
        self.x_validation = None
        self.y_train = None
        self.y_validation = None

        self.curr_game = CONFIG['games']['mini_lotto']

        self.table_headers = []
        self.features = self.curr_game['features']

        feat = ['number_map', 'number_cycles', 'cool numbers']

        for i in feat:
            feature_len = self.features[i]['length'] + 1
            feature_header = self.features[i]['header']
            self.table_headers += [
                feature_header + str(n) + ' INTEGER'
                for n in range(1, feature_len)
            ]

    def main_tf(self):

        # dataset = dataframe.read_sql_table(table='MODEL_ml', uri='sqlite:///' + config['database'], index_col='ID', npartitions=6)

        dataset = pd.concat(
            self.generate_df_pieces(self.ldb.conn, 100000, offset=0, ids=5000))
        # dataset.compute()
        array = dataset.values

        # skip the leading DRAFT_ID column; LABEL is the last column of the
        # view
        self.x = array[:, 1:len(self.table_headers) + 1]
        self.y = array[:, len(self.table_headers) + 1]

        self.x_train, self.x_validation, self.y_train, self.y_validation = model_selection.train_test_split(
            self.x, self.y, test_size=0.2, random_state=42)

        bayes_trials = Trials()

        # `space` is assumed to be defined elsewhere in test.py; a compatible
        # sketch is shown after this example
        best = fmin(fn=self.keras_objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=10,
                    trials=bayes_trials)

        print(best)

        for bt in bayes_trials:
            print(bt['result']['loss'])
            print(bt['result']['params'])

    def generate_df_pieces(self, connection, chunk_size, offset, ids):

        last_row = self.ldb.get_table_length('MODEL_ml')
        chunks = int(math.ceil(last_row / chunk_size))
        n_chunk = 1

        self.ldb.delete_view('tempView')
        self.ldb.create_view(
            'tempView',
            ",".join(['DRAFT_ID'] + self.table_headers + ['LABEL']),
            'MODEL_ml')

        while True:
            print(
                str.format('Collecting data from database... {} of {}',
                           n_chunk, chunks))
            sql_ct = "SELECT * FROM tempView WHERE DRAFT_ID <= %d limit %d offset %d" % (
                ids, chunk_size, offset)
            df_piece = pd.read_sql_query(sql_ct, connection)

            if not df_piece.shape[0]:
                break
            yield df_piece

            if df_piece.shape[0] < chunk_size:
                break

            offset += chunk_size
            n_chunk += 1

    def keras_objective(self, params):

        model = Sequential()
        model.add(
            Dense(units=int(params['units1']),
                  kernel_initializer='uniform',
                  input_dim=int(self.x_train.shape[1])))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout1']))

        model.add(
            Dense(units=int(params['units2']),
                  kernel_initializer='glorot_uniform'))
        model.add(Activation(params['activation']))
        model.add(Dropout(params['dropout2']))

        if params['choice']['layers'] == 'three':
            model.add(
                Dense(units=int(params['choice']['units3']),
                      kernel_initializer='glorot_uniform'))
            model.add(Activation(params['activation']))
            model.add(Dropout(params['choice']['dropout3']))

        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=params['optimizer'])

        # hp.uniform samples floats, so cast units/epochs/batch size to int
        model.fit(self.x_train,
                  self.y_train,
                  epochs=int(params['epochs']),
                  batch_size=int(params['batch_size']),
                  verbose=2)

        pred_auc = model.predict(self.x_validation,
                                 batch_size=int(params['batch_size']),
                                 verbose=2)
        acc = roc_auc_score(self.y_validation, pred_auc)
        print('AUC:', acc)
        sys.stdout.flush()

        return {'loss': -acc, 'params': params, 'status': STATUS_OK}
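main_tf references a `space` that is not part of this excerpt. Based on the keys keras_objective reads ('units1', 'units2', 'dropout1', 'dropout2', the optional third layer under 'choice', 'activation', 'optimizer', 'epochs', 'batch_size'), a compatible definition might look like the search space from Example #1, with 'epochs' in place of 'nb_epochs'. This is an assumed sketch, not the project's original definition:

from hyperopt import hp

# Assumed hyperopt search space matching the keys keras_objective consumes.
space = {
    'choice': hp.choice('num_layers', [
        {'layers': 'two'},
        {'layers': 'three',
         'units3': hp.uniform('units3', 64, 1024),
         'dropout3': hp.uniform('dropout3', .25, .75)},
    ]),
    'units1': hp.uniform('units1', 64, 1024),
    'units2': hp.uniform('units2', 64, 1024),
    'dropout1': hp.uniform('dropout1', .25, .75),
    'dropout2': hp.uniform('dropout2', .25, .75),
    'batch_size': hp.uniform('batch_size', 28, 128),
    'epochs': 100,
    'optimizer': hp.choice('optimizer', ['adadelta', 'adam', 'rmsprop']),
    'activation': 'relu',
}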