def run_magellan(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')

    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

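    # Run the whole experiment three times, seeding the classifiers with the run index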
    for run in range(1, 4):
        for feature_combination in feature_combinations:

            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled data
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled data
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)

            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)

            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            # DIRTY FIX, CLEAN UP: empty price strings break the cast to float,
            # so coerce them to NaN before converting the price columns to float64
            if 'name' in A_t.columns:
                for df in (A_t, A_gs, B_t, B_gs):
                    df['price'] = df['price'].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                    df['price'] = df['price'].astype('float64')

                for df in (S_t, S_gs):
                    for col in ('ltable_price', 'rtable_price'):
                        df[col] = df[col].replace(r'^\s*$',
                                                  np.nan,
                                                  regex=True)
                        df[col] = df[col].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)

            match_c = em.get_attr_corres(A_t, B_t)

            match_c['corres'] = []

            # select attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

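            # Generate Magellan similarity features for the selected attribute correspondences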
            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

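            # Split off the validation pairs listed in valid_set using their pair_id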
            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].
                                isin(validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

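            # ratio of negative to positive labels, used below as scale_pos_weight for XGBoost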
            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # PredefinedSplit needs one fold entry per sample:
            # -1 keeps a sample in training only, 0 assigns it to the single validation fold
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

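                # Randomized hyper-parameter search on the predefined train/validation split, scored by F1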
                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(feature_names,
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k in ('RandomForest', 'DecisionTree', 'XGBoost'):
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'NaiveBayes':
                    word_importance = ''

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                # train_only_df was already shuffled above with the same seed
                feats_train = train_only_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_only_df['label']

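                # Refit the selected configuration on the training portion only and time training and prediction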
                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
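                    # LinearSVC has no predict_proba, so fall back to the decision function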
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
Example #2
class BaseClustering(AbstractMachineLearningBase):
    """Base class for classification

    Parameters
    ----------
    None

    Attributes
    ----------
    model_: Fitted model object, default None

    weights_: ndarray of shape (n_class, n_features) if the model is a linear model, else shape (1, n_features), default None
        Feature weights of the fitted model

    weights_norm_: ndarray of shape (n_class, n_features) if the model is a linear model, else shape (1, n_features), default None
        Normalized feature weights, obtained with StandardScaler (z-score).

    """
    def __init__(self,
                 search_strategy='grid',
                 k=2,
                 metric=accuracy_score,
                 n_iter_of_randomedsearch=10,
                 n_jobs=2,
                 location='cachedir',
                 verbose=False):

        self.search_strategy = search_strategy
        self.k = k
        self.metric = metric
        self.n_iter_of_randomedsearch = n_iter_of_randomedsearch
        self.n_jobs = n_jobs
        self.location = location
        self.verbose = verbose

        self.model_ = None
        self.weights_ = None
        self.weights_norm_ = None

    @timer
    def fit_(self, x=None, y=None):
        """Fit the pipeline_"""

        # TODO: Extending to other cross-validation methods
        # TODO: when no parameter has more than one candidate value, skip GridSearchCV/RandomizedSearchCV to speed things up

        cv = StratifiedKFold(n_splits=self.k)  # Default is StratifiedKFold
        if self.is_search:
            if self.search_strategy == 'grid':
                self.model_ = GridSearchCV(self.pipeline_,
                                           n_jobs=self.n_jobs,
                                           param_grid=self.param_search_,
                                           cv=cv,
                                           scoring=make_scorer(self.metric),
                                           refit=True)
            elif self.search_strategy == 'random':
                self.model_ = RandomizedSearchCV(
                    self.pipeline_,
                    n_jobs=self.n_jobs,
                    param_distributions=self.param_search_,
                    cv=cv,
                    scoring=make_scorer(self.metric),
                    refit=True,
                    n_iter=self.n_iter_of_randomedsearch,
                )
            else:
                print("Please specify which search strategy!\n")
                return
        else:
            self.model_ = self.pipeline_

        # start = time.time()
        self.model_.fit(x, y)
        # end = time.time()
        # print(end - start)

        # Delete the temporary cache before exiting
        # self.memory.clear(warn=False)
        return self

    def predict(self, x):
        y_hat = self.model_.predict(x)

        # TODO?
        if hasattr(self.model_, 'decision_function'):
            y_prob = self.model_.decision_function(x)
        elif hasattr(self.model_, 'predict_proba'):
            y_prob = self.model_.predict_proba(x)[:, 1]
        else:
            y_prob = y_hat

        return y_hat, y_prob

    def get_weights_(self, x=None, y=None):
        """
        If the model is linear model, the weights are coefficients.
        If the model is not the linear model, the weights are calculated by occlusion test <Transfer learning improves resting-state functional
        connectivity pattern analysis using convolutional neural networks>.
        """

        if self.is_search:
            best_model = self.model_.best_estimator_
        else:
            best_model = self.model_

        feature_preprocessing = best_model['feature_preprocessing']
        dim_reduction = best_model.get_params().get('dim_reduction', None)
        feature_selection = best_model.get_params().get(
            'feature_selection', None)
        estimator = best_model['estimator']

        # Get weight according to model type: linear model or nonlinear model
        if hasattr(estimator, "coef_"):  # Linear model
            coef = estimator.coef_
            if feature_selection and (feature_selection != "passthrough"):
                self.weights_ = feature_selection.inverse_transform(coef)
            else:
                self.weights_ = coef

            if dim_reduction and (dim_reduction != "passthrough"):
                self.weights_ = dim_reduction.inverse_transform(self.weights_)

        else:  # Nonlinear model
            # TODO: Consider the problem of slow speed caused by a large number of features
            x_reduced_selected = x.copy()
            if feature_preprocessing and (feature_preprocessing !=
                                          "passthrough"):
                x_reduced_selected = feature_preprocessing.fit_transform(
                    x_reduced_selected)
            if dim_reduction and (dim_reduction != "passthrough"):
                x_reduced_selected = dim_reduction.fit_transform(
                    x_reduced_selected)
            if feature_selection and (feature_selection != "passthrough"):
                x_reduced_selected = feature_selection.fit_transform(
                    x_reduced_selected, y)

            y_hat = self.model_.predict(x)
            score_true = self.metric(y, y_hat)
            len_feature = x_reduced_selected.shape[1]
            self.weights_ = np.zeros([1, len_feature])

            if len_feature > 1000:
                print(
                    f"***There are {len_feature} features; computing the weights may take a long time!***\n"
                )
                print(
                    "***Consider reducing the feature dimensionality first***\n"
                )

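            # Occlusion test: zero out one feature at a time and record the resulting drop in score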
            for ifeature in range(len_feature):
                print(f"Getting weight for the {ifeature+1}th feature...\n")
                x_ = x_reduced_selected.copy()
                x_[:, ifeature] = 0
                y_hat = estimator.predict(x_)
                self.weights_[0, ifeature] = score_true - self.metric(y, y_hat)

            # Back to original space
            if feature_selection and (feature_selection != "passthrough"):
                self.weights_ = feature_selection.inverse_transform(
                    self.weights_)
            if dim_reduction and (dim_reduction != "passthrough"):
                self.weights_ = dim_reduction.inverse_transform(self.weights_)

        # Normalize weights
        self.weights_norm_ = StandardScaler().fit_transform(self.weights_.T).T
Example #3
def run_wordcooc(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.pkl.gz', '')
    report_test_name = test_file.replace('.pkl.gz', '')

    os.makedirs(os.path.dirname(
        '../../../reports/wordcooc/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/wordcooc/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/wordcooc/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )
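    # Three repetitions, seeding the classifiers with the run index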
    for run in range(1, 4):
        for feature_combination in feature_combinations:

            train_original_df = pd.read_pickle(train_set, compression='gzip')
            gs_df = pd.read_pickle(test_set, compression='gzip')

            feature_file_name = train_file.replace('.pkl.gz', '_words.json')

            with open(train_path + '/feature-names/' +
                      feature_file_name) as json_data:
                words = json.load(json_data)

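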
            validation_ids_df = pd.read_pickle(valid_set, compression='gzip')
            val_df = train_original_df[train_original_df['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = train_original_df[
                ~train_original_df['pair_id'].
                isin(validation_ids_df['pair_id'].values)]
            train_only_df = train_only_df.sample(frac=1, random_state=42)

            pos_neg = train_original_df['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            # PredefinedSplit needs one fold entry per sample:
            # -1 keeps a sample in training only, 0 assigns it to the single validation fold
            train_ind = [-1] * len(train_only_df)
            val_ind = [0] * len(val_df)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight'] = [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

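                # Stack the sparse word co-occurrence vectors of the selected feature column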
                feats_train = scipy.sparse.vstack(
                    train_df[feature_combination + '_wordcooc'])
                labels_train = train_df['label']
                feats_gs = scipy.sparse.vstack(gs_df[feature_combination +
                                                     '_wordcooc'])
                labels_gs = gs_df['label']

                model.fit(feats_train, labels_train)

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(words[feature_combination],
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k in ('RandomForest', 'DecisionTree', 'XGBoost'):
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(words[feature_combination],
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                elif k == 'NaiveBayes':
                    word_importance = ''

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = BernoulliNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                feats_train = scipy.sparse.vstack(
                    train_only_df[feature_combination + '_wordcooc'])
                labels_train = train_only_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/wordcooc/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_combination
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = gs_df.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/wordcooc/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_combination + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
Example #4
def train(X,
          y,
          weight_classes=True,
          n_iter_search=500,
          score='roc_auc',
          random_state=123):
    '''
    Train a binary SGD classifier using a randomized grid search with given scoring metric.

    Parameters:
        X (list-like): list of normalized attachment texts
        y (list-like): list of validated targets (0 = red, 1 = green)
        weight_classes (bool): whether or not to use the “balanced” mode to adjust class weights.
        n_iter_search (int):  number of parameter settings that are sampled. Trades off runtime vs quality
                              of the solution.
        score (str):  the scorer used to evaluate the predictions on the test set. `roc_auc` by
                      default. Available options include:  accuracy, roc_auc, precision, fbeta, recall.
                      Note: for fbeta, beta is set to 1.5 to favor recall of the positive class.
        random_state (int): sets the random seed for reproducibility.
    Returns:
        results (dict): a dict of scoring metrics and their values
        best_score (float): mean cross-validated score of the best_estimator.
        best_estimator (sklearn estimator): estimator that was chosen by the search
        best_params (dict): parameter setting that gave the best results on the hold out data.
    '''

    if weight_classes:
        clf = SGDClassifier(class_weight='balanced')
    else:
        clf = SGDClassifier()
    scoring = {
        'accuracy': metrics.make_scorer(metrics.accuracy_score),
        'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
        'precision': metrics.make_scorer(metrics.average_precision_score),
        'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),  # beta=1.5 favors recall, as documented above
        'recall': metrics.make_scorer(metrics.recall_score)
    }
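    # Stratified 80/20 train/test split for the final evaluation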
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state)
    pipe = Pipeline([('vectorizer', TfidfVectorizer(stop_words='english')),
                     ('select', SelectKBest(chi2)), ('clf', clf)])
    param_dist = get_param_distribution()
    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       scoring=scoring,
                                       refit=score,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=random_state)
    try:
        random_search.fit(X_train, y_train)
    except Exception as e:
        logger.error(f"Exception occurred training a new model: {e}",
                     exc_info=True)
    y_pred = random_search.predict(X_test)
    #get the col number of the positive class (i.e. green)
    positive_class_col = list(random_search.classes_).index(1)
    try:
        y_score = random_search.predict_proba(X_test)[:, positive_class_col]
    except AttributeError:
        y_score = random_search.decision_function(X_test)
    average_precision = metrics.average_precision_score(y_test, y_score)
    acc = metrics.accuracy_score(y_test, y_pred)
    try:
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
    except ValueError:
        roc_auc = None
    precisions, recalls, _ = metrics.precision_recall_curve(y_test, y_score)
    try:
        auc = metrics.auc(recalls, precisions)
    except ValueError:
        auc = None
    fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
    recall = metrics.recall_score(y_test, y_pred)
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    result_values = [
        y_pred, y_score, precisions, recall, average_precision, acc, roc_auc,
        auc, fbeta, recalls, best_score, best_estimator, y_test
    ]
    result_keys = [
        'y_pred', 'y_score', 'precisions', 'recall', 'average_precision',
        'acc', 'roc_auc', 'auc', 'fbeta', 'recalls', 'best_score',
        'best_estimator', 'y_test'
    ]
    results = {k: v for k, v in zip(result_keys, result_values)}

    return results, best_score, best_estimator, best_params
Example #5
    def randomized_grid_search(
            self,
            train_df,
            clf=SGDClassifier(),
            n_iter_search=10,  #10 for testing purposes
            pickle_best=True):
        """
        Given labeled training data (`train_df`) for a binary classification task,
        performs a randomized grid search over `n_iter_search` sampled parameter settings,
        using `clf` as the classifier and `score` as the scoring metric.

        Attributes:
            train_df (pandas DataFrame):  The training data. Currently, you must
                                          specify the label and feature column names
                                          within the function.
            clf (instance of an sklearn classifier):  SGDClassifier() by default
            n_iter_search:  number of parameter settings that are sampled. Trades
                            off runtime vs quality of the solution.
            pickle_best (bool): whether or not to pickle the best estimator
                                returned by the grid search. Default is True
        """

        score = self.metric
        scoring = {
            'accuracy': metrics.make_scorer(metrics.accuracy_score),
            'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
            'avg_precision':
            metrics.make_scorer(metrics.average_precision_score),
            'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),
            'recall': metrics.make_scorer(metrics.recall_score)
        }
        clf_name = clf.__class__.__name__
        X = train_df['Normalized Comments']
        y = train_df['Spam']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=123)
        pipe = Pipeline([('vectorizer', TfidfVectorizer()),
                         ('upsample', SMOTE()), ('select', SelectPercentile()),
                         ('clf', clf)])
        param_dist = {
            "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
            "vectorizer__min_df":
            stats.randint(1, 3),
            "vectorizer__max_df":
            stats.uniform(.7, .3),
            "vectorizer__sublinear_tf": [True, False],
            "upsample": [
                None,
                SMOTE(ratio='minority', kind='svm'),
                SMOTE(ratio='minority', kind='regular'),
                SMOTE(ratio='minority', kind='borderline1'),
                SMOTE(ratio='minority', kind='borderline2')
            ],
            "select": [
                None,
                SelectPercentile(percentile=10),
                SelectPercentile(percentile=20),
                SelectPercentile(percentile=50),
                SelectPercentile(percentile=75)
            ],
            "clf__alpha":
            log_uniform(-5, 2),
            "clf__penalty": ['l2', 'l1', 'elasticnet'],
            "clf__loss":
            ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        }

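        # Randomized search over the whole pipeline, refit on the metric selected via self.metric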
        random_search = RandomizedSearchCV(pipe,
                                           param_distributions=param_dist,
                                           scoring=scoring,
                                           refit=score,
                                           n_iter=n_iter_search,
                                           cv=5,
                                           n_jobs=-1,
                                           verbose=1)
        random_search.fit(X_train, y_train)
        y_pred = random_search.predict(X_test)
        #get the col number of the positive class (i.e. spam)
        positive_class_col = list(random_search.classes_).index(1)
        try:
            y_score = random_search.predict_proba(X_test)[:,
                                                          positive_class_col]
        except AttributeError:
            y_score = random_search.decision_function(X_test)
        average_precision = metrics.average_precision_score(y_test, y_score)
        acc = metrics.accuracy_score(y_test, y_pred)
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
        precisions, recalls, _ = metrics.precision_recall_curve(
            y_test, y_score)
        auc = metrics.auc(recalls, precisions)
        fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
        recall = metrics.recall_score(y_test, y_pred)
        print("\tRecall on test data:  {0:.2f}".format(recall))
        print("\tAccuracy on test data:  {0:.2f}".format(acc))
        print("\tROC-AUC on test data:  {0:.2f}".format(roc_auc))
        print("\tFbeta on test data:  {0:.2f}".format(fbeta))
        print("\tAverage Precision on test data:  {0:.2f}".format(
            average_precision))
        print("\tPrecision-Recall AUC on test data:  {0:.2f}".format(auc))
        print("-" * 80)
        print("Classification Report:")
        class_names = ['ham', 'spam']
        print(
            metrics.classification_report(y_test,
                                          y_pred,
                                          target_names=class_names))
        best_estimator = random_search.best_estimator_
        best_score = random_search.best_score_
        result_values = [
            y_pred, y_score, precisions, recall, average_precision, acc,
            roc_auc, auc, fbeta, recalls, best_score, best_estimator, y_test
        ]
        result_keys = [
            'y_pred', 'y_score', 'precisions', 'recall', 'average_precision',
            'acc', 'roc_auc', 'auc', 'fbeta', 'recalls', 'best_score',
            'best_estimator', 'y_test'
        ]
        results = {k: v for k, v in zip(result_keys, result_values)}
        if pickle_best:
            pickle_dir = os.path.join(os.getcwd(), 'model', 'best_estimators')
            if not os.path.exists(pickle_dir):
                os.makedirs(pickle_dir)
            pickle_path = os.path.join(pickle_dir, 'model_sw.pkl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(random_search.best_estimator_, f)
        return results
Example #6
class RandomSearch(object):
    def __init__(self,estimator, param_distributions, n_iter=10, scoring=None, n_jobs=None, iid=False, 
                 refit=True,cv=None, verbose=0, pre_dispatch=None,
                 random_state=None, error_score=np.nan,
                 return_train_score=False):
        """
        estimator  :  使用的分类器, 并且传入除需要确定最佳的参数之外的参数,每个分类器都需要一个scoring参数, 或者score方法
        param_distributions : 最要被优化的参数的取值, 值为字典或列表, param_grid = param_test1,  如:param_test1 = {"n_estimators":range(10,71,10)}   
          
        每个 评估器件,scoring 中需要指定一个, 若评估器内没指定, scoring 需要指定, 当scoring为None 时,  使用评估器中默认的score 函数
        n_iter:   int 默认为 10
        scoring :  默认为None,   str   ,  列表/元组或字典。  
        n_jobs :  默认为None,int,  1, 代表单线程,   -1 为多线程  
        iid :  False. bool 型参数,    True 是, 将每个测试集的样本进行加权。
        refit :   使用找到的最佳参数重新拟合评估器 , 默认为TRUE
        cv : 默认为None, None 为使用默认的5折,   整数的时候,指定合适的折数, 或者使用cv_split
        verbose :  显示打印信息, 0 不显示, 1 显示打印进度条
        pre_dispatch :   n_jobs   并行执行期间调度的作业数   "2*n_jobs"  or int
        error_score :  拟合过程中,若出错,使用这个数值进行填充 一般使用nan
        return_train_score:  bool 型, 默认为False,  不输出 训练分数 
        # 一般使用到   estimator, param_grid,scoring, n_jobs, cv, verbose

        """
        self.randomsearch = RandomizedSearchCV(estimator=estimator,
                                               param_distributions=param_distributions,
                                               n_iter=n_iter,
                                               scoring=scoring,
                                               n_jobs=n_jobs,
                                               iid=iid,
                                               refit=refit,
                                               cv=cv, verbose=verbose,
                                               pre_dispatch=pre_dispatch,
                                               random_state=random_state,
                                               error_score=error_score,
                                               return_train_score=return_train_score)
    
    def fit(self, x, y=None):
        return self.randomsearch.fit(X=x, y=y)
    
    def transform(self, x):
        return self.randomsearch.transform(x)

    def predict(self, x):
        return self.randomsearch.predict(x)

    def predict_log_proba(self, x):
        return self.randomsearch.predict_log_proba(X=x)
    
    def predict_proba(self, x):
        return self.randomsearch.predict_proba(X=x)
    
    def inverse_transform(self, xt):
        return self.randomsearch.inverse_transform(Xt=xt)
    
    def decision_function(self, x):  # decision_function is only available when refit=True
        return self.randomsearch.decision_function(X=x)
    
    def set_params(self, params):
        self.randomsearch.set_params(**params)
    
    def get_params(self, deep=True):
        return self.randomsearch.get_params(deep=deep)
    
    def get_score(self, x, y=None):
        return self.randomsearch.score(X=x,y=y)
        
    def get_attribute(self, attribute_name):
        if attribute_name == "cv_result":
            return self.randomsearch.cv_results_
        elif attribute_name == "best_estimator":
            return self.randomsearch.best_estimator_
        elif attribute_name == "best_score":
            return self.randomsearch.best_score_
        elif attribute_name == "best_params":
            return self.randomsearch.best_params_
        elif attribute_name == "best_index":
            return self.randomsearch.best_index_
        elif attribute_name =="scorer":
            return self.randomsearch.scorer_
        elif attribute_name =="n_split":
            return self.randomsearch.n_splits_
        elif attribute_name =="refit-time":
            return self.randomsearch.refit_time_
        else:
            raise ValueError("Invalid attribute name, please pass a valid attribute name")