Example #1
def test_backend_respected():
    clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with parallel_backend("testing") as (ba, n_jobs):
        clf.fit(X, y)

    assert ba.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with parallel_backend("testing") as (ba, _):
        clf.predict_proba(X)

    assert ba.count == 0
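A minimal sketch of the custom "testing" backend this test assumes has already been registered (the class and registration below are illustrative, not sklearn's actual test fixture):

from joblib import register_parallel_backend
from joblib.parallel import ThreadingBackend

class CountingBackend(ThreadingBackend):
    """Threading backend that counts the Parallel calls routed to it."""
    count = 0  # shadowed by an instance attribute on first dispatch

    def start_call(self):
        self.count += 1  # invoked once at the start of each Parallel run
        super().start_call()

register_parallel_backend("testing", CountingBackend)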
Example #2
def main(argv):
    with parallel_backend('threading'):
        pipeline()
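Here pipeline() is whatever joblib-backed work the program runs; a hypothetical stand-in so the snippet can execute:

from joblib import Parallel, delayed, parallel_backend

def pipeline():
    # any Parallel call in here inherits the 'threading' backend
    # selected by the caller's parallel_backend context
    squares = Parallel(n_jobs=4)(delayed(pow)(i, 2) for i in range(8))
    print(squares)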
Example #3
def update_spatial(Y,
                   A,
                   b,
                   C,
                   f,
                   sn,
                   gs_sigma=6,
                   dl_wnd=5,
                   sparse_penal=0.5,
                   update_background=True,
                   post_scal=False,
                   normalize=True,
                   zero_thres='eps',
                   sched='single-threaded'):
    _T = len(Y.coords['frame'])
    print("estimating penalty parameter")
    cct = C.dot(C, 'frame')
    alpha = sparse_penal * sn * np.sqrt(np.max(np.diag(cct))) / _T
    alpha = alpha.persist()
    print("computing subsetting matrix")
    if dl_wnd:
        selem = moph.disk(dl_wnd)
        sub = xr.apply_ufunc(cv2.dilate,
                             A.chunk(dict(height=-1, width=-1)),
                             input_core_dims=[['height', 'width']],
                             output_core_dims=[['height', 'width']],
                             vectorize=True,
                             kwargs=dict(kernel=selem),
                             dask='parallelized',
                             output_dtypes=[A.dtype])
        sub = (sub > 0)
    else:
        sub = xr.apply_ufunc(np.ones_like, A.compute())
    if update_background:
        A = xr.concat([A, b.assign_coords(unit_id=-1)], 'unit_id')
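        # NOTE: `selem` comes from the `if dl_wnd:` branch above, so a
        # non-zero dl_wnd is assumed whenever update_background is True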
        b_erd = xr.apply_ufunc(cv2.erode,
                               b.chunk(dict(height=-1, width=-1)),
                               input_core_dims=[['height', 'width']],
                               output_core_dims=[['height', 'width']],
                               kwargs=dict(kernel=selem),
                               dask='parallelized',
                               output_dtypes=[b.dtype])
        sub = xr.concat(
            [sub,
             (b_erd > 0).astype(bool).assign_coords(unit_id=-1)], 'unit_id')
        C = xr.concat([C, f.assign_coords(unit_id=-1)], 'unit_id')
    sub = sub.persist()
    print("fitting spatial matrix")
    A_new = xr.apply_ufunc(update_spatial_perpx,
                           Y.chunk(dict(frame=-1)),
                           alpha,
                           sub.chunk(dict(unit_id=-1)),
                           C.chunk(dict(frame=-1, unit_id=-1)),
                           input_core_dims=[['frame'], [], ['unit_id'],
                                            ['frame', 'unit_id']],
                           output_core_dims=[['unit_id']],
                           vectorize=True,
                           dask='parallelized',
                           output_dtypes=[Y.dtype])
    try:
        with parallel_backend('dask'):
            A_new = A_new.persist()
    except ValueError:
        with da.config.set(scheduler=sched):
            A_new = A_new.persist()
    print("removing empty units")
    if zero_thres == 'eps':
        zero_thres = np.finfo(A_new.dtype).eps
    A_new = A_new.where(A_new > zero_thres).fillna(0)
    non_empty = (A_new.sum(['width', 'height']) > 0).compute()
    A_new = A_new.where(non_empty, drop=True)
    C_new = C.where(non_empty, drop=True)
    A_new = rechunk_like(A_new, A).persist()
    C_new = rechunk_like(C_new, C).persist()
    if post_scal and len(A_new) > 0:
        print("post-hoc scaling")
        A_new_flt = (A_new.stack(spatial=['height', 'width']).compute())
        Y_flt = (Y.mean('frame').stack(spatial=['height', 'width']).compute())

        def lstsq(a, b):
            return np.linalg.lstsq(a, b, rcond=-1)[0]

        try:
            # lstsq runs eagerly on the computed inputs, so any
            # LinAlgError surfaces here and must be inside the try
            scale = xr.apply_ufunc(lstsq,
                                   A_new_flt,
                                   Y_flt,
                                   input_core_dims=[['spatial', 'unit_id'],
                                                    ['spatial']],
                                   output_core_dims=[['unit_id']])
            C_mean = C.mean('frame').compute()
            scale = scale / C_mean
            A_new = (A_new * scale).persist()
        except np.linalg.LinAlgError:
            warnings.warn("post-hoc scaling failed", RuntimeWarning)
    if update_background:
        print("updating background")
        try:
            b_new = A_new.sel(unit_id=-1)
            b_new = b_new / darr.linalg.norm(b_new.data)
            f_new = xr.apply_ufunc(
                darr.tensordot,
                Y,
                b_new,
                input_core_dims=[['frame', 'height', 'width'],
                                 ['height', 'width']],
                output_core_dims=[['frame']],
                kwargs=dict(axes=[(1, 2), (0, 1)]),
                dask='allowed').persist()
            A_new = A_new.drop(-1, 'unit_id')
            C_new = C_new.drop(-1, 'unit_id')
        except KeyError:
            print("background terms are empty")
            b_new = xr.zeros_like(b)
            f_new = xr.zeros_like(f)
    else:
        b_new = b
        f_new = f
    if normalize and len(A_new) > 0:
        print("normalizing result")
        A_norm = xr.apply_ufunc(darr.linalg.norm,
                                A_new.stack(spatial=['height', 'width']),
                                input_core_dims=[['spatial', 'unit_id']],
                                output_core_dims=[['unit_id']],
                                kwargs=dict(axis=0),
                                dask='allowed')
        A_new = (A_new / A_norm).persist()
    return A_new, b_new, C_new, f_new
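The try/except around the persist step exists because joblib's 'dask' backend raises ValueError when no distributed Client is connected, in which case the code falls back to a local scheduler. The same pattern in isolation (a random dask array stands in for the real data):

import dask
import dask.array as darr
from joblib import parallel_backend

x = darr.random.random((1000, 1000), chunks=(250, 250))
try:
    # requires a dask.distributed Client to already be connected
    with parallel_backend('dask'):
        result = x.sum().persist()
except ValueError:
    # no client available: fall back to a local scheduler
    with dask.config.set(scheduler='single-threaded'):
        result = x.sum().persist()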
Example #4
# Assumed imports for this standalone snippet:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from joblib import parallel_backend, dump, load

# Standardize numeric features
scaler = StandardScaler()
scaler.fit(trainX)
trainX = scaler.transform(trainX)
testX = scaler.transform(testX)

# Add categorical features
trainX = np.append(trainX, train_missing_price, 1)
trainX = np.append(trainX, train_category, 1)
testX = np.append(testX, test_missing_price, 1)
testX = np.append(testX, test_category, 1)

print(trainX.shape)

# Fit the lasso model (note: LassoCV's `normalize` parameter was removed in
# scikit-learn 1.2, so this snippet assumes an older release)
with parallel_backend('threading'):
    m = LassoCV(normalize=False, cv=5, verbose=True).fit(trainX, trainY)

# save model
file_name = 'linear_model.joblib'
dump(m, file_name)


def load_model():
    return load(file_name)


def plot():
    # show results for fit data
    plt.figure()
    ax = plt.subplot(111)
Example #5
 def runCrossValidate(self, verbose=False):
     self.logger.log("Cross-validate started...",
                     self.step_n,
                     message="Running cross validation")
     n_jobs = self.cv_n_jobs
     cv_results = {}
     new_cv_results = {}
     cv = self.getCV()
     # n_jobs = -1
     if verbose:
         logger.info(
             f"RunCrossValidate - n_jobs: {n_jobs}, scorer_list: {self.scorer_list}"
         )
     for pipe_name, model in self.model_dict.items():
         if verbose:
             logger.info(
                 f"RunCrossValidate - Running CV on pipe_name: {pipe_name}")
         start = time.time()
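         # NOTE: this connects a new Client to the scheduler on every
         # iteration; creating one Client outside the loop would avoid
         # the repeated connection overhead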
         dask_scheduler = os.getenv(
             "DASK_SCHEDULER", "tcp://" +
             socket.gethostbyname(socket.gethostname()) + ":8786")
         client = Client(dask_scheduler)
         with parallel_backend('dask', n_jobs=-1):  # 40min test case
             model_i = cross_validate(model,
                                      self.X_df,
                                      self.y_df.iloc[:, 0],
                                      return_estimator=True,
                                      scoring=self.scorer_list,
                                      cv=cv,
                                      n_jobs=1,
                                      verbose=3)
         end = time.time()
         if verbose:
             logger.info(
                 f"SCORES - {pipe_name},{[(scorer,np.mean(model_i[f'test_{scorer}'])) for scorer in self.scorer_list]}, runtime: {(end-start)/60} min."
             )
             logger.info(f"MODELS - {pipe_name},{model_i}")
         cv_results[pipe_name] = model_i
     if self.run_stacked:
         for est_name, result in cv_results.items():
             if type(result['estimator'][0]) is MultiPipe:
                 new_results = {}
                 for mp in result['estimator']:
                     for est_n, m in mp.build_individual_fitted_pipelines(
                     ).items():
                         new_results.setdefault(est_n, []).append(m)
                 for est_n, fitted in new_results.items():
                     # rename on collision with an existing pipeline, but
                     # keep reading the fitted estimators under the
                     # original key
                     out_name = est_n + '_fcombo' if est_n in cv_results else est_n
                     new_cv_results[out_name] = {'estimator': fitted}
         cv_results = {**new_cv_results, **cv_results}
         if verbose:
             logger.info("CV Results: {}".format(cv_results))
     self.cv_results = cv_results
     self.logger.log("Cross-validate complete.",
                     self.step_n,
                     message="Completed cross validation")
Example #6
def get_ensemble_models(X_train, y_train, X_test, y_test, day_, sector,
                        best_params, model_name, path_):
    """
    Search for the best ensemble model.

    :param X_train: Set of features for training.
    :param y_train: Set of targets for training.
    :param X_test: Set of features for testing.
    :param y_test: Set of targets for testing.
    :param int day_: Time window for the features.
    :param str sector: GICS sector used to filter the predictions.
    :param list best_params: List of best params for the basic models.
    :param str or None model_name: Name of the best model; None means the model still has to be trained.
    :param str path_: Path where the models are saved.
    :return tuple of dict: Tuple of dicts with the different models.
    """
    t_init = time.time()
    scoring = 'precision_macro'
    dict_vot = {}
    dict_bagg = {}
    dict_ada = {}
    dict_total = {}
    check_ = (sector, day_)
    if (check_ != ('Financials', 3)) and (check_ != ('Financials', 5)):
        if model_name is None:
            with parallel_backend('threading', n_jobs=2):
                vot = VotingClassifier(estimators=[
                    ('DecTree', DecisionTreeClassifier(**best_params[2])),
                    ('KNN', KNeighborsClassifier(**best_params[1])),
                    ('LogReg', LogisticRegression(**best_params[0])),
                ])
                val_score_vot = cross_val_score(vot,
                                                X_train,
                                                y_train,
                                                cv=3,
                                                scoring=scoring).mean()
                vot.fit(X_train, y_train)
            report = classification_report(y_test,
                                           vot.predict(X_test),
                                           digits=4,
                                           output_dict=True)
            dict_vot['vot'] = [vot, val_score_vot, report]
            prec_vot = float(dict_vot['vot'][2]['weighted avg']['precision'])
        elif model_name == 'vot':
            vot = get_best_ensem_model(path_)
            vot.fit(X_train, y_train)
            report = classification_report(y_test,
                                           vot.predict(X_test),
                                           digits=4,
                                           output_dict=True)
            dict_vot['vot'] = [vot, report]
            prec_vot = float(dict_vot['vot'][1]['weighted avg']['precision'])
        else:
            prec_vot = 0
    else:
        prec_vot = 0
    show_time(
        t_init, time.time(),
        'Time to train vot for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if model_name is None:
        with parallel_backend('threading', n_jobs=2):
            bagg = BaggingClassifier(base_estimator=KNeighborsClassifier(
                **best_params[1]))
            val_score_bagg = cross_val_score(bagg,
                                             X_train,
                                             y_train,
                                             cv=3,
                                             scoring=scoring).mean()
            bagg.fit(X_train, y_train)
        report = classification_report(y_test,
                                       bagg.predict(X_test),
                                       digits=4,
                                       output_dict=True)
        dict_bagg['bagg'] = [bagg, val_score_bagg, report]
        prec_bagg = float(dict_bagg['bagg'][2]['weighted avg']['precision'])
    elif model_name == 'bagg':
        bagg = get_best_ensem_model(path_)
        bagg.fit(X_train, y_train)
        report = classification_report(y_test,
                                       bagg.predict(X_test),
                                       digits=4,
                                       output_dict=True)
        dict_bagg['bagg'] = [bagg, report]
        prec_bagg = float(dict_bagg['bagg'][1]['weighted avg']['precision'])
    else:
        prec_bagg = 0
    show_time(
        t_init, time.time(),
        'Time to train bagg for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if (check_ != ('Financials', 3)) and (check_ != ('Financials', 5)):
        if model_name is None:
            with parallel_backend('threading', n_jobs=2):
                ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                    **best_params[2]))
                val_score_ada = cross_val_score(ada,
                                                X_train,
                                                y_train,
                                                cv=3,
                                                scoring=scoring).mean()
                ada.fit(X_train, y_train)
            report = classification_report(y_test,
                                           ada.predict(X_test),
                                           digits=4,
                                           output_dict=True)
            dict_ada['ada'] = [ada, val_score_ada, report]
            prec_ada = float(dict_ada['ada'][2]['weighted avg']['precision'])
        elif model_name == 'ada':
            ada = get_best_ensem_model(path_)
            ada.fit(X_train, y_train)
            report = classification_report(y_test,
                                           ada.predict(X_test),
                                           digits=4,
                                           output_dict=True)
            dict_ada['ada'] = [ada, report]
            prec_ada = float(dict_ada['ada'][1]['weighted avg']['precision'])
        else:
            prec_ada = 0
    else:
        prec_ada = 0
    show_time(
        t_init, time.time(),
        'Time to train ada for %d days, %s sector and %s' %
        (day_, sector, scoring))

    if prec_vot >= prec_bagg:
        if prec_vot >= prec_ada:
            dict_total['best'] = dict_vot
        else:
            dict_total['best'] = dict_ada
    else:
        if prec_bagg >= prec_ada:
            dict_total['best'] = dict_bagg
        else:
            dict_total['best'] = dict_ada

    return dict_total, dict_vot, dict_bagg, dict_ada
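The nested comparison above selects the candidate with the highest weighted precision, preferring vot over bagg over ada on ties. A sketch of the same selection with max(), which keeps the first maximum and therefore preserves the tie-breaking order:

candidates = {'vot': (prec_vot, dict_vot),
              'bagg': (prec_bagg, dict_bagg),
              'ada': (prec_ada, dict_ada)}
best_prec, best_dict = max(candidates.values(), key=lambda pair: pair[0])
dict_total['best'] = best_dict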
Example #7
def train_classifier(d2v, training_vectors, training_labels):
    logging.info("Classifier training")
    train_vectors = get_vectors(d2v, training_vectors, 300, 'Train')
    # Find the optimal Random Forest Classifier Hyperparameters
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_features = ['auto', 'sqrt']
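    # NOTE: 'auto' was deprecated and later removed for random forests
    # (scikit-learn 1.3); on newer releases use 'sqrt' or 'log2' instead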
    max_depth = [int(x) for x in np.linspace(100, 500, num=11)]
    max_depth.append(None)
    with parallel_backend('threading'):
        rfc = RandomForestClassifier(n_jobs=1)
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth
        }
        rfc_random = RandomizedSearchCV(estimator=rfc,
                                        param_distributions=random_grid,
                                        n_iter=10,
                                        cv=3,
                                        verbose=2,
                                        random_state=42,
                                        n_jobs=WORKERS)
        rfc_random.fit(train_vectors, np.array(training_labels))
        best_parameters = rfc_random.best_params_
        model = RandomForestClassifier(
            n_estimators=best_parameters['n_estimators'],
            max_features=best_parameters['max_features'],
            max_depth=best_parameters['max_depth'],
            n_jobs=WORKERS)
        #model = RandomForestClassifier(n_jobs=WORKERS)
        #print("train_vectors shape",train_vectors.shape)
        #print('train_label shape',training_labels.shape)
        model.fit(train_vectors, np.array(training_labels))
    model_file = os.path.join(path_model, CLASSIFICATION_MODEL_NAME)
    pickle.dump(model, open(model_file, 'wb'))
    logging.info("Classification model saved on :{}".format(model_file))
    #model = pickle.load(open(model_file,'rb'))
    training_predictions = model.predict(train_vectors)

    validate_df = pd.DataFrame(training_predictions,
                               columns=[
                                   'target', 'male', 'female',
                                   'homosexual_gay_or_lesbian', 'christian',
                                   'jewish', 'muslim', 'black', 'white',
                                   'psychiatric_or_mental_illness'
                               ])
    validate_df[GENSIM_MODEL_NAME] = pd.DataFrame(training_predictions[:, 0])
    validate_df.head()
    bias_metrics_df = compute_bias_metrics_for_model(validate_df,
                                                     identity_columns,
                                                     GENSIM_MODEL_NAME,
                                                     TOXICITY_COLUMN)
    performance = get_final_metric(
        bias_metrics_df, calculate_overall_auc(validate_df, GENSIM_MODEL_NAME))

    logging.info('Training predicted classes: {}'.format(
        np.unique(training_predictions)))
    logging.info('Training accuracy: {}'.format(
        accuracy_score(training_labels, training_predictions)))
    logging.info('Training F1 score: {}'.format(
        f1_score(training_labels, training_predictions, average='weighted')))
    logging.info('Training Bias Metric: {}'.format(performance))
    logging.info("Saving classification model")

    return model
Example #8
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 19 19:39:52 2019

@author: amitabh.gunjan
"""

from operator import neg
from joblib import Parallel, delayed, parallel_backend

with parallel_backend('threading'):
    print(Parallel()(delayed(neg)(i + 1) for i in range(5)))
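# Expected output: [-1, -2, -3, -4, -5]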
Example #9
File: predictors.py  Project: dssg/triage
    def predict(self, model_id, matrix_store, misc_db_parameters, train_matrix_columns):
        """Generate predictions and store them in the database

        Args:
            model_id (int) the id of the trained model to predict based off of
            matrix_store (catwalk.storage.MatrixStore) a wrapper for the
                prediction matrix and metadata
            misc_db_parameters (dict): attributes and values to add to each
                TrainPrediction or TestPrediction object in the results schema
            train_matrix_columns (list): The order of columns that the model
                was trained on

        Returns:
            (np.Array) the generated prediction values
        """
        # Setting the Prediction object type - TrainPrediction or TestPrediction
        matrix_type = matrix_store.matrix_type

        if not self.replace:
            logger.info(
                f"Replace flag not set, looking for old predictions for model id {model_id} "
                f"on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid}"
            )
            try:
                session = self.sessionmaker()
                existing_predictions = self._existing_predictions(
                    matrix_type.prediction_obj, session, model_id, matrix_store
                )
                logger.spam(f"Existing predictions length: {existing_predictions.count()}, Length of matrix: {len(matrix_store.index)}")
                if existing_predictions.count() == len(matrix_store.index):
                    logger.info(
                        f"Found old predictions for model id {model_id}, matrix {matrix_store.uuid}, returning saved versions"
                    )
                    return self._load_saved_predictions(existing_predictions, matrix_store)
            finally:
                session.close()

        model = self.load_model(model_id)
        if not model:
            raise ValueError(f"Model id {model_id} not found")
        logger.spam(f"Loaded model {model_id}")

        # Labels are popped from matrix (i.e. they are removed and returned)
        labels = matrix_store.labels

        # using a threading backend because the default loky backend doesn't
        # allow for nested parallelization (e.g., multiprocessing at triage level)
        with parallel_backend('threading'):
            predictions = model.predict_proba(
                matrix_store.matrix_with_sorted_columns(train_matrix_columns)
            )[:, 1]  # Returning only the scores for the label == 1


        logger.debug(
            f"Generated predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid}"
        )
        if self.save_predictions:
            df = pd.DataFrame(data=None, columns=None, index=matrix_store.index)
            df['label_value'] = matrix_store.labels
            df['score'] = predictions


            logger.spam(f"Sorting predictions for model {model_id} using {self.rank_order}")

            if self.rank_order == 'best':
                df.sort_values(by=["score", "label_value"], inplace=True, ascending=[False,False], na_position='last')
            elif self.rank_order == 'worst':
                df.sort_values(by=["score", "label_value"], inplace=True, ascending=[False,True], na_position='first')
            elif self.rank_order == 'random':
                df['random'] = np.random.rand(len(df))
                df.sort_values(by=['score', 'random'], inplace=True, ascending=[False, False])
                df = df.drop('random', axis=1)
            else:
                raise ValueError(f"Rank order specified in configuration file not recognized: {self.rank_order}")

            df['rank_abs_no_ties'] = df['score'].rank(ascending=False, method='first')
            # uses the lowest rank in the group
            df['rank_abs_with_ties'] = df['score'].rank(ascending=False, method='min')
            # No gaps between groups (so the last value reaches 1.0). We use
            # rank_abs_no_ties so the order established above is respected
            # (instead of the mathematical formula used before)
            df['rank_pct_no_ties'] = df['rank_abs_no_ties'].rank(ascending=True, method='dense', pct=True)
            df['rank_pct_with_ties'] = df['score'].rank(ascending=False, method='dense', pct=True)
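            # e.g. for scores [0.9, 0.7, 0.7, 0.1]:
            #   method='first' -> 1, 2, 3, 4  (ties broken by position)
            #   method='min'   -> 1, 2, 2, 4  (ties share the lowest rank)
            #   method='dense' -> 1, 2, 2, 3  (no gaps between groups)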

            df.reset_index(inplace=True)
            logger.debug(f"Predictions on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} from model {model_id} sorted using {self.rank_order}")

            logger.spam(
                f"Writing predictions for model {model_id} on {matrix_store.matrix_type.string_name}  matrix {matrix_store.uuid} to database"
            )

            self._write_predictions_to_db(
                model_id,
                matrix_store,
                df,
                misc_db_parameters,
                matrix_type.prediction_obj,
            )
            logger.debug(
                f"Wrote predictions for model {model_id} on  {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} to database"
            )
        else:
            logger.notice(
                f"Predictions for model {model_id} on {matrix_store.matrix_type.string_name} matrix {matrix_store.uuid} weren't written to the db because you asked not to do so"
            )
            logger.spam(f"Status of the save_predictions flag: {self.save_predictions}")

        self._write_metadata_to_db(
            model_id=model_id,
            matrix_uuid=matrix_store.uuid,
            matrix_type=matrix_type,
            random_seed=None,
        )

        return predictions
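A sketch of the nesting problem the threading comment above refers to: when predict() itself runs inside a worker process, a nested joblib call under the default process-based backend cannot spawn further workers, while threads can (the names below are illustrative, not triage's code):

from joblib import Parallel, delayed, parallel_backend

def score_chunk(x):
    # nested Parallel call, standing in for the one inside predict_proba
    return Parallel(n_jobs=2)(delayed(pow)(x, p) for p in range(3))

with parallel_backend('threading'):
    print(Parallel(n_jobs=2)(delayed(score_chunk)(x) for x in range(4)))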