Example #1
def baseline_custom_NN(train, train_class, test):

    parameters = {
        "batch_size": [10, 100],
        "node_per_layer": [1000, 500],
        "layer_count": [1, 2],
        "learning_rate": [1000, 500],
        "epoch": [15]
    }

    grid = GridSearchCV(CustomNNClassifier(),
                        parameters,
                        refit=True,
                        cv=3,
                        verbose=5,
                        return_train_score=True)

    with parallel_backend('threading'):
        grid.fit(train, train_class)

    best_parameters = grid.best_params_

    filename = "Results\\baseline_custom_NN_performance"
    write_baselines(filename, "Custom Neural Network", grid)

    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)

    pred = grid.predict(test)
    return pred
Example #2
    def run(self,
            num_features=0,
            run_mode='regular',
            stratified_cv=True,
            n_jobs=1,
            print_freq=5,
            features_to_keep_indices=None):

        # define a dictionary to initialize the SpFtSel kernel
        sp_params = dict()

        sp_params['num_features'] = num_features
        sp_params['run_mode'] = run_mode
        sp_params['stratified_cv'] = stratified_cv
        sp_params['n_jobs'] = n_jobs
        sp_params['print_freq'] = print_freq
        sp_params['features_to_keep_indices'] = features_to_keep_indices

        # *** for advanced users ***
        # two gain types are available: bb (barzilai & borwein) or mon (monotone)
        sp_params['gain_type'] = 'bb'

        if run_mode == 'extended':
            sp_params['cv_folds'] = 5
            sp_params['iter_max'] = 200
            sp_params['stall_limit'] = 50
            sp_params['num_grad_avg'] = 10
            sp_params['cv_reps_grad'] = 1
            sp_params['cv_reps_eval'] = 5
            sp_params['num_gain_smoothing'] = 1
        elif run_mode == 'regular':
            sp_params['cv_folds'] = 5
            sp_params['iter_max'] = 100
            sp_params['stall_limit'] = 25
            sp_params['num_grad_avg'] = 2
            sp_params['cv_reps_grad'] = 1
            sp_params['cv_reps_eval'] = 2
            sp_params['num_gain_smoothing'] = 2
        else:
            raise ValueError('Error: Unknown run mode')

        kernel = SpFtSelKernel(sp_params)

        kernel.set_inputs(x=self._x,
                          y=self._y,
                          wrapper=self._wrapper,
                          scoring=self._scoring)

        kernel.shuffle_data()

        kernel.init_parameters()

        kernel.gen_cv_task()

        with parallel_backend('multiprocessing'):
            kernel.run_kernel()

        self.results = kernel.parse_results()

        return self
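
A hedged usage sketch for the run() method above. The outer class name (SpFtSel) and its constructor are assumptions inferred from the SpFtSelKernel name and the self._x / self._y / self._wrapper / self._scoring attributes, so treat this as an illustration rather than the confirmed API:

# hypothetical driver code for run() (constructor signature assumed)
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

x, y = load_breast_cancer(return_X_y=True)
selector = SpFtSel(x, y,
                   wrapper=DecisionTreeClassifier(),
                   scoring='accuracy')
selector.run(num_features=10, run_mode='regular', n_jobs=1)
print(selector.results)  # populated by kernel.parse_results()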
Example #3
    def clustering(self, examples):
        """
    After the execution of the clustering algorithm, each micro-cluster is represented
    by four components (N, LS, SS and T).
    """
        logging.info('clustering {}, {}'.format(len(examples), examples[0]))
        assert len(examples) > 0

        n_samples = len(examples)
        n_clusters = min(self.k, int(n_samples / (3 * self.representationThr)))
        assert n_samples >= n_clusters
        df = pandas.DataFrame(data=[ex.item for ex in examples])
        kmeans = KMeans(n_clusters=n_clusters)
        if self.daskEnableKmeans:
            logging.info('clustering with dask kmeans')
            with joblib.parallel_backend('dask'):
                kmeans.fit(df)
        else:
            kmeans.fit(df)

        clusters = []
        for centroid in kmeans.cluster_centers_:
            c = Cluster()
            c.center = centroid
            clusters.append(c)
        # assign each example to its nearest cluster
        for ex in examples:
            nearCl, _ = self.closestCluster(ex.item, clusters)
            nearCl.addExample(ex)
        return clusters
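
The docstring above names the four classic micro-cluster components (N, LS, SS and T). A minimal sketch of such a summary record, for illustration only; the Cluster class actually used by this code is not shown here:

import numpy as np

class MicroCluster:
    """Illustrative micro-cluster summary: N (count), LS (linear sum),
    SS (element-wise squared sum) and T (last update time)."""

    def __init__(self, dim):
        self.N = 0
        self.LS = np.zeros(dim)
        self.SS = np.zeros(dim)
        self.T = 0.0

    def add(self, x, t):
        # absorb one example and refresh the timestamp
        self.N += 1
        self.LS += x
        self.SS += x * x
        self.T = t

    def center(self):
        return self.LS / self.N

    def radius(self):
        # per-dimension variance follows directly from (N, LS, SS)
        var = self.SS / self.N - (self.LS / self.N) ** 2
        return float(np.sqrt(np.maximum(var, 0.0).sum()))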
Example #4
def baseline_custom_neural_network():

    parameters = {
        "batch_size": [10],
        "node_per_layer": [4],
        "layer_count": [2, 1],
        "learning_rate": [0.4, 0.2]
    }

    inputs, outputs = build_XOR_dataset()
    test_input = np.array([[0, 0], [1, 1], [1, 0], [0, 1]])

    grid = GridSearchCV(CustomNNClassifier(),
                        parameters,
                        refit=True,
                        cv=3,
                        verbose=5,
                        return_train_score=True)

    with parallel_backend('threading'):
        grid.fit(inputs, outputs)

    best_parameters = grid.best_params_

    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)

    pred = grid.predict(test_input)
    print(pred)
Example #5
def baseline_logistic_regression(train, train_class, test, original=False):
    """
    Baseline classifier using logistic regression with grid search.
    """

    parameters = {'penalty': ['l2'], 'C': np.logspace(-3, 0, 20)}

    grid = GridSearchCV(LogisticRegression(),
                        parameters,
                        refit=True,
                        cv=3,
                        verbose=5,
                        return_train_score=True)

    with parallel_backend('threading'):
        grid.fit(train, train_class)

    best_parameters = grid.best_params_

    pred = grid.predict(test)

    filename = "Results\\baseline_logistic_regression_performance"
    if (original):
        filename += "_original"

    write_baselines(filename, "Logistic Regression", grid)

    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)

    return pred
Example #6
    def kfold_cv(self):
        """
        K-fold crossvalidator.
        Returns: fitted values and test values to be used for model optimization.

        """
        with joblib.parallel_backend("dask"):
            self.xgb_est = XGBClassifier(
                max_depth=5,
                subsample=0.7,
                scale_pos_weight=2,
                num_class=1,
                learning_rate=0.05,
            )
            cv = KFold(n_splits=8, random_state=24, shuffle=True)
            for train_index, test_index in cv.split(self.X):
                X_train, X_test, y_train, y_test = (
                    self.X[train_index],
                    self.X[test_index],
                    self.y[train_index],
                    self.y[test_index],
                )
                self.xgb_est.fit(X_train, y_train)
                y_pred = self.xgb_est.predict(X_test)
                self.predictions.append(y_pred)
                self.ypred_iterations.append(y_pred)
                self.ytest_iterations.append(y_test)
                self.predicted_probability_iterations.append(
                    self.xgb_est.predict_proba(X_test))
Example #7
def add_classification(
    dataframe_path, classifier_path: RandomForestClassifier, emotion: str
):
    client = Client(processes=False)
    print(client)

    with parallel_backend("dask"):
        PATIENT_DIRS = [
            x
            for x in glob.glob(os.path.join(dataframe_path, "*cropped"))
            if "hdfs" in os.listdir(x)
        ]

        for patient_dir in tqdm(PATIENT_DIRS):
            try:
                curr_df = dd.read_hdf(
                    os.path.join(patient_dir, "hdfs", "au.hdf"), "/data"
                )
                # curr_df = curr_df[curr_df[" success"] == 1]
                curr_df = curr_df.compute()

                if (
                    len(curr_df)
                    and "annotated" in curr_df.columns
                    and "frame" in curr_df.columns
                ):
                    kwargs = {
                        "{0}_predicted".format(emotion): lambda x: predict(
                            x, classifier_path.predict
                        ),
                        "{0}_predicted_proba".format(emotion): lambda x: [
                            n[1] for n in predict(x, classifier_path.predict_proba)
                        ],
                    }
                    imp_columns = [
                        'patient', 'success', 'frame', 'timestamp',
                        'annotated', 'confidence', 'session', 'vid', 'datetime'
                    ]

                    #curr_df = curr_df.assign(**kwargs)
                    emotion_df = curr_df[imp_columns]
                    emotion_df = emotion_df.assign(**kwargs)
                    # create a name for the new dataframe and store it under
                    # the patient's hdfs directory (filename is an assumed
                    # convention)
                    out_fullpath = os.path.join(
                        patient_dir, "hdfs",
                        "{0}_predicted.hdf".format(emotion))
                    # emotion_df is a pandas frame after .compute(), so no
                    # dask scheduler argument is needed
                    emotion_df.to_hdf(out_fullpath, "/data", format="table")
                else:
                    print(patient_dir + " HAS A PROBLEM")

            except (AttributeError, ValueError, KeyError) as e:
                print(e)
Example #8
def basic(scheduler_address, backends):
    ESTIMATORS = {
        'RandomForest': RandomForestClassifier(n_estimators=100),
        'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100)
    }

    X_train, X_test, y_train, y_test = load_data()
    print_data(X_train, y_train, X_test, y_test)

    BACKENDS = build_backends(backends, scheduler_address, X_train, y_train)

    print("Training Classifiers")
    print("====================")
    error, train_time, test_time = {}, {}, {}
    for est_name, estimator in sorted(ESTIMATORS.items()):
        for backend, backend_kwargs in BACKENDS:
            print("Training %s with %s backend... " % (est_name, backend),
                  end="")
            estimator_params = estimator.get_params()

            estimator.set_params(
                **{
                    p: RANDOM_STATE
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=-1)

            # Key for the results
            name = '%s, %s' % (est_name, backend)

            with parallel_backend(backend, **backend_kwargs):
                time_start = time()
                estimator.fit(X_train, y_train)
                train_time[name] = time() - time_start

            time_start = time()
            y_pred = estimator.predict(X_test)
            test_time[name] = time() - time_start

            error[name] = zero_one_loss(y_test, y_pred)

            print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print("%s %s %s %s" %
          ("Classifier  ", "train-time", "test-time", "error-rate"))
    print("-" * 44)
    for name in sorted(error, key=error.get):
        print("%s %s %s %s" % (name, ("%.4fs" % train_time[name]),
                               ("%.4fs" % test_time[name]),
                               ("%.4f" % error[name])))

    print()
Example #9
    def clustering(self,
                   examples: typing.List[Vector],
                   label: str = None) -> ClusterList:
        n_clusters = min(
            self.CONSTS.k,
            int(len(examples) / (3 * self.CONSTS.representationThr)))
        kmeans = KMeans(n_clusters=n_clusters)
        with joblib.parallel_backend('dask'):
            kmeans.fit(examples)
        return [
            Cluster(center=centroid, label=label)
            for centroid in kmeans.cluster_centers_
        ]
Example #10
    def run(self, num_features=0, run_mode='regular'):

        # define a dictionary to initialize the SPFSR engine
        sp_params = dict()

        sp_params['num_features'] = num_features

        # how many cores to use for parallel processing during cross validation
        # this value is directly passed in to cross_val_score()
        sp_params['n_jobs'] = 1

        # two gain types are available: bb (barzilai & borwein) or mon (monotone)
        sp_params['gain_type'] = 'bb'

        if run_mode == 'extended':
            sp_params['iter_max'] = 200
            sp_params['stall_limit'] = 50
            sp_params['num_grad_avg'] = 10
            sp_params['cv_reps_grad'] = 1
            sp_params['cv_reps_eval'] = 5
            sp_params['num_gain_smoothing'] = 1
        elif run_mode == 'regular':
            sp_params['iter_max'] = 100
            sp_params['stall_limit'] = 25
            sp_params['num_grad_avg'] = 2
            sp_params['cv_reps_grad'] = 1
            sp_params['cv_reps_eval'] = 2
            sp_params['num_gain_smoothing'] = 2
        else:
            raise ValueError('Error: Unknown SPFSR run mode.')

        # set other algorithm parameters
        sp_params['print_freq'] = 5  # how often to print iteration results
        sp_params['cv_folds'] = 5
        sp_params['scoring_metric'] = scorer.accuracy_scorer
        sp_params['stratified_cv'] = True
        sp_params['maximize_score'] = True
        # two performance eval methods are available: cv or resub
        sp_params['perf_eval_method'] = 'cv'
        kernel = SpfsrKernel(sp_params)
        kernel.set_inputs(x=self.x, y=self.y, wrapper=self.wrapper)
        kernel.shuffle_data()
        kernel.init_parameters()
        kernel.gen_cv_task()
        with parallel_backend('multiprocessing'):
            kernel.run_spfsr()
        self.results = kernel.parse_results()

        return self
Example #11
def bestKNN(X, y, Xt, yt):

    clf = GridSearchCV(KNeighborsClassifier(), {
        'n_neighbors': [5, 8, 13],
        'metric': ['euclidean', 'hamming', 'dice', 'jaccard']
    },
                       scoring='accuracy',
                       cv=5)
    with parallel_backend('threading', n_jobs=24):
        clf.fit(X, y)
    results = clf.cv_results_
    print(results)

    acc = clf.score(Xt, yt)
    print(acc)
    return results
Example #12
def nested(scheduler_address, backends, classifier_n_jobs=-1):
    X_train, X_test, y_train, y_test = load_data()
    print_data(X_train, y_train, X_test, y_test)

    BACKENDS = build_backends(backends, scheduler_address, X_train, y_train)
    n_jobs_grid = [-1, 1]

    error, train_time = {}, {}
    for backend, backend_kwargs in BACKENDS:
        for n_jobs_outer in n_jobs_grid:
            for n_jobs_inner in n_jobs_grid:
                clf = RandomForestClassifier(random_state=RANDOM_STATE,
                                             n_estimators=10,
                                             n_jobs=classifier_n_jobs)
                param_grid = {
                    'max_features': [4, 8, 12],
                    'min_samples_split': [2, 5],
                }
                gs = GridSearchCV(clf,
                                  param_grid,
                                  cv=5,
                                  n_jobs=n_jobs_inner,
                                  verbose=2)
                name = '%s,%s,%s' % (backend, n_jobs_outer, n_jobs_inner)

                print("Training with {}...".format(name), end="")

                with parallel_backend(backend, **backend_kwargs):
                    time_start = time()
                    cv_gs = cross_validate(gs,
                                           X=X_train,
                                           y=y_train,
                                           cv=5,
                                           return_train_score=True,
                                           n_jobs=n_jobs_outer)
                    train_time[name] = time() - time_start
                    error[name] = cv_gs['test_score'].mean()

                print("done")
                df = pd.DataFrame(cv_gs)
                df.to_csv("{}.csv".format(name))

    print("{:<25} | {}".format("Backend", "Train Time"))
    print("-" * 44)
    for name in sorted(error, key=error.get):
        print("{:<25} | {}".format(name, train_time[name]))
Example #13
    def _calculate_vif(self):
        if lib == 'sklearn':
            vif = [
                self.viffunc(self.X.iloc[:, variables], ix)
                for ix in range(self.X.iloc[:, variables].shape[1])
            ]
        elif lib == 'statsmodels':
            with parallel_backend('threading', n_jobs=self.n_jobs):
                vif = Parallel()(
                    delayed(self.viffunc)(self.X.iloc[:, variables].values, ix)
                    for ix in range(self.X.iloc[:, variables].shape[1]))
        else:
            vif = [
                self.viffunc(self.X.iloc[:, variables], ix)
                for ix in range(self.X.iloc[:, variables].shape[1])
            ]
        return vif
Example #14
def train():

    disc = VGGNet()

    cp = SaveBestParam(dirname='best')
    early_stop = StopRestore(patience=10)
    score = Score_ConfusionMatrix(scoring="accuracy", lower_is_better=False)
    pt = PrintLog(keys_ignored="confusion_matrix")
    net = NeuralNetClassifier(disc,
                              max_epochs=100,
                              lr=0.01,
                              device='cuda',
                              callbacks=[('best', cp), ('early', early_stop)],
                              iterator_train__shuffle=True,
                              iterator_valid__shuffle=False)
    net.set_params(callbacks__valid_acc=score)
    net.set_params(callbacks__print_log=pt)

    # X, y = load_data()
    # net.fit(X, y)
    # print(1)

    param_dist = {
        'lr': [0.05, 0.01, 0.005],
    }

    search = RandomizedSearchCV(net,
                                param_dist,
                                cv=StratifiedKFold(n_splits=3),
                                n_iter=3,
                                verbose=10,
                                scoring='accuracy')

    X, y = load_data()

    # search.fit(X, y)

    Client("127.0.0.1:8786")  # create local cluster

    with joblib.parallel_backend('dask'):
        search.fit(X, y)

    with open('result.pkl', 'wb') as f:
        pickle.dump(search, f)
Example #15
def baseline_SVC(train, train_class, test, original=False):
    """
    Baseline classifier using SVC
    """

    parameters = {
        'C': np.logspace(-3, -1, 5),
        'gamma': [1, 0.1],
        'kernel': ['linear']
    }

    grid = GridSearchCV(SVC(),
                        parameters,
                        refit=True,
                        cv=3,
                        verbose=5,
                        return_train_score=True)

    with parallel_backend('threading'):
        grid.fit(train, train_class)

    best_parameters = grid.best_params_

    pred = grid.predict(test)

    filename = "Results\\baseline_SVC_performance"
    if (original):
        filename += "_original"

    write_baselines(filename, "SVC", grid)

    print("Best score for best parameters:")
    print(grid.best_score_)
    print(grid.best_params_)

    return pred
Example #16
def tune_parameters_RL(X, estimator, non_negative=0,  distributed=0,
                       scheduler_host="", coeff_penalty_range=(0.0001, 1, 10),
                       fit_params={}, scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a representations learning estimator using
    3-splits monte carlo sampling cross validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: RepresentationLearning class, optional
        The estimator you want to use to analyse the matrix.

    non_negative: boolean, optional

    distributed: int, optional
        If 0, the parameter search is executed in parallel on the machine
        where the script is launched.
        If 1, the parameter search is executed sequentially.
        If 2, the parameter search is distributed over multiple machines
        connected by dask. In this case scheduler_host must also be specified.

    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the dask
        network. The string must be "ip_address:port", for example:
        "10.251.61.226:8786"

    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval over which to tune the coefficient penalty and the
        number of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting grid search, fitted on X.

    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(coeff_penalty_range)
    if estimator is None:
        logging.error("passed estimator was None")
        raise ValueError("passed estimator was None")
    _check_estimator(estimator)

    estimator.non_negativity = non_negative

    if distributed == 2:
        if not scheduler_host:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)

    params = _get_params_coeff(estimator, coeff_penalty_range,
                               representation_learning=1)

    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                        fit_params=fit_params, iid=True, refit=True,
                        scoring=scoring_function, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed',
                              scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)

    return gscv
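
A hedged call sketch for tune_parameters_RL, following its own docstring; the estimator below is a placeholder instance, and distributed=2 would additionally need a reachable dask scheduler as described above:

# hypothetical usage (my_representation_learner is a placeholder)
import numpy as np

X = np.random.rand(100, 20)
gscv = tune_parameters_RL(X,
                          estimator=my_representation_learner,
                          non_negative=0,
                          distributed=0,  # parallel on this machine
                          coeff_penalty_range=(0.0001, 1, 10),
                          random_state=42)
print(gscv.best_params_)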
Example #17
def tune_parameters_DL(X, estimator=None, analysis=3, non_negative="none",
                       distributed=0, scheduler_host="", range_k=None,
                       dict_penalty_range=(0.0001, 1, 10),
                       coeff_penalty_range=(0.0001, 1, 10),
                       fit_params = {},
                       scoring_function=None,
                       random_state=None):
    """
    Parameters tuner.

    It tunes the parameters of a dictionary learning estimator using 3-splits
    monte carlo sampling cross validation.

    Parameters
    ----------
    X: array-like, shape=(n_samples, n_features)
        The matrix to decompose and analyse.

    estimator: DictionaryLearning class, optional
        The estimator you want to use to analyse the matrix. If None only the
        research on the best number of atoms will be done.

    analysis: int, optional
        The type of tuning you want to perform.
        - 0: tune together number of atoms and dictionary penalty and then the
             coefficients penalty
        - 1: tune only the penalties and take the number of atoms as specified
             in the estimator
        - 2: tune only the number of atoms
        - 3: tune all together, number of atoms and penalties

    non_negative: string, optional
        If "none", no non-negativity constraint is imposed on the
        decomposition. If "coeff", non-negativity is imposed only on the
        coefficients. If "both", it is imposed on both decomposition matrices.

    distributed: int, optional
        If 0, the parameter search is executed in parallel on the machine
        where the script is launched.
        If 1, the parameter search is executed sequentially.
        If 2, the parameter search is distributed over multiple machines
        connected by dask. In this case scheduler_host must also be specified.

    scheduler_host: string, optional
        If distributed=2 it is necessary to specify the scheduler of the dask
        network. The string must be "ip_address:port", for example:
        "10.251.61.226:8786"

    range_k: int or list, optional
        The maximum number of atoms to try when you search for the right k or
        the list of possible values to try.
        If None range_k will be computed as int(min(p, 0.75 * n) / 2)

    dict_penalty_range: float tuple, optional (low, high, number)
        The interval over which to tune the dictionary penalty and the
        number of values to try.

    coeff_penalty_range: float tuple, optional (low, high, number)
        The interval over which to tune the coefficient penalty and the
        number of values to try.

    fit_params: dictionary, optional
        The parameters to pass to the fitting procedure during GridSearch.

    scoring_function: callable or None, default=None
        A scorer callable object / function with signature
        scorer(estimator, X, y=None). If None, the score method of the
        estimator is used.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    GridSearchCV
        The resulting grid search, fitted on X.

    """

    # ------------------parameters control ---------------------------------- #
    X = check_array(X)
    random_state = check_random_state(random_state)
    _check_range(dict_penalty_range)
    _check_range(coeff_penalty_range)
    _check_non_negativity(non_negative, X)

    if estimator is None:
        analysis = 2
    else:
        _check_estimator(estimator)
        if estimator.non_negativity == "none":
            estimator.non_negativity = non_negative

    n, p = X.shape
    if range_k is None:
        # generally the optimal number of atoms is low
        range_k = int(min(p, 0.75 * n) / 2)

    if (analysis in [0, 1, 3] and
            (dict_penalty_range is None or coeff_penalty_range is None)):
        logging.error("The penalty ranges cannot be None for this analysis")
        sys.exit(1)

    if distributed == 2:
        if not scheduler_host:
            logging.error("Distributed execution requires a scheduler "
                          "specification. Changing the type to parallel.")
            distributed = 1
        else:
            distributed = _check_scheduler(scheduler_host)

    # first find the parameters of the dictionary, then the coefficients
    if analysis == 0:
        params = _get_params_dict(estimator,
                                  dict_penalty_range=dict_penalty_range)
        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k

        jobs = 1 if distributed == 1 else cpu_count()
        # the sampler must exist before this first grid search
        ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                                 random_state=random_state)
        gscv = GridSearchCV(estimator, params, cv=ss, n_jobs=jobs,
                            scoring=scoring_function,
                            iid=True, refit=True, verbose=1)
        if distributed == 2:
            register_parallel_backend('distributed', DistributedBackend)
            with parallel_backend('distributed', scheduler_host=scheduler_host):
                gscv.fit(X)
        else:
            gscv.fit(X)
        estimator = gscv.best_estimator_
        params = _get_params_coeff(estimator, coeff_penalty_range)
    # find only the penalties together
    elif analysis == 1:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)
    # find only the number of atoms
    elif analysis == 2:
        if type(range_k) is int:
            params = {'k': list(range(2, range_k))}
        else:
            params = {'k': range_k}
    # find everything together
    elif analysis == 3:
        params = _get_params(estimator, dict_penalty_range,
                             coeff_penalty_range)

        if type(range_k) is int:
            params['k'] = list(range(2, range_k))
        else:
            params['k'] = range_k
    else:
        logging.error("Unknown type of research, please try with another "
                      "setting")
        raise ValueError("Unkown type of research, please try with another"
                         "setting")

    ss = MonteCarloBootstrap(n_splits=3, test_size=0.1,
                             random_state=random_state)
    jobs = 1 if distributed == 1 else cpu_count()
    gscv = GridSearchCV(estimator, params, cv=ss, fit_params=fit_params,
                        n_jobs=jobs, iid=True, scoring=scoring_function,
                        refit=True, verbose=1)
    if distributed == 2:
        register_parallel_backend('distributed', DistributedBackend)
        with parallel_backend('distributed',
                              scheduler_host=scheduler_host):
            gscv.fit(X)
    else:
        gscv.fit(X)
    return gscv
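
Similarly, a hedged sketch of two common tune_parameters_DL calls taken from its docstring; the data and the dictionary-learning estimator are placeholders:

# hypothetical usage (X and my_dictionary_learner are placeholders)
import numpy as np

X = np.random.rand(100, 20)

# analysis=2: tune only the number of atoms (no estimator required)
gscv_k = tune_parameters_DL(X, estimator=None, analysis=2,
                            range_k=[2, 4, 8, 16])

# analysis=3: tune the number of atoms and both penalties together
gscv_all = tune_parameters_DL(X, estimator=my_dictionary_learner, analysis=3,
                              dict_penalty_range=(0.0001, 1, 10),
                              coeff_penalty_range=(0.0001, 1, 10))
print(gscv_all.best_params_)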
Example #18
#import dask_ml.joblib  # registers joblib plugin
# Scikit-learn bundles joblib, so you need to import from
# `sklearn.externals.joblib` instead of `joblib` directly
#from sklearn.externals.joblib import parallel_backend
from sklearn.datasets import load_digits
#from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVC
import numpy as np

from dask.distributed import Client
from sklearn.externals import joblib

digits = load_digits()

param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
    'class_weight': [None, 'balanced'],
}

model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=3, n_iter=50, verbose=10)

client = Client()
with joblib.parallel_backend('dask'):
    search.fit(digits.data, digits.target)
Example #19
# some parameters to test in parallel
param_space = {
    'C': np.logspace(-6, 6, 20),
    'gamma': np.logspace(-6, 1, 20)
}


svc_rbf = SVC(kernel='rbf',
              shrinking=False)

search = GridSearchCV(svc_rbf,
                      param_space,
                      return_train_score=True,
                      n_jobs=len(c))

with parallel_backend('ipyparallel'):
    search.fit(X_train, y_train)
results = search.cv_results_
results = pd.DataFrame(results)
results.to_csv(os.path.join(FILE_DIR,'scores_rbf_digits.csv'))


scores = search.cv_results_['mean_test_score'].reshape(
    len(param_space['C']), len(param_space['gamma']))

plt.figure()
#plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(param_space['gamma'])),
           ["%.2E" % x for x in param_space['gamma']],
           fontsize=8, rotation=45)
Example #20
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals.joblib import parallel_backend
import numpy as np
import data

# explicitly set random seed to help parallelization
np.random.seed(0)

# use naive oversampling for grid search
x_train_rs, x_test_rs, y_train_rs, y_test_rs = data.parse_data_random_oversample(
    './creditcard.csv')

params = {
    'hidden_layer_sizes': [(100, 50, 25), (75, 35, 15), (50, 25, 12)],
    'activation': ['logistic', 'tanh'],
    'solver': ['sgd', 'adam'],
    'alpha': 10.**-np.arange(3, 6),
    'learning_rate': ['invscaling', 'adaptive'],
    'max_iter': [1000, 1500, 2000]
}

# perform grid search with 5 fold cross validation, use all available cores
mlp_model = GridSearchCV(MLPClassifier(), params, n_jobs=-1, cv=5, verbose=2)
with parallel_backend('threading'):
    mlp_model.fit(x_train_rs, y_train_rs.ravel())

# output the best model params and score
print('best score: {0:.6f}'.format(mlp_model.best_score_))
print('best params: ')
print(mlp_model.best_params_)
Example #21
import dask.datasets as ds
import time
from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
#est.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# Parallelize Scikit-Learn Directly
from dask.distributed import Client
from sklearn.externals.joblib import parallel_backend

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)
with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    #est.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
Example #22
def use_dask_xgb(out_q, emotion, df: dd.DataFrame):
    # data_columns = [x for x in df.columns if 'annotated' not in x and 'predicted' not in x and 'patient' not in x]
    # data = df[data_columns]
    # # data.convert_objects(convert_numeric=True).dropna()
    # data = data.apply(lambda x: pd.to_numeric(x, errors='coerce'),axis=1, meta={'x': 'f8', 'y': 'f8'})
    # data = data.fillna(0)

    # labels = df[df['annotated'] != "N/A"]
    # labels = labels['annotated']
    # # labels = labels.assign(lambda x: 1 if x['annotated'] == emotion else 0)
    # labels = labels.apply(lambda x: 1 if x == emotion else 0, meta={'x': 'f8', 'y': 'f8'})
    # labels = labels.fillna(0)
    # # labels = labels.compute()

    # X_train, X_test, y_train, y_test = train_test_split(data.values, labels.values)

    # classifier = XGBClassifier()
    # scoring = ['precision', 'recall']
    # scores = cross_val_score(
    # classifier, X_train, y_train, scoring=scoring)
    # out_q.put("Cross val precision for classifier {0}:\n{1}\n".format(
    # classifier, scores['precision'].mean()))
    # out_q.put("Cross val recall for classifier {0}:\n{1}\n".format(
    # classifier, scores['recall'].mean()))

    data_columns = [
        x for x in df.columns if 'predicted' not in x and 'patient' not in x
        and 'session' not in x and 'vid' not in x
    ]
    df = df[data_columns]
    data = df[df['annotated'] != "N/A"]
    data = data[data['annotated'] != ""]

    emote_data = data[data['annotated'] == emotion]
    non_emote_data = data[data['annotated'] != emotion]

    non_emote_data = non_emote_data.sample(frac=len(emote_data) /
                                           len(non_emote_data))

    data = dd.concat([emote_data, non_emote_data], interleave_partitions=True)
    labels = (data['annotated'] == emotion)

    # print(labels.unique().compute())

    del data['annotated']
    print("PERSISTING DATA")
    # data, labels = dask.persist(data, labels)
    # data = client.compute(data)
    # labels = client.compute(labels)
    data = data.compute()
    labels = labels.compute()
    # df2 = dd.get_dummies(data.categorize()).persist()
    # X_train, X_test, y_train, y_test = train_test_split(df2, labels)
    X_train, X_test, y_train, y_test = train_test_split(data, labels)
    # X_train, X_test = data.random_split([.9,.1])
    # y_train, y_test = labels.random_split([.9,.1])

    # cluster = LocalCluster(n_workers=16)
    # cluster = LocalCluster()
    # client = Client(cluster)
    # client = Client('scheduler-address:8786', processes=False)
    # classifier = XGBClassifier()
    scoring = ['precision', 'recall']
    print("TRAINING")
    # classifier.fit(X_train, y_train)
    classifier = RandomForestClassifier(n_estimators=100)

    with parallel_backend('dask'):
        # scores = cross_validate(
        # classifier, X_train.values, y_train.values, scoring=scoring)
        scores = cross_validate(classifier,
                                X_train,
                                y_train,
                                scoring=scoring,
                                cv=5,
                                return_train_score=True)
    out_q.put("Scores for emotion {0} \n".format(emotion))
    out_q.put("Cross val train precision for classifier {0}:\n{1}\n".format(
        classifier, scores['train_precision'].mean()))
    out_q.put("Cross val train recall for classifier {0}:\n{1}\n".format(
        classifier, scores['train_recall'].mean()))
    out_q.put("Cross val test precision for classifier {0}:\n{1}\n".format(
        classifier, scores['test_precision'].mean()))
    out_q.put("Cross val test recall for classifier {0}:\n{1}\n".format(
        classifier, scores['test_recall'].mean()))

    # expected = y_test.values
    # predicted = classifier.predict(X_test.values)
    print("PREDICTING")
    expected = y_test
    with parallel_backend('dask'):
        classifier.fit(X_train, y_train)

        predicted = classifier.predict(X_test)
        # predicted = classifier.predict(X_test)

        out_q.put(
            "Classification report for classifier %s:\n%s\n" %
            (classifier, metrics.classification_report(expected, predicted)))
        out_q.put("Confusion matrix:\n%s\n" %
                  metrics.confusion_matrix(expected, predicted))
    # classifier.save_model('{0}_trained_XGBoost_with_pose')
    pickle.dump(
        classifier,
        open('{0}_trained_RandomForest_with_pose.pkl'.format(emotion), 'wb'))
Example #23
def run_task(seed, task_id, estimator_name, n_iter, n_jobs, n_folds_inner_cv,
             profile, joblib_tmp_dir, run_tmp_dir):

    # retrieve dataset / task
    task = openml.tasks.get_task(task_id)
    num_features = task.get_X_and_y()[0].shape[1]
    indices = task.get_dataset().get_features_by_type('nominal',
                                                      [task.target_name])

    # retrieve classifier
    classifierfactory = openmlstudy14.pipeline.EstimatorFactory(
        n_folds_inner_cv, n_iter, n_jobs)
    estimator = classifierfactory.get_flow_mapping()[estimator_name](
        indices, num_features=num_features)

    print('Running task with ID %d.' % task_id)
    print('Arguments: random search iterations: %d, inner CV folds %d, '
          'n parallel jobs: %d, seed %d' %
          (n_iter, n_folds_inner_cv, n_jobs, seed))
    print('Model: %s' % str(estimator))
    flow = openml.flows.sklearn_to_flow(estimator)
    flow.tags.append('study_14')

    import time
    start_time = time.time()

    # TODO generate a flow first
    if profile is None:
        import warnings
        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore', module=r'sklearn\.externals\.joblib\.parallel')
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)
    else:
        print('Using ipython parallel with scheduler file %s' % profile)

        for i in range(1000):
            profile_file = os.path.join(os.path.expanduser('~'), '.ipython',
                                        'profile_%s' % profile, 'security',
                                        'ipcontroller-engine.json')
            try:
                with open(profile_file) as fh:
                    scheduler_information = yaml.safe_load(fh)
                break
            except FileNotFoundError:
                print('scheduler file %s not found. sleeping ... zzz' %
                      profile_file)
                time.sleep(1)
                continue

        c = Client(profile=profile)
        bview = c.load_balanced_view()
        register_parallel_backend(
            'ipyparallel',
            lambda: NPCachingIpyParallelBackend(view=bview,
                                                tmp_dir=joblib_tmp_dir))

        with parallel_backend('ipyparallel'):
            run = openml.runs.run_flow_on_task(task, flow, seed=seed)

    end_time = time.time()
    run.tags.append('study_14')

    tmp_dir = os.path.join(run_tmp_dir,
                           '%s_%s' % (str(task_id), estimator_name))
    print(tmp_dir)
    try:
        os.makedirs(tmp_dir)
    except Exception as e:
        print(e)
    run_xml = run._create_description_xml()
    predictions_arff = arff.dumps(run._generate_arff_dict())

    with open(tmp_dir + '/run.xml', 'w') as f:
        f.write(run_xml)
    with open(tmp_dir + '/predictions.arff', 'w') as f:
        f.write(predictions_arff)

    run_prime = run.publish()
    print('READTHIS', estimator_name, task_id, run_prime.run_id,
          end_time - start_time)

    return run
Example #24
                        '--classifier',
                        default='SVC',
                        choices=classifier_choices,
                        help='Classifier used by the model')
    parser.add_argument('--train',
                        default=10000,
                        help='Number of training sample to use')
    parser.add_argument('--valid',
                        default=1000,
                        help='Number of validation sample to use')
    args = parser.parse_args()

    logging.info(f'{args}')

    Model = getattr(models, args.model)
    Classifier = getattr(classifiers, args.classifier)
    X_train, Y_train = read_data(get_dataset('train'), sample_n=args.train)
    X_valid, Y_valid = read_data(get_dataset('valid'), sample_n=args.valid)
    model = Model(classifier=Classifier,
                  steps=[args.feature_model],
                  memory='data/feature_cache')

    with joblib.parallel_backend('threading', n_jobs=4):
        model.fit(X_train, Y_train)
        score = model.score(X_valid, Y_valid)
        logging.info('')
        logging.info(f'Overall F1: {score:.4f}')
        logging.info('')

    save_model(model)
Example #25
                  end="")
            estimator_params = estimator.get_params()

            estimator.set_params(
                **{
                    p: RANDOM_STATE
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=-1)

            # Key for the results
            name = "%s, %s" % (est_name, backend)

            with parallel_backend(backend, **backend_kwargs):
                time_start = time()
                estimator.fit(X_train, y_train)
                train_time[name] = time() - time_start

            time_start = time()
            y_pred = estimator.predict(X_test)
            test_time[name] = time() - time_start

            error[name] = zero_one_loss(y_test, y_pred)

            print("done")

    print()
    print("Classification performance:")
    print("===========================")
Example #26
import distributed.joblib
# Scikit-learn bundles joblib, so you need to import from
# `sklearn.externals.joblib` instead of `joblib` directly
# Depending on the packaging this may not hold (e.g. Fedora/Debian
# unbundle joblib); in that case, use the following instead:
# from joblib import parallel_backend
from sklearn.externals.joblib import parallel_backend
from sklearn.datasets import load_digits
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np

#load mnist digits data
digits = load_digits()

#set up the parameters to be explored
param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
    'class_weight': [None, 'balanced'],
}

#create the model
model = SVC(kernel='rbf')
search = RandomizedSearchCV(model, param_space, cv=3, n_iter=50, verbose=10)

#using sklearn's parallel_backend
with parallel_backend('dask.distributed', scheduler_host='localhost:8888'):
    search.fit(digits.data, digits.target)
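
Given the packaging caveat in the comments above, a version-robust import sketch (assumption: either standalone joblib or an older scikit-learn that still vendors it is installed):

try:
    from joblib import parallel_backend  # standalone joblib
except ImportError:
    from sklearn.externals.joblib import parallel_backend  # vendored copy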
Example #27
def test_sklearn():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups

    from dask_ml.wrappers import ParallelPostFit
    import dask.array as da  # used below for da.concatenate / da.from_array

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               cv=3,
                               refit=False,
                               iid=False)
    grid_search.fit(data.data, data.target)

    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)

    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    #
    X_train, y_train = make_classification(n_features=2,
                                           n_redundant=0,
                                           n_informative=2,
                                           random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)

    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)

    # from tpot import TPOTClassifier
    pass
Example #28
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.externals import joblib


def simple_nn(hidden_neurons):
  model = Sequential()
  model.add(Dense(hidden_neurons, activation='relu', input_dim=30))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
  return model

param_grid = {'hidden_neurons': [100, 200, 300]}
if __name__ == '__main__':
    client = Client()
    cv = GridSearchCV(KerasClassifier(build_fn=simple_nn, epochs=100),
                      param_grid)
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
        cv.fit(X_train, y_train)
    print(f'Best Accuracy for {cv.best_score_:.4} using {cv.best_params_}')
Example #29
from dask.distributed import Client, progress, wait
client = Client('149.165.148.24:8786')
print(client)

X, y = Xdata / 255., ydata
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

mlp = MLPClassifier(hidden_layer_sizes=(100, 10),
                    max_iter=10,
                    solver='sgd',
                    verbose=10,
                    random_state=1)

from sklearn.externals import joblib
with joblib.parallel_backend('dask', scheduler_host='149.165.148.24:8786'):
    get_ipython().run_line_magic('time', 'mlp.fit(X_train, y_train)')

print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))

# ### With 100 iterations

mlp = MLPClassifier(hidden_layer_sizes=(100, 10),
                    max_iter=100,
                    solver='sgd',
                    verbose=10,
                    random_state=1)
from sklearn.externals import joblib
with joblib.parallel_backend('dask', scheduler_host='149.165.148.24:8786'):
    get_ipython().run_line_magic('time', 'mlp.fit(X_train, y_train)')
Example #30
# Instead of creating the client directly, you can form your own cluster
# and then build the client on top of that.

from dask.distributed import Client, LocalCluster
cluster = LocalCluster()
client = Client(cluster)

# This will allow you to control your cluster's properties.


# The next step is to instantiate the dask joblib backend. You need to
# import parallel_backend from sklearn's vendored joblib, as shown below.


import dask_ml.joblib
from sklearn.externals.joblib import parallel_backend
with parallel_backend('dask'):
    # Your normal scikit-learn code here
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()


# I want to parallelize custom workflows.
# This can be used with the scikit-learn pipeline.

# Let's say I have a function that does some processing on the data,
# and I want to parallelize that processing using dask.

def process(data):
    # placeholder: the real per-chunk computation goes here
    return data
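
One way to finish the thought above: fan process() out over the dask cluster through joblib's Parallel/delayed. A minimal sketch, assuming process() is pure and picklable and the LocalCluster/Client created earlier are still running:

from sklearn.externals.joblib import Parallel, delayed

chunks = [list(range(i, i + 1000)) for i in range(0, 10000, 1000)]

# tasks are routed to the dask workers behind `client`
with parallel_backend('dask'):
    results = Parallel(verbose=5)(delayed(process)(c) for c in chunks)
print(results[:3])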