Example #1
    def _train(self):
        x = self._train_set.features
        y = self._train_set.outputs

        x, y = filter_outliers(x, y, n_estimators=200, contamination=0.003)

        pipe = pipeline.Pipeline([
            #('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=15)),
            ('expand', preprocessing.PolynomialFeatures()),
            ('estim', linear_model.LassoLars())
        ])

        param_grid = [{
            'expand__include_bias': [False],
            'expand__degree': [3],
            'estim__normalize': [False],
            'estim__fit_intercept': [True],
            'estim__alpha': [0.313]
            #'estim__alpha': list(0.01 + 1 * i for i in range(0, 9))
            #'estim__l1_ratio': list(0.80 + 0.01 * i for i in range(0, 6))
        }]

        grid = model_selection.GridSearchCV(pipe,
                                            cv=3,
                                            n_jobs=1,
                                            param_grid=param_grid,
                                            verbose=1,
                                            scoring=metrics.make_scorer(
                                                metrics.mean_squared_error,
                                                greater_is_better=False))
        grid.fit(x, y)

        print(grid.best_estimator_)
        print(grid.cv_results_)

        estim = grid.best_estimator_.named_steps['estim']
        coeffs = estim.coef_

        polyexp = grid.best_estimator_.named_steps['expand']
        f_names = polyexp.get_feature_names(NAMES[1:])
        """
        for elem in sorted(zip(coeffs, f_names), reverse=True):
            print(elem)
        """

        self._model = grid.predict
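On scikit-learn 1.0 and later, the get_feature_names call used above was renamed to get_feature_names_out (the old name was removed in 1.2). A minimal sketch of the newer call, on a throwaway expander rather than the pipeline above:

# Sketch: feature-name lookup on current scikit-learn; the two-column input
# and the names ['a', 'b'] are placeholders, not part of the example above.
from sklearn import preprocessing

expand = preprocessing.PolynomialFeatures(degree=3, include_bias=False)
expand.fit([[1.0, 2.0], [3.0, 4.0]])
print(expand.get_feature_names_out(['a', 'b']))  # e.g. ['a' 'b' 'a^2' 'a b' ... 'b^3']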
Example #2
def optimizeSVM(X_norm, y, kFolds=10):
    clf = pipeline.Pipeline([
        ('svc', svm.SVC(kernel='rbf')),
    ])
    # grid search over multiple parameters
    parameters = {
        'svc__gamma': np.logspace(-3, 11, 8, base=2),
        'svc__C': np.logspace(-3, 15, 10, base=2),
    }
    gs = grid_search.GridSearchCV(clf,
                                  parameters,
                                  verbose=1,
                                  refit=False,
                                  cv=kFolds)
    gs.fit(X_norm, y)
    return gs.best_params_['svc__gamma'], gs.best_params_[
        'svc__C'], gs.best_score_
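The grid_search module used here predates scikit-learn 0.18 and has since been removed. A hedged sketch of the same search against sklearn.model_selection, with a synthetic dataset standing in for X_norm and y:

# Sketch: equivalent RBF-SVM grid search with the current model_selection API;
# the make_classification data below is illustrative only.
import numpy as np
from sklearn import datasets, model_selection, pipeline, preprocessing, svm

X, y = datasets.make_classification(n_samples=200, n_features=10, random_state=0)
X_norm = preprocessing.StandardScaler().fit_transform(X)

clf = pipeline.Pipeline([('svc', svm.SVC(kernel='rbf'))])
parameters = {
    'svc__gamma': np.logspace(-3, 11, 8, base=2),
    'svc__C': np.logspace(-3, 15, 10, base=2),
}
gs = model_selection.GridSearchCV(clf, parameters, cv=10, refit=False, verbose=1)
gs.fit(X_norm, y)
print(gs.best_params_['svc__gamma'], gs.best_params_['svc__C'], gs.best_score_)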
Example #3
    def getTransformer(self, **params):
        # numvars = ["blood_pressure", "cholestoral", "max_heart_rate", "age"]
        # cateVars = ["cp", "thal"]
        ct = compose.ColumnTransformer(
            [
                # ("norm", preprocessing.StandardScaler(), self._getIndex(numvars)),
                # (
                #     "cate",
                #     preprocessing.OneHotEncoder(handle_unknown="ignore"),
                #     self.getIndex(cateVars),
                # ),
            ],
            remainder="passthrough",
        )
        transformer = pipeline.Pipeline([("norm", ct)])
        transformer.set_params(**params)
        return transformer
Example #4
def optimizeAdaBoost(X_norm, y, clf, kFolds=10):
    clf = pipeline.Pipeline([
        ('ada', clf),
    ])
    # grid search over multiple parameters
    parameters = {
        # 'ada__n_estimators': np.logspace(0, 3, 20),
        'ada__n_estimators': np.linspace(1, 100, 10, dtype=np.dtype(np.int16)),
        # 'svc__gamma': np.linspace(0, 50, 20),
    }
    gs = grid_search.GridSearchCV(clf,
                                  parameters,
                                  verbose=1,
                                  refit=False,
                                  cv=kFolds)
    gs.fit(X_norm, y)
    return gs.best_params_['ada__n_estimators'], gs.best_score_
Example #5
    def test__is_pytorch_flow(self):
        self.sklearn_dummy_model = pipeline.Pipeline(
            steps=[('imputer',
                    Imputer()), ('estimator', tree.DecisionTreeClassifier())])

        self.pytorch_flow = self.extension.model_to_flow(
            self.pytorch_dummy_model)
        self.pytorch_flow_external_version = self.extension._is_pytorch_flow(
            self.pytorch_flow)

        self.sklearn_flow = SklearnExtension().model_to_flow(
            self.sklearn_dummy_model)
        self.sklearn_flow_external_version = self.extension._is_pytorch_flow(
            self.sklearn_flow)

        self.assertTrue(self.pytorch_flow_external_version)
        self.assertFalse(self.sklearn_flow_external_version)
Example #6
def test_cv():
    doctor = strategyGame.strategyGameDoctor()
    Xdata, ydata = getData()
    X = doctor.readXdata(Xdata)
    y = doctor.readydata(ydata)
    # X, y = preprocess.balanceSample(X, y)
    transformer = doctor.getTransformer()
    model = doctor.getModel()
    pipe = pipeline.Pipeline([("transformer", transformer), ("model", model)])
    scores = model_selection.cross_val_score(pipe,
                                             X,
                                             y,
                                             cv=5,
                                             scoring="accuracy")
    scoreMean, scoreStd = scores.mean(), scores.std()
    print("\nCross Validtion Report")
    print(f"Baseline Score:{scoreMean:.2f} +/-{scoreStd*1:.2f}")
Example #7
def build_ensemble_classification_pipeline():

    models = [
        ("KNN", KNeighborsClassifier()),
        ("SVM", SVC()),
        ("RF", RandomForestClassifier()),
        ("bayes", GaussianNB()),
    ]
    final_classifier = LogisticRegression()
    classifier = StackingClassifier(estimators=models,
                                    final_estimator=final_classifier)

    return pipeline.Pipeline([
        ("zero_variance_filter", VarianceThreshold(threshold=0)),
        ("Lasso", SelectFromModel(Lasso())),
        ("classifier", regressor),
    ])
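A short usage sketch for the builder above, assuming it is in scope; the breast-cancer dataset is only a stand-in for real data:

# Sketch: fit the stacking pipeline on a toy dataset and report held-out accuracy.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = build_ensemble_classification_pipeline()
model.fit(X_train, y_train)
print("held-out accuracy:", model.score(X_test, y_test))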
Example #8
    def __init__(self, datasource, classifier):
        logger.info('Training %s on %s', classifier.__class__.__name__,
                    datasource.__class__.__name__)

        # datasource yields (label, text) pairs
        y, X = zip(*datasource)

        self.name = datasource.__class__.__name__ + ':' + classifier.__class__.__name__
        self.pipeline = pipeline.Pipeline([
            ('dictionary',
             feature_extraction.text.CountVectorizer(
                 tokenizer=self.tokenizer)),
            ('tfidf', feature_extraction.text.TfidfTransformer()),
            ('classifier', classifier),
        ])

        self.pipeline.fit(X, y)
Example #9
def construct_tar(data, proxies, drop_all=False):
    tar = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                             ('pred', TAR(fit_intercept=True,
                                          normalize=False))])

    estimators = {}
    for prox in proxies:
        estimators[f"TAR ({prox})"] = {
            'pipe': deepcopy(tar),
            'drop_cols': proxies if drop_all else [prox],
            'fit_params': {
                'pred__A': data['train'][prox],
                'pred__nu': data['test'][prox]
            }
        }

    return estimators
Example #10
def func1():
    user = {}
    for line in fileinput.input("../../data/select/select_a"):
        mac = line.strip().split(" ")[0]
        user[mac] = True
    fileinput.close()
    docList, classList = [], []
    for line in fileinput.input(
            "../../data/feature/trace_http_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        # print(len(feat))
        if mac in user:
            _list = []
            for f in feat:
                _list.append(float(f))
            docList.append(_list)
            classList.append(sex)
    fileinput.close()
    docList, classList = np.array(docList), np.array(classList)
    min_max_scaler = preprocessing.MinMaxScaler()
    docList = min_max_scaler.fit_transform(docList)
    cnt, errorCount = 0, 0
    loo = LeaveOneOut(len(classList))
    trainingdoc, trainingclass = [], []
    for train, test in loo:
        cnt += 1
        print(cnt)
        trainingdoc, trainingclass, testingdoc, testingclass = docList[
            train], classList[train], docList[test], classList[test]
        # clf = svm.SVC(kernel='rbf', class_weight='auto')
        clf = pipeline.Pipeline([
            # ('feature_selection', feature_selection.SelectKBest(k=10)),
            ('feature_selection', svm.LinearSVC(penalty="l1", dual=False)),
            # ('feature_selection', linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='auto', random_state=None)),
            ('classification', tree.DecisionTreeClassifier())
            # ('classification', ensemble.RandomForestClassifier())
            # ('classification', SGDClassifier(loss="hinge", penalty="l2"))
            # ('classification', svm.SVC(kernel='linear', class_weight='auto'))
            # ('classification', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))
        ])
        clf.fit(trainingdoc, trainingclass)
        for i in range(len(testingdoc)):
            if not testingclass[i] == clf.predict(testingdoc[i])[0]:
                errorCount += 1
    print('the error rate is:', float(errorCount) / len(classList))
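This function targets the long-removed sklearn.cross_validation API. A small sketch of how the same leave-one-out split is obtained from model_selection on current releases; the 4x2 demo array is only illustrative:

# Sketch: LeaveOneOut on scikit-learn >= 0.18; it takes no sample count and
# yields index pairs from .split().
import numpy as np
from sklearn import model_selection

X_demo = np.arange(8).reshape(4, 2)
for train_idx, test_idx in model_selection.LeaveOneOut().split(X_demo):
    print(train_idx, test_idx)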
Example #11
def train_ann(X_train, targets):
    elm = pipeline.Pipeline([('rhl',
                              RandomLayer(n_hidden=200,
                                          activation_func='multiquadric',
                                          alpha=0.69)),
                             ('lr', LinearRegression(fit_intercept=False))])

    #elmr = GenELMRegressor( hidden_layer = rl )
    #elmr = ELMRegressor(n_hidden=98,random_state=0, alpha=0.8)
    elm.fit(X_train, targets)
    tr_mse = mean_squared_error(targets, predict(elm, X_train))
    print("training mse: ", tr_mse)
    # save model
    joblib.dump(
        elm, 'models/2018/23_06_18/elm/diel_diamond/elm_XX_diel_diamond.pkl')

    return (elm, tr_mse)
Example #12
def build_nltk_pipeline(model_params: Dict[str, Any]) -> pipeline.Pipeline:
    """Build the HMM pipeline.

    Parameters
    ----------
    model_params : dict
        Model parameters.

    Returns
    -------
    pipeline.Pipeline
        Built pipeline that acts as the model.
    """
    return pipeline.Pipeline([
        ("feature_extractor", TokenExtractor()),
        ("model", NLTKModel(**model_params)),
    ])
Example #13
def execute_pipeline(x_data_scaled, y_data, n_pca, n_select_best):
    # make sure the model knows:
    # 1. The dimensionality-reduction method, with nothing reduced by default:
    # if n_components is not set,
    # all components are kept (http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
    model_pca = decomposition.PCA()

    # 2. The feature-selection method;
    # the 'all' option bypasses selection,
    # for use in a parameter search (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html)
    model_variable_best = feature_selection.SelectKBest(k="all")

    # 3. Machine Learning algo to use
    machine_learner = linear_model.RANSACRegressor()

    # Package the process
    process = [('pca', model_pca), ('select_best', model_variable_best),
               ('ml', machine_learner)]

    # line it up to execute
    pipeline_process = pipeline.Pipeline(process)

    # search space, where all the process keys can be modified
    search_space = dict(pca__n_components=n_pca, select_best__k=n_select_best)

    # number of cross-validation folds
    n_cv = 10

    # wrap the machine-learning pipeline in a grid search over the cross-validation folds
    model = model_selection.GridSearchCV(pipeline_process,
                                         param_grid=search_space,
                                         cv=n_cv,
                                         n_jobs=-1)

    # run the model
    model.fit(x_data_scaled, y_data)

    # ask for results
    results = model_selection.cross_val_score(model,
                                              x_data_scaled,
                                              y_data,
                                              n_jobs=-1)
    print("Prediction score of the model: %.2f%% (%.5f standard deviation)" %
          (results.mean() * 100, results.std()))
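A hedged example of calling execute_pipeline, assuming the function above is in scope; the diabetes dataset and the candidate grids are illustrative only:

# Sketch: grid values for pca__n_components and select_best__k are passed in
# as lists; 'all' keeps every feature that survives PCA.
from sklearn import datasets, preprocessing

X, y = datasets.load_diabetes(return_X_y=True)
X_scaled = preprocessing.StandardScaler().fit_transform(X)

execute_pipeline(X_scaled, y, n_pca=[5, 10], n_select_best=[2, 5, 'all'])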
Example #14
def train_wdclassifier_user(training_set: Tuple[np.ndarray, np.ndarray],
                            svmType: str,
                            C: float,
                            gamma: Optional[float]) -> sklearn.svm.SVC:
    """ Trains an SVM classifier for a user

    Parameters
    ----------
    training_set: Tuple (x, y)
        The training set (features and labels). y should have labels -1 and 1
    svmType: string ('linear' or 'rbf')
        The SVM type
    C: float
        Regularization for the SVM optimization
    gamma: float
        Hyperparameter for the RBF kernel

    Returns
    -------
    sklearn.svm.SVC:
        The learned classifier

    """

    assert svmType in ['linear', 'rbf']

    train_x = training_set[0]
    train_y = training_set[1]

    # Adjust for the skew between positive and negative classes
    n_genuine = len([x for x in train_y if x == 1])
    n_forg = len([x for x in train_y if x == -1])
    skew = n_forg / float(n_genuine)

    # Train the model
    if svmType == 'rbf':
        model = sklearn.svm.SVC(C=C, gamma=gamma, class_weight={1: skew})
    else:
        model = sklearn.svm.SVC(kernel='linear', C=C, class_weight={1: skew})

    model_with_scaler = pipeline.Pipeline([('scaler', preprocessing.StandardScaler(with_mean=False)),
                                           ('classifier', model)])
    model_with_scaler.fit(train_x, train_y)

    return model_with_scaler
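A usage sketch for the helper above, assuming it is in scope; the random feature vectors and ±1 labels merely stand in for real genuine/forgery descriptors:

# Sketch: train one per-user verifier on placeholder data.
import numpy as np

rng = np.random.default_rng(0)
genuine = rng.normal(0.0, 1.0, size=(12, 16))
forgeries = rng.normal(0.5, 1.0, size=(30, 16))
feats = np.vstack([genuine, forgeries])
labels = np.array([1] * 12 + [-1] * 30)

clf = train_wdclassifier_user((feats, labels), svmType='rbf', C=1.0, gamma=2 ** -11)
print(clf.predict(feats[:3]))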
Example #15
def prediction_move():
    mac_map, X, y = {}, [], []
    for line in fileinput.input("_jaccount/mac_user_info.txt"):
        # mac, val = line.strip().split("\t")[0], line.strip().split("\t")[3].split(" ")[1]
        # if val in ["男性","女性"]:
        # 	mac_map[mac] = 0 if val == "男性" else 1
        mac, val = line.strip().split("\t")[0], line.strip().split(
            "\t")[3].split(" ")[2]
        if val in ["教职员", "研究生", "本科生"]:
            mac_map[mac] = 0 if val == "教职员" else 1 if val == "研究生" else 2
    fileinput.close()
    for line in gzip.open("_feature/move_vector.txt.gz", "rt"):
        mac = line.strip().split(" ")[0]
        if mac in mac_map:
            X.append([int(i) for i in line.strip().split(" ")[1:]])
            y.append(mac_map[mac])
    from sklearn import pipeline
    from sklearn import linear_model
    from sklearn import svm
    from sklearn import tree
    from sklearn import ensemble
    from sklearn.cross_validation import KFold
    from sklearn.cross_validation import LeaveOneOut
    total, right = 0, 0
    X, y = np.array(X), np.array(y)
    loo = LeaveOneOut(len(X))
    print(len(y))
    for train, test in loo:
        clf = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l1')),
            # ('feature_selection', svm.LinearSVC(penalty="l1",dual=False)),
            # ('classification', svm.SVC())
            # ('classification', svm.LinearSVC())
            # ('classification', tree.DecisionTreeClassifier())
            ('classification', ensemble.RandomForestClassifier())
            # ('classification', ensemble.GradientBoostingClassifier())
        ])
        clf = clf.fit(X[train], y[train])
        r = clf.predict(X[test])[0]
        print(r, y[test])
        if r == y[test]:
            right += 1
        total += 1
    print(float(right) / total)
Example #16
    def do(self, n_pts):
        """
        Extract the model using a linear classifier
        over an approximate feature map of an RBF kernel,
        with n pairs of points on the decision boundary
        of the ATTACKED MODEL.
        :param n_pts:
        :return:
        """
        # Collect n pairs of points on the decision boundary of the oracle.
        # Note: this is the opposite of what we expected.
        X, y = self.collect_pts(n_pts)

        print('done collecting points')

        rbf_map = RBFSampler(n_components=n_pts, random_state=1)
        solver = HyperSolver(p=self.POS, n=self.NEG)
        rbf_solver = pipeline.Pipeline([("mapper", rbf_map),
                                        ("solver", solver)])

        gamma_range = np.logspace(-15, 6, 22, base=2)
        param_grid = dict(mapper__gamma=gamma_range)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
        grid = GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
        grid.fit(X, y)

        scores = [x[1] for x in grid.grid_scores_]
        scores = np.array(scores).reshape(len(gamma_range))
        plt.figure(figsize=(8, 6))
        plt.plot(gamma_range, scores)

        plt.xlabel('gamma')
        plt.ylabel('score')
        plt.title('Validation accuracy (RTiX, %s)' %
                  os.path.basename(self.name))
        plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

        # final train
        g = grid.best_params_['mapper__gamma']
        print('best parameters are g=%f' % g)
        rbf_svc2 = grid.best_estimator_
        y_pred = rbf_svc2.predict(self.Xt)
        print('SCORE: %f' % sm.accuracy_score(self.Yt, y_pred))
        return grid.best_score_, sm.accuracy_score(self.Yt, y_pred)
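StratifiedShuffleSplit and grid_scores_ here follow the pre-0.18 API. A hedged fragment showing the same steps on current scikit-learn, reusing the rbf_solver, param_grid, X and y defined above:

# Sketch: current model_selection equivalents of the calls above.
from sklearn import model_selection

cv = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
grid = model_selection.GridSearchCV(rbf_solver, param_grid=param_grid, cv=cv, n_jobs=8)
grid.fit(X, y)
scores = grid.cv_results_['mean_test_score']  # replaces [x[1] for x in grid.grid_scores_]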
Example #17
def construct_xtar(data, proxies, drop_all=False):
    xtar = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                              ('pred', XTAR(fit_intercept=True,
                                            normalize=False))])

    estimators = {}
    for prox_combo in it.permutations(proxies, 2):
        estimators[f"xTAR ({prox_combo[0]}, {prox_combo[1]})"] = {
            'pipe': deepcopy(xtar),
            'drop_cols': proxies if drop_all else [p for p in prox_combo],
            'fit_params': {
                'pred__W': data['train'][prox_combo[0]],
                'pred__Z': data['train'][prox_combo[1]],
                'pred__nu': data['test'][prox_combo[0]]
            }
        }

    return estimators
Example #18
def build_ensemble_regression_pipeline(dimensionality_reduction_name):

    models = [
        ("KNN", KNeighborsRegressor()),
        ("SVR", SVR()),
        ("RF", RandomForestRegressor()),
    ]
    regressor = WeightedAverageEnsemble(estimators=models)
    if dimensionality_reduction_name == "lasso":
        dimensionality_reduction = SelectFromModel(Lasso(), threshold="median")
    elif dimensionality_reduction_name == "PCA":
        dimensionality_reduction = PCA(n_components=0.75, svd_solver="full")

    return pipeline.Pipeline([
        ("zero_variance_filter", VarianceThreshold(threshold=0)),
        (dimensionality_reduction_name, dimensionality_reduction),
        ("classifier", regressor),
    ])
Example #19
def pre_processing(data_sets: tuple, threshold=0.5, impute_strategy="median"):
    # merge multi data sets
    new_data = np.concatenate(data_sets, axis=1)
    # remove rows and columns that are all zeros
    new_data = new_data[~np.all(new_data == 0, axis=1), :]
    new_data = new_data[:, ~np.all(new_data == 0, axis=0)]
    # drop rows and columns whose NaN fraction exceeds the threshold (disabled)
    # new_data = new_data[np.isnan(new_data).sum(axis=1) < threshold * new_data[:, :-1].shape[1], :]
    # new_data = new_data[:, np.isnan(new_data).sum(axis=0) < threshold * new_data[:, :-1].shape[0]]
    # impute missing values and standardize
    data_preprocess = pipeline.Pipeline([
        ('imputer', impute.SimpleImputer(strategy=impute_strategy)),
        ('std_scaler', preprocessing.StandardScaler()),
    ])
    data_processed = data_preprocess.fit_transform(new_data[:, :-1])
    data_processed = np.concatenate((data_processed, new_data[:, [-1]]),
                                    axis=1)
    return data_processed
Example #20
    def button(self):
        print('x')

        impute_ = impute.SimpleImputer()

        scaler = preprocessing.StandardScaler()
        scaler2 = preprocessing.MinMaxScaler()

        pipe = pipeline.Pipeline([('impute', impute_), ('scaler', scaler),
                                  ('scaler2', scaler2)])

        X_data = self.dataframe.values

        new = pipe.fit_transform(X_data)
        print(new)
        print(type(new))

        self.emit_signal_for_table(new)
Example #21
    def __init__(self, model, model_name="unknown model"):
        '''
        Build a traditional (bag-of-words) text-classification model.
        :param model: scikit-learn estimator
        :param model_name: string
        '''
        self.model = pipeline.Pipeline([
            ('counts', feature_extraction.text.CountVectorizer(
                min_df=5, stop_words="english", analyzer="word",
                ngram_range=(1, 2))),
            ('tfidf', feature_extraction.text.TfidfTransformer()),
            (model_name, model),
        ])
        # self.data = data
        self.model_name = model_name
Example #22
def convert_creme_to_sklearn(estimator):
    """Wraps a creme estimator to make it compatible with scikit-learn."""

    if isinstance(estimator, compose.Pipeline):
        return pipeline.Pipeline([(name, convert_creme_to_sklearn(step))
                                  for name, step in estimator.items()])

    wrappers = [(base.BinaryClassifier, SKLClassifierWrapper),
                (base.Clusterer, SKLClustererWrapper),
                (base.MultiClassClassifier, SKLClassifierWrapper),
                (base.Regressor, SKLRegressorWrapper),
                (base.Transformer, SKLTransformerWrapper)]

    for base_type, wrapper in wrappers:
        if isinstance(estimator, base_type):
            return wrapper(copy.deepcopy(estimator))

    raise ValueError("Couldn't find an appropriate wrapper")
Example #23
def train():
    data_directory = 'data_i2r'
    user = '******'
    (train, y_train) = read_edf_data.load_data(data_directory, user,
                                               'DataTraining', True)
    (test, y_test) = read_edf_data.load_data(data_directory, user,
                                             'DataTraining', False)

    pipe = pipeline.Pipeline([('csp', CSP()), ('chan_var', ChanVar()),
                              ('svm', svm.SVC(kernel='linear'))])

    # train model
    pipe.fit(train, y_train)

    # make predictions on unseen test data
    y_pred = pipe.predict(test)

    print(metrics.classification_report(y_test, y_pred))
Example #24
    def train(self, train_instances, train_labels):

        print("n_train_instances: ", len(train_instances))
        print("classifier: ", self.clsf_name)

        t0 = time()

        model = skpipeline.Pipeline([('features', self.feature_pipelines),
                                     ('clsf', self.classifier)])

        print(model.get_params())

        print("Start training\n..")
        model.fit(train_instances, train_labels)

        t1 = time()
        print("Traning took ", round(t1 - t0, 2), "sec.")
        return model
Example #25
class categoryTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):

    def __init__(self, colname):
        self.colname = colname
        pass
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):      
        self.val = X[self.colname]        
        return self.val

#--------------------------------------------------------------------
def build_model():
    # load data
    f = open('./data/df_data.p', 'rb')
    df = pickle.load(f)          
    f.close() 

    #change data-type of prices and eliminate unreasonable values
    df['price'] = df['price'].str[1:].astype(int)
    df = df[df['price'] <= 10000]

    #split dataframe into predictors and outcomes
    y = df['price']
    X = df.drop('price', axis=1)

    #build model
    fullModel = pipeline.Pipeline([
        ("col_select", categoryTransformer(['sqft', 'br', 'ba'])),
        ("imp", Imputer(missing_values='NaN', strategy='median', axis=0)),
        ("lin", LinearRegression())
    ])
    
    #cross validate
    param_grid_pipeline = {'lin__fit_intercept':[True,False], 'lin__normalize':[True,False]}
    grid = GridSearchCV(fullModel, param_grid_pipeline, cv = 5, n_jobs = -1, verbose=1, scoring = 'mean_squared_error')
    grid.fit(X, y)

    #save model
    f = open('./data/model.p', 'wb')
    pickle.dump(grid, f)          
    f.close() 
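Imputer and the 'mean_squared_error' scoring string above were removed in later scikit-learn releases; a hedged sketch of the current equivalents (everything else in build_model can stay the same):

# Sketch: replacements on scikit-learn >= 0.22.
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='median')  # replaces Imputer(..., axis=0)
scoring = 'neg_mean_squared_error'                                 # replaces 'mean_squared_error'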
Example #26
    def _train(self):
        x = self._train_set.features
        y = self._train_set.outputs

        self._transform = pipeline.Pipeline([
            #('scale', preprocessing.StandardScaler()),
            ('proliferate', preprocessing.PolynomialFeatures(3)),
            ('pselect',
             feature_selection.SelectPercentile(feature_selection.f_regression,
                                                percentile=98)),
            #('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=750)),
        ])

        clf = linear_model.Ridge(
            alpha=500,
            fit_intercept=True,
        )
        clf.fit(self._transform.fit_transform(x, y), y)
        self._model = clf.predict
Example #27
def build_pipeline(model_params: Dict[Any, Any]) -> pipeline.Pipeline:
    """Return a pipeline that can be used end-to-end with tokenized data.

    Parameters
    ----------
    model_params : dict
        Parameters that should be used to initialize the model.

    Returns
    -------
    pipeline.Pipeline
        Built pipeline that can be used as a model to fit and predict.
    """
    return pipeline.Pipeline(
        [
            ("feature_extractor", FeatureExtractor()),
            ("model", Model(**model_params)),
        ]
    )
Example #28
def _detect_topics(model,
                   instances,
                   lang,
                   N=20,
                   stopword=True,
                   stemming=True,
                   remove_numbers=True,
                   deasciify=True,
                   remove_punkt=True,
                   lowercase=True,
                   wordngramrange=(1, 1)):

    ndim = 1
    nmaxfeature = 200

    preprocessor = prep.Preprocessor(lang=lang,
                                     stopword=stopword,
                                     more_stopwords=None,
                                     spellcheck=False,
                                     stemming=stemming,
                                     remove_numbers=remove_numbers,
                                     deasciify=deasciify,
                                     remove_punkt=remove_punkt,
                                     lowercase=lowercase)

    tfidfvect = sktext.TfidfVectorizer(tokenizer=prep.identity,
                                       preprocessor=None,
                                       lowercase=False,
                                       use_idf=True,
                                       ngram_range=wordngramrange,
                                       max_features=nmaxfeature)

    topical_transformer = skpipeline.Pipeline([
        ('txtprep', preprocessor),
        ('tfidf_vect', tfidfvect),
        #('normalizer', skprep.Normalizer()),
        ('scaler', skprep.StandardScaler(with_mean=False)),
        ('nmf', model)
    ])

    topical_transformer.fit(instances)

    return utilstopics.get_topic_words(model, tfidfvect, N)
Example #29
def get_pipeline(numberestimators, minsamplesleaf):

    classifier = get_estimator(numberestimators, minsamplesleaf)

    feature_columns = metadata.FEATURE_COLUMNS
    numerical_names = metadata.NUMERIC_FEATURES
    categorical_names = metadata.CATEGORICAL_FEATURES

    preprocessor = preprocess_utils.get_preprocess_pipeline(
        feature_columns=feature_columns,
        numerical_names=numerical_names,
        categorical_names=categorical_names)

    estimator = pipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])

    return estimator
Example #30
def get_predictions(X, y, X_test, X_cv):
    # Initialize SVD
    svd = TruncatedSVD()

    scl = StandardScaler()

    clf_model = SVC()

    clf = pipeline.Pipeline([('svd', svd), ('scl', scl), ('clf', clf_model)])

    # Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'svd__n_components': [400], 'clf__C': [12]}

    # Kappa Scorer
    kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa,
                                       greater_is_better=True)

    # Initialize Grid Search Model
    model = grid_search.GridSearchCV(estimator=clf,
                                     param_grid=param_grid,
                                     scoring=kappa_scorer,
                                     verbose=10,
                                     n_jobs=-1,
                                     iid=True,
                                     refit=True,
                                     cv=2)

    # Fit Grid Search Model
    model.fit(X, y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Get best model
    best_model = model.best_estimator_

    # Fit model with best parameters optimized for quadratic_weighted_kappa
    best_model.fit(X, y)
    preds = best_model.predict(X_test)
    preds_cv = best_model.predict(X_cv)
    return (preds, preds_cv)
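quadratic_weighted_kappa is an external helper; if it is unavailable, scikit-learn's own cohen_kappa_score with quadratic weights computes the same quadratic-weighted Cohen's kappa. A small sketch:

# Sketch: a quadratic-weighted-kappa scorer built only from scikit-learn.
from sklearn import metrics

kappa_scorer = metrics.make_scorer(
    lambda y_true, y_pred: metrics.cohen_kappa_score(y_true, y_pred, weights='quadratic'),
    greater_is_better=True)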