def SVM_LinearSVC(self):        
        SVM_Classifier = Pipeline([
                ('vectorizer', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', OneVsRestClassifier(LinearSVC()))
                ])
         
        SVM_Classifier.fit(self.X_train,self.y)
         
        predicted = SVM_Classifier.predict(self.X_test)
        y_pred = self.lb.inverse_transform(predicted)
         
        i=self.train_ex
        correct=0
        for label in y_pred:
            if label==self.Y_train[i]:
                correct=correct+1
            i = i + 1
            
        print 'Number of examples used for training:', self.train_ex
        print 'Number of correctly classified samples:', correct
        print 'Total number of samples classified in test data:', self.size - self.train_ex
        print 'The resulting accuracy using Linear SVC is', (float(correct) * 100 / float(self.size - self.train_ex)), '%\n'

        cm = confusion_matrix(self.Y_train[self.train_ex:self.size], y_pred)
        print 'The confusion matrix is:', cm
     
        return y_pred
Example #2
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True, random_state=0)

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
 def train_clf(self):
     pipeline = Pipeline([
         ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
         ("svc", LinearSVC(C=100))
     ])
     pipeline.fit(self.dataset.data, self.dataset.target)
     return pipeline
    def calcCSPLDA(epochs_train, labels_train, nb):
        """Creates the CSP+LDA pipeline and applies it to training data. 
        (essentially a thin wrapper around the MNE and scikit-learn processing functions)

        Parameters
        ----------
        epochs_train : epochs in mne data format

        labels_train : labels of epochs in mne format

        nb : number of CSP components; must be even (6 implies the 3 top-most and 3 bottom-most eigenvectors)

        Returns
        -------
        clf : the fitted model for the CSP+LDA approach

        csp.filters_ : CSP weight vector, shape (nchannels, nchannels)

        svc.coef_ : LDA weight vector, shape (1, nb)

        Examples
        --------
        >>> data_path = "/PATH/TO/FILE/somematrix.txt"
        >>> matrix_data = loadAsMatrix(data_path)
        """
        svc = LDA()
        csp = CSP(n_components=nb, reg=None, log=True, cov_est='epoch')  # use the documented nb components
        clf = Pipeline([('CSP', csp), ('SVC', svc)])

        epochs_data = epochs_train.get_data()

        clf.fit(epochs_data, labels_train)

        return clf, csp.filters_, svc.coef_
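# A minimal, self-contained sketch of the same CSP+LDA pipeline on synthetic
# data (the shapes and labels below are invented for illustration only; real
# use would pass epochs_train.get_data() and labels_train as above).
import numpy as np
from mne.decoding import CSP
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
demo_epochs = rng.randn(40, 8, 128)          # 40 epochs, 8 channels, 128 samples
demo_labels = np.array([0] * 20 + [1] * 20)  # two balanced classes

demo_clf = Pipeline([('CSP', CSP(n_components=4, log=True)),
                     ('LDA', LinearDiscriminantAnalysis())])
print(cross_val_score(demo_clf, demo_epochs, demo_labels, cv=5))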
Example #5
class MachineLearning(object):
    def __init__(self):
        # Initialize classifier and vectorizer
        self.clf = Pipeline([('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
                             ('clf', MultinomialNB(alpha=.01)),
                            ])

    def init_training(self):
        self.x_train = []
        self.y_train = []

    def add_training_data(self, data, label):
        self.x_train.append(data)
        self.y_train.append(label)

    # Train classifier
    # Can also use grid search to optimize accuracy, like
    '''
    parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'clf__alpha': (.01, .001),
    }
    gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    '''
    def train(self):
        self.clf.fit(self.x_train, self.y_train)

    # Predict result
    # We can roughly estimate the accuracy using cross validation, like
    '''
    result = clf.predict(test_dc + test_marvel)
    baseline = [0 for x in range(len(test_dc))] + [1 for x in range(len(test_marvel))]
    print np.sum(result == baseline) / float(len(result))
    '''
    def predict(self, data):
        return self.clf.predict([data])[0]
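# A self-contained sketch (toy data invented here) of the grid search mentioned
# in the comment above: tune the tf-idf n-gram range and the NB alpha on the
# same tfidf + MultinomialNB pipeline, then report the best parameters.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

toy_x = ["batman gotham night", "superman metropolis day",
         "gotham crime story", "metropolis daily planet",
         "dark knight gotham", "krypton hero metropolis"]
toy_y = [0, 1, 0, 1, 0, 1]   # made-up labels for the two toy classes

clf = Pipeline([('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
                ('clf', MultinomialNB(alpha=.01))])
parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'clf__alpha': (.01, .001)}
gs_clf = GridSearchCV(clf, parameters, cv=2, n_jobs=-1)
gs_clf.fit(toy_x, toy_y)
print(gs_clf.best_params_)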
Example #6
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print "OOB score:", rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print accuracy_score(y_true=ytest_raw,y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
    def KFOLDTEST(self, text, sent):
        k_fold = KFold(n=len(text), n_folds=6)

        pipeline = Pipeline(
            [
                ("vectorizer", CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)),
                ("tfidf", TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)),
                ("classifier", OneVsOneClassifier(LinearSVC())),
            ]
        )

        scores = []
        for train_indices, test_indices in k_fold:
            # print('Train: %s | test: %s' % (train_indices, test_indices))
            train_text = text[train_indices]
            train_y = sent[train_indices]

            test_text = text[test_indices]
            test_y = sent[test_indices]

            pipeline.fit(train_text, train_y)
            score = pipeline.score(test_text, test_y)
            scores.append(score)

        score = sum(scores) / len(scores)
        print ("scores ", scores, " Score ", score)
        return score
Example #8
def runCrossValidationTest(classifier_name,
        classifier_args=None,
        ngram=2,
        folds=5):

  if classifier_args is None:
    classifier_args = {}
  classifier = valid_classifiers[classifier_name](**classifier_args)
  X, y = load_non_preprocessed_data()
  # confusion = numpy.array([[0,0,0],[0,0,0],[0,0,0]])
  ml_pipeline = Pipeline([
                      ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1,ngram))),
                      ('Classifier', classifier),
                      ])
  X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y, test_size = 0.25, random_state=0)
  ml_pipeline.fit(X_train, y_train)
  predictions = ml_pipeline.predict(X_test)
  confusion = confusion_matrix(y_test, predictions)
  f1 = f1_score(y_test, predictions, pos_label=None, average = 'micro')
  precision = precision_score(y_test, predictions, pos_label=None, average = 'micro')
  recall = recall_score(y_test, predictions, pos_label=None, average = 'micro')
  print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
  print("F1 score: " + str(f1))
  print("precision score: " + str(precision)) 
  print("recall score: " + str(recall)) 
  print(confusion)
  numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name+".csv", confusion, delimiter=",")
  return ((f1, precision, recall))
Example #9
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def use_kfold_cross_validation(X_train, X_test, y_train, y_test):
    pipe_lr = Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', LogisticRegression(random_state=1)),
    ])
    pipe_lr.fit(X_train, y_train)
    print("Test accuracy: %.3f\n" % pipe_lr.score(X_test, y_test))

    kfold = StratifiedKFold(y=y_train, n_folds=10, random_state=1)
    scores = []
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train[train])
        score = pipe_lr.score(X_train[test], y_train[test])
        scores.append(score)
        print(
            "Fold: %s, Class dist.: %s, Acc: %.3f" %
            (k+1, np.bincount(y_train[train]), score)
        )
    print(
        "\nCustom CV accuracy: %.3f +/- %.3f\n" %
        (np.mean(scores), np.std(scores)),
    )

    scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10)
    print("cross_val_score CV accuracy scores: %s" % scores)
    print(
        "cross_val_score CV accuracy: %.3f +/- %.3f" %
        (np.mean(scores), np.std(scores))
    )
Example #11
def test():
    target_label = [u'weather', u'audio',u'pic',u'calculate',u'music', u'poem']
    training_text_raw = []
    training_label = []
    with open ('./training_source.csv','r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            if len(line) > 1 and line[1] in target_label:
                training_text_raw.append(unicode(line[0],"utf-8"))
                training_label.append(line[1])
        print training_label

        training_text = []
    for text in training_text_raw:
        seg_text = seg(text)
        training_text.append(seg_text)
    text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),

                     ('clf', MultinomialNB()),
])

    scores = cross_validation.cross_val_score(text_clf, training_text, training_label, cv=8)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    text_clf.fit(training_text, training_label)

    while True:
        k_text = raw_input("\nPlease input:")
        if k_text == "exit":
            break
        print text_clf.predict([seg(unicode(k_text,'utf-8'))])
def main(datafile, threshold):
    filename = "out{}{}.hrc".format(os.sep, os.path.basename(datafile.name))

    if not os.path.isfile(filename):
        header = datafile.readline()
        collist = [i for i, toggle in enumerate(header.split(",")) if toggle != "0"]
        datafile.seek(0)
        data = pd.read_csv(datafile, usecols=collist).as_matrix()

        pipeline = Pipeline([("clf", Hierarchical())])
        pipeline.set_params(**{})
        pipeline.fit(data)

        clf = pipeline.get_params()["clf"]
        hierarchy = clf.hierarchy_

        with open(filename, "wb") as fh:
            fh.write(ET.tostring(hierarchy.to_xml()))
    else:
        with open(filename, "rb") as fh:
            hierarchy = Cluster.from_xml(ET.parse(fh).getroot())

    print(ET.tostring(hierarchy.to_xml()).decode("utf-8"))

    if threshold is not None:
        clusters = hierarchy.cut(threshold)
        print("\n".join(c.to_str(i) for i, c in enumerate(clusters)))
        dump_graph(clusters)
Example #13
def svcDictVector():
    recipeData = getRecipeData()
    
    labels = [recipe['cuisine'] for recipe in recipeData]
    ingredientsFixtures = [sorted(set(e['ingredients'])) for e in recipeData]
    for i, w in enumerate(ingredientsFixtures):
        ingredientsFixtures[i] = dict(zip(w, [1] * len(w)))        
                
    pipeline = Pipeline([
        ('dict', DictVectorizer()),
        ('variance', VarianceThreshold()),        
        ('tfidf', TfidfTransformer()),
        ('bayes', svm.LinearSVC()),
    ])    
    
    pipeline.fit(ingredientsFixtures, labels)
    print pipeline
    
    testRecipes = getTestData()    
    testIngredientsFixtures = [sorted(set(e['ingredients'])) for e in testRecipes]
    for i, w in enumerate(testIngredientsFixtures):
        testIngredientsFixtures[i] = dict(zip(w, [1] * len(w)))
        
    predictions = pipeline.predict(testIngredientsFixtures)    
    outputPercentCorrect(predictions)     
    copyAndOutput(predictions, testRecipes)
Example #14
class Classifier:
    def __init__(self, clf, scaler=None, selector=False):
        if scaler:
            if selector:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('selector', SelectFromModel(SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('classifier', clf)
                ])
        else:
            if selector:
                self.clf = Pipeline([
                    ('selector', SelectFromModel(SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = clf

    def __str__(self):
        if isinstance(self.clf, Pipeline):
            return ', '.join(type(v).__name__ for k, v in self.clf.steps)
        return type(self.clf).__name__

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
    def clasificador(self,X_train, y_train, X_test, target_names, y_test,all_labels):
        
        lb = preprocessing.MultiLabelBinarizer()
        Y = lb.fit_transform(y_train)
        
        classifier = Pipeline([
            ('vectorizer',CountVectorizer(strip_accents='unicode')),
            ('tfidf',TfidfTransformer()),
            ('to_dense', DenseTransformer()),
            ('clf',OneVsRestClassifier(GaussianNB()))])
            


     
        classifier.fit(X_train,Y)
        
        predicted = classifier.predict(X_test)


        etiquetas = lb.inverse_transform(predicted)

                
        for i in range(0,len(etiquetas)):
            etiquetas[i]=list(etiquetas[i])

        
        valoresMacro = self.macro(etiquetas,y_test)
        valoresMicro = self.micro(etiquetas, y_test)        
Example #16
def predict():

    pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("ExpLin", units=5), Layer("Softmax")], n_iter=25))])

    X = np.load('All_features.npz')['arr_0']

    D = np.load('Akunin_features.npz')['arr_0']

    all_samples = [1]*141 + [0]*123
    y = np.array(all_samples)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=0)

    pipeline.fit(X_train, y_train)
    pickle.dump(pipeline, open('NeuralNet_model.pkl', 'wb'))
    prediction = pipeline.predict(D)
    probs = pipeline.predict_proba(D)

    gradation = {1.01: 5, 0.9: 4, 0.8: 3, 0.7: 2, 0.6: 1}
    ress1 = []
    simple_predicts = []
    scale_predicts = []
    for i in prediction:
        simple_predicts.append(i[0])
    for i in probs:
        scale_predicts.append(i[1]*10)
        compare = []
        for u in gradation:
            if i[1] < u:
                compare.append(gradation[u])
        ress1.append(min(compare))

    return simple_predicts, scale_predicts
Example #17
class Model10(Model):
  def __init__(self):
    pass
  def fit(self, Xmask, y):
    pr = prepare.Prepare_0(model=10, preproc=1, min_df=1, use_svd=False, tfidf=2,
        stemmer=0)
    (X_all_df,_,BP,params) = pr.load_transform(update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names

    clf0 = GaussianNB()
    clf1 = MultinomialNB(alpha=0.8)
    clf2 = BernoulliNB(alpha=1, binarize=0.01)

    clf = clf1
    self.rd = Pipeline([
        ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
        #("scaler",StandardScaler(with_mean=False)), 
        ("est", clf)
        ])

    self.rd.fit(Xmask,np.asarray(y))
    return self
  def predict_proba(self, Xmask):
    return self.rd.predict_proba(Xmask)
  def predict(self, Xmask):
    return self.rd.predict(Xmask)
  
  def starter(self):
    print "Model10 starter"
    self.fit(np.arange(100),np.arange(100))
class ModelPipeline(object):

    def __init__(self, clf):

        self.columns =[]

        self.pipeline = Pipeline([
            ('clf', clf)
            ])



    def fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)
        self.columns = list(X_train.columns)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)


    def feat_importances(self, n=10, string=True):

        imp = self.pipeline.steps[0][1].feature_importances_
        top = np.argsort(imp)[-1:-(n+1):-1]
        if string:
            return ''.join('%s: %s%%\n' % (self.columns[feat], round(
                imp[feat] * 100, 3)) for feat in top)
        else:
            # self.columns is a plain list, so index it element-wise
            return [self.columns[feat] for feat in top], \
                sorted(imp)[-1:-(n+1):-1]

    def grid_search(self, X, y):

        parameters = {
            'clf__n_estimators': [100, 200, 300] ,
            'clf__max_features': ['sqrt', 50, 80],
            'clf__max_depth': [None, 50, 100],
            'clf__oob_score': [False, True],
            'clf__random_state':[29],
            'clf__class_weight':['balanced', None, 'balanced_subsample'],
            'clf__min_samples_split': [2, 10, 20]
        }


        grid_search = GridSearchCV(self.pipeline, parameters, n_jobs=-1, verbose=1, scoring = "recall")

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in self.pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return best_parameters
Example #19
def train_regressor(data, X_columns, y_show=y_init+y_curr):
    X = data.loc[:,X_columns]
    ys = data.loc[:, [i for i in y_show if i not in X_columns]]
    
    print()
    for n_trees in [256]:
    #list(range(4, 16)) + [18,20] + [2**n for n in range(4, 12)]:
    #[n for n in range(4, 64)]:#[2**n for n in range(1, 12)]:
        forest = Pipeline(steps=[
            ('forest', ExtraTreesRegressor(
                #RandomForestRegressor(
                n_estimators=n_trees, 
                n_jobs=min(n_trees, 62),
                oob_score=True, bootstrap=True))])
        start = time()
        forest.fit(X, ys)#new_ys)
        end = time()
        print(n_trees, forest.steps[0][1].oob_score_, end-start)
    
    print()
    print("%.5g seconds to train regressor" % (end-start))
    print()
    
    y_names = ys.columns
    X_names = X.columns
    return [forest, y_names, X_names]
Example #20
def train(docs):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=0.015, max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=100)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))
    pipeline.fit(docs)

    PIPELINE = pipeline

    print('Serializing pipeline to {0}'.format(PIPELINE_PATH))
    pipeline_file = open(PIPELINE_PATH, 'wb')
    pickle.dump(pipeline, pipeline_file)
    print('Training complete.')
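# A possible usage sketch (not from the original source): reload the pickled
# pipeline written by train() and project new documents into the reduced
# tf-idf space. PIPELINE_PATH is the same module-level path used above.
import pickle

def load_and_vectorize(docs):
    with open(PIPELINE_PATH, 'rb') as f:
        pipeline = pickle.load(f)
    # CountVectorizer -> TfidfTransformer -> TruncatedSVD -> Normalizer
    return pipeline.transform(docs)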
def classify(text, label):
    #~ Testing purpose: 10-fold cross validation
    cv = KFold(n = len(label), n_folds = 10)
    n_c = [100, 200, 500, 1000, 2000, 5000, 10000]

    for i in n_c:
        clf = Pipeline([
                ('vect',
                        TfidfVectorizer(
                                analyzer='word',
                                ngram_range=(1, 1),
                                stop_words = 'english',
                                lowercase=True,
                                token_pattern=r'\b\w+\b',
                                tokenizer=tokenize_doc,
                                min_df = 1)),
                ('dim_reduction',
                        TruncatedSVD(n_components=i)),
                #~ ('feature_selection',
                        #~ SelectKBest(
                                #~ chi2,
                                #~ k=35)),
                ('classification',
                        LogisticRegression())
                        #~ SVC(kernel = 'linear'))
        ])
    
        print "len(label) ", len(label), " | text ", len(text)
        print ""
    
        clf.fit(np.asarray(text), np.asarray(label))
    
        cv_score = cross_val_score(clf, text, label, cv = cv, verbose = 1)
        print "Log Reg | n_c = ", i
        print "Accuracy List ", cv_score, " | Avg Accuracy ", np.mean(cv_score)
Example #22
def main():
    X_all, y, lentrain = load_Boilerplate()

    X_all, tfv = transform_Tfidf(X_all, lentrain)

    X = X_all[:lentrain]

    clf = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=123)

    rd = Pipeline([
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        ("svd", TruncatedSVD(n_components=500, random_state=1)),
        ("est", clf)
        ])

    if True:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."

    print "training on full data"
    rd.fit(X,y)
    X_test = X_all[lentrain:]
    pred = rd.predict_proba(X_test)[:,1]
    testfile = pd.read_csv('../data/test.tsv', sep="\t", na_values=['?'], index_col=1)
    pred_df = pd.DataFrame(pred, index=testfile.index, columns=['label'])
    submname = 'submission_%s' % (datetime.datetime.today().strftime("%Y%m%d_%H%M%S"),)
    #print submname
    pred_df.to_csv('../data/%s.csv' % submname)
    print "%s file created.." % submname
Example #23
def useTFIDF():
    print "TFIDF"
    trainData = pd.read_csv("data/multinomialTrain.csv", header=0)
    # dat = trainData[["rating", 'numDet', 'innerPunctuation','avgWordLength',
    #                       'numPresVerb',  "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj",
    #                        "numPastVerb", "numConj", "exclamationPoints"]]
    dat = trainData


    knn = KNeighborsClassifier(n_neighbors=21, weights='distance')
    scaler = preprocessing.StandardScaler()
    tfidf = TfidfTransformer()
    tfidf_scaled_knn = Pipeline([('tfidf', tfidf), ('knn', knn)])

    kf = KFold(len(trainData), n_folds=3, shuffle=True)
    for train, test in kf:
        trainX, trainy = transform_sklearn_dictionary(transform_csv(dat.iloc[train], target_col="rating",
                                                                    ignore_cols=["01v234", "2v34", "words","words_nostopwords",
                                                                     "review", 'numDet', 'innerPunctuation','avgWordLength','numPresVerb',  "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj",
                                                                     "numPastVerb", "numConj", "exclamationPoints"]))
        testX, testy = transform_sklearn_dictionary(transform_csv(dat.iloc[test], target_col="rating",
                                                                  ignore_cols=["01v234", "2v34", "words","words_nostopwords",
                                                                     "review", 'numDet', 'innerPunctuation','avgWordLength','numPresVerb',  "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj",
                                                                     "numPastVerb", "numConj", "exclamationPoints"]))
        tfidf_scaled_knn.fit(trainX, trainy)
        print tfidf_scaled_knn.score(testX, testy)
class Classifier(BaseEstimator):

    def __init__(self, rf_max_depth=10, rf_n_estimators=50, n_estimators=50, n_jobs=1):
        self.rf_max_depth = rf_max_depth
        self.rf_n_estimators = rf_n_estimators
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs

    def fit(self, X, y):
        self.clf = Pipeline([
            ('rf', AdaBoostClassifier(
                base_estimator=RandomForestClassifier(
                    max_depth=self.rf_max_depth, n_estimators=self.rf_n_estimators,
                    n_jobs=self.n_jobs),
                n_estimators=self.n_estimators)
             )
        ])
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)
Example #25
def allFeatureClassify(cosine=False):
    print "AllFeatureClassifier"
    if cosine:
        print "Cosine"
    trainData = pd.read_csv("data/multinomialTrain.csv", header=0)
    # dat = trainData[["rating", 'numDet', 'innerPunctuation','avgWordLength',
    #                       'numPresVerb',  "numFirstPerson",'numPropNoun', "numOtherNoun", "numWords", "numAdj",
    #                        "numPastVerb", "numConj", "exclamationPoints"]]
    dat = trainData


    if cosine:
        knn = KNeighborsClassifier(n_neighbors=21, metric=pairwise.cosine_similarity)
    else:
        knn = KNeighborsClassifier(n_neighbors=21)
    scaler = preprocessing.StandardScaler()
    scaled_knn = Pipeline([('scaler', scaler), ('knn', knn)])

    kf = KFold(len(trainData), n_folds=3, shuffle=True)
    for train, test in kf:
        trainX, trainy = transform_sklearn_dictionary(transform_csv(dat.iloc[train], target_col="rating",
                                                                    ignore_cols=["01v234", "2v34", "words",
                                                                                 "words_nostopwords", "review"]))
        testX, testy = transform_sklearn_dictionary(transform_csv(dat.iloc[test], target_col="rating",
                                                                  ignore_cols=["01v234", "2v34", "words",
                                                                                 "words_nostopwords", "review"]))
        scaled_knn.fit(trainX, trainy)
        print scaled_knn.score(testX, testy)
Example #26
def run(training, validation, k, config=None):
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []

    class_index = Index()
    traindocs, train_X, train_y = zip(*load_data(training, class_index))
    testdocs, test_X, test_y = zip(*load_data(validation, class_index))

    n_iter = np.ceil(10**6 / len(traindocs))

    clf = SGDClassifier(alpha=.000001, loss='log', n_iter=50, penalty='elasticnet')
    #clf = MultinomialNB(alpha=0.000001)

    classifier = Pipeline([
                ('vectorizer', CountVectorizer(min_df=1, max_df=1.0, analyzer=lambda t: t)),
                ('tfidf', TfidfTransformer(norm='l2')),
                ('clf', OneVsRestClassifier(clf, n_jobs=-1))])

    classifier.fit(train_X, train_y)
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        refs = np.zeros(len(prediction))
        refs[list(test_y[j])] = 1
        preds = sorted(range(len(prediction)), key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
Example #27
 def cross_validation(self, X, Y, n_folds=10):
     """ n-fold cross validation to get the best classifier. """
     kf = KFold(len(X), n_folds=n_folds)
     best_accuracy = -1
     training_accuracy = 0
     for train, cv in kf:
         classifier = Pipeline([('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('svm', LinearSVC(C=1))])
         # forms the training and test set
         X_train = []
         X_train.extend(X[0:cv[0]])
         X_train.extend(X[cv[-1]:])
         Y_train = []
         Y_train.extend(Y[0:cv[0]])
         Y_train.extend(Y[cv[-1]:])
         X_cv = X[cv[0]:cv[-1]+1]
         Y_cv = Y[cv[0]:cv[-1]+1]
         classifier.fit(X_train, Y_train)
         accuracy = self.__accuracy(classifier, X_cv, Y_cv)
         if accuracy > best_accuracy:
             best_classifier = classifier
             best_accuracy = accuracy
             training_accuracy = self.__accuracy(
                 classifier, X_train, Y_train)
     return best_classifier, training_accuracy, best_accuracy
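# A self-contained sketch (toy data assumed, not the original code) of the same
# best-of-folds selection using the index arrays yielded by the modern
# sklearn.model_selection.KFold, which also works when the folds are shuffled.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

X_demo = np.array(["good movie", "bad film", "great plot",
                   "awful acting", "fine movie", "terrible film"])
Y_demo = np.array([1, 0, 1, 0, 1, 0])

best_accuracy, best_classifier = -1, None
for train_idx, cv_idx in KFold(n_splits=3).split(X_demo):
    classifier = Pipeline([('vect', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('svm', LinearSVC(C=1))])
    classifier.fit(X_demo[train_idx], Y_demo[train_idx])
    accuracy = classifier.score(X_demo[cv_idx], Y_demo[cv_idx])
    if accuracy > best_accuracy:
        best_classifier, best_accuracy = classifier, accuracy
print(best_accuracy)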
Example #28
def main():
    data = import_files(filenames)
    sentences = defaultdict(lambda: [])
    # invert the dictionary
    for cat in data:
        if cat == 'yn':
            continue
        for sentence in data[cat]:
            sentences[sentence].append(cat)

    X_list = []
    y_data = []
    for s in sentences:
        X_list.append(s)
        y_data.append(sentences[s])
    X_data = np.array(X_list)

    # X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.01, random_state=802701)\
    X_train = X_data
    y_train = y_data

    classifier = Pipeline([
                    ('vectorizer', TfidfVectorizer()),
                    ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(X_train, y_train)

    save_classifier(classifier, outfile)
Example #29
def pipeline_test(params, data_path, dataset):
    data_train = os.path.expanduser(os.path.join(data_path, dataset, 'train.arff'))
    X_train, y_train = load_arff_data(data_train)

    data_test = os.path.expanduser(os.path.join(data_path, dataset, 'test.arff'))
    X_test, y_test = load_arff_data(data_test)

    dpr = get_data_preprocessor_rescaling(params)
    params = get_data_preprocessor_balancing(params, y_train)
    fp = get_feature_preprocessor(params)
    clf = get_classifier(params)

    steps = []
    if dpr is not None:
        steps.append(('data_preprocessor_rescaling', dpr))
    if fp is not None:
        steps.append(('feature_preprocessor', fp))
    steps.append(('classifier', clf))

    ppl = Pipeline(steps)
    ppl.fit(X_train, y_train)
    y_pred = ppl.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    result = 100.0 - 100.0 * score

    return result
Example #30
 def test_sklearn_pipeline(self):
     t = vw.VWClassifier(target="target")
     f1 = {"target":0,"b":1.0,"c":0}
     f2 = {"target":1,"b":0,"c":2.0}
     fs = []
     for i in range (1,50):
         fs.append(f1)
         fs.append(f2)
     print "features=>",fs
     df = pd.DataFrame.from_dict(fs)
     estimators = [("vw",t)]
     p = Pipeline(estimators)
     print "fitting"
     p.fit(df)
     print "get preds 1 "
     preds = p.predict_proba(df)
     print preds
     print "-------------------"
     t.close()
     joblib.dump(p,"/tmp/pipeline/p")
     p2 = joblib.load("/tmp/pipeline/p")
     print "get preds 2"
     df3 = p2.predict_proba(df)
     print df3
     vw2 = p2._final_estimator
     vw2.close()
                      random_state=1,
                      max_iter=5,
                      tol=None),
        'clf_params': {
            'clf__alpha': (0.001, 1.0),
        }
    }
}

for model in models.keys():
    print('\nRunning the model - {}'.format(model))
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', models[model]['clf'])])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)

    accuracy = np.mean(predicted == y_test)
    print('\nFirst run - Accuracy of {} - {}'.format(model, accuracy * 100))

    print('\nTuning training parameters')
    # 4. Auto-tuning the training parameters using Grid Search for both feature extraction and classifier
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__stop_words': ['english', None],
        'vect__max_df': (0.5, 1.0),
        'vect__min_df': (1, 2),
        'tfidf__use_idf': (True, False),
        'tfidf__smooth_idf': (True, False),
        'tfidf__sublinear_tf': (True, False),
Example #32
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):

        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {
            'encoding_method': bucket_encoding,
            'case_id_col': dataset_manager.case_id_col,
            'cat_cols': [dataset_manager.activity_col],
            'num_cols': [],
            'random_state': random_state
        }
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        print(bucketer)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            scores = defaultdict(int)
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(
                dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            if len(relevant_train_cases_bucket) == 0:
                preds = [class_ratios[cv_iter]
                         ] * len(relevant_test_cases_bucket)
            else:
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                    dt_train_prefixes,
                    relevant_train_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)

                if len(set(train_y)) < 2:
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([
                        (method,
                         EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                        for method in methods
                    ])

                    if cls_method == "rf":
                        cls = RandomForestClassifier(
                            n_estimators=500,
                            max_features=args['max_features'],
                            random_state=random_state)

                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(
                            objective='binary:logistic',
                            n_estimators=500,
                            learning_rate=args['learning_rate'],
                            subsample=args['subsample'],
                            max_depth=int(args['max_depth']),
                            colsample_bytree=args['colsample_bytree'],
                            min_child_weight=int(args['min_child_weight']),
                            seed=random_state)

                    elif cls_method == "logit":
                        cls = LogisticRegression(C=2**args['C'],
                                                 random_state=random_state)

                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'],
                                  gamma=2**args['gamma'],
                                  random_state=random_state)

                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('scaler', StandardScaler()),
                                             ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])

                    pipeline.fit(dt_train_bucket, train_y)
                    #pipeline.fit(dt_train_bucket, train_y, cls__early_stopping_rounds=15, cls__eval_set=[[pipeline.named_steps["encoder"].transform(dt_test_bucket), test_y]])

                    if cls_method == "svm":
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(
                            dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)

    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write(
                    "%s;%s;%s;%s;%s;%s;%s;%s\n" %
                    (trial_nr, dataset_name, cls_method, method_name, bucket,
                     k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, 0,
                        "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                           (trial_nr, dataset_name, cls_method, method_name, k,
                            v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name,
                        "processing_time", time.time() - start, 0))
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
Example #33
clfnames = [
    "Multinomial Naive Bayes", "Linear SVM", "Logistic Regression",
    "Stochastic Gradient Descent", "Random Forest", "Bagging Random Forest",
    "Gradient Boosting", "Ada Boost"
]

#building a pipeline

for vectname, vect in zip(vectnames, vects):
    for clfname, clf in zip(clfnames, clfs):
        pipe = Pipeline([
            ('vect', vect),
            ('clf', clf),
        ])

        pipe.fit(x_train, y_train)
        pred = pipe.predict(x_test)
        train_acc = metrics.accuracy_score(y_train, pipe.predict(x_train))
        test_acc = metrics.accuracy_score(y_test, pred)
        print("{} + {} - train acc: {} test acc: {} ".format(
            vectname, clfname, train_acc, test_acc))
"""Best result = Tfidf Vect + Linear SVM - train acc: 0.9880763116057234 test acc: 0.7857142857142857"""

tfidf = TfidfVectorizer()
linear_svm = LinearSVC()

tfidf.fit(x_train)
x_train_dtm = tfidf.transform(x_train)
x_test_dtm = tfidf.transform(x_test)

linear_svm.fit(x_train_dtm, y_train)
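# A plausible continuation (not in the original source): score the refit
# Tfidf + Linear SVM on the same held-out split used in the pipeline loop above.
svm_pred = linear_svm.predict(x_test_dtm)
print("Tfidf + Linear SVM test acc:", metrics.accuracy_score(y_test, svm_pred))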
            feature_combiner = FeatureUnion(
                [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])


            cls = xgb.XGBClassifier(objective='binary:logistic',
                                    n_estimators=500,
                                    learning_rate=current_args['learning_rate'],
                                    subsample=current_args['subsample'],
                                    max_depth=int(current_args['max_depth']),
                                    colsample_bytree=current_args['colsample_bytree'],
                                    min_child_weight=int(current_args['min_child_weight']),
                                    seed=random_state)

            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])

            pipeline.fit(dt_train_bucket, train_y)


            # predict separately for each prefix case
            preds = []
            test_all_grouped = dt_test_bucket.groupby(dataset_manager.case_id_col)
            for _, group in test_all_grouped:

                test_y_all.extend(dataset_manager.get_label_numeric(group))


                _ = bucketer.predict(group)

                preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                pred = pipeline.predict_proba(group)[:, preds_pos_label_idx]
            ),
            (
                "feature_selection",
                Pipeline(
                    [
                        ("mutual_info_selector", mutual_info_selector),
                        ("recurse_importance_selector", recurse_importance_selector),
                    ]
                ),
            ),
            ("classifier", classifier),
        ]
    )

    X_train = train.text
    y_train = train.label

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_train)

    print("Train acc:", accuracy_score(y_train, y_pred))
    X_test = test.text
    y_test = test.label

    pred_test = pipeline.predict(X_test)

    print("Test acc:", accuracy_score(y_test, pred_test))

    dump(pipeline, "data/classification_pipeline.joblib")
X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))




# Creating our tokenizer function
def spacy_tokenizer(sentence):
Example #37
f1_macro:0.6802467902467904

f1_micro:0.6637426900584795
f1_macro:0.45717351289792896

f1_micro:0.6502923976608187
f1_macro:0.510114468864469

f1_micro:0.6982456140350877
f1_macro:0.5879108391608392
'''

########## Evaluate the Best Classifier (Ensemble) ##########

#first just get test prediction scores
ens_pipe.fit(X_train, y_train)
ens_preds = ens_pipe.predict(X_test)
f1_score(ens_preds, y_test, average='micro')  #.7917
f1_score(ens_preds, y_test, average='macro')  #.7363
accuracy_score(ens_preds, y_test)  #.7917

#next get train size vs test prediction f1 scores
len_list = []
ens_accs = []

# for each subset of train set
for i in range(1, 11):
    # initialize the classifier
    svc_sub = svm.SVC(gamma='scale', C=1, kernel="rbf")
    gnb_sub = GaussianNB()
    rf_sub = RandomForestClassifier(n_estimators=10)
Example #38
## feature scaling + dimension reduction: pca
#print "After feature scaling and PCA:"
#pipe(estimatorsNb_scaled_pca)

print "-----------------------------------------------------"

# =============================================================================
# scaling + kbest for params tuning
# =============================================================================

# tune parameters

# add scaling + kbest into the pipeline
pipe_Nb = Pipeline(estimatorsNb_scaled_kbest)
# fit the pipeline
pipe_Nb.fit(features_train, labels_train)

# tuning params
param_grid_Nb = dict(feature_selection__k=[3, 4, 5, 6, 7, 8, 9, 10])

# use StratifiedKFold to make the evaluation more robust
### this is a small dataset, and the ratio of poi to non-poi is highly unbalanced
grid_search_Nb = GridSearchCV(pipe_Nb,
                              param_grid=param_grid_Nb,
                              cv=StratifiedKFold(10))
grid_search_Nb.fit(features_train, labels_train)

# get the best Nb clf
best_Nb = grid_search_Nb.best_estimator_

# selected features:
Example #39
    header=None)

X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values

le = LabelEncoder()
le.fit(np.unique(y))
y = le.transform(y)

# split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=1)

# build the pipeline
pipe_svm = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])

scores = cross_val_score(estimator=pipe_svm,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=-1)

print('CV accuracy scores: {}'.format(scores))
print('CV accuracy: {:0.3f} +/- {:0.3f}'.format(np.mean(scores),
                                                np.std(scores)))

pipe_svm.fit(X_train, y_train)
print('Test accuracy: {:0.3f}'.format(pipe_svm.score(X_test, y_test)))
Example #40
def main():
    (X_all, y, lentrain) = Prepare_1().fit(update=True)

    X = X_all[:lentrain]

    clf1 = lm.LogisticRegression(penalty='l2',
                                 dual=True,
                                 tol=0.0001,
                                 C=1,
                                 fit_intercept=True,
                                 intercept_scaling=1.0,
                                 class_weight=None,
                                 random_state=123)
    clf2 = RandomForestClassifier(n_estimators=200,
                                  max_depth=24,
                                  n_jobs=-1,
                                  random_state=1,
                                  verbose=0)

    clf3 = GradientBoostingClassifier(n_estimators=42,
                                      max_depth=24,
                                      random_state=1,
                                      verbose=2,
                                      subsample=0.9)

    clf4 = svm.SVC(probability=True)

    clf5 = KNeighborsClassifier(n_neighbors=5)

    clf6 = SGDClassifier(alpha=0.0001,
                         class_weight=None,
                         epsilon=0.1,
                         eta0=0.0,
                         fit_intercept=True,
                         l1_ratio=0.15,
                         learning_rate='optimal',
                         loss='hinge',
                         n_iter=50,
                         n_jobs=1,
                         penalty='elasticnet',
                         power_t=0.5,
                         random_state=None,
                         rho=None,
                         shuffle=False,
                         verbose=0,
                         warm_start=False)

    clf = clf1
    """
    selector = RFECVp(clf2,clf2, step=50, cv=4, scoring="roc_auc", verbose=2)
    selector = selector.fit(X, y)
    clf = selector
    """

    rd = Pipeline([
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=1, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=1 )),
        #("lasso",svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf)
    ])

    if True:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."

    print "training on full data"
    rd.fit(X, y)
    X_test = X_all[lentrain:]
    pred = rd.predict_proba(X_test)[:, 1]
    testfile = pd.read_csv('../data/test.tsv',
                           sep="\t",
                           na_values=['?'],
                           index_col=1)
    pred_df = pd.DataFrame(pred, index=testfile.index, columns=['label'])
    submname = 'submission_%s' % (
        datetime.datetime.today().strftime("%Y%m%d_%H%M%S"), )
    #print submname
    pred_df.to_csv('../data/%s.csv' % submname)
    print "%s file created.." % submname
Example #41
def text_data_pipeline(text_data):
    X, y = text_data
    pipeline_prefit = Pipeline([('vectorizer', DictVectorizer()),
                                ('clf', RandomForestClassifier())])
    return pipeline_prefit.fit(X, y)
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC

iris = datasets.load_iris()

X = iris["data"][:, (2, 3)]  # petal length, petal width
y = (iris["target"] == 2).astype(np.float64)  # If is Iris-Virginica == 1
# else == 0

#%%     SVM Classification

svm_clf = Pipeline([("scaler", StandardScaler()),
                    ("linear_svc", LinearSVC(C=1, loss="hinge"))])

svm_clf.fit(X, y)

print(svm_clf.predict([[5.5, 1.7]]))

#%%     Nonlinear SVM Classification

from sklearn.preprocessing import PolynomialFeatures

polynomial_svm_clf = Pipeline([("poly_features", PolynomialFeatures(degree=3)),
                               ("scaler", StandardScaler()),
                               ("svm_clf", LinearSVC(C=10, loss="hinge"))])

polynomial_svm_clf.fit(X, y)

print(polynomial_svm_clf.predict([[5.5, 1.7]]))
 def model_selection(self):
     """
     Hyperparameter tuning is performed with GridSearchCV.
     The technique uses cross-validation (the default 5-fold split) to divide
     the training data into training and validation sets.
     The model score is reported with the R-squared metric.
     """
     models = []
     models_1 = ["Ridge","Lasso","LinearRegression","PoissonRegressor"]
     models_2 = ["RandomForestRegressor","GradientBoostingRegressor"]
     model_3 = ["SVR"]
     models += models_1 + models_2 + model_3
     models_dictionary = {"Ridge":Ridge(),"Lasso":Lasso(),"LinearRegression":LinearRegression(fit_intercept=True),
                          "RandomForestRegressor":RandomForestRegressor(random_state=0),"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
                         "SVR":SVR(epsilon=0.5),"PoissonRegressor":PoissonRegressor(max_iter=200)}
     models_score = {}
     
     
     # Tune the parameters of each regressor by cross-validation
     # (the number of cross-validation folds is 5)
     
     for model in models:
         if model in models_1:
             
             pipe = Pipeline([
             ('scaler', StandardScaler()),
             ('reduce_dim', PCA()),
             ('regressor', models_dictionary[model])
             ])
             pipe = pipe.fit(self.X_train, self.y_train)
             n_features_to_test = np.arange(1, 13)
             alpha_to_test = 2.0**np.arange(-6, +6)
         
             if model == "LinearRegression":
                 params = {'reduce_dim__n_components': n_features_to_test,
                 'scaler' : [StandardScaler(), RobustScaler()]}
             else:
                 params = {'reduce_dim__n_components': n_features_to_test,
                 'regressor__alpha': alpha_to_test,
                 'scaler' : [StandardScaler(), RobustScaler()]}
             gridsearch = GridSearchCV(pipe, params, verbose=1).fit(self.X_train, self.y_train)
             
         elif model in models_2:
             
             if model == "RandomForestRegressor":
               
                 
                 model_estimator =models_dictionary[model]
                 params={'n_estimators':[20,30,40,60,80,100], 'max_depth': 
                 [5,10,15,20],'max_features':[2,5,8]}
                 
                  
             else:
                 model_estimator =  models_dictionary[model]
                 
                 params = {'learning_rate': [0.01,0.02,0.03,0.04],
                 'subsample'    : [0.9, 0.5, 0.2, 0.1],
                 'n_estimators' : [20,30,40,60,80,100],
                 'max_depth'    : [4,6,8,10]
                  }
             
             gridsearch = GridSearchCV(estimator = model_estimator,param_grid = params,n_jobs=-1).fit(self.X_train, self.y_train)
         else:
             parameters = {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 2.5, 5,7.5,10,15]}
             gridsearch = GridSearchCV(models_dictionary[model], parameters).fit(self.X_train, self.y_train)
          
         print(" Results from Grid Search:",model)
         print("\n The best estimator across ALL searched params:\n",gridsearch.best_estimator_)
         print("\n The best score across ALL searched params:\n",gridsearch.best_score_)
         print("\n The best parameters across ALL searched params:\n",gridsearch.best_params_)
         print('\n Final score is: ', gridsearch.score(self.X_test, self.y_test))
         print("")
         models_score[model] = gridsearch.score(self.X_test, self.y_test)
     self.models_score = models_score
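# A self-contained sketch (synthetic data, invented shapes) of the models_1
# branch above: scale -> PCA -> Ridge, tuned over n_components and alpha with
# GridSearchCV's default 5-fold CV and scored with R-squared.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 13)
y_demo = 3.0 * X_demo[:, 0] - 2.0 * X_demo[:, 1] + 0.1 * rng.randn(100)

pipe = Pipeline([('scaler', StandardScaler()),
                 ('reduce_dim', PCA()),
                 ('regressor', Ridge())])
params = {'reduce_dim__n_components': np.arange(1, 13),
          'regressor__alpha': 2.0 ** np.arange(-6, 6)}
search = GridSearchCV(pipe, params).fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)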
Example #44
count_vector = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
tfidf_vector = TfidfVectorizer(tokenizer=tokenizer)

# In[6]:

X = df.iloc[:, 0]
y = df.iloc[:, 1]
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15)

# In[7]:

mnb = MultinomialNB()
pipmodel = Pipeline([('cleaner', customCleaner()), ('counter', count_vector),
                     ('model', mnb)])
pipmodel.fit(xtrain, ytrain)

# In[8]:

ypred = pipmodel.predict(xtest)
cm = confusion_matrix(ytest, ypred)
print(cm)
cr = classification_report(ytest, ypred)
print(cr)
accuracy = accuracy_score(ytest, ypred)
print(accuracy)
cv = RepeatedKFold(n_splits=5, n_repeats=2)
cv_score = cross_val_score(pipmodel, X, y, cv=cv)
print(cv_score.mean())

# In[ ]:
Example #45
plt.figure(figsize=(20,20))
for index, (image, label) in enumerate(zip(train_images[0:100], clusterAssignement[0:100])):
    plt.subplot(5, 20, index + 1)
    plt.axis("off")
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title(label, fontsize = 20)
plt.show()  # show the whole grid once, after all subplots are drawn



## comparison to the sklearn algorithm

pca = PCA(n_components=10)
kmeans = KMeans(n_clusters=10,n_init=1)
predictor = Pipeline([('pca', pca), ('kmeans', kmeans)])
predict = predictor.fit(test_images).predict(test_images)

acc = 0
for i in range(len(predict)):
    acc += predict[i] == test_labels[i]
print("accuracy = ", acc/len(predict))


plt.figure(figsize=(20,20))
for index, (image, label) in enumerate(zip(train_images[0:100], predict[0:100])):
    plt.subplot(5, 20, index + 1)
    plt.axis("off")
    plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray)
    plt.title(label, fontsize = 20)
# Show the full grid once all subplots have been drawn
plt.show()
Example #46
# Random forest classifier
classifier = RandomForestClassifier(n_estimators=50, max_depth=4)

# Build the machine learning pipeline
pipeline_classifier = Pipeline([('selector', selector_k_best),
                                ('rf', classifier)])

# We can set the parameters using the names we assigned
# earlier. For example, if we want to set 'k' to 6 in the
# feature selector and set 'n_estimators' in the Random
# Forest Classifier to 25, we can do it as shown below
pipeline_classifier.set_params(selector__k=6, rf__n_estimators=25)

# Training the classifier
pipeline_classifier.fit(X, y)

# Predict the output
prediction = pipeline_classifier.predict(X)
print("Predictions:", prediction)

# Print score
print("Score:", pipeline_classifier.score(X, y))

# Print the selected features chosen by the selector
features_status = pipeline_classifier.named_steps['selector'].get_support()
selected_features = []
for count, item in enumerate(features_status):
    if item:
        selected_features.append(count)
Example #47
def PCA_with_SVM(X_train, Y_train):
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    pipe.fit(X_train, Y_train)
    return pipe
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

import csv

if __name__ == "__main__":
    X, y, s = load_data('ARTIFICIAL_V3', stats=True)
    print(s)
    qubits_class = [y[i] * 100 + len(X[i]) for i in range(len(X))]
    indices = list(
        StratifiedKFold(n_splits=5, shuffle=True,
                        random_state=RANDOM_SEED).split(X, qubits_class))
    pipeline = Pipeline([("Histogramizer",
                          Histogramizer(bins=11,
                                        range=(s['first_arrival'],
                                               s['last_arrival']))),
                         ("Neural network",
                          MLPClassifier(hidden_layer_sizes=(33, 33),
                                        activation='relu',
                                        solver='adam',
                                        max_iter=50,
                                        tol=0.001,
                                        verbose=True))])
    for i in indices[:1]:
        pipeline.fit(X[i[0]], y[i[0]])
        y_pred = pipeline.predict(X[i[1]])

        print(classification_report(y[i[1]], y_pred, digits=8))
        print(confusion_matrix(y[i[1]], y_pred))
        dp = filter_datapoints(X[i[1]], y[i[1]], y_pred, indices=i[1])
Example #49
class lr_model:
    'Modeling class; defaults to LinearRegression'

    def __init__(self, df, target, pipe_steps):
        self.df = df
        self.target = self.df[target]
        self.features = self.df.drop(target, axis=1)

        self.num_features = self.features.select_dtypes(include='number')
        self.nom_features = self.features.select_dtypes(exclude='number')

        self.pipe = Pipeline(pipe_steps)

        self.summary = pd.DataFrame({
            'random_state': [],
            'val_score': [],
            'train_score': [],
            'test_score': []
        })

    def test_models(self, run_time=3):
        'Run the model run_time times, each with a different random state'
        for i in np.random.choice(100, run_time, replace=False):
            seed = i
            X = self.features
            y = self.target

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, random_state=seed)
            self.pipe.fit(X_train, y_train)

            val_score = round(
                cross_val_score(self.pipe, X_train, y_train, cv=5).mean(), 2)
            test_score = round(self.pipe.score(X_test, y_test), 2)
            train_score = round(self.pipe.score(X_train, y_train), 2)

            # DataFrame.append was removed in pandas 2.x; pd.concat is the
            # forward-compatible equivalent.
            self.summary = pd.concat(
                [self.summary,
                 pd.DataFrame([{
                     'random_state': i,
                     'val_score': val_score,
                     'train_score': train_score,
                     'test_score': test_score
                 }])],
                ignore_index=True)

        return self.summary

    def final_model(self):
        pass

    def predictions(self):
        pass

    def coef_score(self):

        X = self.features
        y = self.target
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=8)
        # The class stores its estimator as self.pipe (self.model is never
        # defined); assume the final pipeline step is the fitted linear model.
        self.pipe.fit(X_train, y_train)

        val_score = cross_val_score(self.pipe, X_train, y_train, cv=5).mean()
        test_score = self.pipe.score(X_test, y_test)
        train_score = self.pipe.score(X_train, y_train)

        summary = pd.DataFrame({
            "coefficients":
            np.transpose([round(coef, 2) for coef in self.pipe[-1].coef_]),
            'avg_feature_value':
            self.features.mean(),
            'avg_feature__median':
            self.features.median()
        })

        summary['avg_change'] = summary['avg_feature_value'] * summary[
            'coefficients']
        summary['count'] = self.features[self.features > 0].count()

        return summary.sort_values('avg_change', ascending=False)
Example #50
best_model_mlp = grid_mlp.best_estimator_
best_model_mlp
"""Modelo selecionado tem os parametros definidos como 

**learning_rate='adaptive'** - Mantém a taxa de aprendizagem constante referente ao valor de learning rate inicial enquanto a perda de treinamento continua diminuindo. 
Cada vez que duas epochs* consecutivas falham em diminuir a perda de treinamento ou falham em aumentar a pontuação de validação, a taxa de aprendizado atual é dividida por 5.

**activation='tanh'** - a função tan hiperbólica, retorna f (x) = tanh (x).

**solver='sgd'** - 'Sgd' refere-se à descida gradiente estocástica

*epochs:  é um hiperparâmetro que define o número de vezes que o algoritmo de aprendizado funcionará em todo o conjunto de dados de treinamento.

## Validacao Modelo
"""

from sklearn.metrics import accuracy_score, classification_report, recall_score

pipe = Pipeline(steps=[('preprocessor',
                        preprocessor), ('classifier', best_model_mlp)])

pipe.fit(X_train, y_train.ravel())

pred_test = pipe.predict(X_test)
accuracy_result = accuracy_score(y_test, pred_test)

print(classification_report(y_test, pred_test))
"""Algoritmo obteve uma performance boa, com acuracia de 76 % em teste. Porem é preciso avaliar que neste caso a recomendação de pessoas boas de credito como ruins deve acontecer em uma escala menor.

A metrica recall indicou um valor de 89 % de precisão na predição de pessoas boas de credito que realmente era boas de creditc, o que indica uma boa performance graças ao custo alto associado aos falso negativos.
"""
Example #51
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
#pca = PCA(n_components =5)
pcapoly = PCA(n_components=100)

#selection = SelectKBest(k =10)
feaPipeline = Pipeline([
            ("MinMaxScaler",min_max_scaler),\
            ("pcapoly",pcapoly)
            ])
#feaPipeline = Pipeline([
#            ("MinMaxScaler",min_max_scaler),\
#            ("pcapoly",pcapoly),\
#            ("poly",poly)
#            ])

feaPipeline.fit(X_train, Y_train)

X_train = np.concatenate((X_train, feaPipeline.transform(X_train)), axis=1)
X_valid = np.concatenate((X_valid, feaPipeline.transform(X_valid)), axis=1)

#==============================================================================
# tranning and prediction
#==============================================================================
C0 = 10000
verbose0 = 2

sv = [svm.SVR(C=C0, verbose=verbose0) for _ in range(5)]
Example #52
scaler = StandardScaler()
svm_clf1 = LinearSVC(C=1, loss="hinge", random_state=42)
svm_clf2 = LinearSVC(C=100, loss="hinge", random_state=42)

scaled_svm_clf1 = Pipeline([
    ("scaler", scaler),
    ("linear_svc", svm_clf1),
])

scaled_svm_clf2 = Pipeline([
    ("scaler", scaler),
    ("linear_svc", svm_clf2),
])

scaled_svm_clf1.fit(x, y)
scaled_svm_clf2.fit(x, y)

# In[14]:

b1 = svm_clf1.decision_function([-scaler.mean_ / scaler.scale_])
b2 = svm_clf2.decision_function([-scaler.mean_ / scaler.scale_])

w1 = svm_clf1.coef_[0] / scaler.scale_
w2 = svm_clf2.coef_[0] / scaler.scale_

svm_clf1.intercept_ = np.array([b1])
svm_clf2.intercept_ = np.array([b2])

svm_clf1.coef_ = np.array([w1])
svm_clf2.coef_ = np.array([w2])
mlb = MultiLabelBinarizer()

X = corpus
Y = mlb.fit_transform(tag_corp)
random_state = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.1,
                                                    random_state=random_state)

pipe1 = Pipeline([("wordVectz", AverageEmbeddingVectorizer(w2v)),
                  ("multilabel", OneVsRestClassifier(LinearSVC()))])

pipe2 = Pipeline([("wordVectz", TfidfEmbeddingVectorizer(w2v)),
                  ("multilabel", OneVsRestClassifier(LinearSVC()))])
pipe1.fit(X_train, y_train)
predicted = pipe1.predict(X_test)
all_labels = mlb.inverse_transform(predicted)
print(all_labels)

accuracy1 = accuracy_score(y_test, predicted)
print("accuracy=", accuracy1)
#
print("Evaluation- BOW, SVC")
precision1 = precision_score(y_test, predicted, average='macro')
print(precision1)

ham_loss = hamming_loss(y_test, predicted)
print("hamming loss=", ham_loss)
recall1 = recall_score(y_test, predicted, average='macro')
print("recall=", recall1)
Example #54
    ]

# create a list of the values we want to assign for each condition
values = ['Positive', 'Neutral', 'Negative']

# create a new column and use np.select to assign values to it using our lists as arguments
df['tier'] = np.select(conditions, values)

x = df.iloc[:,2].values
y = df.iloc[:,-1].values

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',SVC())]) 
text_model.fit(x,y)
select = st.text_input('Enter your message')

if(st.markdown(
    '<span class="badge badge-pill badge-success"> Badge </span>',
    unsafe_allow_html=True
)):
  op = text_model.predict([select])
  ans=op[0]

  if ans == 'Positive':
    st.success("Positive 🙂")
  if ans == 'Negative':
    st.error("Negative 😠")
  if ans== 'Neutral':
    st.warning("Neutral 😐")
Example #55
                                 handle_unknown='ignore')),
                  ('model',
                   XGBClassifier(n_estimators=800,
                                 learning_rate=0.07,
                                 reg_alpha=8,
                                 reg_lambda=0.75,
                                 gamma=3,
                                 max_depth=4))])

logger.info("Predicting score (w/Cross-Val) on X...")
results = cross_val_predict(model,
                            X,
                            y,
                            cv=cfg["folds"],
                            method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))

logger.info("Fitting model on upscaled X...")
model.fit(X_up, y_up)

logger.info("Loading and predicting on Test set...")
test = load_file("test")
test.drop(drop_cols, axis=1, inplace=True)
#test = make_missing_zero(test, get_cat_features_idx(test))
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-imp-ohe-ups2')

logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))
def createRandomForest(X, y):
    # Bag-of-words counts + tf-idf weighting feeding a random forest classifier
    rf_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('RFC', RandomForestClassifier())])
    rf_clf = rf_clf.fit(X, y)
    return rf_clf
Example #57
y = np.array(y)[perm]

nslice = 600

X_train = X[0:nslice]
X_test = X[nslice:]
y_train = y[0:nslice]
y_test = y[nslice:]


rbm1 = BernoulliRBM(random_state=seed, verbose=True, n_iter=200, n_components=128)
# Note: the pipeline fit below refits rbm1 on X_train, so this preliminary
# fit on the test/unknown samples is effectively overwritten.
rbm1.fit(X_test.tolist() + unknowns)

final = MLPClassifier(
    solver='sgd',
    hidden_layer_sizes=(64, 2),
    random_state=seed,
    max_iter=200,
    learning_rate="adaptive"
)
clf = Pipeline(steps=[('rbm1', rbm1), ('final', final)])
model = clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

test_ids = flatten( pd.read_csv('data/test_ids.csv', sep=',',header=None).values.tolist() )
unknown_pred = model.predict(unknowns).tolist()
predictions = pd.DataFrame([test_ids, unknown_pred]).values.T.tolist()
# comments='' keeps numpy from prefixing the header row with '# '
np.savetxt('data/predictions.csv', predictions, fmt='%d', delimiter=',', header='PassengerId,Survived', comments='')
from sklearn.model_selection import train_test_split

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                     ('clf', LinearSVC()),
])

# Next we'll run Linear SVC
text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
Example #59
def run(dataset, features, word2vec, metrics, fname=None):
    if dataset == 'fdcl18':
        df1 = load_fdcl18(num_classes=2)
        df2 = load_dwmw17(num_classes=2)
        df2 = df2.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'], axis=1)
    else:
        df1 = load_dwmw17(num_classes=2)
        df2 = load_fdcl18(num_classes=2)
        df1 = df1.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'], axis=1)
    # # Preprocessing
    preprocess = TweetPreprocessor(normalize=['link', 'mention']).preprocess
    tokenize = TweetTokenizer().tokenize
    # # # DF 1 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df1['clean_tweet'] = df1.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df1['tokens'] = df1.clean_tweet.progress_apply(tokenize)
    # # # DF 2 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df2['clean_tweet'] = df2.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df2['tokens'] = df2.clean_tweet.progress_apply(tokenize)
    # #
    # # Feature Extraction
    # # # tfidf_pipeline
    ff = []
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025,
                            max_df=0.25,
                            ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs),
                'clean_tweet')]
    # # # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline',
                Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # # # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word2vec), 'tokens')]
    # # # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]
    # # # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df1.tokens
        hyper_params['word_vectors'] = word2vec
        # """ # Cross-validate and save predictions
        args = [
            NeuralNetClassifier, hyper_params,
            ['conv_%i' % i for i in range(3)], False
        ]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # # estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])
    # # Grid Search
    # param_grid = [
    #     {'clf__C': [0.1, 1, 10, 50], 'classifier': linear_svc},
    #     # {'classifier': sgd_classifier},
    # ]
    # gs = GridSearchCV(pipeline, param_grid, cv=5)
    # result = gs.fit(df, df.label).predict(df)
    # # Evaluation
    pipeline.fit(df1, df1.label)
    y_true, y_pred = df2.label, pipeline.predict(df2)
    # df2['predictions'] = y_pred
    # """ Print Scores
    pprint({'dataset': dataset, 'features': features})
    scores = {}
    for scorer in metrics:
        scores[scorer] = [get_score_func(scorer)(y_true, y_pred)]
    pprint(scores, type='table')
Example #60
class MKSHomogenizationModel(MKSStructureAnalysis):
    """
    The `MKSHomogenizationModel` takes in microstructures and their
    associated macroscopic property, and creates a low dimensional structure
    property linkage. The `MKSHomogenizationModel` model is designed to
    integrate with dimensionality reduction techniques and predictive models.

    Attributes:
        degree: Degree of the polynomial used by
            `property_linker`.
        n_components: Number of components used by `dimension_reducer`.
        dimension_reducer: Instance of a dimensionality reduction class.
        property_linker: Instance of a class that maps the materials property
            to the microstructures.
        correlations: spatial correlations to be computed
        basis: instance of a basis class
        reduced_fit_data: Low dimensionality representation of spatial
            correlations used to fit the model.
        reduced_predict_data: Low dimensionality representation of spatial
            correlations predicted by the model.
        periodic_axes: axes that are periodic. (0, 2) would indicate that
            axes x and z are periodic in a 3D microstructure.
        coef_: Array of values that are the coefficients.
        intercept_: Value of the intercept.

    Below is an example of using MKSHomogenizationModel to predict (or
    classify) the type of microstructure using PCA and Logistic Regression.

    >>> import numpy as np
    >>> n_states = 3
    >>> domain = [-1, 1]

    >>> from pymks.bases import LegendreBasis
    >>> leg_basis = LegendreBasis(n_states=n_states, domain=domain)
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.linear_model import LogisticRegression
    >>> reducer = PCA(n_components=3)
    >>> linker = LogisticRegression()
    >>> model = MKSHomogenizationModel(
    ...     basis=leg_basis, dimension_reducer=reducer, property_linker=linker)
    >>> from pymks.datasets import make_cahn_hilliard
    >>> X0, X1 = make_cahn_hilliard(n_samples=50)
    >>> y0 = np.zeros(X0.shape[0])
    >>> y1 = np.ones(X1.shape[0])

    >>> X = np.concatenate((X0, X1))
    >>> y = np.concatenate((y0, y1))

    >>> model.fit(X, y)

    >>> X0_test, X1_test = make_cahn_hilliard(n_samples=3)
    >>> y0_test = model.predict(X0_test)
    >>> y1_test = model.predict(X1_test)
    >>> assert np.allclose(y0_test, [0, 0, 0])
    >>> assert np.allclose(y1_test, [1, 1, 1])
    """
    @deprecate
    def __init__(self,
                 basis=None,
                 dimension_reducer=None,
                 n_components=None,
                 property_linker=None,
                 degree=1,
                 periodic_axes=None,
                 correlations=None,
                 compute_correlations=True,
                 n_jobs=1,
                 store_correlations=False,
                 mean_center=True):
        """
        Create an instance of a `MKSHomogenizationModel`.

        Args:
            basis (class, optional): an instance of a bases class.
            dimension_reducer (class, optional): an instance of a
                dimensionality reduction class with a fit_transform method. The
                default class is PCA.
            property_linker (class, optional): an instance of a machine
                learning class with fit and predict methods.
            n_components (int, optional): number of components kept by the
                dimension_reducer
            degree (int, optional): degree of the polynomial used by
                property_linker.
            periodic_axes (list, optional): axes that are periodic. (0, 2)
                would indicate that axes x and z are periodic in a 3D
                microstructure.
            correlations (list, optional): list of spatial correlations to
                compute, default is the autocorrelation with the first local
                state and all of its cross correlations. For example if basis
                has basis.n_states=3, correlation would be [(0, 0), (0, 1),
                (0, 2)]. If n_states=[0, 2, 4], the default correlations are
                [(0, 0), (0, 2), (0, 4)] corresponding to the autocorrelations
                for the 0th local state, and the cross correlations with the 0
                and 2 as well as 0 and 4.
            compute_correlations (boolean, optional): If false spatial
                correlations will not be calculated as part of the fit and
                predict methods. The spatial correlations can be passed as `X`
                to both methods, default is True.
            n_jobs (int, optional): number of parallel jobs to run, only used
                if pyfftw is installed.
            store_correlations (boolean, optional): indicate if spatial
                correlations should be stored
            mean_center (boolean, optional): If true the data will be mean
                centered before dimensionality reduction is computed.
        """

        if property_linker is None:
            property_linker = LinearRegression()
        self._linker = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                                 ('connector', property_linker)])
        self.degree = degree
        self.property_linker = property_linker
        if not callable(getattr(self.property_linker, "fit", None)):
            raise RuntimeError("property_linker does not have fit() method.")
        if not callable(getattr(self.property_linker, "predict", None)):
            raise RuntimeError(
                "property_linker does not have predict() method.")
        self.compute_correlations = compute_correlations
        self.reduced_fit_data = None
        self.reduced_predict_data = None
        if self.compute_correlations:
            if basis is None:
                raise RuntimeError('a basis is needed to compute spatial '
                                   'correlations')
        super(MKSHomogenizationModel,
              self).__init__(store_correlations=store_correlations,
                             dimension_reducer=dimension_reducer,
                             correlations=correlations,
                             n_jobs=n_jobs,
                             n_components=n_components,
                             basis=basis,
                             mean_center=mean_center,
                             periodic_axes=periodic_axes)

    @property
    def n_components(self):
        return self._n_components

    @n_components.setter
    def n_components(self, value):
        """Setter for the number of components using by the dimension_reducer
        """
        self._n_components = value
        self.dimension_reducer.n_components = value

    @property
    def degree(self):
        return self._degree

    @degree.setter
    def degree(self, value):
        """Setter for the polynomial degree for property_linker.
        """
        self._degree = value
        self._linker.set_params(poly__degree=value)

    @property
    def coef_(self):
        return self._linker.named_steps['connector'].coef_

    @coef_.setter
    def coef_(self, coef):
        """Setter for the coefficients for property_linker.
        """
        self._linker.named_steps['connector'].coef_ = coef

    @property
    def intercept_(self):
        return self._linker.named_steps['connector'].intercept_

    @intercept_.setter
    def intercept_(self, intercept):
        """Setter for the intercept for property_linker.
        """
        self._linker.named_steps['connector'].intercept_ = intercept

    @property
    def property_linker(self):
        return self._property_linker

    @property_linker.setter
    def property_linker(self, prop_linker):
        """Setter for the property_linker class.
        """
        self._property_linker = prop_linker
        self._linker.set_params(connector=prop_linker)

    def fit(self, X, y, reduce_labels=None, confidence_index=None, size=None):
        """
        Fits data by calculating 2-point statistics from X, performing
        dimension reduction using dimension_reducer, and fitting the reduced
        data with the property_linker.

        Args:
            X (ND array): The microstructures or spatial correlations, a
                `(n_samples, n_x, ...)` shaped array where `n_samples` is the
                number of samples and `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            reduce_labels (1D array, optional): labels for X used during the
                fit_transform method for the `dimension_reducer`.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Example

        Let's first start with using the microstructure and effective
        properties.

        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(prim_basis,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                correlations=correlations)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> model.fit(X, y)
        >>> X_stats = correlate(X, prim_basis)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], -1))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)

        Now let's use the same method with spatial correlations instead of
        microstructures.

        >>> from sklearn.decomposition import PCA
        >>> from sklearn.linear_model import LinearRegression
        >>> from pymks.bases import PrimitiveBasis
        >>> from pymks.stats import correlate

        >>> reducer = PCA(n_components=2)
        >>> linker = LinearRegression()
        >>> prim_basis = PrimitiveBasis(n_states=2, domain=[0, 1])
        >>> correlations = [(0, 0), (1, 1), (0, 1)]
        >>> model = MKSHomogenizationModel(dimension_reducer=reducer,
        ...                                property_linker=linker,
        ...                                compute_correlations=False)
        >>> np.random.seed(99)
        >>> X = np.random.randint(2, size=(3, 15))
        >>> y = np.array([1, 2, 3])
        >>> X_stats = correlate(X, prim_basis, correlations=correlations)
        >>> model.fit(X_stats, y)
        >>> X_reshaped = X_stats.reshape((X_stats.shape[0], X_stats[0].size))
        >>> X_pca = reducer.fit_transform(X_reshaped - np.mean(X_reshaped,
        ...                               axis=1)[:, None])
        >>> assert np.allclose(model.reduced_fit_data, X_pca)


        """
        if self.compute_correlations:
            if size is not None:
                X = self.basis._reshape_feature(X, size)
            X = self._compute_stats(X, confidence_index)
        X_reshape = self._reduce_shape(X)
        X_reduced = self._fit_transform(X_reshape, reduce_labels)
        self._linker.fit(X_reduced, y)

    def predict(self, X, confidence_index=None):
        """Predicts macroscopic property for the microstructures `X`.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
            The predicted macroscopic property for `X`.

        Example

        >>> import numpy as np
        >>> from sklearn.manifold import LocallyLinearEmbedding
        >>> from sklearn.linear_model import BayesianRidge
        >>> from pymks.bases import PrimitiveBasis
        >>> np.random.seed(1)
        >>> X = np.random.randint(2, size=(50, 100))
        >>> y = np.random.random(50)
        >>> reducer = LocallyLinearEmbedding()
        >>> linker = BayesianRidge()
        >>> prim_basis = PrimitiveBasis(2, domain=[0, 1])
        >>> model = MKSHomogenizationModel(prim_basis, n_components=2,
        ...                                dimension_reducer=reducer,
        ...                                property_linker=linker)
        >>> model.fit(X, y)
        >>> X_test = np.random.randint(2, size=(1, 100))

        Predict with microstructures

        >>> y_pred = model.predict(X_test)

        Predict with spatial correlations

        >>> from pymks.stats import correlate
        >>> model.compute_correlations = False
        >>> X_corr = correlate(X, prim_basis, correlations=[(0, 0)])
        >>> model.fit(X_corr, y)
        >>> X_corr_test = correlate(X_test, prim_basis,
        ...                         correlations=[(0, 0)])
        >>> y_pred_stats = model.predict(X_corr_test)
        >>> assert np.allclose(y_pred_stats, y_pred, atol=1e-3)

        """
        if not hasattr(self._linker.get_params()['connector'], "coef_"):
            raise RuntimeError('fit() method must be run before predict().')
        _size = self._size_axes(self.basis)
        X = self.basis._reshape_feature(X, tuple(_size))
        if self.compute_correlations is True:
            X = self._compute_stats(X, confidence_index)
        X_reduced = self._transform(X)
        self.reduced_predict_data = X_reduced
        return self._linker.predict(X_reduced)

    def score(self, X, y, confidence_index=None):
        """
        The score function for the MKSHomogenizationModel. It formats the
        data and uses the score method from the property_linker.

        Args:
            X (ND array): The microstructure, an `(n_samples, n_x, ...)`
                shaped array where `n_samples` is the number of samples and
                `n_x` is the spatial discretization.
            y (1D array): The material property associated with `X`.
            confidence_index (ND array, optional): array with same shape as X
                used to assign a confidence value for each data point.

        Returns:
             Score for MKSHomogenizationModel from the selected
             property_linker.
        """
        if not callable(getattr(self._linker, "score", None)):
            raise RuntimeError("property_linker does not have score() method.")
        _size = self._size_axes(self.basis)
        X = self.basis._reshape_feature(X, _size)
        if self.compute_correlations:
            X = self._compute_stats(X, confidence_index)
        X_reduced = self._transform(X)
        return self._linker.score(X_reduced, y)

    def _size_axes(self, basis):
        """Helper function used to get the correct size of the axes when using
        for both periodic and non-periodic axes.
        """
        _size = self.basis._axes_shape
        if self.periodic_axes is None or len(self.periodic_axes) != len(_size):
            _axes = list(range(len(_size)))
            if self.periodic_axes is not None:
                [_axes.remove(a) for a in self.periodic_axes]
            _size = np.ones(len(_size), dtype=int) * _size
            _size[_axes] = _size[_axes] // 2
        return tuple(_size)